<a href="https://colab.research.google.com/github/runnin123/Jeju_Bigdata/blob/master/Jeju_GW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import auth
auth.authenticate_user()

In [2]:
from google.cloud import bigquery
from tabulate import tabulate
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from itertools import product

# 평가 메트릭
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 통계
from scipy import stats
from scipy.stats import norm, skew #for some statistics

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [3]:
class Model:
    def __init__(self, data, num):
        self.X_train = data[0]
        self.X_test = data[1]
        self.y_train = data[2]
        self.y_test = data[3]
        self.encoding_data = num

    def rmsle(self, y, pred): 
        log_y = np.log1p(y)
        log_pred = np.log1p(pred)
        squared_error = (log_y - log_pred)**2
        rmsle = np.sqrt(np.mean(squared_error))
        print(round(rmsle, 3))

        return round(rmsle, 3)

    def best_params_model(self, model, params):
        cv_model = GridSearchCV(model, param_grid=params, scoring="neg_mean_squared_error", cv = 5)
        cv_model.fit(self.X_train, self.y_train)
        eval_pred = cv_model.predict(self.X_test)
        eval_pred = np.expm1(eval_pred)
        rmsle_ = self.rmsle(self.y_test, eval_pred)

        return  cv_model.best_estimator_, rmsle_

    def get_model(self):
        xgb = XGBRegressor(random_state=0)
        gbm = GradientBoostingRegressor(random_state=0)
        lgb = LGBMRegressor(random_state=0)

        params = {'n_estimators': [1000, 2000]}

        models = [xgb, gbm, lgb]
        best_models = []
        rmsles = []
        
        for model in models:
            new_model = self.best_params_model(model, params)[0]
            new_rmsle = self.best_params_model(model, params)[1]
            best_models.append(new_model)
            rmsles.append(new_rmsle)

        self.xgb_reg = best_models[0]
        self.gbm_reg = best_models[1]
        self.lgb_reg = best_models[2]

        self.xgb_rmsle = rmsles[0]
        self.gbm_rmsle = rmsles[1]
        self.lgb_rmsle = rmsles[2]

    def final(self):

        xgb_pred = self.xgb_reg.predict(self.X_test)
        xgb_pred = np.expm1(xgb_pred)

        gbm_pred = self.gbm_reg.predict(self.X_test)
        gbm_pred = np.expm1(gbm_pred)

        lgb_pred = self.lgb_reg.predict(self.X_test)
        lgb_pred = np.expm1(lgb_pred)

        pred = np.array([xgb_pred, gbm_pred, lgb_pred])
        pred = np.transpose(pred)

        rmsle_sum = self.xgb_rmsle + self.gbm_rmsle + self.lgb_rmsle

        self.xgb_per = self.xgb_rmsle / rmsle_sum
        self.gbm_per = self.gbm_rmsle / rmsle_sum
        self.lgb_per = self.lgb_rmsle / rmsle_sum
        
        final = xgb_pred*self.xgb_per + gbm_pred*self.gbm_per + lgb_pred*self.lgb_per
        self.rmsle(self.y_test, final)

    def make_temp(self):
        CARD_SIDO_NMs = self.encoding_data['CARD_SIDO_NM'].unique()
        CARD_CCG_NMs = self.encoding_data['CARD_CCG_NM'].unique()
        STD_CLSS_NMs  = self.encoding_data['STD_CLSS_NM'].unique()
        HOM_SIDO_NMs  = self.encoding_data['HOM_SIDO_NM'].unique()
        AGEs          = self.encoding_data['AGE'].unique()
        SEX_CTGO_CDs  = self.encoding_data['SEX_CTGO_CD'].unique()
        FLCs          = self.encoding_data['FLC'].unique()
        years         = [2020]
        months        = [4, 7]

        comb_list = [CARD_SIDO_NMs, CARD_CCG_NMs, STD_CLSS_NMs, HOM_SIDO_NMs, AGEs, SEX_CTGO_CDs, FLCs, years, months]
        temp = np.array(list(product(*comb_list)))

        train_features = self.encoding_data.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
        tmp = pd.DataFrame(data=temp, columns=train_features.columns)

        return tmp

    def make_sub(self, temp):
        xgb_pred = self.xgb_reg.predict(temp)
        xgb_pred = np.expm1(xgb_pred)

        gbm_pred = self.gbm_reg.predict(temp)
        gbm_pred = np.expm1(gbm_pred)

        lgb_pred = self.lgb_reg.predict(temp)
        lgb_pred = np.expm1(lgb_pred)

        final_rmsle = xgb_pred*self.xgb_per + gbm_pred*self.gbm_per + lgb_pred*self.lgb_per

        temp['AMT'] = np.round(final_rmsle, 0)
        temp['REG_YYMM'] = temp['year']*100 + temp['month']
        temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
        temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop = False)

        temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
        temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

        return temp

In [4]:
def sampling():
    project_id = 'jeju-bigquery-282708'
    client = bigquery.Client(project=project_id)

    train = client.query('''
    SELECT 
        * 
    FROM `jeju-bigquery-282708.jeju_bigdata.201901_202003_train`
    ''').to_dataframe()

    return train

In [5]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [6]:
def data_pre(data):
    data = data.fillna('')
    data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
    data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
    data = data.drop(['REG_YYMM'], axis = 1)

    return data

In [7]:
def cate(data):
    local = data[data['CARD_CCG_NM'] == data['HOM_CCG_NM']].reset_index(drop = True)
    tourist = data[data['CARD_CCG_NM'] != data['HOM_CCG_NM']].reset_index(drop = True)
    columns = ['CARD_SIDO_NM', 'CARD_CCG_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
    local, tourist = local.groupby(columns).sum().reset_index(drop=False), tourist.groupby(columns).sum().reset_index(drop=False)

    return local, tourist

In [8]:
def encoding(cate):

    global encoders
    dtypes_cate = cate.dtypes
    encoders = {}
    for column in cate.columns:
        if str(dtypes_cate[column]) == 'object':
            encoder = LabelEncoder()
            encoder.fit(cate[column])
            encoders[column] = encoder

    cate_num = cate.copy()

    for column in encoders.keys():
        encoder = encoders[column]
        cate_num[column] = encoder.transform(cate[column])

    return cate_num

In [9]:
def split(cate_num):
    split_data = []

    X_data, y_data = cate_num.loc[:, cate_num.columns != 'AMT'], cate_num['AMT']
    X_data = X_data.drop(['CSTMR_CNT', 'CNT'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=126, shuffle=True)
    y_train = np.log1p(y_train)
    
    return X_train, X_test, y_train, y_test

In [10]:
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/jeju/201901-202003.csv')

In [11]:
train = train[(train['REG_YYMM'] == 202002) | (train['REG_YYMM'] == 202003)]

In [12]:
GW = train[train['CARD_SIDO_NM'] == '강원'].reset_index(drop = True)

In [13]:
GW = GW.sample(n = 10000).reset_index(drop = True)
GW

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,202003,강원,횡성군,차량용 주유소 운영업,충남,천안시 서북구,50s,1,4,3,174000,4
1,202003,강원,삼척시,기타 대형 종합 소매업,부산,동래구,50s,1,4,4,412510,6
2,202003,강원,원주시,골프장 운영업,서울,광진구,60s,1,5,26,4606670,34
3,202002,강원,인제군,비알콜 음료점업,경기,용인시 수지구,50s,1,4,3,35600,3
4,202002,강원,고성군,체인화 편의점,서울,양천구,20s,2,1,7,61470,10
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,202002,강원,철원군,한식 음식점업,경기,성남시 중원구,20s,1,1,5,209000,8
9996,202003,강원,양양군,슈퍼마켓,서울,송파구,30s,2,2,3,44000,3
9997,202002,강원,속초시,과실 및 채소 소매업,서울,광진구,20s,2,1,4,228500,6
9998,202002,강원,평창군,기타음식료품위주종합소매업,서울,서초구,40s,1,3,17,568450,22


In [14]:
data = data_pre(GW)

In [15]:
local, tourist = cate(data)

In [16]:
lcl_num, trst_num = encoding(local), encoding(tourist)

In [17]:
lcl_data, trst_data = split(lcl_num), split(trst_num)

In [18]:
lcl = Model(lcl_data, lcl_num)
trst = Model(trst_data, trst_num)

In [19]:
lcl.get_model()

0.977
0.977
0.96
0.96
1.053
1.053


In [20]:
trst.get_model()

1.002
1.002
1.01
1.01
1.059
1.059


In [21]:
lcl.final()

0.959


In [22]:
trst.final()

1.0


In [23]:
lcl_temp = lcl.make_temp()

In [24]:
trst_temp = trst.make_temp()

In [25]:
lcl_sub = lcl.make_sub(lcl_temp)

In [26]:
trst_sub = trst.make_sub(trst_temp)

In [32]:
lcl_sub

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,강원,골프장 운영업,1.118174e+09
1,202004,강원,과실 및 채소 소매업,1.438270e+09
2,202004,강원,관광 민예품 및 선물용품 소매업,1.510596e+09
3,202004,강원,그외 기타 종합 소매업,1.972877e+08
4,202004,강원,기타 대형 종합 소매업,4.278686e+08
...,...,...,...,...
59,202007,강원,피자 햄버거 샌드위치 및 유사 음식점업,1.764559e+09
60,202007,강원,한식 음식점업,2.963223e+10
61,202007,강원,호텔업,2.274622e+09
62,202007,강원,화장품 및 방향제 소매업,1.005212e+09


In [27]:
trst_sub

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,강원,골프장 운영업,7.922751e+09
1,202004,강원,과실 및 채소 소매업,1.723494e+09
2,202004,강원,관광 민예품 및 선물용품 소매업,9.806363e+08
3,202004,강원,그외 기타 종합 소매업,2.279472e+09
4,202004,강원,기타 대형 종합 소매업,2.886123e+09
...,...,...,...,...
59,202007,강원,피자 햄버거 샌드위치 및 유사 음식점업,2.573601e+08
60,202007,강원,한식 음식점업,1.939143e+09
61,202007,강원,호텔업,2.716570e+09
62,202007,강원,화장품 및 방향제 소매업,1.961601e+09


In [28]:
test = train[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]

In [29]:
test = test[test['REG_YYMM'] == 202003]

In [30]:
test = test[test['CARD_SIDO_NM'] == '강원']

In [31]:
test.groupby(['REG_YYMM','STD_CLSS_NM']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,AMT
REG_YYMM,STD_CLSS_NM,Unnamed: 2_level_1
202003,건강보조식품 소매업,96059012
202003,골프장 운영업,2915797995
202003,과실 및 채소 소매업,994816943
202003,관광 민예품 및 선물용품 소매업,13317300
202003,그외 기타 스포츠시설 운영업,2075000
202003,그외 기타 종합 소매업,470200240
202003,기타 대형 종합 소매업,6384110710
202003,기타 수상오락 서비스업,261000
202003,기타 외국식 음식점업,887636153
202003,기타 주점업,250737400
