<a href="https://colab.research.google.com/github/runnin123/Jeju_Bigdata/blob/master/Jeju_GW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
from google.colab import auth
auth.authenticate_user()

In [27]:
from google.cloud import bigquery
from tabulate import tabulate
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from itertools import product

# 평가 메트릭
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 통계
from scipy import stats
from scipy.stats import norm, skew #for some statistics

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [28]:
class Model:
    def __init__(self, data, num):
        self.X_train = data[0]
        self.X_test = data[1]
        self.y_train = data[2]
        self.y_test = data[3]
        self.encoding_data = num

    def rmsle(self, y, pred): 
        log_y = np.log1p(y)
        log_pred = np.log1p(pred)
        squared_error = (log_y - log_pred)**2
        rmsle = np.sqrt(np.mean(squared_error))
        print(round(rmsle, 3))

        return round(rmsle, 3)

    def best_params_model(self, model, params):
        cv_model = GridSearchCV(model, param_grid=params, scoring="neg_mean_squared_error", cv = 5)
        cv_model.fit(self.X_train, self.y_train)
        eval_pred = cv_model.predict(self.X_test)
        eval_pred = np.expm1(eval_pred)
        rmsle_ = self.rmsle(self.y_test, eval_pred)

        return  cv_model.best_estimator_, rmsle_

    def get_model(self):
        xgb = XGBRegressor(random_state=0)
        gbm = GradientBoostingRegressor(random_state=0)
        lgb = LGBMRegressor(random_state=0)

        params = {'n_estimators': [1000, 2000]}

        models = [xgb, gbm, lgb]
        best_models = []
        rmsles = []
        
        for model in models:
            new_model = self.best_params_model(model, params)[0]
            new_rmsle = self.best_params_model(model, params)[1]
            best_models.append(new_model)
            rmsles.append(new_rmsle)

        self.xgb_reg = best_models[0]
        self.gbm_reg = best_models[1]
        self.lgb_reg = best_models[2]

        self.xgb_rmsle = rmsles[0]
        self.gbm_rmsle = rmsles[1]
        self.lgb_rmsle = rmsles[2]

    def final(self):

        xgb_pred = self.xgb_reg.predict(self.X_test)
        xgb_pred = np.expm1(xgb_pred)

        gbm_pred = self.gbm_reg.predict(self.X_test)
        gbm_pred = np.expm1(gbm_pred)

        lgb_pred = self.lgb_reg.predict(self.X_test)
        lgb_pred = np.expm1(lgb_pred)

        pred = np.array([xgb_pred, gbm_pred, lgb_pred])
        pred = np.transpose(pred)

        rmsle_sum = self.xgb_rmsle + self.gbm_rmsle + self.lgb_rmsle

        self.xgb_per = self.xgb_rmsle / rmsle_sum
        self.gbm_per = self.gbm_rmsle / rmsle_sum
        self.lgb_per = self.lgb_rmsle / rmsle_sum
        
        final = xgb_pred*self.xgb_per + gbm_pred*self.gbm_per + lgb_pred*self.lgb_per
        self.rmsle(self.y_test, final)

    def make_temp(self):
        CARD_SIDO_NMs = self.encoding_data['CARD_SIDO_NM'].unique()
        STD_CLSS_NMs  = self.encoding_data['STD_CLSS_NM'].unique()
        HOM_SIDO_NMs  = self.encoding_data['HOM_SIDO_NM'].unique()
        AGEs          = self.encoding_data['AGE'].unique()
        SEX_CTGO_CDs  = self.encoding_data['SEX_CTGO_CD'].unique()
        FLCs          = self.encoding_data['FLC'].unique()
        years         = [2020]
        months        = [4, 7]

        comb_list = [CARD_SIDO_NMs, STD_CLSS_NMs, HOM_SIDO_NMs, AGEs, SEX_CTGO_CDs, FLCs, years, months]
        temp = np.array(list(product(*comb_list)))

        train_features = self.encoding_data.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
        tmp = pd.DataFrame(data=temp, columns=train_features.columns)

        return tmp

    def make_sub(self, temp):
        xgb_pred = self.xgb_reg.predict(temp)
        xgb_pred = np.expm1(xgb_pred)

        gbm_pred = self.gbm_reg.predict(temp)
        gbm_pred = np.expm1(gbm_pred)

        lgb_pred = self.lgb_reg.predict(temp)
        lgb_pred = np.expm1(lgb_pred)

        final_rmsle = xgb_pred*self.xgb_per + gbm_pred*self.gbm_per + lgb_pred*self.lgb_per

        temp['AMT'] = np.round(final_rmsle, 0)
        temp['REG_YYMM'] = temp['year']*100 + temp['month']
        temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
        temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop = False)

        temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
        temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

        return temp

In [29]:
def sampling():
    project_id = 'jeju-bigquery-282708'
    client = bigquery.Client(project=project_id)

    train = client.query('''
    SELECT 
        * 
    FROM `jeju-bigquery-282708.jeju_bigdata.201901_202003_train`
    ''').to_dataframe()

    return train

In [30]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [31]:
def data_pre(data):
    data = data.fillna('')
    data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
    data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
    data = data.drop(['REG_YYMM'], axis = 1)

    return data

In [32]:
def cate(data):
    local = data[(data['CARD_SIDO_NM'] == data['HOM_SIDO_NM']) & (data['CARD_CCG_NM'] == data['HOM_CCG_NM'])].reset_index(drop = True)
    tourist = data[(data['CARD_SIDO_NM'] != data['HOM_SIDO_NM']) | (data['CARD_CCG_NM'] != data['HOM_CCG_NM'])].reset_index(drop = True)
    
    local, tourist = local.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1), tourist.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)
    columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
    local, tourist = local.groupby(columns).sum().reset_index(drop=False), tourist.groupby(columns).sum().reset_index(drop=False)

    return local, tourist

In [33]:
def encoding(cate):

    global encoders
    dtypes_cate = cate.dtypes
    encoders = {}
    for column in cate.columns:
        if str(dtypes_cate[column]) == 'object':
            encoder = LabelEncoder()
            encoder.fit(cate[column])
            encoders[column] = encoder

    cate_num = cate.copy()

    for column in encoders.keys():
        encoder = encoders[column]
        cate_num[column] = encoder.transform(cate[column])

    return cate_num

In [34]:
def split(cate_num):
    split_data = []

    X_data, y_data = cate_num.loc[:, cate_num.columns != 'AMT'], cate_num['AMT']
    X_data = X_data.drop(['CSTMR_CNT', 'CNT'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=126, shuffle=True)
    y_train = np.log1p(y_train)
    
    return X_train, X_test, y_train, y_test

In [35]:
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/jeju/201901-202003.csv')

In [36]:
# train = train[(train['REG_YYMM'] == 202002) | (train['REG_YYMM'] == 202003)]

In [37]:
GW = train[train['CARD_SIDO_NM'] == '강원'].reset_index(drop = True)

In [38]:
GW = GW.sample(n = 10000).reset_index(drop = True)
GW

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,201909,강원,정선군,한식 음식점업,경북,포항시 북구,60s,1,5,3,90000,3
1,201905,강원,춘천시,차량용 주유소 운영업,경기,가평군,60s,1,5,87,6309532,117
2,201911,강원,강릉시,한식 음식점업,서울,종로구,30s,2,2,4,165000,4
3,201904,강원,원주시,체인화 편의점,강원,원주시,20s,2,1,5529,101630410,19372
4,201903,강원,강릉시,차량용 주유소 운영업,충북,충주시,40s,1,4,3,311009,3
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,201905,강원,원주시,서양식 음식점업,강원,원주시,30s,2,1,214,5065765,343
9996,202002,강원,횡성군,체인화 편의점,경기,성남시 중원구,40s,1,3,8,58800,12
9997,201907,강원,양양군,체인화 편의점,경기,수원시 장안구,50s,1,4,11,99370,11
9998,201901,강원,정선군,체인화 편의점,대구,달서구,30s,2,3,3,26200,4


In [39]:
data = data_pre(GW)

In [40]:
local, tourist = cate(data)

In [41]:
lcl_num, trst_num = encoding(local), encoding(tourist)

In [42]:
lcl_data, trst_data = split(lcl_num), split(trst_num)

In [43]:
lcl = Model(lcl_data, lcl_num)
trst = Model(trst_data, trst_num)

In [44]:
lcl.get_model()

1.577
1.577
1.604
1.604
1.757
1.757


In [45]:
trst.get_model()

1.077
1.077
1.072
1.072
1.163
1.163


In [46]:
lcl.final()

1.595


In [47]:
trst.final()

1.085


In [48]:
lcl_temp = lcl.make_temp()

In [49]:
trst_temp = trst.make_temp()

In [50]:
lcl_sub = lcl.make_sub(lcl_temp)

In [51]:
trst_sub = trst.make_sub(trst_temp)

In [52]:
lcl_sub

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,강원,건강보조식품 소매업,92617967.0
1,202004,강원,골프장 운영업,100509557.0
2,202004,강원,과실 및 채소 소매업,129906810.0
3,202004,강원,관광 민예품 및 선물용품 소매업,78382095.0
4,202004,강원,그외 기타 종합 소매업,340677247.0
...,...,...,...,...
61,202007,강원,피자 햄버거 샌드위치 및 유사 음식점업,57998678.0
62,202007,강원,한식 음식점업,671391679.0
63,202007,강원,호텔업,11702247.0
64,202007,강원,화장품 및 방향제 소매업,33615930.0


In [53]:
trst_sub

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,강원,건강보조식품 소매업,283614194.0
1,202004,강원,골프장 운영업,561722926.0
2,202004,강원,과실 및 채소 소매업,87997604.0
3,202004,강원,관광 민예품 및 선물용품 소매업,32543010.0
4,202004,강원,그외 기타 종합 소매업,80525673.0
...,...,...,...,...
61,202007,강원,피자 햄버거 샌드위치 및 유사 음식점업,16715798.0
62,202007,강원,한식 음식점업,166471435.0
63,202007,강원,호텔업,154537336.0
64,202007,강원,화장품 및 방향제 소매업,86385641.0


In [54]:
test = train[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]

In [55]:
test = test[test['REG_YYMM'] == 202003]

In [56]:
test = test[test['CARD_SIDO_NM'] == '강원']

In [57]:
test.groupby(['REG_YYMM','STD_CLSS_NM']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,AMT
REG_YYMM,STD_CLSS_NM,Unnamed: 2_level_1
202003,건강보조식품 소매업,96059012
202003,골프장 운영업,2915797995
202003,과실 및 채소 소매업,994816943
202003,관광 민예품 및 선물용품 소매업,13317300
202003,그외 기타 스포츠시설 운영업,2075000
202003,그외 기타 종합 소매업,470200240
202003,기타 대형 종합 소매업,6384110710
202003,기타 수상오락 서비스업,261000
202003,기타 외국식 음식점업,887636153
202003,기타 주점업,250737400
