<a href="https://colab.research.google.com/github/runnin123/Jeju_Bigdata/blob/master/Jeju_DG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import auth
auth.authenticate_user()

In [2]:
from google.cloud import bigquery
from tabulate import tabulate
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

from itertools import product

# 평가 메트릭
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 통계
from scipy import stats
from scipy.stats import norm, skew #for some statistics

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

In [3]:
class Model:
    def __init__(self, data, num):
        self.X_train = data[0]
        self.X_test = data[1]
        self.y_train = data[2]
        self.y_test = data[3]
        self.encoding_data = num

    def rmsle(self, y, pred): 
        log_y = np.log1p(y)
        log_pred = np.log1p(pred)
        squared_error = (log_y - log_pred)**2
        rmsle = np.sqrt(np.mean(squared_error))
        print(round(rmsle, 3))

        return round(rmsle, 3)

    def best_params_model(self, model, params):
        cv_model = GridSearchCV(model, param_grid=params, scoring="neg_mean_squared_error", cv = 5)
        cv_model.fit(self.X_train, self.y_train)

        print("----", model.__class__.__name__, "----")
        print("GridSearchCV 최적 하이퍼 파라미터 :", cv_model.best_params_)

        rmse = np.sqrt(-1*cv_model.best_score_)
        print("GridSearchCV 최적 평균 RMSE값 :", np.round(rmse, 3))

        eval_pred = cv_model.predict(self.X_test)
        eval_pred = np.expm1(eval_pred)
        rmsle_ = self.rmsle(self.y_test, eval_pred)

        return  cv_model.best_estimator_, rmsle_

    def get_model(self):
        xgb = XGBRegressor(random_state=0)
        gbm = GradientBoostingRegressor(random_state=0)
        lgb = LGBMRegressor(random_state=0)

        params = {'n_estimators': [1000, 2000]}

        models = [xgb, gbm, lgb]
        best_models = []
        rmsles = []
        
        for model in models:
            new_model = self.best_params_model(model, params)[0]
            new_rmsle = self.best_params_model(model, params)[1]
            best_models.append(new_model)
            rmsles.append(new_rmsle)

        self.xgb_reg = best_models[0]
        self.gbm_reg = best_models[1]
        self.lgb_reg = best_models[2]

        self.xgb_rmsle = rmsles[0]
        self.gbm_rmsle = rmsles[1]
        self.lgb_rmsle = rmsles[2]

    def final(self):

        xgb_pred = self.xgb_reg.predict(self.X_test)
        xgb_pred = np.expm1(xgb_pred)

        gbm_pred = self.gbm_reg.predict(self.X_test)
        gbm_pred = np.expm1(gbm_pred)

        lgb_pred = self.lgb_reg.predict(self.X_test)
        lgb_pred = np.expm1(lgb_pred)

        pred = np.array([xgb_pred, gbm_pred, lgb_pred])
        pred = np.transpose(pred)

        rmsle_sum = self.xgb_rmsle + self.gbm_rmsle + self.lgb_rmsle

        self.xgb_per = self.xgb_rmsle / rmsle_sum
        self.gbm_per = self.gbm_rmsle / rmsle_sum
        self.lgb_per = self.lgb_rmsle / rmsle_sum
        
        final = xgb_pred*self.xgb_per + gbm_pred*self.gbm_per + lgb_pred*self.lgb_per
        self.rmsle(self.y_test, final)

    def make_temp(self):
        CARD_SIDO_NMs = self.encoding_data['CARD_SIDO_NM'].unique()
        STD_CLSS_NMs  = self.encoding_data['STD_CLSS_NM'].unique()
        HOM_SIDO_NMs  = self.encoding_data['HOM_SIDO_NM'].unique()
        AGEs          = self.encoding_data['AGE'].unique()
        SEX_CTGO_CDs  = self.encoding_data['SEX_CTGO_CD'].unique()
        FLCs          = self.encoding_data['FLC'].unique()
        years         = [2020]
        months        = [4, 7]

        comb_list = [CARD_SIDO_NMs, STD_CLSS_NMs, HOM_SIDO_NMs, AGEs, SEX_CTGO_CDs, FLCs, years, months]
        temp = np.array(list(product(*comb_list)))

        train_features = self.encoding_data.drop(['CSTMR_CNT', 'AMT', 'CNT'], axis=1)
        tmp = pd.DataFrame(data=temp, columns=train_features.columns)

        return tmp

    def make_sub(self, temp):
        xgb_pred = self.xgb_reg.predict(temp)
        xgb_pred = np.expm1(xgb_pred)

        gbm_pred = self.gbm_reg.predict(temp)
        gbm_pred = np.expm1(gbm_pred)

        lgb_pred = self.lgb_reg.predict(temp)
        lgb_pred = np.expm1(lgb_pred)

        final_rmsle = xgb_pred*self.xgb_per + gbm_pred*self.gbm_per + lgb_pred*self.lgb_per

        temp['AMT'] = np.round(final_rmsle, 0)
        temp['REG_YYMM'] = temp['year']*100 + temp['month']
        temp = temp[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]
        temp = temp.groupby(['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM']).sum().reset_index(drop = False)

        temp['CARD_SIDO_NM'] = encoders['CARD_SIDO_NM'].inverse_transform(temp['CARD_SIDO_NM'])
        temp['STD_CLSS_NM'] = encoders['STD_CLSS_NM'].inverse_transform(temp['STD_CLSS_NM'])

        return temp

In [4]:
def sampling():
    project_id = 'jeju-bigquery-282708'
    client = bigquery.Client(project=project_id)

    train = client.query('''
    SELECT 
        * 
    FROM `jeju-bigquery-282708.jeju_bigdata.201901_202003_train`
    ''').to_dataframe()

    return train

In [5]:
def grap_year(data):
    data = str(data)
    return int(data[:4])

def grap_month(data):
    data = str(data)
    return int(data[4:])

In [6]:
def data_pre(data):
    data = data.fillna('')
    data['year'] = data['REG_YYMM'].apply(lambda x: grap_year(x))
    data['month'] = data['REG_YYMM'].apply(lambda x: grap_month(x))
    data = data.drop(['REG_YYMM'], axis = 1)

    return data

In [7]:
def cate(data):
    local = data[(data['CARD_SIDO_NM'] == data['HOM_SIDO_NM']) & (data['CARD_CCG_NM'] == data['HOM_CCG_NM'])].reset_index(drop = True)
    tourist = data[(data['CARD_SIDO_NM'] != data['HOM_SIDO_NM']) | (data['CARD_CCG_NM'] != data['HOM_CCG_NM'])].reset_index(drop = True)
    
    local, tourist = local.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1), tourist.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)
    columns = ['CARD_SIDO_NM', 'STD_CLSS_NM', 'HOM_SIDO_NM', 'AGE', 'SEX_CTGO_CD', 'FLC', 'year', 'month']
    local, tourist = local.groupby(columns).sum().reset_index(drop=False), tourist.groupby(columns).sum().reset_index(drop=False)

    return local, tourist

In [8]:
def encoding(cate):

    global encoders
    dtypes_cate = cate.dtypes
    encoders = {}
    for column in cate.columns:
        if str(dtypes_cate[column]) == 'object':
            encoder = LabelEncoder()
            encoder.fit(cate[column])
            encoders[column] = encoder

    cate_num = cate.copy()

    for column in encoders.keys():
        encoder = encoders[column]
        cate_num[column] = encoder.transform(cate[column])

    return cate_num

In [9]:
def split(cate_num):
    split_data = []

    X_data, y_data = cate_num.loc[:, cate_num.columns != 'AMT'], cate_num['AMT']
    X_data = X_data.drop(['CSTMR_CNT', 'CNT'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=126, shuffle=True)
    y_train = np.log1p(y_train)
    
    return X_train, X_test, y_train, y_test

In [10]:
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/jeju/201901-202003.csv')

In [11]:
DG = train[train['CARD_SIDO_NM'] == '대구'].reset_index(drop = True)

In [12]:
DG = DG.sample(n = 10000).reset_index(drop = True)
DG

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,CARD_CCG_NM,STD_CLSS_NM,HOM_SIDO_NM,HOM_CCG_NM,AGE,SEX_CTGO_CD,FLC,CSTMR_CNT,AMT,CNT
0,201902,대구,중구,빵 및 과자류 소매업,경기,화성시,20s,1,1,4,24000,4
1,201905,대구,북구,한식 음식점업,대구,동구,30s,1,1,205,9006540,295
2,201906,대구,남구,체인화 편의점,대전,유성구,20s,2,1,8,31000,11
3,201905,대구,남구,기타 대형 종합 소매업,대구,중구,30s,2,1,3,141800,6
4,201904,대구,남구,서양식 음식점업,대구,달서구,60s,1,5,143,3309010,185
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,201902,대구,남구,체인화 편의점,경남,진주시,30s,2,2,7,79930,19
9996,201907,대구,달서구,기타음식료품위주종합소매업,대구,달서구,50s,1,3,7,1092650,21
9997,201911,대구,동구,체인화 편의점,경기,성남시 수정구,30s,2,2,5,27300,6
9998,201912,대구,달성군,서양식 음식점업,대구,수성구,40s,2,3,128,2699800,159


In [13]:
DG = data_pre(DG)

In [15]:
DG = DG.drop(['CARD_CCG_NM', 'HOM_CCG_NM'], axis=1)

In [16]:
DG_num = encoding(DG)

In [17]:
DG_data = split(DG_num)

In [18]:
DG = Model(DG_data, DG_num)

In [19]:
DG.get_model()

---- XGBRegressor ----
GridSearchCV 최적 하이퍼 파라미터 : {'n_estimators': 2000}
GridSearchCV 최적 평균 RMSE값 : 1.312
1.305
---- XGBRegressor ----
GridSearchCV 최적 하이퍼 파라미터 : {'n_estimators': 2000}
GridSearchCV 최적 평균 RMSE값 : 1.312
1.305
---- GradientBoostingRegressor ----
GridSearchCV 최적 하이퍼 파라미터 : {'n_estimators': 2000}
GridSearchCV 최적 평균 RMSE값 : 1.313
1.304
---- GradientBoostingRegressor ----
GridSearchCV 최적 하이퍼 파라미터 : {'n_estimators': 2000}
GridSearchCV 최적 평균 RMSE값 : 1.313
1.304
---- LGBMRegressor ----
GridSearchCV 최적 하이퍼 파라미터 : {'n_estimators': 1000}
GridSearchCV 최적 평균 RMSE값 : 1.383
1.359
---- LGBMRegressor ----
GridSearchCV 최적 하이퍼 파라미터 : {'n_estimators': 1000}
GridSearchCV 최적 평균 RMSE값 : 1.383
1.359


In [20]:
DG.final()

1.304


In [21]:
DG_temp = DG.make_temp()

In [22]:
DG_sub = DG.make_sub(DG_temp)

In [23]:
DG_sub

Unnamed: 0,REG_YYMM,CARD_SIDO_NM,STD_CLSS_NM,AMT
0,202004,대구,건강보조식품 소매업,119426276.0
1,202004,대구,골프장 운영업,150178547.0
2,202004,대구,과실 및 채소 소매업,150149500.0
3,202004,대구,관광 민예품 및 선물용품 소매업,24954212.0
4,202004,대구,그외 기타 스포츠시설 운영업,25121546.0
...,...,...,...,...
69,202007,대구,택시 운송업,289538934.0
70,202007,대구,피자 햄버거 샌드위치 및 유사 음식점업,48774759.0
71,202007,대구,한식 음식점업,318927825.0
72,202007,대구,호텔업,123018435.0


In [24]:
test = train[['REG_YYMM', 'CARD_SIDO_NM', 'STD_CLSS_NM', 'AMT']]

In [25]:
test = test[test['REG_YYMM'] == 202003]

In [26]:
test = test[test['CARD_SIDO_NM'] == '대구']

In [27]:
test.groupby(['REG_YYMM','STD_CLSS_NM']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,AMT
REG_YYMM,STD_CLSS_NM,Unnamed: 2_level_1
202003,건강보조식품 소매업,232458341
202003,골프장 운영업,124715010
202003,과실 및 채소 소매업,3247185311
202003,관광 민예품 및 선물용품 소매업,7100600
202003,그외 기타 스포츠시설 운영업,690000
202003,그외 기타 종합 소매업,1429581910
202003,기타 대형 종합 소매업,18577150550
202003,기타 수상오락 서비스업,72000
202003,기타 외국식 음식점업,636417220
202003,기타 주점업,114456630
