In [16]:
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from xgboost import plot_importance
from xgboost import XGBRegressor
from sklearn.decomposition import PCA

import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

import os
from os.path import join

# 1. LGBM - GridSearchCV

1. Light GBM의 GBM은 Gradient Boosting Model로, tree를 기반으로 하는 학습 알고리즘으로 여러 모델을 만들어 예측력이 약한 모델에 가중치를 부여해 보다 강한 모델을 만드는 앙상블 모델이다. targe과 예측값의 차이인 error를 반복적으로 학습하는 방식으로 학습이 진행된다.
2. 다른 level wise 방식과 다른 leaf wise tree 분할 방식으로 깊고 비대칭적인 tree를 가지고 있음.
3. 학습이 빠르며 메모리 사용량이 적고 categorical feature를 자동 변환하고 최적 분할을 함.
4. 작은 데이터셋의 경우 과적합 가능성이 큼.(10,000개 이하의 경우)

## 데이터 준비

In [4]:
# 데이터 불러오기
dpath = '../data'
ftrain = 'new2_train.pickle'
ftest = 'new2_test.pickle'

matrix = pd.read_pickle(join(dpath, ftrain))
test = pd.read_pickle(join(dpath, ftest))
submission = pd.read_csv(join(dpath, 'raw/sample_submission.csv'))
df_test = pd.read_csv(join(dpath,'raw/test.csv'))

In [5]:
# 데이터 분할
train = matrix[matrix.date_block_num < 33]
validation = matrix[matrix.date_block_num == 33]

x_train = train.drop(columns='item_cnt_month')
y_train = train[['item_cnt_month']]
x_valid = validation.drop(columns='item_cnt_month')
y_valid = validation[['item_cnt_month']]
x_test = test.drop(columns='item_cnt_month')

print('train_x shape:', x_train.shape)
print('train_y shape:', y_train.shape)
print('validation_x shape:', x_valid.shape)
print('validation_y shape:', y_valid.shape)
print('test_x shape:', x_test.shape)

train_x shape: (545673, 26)
train_y shape: (545673, 1)
validation_x shape: (28680, 26)
validation_y shape: (28680, 1)
test_x shape: (214200, 26)


## 모델링

- LGBM에 최적의 하이퍼파라미터를 탐색하는 도구인 GridSearchCV를 사용하여 학습을 진행함.

In [8]:
from sklearn.model_selection import GridSearchCV

# 그리드 탐색범위 설정
grid_params = {
    'num_leaves': [100,300,600,1000],
    'max_depth':[-1,50,100],
    'min_child_samples':[5,10,15],
}

# 입력데이터셋설정
train_ds = lgb.Dataset(x_train, label = y_train) 
valid_ds = lgb.Dataset(x_valid, label = y_valid) 

# LGBM wrapper + GridSearchCV
lgb_reg = lgb.LGBMRegressor()
clf = GridSearchCV(lgb_reg, grid_params)
clf.fit(x_train, y_train, early_stopping_rounds=100, eval_metric='rmse', eval_set=[(x_train,y_train), (x_valid,y_valid)],
       verbose=50)

[50]	valid_0's rmse: 0.358775	valid_0's l2: 0.128719	valid_1's rmse: 0.367159	valid_1's l2: 0.134806
[100]	valid_0's rmse: 0.340184	valid_0's l2: 0.115725	valid_1's rmse: 0.358405	valid_1's l2: 0.128454
[50]	valid_0's rmse: 0.360622	valid_0's l2: 0.130048	valid_1's rmse: 0.36861	valid_1's l2: 0.135873
[100]	valid_0's rmse: 0.344471	valid_0's l2: 0.118661	valid_1's rmse: 0.365954	valid_1's l2: 0.133923
[50]	valid_0's rmse: 0.360323	valid_0's l2: 0.129832	valid_1's rmse: 0.365617	valid_1's l2: 0.133676
[100]	valid_0's rmse: 0.344464	valid_0's l2: 0.118656	valid_1's rmse: 0.360787	valid_1's l2: 0.130167
[50]	valid_0's rmse: 0.359988	valid_0's l2: 0.129591	valid_1's rmse: 0.367006	valid_1's l2: 0.134694
[100]	valid_0's rmse: 0.341075	valid_0's l2: 0.116332	valid_1's rmse: 0.359607	valid_1's l2: 0.129317
[50]	valid_0's rmse: 0.358469	valid_0's l2: 0.1285	valid_1's rmse: 0.366605	valid_1's l2: 0.1344
[100]	valid_0's rmse: 0.340793	valid_0's l2: 0.11614	valid_1's rmse: 0.359215	valid_1's l2: 

[100]	valid_0's rmse: 0.340329	valid_0's l2: 0.115824	valid_1's rmse: 0.355852	valid_1's l2: 0.126631
[50]	valid_0's rmse: 0.360671	valid_0's l2: 0.130084	valid_1's rmse: 0.370852	valid_1's l2: 0.137531
[100]	valid_0's rmse: 0.344701	valid_0's l2: 0.118819	valid_1's rmse: 0.365773	valid_1's l2: 0.13379
[50]	valid_0's rmse: 0.360542	valid_0's l2: 0.129991	valid_1's rmse: 0.367028	valid_1's l2: 0.13471
[100]	valid_0's rmse: 0.34466	valid_0's l2: 0.118791	valid_1's rmse: 0.362154	valid_1's l2: 0.131155
[50]	valid_0's rmse: 0.359002	valid_0's l2: 0.128883	valid_1's rmse: 0.367897	valid_1's l2: 0.135349
[100]	valid_0's rmse: 0.340875	valid_0's l2: 0.116195	valid_1's rmse: 0.360768	valid_1's l2: 0.130153
[50]	valid_0's rmse: 0.358767	valid_0's l2: 0.128714	valid_1's rmse: 0.366016	valid_1's l2: 0.133968
[100]	valid_0's rmse: 0.341137	valid_0's l2: 0.116374	valid_1's rmse: 0.359439	valid_1's l2: 0.129196
[50]	valid_0's rmse: 0.333951	valid_0's l2: 0.111523	valid_1's rmse: 0.358883	valid_1's l

[50]	valid_0's rmse: 0.36173	valid_0's l2: 0.130849	valid_1's rmse: 0.369579	valid_1's l2: 0.136588
[100]	valid_0's rmse: 0.34431	valid_0's l2: 0.11855	valid_1's rmse: 0.364921	valid_1's l2: 0.133168
[50]	valid_0's rmse: 0.360464	valid_0's l2: 0.129934	valid_1's rmse: 0.365571	valid_1's l2: 0.133642
[100]	valid_0's rmse: 0.345125	valid_0's l2: 0.119112	valid_1's rmse: 0.362313	valid_1's l2: 0.131271
[50]	valid_0's rmse: 0.359713	valid_0's l2: 0.129394	valid_1's rmse: 0.367936	valid_1's l2: 0.135377
[100]	valid_0's rmse: 0.341006	valid_0's l2: 0.116285	valid_1's rmse: 0.359627	valid_1's l2: 0.129332
[50]	valid_0's rmse: 0.35771	valid_0's l2: 0.127956	valid_1's rmse: 0.367078	valid_1's l2: 0.134746
[100]	valid_0's rmse: 0.340394	valid_0's l2: 0.115868	valid_1's rmse: 0.361488	valid_1's l2: 0.130674
[50]	valid_0's rmse: 0.334328	valid_0's l2: 0.111776	valid_1's rmse: 0.362938	valid_1's l2: 0.131724
[100]	valid_0's rmse: 0.313642	valid_0's l2: 0.0983715	valid_1's rmse: 0.357429	valid_1's l

[100]	valid_0's rmse: 0.344471	valid_0's l2: 0.118661	valid_1's rmse: 0.365954	valid_1's l2: 0.133923
[50]	valid_0's rmse: 0.360323	valid_0's l2: 0.129832	valid_1's rmse: 0.365617	valid_1's l2: 0.133676
[100]	valid_0's rmse: 0.344464	valid_0's l2: 0.118656	valid_1's rmse: 0.360787	valid_1's l2: 0.130167
[50]	valid_0's rmse: 0.359988	valid_0's l2: 0.129591	valid_1's rmse: 0.367006	valid_1's l2: 0.134694
[100]	valid_0's rmse: 0.341075	valid_0's l2: 0.116332	valid_1's rmse: 0.359607	valid_1's l2: 0.129317
[50]	valid_0's rmse: 0.358469	valid_0's l2: 0.1285	valid_1's rmse: 0.366605	valid_1's l2: 0.1344
[100]	valid_0's rmse: 0.340793	valid_0's l2: 0.11614	valid_1's rmse: 0.359215	valid_1's l2: 0.129036
[50]	valid_0's rmse: 0.333095	valid_0's l2: 0.110952	valid_1's rmse: 0.36438	valid_1's l2: 0.132773
[100]	valid_0's rmse: 0.31252	valid_0's l2: 0.0976686	valid_1's rmse: 0.36052	valid_1's l2: 0.129975
[50]	valid_0's rmse: 0.337019	valid_0's l2: 0.113582	valid_1's rmse: 0.365795	valid_1's l2: 0

[50]	valid_0's rmse: 0.360542	valid_0's l2: 0.129991	valid_1's rmse: 0.367028	valid_1's l2: 0.13471
[100]	valid_0's rmse: 0.34466	valid_0's l2: 0.118791	valid_1's rmse: 0.362154	valid_1's l2: 0.131155
[50]	valid_0's rmse: 0.359002	valid_0's l2: 0.128883	valid_1's rmse: 0.367897	valid_1's l2: 0.135349
[100]	valid_0's rmse: 0.340875	valid_0's l2: 0.116195	valid_1's rmse: 0.360768	valid_1's l2: 0.130153
[50]	valid_0's rmse: 0.358767	valid_0's l2: 0.128714	valid_1's rmse: 0.366016	valid_1's l2: 0.133968
[100]	valid_0's rmse: 0.341137	valid_0's l2: 0.116374	valid_1's rmse: 0.359439	valid_1's l2: 0.129196
[50]	valid_0's rmse: 0.333951	valid_0's l2: 0.111523	valid_1's rmse: 0.358883	valid_1's l2: 0.128797
[100]	valid_0's rmse: 0.312759	valid_0's l2: 0.0978183	valid_1's rmse: 0.35326	valid_1's l2: 0.124793
[50]	valid_0's rmse: 0.338592	valid_0's l2: 0.114644	valid_1's rmse: 0.36785	valid_1's l2: 0.135314
[100]	valid_0's rmse: 0.319429	valid_0's l2: 0.102035	valid_1's rmse: 0.362885	valid_1's l

GridSearchCV(estimator=LGBMRegressor(),
             param_grid={'max_depth': [-1, 50, 100],
                         'min_child_samples': [5, 10, 15],
                         'num_leaves': [100, 300, 600, 1000]})

In [9]:
# 최적파라미터 출력
print('best params :',clf.best_params_)

best params : {'max_depth': 50, 'min_child_samples': 5, 'num_leaves': 600}


In [21]:
# feature importance 출력
importance = clf.best_estimator_.feature_importances_
importance_name = clf.best_estimator_.feature_name_
df_importance = pd.DataFrame(dict(name=importance_name, value=importance)).sort_values('value', ascending=False)
df_importance

Unnamed: 0,name,value
2,item_id,7775
5,date_item_avg_item_price,5481
3,item_price,5435
4,item_avg_item_price,4986
0,date_block_num,4560
1,shop_id,3640
9,delta_price_lag_1,3617
12,month,3139
13,item_cnt_month_lag_1,2693
10,delta_price_lag_2,2519


In [22]:
fig = px.bar_polar(df_importance, r="value", theta="name",
                   color="name", template="plotly_dark",
                   color_discrete_sequence= px.colors.sequential.Plasma_r)
fig.show()

In [23]:
# 예측값 저장
pred_test = np.expm1(clf.predict(x_test)).clip(0,20)
x_test['item_cnt_month'] = pred_test

result = pd.merge(df_test, x_test[['shop_id', 'item_id', 'item_cnt_month']], on=['shop_id', 'item_id'], how='left')
submission['item_cnt_month'] = result['item_cnt_month']
submission.to_csv('./submission_gridsearchcv.csv', index=False)

# 2. XGBoost

## 데이터 로드

- 기존 데이터 전처리에 추가로 robust scaling을 적용한 데이터를 사용함.

In [54]:
with open("../data/robust_train.pickle","rb") as fr:
    data = pickle.load(fr)

with open("../data/robust_test.pickle","rb") as fr:
    test = pickle.load(fr)
test = test.drop('item_cnt_month', axis=1)

## 데이터 분리
X = data.drop(['item_cnt_month'], axis=1)
y = data['item_cnt_month']

## 데이터 준비 : PCA

- 이에 추가로 5개의 주성분으로 데이터를 압축하여 학습 데이터로 사용함.

In [55]:
## PCA 주성분 분석
pca = PCA(n_components=5)
printcipalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data=printcipalComponents, columns = ['1', '2', '3',
                                                                 '4', '5'])

# explained_variance_ratio_는 고유값이며 설명가능한 분산량을 의미한다.
# 5개의 주성분으로 95% 이상 설명력을 가지는 것을 확인
pca.explained_variance_ratio_
pca.explained_variance_ratio_[0]

X = principalDf
# 테스트 데이터도 동일한 5개 차원으로 축소
printcipalComponents = pca.fit_transform(test)
principalDf = pd.DataFrame(data=printcipalComponents, columns = ['1', '2', '3',
                                                                 '4', '5'])


In [56]:
pca.explained_variance_

array([3.90955645e+07, 3.08433053e+02, 1.41824510e+01, 4.81717989e+00,
       4.45234147e+00])

In [57]:
print(pca.explained_variance_)
print('데이터 설명력 :', sum(pca.explained_variance_ratio_))

[3.90955645e+07 3.08433053e+02 1.41824510e+01 4.81717989e+00
 4.45234147e+00]
데이터 설명력 : 0.9999998684414434


In [59]:
test = principalDf
test

Unnamed: 0,1,2,3,4,5
0,-5982.398791,29.639678,2.883212,2.560316,0.156043
1,-5699.398547,29.641790,-1.491292,0.234233,-0.412732
2,-5786.398794,29.639228,3.327696,0.574977,0.323342
3,-5787.398781,29.639646,3.151957,0.385571,0.191010
4,-5751.398569,29.641375,-1.379968,0.171409,-0.332244
...,...,...,...,...,...
214195,7434.601429,-4.357685,-0.763538,0.116467,-0.238025
214196,5168.601377,-4.357951,-0.248212,-0.360647,0.135197
214197,4737.601432,-4.357663,-0.887990,0.118251,-0.260485
214198,8628.601432,-4.357670,-0.759226,0.154814,-0.260231


## 모델링

- XGB 모델을 생성 후 학습.

In [60]:
## 모델 생성 및 학습
xgb = XGBRegressor(n_estimators=200, max_depth=12, learning_rate=0.05, subsample=0.7,
                   colsample_bytree = 0.7, random_state = 42)
xgb.fit(X, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=12, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=200, n_jobs=6,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

- 예측값 생성

In [61]:
pred = xgb.predict(test)
pred = pd.DataFrame(pred)
pred.columns = ['item_cnt_month']
temp = [i for i in range(0,214200,1)]
pred['ID'] = temp
pred = pred[['ID', 'item_cnt_month']]
pred.to_csv('./robust_submission.csv', index=False)

pred

Unnamed: 0,ID,item_cnt_month
0,0,1.006377
1,1,0.923591
2,2,0.873031
3,3,1.013940
4,4,0.966906
...,...,...
214195,214195,0.762519
214196,214196,0.720578
214197,214197,0.810646
214198,214198,0.761775


- feature importance 확인

In [63]:
## 피처 importance 확인
importance = xgb.feature_importances_
importance_name = ['PCA%d'%i for i in range(1,6)]
df_importance = pd.DataFrame(dict(name=importance_name, value=importance)).sort_values('value', ascending=False)
df_importance

Unnamed: 0,name,value
0,PCA1,0.388992
4,PCA5,0.197942
2,PCA3,0.150909
3,PCA4,0.14527
1,PCA2,0.116888


In [64]:
fig = px.bar_polar(df_importance, r="value", theta="name",
                   color="name", template="plotly_dark",
                   color_discrete_sequence= px.colors.sequential.Plasma_r)
fig.show()