In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## 분류 실습 : XGBoost와 LightGBM을 이용하여 고객만족 예측

#### 예제 데이터
- Kaggle의 산탄데르 고객만족(Santander Customer Satisfaction) 데이터 세트
- 산탄데르 은행이 캐글에 의뢰한 데이터
- https://www.kaggle.com/c/santander-customer-satisfaction/data
- features : 370개, 모두 익명 처리
- target : 1이면 불만, 0이면 만족
- 모델의 성능평가:  ROC-AUC

#### 데이터 준비 및 파악

In [3]:
df = pd.read_csv('data/santander/train.csv', encoding='latin-1')
print(f'datasets shape: {df.shape}')
df.head(3)

datasets shape: (76020, 371)


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0


In [5]:
df.TARGET.value_counts()

TARGET
0    73012
1     3008
Name: count, dtype: int64

In [7]:
print(f'불만족 비율: {df[df.TARGET == 1].TARGET.count() / df.TARGET.count() * 100:.2f}')

불만족 비율: 3.96


In [8]:
df.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


#### 데이터 전처리

- 결측치 처리 / 피처 삭제

In [11]:
df.var3.value_counts()

var3
 2         74165
 8           138
-999999      116
 9           110
 3           108
           ...  
 231           1
 188           1
 168           1
 135           1
 87            1
Name: count, Length: 208, dtype: int64

In [12]:
df.var3.replace(-999999, 2, inplace=True)
df.var3.value_counts()

var3
2      74281
8        138
9        110
3        108
1        105
       ...  
231        1
188        1
168        1
135        1
87         1
Name: count, Length: 207, dtype: int64

In [13]:
df.columns

Index(['ID', 'var3', 'var15', 'imp_ent_var16_ult1', 'imp_op_var39_comer_ult1',
       'imp_op_var39_comer_ult3', 'imp_op_var40_comer_ult1',
       'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1',
       'imp_op_var40_efect_ult3',
       ...
       'saldo_medio_var33_hace2', 'saldo_medio_var33_hace3',
       'saldo_medio_var33_ult1', 'saldo_medio_var33_ult3',
       'saldo_medio_var44_hace2', 'saldo_medio_var44_hace3',
       'saldo_medio_var44_ult1', 'saldo_medio_var44_ult3', 'var38', 'TARGET'],
      dtype='object', length=371)

In [14]:
df.drop('ID', axis=1, inplace=True)

In [15]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

#### 학습/테스트 데이터 준비

In [29]:
from sklearn.model_selection import train_test_split, cross_val_score

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2,
                                                   random_state=0)

In [20]:
print(f'학습데이터: {y_train.value_counts()/y_train.count()*100}\n')
print(f'테스트데이터: {y_test.value_counts()/y_test.count()*100}')

학습데이터: TARGET
0    96.096422
1     3.903578
Name: count, dtype: float64

테스트데이터: TARGET
0    95.830045
1     4.169955
Name: count, dtype: float64


#### 학습/검증 데이터 세트

In [21]:
x_tr, x_val, y_tr, y_val = train_test_split(x_train,y_train, test_size=0.3,
                                                   random_state=0)

### XGBoost 모델 학습과 하이퍼 파라미터 튜닝

In [22]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [23]:
# 모델 생성
xgb_clf = XGBClassifier(n_estimators=500, learning_rate=0.05, random_state=156)

# 학습 / early_stopping_rounds -> 조기중단 / 조기 중단은 검증 세트의 평가 지표가 지정된 라운드 동안 개선되지 않을 때 발생
xgb_clf.fit(x_train, y_train, early_stopping_rounds=100,eval_metric='auc',
           eval_set=[(x_tr,y_tr),(x_val,y_val)])

# 예측 / 평가
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(x_test)[:,1])
print(f'ROC AUC: {xgb_roc_score:.4f}')

[0]	validation_0-auc:0.83869	validation_1-auc:0.83603
[1]	validation_0-auc:0.84000	validation_1-auc:0.83881
[2]	validation_0-auc:0.84026	validation_1-auc:0.83848
[3]	validation_0-auc:0.84016	validation_1-auc:0.83998
[4]	validation_0-auc:0.84162	validation_1-auc:0.84068
[5]	validation_0-auc:0.84435	validation_1-auc:0.84119
[6]	validation_0-auc:0.84545	validation_1-auc:0.84260
[7]	validation_0-auc:0.84609	validation_1-auc:0.84416
[8]	validation_0-auc:0.84772	validation_1-auc:0.84558
[9]	validation_0-auc:0.84825	validation_1-auc:0.84570
[10]	validation_0-auc:0.84888	validation_1-auc:0.84622
[11]	validation_0-auc:0.84935	validation_1-auc:0.84663
[12]	validation_0-auc:0.84961	validation_1-auc:0.84666
[13]	validation_0-auc:0.84976	validation_1-auc:0.84696
[14]	validation_0-auc:0.85081	validation_1-auc:0.84708
[15]	validation_0-auc:0.85326	validation_1-auc:0.84850
[16]	validation_0-auc:0.85464	validation_1-auc:0.84944
[17]	validation_0-auc:0.85532	validation_1-auc:0.84983
[18]	validation_0-au

#### 검색 공간 설정

In [31]:
from hyperopt import hp

xgb_search_space = {'max_depth': hp.quniform('max_depth',5,15,1),
                   'min_child_weight':hp.quniform('min_child_weight',1,6,1),
                   'colsample_bytree':hp.uniform('colsample_bytree',0.5,0.95),
                   'learning_rate':hp.uniform('learning_rate',0.01,0.2)}

#### 목적 함수 설정

In [None]:
from hyperopt import fmin, tpe, Trials
def objective_func(search_space):
    xgb_clf = XGBClassifier(n_estimators=100,
                            max_depth= int(search_space['max_depth']),
               min_child_weight= int(search_space['min_child_weight']),
                    learning_rate=search_space['learning_rate'],
               colsample_bytree=search_space['colsample_bytree'],
                           eval_metric='logloss')
    roc_auc = cross_val_score(xgb_clf, x_train, y_train, cv=3, scoring='roc_auc')

    return {'loss': -1*np.mean(roc_auc), 'status':STATUS_OK}

#### fmin()함수를 사용하여 최적 파라미터 추출

In [38]:
from hyperopt import STATUS_OK
trial_val = Trials()

best = fmin(fn=objective_func, space=xgb_search_space,algo=tpe.suggest,
           max_evals= 50,trials= trial_val,rstate= np.random.default_rng(seed=9))
print(f'best: {best}')

100%|█████████████████████████████████████████████████| 50/50 [22:51<00:00, 27.43s/trial, best loss: -0.83811339720573]
best: {'colsample_bytree': 0.6989263607502518, 'learning_rate': 0.07116170670082439, 'max_depth': 5.0, 'min_child_weight': 6.0}


In [39]:
roc_auc = [loss_dict['loss']*(-1) for loss_dict in trial_val.results]
result_df = pd.DataFrame(trial_val.vals)
result_df['ROC_AUC'] = roc_auc
result_df

Unnamed: 0,colsample_bytree,learning_rate,max_depth,min_child_weight,ROC_AUC
0,0.576711,0.033688,14.0,4.0,0.827991
1,0.704468,0.105956,5.0,4.0,0.836135
2,0.91395,0.154804,5.0,5.0,0.83062
3,0.905011,0.120686,6.0,6.0,0.832904
4,0.656903,0.142392,12.0,5.0,0.817836
5,0.827397,0.106579,7.0,4.0,0.832388
6,0.911769,0.079111,11.0,4.0,0.82746
7,0.675516,0.095213,14.0,4.0,0.822152
8,0.665998,0.14752,8.0,6.0,0.828043
9,0.582904,0.081179,7.0,2.0,0.83465


In [None]:
pd.Series(best)

### 최적으로 찾은 하이퍼파라미터로 학습과 예측

In [None]:
xgb_best_clf = XGBClassifier(n_estimators=500,
                            max_depth= int(best['max_depth']),
               min_child_weight= int(best['min_child_weight']),
                    learning_rate=round(best['learning_rate'],5),
               colsample_bytree=round(best['colsample_bytree'],5))

xgb_best_clf.fit(x_tr, y_tr, early_stopping_rounds=100,
                eval_metric='auc', eval_set=[(x_tr,y_tr),(x_val, y_val)]) 

pred_proba = xgb_best_clf.predict_proba(x_test)[:,1]
print(f'ROC_AUC: {roc_auc_score(y_test, pred_proba):.4f}')

### 피처 중요도 시각화

In [None]:
from xgboost import plot_importance

fig, ax = plt.subplots(1,1, figsize=(10,8))
plot_importance(xgb_clf, ax=ax, max_num_features=20, height=0.4)
plt.show()

#### 목적 함수 설정

- 조기 중단을 위해 KFold 사용

In [59]:
# kfold를 이용하여 조기중단 적용하기
from sklearn.model_selection import KFold

def objective_func2(search_space):
    xgb_clf = XGBClassifier(n_estimators=100,
                            max_depth= int(search_space['max_depth']),
               min_child_weight= int(search_space['min_child_weight']),
                    learning_rate=search_space['learning_rate'],
               colsample_bytree=search_space['colsample_bytree'])
    scores = []
    kf = KFold(n_splits=3)
    for tr_idx, val_idx in kf.split(x_train):
        x_tr, y_tr = x_train.iloc[tr_idx], y_train.iloc[tr_idx]
        x_val, y_val = x_train.iloc[val_idx], y_train.iloc[val_idx]
        xgb_clf.fit(x_tr, y_tr, eval_metric='auc', eval_set=[(x_tr,y_tr),(x_val,y_val)],
                   early_stopping_rounds=30)
        score = roc_auc_score(y_val, xgb_clf.predict_proba(x_val)[:,1])
        scores.append(score)
        
    return -1 * np.mean(scores)

In [60]:
# kfold를 이용하여 조기중단 적용하기
trial_val2 = Trials()

best2 = fmin(fn=objective_func2, space=xgb_search_space,algo=tpe.suggest,
           max_evals= 50,trials= trial_val2,rstate= np.random.default_rng(seed=30))
print(f'조기중단 적용 best: {best2}')

[0]	validation_0-auc:0.76925	validation_1-auc:0.73368                                                                  
[1]	validation_0-auc:0.77931	validation_1-auc:0.74099                                                                  
[2]	validation_0-auc:0.78206	validation_1-auc:0.74516                                                                  
[3]	validation_0-auc:0.82632	validation_1-auc:0.79329                                                                  
[4]	validation_0-auc:0.82410	validation_1-auc:0.79134                                                                  
[5]	validation_0-auc:0.83679	validation_1-auc:0.80348                                                                  
[6]	validation_0-auc:0.84272	validation_1-auc:0.80982                                                                  
[7]	validation_0-auc:0.84774	validation_1-auc:0.81687                                                                  
[8]	validation_0-auc:0.84653	validation_

KeyboardInterrupt: 

In [None]:
roc_auc = [loss_dict['loss']*(-1) for loss_dict in trial_val.results]
result_df2 = pd.DataFrame(trial_val2.vals)
result_df2['ROC_AUC'] = roc_auc
result_df2

In [None]:
pd.Series(best2)

#### 최적으로 찾은 하이퍼파라미터로 학습과 예측

In [40]:
xgb_best_clf2 = XGBClassifier(n_estimators=500,
                            max_depth= int(best['max_depth']),
               min_child_weight= int(best['min_child_weight']),
                    learning_rate=round(best['learning_rate'],5),
               colsample_bytree=round(best['colsample_bytree'],5))

xgb_best_clf2.fit(x_tr, y_tr, early_stopping_rounds=100,
                eval_metric='auc', eval_set=[(x_tr,y_tr),(x_val, y_val)]) 

pred_proba2 = xgb_best_clf2.predict_proba(x_test)[:,1]
print(f'ROC_AUC: {roc_auc_score(y_test, pred_proba2):.4f}')

[0]	validation_0-auc:0.75616	validation_1-auc:0.72423
[1]	validation_0-auc:0.76454	validation_1-auc:0.73411
[2]	validation_0-auc:0.77096	validation_1-auc:0.74296
[3]	validation_0-auc:0.81583	validation_1-auc:0.79552
[4]	validation_0-auc:0.81450	validation_1-auc:0.79190
[5]	validation_0-auc:0.82393	validation_1-auc:0.80398
[6]	validation_0-auc:0.82918	validation_1-auc:0.81072
[7]	validation_0-auc:0.83345	validation_1-auc:0.81515
[8]	validation_0-auc:0.83204	validation_1-auc:0.81335
[9]	validation_0-auc:0.83725	validation_1-auc:0.81638
[10]	validation_0-auc:0.83566	validation_1-auc:0.81427
[11]	validation_0-auc:0.83985	validation_1-auc:0.81786
[12]	validation_0-auc:0.83782	validation_1-auc:0.81511
[13]	validation_0-auc:0.83709	validation_1-auc:0.81494
[14]	validation_0-auc:0.84035	validation_1-auc:0.81903
[15]	validation_0-auc:0.83945	validation_1-auc:0.81775
[16]	validation_0-auc:0.84203	validation_1-auc:0.82006
[17]	validation_0-auc:0.84198	validation_1-auc:0.81887
[18]	validation_0-au

#### 피처 중요도 시각화

In [None]:
from xgboost import plot_importance

fig, ax = plt.subplots(1,1, figsize=(10,8))
plot_importance(xgb_clf, ax=ax, max_num_features=20, height=0.4)
plt.show()

### LightGBM 모델 학습과 하이퍼 파라미터 튜닝

In [50]:
# x_train을 x_tr과 x_val로 나누는 이유 -> tr로 학습을 하면서 val로 조기중단 조건이 되는지 확인
# -> 조기중단을 하기 위한 조건이 되는지 확인하기 위해 _val이 필요하다.
from lightgbm import LGBMClassifier, early_stopping

lgbm_clf = LGBMClassifier(n_estimators=500)

lgbm_clf.fit(x_tr, y_tr, callbacks=[early_stopping(stopping_rounds=100)],
            eval_metric='auc', eval_set=[(x_tr,y_tr),(x_val,y_val)])

lgbm_roc_auc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(x_test)[:,1])
print(f'ROC_AUC: {lgbm_roc_auc_score:.4f}')

[LightGBM] [Info] Number of positive: 1658, number of negative: 40913
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.177812 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13308
[LightGBM] [Info] Number of data points in the train set: 42571, number of used features: 242
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038947 -> initscore=-3.205836
[LightGBM] [Info] Start training from score -3.205836
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	training's auc: 0.91059	training's binary_logloss: 0.112183	valid_1's auc: 0.831787	valid_1's binary_logloss: 0.13527
ROC_AUC: 0.8384


#### 검색 공간 설정

In [51]:
lgbm_search_space = {'max_depth': hp.quniform('max_depth',100, 160, 1),
                   'min_child_samples':hp.quniform('min_child_samples', 60, 100, 1),
                   'num_leaves':hp.quniform('num_leaves',32, 64, 1),
                   'subsample':hp.uniform('subsample', 0.7, 1),
                    'learning_rate':hp.uniform('learning_rate', 0.01, 0.2)}

#### 목적 함수 설정

In [54]:
def objective_func_lgbm(search_space):
    lgbm_clf = LGBMClassifier(n_estimators=100, 
                              max_depth= int(search_space['max_depth']),
                           min_child_samples= int(search_space['min_child_samples']),
                           num_leaves= int(search_space['num_leaves']),
                           subsample= search_space['subsample'],
                            learning_rate= search_space['learning_rate'])

    scores = []
    kf = KFold(n_splits=3)
    for tr_idx, val_idx in kf.split(x_train):
        x_tr, y_tr = x_train.iloc[tr_idx], y_train.iloc[tr_idx]
        x_val, y_val = x_train.iloc[val_idx], y_train.iloc[val_idx]
        lgbm_clf.fit(x_tr, y_tr, eval_metric='auc', eval_set=[(x_tr,y_tr),(x_val,y_val)],
                   callbacks=[early_stopping(stopping_rounds=30)])
        score = roc_auc_score(y_val, lgbm_clf.predict_proba(x_val)[:,1])
        scores.append(score)
        
    return -1 * np.mean(scores)

#### fmin()함수로 최적 파라미터 추출

In [56]:
trial_val_lgbm = Trials()

best_lgbm = fmin(fn=objective_func_lgbm, 
                 space=lgbm_search_space,
                 algo=tpe.suggest,
                   max_evals= 50,
                 trials= trial_val_lgbm,
                 rstate= np.random.default_rng(seed=30))
print(f'LGBM best: {best_lgbm}')

[LightGBM] [Info] Number of positive: 1579, number of negative: 38965                                                  
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073289 seconds.                
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12827                                                                                     
[LightGBM] [Info] Number of data points in the train set: 40544, number of used features: 192                          
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038945 -> initscore=-3.205872                                        
[LightGBM] [Info] Start training from score -3.205872                                                                  
Training until validation scores don't improve for 30 rounds                                                           
Did not meet early stopping. Best iter

#### 최적의 하이퍼파라미터로 학습 및 예측

In [57]:
lgbm_best_clf = LGBMClassifier(n_estimators=100, 
                              max_depth= int(best_lgbm['max_depth']),
                           min_child_samples= int(best_lgbm['min_child_samples']),
                           num_leaves= int(best_lgbm['num_leaves']),
                           subsample= best_lgbm['subsample'],
                            learning_rate= best_lgbm['learning_rate'])

lgbm_best_clf.fit(x_tr, y_tr, eval_metric='auc', 
                  eval_set=[(x_tr,y_tr),(x_val,y_val)],
               callbacks=[early_stopping(stopping_rounds=100)])
lgbm_best_score = roc_auc_score(y_test, lgbm_best_clf.predict_proba(x_test)[:,1])
print(f'ROC_AUC: {lgbm_best_score:.4f}')

[LightGBM] [Info] Number of positive: 1658, number of negative: 40913
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13001
[LightGBM] [Info] Number of data points in the train set: 42571, number of used features: 202
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.038947 -> initscore=-3.205836
[LightGBM] [Info] Start training from score -3.205836
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[77]	training's auc: 0.899118	training's binary_logloss: 0.117171	valid_1's auc: 0.833934	valid_1's binary_logloss: 0.13474
ROC_AUC: 0.8412


----