In [None]:
# conda install -c conda-forge bayesian-optimization
# conda install -c conda-forge xgboost

In [1]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization
# from xgboost import XGBClassifier

In [7]:
lgb.__version__

'2.3.1'

In [2]:
# Reading the saved dtypes Series
final_df_dtypes = \
pd.read_csv('../../../BDSE12-Group3/datasets/homecdt_ss_output/ss_fteng_fromBDSE12_03G_HomeCredit_V2_20200204a_dtypes_series.csv'\
            , header=None, index_col=0, squeeze=True)
del final_df_dtypes.index.name
final_df_dtypes = final_df_dtypes.to_dict()

final_df = \
pd.read_csv('../../../BDSE12-Group3/datasets/homecdt_ss_output/ss_fteng_fromBDSE12_03G_HomeCredit_V2_20200204a.csv'\
           , dtype= final_df_dtypes)

In [3]:
final_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in final_df.columns]
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356255 entries, 0 to 356254
Columns: 4081 entries, SK_ID_CURR to GOODS_PRICE_PREV__na
dtypes: float64(543), int64(4), uint8(3534)
memory usage: 2.6 GB


In [4]:
df = final_df
del final_df
gc.collect()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356255 entries, 0 to 356254
Columns: 4081 entries, SK_ID_CURR to GOODS_PRICE_PREV__na
dtypes: float64(543), int64(4), uint8(3534)
memory usage: 2.6 GB


In [5]:
df.index.size

356255

---

## LGBM

In [6]:
gc.collect()

44

In [8]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')
    
    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])
    params['min_data_in_leaf'] = int(params['min_data_in_leaf'])
    params['max_bin'] = int(params['max_bin'])
    
        
    clf = LGBMClassifier(**params, 
                         n_estimators = 2000,
                         nthread = 2, 
                         boosting_type='goss', 
                         objective='binary')

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    folds = StratifiedKFold(n_splits= 5, shuffle=True, random_state=1001)
        
    test_pred_proba = np.zeros(train_df.shape[0])
    
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = False, early_stopping_rounds = 200)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(train_df['TARGET'], test_pred_proba)

In [9]:
init_time = time.time()
params = {'learning_rate': (.01, .1), 
          'num_leaves': (21, 99), 
          'subsample': (0.6, 1), 
          'max_depth': (5, 31), 
          'reg_alpha': (.00, 1.0), 
          'reg_lambda': (.00, 1.0), 
          'min_split_gain': (.01, .1),
          'min_child_weight': (20, 70),
          'min_data_in_leaf': (20, 70),
          'max_bin': (55, 255)}
bo = BayesianOptimization(lgbm_evaluate, params)
bo.maximize(init_points = 10, n_iter = 10)
print("Elapsed time={:5.2f} sec.".format(time.time() - init_time))

|   iter    |  target   | learni... |  max_bin  | max_depth | min_ch... | min_da... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7893  [0m | [0m 0.04583 [0m | [0m 75.66   [0m | [0m 26.94   [0m | [0m 29.06   [0m | [0m 44.89   [0m | [0m 0.02358 [0m | [0m 77.64   [0m | [0m 0.6871  [0m | [0m 0.4597  [0m | [0m 0.9086  [0m |
| [95m 2       [0m | [95m 0.7899  [0m | [95m 0.04446 [0m | [95m 108.9   [0m | [95m 21.68   [0m | [95m 37.8    [0m | [95m 38.26   [0m | [95m 0.06275 [0m | [95m 91.49   [0m | [95m 0.2459  [0m | [95m 0.02489 [0m | [95m 0.8439  [0m |
| [0m 3       [0m | [0m 0.7896  [0m | [0m 0.06406 [0m | [0m 73.8    [0m | [0m 14.23   [0m | [0m 62.27   [0m | [0m 66.17   [0m | [0m 0.08437 [0m | [0m 56.18   [0m | [0m 0.1512  [0m | [0m 0.334

In [25]:
bo.res[8]

{'target': 0.7932865050364686,
 'params': {'learning_rate': 0.01470849186434721,
  'max_bin': 142.66101896998015,
  'max_depth': 27.91451741057181,
  'min_child_weight': 31.702341834307422,
  'min_data_in_leaf': 50.05967985991292,
  'min_split_gain': 0.09320570777639621,
  'num_leaves': 44.20174344127514,
  'reg_alpha': 0.7786108741002781,
  'reg_lambda': 0.3782185675928136,
  'subsample': 0.9556663511637553}}

In [26]:
params_list = bo.res
len(params_list)

20

In [27]:
params_list

[{'target': 0.7892582282838362,
  'params': {'learning_rate': 0.04583323724573647,
   'max_bin': 75.66130204764131,
   'max_depth': 26.93670677339479,
   'min_child_weight': 29.061668992146714,
   'min_data_in_leaf': 44.88827492307174,
   'min_split_gain': 0.023580215858223265,
   'num_leaves': 77.63678953875991,
   'reg_alpha': 0.6870880026815223,
   'reg_lambda': 0.4597406888758858,
   'subsample': 0.9085642187782049}},
 {'target': 0.7898685715925247,
  'params': {'learning_rate': 0.044464138824737494,
   'max_bin': 108.9368801273189,
   'max_depth': 21.68090781985766,
   'min_child_weight': 37.802188981916316,
   'min_data_in_leaf': 38.256660463929066,
   'min_split_gain': 0.06274854280393752,
   'num_leaves': 91.48754191256563,
   'reg_alpha': 0.2459353550478498,
   'reg_lambda': 0.024891892399810822,
   'subsample': 0.8439197142804309}},
 {'target': 0.7895794327297585,
  'params': {'learning_rate': 0.06406119063087498,
   'max_bin': 73.80411142032176,
   'max_depth': 14.2316851044

In [28]:
# no need for looping
# params_list = [bo.res[i] for i in range(10)]

In [29]:
# 輸出參數檔案
import json
with open('../../../BDSE12-Group3/datasets/homecdt_ss_output/arams_list_BayesOpt_20200206a.txt', 'w', encoding='utf-8') as fout:
    for params in params_list:
        json.dump(params, fout) 
        fout.write("\n")

---

In [30]:
# 讀取參數檔案
with open('../../../BDSE12-Group3/datasets/homecdt_ss_output/params_list_BayesOpt_20200206a.txt', 'r', encoding='utf-8') as f:
    params_list_read = list(map(json.loads,f))

In [31]:
params_list_read[8]

{'target': 0.7932865050364686,
 'params': {'learning_rate': 0.01470849186434721,
  'max_bin': 142.66101896998015,
  'max_depth': 27.91451741057181,
  'min_child_weight': 31.702341834307422,
  'min_data_in_leaf': 50.05967985991292,
  'min_split_gain': 0.09320570777639621,
  'num_leaves': 44.20174344127514,
  'reg_alpha': 0.7786108741002781,
  'reg_lambda': 0.3782185675928136,
  'subsample': 0.9556663511637553}}

In [32]:
type(params_list_read[8])

dict

In [None]:
init_time = time.time()
params = {'learning_rate': (.0, .1), 
          'num_leaves': (20, 100), 
          'subsample': (.0, 1.0), 
          'max_depth': (6, 9), 
          'reg_alpha': (.00, 1.0), 
          'reg_lambda': (.00, 1.0), 
          'min_split_gain': (.0, .1),
          'min_child_weight': (20, 70)}
bo = BayesianOptimization(lgbm_evaluate, params)
bo.maximize(init_points = 5, n_iter = 10)
print("Elapsed time={:5.2f} sec.".format(time.time() - init_time))

---

## XGboost

In [None]:
def xgb_evaluate(**params):
    warnings.simplefilter('ignore')
    
    params['max_depth'] = int(params['max_depth'])
        
    clf = XGBClassifier(**params, 
                        n_estimators = 2000, 
                        nthread = 5, 
                        objective= 'binary:logistic')

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    folds = StratifiedKFold(n_splits= 5, shuffle=True, random_state=1001)
        
    test_pred_proba = np.zeros(train_df.shape[0])
    
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = False, early_stopping_rounds = 100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(train_df['TARGET'], test_pred_proba)

In [None]:
init_time = time.time()
params = {'learning_rate': (.01, .03), 
          'subsample': (.0, 1.0), 
          'max_depth': (4, 9), 
          'reg_alpha': (.0, 1.0), 
          'reg_lambda': (.0, 1.0), 
          'scale_pos_weight': (.0, 5.0),
          'colsample_bytree': (.0, 1.0)}
bo = BayesianOptimization(xgb_evaluate, params)
bo.maximize(init_points = 5, n_iter = 5)
print("Elapsed time={:5.2f} sec.".format(time.time() - init_time))