In [None]:
# conda install -c conda-forge bayesian-optimization
# conda install -c conda-forge xgboost

In [1]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization
# from xgboost import XGBClassifier

In [2]:
lgb.__version__

'2.3.1'

In [3]:
# Reading the saved dtypes Series
final_df_dtypes = \
pd.read_csv('../../../BDSE12-Group3/datasets/homecdt_ss_output/ss_fteng_fromBDSE12_03G_HomeCredit_V2_le_20200210a_dtypes_series.csv'\
            , header=None, index_col=0, squeeze=True)
del final_df_dtypes.index.name
final_df_dtypes = final_df_dtypes.to_dict()

final_df = \
pd.read_csv('../../../BDSE12-Group3/datasets/homecdt_ss_output/ss_fteng_fromBDSE12_03G_HomeCredit_V2_le_20200210a.csv'\
           , dtype= final_df_dtypes)

In [4]:
final_df.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in final_df.columns]
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356255 entries, 0 to 356254
Columns: 1325 entries, SK_ID_CURR to GOODS_PRICE_PREV__na
dtypes: float64(543), int64(254), uint8(528)
memory usage: 2.3 GB


In [5]:
df = final_df
del final_df
gc.collect()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356255 entries, 0 to 356254
Columns: 1325 entries, SK_ID_CURR to GOODS_PRICE_PREV__na
dtypes: float64(543), int64(254), uint8(528)
memory usage: 2.3 GB


In [6]:
df['TARGET'].value_counts()

0.0    282686
1.0     24825
Name: TARGET, dtype: int64

In [7]:
# scale_pos_weight 
282686 / 24825

11.387150050352467

---

## LGBM

In [8]:
gc.collect()

66

In [14]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')
    
    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])
    params['min_child_samples'] = int(params['min_child_samples'])
    params['max_bin'] = int(params['max_bin'])
    params['max_drop'] = int(params['max_drop'])
    
        
    clf = LGBMClassifier(**params, 
#                          n_estimators = 2000,
#                          nthread = 2, 
                         boosting_type='dart', 
                         drop_seed = 924,
                         objective='binary',
                         scale_pos_weight = 11.387150050352467,
                         random_state = 924,
                         n_jobs = 4,
                         silent = False,
#                          importance_type (string, optional (default='split')) – 
#                          The type of feature importance to be filled into feature_importances_. 
#                          If ‘split’, result contains numbers of times the feature is used in a model. 
#                          If ‘gain’, result contains total gains of splits which use the feature.
                        )

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    folds = StratifiedKFold(n_splits= 10, shuffle=True, random_state=1001)
        
    test_pred_proba = np.zeros(train_df.shape[0])
    
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = False, early_stopping_rounds = 200)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(train_df['TARGET'], test_pred_proba)

In [15]:
init_time = time.time()
params = { 
          'num_leaves': (21, 441), 
          'max_depth': (7, 567),
          'learning_rate': (.001, .1),
#           'n_estimators':(50, 1000),
#           'subsample_for_bin':(50000, 1000000),
#           'top_rate':(0.0 ,1.0),
          'min_split_gain': (.01, 1000),
          'drop_rate': (0.0, 1.0),
          'max_drop': (28, 343),
          'skip_drop': (0.0, 1.0),
          'min_child_weight': (0.001, 1000),
          'min_child_samples': (99, 9999),
#         subsample (float, optional (default=1.)) – Subsample ratio of the training instance.
#         subsample_freq (int, optional (default=0)) – Frequence of subsample, <=0 means no enable.
#         colsample_bytree (float, optional (default=1.)) – Subsample ratio of columns when constructing each tree.
          'reg_alpha': (.00, 10.0), 
          'reg_lambda': (.00, 10.0), 
          'max_bin': (127, 1023)}
bo = BayesianOptimization(lgbm_evaluate, params)
bo.maximize(init_points = 49, n_iter = 49)
print("Elapsed time={:5.2f} sec.".format(time.time() - init_time))

|   iter    |  target   | drop_rate | learni... |  max_bin  | max_depth | max_drop  | min_ch... | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | skip_drop |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7673  [0m | [0m 0.618   [0m | [0m 0.0806  [0m | [0m 131.8   [0m | [0m 407.7   [0m | [0m 191.9   [0m | [0m 7.882e+0[0m | [0m 299.6   [0m | [0m 419.9   [0m | [0m 84.39   [0m | [0m 6.261   [0m | [0m 6.17    [0m | [0m 0.7943  [0m |
| [0m 2       [0m | [0m 0.7613  [0m | [0m 0.5879  [0m | [0m 0.009792[0m | [0m 927.9   [0m | [0m 257.3   [0m | [0m 257.9   [0m | [0m 1.12e+03[0m | [0m 274.6   [0m | [0m 75.7    [0m | [0m 197.0   [0m | [0m 4.733   [0m | [0m 1.631   [0m | [0m 0.9685  [0m |
| [0m 3       [0m | [0m 0.7582  [0m | [0m 0.02525 [0m | [0m 0.05097 [0m | [0m 858.6   

In [16]:
params_list = bo.res
len(params_list)

98

In [17]:
params_list

[{'target': 0.7673267841175914,
  'params': {'drop_rate': 0.6180470127074371,
   'learning_rate': 0.08060053270599628,
   'max_bin': 131.8099848462744,
   'max_depth': 407.6752252011149,
   'max_drop': 191.89041334356182,
   'min_child_samples': 7881.948490409369,
   'min_child_weight': 299.5994599778316,
   'min_split_gain': 419.86784247229383,
   'num_leaves': 84.38665429389337,
   'reg_alpha': 6.261174225652878,
   'reg_lambda': 6.169976604867706,
   'skip_drop': 0.7942804061508807}},
 {'target': 0.761293028901952,
  'params': {'drop_rate': 0.5878987660304986,
   'learning_rate': 0.0097922568836208,
   'max_bin': 927.9101524368205,
   'max_depth': 257.2861891506547,
   'max_drop': 257.90899338190445,
   'min_child_samples': 1119.9868644076519,
   'min_child_weight': 274.5951254011385,
   'min_split_gain': 75.69530655674488,
   'num_leaves': 196.97136028824465,
   'reg_alpha': 4.732514940750581,
   'reg_lambda': 1.6305275444867573,
   'skip_drop': 0.9684703861361347}},
 {'target': 0.

In [19]:
params_list[84]

{'target': 0.7837047966258421,
 'params': {'drop_rate': 0.8634963435039087,
  'learning_rate': 0.09071147437740834,
  'max_bin': 954.9398951911035,
  'max_depth': 58.51016719832349,
  'max_drop': 315.97615172235214,
  'min_child_samples': 4540.4221450167315,
  'min_child_weight': 303.1450936865266,
  'min_split_gain': 56.9166391504588,
  'num_leaves': 417.71299864710545,
  'reg_alpha': 8.404340765373679,
  'reg_lambda': 8.846087376248494,
  'skip_drop': 0.8983629227224048}}

In [20]:
params_list[55]

{'target': 0.7833118851480253,
 'params': {'drop_rate': 0.39678642351517635,
  'learning_rate': 0.07042762297320553,
  'max_bin': 1020.6579046753698,
  'max_depth': 56.98785384747694,
  'max_drop': 148.12161855007423,
  'min_child_samples': 5120.6407182253015,
  'min_child_weight': 950.838054048603,
  'min_split_gain': 10.23167833272785,
  'num_leaves': 354.20441526129224,
  'reg_alpha': 8.440917746626472,
  'reg_lambda': 3.1657921115853105,
  'skip_drop': 0.9620777160650893}}

In [21]:
params_list[24]

{'target': 0.7825335236326929,
 'params': {'drop_rate': 0.37267477414182804,
  'learning_rate': 0.09086777977158635,
  'max_bin': 177.5265407522511,
  'max_depth': 172.94331663660213,
  'max_drop': 157.82988480250984,
  'min_child_samples': 3038.8817109920014,
  'min_child_weight': 970.5818476132932,
  'min_split_gain': 5.250112095666932,
  'num_leaves': 229.81127393391162,
  'reg_alpha': 3.5733743477050037,
  'reg_lambda': 3.2903614712675164,
  'skip_drop': 0.7305765366916096}}

In [None]:
# no need for looping
# params_list = [bo.res[i] for i in range(10)]

In [18]:
# 輸出參數檔案
import json
with open('../../../BDSE12-Group3/datasets/homecdt_ss_output/params_list_BayesOpt_20200213c_G3V2_le_dart.txt', 'w', encoding='utf-8') as fout:
    for params in params_list:
        json.dump(params, fout) 
        fout.write("\n")

---

In [None]:
# 讀取參數檔案
with open('../../../BDSE12-Group3/datasets/homecdt_ss_output/params_list_BayesOpt_20200210a.txt', 'r', encoding='utf-8') as f:
    params_list_read = list(map(json.loads,f))

In [None]:
params_list_read[8]

In [None]:
type(params_list_read[8])

In [None]:
init_time = time.time()
params = {'learning_rate': (.0, .1), 
          'num_leaves': (20, 100), 
          'subsample': (.0, 1.0), 
          'max_depth': (6, 9), 
          'reg_alpha': (.00, 1.0), 
          'reg_lambda': (.00, 1.0), 
          'min_split_gain': (.0, .1),
          'min_child_weight': (20, 70)}
bo = BayesianOptimization(lgbm_evaluate, params)
bo.maximize(init_points = 5, n_iter = 10)
print("Elapsed time={:5.2f} sec.".format(time.time() - init_time))

---

## XGboost

In [None]:
def xgb_evaluate(**params):
    warnings.simplefilter('ignore')
    
    params['max_depth'] = int(params['max_depth'])
        
    clf = XGBClassifier(**params, 
                        n_estimators = 2000, 
                        nthread = 5, 
                        objective= 'binary:logistic')

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    folds = StratifiedKFold(n_splits= 5, shuffle=True, random_state=1001)
        
    test_pred_proba = np.zeros(train_df.shape[0])
    
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = False, early_stopping_rounds = 100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(train_df['TARGET'], test_pred_proba)

In [None]:
init_time = time.time()
params = {'learning_rate': (.01, .03), 
          'subsample': (.0, 1.0), 
          'max_depth': (4, 9), 
          'reg_alpha': (.0, 1.0), 
          'reg_lambda': (.0, 1.0), 
          'scale_pos_weight': (.0, 5.0),
          'colsample_bytree': (.0, 1.0)}
bo = BayesianOptimization(xgb_evaluate, params)
bo.maximize(init_points = 5, n_iter = 5)
print("Elapsed time={:5.2f} sec.".format(time.time() - init_time))