In [2]:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization
from xgboost import XGBClassifier

In [3]:
df = pd.read_csv('../../datasets/homeCredit/BDSE12_03G_HomeCredit_V2.csv')
df = pd.get_dummies(df)

In [4]:
def lgbm_evaluate(**params):
    warnings.simplefilter('ignore')
    
    params['num_leaves'] = int(params['num_leaves'])
    params['max_depth'] = int(params['max_depth'])
        
    clf = LGBMClassifier(**params, 
                         n_estimators = 2000,
                         nthread = 5, 
                         boosting_type='goss', 
                         objective='binary')

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    folds = StratifiedKFold(n_splits= 5, shuffle=True, random_state=1001)
        
    test_pred_proba = np.zeros(train_df.shape[0])
    
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = False, early_stopping_rounds = 100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(train_df['TARGET'], test_pred_proba)

In [6]:
def xgb_evaluate(**params):
    warnings.simplefilter('ignore')
    
    params['max_depth'] = int(params['max_depth'])
        
    clf = XGBClassifier(**params, 
                        n_estimators = 2000, 
                        nthread = 5, 
                        objective= 'binary:logistic')

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]

    folds = StratifiedKFold(n_splits= 5, shuffle=True, random_state=1001)
        
    test_pred_proba = np.zeros(train_df.shape[0])
    
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

        clf.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)], eval_metric = 'auc', 
                verbose = False, early_stopping_rounds = 100)

        test_pred_proba[valid_idx] = clf.predict_proba(valid_x, num_iteration = clf.best_iteration_)[:, 1]
        
        del train_x, train_y, valid_x, valid_y
        gc.collect()

    return roc_auc_score(train_df['TARGET'], test_pred_proba)

In [None]:
init_time = time.time()
params = {'learning_rate': (.01, .03), 
          'subsample': (.0, 1.0), 
          'max_depth': (4, 9), 
          'reg_alpha': (.0, 1.0), 
          'reg_lambda': (.0, 1.0), 
          'scale_pos_weight': (.0, 5.0),
          'colsample_bytree': (.0, 1.0)}
bo = BayesianOptimization(xgb_evaluate, params)
bo.maximize(init_points = 5, n_iter = 5)
print("Elapsed time={:5.2f} sec.".format(time.time() - init_time))

|   iter    |  target   | colsam... | learni... | max_depth | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------


In [15]:
init_time = time.time()
params = {'learning_rate': (.01, .03), 
          'num_leaves': (20, 100), 
          'subsample': (0.8, 1), 
          'max_depth': (6, 9), 
          'reg_alpha': (.00, 1.0), 
          'reg_lambda': (.00, 1.0), 
          'min_split_gain': (.01, .03),
          'min_child_weight': (20, 70)}
bo = BayesianOptimization(lgbm_evaluate, params)
bo.maximize(init_points = 5, n_iter = 5)
print("Elapsed time={:5.2f} sec.".format(time.time() - init_time))

|   iter    |  target   | learni... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.7935  [0m | [0m 0.01112 [0m | [0m 7.106   [0m | [0m 44.6    [0m | [0m 0.01451 [0m | [0m 91.72   [0m | [0m 0.09609 [0m | [0m 0.3668  [0m | [0m 0.9818  [0m |
| [0m 2       [0m | [0m 0.7932  [0m | [0m 0.02896 [0m | [0m 6.387   [0m | [0m 57.54   [0m | [0m 0.02782 [0m | [0m 23.84   [0m | [0m 0.05095 [0m | [0m 0.6451  [0m | [0m 0.8986  [0m |
| [0m 3       [0m | [0m 0.793   [0m | [0m 0.02692 [0m | [0m 7.423   [0m | [0m 62.36   [0m | [0m 0.02013 [0m | [0m 92.88   [0m | [0m 0.6868  [0m | [0m 0.8196  [0m | [0m 0.9318  [0m |
| [0m 4       [0m | [0m 0.7934  [0m | [0m 0.01211 [0m | [0m 6.622   [0m | [0m 68.66   [0m | [0m 0.01294 [0m | [0m 54.74   [0m | [0m 0.1

In [16]:
bo.res[5]

{'target': 0.792940652843537,
 'params': {'learning_rate': 0.013237147439636492,
  'max_depth': 8.176972909272658,
  'min_child_weight': 20.210808853599698,
  'min_split_gain': 0.019613448600170856,
  'num_leaves': 20.01281707147962,
  'reg_alpha': 0.4762374177443556,
  'reg_lambda': 0.795459509951648,
  'subsample': 0.8782478787186309}}

In [17]:
init_time = time.time()
params = {'learning_rate': (.0, .1), 
          'num_leaves': (20, 100), 
          'subsample': (.0, 1.0), 
          'max_depth': (6, 9), 
          'reg_alpha': (.00, 1.0), 
          'reg_lambda': (.00, 1.0), 
          'min_split_gain': (.0, .1),
          'min_child_weight': (20, 70)}
bo = BayesianOptimization(lgbm_evaluate, params)
bo.maximize(init_points = 5, n_iter = 10)
print("Elapsed time={:5.2f} sec.".format(time.time() - init_time))

|   iter    |  target   | learni... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.789   [0m | [0m 0.06537 [0m | [0m 8.788   [0m | [0m 22.77   [0m | [0m 0.08022 [0m | [0m 60.89   [0m | [0m 0.9679  [0m | [0m 0.2213  [0m | [0m 0.4622  [0m |
| [95m 2       [0m | [95m 0.7921  [0m | [95m 0.0361  [0m | [95m 8.687   [0m | [95m 41.12   [0m | [95m 0.08093 [0m | [95m 61.53   [0m | [95m 0.8299  [0m | [95m 0.4435  [0m | [95m 0.6061  [0m |
| [95m 3       [0m | [95m 0.7937  [0m | [95m 0.0138  [0m | [95m 8.885   [0m | [95m 57.73   [0m | [95m 0.07277 [0m | [95m 81.73   [0m | [95m 0.6619  [0m | [95m 0.3204  [0m | [95m 0.5381  [0m |
| [0m 4       [0m | [0m 0.7909  [0m | [0m 0.07356 [0m | [0m 7.91    [0m | [0m 63.69   [0m | [0m 0.04934 [0m | [0m 44.

In [19]:
bo.res[2]

{'target': 0.7936603978356124,
 'params': {'learning_rate': 0.013803211212539657,
  'max_depth': 8.884921565475697,
  'min_child_weight': 57.725650927715265,
  'min_split_gain': 0.07276947619457204,
  'num_leaves': 81.72909839300354,
  'reg_alpha': 0.6618734111073816,
  'reg_lambda': 0.320433363007782,
  'subsample': 0.5381263969882377}}