In [1]:
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
import pickle
from bayes_opt import BayesianOptimization
from sklearn import metrics, model_selection

# Récupération et préparation des données

In [2]:

X_train = pd.read_csv("data/X_train_resampled.csv")
y_train = pd.read_csv("data/y_train_resampled.csv")
X_test = pd.read_csv("data/X_test.csv")
y_test = pd.read_csv("data/y_test.csv")

cat_features = pickle.load(open("data/pickle_cat_features.pkl", "rb"))

In [3]:
X_train.drop(columns="Unnamed: 0", inplace=True)
X_test.drop(columns="SK_ID_CURR", inplace=True)
y_train.drop(columns="Unnamed: 0", inplace=True)
y_test.drop(columns="SK_ID_CURR", inplace=True)

print(f"X_train : {X_train.shape}")
print(f"X_test : {X_test.shape}")
print(f"y_train : {y_train.shape}")
print(f"y_test : {y_test.shape}")

def replace_name(name):
    for c in ["[","]",",","{","}",'"',":"," "]:
        if c in name :
            name = name.replace(c,"_")
    return name
            
features = list(map(replace_name, X_train.columns))
X_train.columns = features

train_data = lgb.Dataset(data=X_train, label=y_train, free_raw_data=False)

del X_train, X_test, y_train, y_test
gc.collect()

X_train : (310572, 676)
X_test : (61502, 676)
y_train : (310572, 1)
y_test : (61502, 1)


20

# Optimisation des paramètres

In [4]:
# métrique f_beta

BETA = 2

def f_beta(probas_pred, y_true):
    y_pred = np.vectorize(lambda x : 0 if x<0.5 else 1)(probas_pred)
    score = metrics.fbeta_score(y_true.get_label(), y_pred, beta=BETA)
    return "F_beta", score, True

In [5]:
# paramètres à optimiser

def lgb_eval(num_leaves, colsample_bytree, subsample, 
             max_depth, reg_alpha, reg_lambda, min_split_gain, min_child_weight):
    params = {'application':'binary',
              'learning_rate':0.02,
              'n_estimators':100,
              'num_iterations':4000, 
              'early_stopping_round':100, 
              'metric':'auc'}
    params["num_leaves"] = int(round(num_leaves))
    params["colsample_bytree"] = max(min(colsample_bytree, 1), 0)
    params["subsample"] = max(min(subsample, 1), 0)
    params["max_depth"] = int(round(max_depth))
    params["reg_alpha"] = max(reg_alpha, 0)
    params["reg_lambda"] = max(reg_lambda, 0)
    params["min_split_gain"] = min_split_gain
    params["min_child_weight"] = min_child_weight
    cv_result = lgb.cv(params, train_data, 
                       nfold=5, seed=123, 
                       stratified=True, 
                       verbose_eval=-1, 
                       feval=f_beta,
                       categorical_feature=cat_features)
    return max(cv_result['F_beta-mean'])



In [6]:
# intervalle pour chaque paramètre

lgbBO = BayesianOptimization(
    lgb_eval, 
    { 'num_leaves': (24, 45),
      'colsample_bytree': (0.1, 1),
      'subsample': (0.8, 1),
      'max_depth': (5, 8.99),
      'reg_alpha': (0.001, 1),
      'reg_lambda': (0.001, 1),
      'min_split_gain': (0.001, 0.1),
      'min_child_weight': (5, 50)}, 
    random_state=123)



In [7]:
lgbBO.maximize(init_points=5, n_iter=10)

|   iter    |  target   | colsam... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------


New categorical_feature is ['CODE_GENDER', 'DAYS_EMPLOYED_ANOM', 'EMERGENCYSTATE_MODE', 'FLAG_CONT_MOBILE', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_EMAIL', 'FLAG_EMP_PHONE', 'FLAG_MOBIL', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FLAG_PHONE', 'FLAG_WORK_PHONE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'NAME_CONTRACT_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'NAME_INCOME_TYPE', 'NAME_TYPE_SUITE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE', 'WALLSMATERIAL_MODE', 'WEEKDAY_APPR_PROCESS_START']


[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] T



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 







| [0m 6       [0m | [0m 0.8916  [0m | [0m 0.1922  [0m | [0m 5.474   [0m | [0m 31.86   [0m | [0m 0.07434 [0m | [0m 40.07   [0m | [0m 0.9178  [0m | [0m 0.6334  [0m | [0m 0.9322  [0m |




[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 





| [0m 7       [0m | [0m 0.8877  [0m | [0m 0.1     [0m | [0m 5.0     [0m | [0m 34.8    [0m | [0m 0.07462 [0m | [0m 34.15   [0m | [0m 0.4559  [0m | [0m 0.001   [0m | [0m 1.0     [0m |




[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 







| [0m 8       [0m | [0m 0.8893  [0m | [0m 0.2633  [0m | [0m 5.0     [0m | [0m 27.22   [0m | [0m 0.001   [0m | [0m 42.13   [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 0.8344  [0m |




[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_row_wise=true` to remove the overhead.
And if memory is no





| [0m 10      [0m | [0m 0.8877  [0m | [0m 0.1     [0m | [0m 5.0     [0m | [0m 29.87   [0m | [0m 0.001   [0m | [0m 39.01   [0m | [0m 0.4717  [0m | [0m 0.001   [0m | [0m 1.0     [0m |




[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_row_wise=true` to remove the overhead.
And if memory is no



[LightGBM] [Info] Number of positive: 124229, number of negative: 124228
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124228, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248457, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the train set: 248458, number of used features: 622
[LightGBM] [Info] Number of positive: 124229, number of negative: 124229
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140505
[LightGBM] [Info] Number of data points in the 

In [8]:
lgbBO.res

[{'target': 0.8888220927862911,
  'params': {'colsample_bytree': 0.7268222670380755,
   'max_depth': 6.141695946452014,
   'min_child_weight': 15.20831541038914,
   'min_split_gain': 0.05558016213920623,
   'num_leaves': 39.108848365496826,
   'reg_alpha': 0.4236833536643365,
   'reg_lambda': 0.9807834341862308,
   'subsample': 0.9369659477169727}},
 {'target': 0.8861931125701048,
  'params': {'colsample_bytree': 0.5328387113359249,
   'max_depth': 6.56454889759466,
   'min_child_weight': 20.443010726789122,
   'min_split_gain': 0.07317592103102012,
   'num_leaves': 33.210017138272114,
   'reg_alpha': 0.060618218712958784,
   'reg_lambda': 0.398646211075101,
   'subsample': 0.9475990811464071}},
 {'target': 0.8918735005885263,
  'params': {'colsample_bytree': 0.26424255740815,
   'max_depth': 5.700052507028495,
   'min_child_weight': 28.91981182288273,
   'min_split_gain': 0.05365093112258974,
   'num_leaves': 37.32242012957774,
   'reg_alpha': 0.8495823622837118,
   'reg_lambda': 0.72

In [9]:
max(lgbBO.res, key=lambda x:x["target"])

{'target': 0.8918735005885263,
 'params': {'colsample_bytree': 0.26424255740815,
  'max_depth': 5.700052507028495,
  'min_child_weight': 28.91981182288273,
  'min_split_gain': 0.05365093112258974,
  'num_leaves': 37.32242012957774,
  'reg_alpha': 0.8495823622837118,
  'reg_lambda': 0.7247308695357746,
  'subsample': 0.9222047021355166}}