In [1]:
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb

from bayes_opt import BayesianOptimization
from sklearn import metrics, model_selection

# Récupération des données

In [2]:
data = pd.read_csv("data/train_df.csv")

# Isolation de la cible et séparation des données (train, test)

In [3]:
X = data.drop(columns=['TARGET','SK_ID_CURR','index', "Unnamed: 0"])
y = data["TARGET"]

del data
gc.collect()

20

In [4]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=123)

print(f"X_train : {X_train.shape}")
print(f"X_test : {X_test.shape}")
print(f"y_train : {y_train.shape}")
print(f"y_test : {y_test.shape}")

def replace_name(name):
    for c in ["[","]",",","{","}",'"',":"," "]:
        if c in name :
            name = name.replace(c,"_")
    return name
            
features = list(map(replace_name, X_train.columns))
X_train.columns = features

train_data = lgb.Dataset(data=X_train, label=y_train, free_raw_data=False)

del X, y, X_train, X_test, y_train, y_test
gc.collect()

X_train : (246005, 796)
X_test : (61502, 796)
y_train : (246005,)
y_test : (61502,)


0

# Optimisation des paramètres

In [6]:
# paramètres à optimiser

def lgb_eval(learning_rate, num_leaves, colsample_bytree, subsample, 
             max_depth, reg_alpha, reg_lambda, min_split_gain, min_child_weight):
    params = {'application':'binary',
              'n_estimators':100,
              'num_iterations':4000, 
              'early_stopping_round':100, 
              'metric':'auc'}
    params["learning_rate"] = max(learning_rate, 0)
    params["num_leaves"] = int(round(num_leaves))
    params["colsample_bytree"] = max(min(colsample_bytree, 1), 0)
    params["subsample"] = max(min(subsample, 1), 0)
    params["max_depth"] = int(round(max_depth))
    params["reg_alpha"] = max(reg_alpha, 0)
    params["reg_lambda"] = max(reg_lambda, 0)
    params["min_split_gain"] = min_split_gain
    params["min_child_weight"] = min_child_weight
    cv_result = lgb.cv(params, train_data, nfold=5, seed=123, stratified=True, verbose_eval=-1, metrics=['auc'])
    return max(cv_result['auc-mean'])



In [7]:
# intervalle pour chaque paramètre

lgbBO = BayesianOptimization(lgb_eval, {'learning_rate':(0.01, 0.1),
                                        'num_leaves': (24, 45),
                                        'colsample_bytree': (0.1, 1),
                                        'subsample': (0.8, 1),
                                        'max_depth': (5, 8.99),
                                        'reg_alpha': (0.001, 1),
                                        'reg_lambda': (0.001, 1),
                                        'min_split_gain': (0.001, 0.1),
                                        'min_child_weight': (5, 50)}, random_state=123)



In [8]:
lgbBO.maximize(init_points=5, n_iter=10)

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se





















































| [0m 1       [0m | [0m 0.7876  [0m | [0m 0.7268  [0m | [0m 0.03575 [0m | [0m 5.905   [0m | [0m 29.81   [0m | [0m 0.07223 [0m | [0m 32.89   [0m | [0m 0.9808  [0m | [0m 0.6851  [0m | [0m 0.8962  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se







| [95m 2       [0m | [95m 0.788   [0m | [95m 0.4529  [0m | [95m 0.04089 [0m | [95m 7.909   [0m | [95m 24.74   [0m | [95m 0.006908[0m | [95m 32.36   [0m | [95m 0.7383  [0m | [95m 0.1833  [0m | [95m 0.8351  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se

















| [0m 3       [0m | [0m 0.7872  [0m | [0m 0.5784  [0m | [0m 0.05786 [0m | [0m 7.531   [0m | [0m 43.22   [0m | [0m 0.07272 [0m | [0m 36.83   [0m | [0m 0.7227  [0m | [0m 0.3236  [0m | [0m 0.8724  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se

| [95m 4       [0m | [95m 0.7885  [0m | [95m 0.3054  [0m | [95m 0.03643 [0m | [95m 7.518   [0m | [95m 9.145   [0m | [95m 0.04394 [0m | [95m 33.05   [0m | [95m 0.4942  [0m | [95m 0.4264  [0m | [95m 0.8625  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se



[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se



| [0m 6       [0m | [0m 0.7869  [0m | [0m 0.8384  [0m | [0m 0.05836 [0m | [0m 6.883   [0m | [0m 9.873   [0m | [0m 0.01033 [0m | [0m 32.41   [0m | [0m 0.9409  [0m | [0m 0.2204  [0m | [0m 0.9442  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se



















| [0m 7       [0m | [0m 0.7865  [0m | [0m 0.4203  [0m | [0m 0.08937 [0m | [0m 6.158   [0m | [0m 19.39   [0m | [0m 0.08532 [0m | [0m 32.0    [0m | [0m 0.8559  [0m | [0m 0.7457  [0m | [0m 0.9327  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se



















































| [95m 8       [0m | [95m 0.7895  [0m | [95m 0.5008  [0m | [95m 0.01604 [0m | [95m 7.665   [0m | [95m 39.4    [0m | [95m 0.03533 [0m | [95m 35.74   [0m | [95m 0.4553  [0m | [95m 0.4129  [0m | [95m 0.8836  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of 





| [0m 9       [0m | [0m 0.7877  [0m | [0m 0.1058  [0m | [0m 0.06365 [0m | [0m 8.367   [0m | [0m 24.45   [0m | [0m 0.07869 [0m | [0m 30.25   [0m | [0m 0.2843  [0m | [0m 0.4768  [0m | [0m 0.9177  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se

| [0m 10      [0m | [0m 0.7884  [0m | [0m 0.24    [0m | [0m 0.02719 [0m | [0m 8.331   [0m | [0m 6.29    [0m | [0m 0.04117 [0m | [0m 32.81   [0m | [0m 0.0852  [0m | [0m 0.7413  [0m | [0m 0.8455  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se





































| [0m 11      [0m | [0m 0.7888  [0m | [0m 0.5651  [0m | [0m 0.0299  [0m | [0m 7.396   [0m | [0m 31.26   [0m | [0m 0.04269 [0m | [0m 27.56   [0m | [0m 0.03838 [0m | [0m 0.2562  [0m | [0m 0.9637  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se



















































| [0m 12      [0m | [0m 0.7856  [0m | [0m 0.2815  [0m | [0m 0.08793 [0m | [0m 5.036   [0m | [0m 12.69   [0m | [0m 0.08495 [0m | [0m 39.58   [0m | [0m 0.5509  [0m | [0m 0.7922  [0m | [0m 0.8338  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se

















































































| [0m 13      [0m | [0m 0.7878  [0m | [0m 0.9419  [0m | [0m 0.02544 [0m | [0m 6.204   [0m | [0m 40.02   [0m | [0m 0.0881  [0m | [0m 30.5    [0m | [0m 0.01874 [0m | [0m 0.7609  [0m | [0m 0.9169  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se









| [0m 14      [0m | [0m 0.7864  [0m | [0m 0.2544  [0m | [0m 0.07856 [0m | [0m 7.624   [0m | [0m 39.19   [0m | [0m 0.02665 [0m | [0m 35.81   [0m | [0m 0.4795  [0m | [0m 0.05726 [0m | [0m 0.8647  [0m |




[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train set: 196804, number of used features: 786
[LightGBM] [Info] Number of positive: 15896, number of negative: 180908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 99469
[LightGBM] [Info] Number of data points in the train se



























| [0m 15      [0m | [0m 0.7855  [0m | [0m 0.9127  [0m | [0m 0.08518 [0m | [0m 6.06    [0m | [0m 35.73   [0m | [0m 0.07534 [0m | [0m 35.65   [0m | [0m 0.9916  [0m | [0m 0.4176  [0m | [0m 0.9742  [0m |


In [12]:
lgbBO.res

[{'target': 0.787626990255241,
  'params': {'colsample_bytree': 0.7268222670380755,
   'learning_rate': 0.035752540145534153,
   'max_depth': 5.90513729972117,
   'min_child_weight': 29.809164608730107,
   'min_split_gain': 0.07222742800877074,
   'num_leaves': 32.88523566261368,
   'reg_alpha': 0.9807834341862308,
   'reg_lambda': 0.6851449088462784,
   'subsample': 0.8961863802968723}},
 {'target': 0.7879752552601929,
  'params': {'colsample_bytree': 0.45290576637473545,
   'learning_rate': 0.04088602145357825,
   'max_depth': 7.908908332462326,
   'min_child_weight': 24.735751010583098,
   'min_split_gain': 0.006908111764347267,
   'num_leaves': 32.35892936193906,
   'reg_alpha': 0.7382574103263037,
   'reg_lambda': 0.18330923872304647,
   'subsample': 0.8350903512294985}},
 {'target': 0.7871909156704997,
  'params': {'colsample_bytree': 0.5783962364576546,
   'learning_rate': 0.05786448283871795,
   'max_depth': 7.531259824619771,
   'min_child_weight': 43.22443073350053,
   'min_s

In [15]:
max(lgbBO.res, key=lambda x:x["target"])

{'target': 0.7894946688932547,
 'params': {'colsample_bytree': 0.5008424936188149,
  'learning_rate': 0.01604238736764179,
  'max_depth': 7.664565427674419,
  'min_child_weight': 39.39942093978256,
  'min_split_gain': 0.035332365150277316,
  'num_leaves': 35.74102141662571,
  'reg_alpha': 0.45530956234831316,
  'reg_lambda': 0.41291226155010513,
  'subsample': 0.8836454684910712}}