In [1]:
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
import pickle
from bayes_opt import BayesianOptimization
from sklearn import metrics, model_selection

# Récupération des données

In [2]:
data = pd.read_csv("data/train_resampled.csv")
cat_features = pickle.load(open("data/pickle_cat_features.pkl", "rb"))

# Isolation de la cible et séparation des données (train, test)

In [3]:
X = data.drop(columns=['TARGET','Unnamed: 0'])
y = data["TARGET"]

del data
gc.collect()

20

In [4]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=123)

print(f"X_train : {X_train.shape}")
print(f"X_test : {X_test.shape}")
print(f"y_train : {y_train.shape}")
print(f"y_test : {y_test.shape}")

def replace_name(name):
    for c in ["[","]",",","{","}",'"',":"," "]:
        if c in name :
            name = name.replace(c,"_")
    return name
            
features = list(map(replace_name, X_train.columns))
X_train.columns = features

train_data = lgb.Dataset(data=X_train, label=y_train, free_raw_data=False)

del X, y, X_train, X_test, y_train, y_test
gc.collect()

X_train : (311128, 676)
X_test : (77782, 676)
y_train : (311128,)
y_test : (77782,)


0

# Optimisation des paramètres

In [5]:
# métrique f_beta

BETA = 2

def f_beta(probas_pred, y_true):
    y_pred = np.vectorize(lambda x : 0 if x<0.5 else 1)(probas_pred)
    score = metrics.fbeta_score(y_true.get_label(), y_pred, beta=BETA)
    return "F_beta", score, True

In [6]:
# paramètres à optimiser

def lgb_eval(num_leaves, colsample_bytree, subsample, 
             max_depth, reg_alpha, reg_lambda, min_split_gain, min_child_weight):
    params = {'application':'binary',
              'learning_rate':0.02,
              'n_estimators':100,
              'num_iterations':4000, 
              'early_stopping_round':100, 
              'metric':'auc'}
    params["num_leaves"] = int(round(num_leaves))
    params["colsample_bytree"] = max(min(colsample_bytree, 1), 0)
    params["subsample"] = max(min(subsample, 1), 0)
    params["max_depth"] = int(round(max_depth))
    params["reg_alpha"] = max(reg_alpha, 0)
    params["reg_lambda"] = max(reg_lambda, 0)
    params["min_split_gain"] = min_split_gain
    params["min_child_weight"] = min_child_weight
    cv_result = lgb.cv(params, train_data, 
                       nfold=5, seed=123, 
                       stratified=True, 
                       verbose_eval=-1, 
                       feval=f_beta,
                       categorical_feature=cat_features)
    return max(cv_result['F_beta-mean'])



In [7]:
# intervalle pour chaque paramètre

lgbBO = BayesianOptimization(
    lgb_eval, 
    { 'num_leaves': (24, 45),
      'colsample_bytree': (0.1, 1),
      'subsample': (0.8, 1),
      'max_depth': (5, 8.99),
      'reg_alpha': (0.001, 1),
      'reg_lambda': (0.001, 1),
      'min_split_gain': (0.001, 0.1),
      'min_child_weight': (5, 50)}, 
    random_state=123)



In [8]:
lgbBO.maximize(init_points=5, n_iter=10)

|   iter    |  target   | colsam... | max_depth | min_ch... | min_sp... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------


New categorical_feature is ['CODE_GENDER', 'DAYS_EMPLOYED_ANOM', 'EMERGENCYSTATE_MODE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'NAME_CONTRACT_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'NAME_INCOME_TYPE', 'NAME_TYPE_SUITE', 'OCCUPATION_TYPE', 'ORGANIZATION_TYPE', 'WALLSMATERIAL_MODE', 'WEEKDAY_APPR_PROCESS_START']


[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 





| [0m 7       [0m | [0m 0.8922  [0m | [0m 0.1059  [0m | [0m 5.0     [0m | [0m 37.61   [0m | [0m 0.05199 [0m | [0m 36.21   [0m | [0m 0.005427[0m | [0m 0.02028 [0m | [0m 1.0     [0m |




[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 







| [0m 8       [0m | [0m 0.8899  [0m | [0m 0.3164  [0m | [0m 5.0     [0m | [0m 28.23   [0m | [0m 0.02833 [0m | [0m 44.32   [0m | [0m 0.307   [0m | [0m 0.5224  [0m | [0m 0.8391  [0m |




[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



| [0m 10      [0m | [0m 0.8899  [0m | [0m 1.0     [0m | [0m 5.0     [0m | [0m 31.55   [0m | [0m 0.1     [0m | [0m 37.66   [0m | [0m 0.001   [0m | [0m 0.001   [0m | [0m 0.8     [0m |




[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 



[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124437, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124464
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the train set: 248902, number of used features: 622
[LightGBM] [Info] Number of positive: 124438, number of negative: 124465
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 144017
[LightGBM] [Info] Number of data points in the 

In [9]:
lgbBO.res

[{'target': 0.891570050294335,
  'params': {'colsample_bytree': 0.7268222670380755,
   'max_depth': 6.141695946452014,
   'min_child_weight': 15.20831541038914,
   'min_split_gain': 0.05558016213920623,
   'num_leaves': 39.108848365496826,
   'reg_alpha': 0.4236833536643365,
   'reg_lambda': 0.9807834341862308,
   'subsample': 0.9369659477169727}},
 {'target': 0.891080188239951,
  'params': {'colsample_bytree': 0.5328387113359249,
   'max_depth': 6.56454889759466,
   'min_child_weight': 20.443010726789122,
   'min_split_gain': 0.07317592103102012,
   'num_leaves': 33.210017138272114,
   'reg_alpha': 0.060618218712958784,
   'reg_lambda': 0.398646211075101,
   'subsample': 0.9475990811464071}},
 {'target': 0.8942913523212924,
  'params': {'colsample_bytree': 0.26424255740815,
   'max_depth': 5.700052507028495,
   'min_child_weight': 28.91981182288273,
   'min_split_gain': 0.05365093112258974,
   'num_leaves': 37.32242012957774,
   'reg_alpha': 0.8495823622837118,
   'reg_lambda': 0.7247

In [10]:
max(lgbBO.res, key=lambda x:x["target"])

{'target': 0.89504918664766,
 'params': {'colsample_bytree': 0.18704675203879897,
  'max_depth': 5.596525079817775,
  'min_child_weight': 31.190087127061712,
  'min_split_gain': 0.0848793956471949,
  'num_leaves': 39.49169250353543,
  'reg_alpha': 0.8470633083206642,
  'reg_lambda': 0.5740182739981458,
  'subsample': 0.9261485670593478}}