In [18]:
# Importing all the required libraries

import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('expand_frame_repr', False)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
% matplotlib inline
from sklearn.model_selection import StratifiedKFold
from scipy.stats import rankdata
from sklearn import metrics
import lightgbm as lgb
import gc
import warnings

In [4]:
# Load the competition's train and test CSV files into DataFrames.
# NOTE: the paths are absolute and machine-specific; adjust them when running elsewhere.

df_train = pd.read_csv(
    filepath_or_buffer="C:\\Users\\sivac\\Documents\\Python Projects\\Banco Santander Kaggle\\input\\train.csv"
)
df_test = pd.read_csv(
    filepath_or_buffer="C:\\Users\\sivac\\Documents\\Python Projects\\Banco Santander Kaggle\\input\\test.csv"
)


In [5]:
# Label column name, and the feature columns (every column after the first two)
target = 'target'
predictors = df_train.columns[2:].tolist()
# Class counts: the problem is imbalanced, only around 10% of the targets are positive
df_train[target].value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [6]:
# Hold out a stratified 50% of the training rows as the validation set used while
# tuning parameters; the final model is meant to use 5-fold CV afterwards.

bayesian_tr_index, bayesian_val_index = next(
    StratifiedKFold(n_splits=2, shuffle=True, random_state=2).split(
        df_train, df_train['target'].values
    )
)

# Both index arrays are reused later by the Bayesian-optimization objective

In [12]:
# Function for optimizing Light GBM

def lgb_parameter_optimization(
    num_leaves,  # truncated to int below
    min_data_in_leaf,  # truncated to int below
    learning_rate,
    min_sum_hessian_in_leaf,  # float, passed through unchanged
    feature_fraction,
    lambda_l1,
    lambda_l2,
    min_gain_to_split,
    max_depth):  # truncated to int below
    """Train LightGBM with the given hyperparameters and return the validation AUC.

    This is the objective handed to the Bayesian optimizer. It reads the
    notebook-level globals ``df_train``, ``bayesian_tr_index``,
    ``bayesian_val_index``, ``predictors`` and ``target``.

    Parameters
    ----------
    num_leaves, min_data_in_leaf, max_depth : float or int
        Truncated with ``int()`` before being passed to LightGBM, because the
        optimizer proposes continuous values.
    learning_rate, min_sum_hessian_in_leaf, feature_fraction, lambda_l1,
    lambda_l2, min_gain_to_split : float
        Passed to LightGBM unchanged.

    Returns
    -------
    float
        ROC AUC of the early-stopped model on the rows selected by
        ``bayesian_val_index``.
    """
    # The optimizer sends continuous values; these three are truncated to integers.
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    param = {
        'num_leaves': num_leaves,
        'max_bin': 63,
        'min_data_in_leaf': min_data_in_leaf,
        'learning_rate': learning_rate,
        'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,
        # NOTE(review): with bagging_fraction at 1.0 the bagging_freq setting
        # presumably has no effect -- confirm against the LightGBM parameter docs.
        'bagging_fraction': 1.0,
        'bagging_freq': 5,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'min_gain_to_split': min_gain_to_split,
        'max_depth': max_depth,
        'save_binary': True,
        # All seeds are fixed so repeated calls with the same inputs are comparable.
        'seed': 1337,
        'feature_fraction_seed': 1337,
        'bagging_seed': 1337,
        'drop_seed': 1337,
        'data_random_seed': 1337,
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'verbose': 1,
        'metric': 'auc',
        'is_unbalance': True,
        'boost_from_average': False,
    }

    # Slice the train/validation halves once and reuse the arrays for both the
    # LightGBM datasets and the final scoring step.
    train_rows = df_train.iloc[bayesian_tr_index]
    val_rows = df_train.iloc[bayesian_val_index]
    val_features = val_rows[predictors].values
    val_labels = val_rows[target].values

    xg_train = lgb.Dataset(train_rows[predictors].values,
                           label=train_rows[target].values,
                           feature_name=predictors,
                           free_raw_data=False)
    xg_validation = lgb.Dataset(val_features,
                                label=val_labels,
                                feature_name=predictors,
                                free_raw_data=False)

    # Upper bound on boosting rounds; early stopping on the validation AUC
    # normally ends training well before this.
    # NOTE(review): newer LightGBM releases replace the verbose_eval /
    # early_stopping_rounds keyword arguments with callbacks -- confirm the
    # installed version before upgrading.
    num_round = 5000
    model = lgb.train(param, xg_train, num_round, valid_sets=[xg_validation], verbose_eval=250,
                      early_stopping_rounds=60)

    # Score with the best iteration found by early stopping.
    predictions = model.predict(val_features, num_iteration=model.best_iteration)
    score = metrics.roc_auc_score(val_labels, predictions)
    return score
    


The `lgb_parameter_optimization` function above will act as the black-box function for Bayesian optimization. The training and validation datasets for LightGBM are already defined inside the `lgb_parameter_optimization` function.

The `lgb_parameter_optimization` function takes values for num_leaves, min_data_in_leaf, learning_rate, min_sum_hessian_in_leaf, feature_fraction, lambda_l1, lambda_l2, min_gain_to_split, and max_depth from the Bayesian optimization framework. Keep in mind that num_leaves, min_data_in_leaf, and max_depth must be integers for LightGBM, but Bayesian optimization sends continuous values to the function, so I force them to be integers. I am only going to find optimal values for these parameters. The reader may increase or decrease the number of parameters to optimize.

Now I need to give bounds for these parameters, so that Bayesian optimization only searches inside the bounds.

In [13]:
# Search space: (lower, upper) bound for every hyperparameter the optimizer may vary
bounds_LGB = dict(
    num_leaves=(5, 20),
    min_data_in_leaf=(5, 20),
    learning_rate=(0.01, 0.3),
    min_sum_hessian_in_leaf=(0.00001, 0.01),
    feature_fraction=(0.05, 0.5),
    lambda_l1=(0, 5.0),
    lambda_l2=(0, 5.0),
    min_gain_to_split=(0, 1.0),
    max_depth=(3, 15),
)

In [14]:
# Wire the objective function and its search bounds into the optimizer

from bayes_opt import BayesianOptimization

LGB_BO = BayesianOptimization(
    f=lgb_parameter_optimization,
    pbounds=bounds_LGB,
    random_state=1,
)

Now, let's look at the key space (parameters) we are going to optimize:

In [15]:
# Print the names of the hyperparameters registered in the optimizer's search space
print(LGB_BO.space.keys)

['feature_fraction', 'lambda_l1', 'lambda_l2', 'learning_rate', 'max_depth', 'min_data_in_leaf', 'min_gain_to_split', 'min_sum_hessian_in_leaf', 'num_leaves']


I have created the BayesianOptimization object (LGB_BO), but it will not do anything until I call maximize. Before calling it, I want to explain two parameters that we can pass to the maximize method of the BayesianOptimization object (LGB_BO):

init_points: How many initial runs of random exploration we want to perform. In our case `lgb_parameter_optimization` will be called init_points times during this phase.
n_iter: How many runs of Bayesian optimization we want to perform after the init_points random runs. In total the function is called init_points + n_iter times.
Now, it's time to call the function from the Bayesian optimization framework to maximize. I allow the LGB_BO object to run for 5 init_points (exploration) and 5 n_iter (exploitation).

In [16]:
# Random exploration runs, followed by Bayesian-guided runs
init_points, n_iter = 5, 5

In [19]:
# Separator line printed above the optimizer's progress table
print(130 * '-')

# Silence every warning for the duration of the optimization run only
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    LGB_BO.maximize(
        init_points=init_points,
        n_iter=n_iter,
        acq='ucb',
        xi=0.0,
        alpha=1e-6,
    )

----------------------------------------------------------------------------------------------------------------------------------
|   iter    |  target   | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_da... | min_ga... | min_su... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 60 rounds.
[250]	valid_0's auc: 0.87519
[500]	valid_0's auc: 0.888834
[750]	valid_0's auc: 0.891982
Early stopping, best iteration is:
[876]	valid_0's auc: 0.892315
| [0m 1       [0m | [0m 0.8923  [0m | [0m 0.2925  [0m | [0m 2.096   [0m | [0m 3.426   [0m | [0m 0.06929 [0m | [0m 13.54   [0m | [0m 5.411   [0m | [0m 0.6705  [0m | [0m 0.004179[0m | [0m 13.38   [0m |
Training until validation scores don't improve for 60 rounds.
[250]	valid_0's auc: 0.888477
Early stopping, best iteration is:
[324]	valid_0's auc: 0.889685
| [0m

In [20]:
# Best objective value recorded by the optimizer; the objective
# (lgb_parameter_optimization) returns the ROC AUC on the validation half
LGB_BO.max['target']

0.8964483321228044

In [21]:
# Parameter set that produced the best score. Values are reported as floats;
# lgb_parameter_optimization truncates num_leaves, min_data_in_leaf and
# max_depth with int() before passing them to LightGBM.

LGB_BO.max['params']

{'feature_fraction': 0.06757465245479707,
 'lambda_l1': 0.8491520978228445,
 'lambda_l2': 4.390712517147065,
 'learning_rate': 0.03852058181158453,
 'max_depth': 8.053291500060626,
 'min_data_in_leaf': 19.368342952257528,
 'min_gain_to_split': 0.5331652849730171,
 'min_sum_hessian_in_leaf': 0.006921852368365229,
 'num_leaves': 9.732734465090944}

In [None]:
# https://www.kaggle.com/fayzur/lgb-bayesian-parameters-finding-rank-average
# https://www.kaggle.com/fayzur/lightgbm-customer-transaction-prediction