# Model tuning : LightGBM Classifier

**Plan :**

[1. Loading the libraries and the data](#1)  
[2. Tuning the parameters](#2)  
> [Max depth tuning](#2a)  
> [Number of leaves tuning](#2b)  
> [Min data in leaves tuning](#2c)  
> [Lasso regularization tuning](#2d)   
> [Ridge regularization tuning](#2e)   
> [Max bins tuning](#2f)   
> [Feature fraction tuning](#2g) 

[3. Defining the best parameters](#3)

<a id="1"></a>
## 1. Loading the libraries and the data

### Loading the libraries

In [3]:
import random
import numpy as np
import igraph
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [4]:
random.seed(0)

### Loading the features created in the feature engineering code

In [None]:
train = pd.read_csv('train_complete.csv', header = 0)
test = pd.read_csv('test_complete.csv', header = 0)

In [None]:
features = ['Title overlap', 'Abstract overlap', 'Temporal difference', 'Common authors', 'Common journal',
            'Cosine similarity', 'Authors in abstract', 'LSA distance', # Semantic features
            'Betweenness centrality', 'Same cluster', 'Page rank', 'Ressource allocation', 'Jaccard coefficient',
            'Adamic Adar', 'Preferential attachment', 'Target_indegree', 'Target_outdegree',
            'Source_indegree', 'Source_outdegree', 'Common_in', 'Common_out' #Topological features
           ]

In [None]:
training_features = train[features]
testing_features = test[features]

In [None]:
labels_array = train['Edge']

In [15]:
training_features.head()

Unnamed: 0,Title overlap,Temporal difference,Common authors,Cosine similarity,Common journal,Overlap abstract,Authors in abstract,res_alloc,Jaccard,ad_adar,...,Common_out,Transitive_ts,Transitive_st,Friend_measure_st,Friend_measure_ts,Scc,Wcc,Scc_plus,Len_path_st,Len_path_ts
0,2.0,0.0,0.0,0.039132,1.0,4.0,0.0,0.142857,0.058824,0.513898,...,0.0,0.0,0.0,0.0,0.0,14.0,12.0,16.0,-1.0,-1.0
1,1.0,1.0,0.0,0.015247,0.0,7.0,0.0,0.226401,0.097087,4.320366,...,0.0,0.0,0.0,576.0,181.0,161.0,1.0,163.0,-1.0,2.0
2,0.0,-2.0,0.0,0.008888,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,4.0,6.0,-1.0,-1.0
3,0.0,-4.0,0.0,0.00474,0.0,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,14.0,0.0,27.0,1.0,29.0,6.0,-1.0
4,0.0,-5.0,0.0,0.027379,0.0,8.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,23.0,2.0,25.0,3.0,-1.0


In [16]:
testing_features.head()

Unnamed: 0,Title overlap,Temporal difference,Common authors,Cosine similarity,Common journal,Overlap abstract,Authors in abstract,res_alloc,Jaccard,ad_adar,...,Common_out,Transitive_ts,Transitive_st,Friend_measure_st,Friend_measure_ts,Scc,Wcc,Scc_plus,Len_path_st,Len_path_ts
0,0.0,0.0,0.0,0.055452,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,77.0,1.0,79.0,7.0,16.0
1,2.0,1.0,0.0,0.11067,1.0,6.0,0.0,0.311535,0.074303,5.377973,...,0.0,0.0,0.0,864.0,910.0,302.0,6.0,302.0,13.0,2.0
2,1.0,2.0,0.0,0.043831,1.0,4.0,0.0,1.342594,0.065338,15.053612,...,0.0,0.0,0.0,1924.0,2080.0,901.0,4.0,903.0,-1.0,2.0
3,1.0,0.0,0.0,0.054856,1.0,13.0,0.0,0.298419,0.221053,4.899424,...,0.0,0.0,0.0,399.0,385.0,95.0,1.0,97.0,9.0,2.0
4,0.0,5.0,0.0,0.147222,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,8.0,7.0,156.0,9.0,158.0,-1.0,3.0


In [None]:
print training_features.shape
print testing_features.shape

In [17]:
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

with open("data/testing_set.txt", "r") as f:
    reader = csv.reader(f)
    testing_set  = list(reader)

testing_set = [element[0].split(" ") for element in testing_set]

###################
# random baseline #
###################

random_predictions = np.random.choice([0, 1], size=len(testing_set))
random_predictions = zip(range(len(testing_set)),random_predictions)

with open("data/random_predictions.csv","wb") as pred:
    csv_out = csv.writer(pred)
    for row in random_predictions:
        csv_out.writerow(row)
        
# note: Kaggle requires that you add "ID" and "category" column headers

###############################
# beating the random baseline #
###############################

# the following script gets an F1 score of approximately 0.66

# data loading and preprocessing 

# the columns of the data frame below are: 
# (1) paper unique ID (integer)
# (2) publication year (integer)
# (3) paper title (string)
# (4) authors (strings separated by ,)
# (5) name of journal (optional) (string)
# (6) abstract (string) - lowercased, free of punctuation except intra-word dashes

with open("data/training_set.txt", "r") as f:
    reader = csv.reader(f)
    training_set  = list(reader)

training_set = [element[0].split(" ") for element in training_set]

with open("data/node_information.csv", "r") as f:
    reader = csv.reader(f)
    node_info  = list(reader)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/delavergne/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/delavergne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<a id="2"></a>
## 2. Tuning the parameters

### Scaling the data

In [18]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
training_features = min_max_scaler.fit_transform(training_features)
testing_features = min_max_scaler.transform(testing_features)

### Tuning

In [19]:
from sklearn.model_selection import train_test_split
train, valid, y_train, y_valid,  = train_test_split(training_features, labels_array, test_size = 0.4) # with your dataset

In [20]:
y_valid.shape

(246205,)

In [21]:
from sklearn.metrics import f1_score
def f1_score_lgbm(preds, train_data):
    labels = train_data.get_label()
    tp = np.sum(labels[labels == 1] == (preds[labels == 1] > 0.5))
    tn = np.sum(labels[labels == 0] == (preds[labels == 0] > 0.5))
    fp = np.sum(labels[labels == 1] != (preds[labels == 1] > 0.5))
    fn = np.sum(labels[labels == 0] != (preds[labels == 0] > 0.5))
    p = tp / float(tp + fp)
    r = tp / float(tp + fn)

    return 'f1 score', (2 * p * r / (p + r)), False

In [22]:
import lightgbm as lgb

In [23]:
lgb_params = {
    'learning_rate': 0.1,
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary'}
# form LightGBM datasets
dtrain_lgb = lgb.Dataset(train, y_train)
deval_lgb = lgb.Dataset(valid, y_valid, reference=dtrain_lgb)

<a id="2a"></a>
### Max depth tuning

In [29]:
%%time
dtrain_lgb_full = lgb.Dataset(training_features, labels_array)
# Imagine now that you want to optimize num_leaves and
# learning_rate, and also use early stopping:
max_depth_choices = [2**i for i in range(6,0,-1)]


# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []

for depth_lv in max_depth_choices:
        print "Starting cv for :", depth_lv

        lgb_params = {
        'learning_rate': 0.07,
        'max_depth': depth_lv,
        'task': 'train', 
        'boosting_type': 'gbdt',
        'objective': 'binary'}
        validation_summary = lgb.cv(lgb_params, dtrain_lgb_full, num_boost_round=8000, # any high number will do
                                                                     nfold=5,
                                                                     metrics=["binary_logloss"],
                                                                     early_stopping_rounds=50, # Here it is
                                                                     verbose_eval=50)
        optimal_num_trees = len(validation_summary["binary_logloss-mean"])
            # Let's just add the optimal number of trees (chosen by early stopping)
            # to the hyperparameter dictionary:
        lgb_params["optimal_number_of_trees"] = optimal_num_trees

           # And we append results to cv_results:
        cv_results.append((lgb_params, validation_summary["binary_logloss-mean"][-1]))

Starting cv for : 64
[50]	cv_agg's binary_logloss: 0.0795454 + 0.000979466
[100]	cv_agg's binary_logloss: 0.0641691 + 0.00117504
[150]	cv_agg's binary_logloss: 0.0615719 + 0.00116461
[200]	cv_agg's binary_logloss: 0.0641916 + 0.00250378
Starting cv for : 32
[50]	cv_agg's binary_logloss: 0.0795454 + 0.000979466
[100]	cv_agg's binary_logloss: 0.0641691 + 0.00117504
[150]	cv_agg's binary_logloss: 0.0615719 + 0.00116461
[200]	cv_agg's binary_logloss: 0.0641916 + 0.00250378
Starting cv for : 16
[50]	cv_agg's binary_logloss: 0.0795454 + 0.000979466
[100]	cv_agg's binary_logloss: 0.064213 + 0.00118243
[150]	cv_agg's binary_logloss: 0.0615643 + 0.00127109
[200]	cv_agg's binary_logloss: 0.0625555 + 0.000981934
Starting cv for : 8
[50]	cv_agg's binary_logloss: 0.0796872 + 0.00100198
[100]	cv_agg's binary_logloss: 0.0646964 + 0.00119372
[150]	cv_agg's binary_logloss: 0.0618836 + 0.00117435
[200]	cv_agg's binary_logloss: 0.0614443 + 0.00163817
Starting cv for : 4
[50]	cv_agg's binary_logloss: 0.08

KeyboardInterrupt: 

In [30]:
cv_results

[({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 2,
   'metric': ['binary_logloss'],
   'objective': 'binary',
   'optimal_number_of_trees': 166,
   'task': 'train',
   'verbose': 1},
  0.061289759233353128),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 32,
   'metric': ['binary_logloss'],
   'objective': 'binary',
   'optimal_number_of_trees': 166,
   'task': 'train',
   'verbose': 1},
  0.061289759233353128),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 16,
   'metric': ['binary_logloss'],
   'objective': 'binary',
   'optimal_number_of_trees': 169,
   'task': 'train',
   'verbose': 1},
  0.061296296136130392),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 8,
   'metric': ['binary_logloss'],
   'objective': 'binary',
   'optimal_number_of_trees': 174,
   'task': 'train',
   'verbose': 1},
  0.061290540483624867),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 4,
   'm

In [31]:
%%time
dtrain_lgb_full = lgb.Dataset(training_features, labels_array)
# Imagine now that you want to optimize num_leaves and
# learning_rate, and also use early stopping:
max_depth_choices = [4, 5, 6, 7, 8]


# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []

for depth_lv in max_depth_choices:
    print "Starting cv for :", depth_lv
        
    lgb_params = {
    'learning_rate': 0.07,
    'max_depth': depth_lv,
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary'}
    validation_summary = lgb.cv(lgb_params,
                                                                 dtrain_lgb_full,
                                                                 num_boost_round=4000, # any high number will do
                                                                 nfold=5,
                                                                 metrics=["binary_logloss"],
                                                                
                                                                 early_stopping_rounds=50, # Here it is
                                                                 verbose_eval=50)
    optimal_num_trees = len(validation_summary["binary_logloss-mean"])
        # Let's just add the optimal number of trees (chosen by early stopping)
        # to the hyperparameter dictionary:
    lgb_params["optimal_number_of_trees"] = optimal_num_trees

       # And we append results to cv_results:
    cv_results.append((lgb_params, validation_summary["binary_logloss-mean"][-1]))

Starting cv for : 4
[50]	cv_agg's binary_logloss: 0.0871718 + 0.000924393
[100]	cv_agg's binary_logloss: 0.0691146 + 0.00111743
[150]	cv_agg's binary_logloss: 0.0652982 + 0.00117947
[200]	cv_agg's binary_logloss: 0.063351 + 0.00125922
[250]	cv_agg's binary_logloss: 0.06209 + 0.00132284
[300]	cv_agg's binary_logloss: 0.0611852 + 0.00128709
[350]	cv_agg's binary_logloss: 0.0605569 + 0.00126531
[400]	cv_agg's binary_logloss: 0.0601424 + 0.00129053
[450]	cv_agg's binary_logloss: 0.0599214 + 0.00139953
[500]	cv_agg's binary_logloss: 0.0609497 + 0.00368726
[550]	cv_agg's binary_logloss: 0.0594651 + 0.00153832
[600]	cv_agg's binary_logloss: 0.0592846 + 0.00156972
[650]	cv_agg's binary_logloss: 0.0591398 + 0.00159503
[700]	cv_agg's binary_logloss: 0.059179 + 0.0014697
Starting cv for : 5
[50]	cv_agg's binary_logloss: 0.0830615 + 0.000934899
[100]	cv_agg's binary_logloss: 0.0668972 + 0.0011535
[150]	cv_agg's binary_logloss: 0.0633273 + 0.00120217
[200]	cv_agg's binary_logloss: 0.0616516 + 0.001

In [32]:
cv_results

[({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 8,
   'metric': ['binary_logloss'],
   'objective': 'binary',
   'optimal_number_of_trees': 652,
   'task': 'train',
   'verbose': 1},
  0.059134232722921717),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'objective': 'binary',
   'optimal_number_of_trees': 863,
   'task': 'train',
   'verbose': 1},
  0.058714710889308762),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 6,
   'metric': ['binary_logloss'],
   'objective': 'binary',
   'optimal_number_of_trees': 390,
   'task': 'train',
   'verbose': 1},
  0.060069403929643009),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 7,
   'metric': ['binary_logloss'],
   'objective': 'binary',
   'optimal_number_of_trees': 185,
   'task': 'train',
   'verbose': 1},
  0.061276455942056052),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 8,
   'met

<a id="2b"></a>
### Number of leaves tuning

In [36]:
%%time
dtrain_lgb_full = lgb.Dataset(training_features, labels_array)
# Imagine now that you want to optimize num_leaves and
# learning_rate, and also use early stopping:
#num_leaves = [2**i for i in range(3,9)]
num_leaves = [i for i in range(26,39,2)]

# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []

for leave in num_leaves:
        print "Starting cv for :", leave

        lgb_params = {
        'learning_rate': 0.07,
        'max_depth': 5,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'num_leaves':leave}
        validation_summary = lgb.cv(lgb_params,
                                                                     dtrain_lgb_full,
                                                                     num_boost_round=4000, # any high number will do
                                                                     nfold=5,
                                                                     metrics=["binary_logloss"],

                                                                     early_stopping_rounds=50, # Here it is
                                                                     verbose_eval=50)
        optimal_num_trees = len(validation_summary["binary_logloss-mean"])
            # Let's just add the optimal number of trees (chosen by early stopping)
            # to the hyperparameter dictionary:
        lgb_params["optimal_number_of_trees"] = optimal_num_trees

           # And we append results to cv_results:
        cv_results.append((lgb_params, validation_summary["binary_logloss-mean"][-1]))

Starting cv for : 26
[50]	cv_agg's binary_logloss: 0.0831885 + 0.000873361
[100]	cv_agg's binary_logloss: 0.0669458 + 0.00106731
[150]	cv_agg's binary_logloss: 0.0634273 + 0.00120498
[200]	cv_agg's binary_logloss: 0.0618051 + 0.00123928
[250]	cv_agg's binary_logloss: 0.0609004 + 0.00115452
[300]	cv_agg's binary_logloss: 0.0602483 + 0.00123748
[350]	cv_agg's binary_logloss: 0.0598104 + 0.001191
[400]	cv_agg's binary_logloss: 0.0596042 + 0.00127749
[450]	cv_agg's binary_logloss: 0.0595659 + 0.00125506
Starting cv for : 28
[50]	cv_agg's binary_logloss: 0.0831345 + 0.000933579
[100]	cv_agg's binary_logloss: 0.0669359 + 0.00105064
[150]	cv_agg's binary_logloss: 0.0633947 + 0.00111039
[200]	cv_agg's binary_logloss: 0.0618126 + 0.00120395
[250]	cv_agg's binary_logloss: 0.060867 + 0.00114495
[300]	cv_agg's binary_logloss: 0.0602526 + 0.0012272
[350]	cv_agg's binary_logloss: 0.0598436 + 0.00130651
[400]	cv_agg's binary_logloss: 0.0595814 + 0.00126024
[450]	cv_agg's binary_logloss: 0.0594444 + 0

In [37]:
cv_results

[({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'num_leaves': 38,
   'objective': 'binary',
   'optimal_number_of_trees': 426,
   'task': 'train',
   'verbose': 1},
  0.059481169197700981),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'num_leaves': 28,
   'objective': 'binary',
   'optimal_number_of_trees': 485,
   'task': 'train',
   'verbose': 1},
  0.059409553708127463),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 600,
   'task': 'train',
   'verbose': 1},
  0.059089151134795105),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'num_leaves': 32,
   'objective': 'binary',
   'optimal_number_of_trees': 469,
   'task': 'train',
   'verbose': 1},
  0.05934026100142694

<a id="2c"></a>
### Minimum data in leaves tuning

In [38]:
%%time
dtrain_lgb_full = lgb.Dataset(training_features, labels_array)
# Imagine now that you want to optimize num_leaves and
# learning_rate, and also use early stopping:
min_data_leaves = [i for i in range(100,1001,100)]


# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []

for data_leaf in min_data_leaves:
        print "Starting cv for :", data_leaf

        lgb_params = {
        'learning_rate': 0.07,
        'max_depth': 5,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'num_leaves':30,
        'min_data_in_leaf':data_leaf}
        validation_summary = lgb.cv(lgb_params,
                                                                     dtrain_lgb_full,
                                                                     num_boost_round=4000, # any high number will do
                                                                     nfold=5,
                                                                     metrics=["binary_logloss"],

                                                                     early_stopping_rounds=50, # Here it is
                                                                     verbose_eval=50)
        optimal_num_trees = len(validation_summary["binary_logloss-mean"])
            # Let's just add the optimal number of trees (chosen by early stopping)
            # to the hyperparameter dictionary:
        lgb_params["optimal_number_of_trees"] = optimal_num_trees

           # And we append results to cv_results:
        cv_results.append((lgb_params, validation_summary["binary_logloss-mean"][-1]))

Starting cv for : 100
[50]	cv_agg's binary_logloss: 0.0831346 + 0.000967489
[100]	cv_agg's binary_logloss: 0.0668719 + 0.00115047
[150]	cv_agg's binary_logloss: 0.0632964 + 0.00114442
[200]	cv_agg's binary_logloss: 0.0615439 + 0.0012028
[250]	cv_agg's binary_logloss: 0.0604831 + 0.00120993
[300]	cv_agg's binary_logloss: 0.0598046 + 0.0012104
[350]	cv_agg's binary_logloss: 0.0593307 + 0.00121701
[400]	cv_agg's binary_logloss: 0.0589858 + 0.00121954
[450]	cv_agg's binary_logloss: 0.0587705 + 0.00120732
[500]	cv_agg's binary_logloss: 0.0585762 + 0.00121146
[550]	cv_agg's binary_logloss: 0.0584311 + 0.00122463
[600]	cv_agg's binary_logloss: 0.05833 + 0.00122927
[650]	cv_agg's binary_logloss: 0.0582582 + 0.00123105
[700]	cv_agg's binary_logloss: 0.0582104 + 0.00122362
[750]	cv_agg's binary_logloss: 0.0581822 + 0.0012271
[800]	cv_agg's binary_logloss: 0.0581805 + 0.00122812
Starting cv for : 200
[50]	cv_agg's binary_logloss: 0.0831769 + 0.000920764
[100]	cv_agg's binary_logloss: 0.0669205 + 

[400]	cv_agg's binary_logloss: 0.0590539 + 0.00129524
[450]	cv_agg's binary_logloss: 0.0587889 + 0.0013012
[500]	cv_agg's binary_logloss: 0.0586146 + 0.00129328
[550]	cv_agg's binary_logloss: 0.058465 + 0.00129035
[600]	cv_agg's binary_logloss: 0.0583434 + 0.00128287
[650]	cv_agg's binary_logloss: 0.058274 + 0.00128325
[700]	cv_agg's binary_logloss: 0.0582073 + 0.00129511
[750]	cv_agg's binary_logloss: 0.0581597 + 0.00132433
[800]	cv_agg's binary_logloss: 0.0581181 + 0.00131579
[850]	cv_agg's binary_logloss: 0.0580805 + 0.00131713
[900]	cv_agg's binary_logloss: 0.0580926 + 0.00132517
Starting cv for : 1000
[50]	cv_agg's binary_logloss: 0.0836808 + 0.000876187
[100]	cv_agg's binary_logloss: 0.0672423 + 0.00105487
[150]	cv_agg's binary_logloss: 0.063705 + 0.00118132
[200]	cv_agg's binary_logloss: 0.0618975 + 0.0012338
[250]	cv_agg's binary_logloss: 0.0607613 + 0.00125723
[300]	cv_agg's binary_logloss: 0.0599894 + 0.00127334
[350]	cv_agg's binary_logloss: 0.0594613 + 0.00125411
[400]	cv_a

In [39]:
cv_results

[({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 1000,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 784,
   'task': 'train',
   'verbose': 1},
  0.05816980055568273),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 200,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 828,
   'task': 'train',
   'verbose': 1},
  0.058212185036541429),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 300,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 840,
   'task': 'train',
   'verbose': 1},
  0.058104363191568219),
 ({'boosting_type': 'gbdt',
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 400,
   'num_leaves': 30,
   'obje

<a id="2d"></a>
### Lasso regularization tuning

In [41]:
%%time
dtrain_lgb_full = lgb.Dataset(training_features, labels_array)
# Imagine now that you want to optimize num_leaves and
# learning_rate, and also use early stopping:
reg_lambda = [i/10.0 for i in range(0,11,1)]


# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []

for lam in reg_lambda:
    print "Starting cv for :", lam
        
    lgb_params = {
        'learning_rate': 0.07,
        'max_depth': 5,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'num_leaves':30,
        'min_data_in_leaf':600,
        'lambda_l1':lam}
    validation_summary = lgb.cv(lgb_params,
                                                                 dtrain_lgb_full,
                                                                 num_boost_round=4000, # any high number will do
                                                                 nfold=5,
                                                                 metrics=["binary_logloss"],
                                                                
                                                                 early_stopping_rounds=50, # Here it is
                                                                 verbose_eval=100)
    optimal_num_trees = len(validation_summary["binary_logloss-mean"])
        # Let's just add the optimal number of trees (chosen by early stopping)
        # to the hyperparameter dictionary:
    lgb_params["optimal_number_of_trees"] = optimal_num_trees

       # And we append results to cv_results:
    cv_results.append((lgb_params, validation_summary["binary_logloss-mean"][-1]))

Starting cv for : 0.0
[100]	cv_agg's binary_logloss: 0.0671178 + 0.00105531
[200]	cv_agg's binary_logloss: 0.0617242 + 0.00122836
[300]	cv_agg's binary_logloss: 0.0598658 + 0.00122819
[400]	cv_agg's binary_logloss: 0.0589813 + 0.00121171
[500]	cv_agg's binary_logloss: 0.0585415 + 0.00125062
[600]	cv_agg's binary_logloss: 0.0583002 + 0.00125958
[700]	cv_agg's binary_logloss: 0.0581346 + 0.0012652
[800]	cv_agg's binary_logloss: 0.0580853 + 0.00125172
Starting cv for : 0.1
[100]	cv_agg's binary_logloss: 0.0671052 + 0.00113217
[200]	cv_agg's binary_logloss: 0.0617581 + 0.00122411
[300]	cv_agg's binary_logloss: 0.0598881 + 0.00125013
[400]	cv_agg's binary_logloss: 0.0590422 + 0.00125553
[500]	cv_agg's binary_logloss: 0.0585901 + 0.00126743
[600]	cv_agg's binary_logloss: 0.0583502 + 0.00127287
[700]	cv_agg's binary_logloss: 0.0582019 + 0.00128245
[800]	cv_agg's binary_logloss: 0.0581483 + 0.00128187
Starting cv for : 0.2
[100]	cv_agg's binary_logloss: 0.0671398 + 0.00108318
[200]	cv_agg's bi

In [42]:
cv_results

[({'boosting_type': 'gbdt',
   'lambda_l1': 1.0,
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 600,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 842,
   'task': 'train',
   'verbose': 1},
  0.058069880724337761),
 ({'boosting_type': 'gbdt',
   'lambda_l1': 0.1,
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 600,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 781,
   'task': 'train',
   'verbose': 1},
  0.058136296830961463),
 ({'boosting_type': 'gbdt',
   'lambda_l1': 0.2,
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 600,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 853,
   'task': 'train',
   'verbose': 1},
  0.058028098980590102),
 ({'boosting_type': 'gbdt',
   'lambda_l1': 0.3,
   'learning_rate': 0.07,
   'max_depth': 5,
   'me

<a id="2e"></a>
### Ridge regularization tuning

In [43]:
%%time
dtrain_lgb_full = lgb.Dataset(training_features, labels_array)
# Imagine now that you want to optimize num_leaves and
# learning_rate, and also use early stopping:
reg_lambda = [i/10.0 for i in range(0,11,2)]


# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []

for lam in reg_lambda:
    print "Starting cv for :", lam
        
    lgb_params = {
        'learning_rate': 0.07,
        'max_depth': 5,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'num_leaves':30,
        'min_data_in_leaf':600,
        'lambda_l1': 0.7,
        'lambda_l2':lam}
    validation_summary = lgb.cv(lgb_params,
                                                                 dtrain_lgb_full,
                                                                 num_boost_round=4000, # any high number will do
                                                                 nfold=5,
                                                                 metrics=["binary_logloss"],
                                                                
                                                                 early_stopping_rounds=50, # Here it is
                                                                 verbose_eval=100)
    optimal_num_trees = len(validation_summary["binary_logloss-mean"])
        # Let's just add the optimal number of trees (chosen by early stopping)
        # to the hyperparameter dictionary:
    lgb_params["optimal_number_of_trees"] = optimal_num_trees

       # And we append results to cv_results:
    cv_results.append((lgb_params, validation_summary["binary_logloss-mean"][-1]))

Starting cv for : 0.0
[100]	cv_agg's binary_logloss: 0.0670936 + 0.00111595
[200]	cv_agg's binary_logloss: 0.0617358 + 0.00123353
[300]	cv_agg's binary_logloss: 0.0598369 + 0.00124109
[400]	cv_agg's binary_logloss: 0.0589612 + 0.00126699
[500]	cv_agg's binary_logloss: 0.0584838 + 0.00125933
[600]	cv_agg's binary_logloss: 0.0582112 + 0.00127688
[700]	cv_agg's binary_logloss: 0.0580769 + 0.00123661
[800]	cv_agg's binary_logloss: 0.0580311 + 0.00124117
[900]	cv_agg's binary_logloss: 0.0580033 + 0.00124169
Starting cv for : 0.2
[100]	cv_agg's binary_logloss: 0.067141 + 0.00111777
[200]	cv_agg's binary_logloss: 0.0617542 + 0.00122671
[300]	cv_agg's binary_logloss: 0.0598457 + 0.00126481
[400]	cv_agg's binary_logloss: 0.0589388 + 0.00128197
[500]	cv_agg's binary_logloss: 0.0584603 + 0.00128002
[600]	cv_agg's binary_logloss: 0.0582161 + 0.00126943
[700]	cv_agg's binary_logloss: 0.0580948 + 0.00127072
[800]	cv_agg's binary_logloss: 0.0580352 + 0.00125895
Starting cv for : 0.4
[100]	cv_agg's bi

In [44]:
cv_results

[({'boosting_type': 'gbdt',
   'lambda_l1': 0.7,
   'lambda_l2': 1.0,
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 600,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 865,
   'task': 'train',
   'verbose': 1},
  0.057998031378193392),
 ({'boosting_type': 'gbdt',
   'lambda_l1': 0.7,
   'lambda_l2': 0.2,
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 600,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 830,
   'task': 'train',
   'verbose': 1},
  0.058028643995671425),
 ({'boosting_type': 'gbdt',
   'lambda_l1': 0.7,
   'lambda_l2': 0.4,
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 600,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 851,
   'task': 'train',
   'verbose': 1},
  0.058025311309216107),
 ({'boosting_type': 'gbdt',
   'lambd

<a id="2f"></a>
### Max bin tuning

In [37]:
%%time
dtrain_lgb_full = lgb.Dataset(training_features, labels_array)
# Imagine now that you want to optimize num_leaves and
# learning_rate, and also use early stopping:
bins = [i for i in range(100,251,50)]


# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []

for max_bi in bins:
    print "Starting cv for :", max_bi
        
    lgb_params = {
    'learning_rate': 0.1,
        'max_depth': 8,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'num_leaves':256,
        'min_data_in_leaf':900,
        'lambda_l1': 0.8,
        'lambda_l2':0.4,
        'max_bin':max_bi}
    validation_summary = lgb.cv(lgb_params,
                                                                 dtrain_lgb_full,
                                                                 num_boost_round=4000, # any high number will do
                                                                 nfold=5,
                                                                 metrics=["binary_logloss"],
                                                                
                                                                 early_stopping_rounds=50, # Here it is
                                                                 verbose_eval=20)
    optimal_num_trees = len(validation_summary["binary_logloss-mean"])
        # Let's just add the optimal number of trees (chosen by early stopping)
        # to the hyperparameter dictionary:
    lgb_params["optimal_number_of_trees"] = optimal_num_trees

       # And we append results to cv_results:
    cv_results.append((lgb_params, validation_summary["binary_logloss-mean"][-1]))

Starting cv for : 200
[20]	cv_agg's binary_logloss: 0.0912299 + 0.000200594
[40]	cv_agg's binary_logloss: 0.0307431 + 0.000258411
[60]	cv_agg's binary_logloss: 0.0202871 + 0.00019821
[80]	cv_agg's binary_logloss: 0.0171365 + 0.000198929
[100]	cv_agg's binary_logloss: 0.0162793 + 0.000202121
[120]	cv_agg's binary_logloss: 0.0158437 + 0.00021699
[140]	cv_agg's binary_logloss: 0.0156268 + 0.000208391
[160]	cv_agg's binary_logloss: 0.0154985 + 0.000232281
[180]	cv_agg's binary_logloss: 0.0154395 + 0.000249458
[200]	cv_agg's binary_logloss: 0.0154121 + 0.000244705
[220]	cv_agg's binary_logloss: 0.0153975 + 0.000251805
[240]	cv_agg's binary_logloss: 0.0153922 + 0.000257281
[260]	cv_agg's binary_logloss: 0.01538 + 0.000251256
[280]	cv_agg's binary_logloss: 0.0153969 + 0.00027378
[300]	cv_agg's binary_logloss: 0.0154232 + 0.000285013
Starting cv for : 250
[20]	cv_agg's binary_logloss: 0.0912299 + 0.000200594
[40]	cv_agg's binary_logloss: 0.0307431 + 0.000258411
[60]	cv_agg's binary_logloss: 0.

KeyboardInterrupt: 

In [87]:
cv_results

[({'bagging_fraction': 0.75,
   'boosting_type': 'gbdt',
   'feature_fraction': 0.75,
   'lambda_l1': 0.6,
   'learning_rate': 0.1,
   'max_bin': 100,
   'max_depth': 14,
   'metric': ['binary_logloss'],
   'num_leaves': 256,
   'objective': 'binary',
   'optimal_number_of_trees': 121,
   'task': 'train',
   'verbose': 1},
  0.015839546633335075),
 ({'bagging_fraction': 0.75,
   'boosting_type': 'gbdt',
   'feature_fraction': 0.75,
   'lambda_l1': 0.6,
   'learning_rate': 0.1,
   'max_bin': 100,
   'max_depth': 14,
   'metric': ['binary_logloss'],
   'num_leaves': 64,
   'objective': 'binary',
   'optimal_number_of_trees': 121,
   'task': 'train',
   'verbose': 1},
  0.015666990180683637),
 ({'bagging_fraction': 0.75,
   'boosting_type': 'gbdt',
   'feature_fraction': 0.75,
   'lambda_l1': 0.6,
   'learning_rate': 0.1,
   'max_bin': 100,
   'max_depth': 14,
   'metric': ['binary_logloss'],
   'num_leaves': 128,
   'objective': 'binary',
   'optimal_number_of_trees': 141,
   'task': 'tr

<a id="2g"></a>
### Feature fraction tuning

In [45]:
%%time
dtrain_lgb_full = lgb.Dataset(training_features, labels_array)
# Imagine now that you want to optimize num_leaves and
# learning_rate, and also use early stopping:
feat_frac = [i/100.0 for i in range(75,101,5)]


# We will store the cross validation results in a simple list,
# with tuples in the form of (hyperparam dict, cv score):
cv_results = []

for frac in feat_frac:
    print "Starting cv for :", frac
        
    lgb_params = {
    'learning_rate': 0.07,
        'max_depth': 5,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'feature_fraction': frac,
        'num_leaves':30,
        'min_data_in_leaf':600,
        'lambda_l1': 0.7,
        'lambda_l2':0.6}
    validation_summary = lgb.cv(lgb_params,
                                                                 dtrain_lgb_full,
                                                                 num_boost_round=4000, # any high number will do
                                                                 nfold=5,
                                                                 metrics=["binary_logloss"],
                                                                
                                                                 early_stopping_rounds=50, # Here it is
                                                                 verbose_eval=100)
    optimal_num_trees = len(validation_summary["binary_logloss-mean"])
        # Let's just add the optimal number of trees (chosen by early stopping)
        # to the hyperparameter dictionary:
    lgb_params["optimal_number_of_trees"] = optimal_num_trees

       # And we append results to cv_results:
    cv_results.append((lgb_params, validation_summary["binary_logloss-mean"][-1]))

Starting cv for : 0.75
[100]	cv_agg's binary_logloss: 0.0669939 + 0.00108001
[200]	cv_agg's binary_logloss: 0.0617569 + 0.00117524
[300]	cv_agg's binary_logloss: 0.0598 + 0.00123877
[400]	cv_agg's binary_logloss: 0.0588969 + 0.00126617
[500]	cv_agg's binary_logloss: 0.0584237 + 0.0012595
[600]	cv_agg's binary_logloss: 0.0581457 + 0.00127014
[700]	cv_agg's binary_logloss: 0.0580075 + 0.00128115
[800]	cv_agg's binary_logloss: 0.0579343 + 0.00129584
[900]	cv_agg's binary_logloss: 0.0579044 + 0.00128929
Starting cv for : 0.8
[100]	cv_agg's binary_logloss: 0.0670704 + 0.00108174
[200]	cv_agg's binary_logloss: 0.06172 + 0.00119799
[300]	cv_agg's binary_logloss: 0.0598015 + 0.00121759
[400]	cv_agg's binary_logloss: 0.0588911 + 0.00123391
[500]	cv_agg's binary_logloss: 0.0584094 + 0.00125453
[600]	cv_agg's binary_logloss: 0.0581549 + 0.00126165
[700]	cv_agg's binary_logloss: 0.0580025 + 0.0012601
[800]	cv_agg's binary_logloss: 0.0579299 + 0.00128427
[900]	cv_agg's binary_logloss: 0.057897 + 0.

In [46]:
cv_results

[({'boosting_type': 'gbdt',
   'feature_fraction': 1.0,
   'lambda_l1': 0.7,
   'lambda_l2': 0.6,
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 600,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 868,
   'task': 'train',
   'verbose': 1},
  0.057900598062077167),
 ({'boosting_type': 'gbdt',
   'feature_fraction': 0.8,
   'lambda_l1': 0.7,
   'lambda_l2': 0.6,
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 600,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 887,
   'task': 'train',
   'verbose': 1},
  0.057892973805677386),
 ({'boosting_type': 'gbdt',
   'feature_fraction': 0.85,
   'lambda_l1': 0.7,
   'lambda_l2': 0.6,
   'learning_rate': 0.07,
   'max_depth': 5,
   'metric': ['binary_logloss'],
   'min_data_in_leaf': 600,
   'num_leaves': 30,
   'objective': 'binary',
   'optimal_number_of_trees': 893,
   'task': 'tra

<a id="3"></a>
## 3. Defining the best parameters

In [48]:
lgb_params = {
    'learning_rate': 0.07,
        'max_depth': 5,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'feature_fraction': 0.8,
        'num_leaves':30,
        'min_data_in_leaf':600,
        'lambda_l1': 0.7,
        'lambda_l2':0.6}

### Choice of the optimal number of boosting rounds with early stopping

In [57]:
validation_summary = lgb.cv(lgb_params,
                                                                 dtrain_lgb_full,
                                                                 num_boost_round=10000, # any high number will do
                                                                 nfold=5,
                                                                 metrics=["binary_logloss"],
                                                                
                                                                 early_stopping_rounds=50, # Here it is
                                                                 verbose_eval=50)
optimal_num_trees = len(validation_summary["binary_logloss-mean"])
print optimal_num_trees

[50]	cv_agg's binary_logloss: 0.384789 + 9.68792e-05
[100]	cv_agg's binary_logloss: 0.232227 + 0.000163719
[150]	cv_agg's binary_logloss: 0.145945 + 0.000184383
[200]	cv_agg's binary_logloss: 0.0967113 + 0.000195211
[250]	cv_agg's binary_logloss: 0.0674862 + 0.000221474
[300]	cv_agg's binary_logloss: 0.0503866 + 0.000230637
[350]	cv_agg's binary_logloss: 0.0389355 + 0.00023761
[400]	cv_agg's binary_logloss: 0.0314532 + 0.000238527
[450]	cv_agg's binary_logloss: 0.0265072 + 0.000221055
[500]	cv_agg's binary_logloss: 0.0235413 + 0.000210612
[550]	cv_agg's binary_logloss: 0.0215375 + 0.000189209
[600]	cv_agg's binary_logloss: 0.019959 + 0.000174961
[650]	cv_agg's binary_logloss: 0.0188688 + 0.000165177
[700]	cv_agg's binary_logloss: 0.0181695 + 0.000159844
[750]	cv_agg's binary_logloss: 0.017615 + 0.000163786
[800]	cv_agg's binary_logloss: 0.0171738 + 0.000166393
[850]	cv_agg's binary_logloss: 0.016825 + 0.000166354
[900]	cv_agg's binary_logloss: 0.0165705 + 0.000166443
[950]	cv_agg's bin