# Porto Seguro dataset

## Part 1: Data encoding

### Import libraries

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import xgboost
import copy
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [3]:
# Switch to the project root so the relative "data/..." paths used by the
# loading cells resolve. The location can be overridden by setting the
# PORTO_SEGURO_PROJECT_DIR environment variable; when it is unset, the
# original hard-coded checkout path is used, so existing runs are unchanged.
os.chdir(os.environ.get('PORTO_SEGURO_PROJECT_DIR',
                        '/home/tai/Projects/research-project-Roland'))

### Load train and test data

In [4]:
# Read the training split. Column names are passed explicitly, so pandas
# treats the first line of the file as data rather than as a header, and
# every '?' entry is parsed as a missing value.
train = pd.read_csv(
    "data/porto_seguro/porto_seguro.0.train",
    names=[
        # row identifier and label
        'id', 'target',
        # ps_ind_* columns
        'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
        'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
        'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
        'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
        'ps_ind_17_bin', 'ps_ind_18_bin',
        # ps_reg_* columns
        'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
        # ps_car_* columns
        'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
        'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
        'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
        'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15',
        # ps_calc_* columns
        'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04',
        'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08',
        'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12',
        'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin', 'ps_calc_16_bin',
        'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
        'ps_calc_20_bin',
    ],
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

In [5]:
# Display the first ten rows of the training frame as a quick sanity check.
train.head(10)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,169399,1,1,3.0,9,0.0,0.0,0,0,0,...,6,3,1,6,0,1,0,0,1,0
1,146949,1,7,1.0,6,0.0,6.0,0,1,0,...,2,3,5,8,0,1,0,0,0,1
2,54918,0,1,1.0,4,0.0,3.0,1,0,0,...,9,2,0,6,0,1,0,0,0,0
3,1160514,0,0,1.0,11,0.0,0.0,0,0,0,...,5,1,0,6,0,1,1,1,1,0
4,48624,1,6,1.0,8,1.0,0.0,0,0,1,...,4,4,6,6,0,1,1,0,1,0
5,304385,1,3,4.0,7,1.0,0.0,0,1,0,...,11,1,2,2,0,1,0,0,1,1
6,1083903,1,0,2.0,0,0.0,0.0,0,1,0,...,10,0,1,8,1,0,1,0,1,0
7,605921,0,5,1.0,4,1.0,0.0,0,1,0,...,8,1,2,6,0,1,0,0,1,1
8,1058435,1,1,1.0,3,1.0,0.0,0,0,1,...,1,4,5,5,0,1,1,1,1,0
9,648880,1,3,1.0,6,0.0,0.0,0,1,0,...,5,4,2,8,0,1,1,1,1,0


In [6]:
# Read the test split with the same explicit column names and the same
# missing-value marker ('?') that are used for the training split.
test = pd.read_csv(
    "data/porto_seguro/porto_seguro.0.test",
    names=[
        # row identifier and label
        'id', 'target',
        # ps_ind_* columns
        'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
        'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
        'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
        'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
        'ps_ind_17_bin', 'ps_ind_18_bin',
        # ps_reg_* columns
        'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
        # ps_car_* columns
        'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
        'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
        'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
        'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15',
        # ps_calc_* columns
        'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04',
        'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08',
        'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12',
        'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin', 'ps_calc_16_bin',
        'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
        'ps_calc_20_bin',
    ],
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

In [7]:
# Count missing values per column in the test split.
test.isna().sum()

id                    0
target                0
ps_ind_01             0
ps_ind_02_cat        22
ps_ind_03             0
ps_ind_04_cat        16
ps_ind_05_cat       355
ps_ind_06_bin         0
ps_ind_07_bin         0
ps_ind_08_bin         0
ps_ind_09_bin         0
ps_ind_10_bin         0
ps_ind_11_bin         0
ps_ind_12_bin         0
ps_ind_13_bin         0
ps_ind_14             0
ps_ind_15             0
ps_ind_16_bin         0
ps_ind_17_bin         0
ps_ind_18_bin         0
ps_reg_01             0
ps_reg_02             0
ps_reg_03          3489
ps_car_01_cat        17
ps_car_02_cat         0
ps_car_03_cat     14249
ps_car_04_cat         0
ps_car_05_cat      9105
ps_car_06_cat         0
ps_car_07_cat       674
ps_car_08_cat         0
ps_car_09_cat        36
ps_car_10_cat         0
ps_car_11_cat         0
ps_car_11             0
ps_car_12             0
ps_car_13             0
ps_car_14          1618
ps_car_15             0
ps_calc_01            0
ps_calc_02            0
ps_calc_03      

In [8]:
# Count missing values per column in the training split.
train.isna().sum()

id                    0
target                0
ps_ind_01             0
ps_ind_02_cat        26
ps_ind_03             0
ps_ind_04_cat        19
ps_ind_05_cat       315
ps_ind_06_bin         0
ps_ind_07_bin         0
ps_ind_08_bin         0
ps_ind_09_bin         0
ps_ind_10_bin         0
ps_ind_11_bin         0
ps_ind_12_bin         0
ps_ind_13_bin         0
ps_ind_14             0
ps_ind_15             0
ps_ind_16_bin         0
ps_ind_17_bin         0
ps_ind_18_bin         0
ps_reg_01             0
ps_reg_02             0
ps_reg_03          3560
ps_car_01_cat        19
ps_car_02_cat         0
ps_car_03_cat     14335
ps_car_04_cat         0
ps_car_05_cat      9173
ps_car_06_cat         0
ps_car_07_cat       615
ps_car_08_cat         0
ps_car_09_cat        37
ps_car_10_cat         0
ps_car_11_cat         0
ps_car_11             1
ps_car_12             0
ps_car_13             0
ps_car_14          1623
ps_car_15             0
ps_calc_01            0
ps_calc_02            0
ps_calc_03      

In [9]:
# Print the dtype and non-null count of every training column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21696 entries, 0 to 21695
Data columns (total 59 columns):
id                21696 non-null int64
target            21696 non-null int64
ps_ind_01         21696 non-null int64
ps_ind_02_cat     21670 non-null float64
ps_ind_03         21696 non-null int64
ps_ind_04_cat     21677 non-null float64
ps_ind_05_cat     21381 non-null float64
ps_ind_06_bin     21696 non-null int64
ps_ind_07_bin     21696 non-null int64
ps_ind_08_bin     21696 non-null int64
ps_ind_09_bin     21696 non-null int64
ps_ind_10_bin     21696 non-null int64
ps_ind_11_bin     21696 non-null int64
ps_ind_12_bin     21696 non-null int64
ps_ind_13_bin     21696 non-null int64
ps_ind_14         21696 non-null int64
ps_ind_15         21696 non-null int64
ps_ind_16_bin     21696 non-null int64
ps_ind_17_bin     21696 non-null int64
ps_ind_18_bin     21696 non-null int64
ps_reg_01         21696 non-null float64
ps_reg_02         21696 non-null float64
ps_reg_03         18136

### One hot encoding

In [10]:
# train = pd.get_dummies(train, columns=[ 'ps_car_02_cat',
#                                          'ps_car_04_cat',
#                                          'ps_car_06_cat',
#                                          'ps_car_08_cat',
#                                          'ps_car_10_cat',
#                                          'ps_car_11_cat'])
# train = pd.get_dummies(train, columns=[
#     'ps_ind_02_cat',
#     'ps_ind_04_cat',
#     'ps_ind_05_cat',
#     'ps_car_01_cat',
#     'ps_car_03_cat',
#     'ps_car_05_cat',
#     'ps_car_07_cat',
#     'ps_car_09_cat'
# ], dummy_na=True)

In [11]:
# test = pd.get_dummies(test, columns=[ 'ps_car_02_cat',
#                                          'ps_car_04_cat',
#                                          'ps_car_06_cat',
#                                          'ps_car_08_cat',
#                                          'ps_car_10_cat',
#                                          'ps_car_11_cat'])
# test = pd.get_dummies(test, columns=[
#     'ps_ind_02_cat',
#     'ps_ind_04_cat',
#     'ps_ind_05_cat',
#     'ps_car_01_cat',
#     'ps_car_03_cat',
#     'ps_car_05_cat',
#     'ps_car_07_cat',
#     'ps_car_09_cat'
# ], dummy_na=True)

In [12]:
# Split each frame into a feature matrix and a label vector. The 'target'
# and 'id' columns are excluded from the features.
X_train, y_train = train.drop(columns=['target', 'id']), train['target']
X_test, y_test = test.drop(columns=['target', 'id']), test['target']

In [13]:
# X_train, X_test = X_train.align(X_test, join='outer', fill_value=0, axis=1)

In [14]:
# (rows, feature columns) of the training feature matrix.
X_train.shape

(21696, 57)

In [15]:
# (rows, feature columns) of the test feature matrix.
X_test.shape

(21696, 57)

In [16]:
# List the feature names of the training matrix.
X_train.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='obj

In [17]:
# List the feature names of the test matrix.
X_test.columns

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='obj

## Part 2: Tuning on train data

### Find optimal n_estimators

In [18]:
# Starting configuration for the booster. The estimator is only used here as
# a convenient way to obtain a native xgboost parameter dictionary.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
)

xgtrain = xgboost.DMatrix(X_train, label=y_train)
xgb_param = xgb.get_xgb_params()

# Stratified 5-fold CV on classification error with up to 5000 boosting
# rounds and early_stopping_rounds=50. The returned per-round table is the
# cell's displayed output.
xgboost.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.4148,0.00225,0.435887,0.003168
1,0.407898,0.00272,0.430678,0.003672
2,0.401756,0.001675,0.423995,0.003579
3,0.397147,0.001263,0.423764,0.004604
4,0.395303,0.001451,0.42146,0.005344
5,0.392987,0.003064,0.419939,0.005495
6,0.390222,0.002785,0.417773,0.004257
7,0.387698,0.002863,0.419754,0.003935
8,0.385658,0.003625,0.416759,0.002964
9,0.38378,0.005493,0.416943,0.00471


### Tuning max_depth and min_child_weight

In [19]:
# Coarse grid over max_depth and min_child_weight, repeated NUM_TRIALS times
# with differently shuffled CV folds so the choice does not hinge on a
# single split.

# Repeat count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1 = {
    'max_depth': range(1, 10, 2),
    'min_child_weight': range(1, 250, 50),
}
# One row per parameter combination; one 'mean_test_score_<trial>' column per trial.
grid_score1 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # max_depth / min_child_weight below are placeholders: GridSearchCV
    # overrides them with every combination from param_test1.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=49,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # The fold assignment changes with the trial index.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1 = GridSearchCV(estimator=xgb,
                            param_grid=param_test1,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch1.fit(X_train, y_train)
    if grid_score1.empty:
        # First trial: keep the parameter column alongside the scores.
        grid_score1 = pd.DataFrame(gsearch1.cv_results_, columns=['params', 'mean_test_score'])
        grid_score1.columns = ['params', 'mean_test_score_0']
    else:
        grid_score1['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch1.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch1.best_params_)
    print('Run {} best score: '.format(i), gsearch1.best_score_)

# Average only the per-trial score columns. The frame also holds the
# object-dtype 'params' column (dicts); summing across it relies on pandas
# silently dropping non-numeric columns and raises TypeError on pandas >= 2.0.
trial_scores = grid_score1.filter(like='mean_test_score_')
grid_score1['avg'] = trial_scores.sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score1.loc[grid_score1.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'max_depth': 7, 'min_child_weight': 151}
Run 0 best score:  0.5945796460176991
Run 1 best param:  {'max_depth': 5, 'min_child_weight': 51}
Run 1 best score:  0.5935656342182891
Run 2 best param:  {'max_depth': 7, 'min_child_weight': 201}
Run 2 best score:  0.5935656342182891
Run 3 best param:  {'max_depth': 3, 'min_child_weight': 51}
Run 3 best score:  0.5914915191740413
Run 4 best param:  {'max_depth': 7, 'min_child_weight': 151}
Run 4 best score:  0.5935656342182891
Run 5 best param:  {'max_depth': 7, 'min_child_weight': 101}
Run 5 best score:  0.5911688790560472
Run 6 best param:  {'max_depth': 5, 'min_child_weight': 101}
Run 6 best score:  0.590523598820059
Run 7 best param:  {'max_depth': 7, 'min_child_weight': 151}
Run 7 best score:  0.5941648230088495
Run 8 best param:  {'max_depth': 7, 'min_child_weight': 201}
Run 8 best score:  0.5919063421828908
Run 9 best param:  {'max_depth': 5, 'min_child_weight': 151}
Run 9 best score:  0.5907079646017699


In [22]:
# Second pass over max_depth and min_child_weight, with min_child_weight
# restricted to 150..240 in steps of 10, repeated NUM_TRIALS times with
# differently shuffled CV folds.

# Repeat count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1b = {
    'max_depth': range(1, 10, 2),
    'min_child_weight': range(150, 250, 10),
}
# One row per parameter combination; one 'mean_test_score_<trial>' column per trial.
grid_score1b = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # max_depth / min_child_weight below are placeholders: GridSearchCV
    # overrides them with every combination from param_test1b.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=49,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # The fold assignment changes with the trial index.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1b = GridSearchCV(estimator=xgb,
                             param_grid=param_test1b,
                             scoring='accuracy', n_jobs=-1,
                             cv=five_folds,
                             return_train_score=False)
    gsearch1b.fit(X_train, y_train)
    if grid_score1b.empty:
        # First trial: keep the parameter column alongside the scores.
        grid_score1b = pd.DataFrame(gsearch1b.cv_results_, columns=['params', 'mean_test_score'])
        grid_score1b.columns = ['params', 'mean_test_score_0']
    else:
        grid_score1b['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch1b.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch1b.best_params_)
    print('Run {} best score: '.format(i), gsearch1b.best_score_)

# Average only the per-trial score columns. The frame also holds the
# object-dtype 'params' column (dicts); summing across it relies on pandas
# silently dropping non-numeric columns and raises TypeError on pandas >= 2.0.
trial_scores = grid_score1b.filter(like='mean_test_score_')
grid_score1b['avg'] = trial_scores.sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score1b.loc[grid_score1b.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'max_depth': 9, 'min_child_weight': 150}
Run 0 best score:  0.5954553834808259
Run 1 best param:  {'max_depth': 5, 'min_child_weight': 170}
Run 1 best score:  0.5919063421828908
Run 2 best param:  {'max_depth': 7, 'min_child_weight': 220}
Run 2 best score:  0.5941187315634219
Run 3 best param:  {'max_depth': 5, 'min_child_weight': 210}
Run 3 best score:  0.5928281710914455
Run 4 best param:  {'max_depth': 7, 'min_child_weight': 150}
Run 4 best score:  0.5934734513274337
Run 5 best param:  {'max_depth': 5, 'min_child_weight': 170}
Run 5 best score:  0.5930125368731564
Run 6 best param:  {'max_depth': 9, 'min_child_weight': 150}
Run 6 best score:  0.5919985250737463
Run 7 best param:  {'max_depth': 7, 'min_child_weight': 190}
Run 7 best score:  0.5943952802359882
Run 8 best param:  {'max_depth': 7, 'min_child_weight': 200}
Run 8 best score:  0.5950405604719764
Run 9 best param:  {'max_depth': 7, 'min_child_weight': 230}
Run 9 best score:  0.59080014749262

In [23]:
# Look more closely at the neighboring values: max_depth in {4, 5, 6} and
# min_child_weight in 160..170, repeated NUM_TRIALS times with differently
# shuffled CV folds.

# Repeat count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test2 = {
    'max_depth': [4, 5, 6],
    'min_child_weight': range(160, 171),
}
# One row per parameter combination; one 'mean_test_score_<trial>' column per trial.
grid_score2 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # max_depth / min_child_weight below are placeholders: GridSearchCV
    # overrides them with every combination from param_test2.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=49,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # The fold assignment changes with the trial index.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch2 = GridSearchCV(estimator=xgb,
                            param_grid=param_test2,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch2.fit(X_train, y_train)
    if grid_score2.empty:
        # First trial: keep the parameter column alongside the scores.
        grid_score2 = pd.DataFrame(gsearch2.cv_results_, columns=['params', 'mean_test_score'])
        grid_score2.columns = ['params', 'mean_test_score_0']
    else:
        grid_score2['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch2.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch2.best_params_)
    print('Run {} best score: '.format(i), gsearch2.best_score_)

# Average only the per-trial score columns. The frame also holds the
# object-dtype 'params' column (dicts); summing across it relies on pandas
# silently dropping non-numeric columns and raises TypeError on pandas >= 2.0.
trial_scores = grid_score2.filter(like='mean_test_score_')
grid_score2['avg'] = trial_scores.sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score2.loc[grid_score2.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'max_depth': 5, 'min_child_weight': 165}
Run 0 best score:  0.5953632005899705
Run 1 best param:  {'max_depth': 6, 'min_child_weight': 161}
Run 1 best score:  0.5922750737463127
Run 2 best param:  {'max_depth': 5, 'min_child_weight': 163}
Run 2 best score:  0.5922750737463127
Run 3 best param:  {'max_depth': 6, 'min_child_weight': 162}
Run 3 best score:  0.5919524336283186
Run 4 best param:  {'max_depth': 6, 'min_child_weight': 164}
Run 4 best score:  0.5946718289085545
Run 5 best param:  {'max_depth': 6, 'min_child_weight': 165}
Run 5 best score:  0.5943952802359882
Run 6 best param:  {'max_depth': 5, 'min_child_weight': 160}
Run 6 best score:  0.589832227138643
Run 7 best param:  {'max_depth': 5, 'min_child_weight': 169}
Run 7 best score:  0.5927359882005899
Run 8 best param:  {'max_depth': 5, 'min_child_weight': 166}
Run 8 best score:  0.592551622418879
Run 9 best param:  {'max_depth': 6, 'min_child_weight': 161}
Run 9 best score:  0.5926898967551623

### Tuning gamma

In [24]:
# Grid over gamma (0.0 .. 0.4 in steps of 0.1) with max_depth=6 and
# min_child_weight=161 held fixed. NUM_TRIALS is not set in this cell; the
# value from an earlier cell is reused.
param_test3 = {
    'gamma': [i/10.0 for i in range(0, 5)],
}
# One row per gamma value; one 'mean_test_score_<trial>' column per trial.
grid_score3 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # gamma below is a placeholder: GridSearchCV overrides it with every
    # value from param_test3.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=49,
        max_depth=6,
        min_child_weight=161,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # The fold assignment changes with the trial index.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch3 = GridSearchCV(estimator=xgb,
                            param_grid=param_test3,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch3.fit(X_train, y_train)
    if grid_score3.empty:
        # First trial: keep the parameter column alongside the scores.
        grid_score3 = pd.DataFrame(gsearch3.cv_results_, columns=['params', 'mean_test_score'])
        grid_score3.columns = ['params', 'mean_test_score_0']
    else:
        grid_score3['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch3.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch3.best_params_)
    print('Run {} best score: '.format(i), gsearch3.best_score_)

# Average only the per-trial score columns. The frame also holds the
# object-dtype 'params' column (dicts); summing across it relies on pandas
# silently dropping non-numeric columns and raises TypeError on pandas >= 2.0.
trial_scores = grid_score3.filter(like='mean_test_score_')
grid_score3['avg'] = trial_scores.sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score3.loc[grid_score3.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'gamma': 0.2}
Run 0 best score:  0.5936578171091446
Run 1 best param:  {'gamma': 0.0}
Run 1 best score:  0.5922750737463127
Run 2 best param:  {'gamma': 0.4}
Run 2 best score:  0.5907540560471977
Run 3 best param:  {'gamma': 0.3}
Run 3 best score:  0.5915837020648967
Run 4 best param:  {'gamma': 0.0}
Run 4 best score:  0.5912610619469026
Run 5 best param:  {'gamma': 0.0}
Run 5 best score:  0.5914915191740413
Run 6 best param:  {'gamma': 0.4}
Run 6 best score:  0.5884494837758112
Run 7 best param:  {'gamma': 0.0}
Run 7 best score:  0.5915837020648967
Run 8 best param:  {'gamma': 0.3}
Run 8 best score:  0.5908462389380531
Run 9 best param:  {'gamma': 0.0}
Run 9 best score:  0.5926898967551623
Best params:  params               {'gamma': 0.1}
mean_test_score_0          0.593427
mean_test_score_1          0.592275
mean_test_score_2          0.590293
mean_test_score_3          0.590938
mean_test_score_4          0.591261
mean_test_score_5          0.591492
m

### Recalibrating the n_estimators

In [25]:
# Booster configuration with max_depth=6, min_child_weight=161 and gamma=0.1.
# The estimator is only used here to obtain a native xgboost parameter
# dictionary.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=49,
    max_depth=6,
    min_child_weight=161,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    n_jobs=-1,
    seed=0,
)

xgtrain = xgboost.DMatrix(X_train, label=y_train)
xgb_param = xgb.get_xgb_params()

# Stratified 5-fold CV on classification error with up to 5000 boosting
# rounds and early_stopping_rounds=50. The returned per-round table is the
# cell's displayed output.
xgboost.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.421022,0.002023,0.432521,0.005487
1,0.414835,0.001924,0.424824,0.004300
2,0.410744,0.001292,0.422197,0.004521
3,0.408439,0.001587,0.419846,0.006790
4,0.406711,0.002886,0.421229,0.006079
5,0.405651,0.001565,0.420630,0.006465
6,0.403980,0.001476,0.421045,0.004949
7,0.403865,0.001519,0.417911,0.005358
8,0.401963,0.001680,0.417127,0.005718
9,0.401618,0.001550,0.415698,0.005775


In [26]:
# Coarse grid over n_estimators (100..900 in steps of 100, plus 79),
# repeated NUM_TRIALS times with differently shuffled CV folds.

# Repeat count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test_recalibrate = {
    'n_estimators': list(range(100, 1000, 100)) + [79],
}
# One row per n_estimators value; one 'mean_test_score_<trial>' column per trial.
grid_score_recalibrate = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # n_estimators below is a placeholder: GridSearchCV overrides it with
    # every value from param_test_recalibrate.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=49,
        max_depth=6,
        min_child_weight=161,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # The fold assignment changes with the trial index.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch_recalibrate = GridSearchCV(estimator=xgb,
                                       param_grid=param_test_recalibrate,
                                       scoring='accuracy', n_jobs=-1,
                                       cv=five_folds,
                                       return_train_score=False)
    gsearch_recalibrate.fit(X_train, y_train)
    if grid_score_recalibrate.empty:
        # First trial: keep the parameter column alongside the scores.
        grid_score_recalibrate = pd.DataFrame(gsearch_recalibrate.cv_results_, columns=['params', 'mean_test_score'])
        grid_score_recalibrate.columns = ['params', 'mean_test_score_0']
    else:
        grid_score_recalibrate['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch_recalibrate.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch_recalibrate.best_params_)
    print('Run {} best score: '.format(i), gsearch_recalibrate.best_score_)

# Average only the per-trial score columns. The frame also holds the
# object-dtype 'params' column (dicts); summing across it relies on pandas
# silently dropping non-numeric columns and raises TypeError on pandas >= 2.0.
trial_scores = grid_score_recalibrate.filter(like='mean_test_score_')
grid_score_recalibrate['avg'] = trial_scores.sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score_recalibrate.loc[grid_score_recalibrate.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'n_estimators': 79}
Run 0 best score:  0.5924133480825958
Run 1 best param:  {'n_estimators': 100}
Run 1 best score:  0.5922289823008849
Run 2 best param:  {'n_estimators': 100}
Run 2 best score:  0.5899244100294986
Run 3 best param:  {'n_estimators': 79}
Run 3 best score:  0.59172197640118
Run 4 best param:  {'n_estimators': 79}
Run 4 best score:  0.592551622418879
Run 5 best param:  {'n_estimators': 79}
Run 5 best score:  0.5897400442477876
Run 6 best param:  {'n_estimators': 100}
Run 6 best score:  0.591030604719764
Run 7 best param:  {'n_estimators': 79}
Run 7 best score:  0.5925055309734514
Run 8 best param:  {'n_estimators': 79}
Run 8 best score:  0.5891408554572272
Run 9 best param:  {'n_estimators': 79}
Run 9 best score:  0.5900626843657817
Best params:  params               {'n_estimators': 79}
mean_test_score_0                0.592413
mean_test_score_1                0.591999
mean_test_score_2                 0.58891
mean_test_score_3         

In [27]:
# Finer grid over n_estimators (10..90 in steps of 10, plus 79), repeated
# NUM_TRIALS times with differently shuffled CV folds.

# Repeat count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test_recalibrateb = {
    'n_estimators': list(range(10, 100, 10)) + [79],
}
# One row per n_estimators value; one 'mean_test_score_<trial>' column per trial.
grid_score_recalibrateb = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # n_estimators below is a placeholder: GridSearchCV overrides it with
    # every value from param_test_recalibrateb.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=79,
        max_depth=6,
        min_child_weight=161,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # The fold assignment changes with the trial index.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch_recalibrateb = GridSearchCV(estimator=xgb,
                                        param_grid=param_test_recalibrateb,
                                        scoring='accuracy', n_jobs=-1,
                                        cv=five_folds,
                                        return_train_score=False)
    gsearch_recalibrateb.fit(X_train, y_train)
    if grid_score_recalibrateb.empty:
        # First trial: keep the parameter column alongside the scores.
        grid_score_recalibrateb = pd.DataFrame(gsearch_recalibrateb.cv_results_, columns=['params', 'mean_test_score'])
        grid_score_recalibrateb.columns = ['params', 'mean_test_score_0']
    else:
        grid_score_recalibrateb['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch_recalibrateb.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch_recalibrateb.best_params_)
    print('Run {} best score: '.format(i), gsearch_recalibrateb.best_score_)

# Average only the per-trial score columns. The frame also holds the
# object-dtype 'params' column (dicts); summing across it relies on pandas
# silently dropping non-numeric columns and raises TypeError on pandas >= 2.0.
trial_scores = grid_score_recalibrateb.filter(like='mean_test_score_')
grid_score_recalibrateb['avg'] = trial_scores.sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score_recalibrateb.loc[grid_score_recalibrateb.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'n_estimators': 50}
Run 0 best score:  0.5933812684365781
Run 1 best param:  {'n_estimators': 79}
Run 1 best score:  0.5919985250737463
Run 2 best param:  {'n_estimators': 40}
Run 2 best score:  0.5913532448377581
Run 3 best param:  {'n_estimators': 50}
Run 3 best score:  0.5919985250737463
Run 4 best param:  {'n_estimators': 79}
Run 4 best score:  0.592551622418879
Run 5 best param:  {'n_estimators': 50}
Run 5 best score:  0.5908923303834809
Run 6 best param:  {'n_estimators': 90}
Run 6 best score:  0.5894174041297935
Run 7 best param:  {'n_estimators': 70}
Run 7 best score:  0.5930125368731564
Run 8 best param:  {'n_estimators': 50}
Run 8 best score:  0.5912610619469026
Run 9 best param:  {'n_estimators': 50}
Run 9 best score:  0.5932890855457227
Best params:  params               {'n_estimators': 50}
mean_test_score_0                0.593381
mean_test_score_1                 0.59186
mean_test_score_2                0.591123
mean_test_score_3         

### Tuning the subsample and colsample_bytree

In [28]:
# Grid over subsample and colsample_bytree (0.6 .. 0.9 in steps of 0.1) with
# n_estimators=50. NUM_TRIALS is not set in this cell; the value from an
# earlier cell is reused.
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)],
}
# One row per parameter combination; one 'mean_test_score_<trial>' column per trial.
grid_score4 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # subsample / colsample_bytree below are placeholders: GridSearchCV
    # overrides them with every combination from param_test4.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=50,
        max_depth=6,
        min_child_weight=161,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # The fold assignment changes with the trial index.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch4 = GridSearchCV(estimator=xgb,
                            param_grid=param_test4,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch4.fit(X_train, y_train)
    if grid_score4.empty:
        # First trial: keep the parameter column alongside the scores.
        grid_score4 = pd.DataFrame(gsearch4.cv_results_, columns=['params', 'mean_test_score'])
        grid_score4.columns = ['params', 'mean_test_score_0']
    else:
        grid_score4['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch4.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch4.best_params_)
    print('Run {} best score: '.format(i), gsearch4.best_score_)

# Average only the per-trial score columns. The frame also holds the
# object-dtype 'params' column (dicts); summing across it relies on pandas
# silently dropping non-numeric columns and raises TypeError on pandas >= 2.0.
trial_scores = grid_score4.filter(like='mean_test_score_')
grid_score4['avg'] = trial_scores.sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score4.loc[grid_score4.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 0 best score:  0.5933812684365781
Run 1 best param:  {'colsample_bytree': 0.6, 'subsample': 0.8}
Run 1 best score:  0.5927359882005899
Run 2 best param:  {'colsample_bytree': 0.9, 'subsample': 0.9}
Run 2 best score:  0.5938421828908554
Run 3 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 3 best score:  0.5919985250737463
Run 4 best param:  {'colsample_bytree': 0.6, 'subsample': 0.9}
Run 4 best score:  0.5946718289085545
Run 5 best param:  {'colsample_bytree': 0.7, 'subsample': 0.8}
Run 5 best score:  0.5912610619469026
Run 6 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 6 best score:  0.5928742625368731
Run 7 best param:  {'colsample_bytree': 0.8, 'subsample': 0.9}
Run 7 best score:  0.5935656342182891
Run 8 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 8 best score:  0.5927820796460177
Run 9 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 9 best scor

In [29]:
# Refine the previous search in steps of 0.05: evaluates 0.75, 0.80 and 0.85
# for both subsampling ratios.
param_test5 = {
    'subsample': [i / 100.0 for i in range(75, 86, 5)],
    'colsample_bytree': [i / 100.0 for i in range(75, 86, 5)],
}
# Mean CV accuracy of every grid point, one 'mean_test_score_<trial>' column per trial.
grid_score5 = pd.DataFrame()

# Repeat the grid search NUM_TRIALS times; the fold shuffle is seeded with the
# trial index while the model seed stays fixed at 0.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=50,
        max_depth=6,
        min_child_weight=161,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch5 = GridSearchCV(estimator=xgb,
                            param_grid=param_test5,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch5.fit(X_train, y_train)
    trial_result = pd.DataFrame(gsearch5.cv_results_)
    if grid_score5.empty:
        # First trial: start the table with the parameter combinations.
        grid_score5 = trial_result[['params']].copy()
    grid_score5['mean_test_score_{}'.format(i)] = trial_result['mean_test_score']
    print('Run {} best param: '.format(i), gsearch5.best_params_)
    print('Run {} best score: '.format(i), gsearch5.best_score_)

# Average over the score columns only. 'params' holds dicts, so a row-wise
# sum() over the whole frame raises TypeError on pandas >= 2.0 (older versions
# silently dropped the non-numeric column).
grid_score5['avg'] = grid_score5.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score5.loc[grid_score5.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'colsample_bytree': 0.85, 'subsample': 0.85}
Run 0 best score:  0.594257005899705
Run 1 best param:  {'colsample_bytree': 0.85, 'subsample': 0.75}
Run 1 best score:  0.5928281710914455
Run 2 best param:  {'colsample_bytree': 0.75, 'subsample': 0.8}
Run 2 best score:  0.5929203539823009
Run 3 best param:  {'colsample_bytree': 0.75, 'subsample': 0.75}
Run 3 best score:  0.5924594395280236
Run 4 best param:  {'colsample_bytree': 0.75, 'subsample': 0.8}
Run 4 best score:  0.5940265486725663
Run 5 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 5 best score:  0.5908923303834809
Run 6 best param:  {'colsample_bytree': 0.8, 'subsample': 0.85}
Run 6 best score:  0.590016592920354
Run 7 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 7 best score:  0.592551622418879
Run 8 best param:  {'colsample_bytree': 0.85, 'subsample': 0.8}
Run 8 best score:  0.5918141592920354
Run 9 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 9 be

### Tuning Regularization Parameters

In [30]:
# Coarse, roughly logarithmic sweep of the L1 regularization strength.
param_test6 = {
    'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1, 100],
}
# Mean CV accuracy of every grid point, one 'mean_test_score_<trial>' column per trial.
grid_score6 = pd.DataFrame()

# Repeat the grid search NUM_TRIALS times; the fold shuffle is seeded with the
# trial index while the model seed stays fixed at 0.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=50,
        max_depth=6,
        min_child_weight=161,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch6 = GridSearchCV(estimator=xgb,
                            param_grid=param_test6,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch6.fit(X_train, y_train)
    trial_result = pd.DataFrame(gsearch6.cv_results_)
    if grid_score6.empty:
        # First trial: start the table with the parameter combinations.
        grid_score6 = trial_result[['params']].copy()
    grid_score6['mean_test_score_{}'.format(i)] = trial_result['mean_test_score']
    print('Run {} best param: '.format(i), gsearch6.best_params_)
    print('Run {} best score: '.format(i), gsearch6.best_score_)

# Average over the score columns only. 'params' holds dicts, so a row-wise
# sum() over the whole frame raises TypeError on pandas >= 2.0 (older versions
# silently dropped the non-numeric column).
grid_score6['avg'] = grid_score6.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score6.loc[grid_score6.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'reg_alpha': 0}
Run 0 best score:  0.5933812684365781
Run 1 best param:  {'reg_alpha': 0}
Run 1 best score:  0.5918602507374632
Run 2 best param:  {'reg_alpha': 1}
Run 2 best score:  0.5922289823008849
Run 3 best param:  {'reg_alpha': 0}
Run 3 best score:  0.5919985250737463
Run 4 best param:  {'reg_alpha': 0.1}
Run 4 best score:  0.5936117256637168
Run 5 best param:  {'reg_alpha': 1}
Run 5 best score:  0.5915376106194691
Run 6 best param:  {'reg_alpha': 0.01}
Run 6 best score:  0.589002581120944
Run 7 best param:  {'reg_alpha': 0}
Run 7 best score:  0.592551622418879
Run 8 best param:  {'reg_alpha': 0.1}
Run 8 best score:  0.5921828908554573
Run 9 best param:  {'reg_alpha': 0}
Run 9 best score:  0.5932890855457227
Best params:  params               {'reg_alpha': 0}
mean_test_score_0            0.593381
mean_test_score_1             0.59186
mean_test_score_2            0.591123
mean_test_score_3            0.591999
mean_test_score_4            0.590385


In [31]:
# Finer sweep of reg_alpha over small values (0 up to 5e-4).
param_test7 = {
    'reg_alpha': [0, 1e-6, 5e-6, 5e-5, 1e-4, 5e-4],
}
# Mean CV accuracy of every grid point, one 'mean_test_score_<trial>' column per trial.
grid_score7 = pd.DataFrame()

# Repeat the grid search NUM_TRIALS times; the fold shuffle is seeded with the
# trial index while the model seed stays fixed at 0.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=50,
        max_depth=6,
        min_child_weight=161,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch7 = GridSearchCV(estimator=xgb,
                            param_grid=param_test7,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch7.fit(X_train, y_train)
    trial_result = pd.DataFrame(gsearch7.cv_results_)
    if grid_score7.empty:
        # First trial: start the table with the parameter combinations.
        grid_score7 = trial_result[['params']].copy()
    grid_score7['mean_test_score_{}'.format(i)] = trial_result['mean_test_score']
    print('Run {} best param: '.format(i), gsearch7.best_params_)
    print('Run {} best score: '.format(i), gsearch7.best_score_)

# Average over the score columns only. 'params' holds dicts, so a row-wise
# sum() over the whole frame raises TypeError on pandas >= 2.0 (older versions
# silently dropped the non-numeric column).
grid_score7['avg'] = grid_score7.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score7.loc[grid_score7.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'reg_alpha': 0}
Run 0 best score:  0.5933812684365781
Run 1 best param:  {'reg_alpha': 0}
Run 1 best score:  0.5918602507374632
Run 2 best param:  {'reg_alpha': 0}
Run 2 best score:  0.5911227876106194
Run 3 best param:  {'reg_alpha': 0}
Run 3 best score:  0.5919985250737463
Run 4 best param:  {'reg_alpha': 0}
Run 4 best score:  0.5903853244837758
Run 5 best param:  {'reg_alpha': 0}
Run 5 best score:  0.5908923303834809
Run 6 best param:  {'reg_alpha': 0.0005}
Run 6 best score:  0.5880346607669616
Run 7 best param:  {'reg_alpha': 0}
Run 7 best score:  0.592551622418879
Run 8 best param:  {'reg_alpha': 0}
Run 8 best score:  0.5912610619469026
Run 9 best param:  {'reg_alpha': 0}
Run 9 best score:  0.5932890855457227
Best params:  params               {'reg_alpha': 0.0005}
mean_test_score_0                 0.593381
mean_test_score_1                  0.59186
mean_test_score_2                 0.591123
mean_test_score_3                 0.591999
mean_test_scor

### Reduce the learning rate and tune n_estimators

In [33]:
# Estimator with the learning rate lowered to 0.01 and reg_alpha set to 5e-4;
# it is only used here as a source of booster parameters for xgboost.cv.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.01,
    n_estimators=50,
    max_depth=6,
    min_child_weight=161,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=5e-4,
    scale_pos_weight=1,
    n_jobs=-1,
    seed=0,
)

xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(X_train, label=y_train)

# Stratified 5-fold CV on classification error for up to 5000 boosting rounds,
# stopping once the test error has not improved for 50 rounds. The resulting
# per-round table is the cell's displayed output.
xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.421022,0.002023,0.432521,0.005487
1,0.416989,0.002263,0.426483,0.006825
2,0.414512,0.002853,0.423257,0.005936
3,0.412207,0.001913,0.421736,0.003281
4,0.411539,0.002062,0.421460,0.005557
5,0.410410,0.001800,0.422289,0.004668
6,0.410617,0.001390,0.422382,0.005791
7,0.409476,0.002545,0.422243,0.006696
8,0.409165,0.002597,0.422244,0.006260
9,0.407875,0.002117,0.422520,0.005837


In [34]:
# Number of repeated grid searches, scaled inversely with the training-set size.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
# Coarse sweep of the number of trees: 100..1400 in steps of 100, plus 483
# (presumably the round count suggested by the xgboost.cv run -- TODO confirm).
param_test8 = {
    'n_estimators': list(range(100, 1500, 100)) + [483],
}
# Mean CV accuracy of every grid point, one 'mean_test_score_<trial>' column per trial.
grid_score8 = pd.DataFrame()

# Repeat the grid search NUM_TRIALS times; the fold shuffle is seeded with the
# trial index while the model seed stays fixed at 0.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.01,
        n_estimators=50,  # placeholder; overridden by the grid
        max_depth=6,
        min_child_weight=161,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=5e-4,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    trial_result = pd.DataFrame(gsearch8.cv_results_)
    if grid_score8.empty:
        # First trial: start the table with the parameter combinations.
        grid_score8 = trial_result[['params']].copy()
    grid_score8['mean_test_score_{}'.format(i)] = trial_result['mean_test_score']
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average over the score columns only. 'params' holds dicts, so a row-wise
# sum() over the whole frame raises TypeError on pandas >= 2.0 (older versions
# silently dropped the non-numeric column).
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'n_estimators': 600}
Run 0 best score:  0.5930125368731564
Run 1 best param:  {'n_estimators': 600}
Run 1 best score:  0.5927359882005899
Run 2 best param:  {'n_estimators': 700}
Run 2 best score:  0.5927820796460177
Run 3 best param:  {'n_estimators': 1200}
Run 3 best score:  0.5927359882005899
Run 4 best param:  {'n_estimators': 1100}
Run 4 best score:  0.5928281710914455
Run 5 best param:  {'n_estimators': 1400}
Run 5 best score:  0.5916758849557522
Run 6 best param:  {'n_estimators': 1400}
Run 6 best score:  0.5921828908554573
Run 7 best param:  {'n_estimators': 483}
Run 7 best score:  0.5920446165191741
Run 8 best param:  {'n_estimators': 1400}
Run 8 best score:  0.5939804572271387
Run 9 best param:  {'n_estimators': 1200}
Run 9 best score:  0.5916758849557522
Best params:  params               {'n_estimators': 700}
mean_test_score_0                 0.592598
mean_test_score_1                 0.592552
mean_test_score_2                 0.592782
mean_

In [35]:
# Number of repeated grid searches, scaled inversely with the training-set size.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
# Fine sweep of the number of trees: 650..750 in steps of 10.
# NOTE: this cell reuses the names param_test8 / grid_score8 / gsearch8 and so
# overwrites the results of the coarse n_estimators search.
param_test8 = {
    'n_estimators': list(range(650, 751, 10)),
}
# Mean CV accuracy of every grid point, one 'mean_test_score_<trial>' column per trial.
grid_score8 = pd.DataFrame()

# Repeat the grid search NUM_TRIALS times; the fold shuffle is seeded with the
# trial index while the model seed stays fixed at 0.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.01,
        n_estimators=50,  # placeholder; overridden by the grid
        max_depth=6,
        min_child_weight=161,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=5e-4,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    trial_result = pd.DataFrame(gsearch8.cv_results_)
    if grid_score8.empty:
        # First trial: start the table with the parameter combinations.
        grid_score8 = trial_result[['params']].copy()
    grid_score8['mean_test_score_{}'.format(i)] = trial_result['mean_test_score']
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average over the score columns only. 'params' holds dicts, so a row-wise
# sum() over the whole frame raises TypeError on pandas >= 2.0 (older versions
# silently dropped the non-numeric column).
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'n_estimators': 680}
Run 0 best score:  0.5926898967551623
Run 1 best param:  {'n_estimators': 650}
Run 1 best score:  0.5931508112094396
Run 2 best param:  {'n_estimators': 690}
Run 2 best score:  0.5931508112094396
Run 3 best param:  {'n_estimators': 650}
Run 3 best score:  0.5932890855457227
Run 4 best param:  {'n_estimators': 670}
Run 4 best score:  0.5928281710914455
Run 5 best param:  {'n_estimators': 660}
Run 5 best score:  0.5917680678466076
Run 6 best param:  {'n_estimators': 650}
Run 6 best score:  0.5914454277286135
Run 7 best param:  {'n_estimators': 670}
Run 7 best score:  0.5916758849557522
Run 8 best param:  {'n_estimators': 710}
Run 8 best score:  0.5933812684365781
Run 9 best param:  {'n_estimators': 740}
Run 9 best score:  0.5916758849557522
Best params:  params               {'n_estimators': 650}
mean_test_score_0                 0.591999
mean_test_score_1                 0.593151
mean_test_score_2                 0.592045
mean_test_s

## Part 3: Test on test set

In [36]:
# Test-set accuracy of the tuned model, one entry per trial.
accuracy_array = []
for i in range(NUM_TRIALS):
    # Same hyper-parameters every trial; only the model seed changes.
    xgb = XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.01,
        n_estimators=650,
        max_depth=6,
        min_child_weight=161,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=5e-4,
        scale_pos_weight=1,
        n_jobs=-1,
        seed=i,
    )
    # Train on the training split, then score on the test split.
    model = xgb.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_array.append(accuracy)
    print('Accuracy {}: %.2f%%'.format(i) % (accuracy * 100.0))

# Report the mean accuracy across all trials.
mean_accuracy_score = sum(accuracy_array) / NUM_TRIALS
print('Average accuracy is: %.2f%%' % (mean_accuracy_score * 100.0))

Accuracy 0: 59.78%
Accuracy 1: 59.71%
Accuracy 2: 59.72%
Accuracy 3: 59.72%
Accuracy 4: 59.69%
Accuracy 5: 59.70%
Accuracy 6: 59.71%
Accuracy 7: 59.73%
Accuracy 8: 59.59%
Accuracy 9: 59.74%
Average accuracy is: 59.71%
