# Flavours of physics dataset

## Part 1: Data encoding

### Import libraries

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import xgboost
import copy
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [5]:
# NOTE(review): hard-coded absolute path. The relative "data/..." paths used by
# the loading cells are resolved against this directory, so the notebook only
# runs on a machine that has this exact layout.
os.chdir('/home/tai/Projects/research-project-Roland/')

### Load train and test data

In [6]:
# Training split: the file is read without a header row (columns get integer
# labels) and '?' entries are parsed as missing values.
train = pd.read_csv(
    "data/flavours_of_physics/flavours_of_physics.0.train",
    header=None,
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

In [7]:
# Test split: read with the same options as the training split (no header row,
# '?' parsed as missing).
test = pd.read_csv(
    "data/flavours_of_physics/flavours_of_physics.0.test",
    header=None,
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

In [8]:
# Preview the first rows of the training frame to sanity-check the parse.
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,9082753,0.000643,0.999952,4.066977,0.328034,0.039081,2.24083,6.968715,2934.679199,0.048262,...,17462.171875,10766.777344,3.868153,2.994936,2.947038,261,-99,0,1797.79895,0.238707
1,1124795,0.000978,0.999466,8.500341,0.375011,0.237087,15.826837,11.026155,2551.849609,0.106709,...,7694.305664,38083.234375,3.437981,2.975132,3.747602,231,-99,0,1727.85498,0.662632
2,6620172,0.00197,0.999982,42.543549,0.711831,0.264044,12.668289,2.365929,5646.794434,0.035337,...,80296.695312,18222.199219,3.821355,3.950199,3.454009,186,-99,0,1864.751953,0.20985
3,4502983,0.001992,0.999995,15.797983,0.507167,0.048741,2.586148,6.734583,2136.523438,0.027997,...,25081.158203,12336.913086,3.655804,3.555438,3.397589,268,1,1,1762.937744,0.254817
4,6703161,0.00075,0.999992,11.153282,0.314592,0.042594,2.986686,13.142979,8152.657715,0.016565,...,55539.929688,20167.025391,2.847753,3.169065,2.935316,188,1,1,1780.325562,0.883157


In [9]:
# Column 48 is the target; columns 0, 47, 48, 49 and 50 are excluded from the
# feature matrix, leaving columns 1-46 as predictors.
# NOTE(review): presumably column 0 is a row id and 47/49/50 are auxiliary
# non-feature columns -- confirm against the dataset description.
X_train, y_train = train.drop([0, 47, 48, 49, 50], axis=1), train[48]
X_test, y_test = test.drop([0, 47, 48, 49, 50], axis=1), test[48]

In [10]:
# X_train, X_test = X_train.align(X_test, join='outer', fill_value=0, axis=1)

In [11]:
# Dimensions (rows, features) of the training feature matrix.
X_train.shape

(33776, 46)

In [12]:
# Dimensions (rows, features) of the test feature matrix; the column count
# should match the training matrix.
X_test.shape

(33776, 46)

In [13]:
# Column labels kept as training features.
X_train.columns

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46],
           dtype='int64')

In [14]:
# Column labels kept as test features; should be identical to the training ones.
X_test.columns

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46],
           dtype='int64')

## Part 2: Tuning on train data

### Find optimal n_estimators

In [15]:
# Baseline booster configuration used as the starting point for tuning.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
)

xgtrain = xgboost.DMatrix(X_train, label=y_train)
xgb_param = xgb.get_xgb_params()

# Stratified 5-fold CV on classification error. Up to 5000 boosting rounds are
# allowed, with early stopping after 50 rounds without improvement; the
# per-round results frame is the cell output.
xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.172978,0.002216,0.183000,0.004101
1,0.153045,0.002330,0.163933,0.004897
2,0.148056,0.002379,0.159077,0.004323
3,0.145303,0.001870,0.155436,0.004247
4,0.142268,0.001531,0.152327,0.004475
5,0.140640,0.000999,0.151409,0.004321
6,0.138071,0.001499,0.149751,0.004280
7,0.137094,0.001291,0.148744,0.004191
8,0.136332,0.001200,0.147827,0.003618
9,0.135422,0.001097,0.145962,0.004306


### Tuning max_depth and min_child_weight

In [22]:
# Coarse grid over max_depth and min_child_weight.
# The 5-fold search is repeated NUM_TRIALS times with a differently shuffled
# split each time; NUM_TRIALS is sized so that trials x training rows >= 200000.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1 = {
    'max_depth': range(1, 10, 2),
    'min_child_weight': range(1, 300, 50)
}
# Per-trial CV scores: one row per parameter combination, one column per trial.
grid_score1 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # max_depth / min_child_weight below are placeholders; the grid overrides them.
    # NOTE(review): n_estimators=252 is presumably the early-stopped round count
    # from the xgboost.cv cell above -- confirm.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=252,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1 = GridSearchCV(estimator=xgb,
                            param_grid=param_test1,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch1.fit(X_train, y_train)
    if grid_score1.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score1 = pd.DataFrame(gsearch1.cv_results_, columns=['params', 'mean_test_score'])
        grid_score1.columns = ['params', 'mean_test_score_0']
    else:
        grid_score1['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch1.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch1.best_params_)
    print('Run {} best score: '.format(i), gsearch1.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score1['avg'] = grid_score1.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score1.loc[grid_score1.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'max_depth': 9, 'min_child_weight': 51}
Run 0 best score:  0.8930897678825201
Run 1 best param:  {'max_depth': 7, 'min_child_weight': 51}
Run 1 best score:  0.8925568450971104
Run 2 best param:  {'max_depth': 7, 'min_child_weight': 51}
Run 2 best score:  0.8928233064898152
Run 3 best param:  {'max_depth': 9, 'min_child_weight': 51}
Run 3 best score:  0.8926160587399337
Run 4 best param:  {'max_depth': 7, 'min_child_weight': 51}
Run 4 best score:  0.8929417337754618
Run 5 best param:  {'max_depth': 9, 'min_child_weight': 51}
Run 5 best score:  0.8930009474182852
Best params:  params               {'max_depth': 9, 'min_child_weight': 51}
mean_test_score_0                                     0.89309
mean_test_score_1                                    0.892409
mean_test_score_2                                    0.892142
mean_test_score_3                                    0.892616
mean_test_score_4                                    0.891757
mean_test_scor

In [18]:
# Finer grid for min_child_weight (45..55 in steps of 2) across the same
# max_depth candidates. Repeated NUM_TRIALS times with reshuffled folds, where
# NUM_TRIALS is sized so that trials x training rows >= 200000.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1b = {
    'max_depth': range(1, 10, 2),
    'min_child_weight': range(45, 56, 2)
}
# Per-trial CV scores: one row per parameter combination, one column per trial.
grid_score1b = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # max_depth / min_child_weight below are placeholders; the grid overrides them.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=252,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1b = GridSearchCV(estimator=xgb,
                             param_grid=param_test1b,
                             scoring='accuracy', n_jobs=-1,
                             cv=five_folds,
                             return_train_score=False)
    gsearch1b.fit(X_train, y_train)
    if grid_score1b.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score1b = pd.DataFrame(gsearch1b.cv_results_, columns=['params', 'mean_test_score'])
        grid_score1b.columns = ['params', 'mean_test_score_0']
    else:
        grid_score1b['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch1b.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch1b.best_params_)
    print('Run {} best score: '.format(i), gsearch1b.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score1b['avg'] = grid_score1b.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score1b.loc[grid_score1b.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'max_depth': 7, 'min_child_weight': 49}
Run 0 best score:  0.8932081951681667
Run 1 best param:  {'max_depth': 9, 'min_child_weight': 47}
Run 1 best score:  0.8928825201326386
Run 2 best param:  {'max_depth': 9, 'min_child_weight': 53}
Run 2 best score:  0.8933266224538133
Run 3 best param:  {'max_depth': 7, 'min_child_weight': 49}
Run 3 best score:  0.8926160587399337
Run 4 best param:  {'max_depth': 9, 'min_child_weight': 49}
Run 4 best score:  0.8935338702036949
Run 5 best param:  {'max_depth': 7, 'min_child_weight': 53}
Run 5 best score:  0.8938299384178114
Best params:  params               {'max_depth': 9, 'min_child_weight': 45}
mean_test_score_0                                    0.892912
mean_test_score_1                                    0.892409
mean_test_score_2                                    0.893031
mean_test_score_3                                    0.891965
mean_test_score_4                                    0.893267
mean_test_scor

In [24]:
# Final check around the current best point: max_depth in {9, 10} and
# min_child_weight in {45, 46}. Repeated NUM_TRIALS times with reshuffled
# folds, where NUM_TRIALS is sized so that trials x training rows >= 200000.
# NOTE: this cell reuses (and overwrites) param_test1b / grid_score1b / gsearch1b.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1b = {
    'max_depth': [9, 10],
    'min_child_weight': [45, 46]
}
# Per-trial CV scores: one row per parameter combination, one column per trial.
grid_score1b = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # max_depth / min_child_weight below are placeholders; the grid overrides them.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=252,
        max_depth=5,
        min_child_weight=45,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1b = GridSearchCV(estimator=xgb,
                             param_grid=param_test1b,
                             scoring='accuracy', n_jobs=-1,
                             cv=five_folds,
                             return_train_score=False)
    gsearch1b.fit(X_train, y_train)
    if grid_score1b.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score1b = pd.DataFrame(gsearch1b.cv_results_, columns=['params', 'mean_test_score'])
        grid_score1b.columns = ['params', 'mean_test_score_0']
    else:
        grid_score1b['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch1b.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch1b.best_params_)
    print('Run {} best score: '.format(i), gsearch1b.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score1b['avg'] = grid_score1b.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score1b.loc[grid_score1b.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'max_depth': 9, 'min_child_weight': 46}
Run 0 best score:  0.8934450497394599
Run 1 best param:  {'max_depth': 10, 'min_child_weight': 46}
Run 1 best score:  0.8929417337754618
Run 2 best param:  {'max_depth': 10, 'min_child_weight': 46}
Run 2 best score:  0.8938595452392231
Run 3 best param:  {'max_depth': 9, 'min_child_weight': 45}
Run 3 best score:  0.8919647086688773
Run 4 best param:  {'max_depth': 10, 'min_child_weight': 45}
Run 4 best score:  0.8934450497394599
Run 5 best param:  {'max_depth': 10, 'min_child_weight': 45}
Run 5 best score:  0.8937707247749882
Best params:  params               {'max_depth': 9, 'min_child_weight': 45}
mean_test_score_0                                    0.892912
mean_test_score_1                                    0.892409
mean_test_score_2                                    0.893031
mean_test_score_3                                    0.891965
mean_test_score_4                                    0.893267
mean_test_

### Tuning gamma

In [25]:
# Tune gamma over 0.0, 0.1, ..., 0.4 with max_depth=9 and min_child_weight=45
# fixed. Repeated NUM_TRIALS times with reshuffled folds, where NUM_TRIALS is
# sized so that trials x training rows >= 200000.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test3 = {
    'gamma': [i/10.0 for i in range(0, 5)]
}
# Per-trial CV scores: one row per gamma value, one column per trial.
grid_score3 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # gamma below is a placeholder; the grid overrides it.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=252,
        max_depth=9,
        min_child_weight=45,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch3 = GridSearchCV(estimator=xgb,
                            param_grid=param_test3,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch3.fit(X_train, y_train)
    if grid_score3.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score3 = pd.DataFrame(gsearch3.cv_results_, columns=['params', 'mean_test_score'])
        grid_score3.columns = ['params', 'mean_test_score_0']
    else:
        grid_score3['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch3.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch3.best_params_)
    print('Run {} best score: '.format(i), gsearch3.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score3['avg'] = grid_score3.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score3.loc[grid_score3.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'gamma': 0.4}
Run 0 best score:  0.893948365703458
Run 1 best param:  {'gamma': 0.4}
Run 1 best score:  0.8926160587399337
Run 2 best param:  {'gamma': 0.0}
Run 2 best score:  0.8930305542396968
Run 3 best param:  {'gamma': 0.1}
Run 3 best score:  0.8924680246328754
Run 4 best param:  {'gamma': 0.2}
Run 4 best score:  0.8942740407389863
Run 5 best param:  {'gamma': 0.4}
Run 5 best score:  0.8936522974893415
Best params:  params               {'gamma': 0.4}
mean_test_score_0          0.893948
mean_test_score_1          0.892616
mean_test_score_2          0.892498
mean_test_score_3          0.891905
mean_test_score_4          0.894274
mean_test_score_5          0.893652
avg                        0.893149
Name: 4, dtype: object


### Recalibrating n_estimators (first n_estimators tuning pass)

In [26]:
# Booster configuration with the max_depth, min_child_weight and gamma values
# chosen by the grid searches above.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=252,
    max_depth=9,
    min_child_weight=45,
    gamma=0.4,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    seed=0,
)

xgtrain = xgboost.DMatrix(X_train, label=y_train)
xgb_param = xgb.get_xgb_params()

# Re-run the stratified 5-fold CV with early stopping (50 rounds without
# improvement, at most 5000 rounds) to see how many boosting rounds the
# updated configuration needs; the results frame is the cell output.
xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.167375,0.002291,0.176012,0.003202
1,0.149670,0.004855,0.159374,0.002220
2,0.143985,0.002813,0.154488,0.003537
3,0.139715,0.002060,0.152031,0.003386
4,0.137509,0.001978,0.149011,0.002720
5,0.135126,0.002682,0.146791,0.002856
6,0.133046,0.001904,0.145103,0.004105
7,0.131573,0.001386,0.143208,0.003325
8,0.130211,0.001225,0.142261,0.002701
9,0.129071,0.000942,0.140929,0.003406


In [27]:
# Coarse n_estimators search: 100..900 in steps of 100, plus 315.
# NOTE(review): 315 is presumably the early-stopped round count from the
# xgboost.cv cell above -- confirm.
# Repeated NUM_TRIALS times with reshuffled folds, where NUM_TRIALS is sized so
# that trials x training rows >= 200000.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(100, 1000, 100)) + [315]
}
# Per-trial CV scores: one row per n_estimators value, one column per trial.
grid_score8 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # n_estimators below is a placeholder; the grid overrides it.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=315,
        max_depth=9,
        min_child_weight=45,
        gamma=0.4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    if grid_score8.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score8 = pd.DataFrame(gsearch8.cv_results_, columns=['params', 'mean_test_score'])
        grid_score8.columns = ['params', 'mean_test_score_0']
    else:
        grid_score8['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch8.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'n_estimators': 315}
Run 0 best score:  0.8941852202747513
Run 1 best param:  {'n_estimators': 300}
Run 1 best score:  0.8925272382756987
Run 2 best param:  {'n_estimators': 500}
Run 2 best score:  0.8927344860255803
Run 3 best param:  {'n_estimators': 500}
Run 3 best score:  0.8929121269540502
Run 4 best param:  {'n_estimators': 400}
Run 4 best score:  0.8949549976314543
Run 5 best param:  {'n_estimators': 315}
Run 5 best score:  0.8939187588820464
Best params:  params               {'n_estimators': 400}
mean_test_score_0                   0.8938
mean_test_score_1                 0.892231
mean_test_score_2                 0.892468
mean_test_score_3                 0.892438
mean_test_score_4                 0.894955
mean_test_score_5                 0.893593
avg                               0.893248
Name: 3, dtype: object


In [33]:
# Fine n_estimators search: 350..450 in steps of 10.
# Repeated NUM_TRIALS times with reshuffled folds, where NUM_TRIALS is sized so
# that trials x training rows >= 200000.
# NOTE: this cell reuses (and overwrites) param_test8 / grid_score8 / gsearch8.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(350, 451, 10))
}
# Per-trial CV scores: one row per n_estimators value, one column per trial.
grid_score8 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # n_estimators below is a placeholder; the grid overrides it.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=315,
        max_depth=9,
        min_child_weight=45,
        gamma=0.4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    if grid_score8.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score8 = pd.DataFrame(gsearch8.cv_results_, columns=['params', 'mean_test_score'])
        grid_score8.columns = ['params', 'mean_test_score_0']
    else:
        grid_score8['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch8.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'n_estimators': 400}
Run 0 best score:  0.8938003315963998
Run 1 best param:  {'n_estimators': 350}
Run 1 best score:  0.8923792041686405
Run 2 best param:  {'n_estimators': 430}
Run 2 best score:  0.8927936996684036
Run 3 best param:  {'n_estimators': 440}
Run 3 best score:  0.8926160587399337
Run 4 best param:  {'n_estimators': 400}
Run 4 best score:  0.8949549976314543
Run 5 best param:  {'n_estimators': 430}
Run 5 best score:  0.8936522974893415
Best params:  params               {'n_estimators': 400}
mean_test_score_0                   0.8938
mean_test_score_1                 0.892231
mean_test_score_2                 0.892468
mean_test_score_3                 0.892438
mean_test_score_4                 0.894955
mean_test_score_5                 0.893593
avg                               0.893248
Name: 5, dtype: object


### Tuning the subsample and colsample_bytree

In [34]:
# Coarse grid over subsample and colsample_bytree: 0.6, 0.7, 0.8, 0.9 each.
# NUM_TRIALS is not redefined here; it comes from an earlier tuning cell.
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}
# Per-trial CV scores: one row per parameter combination, one column per trial.
grid_score4 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # subsample / colsample_bytree below are placeholders; the grid overrides them.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=400,
        max_depth=9,
        min_child_weight=45,
        gamma=0.4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch4 = GridSearchCV(estimator=xgb,
                            param_grid=param_test4,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch4.fit(X_train, y_train)
    if grid_score4.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score4 = pd.DataFrame(gsearch4.cv_results_, columns=['params', 'mean_test_score'])
        grid_score4.columns = ['params', 'mean_test_score_0']
    else:
        grid_score4['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch4.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch4.best_params_)
    print('Run {} best score: '.format(i), gsearch4.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score4['avg'] = grid_score4.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score4.loc[grid_score4.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 0 best score:  0.8945405021316911
Run 1 best param:  {'colsample_bytree': 0.8, 'subsample': 0.9}
Run 1 best score:  0.8935338702036949
Run 2 best param:  {'colsample_bytree': 0.9, 'subsample': 0.9}
Run 2 best score:  0.8940963998105164
Run 3 best param:  {'colsample_bytree': 0.8, 'subsample': 0.7}
Run 3 best score:  0.8928825201326386
Run 4 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 4 best score:  0.8949549976314543
Run 5 best param:  {'colsample_bytree': 0.6, 'subsample': 0.9}
Run 5 best score:  0.8938891520606348
Best params:  params               {'colsample_bytree': 0.6, 'subsample': 0.9}
mean_test_score_0                                       0.893238
mean_test_score_1                                       0.893149
mean_test_score_2                                       0.893534
mean_test_score_3                                        0.89232
mean_test_score_4                          

In [36]:
# Carefully search for each neighboring 0.05
param_test5 = {
 'subsample':[i/100.0 for i in range(85,100,5)],
 'colsample_bytree':[i/100.0 for i in range(55,70,5)]
}
# Grid search 1 cv result
grid_score5 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=400,
        max_depth=9,
        min_child_weight=45,
        gamma=0.4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        n_jobs=-1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch5 = GridSearchCV(estimator = xgb,
                            param_grid = param_test5,
                            scoring='accuracy',n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch5.fit(X_train,y_train)    
    if grid_score5.empty:
        grid_score5 = pd.DataFrame(gsearch5.cv_results_, columns=['params', 'mean_test_score'])
        grid_score5.columns = ['params', 'mean_test_score_0']
    else:
        grid_score5['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch5.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch5.best_params_)
    print('Run {} best score: '.format(i), gsearch5.best_score_)

grid_score5['avg'] = grid_score5.sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score5.loc[grid_score5.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'colsample_bytree': 0.65, 'subsample': 0.9}
Run 0 best score:  0.8943924680246329
Run 1 best param:  {'colsample_bytree': 0.6, 'subsample': 0.95}
Run 1 best score:  0.8939187588820464
Run 2 best param:  {'colsample_bytree': 0.65, 'subsample': 0.95}
Run 2 best score:  0.8945701089531028
Run 3 best param:  {'colsample_bytree': 0.6, 'subsample': 0.85}
Run 3 best score:  0.8937115111321648
Run 4 best param:  {'colsample_bytree': 0.6, 'subsample': 0.9}
Run 4 best score:  0.8939187588820464
Run 5 best param:  {'colsample_bytree': 0.55, 'subsample': 0.95}
Run 5 best score:  0.8940075793462814
Best params:  params               {'colsample_bytree': 0.65, 'subsample': 0.9}
mean_test_score_0                                        0.894392
mean_test_score_1                                        0.892942
mean_test_score_2                                        0.893948
mean_test_score_3                                        0.892616
mean_test_score_4              

### Tuning Regularization Parameters

In [37]:
# Coarse search for the reg_alpha regularization parameter across several
# orders of magnitude, with subsample=0.9 and colsample_bytree=0.65 fixed.
# NUM_TRIALS is not redefined here; it comes from an earlier tuning cell.
param_test6 = {
    'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1, 100]
}
# Per-trial CV scores: one row per reg_alpha value, one column per trial.
grid_score6 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=400,
        max_depth=9,
        min_child_weight=45,
        gamma=0.4,
        subsample=0.9,
        colsample_bytree=0.65,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch6 = GridSearchCV(estimator=xgb,
                            param_grid=param_test6,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch6.fit(X_train, y_train)
    if grid_score6.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score6 = pd.DataFrame(gsearch6.cv_results_, columns=['params', 'mean_test_score'])
        grid_score6.columns = ['params', 'mean_test_score_0']
    else:
        grid_score6['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch6.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch6.best_params_)
    print('Run {} best score: '.format(i), gsearch6.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score6['avg'] = grid_score6.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score6.loc[grid_score6.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'reg_alpha': 0.01}
Run 0 best score:  0.8946885362387494
Run 1 best param:  {'reg_alpha': 1}
Run 1 best score:  0.8940667929891047
Run 2 best param:  {'reg_alpha': 1}
Run 2 best score:  0.8945108953102795
Run 3 best param:  {'reg_alpha': 0.1}
Run 3 best score:  0.8933858360966367
Run 4 best param:  {'reg_alpha': 0}
Run 4 best score:  0.8936226906679299
Run 5 best param:  {'reg_alpha': 1}
Run 5 best score:  0.8939187588820464
Best params:  params               {'reg_alpha': 1}
mean_test_score_0            0.893682
mean_test_score_1            0.894067
mean_test_score_2            0.894511
mean_test_score_3            0.892883
mean_test_score_4            0.893238
mean_test_score_5            0.893919
avg                          0.893716
Name: 4, dtype: object


In [38]:
# Fine search for reg_alpha over the integers 1..9.
# NUM_TRIALS is not redefined here; it comes from an earlier tuning cell.
param_test7 = {
    'reg_alpha': range(1, 10)
}
# Per-trial CV scores: one row per reg_alpha value, one column per trial.
grid_score7 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=400,
        max_depth=9,
        min_child_weight=45,
        gamma=0.4,
        subsample=0.9,
        colsample_bytree=0.65,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch7 = GridSearchCV(estimator=xgb,
                            param_grid=param_test7,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch7.fit(X_train, y_train)
    if grid_score7.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score7 = pd.DataFrame(gsearch7.cv_results_, columns=['params', 'mean_test_score'])
        grid_score7.columns = ['params', 'mean_test_score_0']
    else:
        grid_score7['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch7.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch7.best_params_)
    print('Run {} best score: '.format(i), gsearch7.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score7['avg'] = grid_score7.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score7.loc[grid_score7.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'reg_alpha': 3}
Run 0 best score:  0.8951622453813358
Run 1 best param:  {'reg_alpha': 1}
Run 1 best score:  0.8940667929891047
Run 2 best param:  {'reg_alpha': 1}
Run 2 best score:  0.8945108953102795
Run 3 best param:  {'reg_alpha': 7}
Run 3 best score:  0.8931785883467551
Run 4 best param:  {'reg_alpha': 9}
Run 4 best score:  0.8945405021316911
Run 5 best param:  {'reg_alpha': 9}
Run 5 best score:  0.8940075793462814
Best params:  params               {'reg_alpha': 3}
mean_test_score_0            0.895162
mean_test_score_1            0.893267
mean_test_score_2            0.893712
mean_test_score_3            0.892942
mean_test_score_4            0.894067
mean_test_score_5            0.893475
avg                          0.893771
Name: 2, dtype: object


### Reduce the learning rate and tune n_estimators

In [41]:
# Tuned configuration with the learning rate lowered from 0.1 to 0.01.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.01,
    n_estimators=400,
    max_depth=9,
    min_child_weight=45,
    gamma=0.4,
    reg_alpha=3,
    subsample=0.9,
    colsample_bytree=0.65,
    n_jobs=-1,
    seed=0,
)

xgtrain = xgboost.DMatrix(X_train, label=y_train)
xgb_param = xgb.get_xgb_params()

# Stratified 5-fold CV with early stopping (50 rounds without improvement, at
# most 5000 rounds) to find how many boosting rounds the lower learning rate
# needs; the results frame is the cell output.
xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.168552,0.006524,0.179062,0.005558
1,0.150491,0.005126,0.159936,0.004172
2,0.144955,0.003038,0.155377,0.003556
3,0.142698,0.003865,0.152919,0.004687
4,0.140914,0.003461,0.151439,0.004636
5,0.139278,0.003105,0.149011,0.005123
6,0.138138,0.002688,0.148093,0.005377
7,0.137568,0.002463,0.146613,0.005416
8,0.137035,0.002564,0.147146,0.005935
9,0.136694,0.003268,0.146494,0.005112


In [45]:
# n_estimators search at learning_rate=0.01: 1000..2500 in steps of 100, plus 1139.
# NOTE(review): 1139 is presumably the early-stopped round count from the
# xgboost.cv cell above -- confirm. The recorded run selected the upper edge of
# this grid (2500), so a wider range may be worth checking.
# Repeated NUM_TRIALS times with reshuffled folds, where NUM_TRIALS is sized so
# that trials x training rows >= 200000.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(1000, 2501, 100)) + [1139]
}
# Per-trial CV scores: one row per n_estimators value, one column per trial.
grid_score8 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # n_estimators below is a placeholder; the grid overrides it.
    xgb = XGBClassifier(
        learning_rate=0.01,
        n_estimators=1139,
        max_depth=9,
        min_child_weight=45,
        gamma=0.4,
        reg_alpha=3,
        subsample=0.9,
        colsample_bytree=0.65,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold assignment varies between trials; the model seed stays fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    if grid_score8.empty:
        # First trial: seed the table with the parameter dicts and their scores.
        grid_score8 = pd.DataFrame(gsearch8.cv_results_, columns=['params', 'mean_test_score'])
        grid_score8.columns = ['params', 'mean_test_score_0']
    else:
        grid_score8['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch8.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts, so
# a row-wise sum over the whole frame depends on pandas silently dropping it
# (older versions) and raises TypeError on pandas >= 2.0.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_', axis=1).mean(axis=1)
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 6 times
Run 0 best param:  {'n_estimators': 2500}
Run 0 best score:  0.8945405021316911
Run 1 best param:  {'n_estimators': 2500}
Run 1 best score:  0.8931193747039318
Run 2 best param:  {'n_estimators': 2400}
Run 2 best score:  0.8942444339175746
Run 3 best param:  {'n_estimators': 2400}
Run 3 best score:  0.8940667929891047
Run 4 best param:  {'n_estimators': 2400}
Run 4 best score:  0.8952510658455708
Run 5 best param:  {'n_estimators': 2500}
Run 5 best score:  0.8945405021316911
Best params:  params               {'n_estimators': 2500}
mean_test_score_0                  0.894541
mean_test_score_1                  0.893119
mean_test_score_2                  0.894126
mean_test_score_3                  0.894008
mean_test_score_4                  0.895103
mean_test_score_5                  0.894541
avg                                0.894239
Name: 15, dtype: object


## Part 3: Test on test set

In [47]:
# Final evaluation: fit the tuned configuration on the full training split once
# per trial, changing only the booster seed, and score each fit on the test split.
accuracy_array = []
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.01,
        n_estimators=2500,
        max_depth=9,
        min_child_weight=45,
        gamma=0.4,
        reg_alpha=3,
        subsample=0.9,
        colsample_bytree=0.65,
        n_jobs=-1,
        seed=i,
    )
    model = xgb.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Fraction of test rows whose predicted label matches the true label.
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_array += [accuracy]
    print('Accuracy {}: %.2f%%'.format(i) % (accuracy * 100.0))

# Mean test accuracy across the differently seeded fits.
mean_accuracy_score = sum(accuracy_array) / NUM_TRIALS
print('Average accuracy is: %.2f%%' % (mean_accuracy_score * 100.0))

Accuracy 0: 89.14%
Accuracy 1: 89.17%
Accuracy 2: 89.15%
Accuracy 3: 89.21%
Accuracy 4: 89.14%
Accuracy 5: 89.16%
Average accuracy is: 89.16%
