# MiniBooNE dataset

## Part 1: Data encoding

### Import library

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import xgboost
import copy
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [3]:
# Work from the project root so the relative "data/..." paths used below resolve.
# The location is machine-specific: set the MINIBOONE_PROJECT_DIR environment variable
# to point at another checkout; when it is unset the original hard-coded path is used.
os.chdir(os.environ.get('MINIBOONE_PROJECT_DIR', '/home/tai/Projects/research-project-Roland'))

### Load train and test data

In [4]:
# Training split: the file has no header row, so columns are numbered 0..N,
# and a literal '?' is read as a missing value.
train = pd.read_csv(
    "data/MiniBooNE/MiniBooNE.0.train",
    header=None,
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

In [5]:
# Preview the first ten rows of the training frame (column 0 holds the class label).
train.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,B,5.6757,1.00184,131.343,0.298118,0.012048,0.228417,1.09478,0.820118,3.49877,...,157.612,-42.9368,0.478071,12.1745,0.024096,0.192053,-0.483775,2.67502,0.487935,0.203891
1,B,4.21253,0.603285,173.042,0.328132,0.005568,0.150331,0.805325,0.857529,3.06962,...,188.546,15.0137,-0.253118,2.26856,0.0,0.046123,0.317755,3.91013,0.421626,0.319543
2,B,4.19362,1.04014,53.9026,0.254287,0.00463,0.243071,1.04177,0.878942,3.20468,...,141.582,-2.06235,0.755769,3.43672,0.0,0.147755,0.270756,1.86097,0.93087,0.247875
3,B,4.12466,1.34052,126.269,0.288492,0.018083,0.148613,0.94186,0.859737,3.25207,...,169.137,-30.8377,-0.042714,3.4554,0.0,0.179401,-0.047517,2.14257,1.41112,0.250612
4,S,3.82513,1.15343,299.106,0.25285,0.0,0.0,1.54122,0.903657,3.0986,...,115.348,-0.50963,0.716968,3.72687,0.0,0.151389,0.299851,1.21563,-0.192377,0.22693
5,B,6.18249,2.14757,99.4795,0.267395,0.007042,0.106472,0.961092,0.80223,3.50252,...,139.243,-54.5252,0.541931,6.96016,0.002347,0.153358,-0.065449,1.95154,0.604315,0.230527
6,B,5.20087,1.88387,112.723,0.383446,0.0,0.13963,0.791168,0.843921,3.28384,...,138.243,-52.2913,1.75254,8.63231,0.0,0.259622,-0.389033,2.99319,2.14016,0.207261
7,B,4.56052,1.61986,100.781,0.282722,0.0,0.128536,0.80971,0.768455,3.36314,...,153.667,-13.7297,0.696742,4.76317,0.0,0.116484,-0.08567,1.94926,-0.130875,0.260908
8,B,6.39708,1.29373,108.599,0.454244,0.004348,0.111512,0.918662,0.782574,3.38881,...,163.252,-5.44261,0.55184,3.61821,0.0,0.326823,0.639289,5.64057,5.9376,0.198393
9,B,3.20906,0.638871,124.298,0.233698,0.013139,0.383318,1.12433,0.911206,3.36329,...,136.395,4.27472,0.396741,5.69763,0.00292,0.109576,-0.121126,0.773704,0.04368,0.26591


In [6]:
# Held-out split, parsed with exactly the same options as the training split.
test = pd.read_csv(
    "data/MiniBooNE/MiniBooNE.0.test",
    header=None,
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

In [7]:
# Count missing values per column of the test split.
test.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
50    0
dtype: int64

In [8]:
# Count missing values per column of the training split.
train.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
50    0
dtype: int64

### Binary conversion of output

In [8]:
# Column 0 carries the class label; every other column is a feature.
# Labels are encoded as integers: 'S' -> 0, anything else (e.g. 'B') -> 1.
y_train = np.where(train[0].eq('S'), 0, 1)
X_train = train.drop(columns=[0])

y_test = np.where(test[0].eq('S'), 0, 1)
X_test = test.drop(columns=[0])

In [9]:
# Give both feature frames the same column set (union of columns);
# a column missing from one side is created there and filled with 0.
X_train, X_test = X_train.align(other=X_test, axis=1, join='outer', fill_value=0)

In [10]:
# Sanity check: (rows, feature columns) of the training features after alignment.
X_train.shape

(65032, 50)

In [12]:
# Sanity check: (rows, feature columns) of the test features after alignment.
X_test.shape

(65032, 50)

In [13]:
# Show the feature column labels that remain once the label column has been dropped.
X_test.columns

Int64Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
            18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
           dtype='int64')

## Part 2: Tuning on train data

### Find optimal n_estimators

In [13]:
# Starting configuration: the scikit-learn wrapper is only used as a convenient way to
# build the parameter dict handed to the native cross-validation routine.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1)

xgtrain = xgboost.DMatrix(data=X_train, label=y_train)
xgb_param = xgb.get_xgb_params()

# Stratified 5-fold CV on classification error, up to 5000 boosting rounds with
# early stopping (patience 50). The returned per-round history is the cell output.
xgboost.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.107655,0.003351,0.112898,0.001686
1,0.101089,0.000679,0.106686,0.002531
2,0.098048,0.002493,0.103518,0.004369
3,0.094899,0.000974,0.100304,0.003360
4,0.093231,0.001553,0.098029,0.002940
5,0.090648,0.001253,0.095599,0.002710
6,0.088760,0.001119,0.094108,0.002223
7,0.087169,0.001039,0.092385,0.002378
8,0.086084,0.000675,0.091909,0.001861
9,0.085227,0.000842,0.090817,0.001966


### Tuning max_depth and min_child_weight

In [14]:
# Coarse grid over max_depth / min_child_weight.
# The 5-fold grid search is repeated NUM_TRIALS times, each time with a different
# fold shuffle (random_state=i); candidates are ranked by their accuracy averaged
# over all trials to damp the noise of any single fold assignment.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1 = {
    'max_depth': range(1, 10, 2),
    'min_child_weight': range(1, 650, 100)
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score1 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; max_depth / min_child_weight are overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=427,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1 = GridSearchCV(estimator=xgb,
                            param_grid=param_test1,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch1.fit(X_train, y_train)
    if grid_score1.empty:
        # First trial: record the candidate list once
        grid_score1 = pd.DataFrame({'params': gsearch1.cv_results_['params']})
    grid_score1['mean_test_score_{}'.format(i)] = gsearch1.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch1.best_params_)
    print('Run {} best score: '.format(i), gsearch1.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score1['avg'] = grid_score1.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score1.loc[grid_score1.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'max_depth': 9, 'min_child_weight': 1}
Run 0 best score:  0.9454883749538688
Run 1 best param:  {'max_depth': 9, 'min_child_weight': 1}
Run 1 best score:  0.9455191290441629
Run 2 best param:  {'max_depth': 7, 'min_child_weight': 1}
Run 2 best score:  0.9455191290441629
Run 3 best param:  {'max_depth': 7, 'min_child_weight': 1}
Run 3 best score:  0.9456421454053389
Best params:  params               {'max_depth': 9, 'min_child_weight': 1}
mean_test_score_0                                   0.945488
mean_test_score_1                                   0.945519
mean_test_score_2                                   0.945442
mean_test_score_3                                   0.945504
avg                                                 0.945488
Name: 28, dtype: object


In [14]:
# Finer grid over max_depth / min_child_weight (min_child_weight restricted to 1..9).
# The 5-fold grid search is repeated NUM_TRIALS times with a different fold shuffle
# per trial; candidates are ranked by their accuracy averaged over all trials.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1b = {
    'max_depth': range(1, 10, 2),
    'min_child_weight': range(1, 10, 2)
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score1b = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; max_depth / min_child_weight are overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=427,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1b = GridSearchCV(estimator=xgb,
                             param_grid=param_test1b,
                             scoring='accuracy', n_jobs=-1,
                             cv=five_folds,
                             return_train_score=False)
    gsearch1b.fit(X_train, y_train)
    if grid_score1b.empty:
        # First trial: record the candidate list once
        grid_score1b = pd.DataFrame({'params': gsearch1b.cv_results_['params']})
    grid_score1b['mean_test_score_{}'.format(i)] = gsearch1b.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch1b.best_params_)
    print('Run {} best score: '.format(i), gsearch1b.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score1b['avg'] = grid_score1b.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score1b.loc[grid_score1b.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'max_depth': 9, 'min_child_weight': 1}
Run 0 best score:  0.9454883749538688
Run 1 best param:  {'max_depth': 9, 'min_child_weight': 7}
Run 1 best score:  0.945872801082544
Run 2 best param:  {'max_depth': 7, 'min_child_weight': 1}
Run 2 best score:  0.9455191290441629
Run 3 best param:  {'max_depth': 9, 'min_child_weight': 3}
Run 3 best score:  0.946057325624308
Best params:  params               {'max_depth': 9, 'min_child_weight': 1}
mean_test_score_0                                   0.945488
mean_test_score_1                                   0.945519
mean_test_score_2                                   0.945442
mean_test_score_3                                   0.945504
avg                                                 0.945488
Name: 20, dtype: object


In [15]:
# Take a closer look at the neighboring values: max_depth in {9, 10} and
# min_child_weight in {1, 2}.
# The 5-fold grid search is repeated NUM_TRIALS times with a different fold shuffle
# per trial; candidates are ranked by their accuracy averaged over all trials.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test2 = {
    'max_depth': [9, 10],
    'min_child_weight': [1, 2]
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score2 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; max_depth / min_child_weight are overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=427,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch2 = GridSearchCV(estimator=xgb,
                            param_grid=param_test2,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch2.fit(X_train, y_train)
    if grid_score2.empty:
        # First trial: record the candidate list once
        grid_score2 = pd.DataFrame({'params': gsearch2.cv_results_['params']})
    grid_score2['mean_test_score_{}'.format(i)] = gsearch2.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch2.best_params_)
    print('Run {} best score: '.format(i), gsearch2.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score2['avg'] = grid_score2.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score2.loc[grid_score2.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'max_depth': 10, 'min_child_weight': 2}
Run 0 best score:  0.9458574240373969
Run 1 best param:  {'max_depth': 9, 'min_child_weight': 1}
Run 1 best score:  0.9455191290441629
Run 2 best param:  {'max_depth': 10, 'min_child_weight': 1}
Run 2 best score:  0.945934309263132
Run 3 best param:  {'max_depth': 10, 'min_child_weight': 1}
Run 3 best score:  0.946057325624308
Best params:  params               {'max_depth': 10, 'min_child_weight': 1}
mean_test_score_0                                    0.945119
mean_test_score_1                                    0.945458
mean_test_score_2                                    0.945934
mean_test_score_3                                    0.946057
avg                                                  0.945642
Name: 2, dtype: object


In [19]:
# Probe deeper trees: max_depth in {10, 15, 20, 25}, min_child_weight left at 1.
# NOTE: this cell assigns param_test2 / grid_score2 / gsearch2, replacing whatever
# was bound to those names earlier in the session.
# The 5-fold grid search is repeated NUM_TRIALS times with a different fold shuffle
# per trial; candidates are ranked by their accuracy averaged over all trials.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test2 = {
    'max_depth': list(range(10, 30, 5)),
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score2 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; max_depth is overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=427,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch2 = GridSearchCV(estimator=xgb,
                            param_grid=param_test2,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch2.fit(X_train, y_train)
    if grid_score2.empty:
        # First trial: record the candidate list once
        grid_score2 = pd.DataFrame({'params': gsearch2.cv_results_['params']})
    grid_score2['mean_test_score_{}'.format(i)] = gsearch2.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch2.best_params_)
    print('Run {} best score: '.format(i), gsearch2.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score2['avg'] = grid_score2.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score2.loc[grid_score2.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'max_depth': 20}
Run 0 best score:  0.9453653585926928
Run 1 best param:  {'max_depth': 10}
Run 1 best score:  0.9454576208635749
Run 2 best param:  {'max_depth': 10}
Run 2 best score:  0.945934309263132
Run 3 best param:  {'max_depth': 10}
Run 3 best score:  0.946057325624308
Best params:  params               {'max_depth': 10}
mean_test_score_0             0.945119
mean_test_score_1             0.945458
mean_test_score_2             0.945934
mean_test_score_3             0.946057
avg                           0.945642
Name: 0, dtype: object


### Tuning gamma

In [29]:
# Grid over gamma (0.0 .. 0.4 in steps of 0.1) with max_depth=10, min_child_weight=1.
# NUM_TRIALS is re-derived here (same formula as the other tuning cells) so this cell
# does not rely on an earlier tuning cell having been executed.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test3 = {
    'gamma': [i/10.0 for i in range(0, 5)]
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score3 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; gamma is overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=427,
        max_depth=10,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # A different fold shuffle per trial
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch3 = GridSearchCV(estimator=xgb,
                            param_grid=param_test3,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch3.fit(X_train, y_train)
    if grid_score3.empty:
        # First trial: record the candidate list once
        grid_score3 = pd.DataFrame({'params': gsearch3.cv_results_['params']})
    grid_score3['mean_test_score_{}'.format(i)] = gsearch3.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch3.best_params_)
    print('Run {} best score: '.format(i), gsearch3.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score3['avg'] = grid_score3.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score3.loc[grid_score3.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'gamma': 0.1}
Run 0 best score:  0.945903555172838
Run 1 best param:  {'gamma': 0.0}
Run 1 best score:  0.9454576208635749
Run 2 best param:  {'gamma': 0.0}
Run 2 best score:  0.945934309263132
Run 3 best param:  {'gamma': 0.0}
Run 3 best score:  0.946057325624308
Best params:  params               {'gamma': 0.1}
mean_test_score_0          0.945904
mean_test_score_1          0.945104
mean_test_score_2          0.945734
mean_test_score_3          0.945904
avg                        0.945661
Name: 1, dtype: object


### Recalibrating the n_estimators

In [43]:
# Re-estimate the number of boosting rounds with max_depth=10 and gamma=0.1.
# The scikit-learn wrapper only serves to build the native parameter dict.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=10,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    n_jobs=-1,
)

xgtrain = xgboost.DMatrix(data=X_train, label=y_train)
xgb_param = xgb.get_xgb_params()

# Stratified 5-fold CV on classification error with early stopping (patience 50);
# the returned per-round history is the cell output.
xgboost.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.065860,0.003643,0.100550,0.002674
1,0.055580,0.000902,0.086942,0.000852
2,0.048642,0.000916,0.082974,0.000521
3,0.045481,0.001208,0.080191,0.000978
4,0.042406,0.001442,0.077423,0.000548
5,0.040227,0.000864,0.075793,0.000590
6,0.038320,0.000581,0.074948,0.001025
7,0.036789,0.000685,0.073579,0.001280
8,0.035460,0.000517,0.072534,0.001455
9,0.033980,0.000525,0.071303,0.001170


In [46]:
# Coarse grid over n_estimators (100..900 in steps of 100, plus 326) for the
# max_depth=10 / gamma=0.1 configuration.
# The 5-fold grid search is repeated NUM_TRIALS times with a different fold shuffle
# per trial; candidates are ranked by their accuracy averaged over all trials.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test_recalibrate = {
    'n_estimators': list(range(100, 1000, 100)) + [326]
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score_recalibrate = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; n_estimators is overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=326,
        max_depth=10,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch_recalibrate = GridSearchCV(estimator=xgb,
                                       param_grid=param_test_recalibrate,
                                       scoring='accuracy', n_jobs=-1,
                                       cv=five_folds,
                                       return_train_score=False)
    gsearch_recalibrate.fit(X_train, y_train)
    if grid_score_recalibrate.empty:
        # First trial: record the candidate list once
        grid_score_recalibrate = pd.DataFrame({'params': gsearch_recalibrate.cv_results_['params']})
    grid_score_recalibrate['mean_test_score_{}'.format(i)] = gsearch_recalibrate.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch_recalibrate.best_params_)
    print('Run {} best score: '.format(i), gsearch_recalibrate.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score_recalibrate['avg'] = grid_score_recalibrate.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score_recalibrate.loc[grid_score_recalibrate.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'n_estimators': 800}
Run 0 best score:  0.9465955222044532
Run 1 best param:  {'n_estimators': 900}
Run 1 best score:  0.9456728994956329
Run 2 best param:  {'n_estimators': 600}
Run 2 best score:  0.946195719030631
Run 3 best param:  {'n_estimators': 900}
Run 3 best score:  0.9462879813015131
Best params:  params               {'n_estimators': 800}
mean_test_score_0                 0.946596
mean_test_score_1                 0.945565
mean_test_score_2                  0.94618
mean_test_score_3                 0.946211
avg                               0.946138
Name: 7, dtype: object


In [47]:
# Finer grid over n_estimators (800..880 in steps of 20) for the
# max_depth=10 / gamma=0.1 configuration.
# The 5-fold grid search is repeated NUM_TRIALS times with a different fold shuffle
# per trial; candidates are ranked by their accuracy averaged over all trials.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test_recalibrateb = {
    'n_estimators': list(range(800, 900, 20))
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score_recalibrateb = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; n_estimators is overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=326,
        max_depth=10,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch_recalibrateb = GridSearchCV(estimator=xgb,
                                        param_grid=param_test_recalibrateb,
                                        scoring='accuracy', n_jobs=-1,
                                        cv=five_folds,
                                        return_train_score=False)
    gsearch_recalibrateb.fit(X_train, y_train)
    if grid_score_recalibrateb.empty:
        # First trial: record the candidate list once
        grid_score_recalibrateb = pd.DataFrame({'params': gsearch_recalibrateb.cv_results_['params']})
    grid_score_recalibrateb['mean_test_score_{}'.format(i)] = gsearch_recalibrateb.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch_recalibrateb.best_params_)
    print('Run {} best score: '.format(i), gsearch_recalibrateb.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score_recalibrateb['avg'] = grid_score_recalibrateb.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score_recalibrateb.loc[grid_score_recalibrateb.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'n_estimators': 880}
Run 0 best score:  0.9466416533398943
Run 1 best param:  {'n_estimators': 840}
Run 1 best score:  0.9456882765407799
Run 2 best param:  {'n_estimators': 800}
Run 2 best score:  0.9461803419854841
Run 3 best param:  {'n_estimators': 880}
Run 3 best score:  0.9463033583466601
Best params:  params               {'n_estimators': 880}
mean_test_score_0                 0.946642
mean_test_score_1                 0.945673
mean_test_score_2                 0.946042
mean_test_score_3                 0.946303
avg                               0.946165
Name: 4, dtype: object


### Tuning the subsample and colsample_bytree

In [48]:
# Coarse grid over subsample / colsample_bytree (0.6 .. 0.9 in steps of 0.1).
# NUM_TRIALS is re-derived here (same formula as the other tuning cells) so this cell
# does not rely on an earlier tuning cell having been executed.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score4 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; subsample / colsample_bytree are overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=880,
        max_depth=10,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # A different fold shuffle per trial
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch4 = GridSearchCV(estimator=xgb,
                            param_grid=param_test4,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch4.fit(X_train, y_train)
    if grid_score4.empty:
        # First trial: record the candidate list once
        grid_score4 = pd.DataFrame({'params': gsearch4.cv_results_['params']})
    grid_score4['mean_test_score_{}'.format(i)] = gsearch4.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch4.best_params_)
    print('Run {} best score: '.format(i), gsearch4.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score4['avg'] = grid_score4.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score4.loc[grid_score4.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 0 best score:  0.9466416533398943
Run 1 best param:  {'colsample_bytree': 0.8, 'subsample': 0.6}
Run 1 best score:  0.9461649649403371
Run 2 best param:  {'colsample_bytree': 0.6, 'subsample': 0.7}
Run 2 best score:  0.946195719030631
Run 3 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 3 best score:  0.9463033583466601
Best params:  params               {'colsample_bytree': 0.8, 'subsample': 0.8}
mean_test_score_0                                       0.946642
mean_test_score_1                                       0.945673
mean_test_score_2                                       0.946042
mean_test_score_3                                       0.946303
avg                                                     0.946165
Name: 10, dtype: object


In [49]:
# Finer search around 0.8 in steps of 0.05: subsample and colsample_bytree
# each in {0.75, 0.80, 0.85}.
# NUM_TRIALS is re-derived here (same formula as the other tuning cells) so this cell
# does not rely on an earlier tuning cell having been executed.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test5 = {
    'subsample': [i/100.0 for i in range(75, 90, 5)],
    'colsample_bytree': [i/100.0 for i in range(75, 90, 5)]
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score5 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; subsample / colsample_bytree are overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=880,
        max_depth=10,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # A different fold shuffle per trial
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch5 = GridSearchCV(estimator=xgb,
                            param_grid=param_test5,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch5.fit(X_train, y_train)
    if grid_score5.empty:
        # First trial: record the candidate list once
        grid_score5 = pd.DataFrame({'params': gsearch5.cv_results_['params']})
    grid_score5['mean_test_score_{}'.format(i)] = gsearch5.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch5.best_params_)
    print('Run {} best score: '.format(i), gsearch5.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score5['avg'] = grid_score5.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score5.loc[grid_score5.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 0 best score:  0.9466416533398943
Run 1 best param:  {'colsample_bytree': 0.75, 'subsample': 0.8}
Run 1 best score:  0.946026571534014
Run 2 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 2 best score:  0.946041948579161
Run 3 best param:  {'colsample_bytree': 0.75, 'subsample': 0.75}
Run 3 best score:  0.9464417517529832
Best params:  params               {'colsample_bytree': 0.8, 'subsample': 0.8}
mean_test_score_0                                       0.946642
mean_test_score_1                                       0.945673
mean_test_score_2                                       0.946042
mean_test_score_3                                       0.946303
avg                                                     0.946165
Name: 4, dtype: object


### Tuning Regularization Parameters

In [50]:
# Coarse, roughly logarithmic grid over the reg_alpha regularization parameter.
# NUM_TRIALS is re-derived here (same formula as the other tuning cells) so this cell
# does not rely on an earlier tuning cell having been executed.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test6 = {
    'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1, 100]
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score6 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; reg_alpha is overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=880,
        max_depth=10,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # A different fold shuffle per trial
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch6 = GridSearchCV(estimator=xgb,
                            param_grid=param_test6,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch6.fit(X_train, y_train)
    if grid_score6.empty:
        # First trial: record the candidate list once
        grid_score6 = pd.DataFrame({'params': gsearch6.cv_results_['params']})
    grid_score6['mean_test_score_{}'.format(i)] = gsearch6.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch6.best_params_)
    print('Run {} best score: '.format(i), gsearch6.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score6['avg'] = grid_score6.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score6.loc[grid_score6.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'reg_alpha': 0}
Run 0 best score:  0.9466416533398943
Run 1 best param:  {'reg_alpha': 0.1}
Run 1 best score:  0.945888178127691
Run 2 best param:  {'reg_alpha': 1e-05}
Run 2 best score:  0.9465186369787182
Run 3 best param:  {'reg_alpha': 0}
Run 3 best score:  0.9463033583466601
Best params:  params               {'reg_alpha': 0}
mean_test_score_0            0.946642
mean_test_score_1            0.945673
mean_test_score_2            0.946042
mean_test_score_3            0.946303
avg                          0.946165
Name: 0, dtype: object


In [51]:
# Finer grid over small reg_alpha values (0 and 1e-6 .. 5e-4).
# NUM_TRIALS is re-derived here (same formula as the other tuning cells) so this cell
# does not rely on an earlier tuning cell having been executed.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test7 = {
    'reg_alpha': [0, 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4]
}
# Accumulator: one row per candidate, one 'mean_test_score_<trial>' column per trial
grid_score7 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Base estimator; reg_alpha is overridden by the grid
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=880,
        max_depth=10,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    # A different fold shuffle per trial
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch7 = GridSearchCV(estimator=xgb,
                            param_grid=param_test7,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch7.fit(X_train, y_train)
    if grid_score7.empty:
        # First trial: record the candidate list once
        grid_score7 = pd.DataFrame({'params': gsearch7.cv_results_['params']})
    grid_score7['mean_test_score_{}'.format(i)] = gsearch7.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch7.best_params_)
    print('Run {} best score: '.format(i), gsearch7.best_score_)

# Average only the per-trial score columns. Summing whole rows would also sweep in the
# dict-valued 'params' column, which recent pandas refuses to add to floats (TypeError).
grid_score7['avg'] = grid_score7.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score7.loc[grid_score7.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'reg_alpha': 0}
Run 0 best score:  0.9466416533398943
Run 1 best param:  {'reg_alpha': 0.0005}
Run 1 best score:  0.946057325624308
Run 2 best param:  {'reg_alpha': 1e-05}
Run 2 best score:  0.9465186369787182
Run 3 best param:  {'reg_alpha': 0.0005}
Run 3 best score:  0.9465186369787182
Best params:  params               {'reg_alpha': 0}
mean_test_score_0            0.946642
mean_test_score_1            0.945673
mean_test_score_2            0.946042
mean_test_score_3            0.946303
avg                          0.946165
Name: 0, dtype: object


### Reduce the learning rate and tune n_estimators (remove)

In [52]:
# Same tuned configuration with the learning rate lowered to 0.01, to re-estimate how
# many boosting rounds are needed. The wrapper only builds the native parameter dict.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.01,
    n_estimators=880,
    max_depth=10,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    scale_pos_weight=1,
    n_jobs=-1,
    seed=0,
)

xgtrain = xgboost.DMatrix(data=X_train, label=y_train)
xgb_param = xgb.get_xgb_params()

# Stratified 5-fold CV on classification error with early stopping (patience 50);
# the returned per-round history is the cell output.
xgboost.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.065860,0.003643,0.100550,0.002674
1,0.056299,0.000789,0.087787,0.000941
2,0.050225,0.001014,0.083759,0.000791
3,0.047838,0.001047,0.080837,0.001228
4,0.046254,0.001038,0.078823,0.000461
5,0.044778,0.000658,0.077454,0.001162
6,0.044140,0.000856,0.076731,0.001204
7,0.043110,0.000704,0.076224,0.001047
8,0.042360,0.000801,0.075317,0.001635
9,0.041752,0.000963,0.075132,0.001720


In [12]:
# Repeated grid search over n_estimators for the learning_rate=0.01 model.
# Every trial runs a stratified 5-fold search on a differently shuffled split;
# the per-candidate mean accuracies are then averaged over all trials.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(1000, 2100, 100)) + [1080]
}

# Base estimator shared by every trial (GridSearchCV fits clones of it, so it is
# never modified). Its n_estimators value is only a placeholder: the search sets
# n_estimators from param_test8 for each candidate.
xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=1080,
    max_depth=10,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    objective='binary:logistic',
    n_jobs=-1,
    scale_pos_weight=1,
    seed=0)

# One row per grid candidate; gains one mean_test_score_<trial> column per trial.
grid_score8 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Only the fold shuffling (random_state=i) differs between trials.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy',
                            n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)

    trial_results = pd.DataFrame(gsearch8.cv_results_)
    if grid_score8.empty:
        # First trial: seed the table with the candidate parameter dicts.
        grid_score8 = trial_results[['params']].copy()
    grid_score8['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']

    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns. Summing the whole frame would also
# include the object-typed 'params' column (dicts), which fails on pandas >= 2.0
# where non-numeric columns are no longer dropped silently.
score_columns = [c for c in grid_score8.columns if c.startswith('mean_test_score_')]
grid_score8['avg'] = grid_score8[score_columns].sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'n_estimators': 2000}
Run 0 best score:  0.9453653585926928
Run 1 best param:  {'n_estimators': 2000}
Run 1 best score:  0.9449348013285767
Run 2 best param:  {'n_estimators': 1900}
Run 2 best score:  0.9450578176897527
Run 3 best param:  {'n_estimators': 2000}
Run 3 best score:  0.9453653585926928
Best params:  params               {'n_estimators': 2000}
mean_test_score_0                  0.945365
mean_test_score_1                  0.944935
mean_test_score_2                  0.944996
mean_test_score_3                  0.945365
avg                                0.945165
Name: 10, dtype: object


In [16]:
# Second repeated grid search over n_estimators (2000..3000) for the
# learning_rate=0.01 model. Every trial runs a stratified 5-fold search on a
# differently shuffled split; per-candidate mean accuracies are then averaged.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8b = {
    'n_estimators': list(range(2000, 3100, 100))
}

# Base estimator shared by every trial (GridSearchCV fits clones of it, so it is
# never modified). Its n_estimators value is only a placeholder: the search sets
# n_estimators from param_test8b for each candidate.
xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=1080,
    max_depth=10,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    objective='binary:logistic',
    n_jobs=-1,
    scale_pos_weight=1,
    seed=0)

# One row per grid candidate; gains one mean_test_score_<trial> column per trial.
grid_score8b = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # Only the fold shuffling (random_state=i) differs between trials.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8b = GridSearchCV(estimator=xgb,
                             param_grid=param_test8b,
                             scoring='accuracy',
                             n_jobs=-1,
                             cv=five_folds,
                             return_train_score=False)
    gsearch8b.fit(X_train, y_train)

    trial_results = pd.DataFrame(gsearch8b.cv_results_)
    if grid_score8b.empty:
        # First trial: seed the table with the candidate parameter dicts.
        grid_score8b = trial_results[['params']].copy()
    grid_score8b['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']

    print('Run {} best param: '.format(i), gsearch8b.best_params_)
    print('Run {} best score: '.format(i), gsearch8b.best_score_)

# Average only the per-trial score columns. Summing the whole frame would also
# include the object-typed 'params' column (dicts), which fails on pandas >= 2.0
# where non-numeric columns are no longer dropped silently.
score_columns = [c for c in grid_score8b.columns if c.startswith('mean_test_score_')]
grid_score8b['avg'] = grid_score8b[score_columns].sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score8b.loc[grid_score8b.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'n_estimators': 3000}
Run 0 best score:  0.945949686308279
Run 1 best param:  {'n_estimators': 2900}
Run 1 best score:  0.9456728994956329
Run 2 best param:  {'n_estimators': 2800}
Run 2 best score:  0.945872801082544
Run 3 best param:  {'n_estimators': 3000}
Run 3 best score:  0.9462264731209251
Best params:  params               {'n_estimators': 3000}
mean_test_score_0                   0.94595
mean_test_score_1                  0.945658
mean_test_score_2                  0.945873
mean_test_score_3                  0.946226
avg                                0.945927
Name: 10, dtype: object


## Part 3: Test on test set

In [20]:
# Fit the classifier on the training data once per trial, varying only the seed,
# and record the accuracy of each fit on X_test / y_test.
accuracy_array = []
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=880,
        max_depth=10,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0,
        scale_pos_weight=1,
        n_jobs=-1,
        seed=i,  # the only setting that changes between trials
    )
    model = xgb.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score this trial's predictions and keep the result for the average below.
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_array += [accuracy]

    # The trial index is filled in first, then the percentage.
    line_template = 'Accuracy {}: %.2f%%'.format(i)
    print(line_template % (accuracy * 100.0))

# Mean accuracy over all trials, reported as a percentage.
mean_accuracy_score = sum(accuracy_array) / NUM_TRIALS
print('Average accuracy is: %.2f%%' % (mean_accuracy_score * 100.0))

Accuracy 0: 94.75%
Accuracy 1: 94.66%
Accuracy 2: 94.75%
Accuracy 3: 94.69%
Average accuracy is: 94.71%
