# Dota2 dataset

## Part 1: Data encoding

### Import library

In [22]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import xgboost
import copy
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [23]:
# Switch to the project root so the relative "data/dota2/..." paths used by the
# loading cells resolve.
# NOTE(review): this absolute path is machine-specific; adjust it before running elsewhere.
os.chdir('/home/tai/Projects/research-project-Roland')

### Load train and test data

In [24]:
# Load the training split. The file has no header row, so columns are numbered
# 0..N-1, and '?' entries are read as missing values.
train = pd.read_csv(
    "data/dota2/dota2.0.train",
    header=None,
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

In [25]:
# Load the held-out test split with the same parsing options as the training
# split (no header row, '?' read as missing).
test = pd.read_csv(
    "data/dota2/dota2.0.test",
    header=None,
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

In [26]:
# Count missing values per column of the training split.
train.isnull().sum()

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
87     0
88     0
89     0
90     0
91     0
92     0
93     0
94     0
95     0
96     0
97     0
98     0
99     0
100    0
101    0
102    0
103    0
104    0
105    0
106    0
107    0
108    0
109    0
110    0
111    0
112    0
113    0
114    0
115    0
116    0
Length: 117, dtype: int64

In [27]:
# Count missing values per column of the test split.
test.isnull().sum()

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
87     0
88     0
89     0
90     0
91     0
92     0
93     0
94     0
95     0
96     0
97     0
98     0
99     0
100    0
101    0
102    0
103    0
104    0
105    0
106    0
107    0
108    0
109    0
110    0
111    0
112    0
113    0
114    0
115    0
116    0
Length: 117, dtype: int64

In [28]:
# Column 0 is used as the label and columns 2 onward as features; column 1 is
# left out of both.
# NOTE(review): presumably this follows the UCI Dota2 layout (1 = cluster id) -- confirm.
y_train, X_train = train.iloc[:, 0], train.iloc[:, 2:]
y_test, X_test = test.iloc[:, 0], test.iloc[:, 2:]

In [29]:
# Summary statistics of the raw (not yet encoded) training features.
X_train.describe()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,107,108,109,110,111,112,113,114,115,116
count,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0,...,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0,51472.0
mean,3.316347,2.387162,-0.002409,-0.000291,0.000952,-0.00033,-0.002681,0.002545,-3.9e-05,-0.001088,...,-0.001535,0.000311,0.001146,0.000117,0.0,0.00101,0.00035,-0.001107,0.000214,0.000369
std,2.633338,0.487345,0.402043,0.465119,0.164272,0.35501,0.331891,0.483905,0.350355,0.502716,...,0.534865,0.205801,0.283297,0.154335,0.0,0.220562,0.205422,0.17031,0.191674,0.138336
min,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


### Standardize y (recode label -1 as 0)

In [30]:
# Recode the -1 label as 0 (the later cells train with objective
# 'binary:logistic'). Rebinding to a new Series instead of writing through a
# boolean mask avoids modifying a slice of `train` in place.
y_train = y_train.replace(-1, 0)

In [31]:
# Recode the -1 label as 0, mirroring the training labels. Rebinding to a new
# Series instead of writing through a boolean mask avoids modifying a slice of
# `test` in place.
y_test = y_test.replace(-1, 0)

## One-hot encoding

In [32]:
# One-hot encode columns 2 and 3 of the training features; the remaining
# columns are passed through unchanged.
X_train = pd.get_dummies(data=X_train, columns=[2, 3])

In [33]:
# One-hot encode columns 2 and 3 of the test features; the remaining columns
# are passed through unchanged.
X_test = pd.get_dummies(data=X_test, columns=[2, 3])

In [34]:
# Mark the first 113 columns as categorical so the next get_dummies call
# expands them. Assigning through .iloc writes into the existing columns and
# can keep their original integer dtype, so cast with astype() and a
# per-column dtype mapping, then rebind the frame.
X_train = X_train.astype({col: 'category' for col in X_train.columns[:113]})

In [35]:
# Expand every categorical column of the training features into indicator columns.
X_train = pd.get_dummies(data=X_train)

In [36]:
# Mark the first 113 columns as categorical so the next get_dummies call
# expands them. Assigning through .iloc writes into the existing columns and
# can keep their original integer dtype, so cast with astype() and a
# per-column dtype mapping, then rebind the frame.
X_test = X_test.astype({col: 'category' for col in X_test.columns[:113]})

In [37]:
# Expand every categorical column of the test features into indicator columns.
# Train and test are encoded independently, so a value that occurs in only one
# split would give the two frames different columns. Align the test frame to
# the already-encoded training columns: indicators missing from the test split
# are filled with 0, and indicators unseen in training are dropped.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [38]:
# (rows, columns) of the training features after one-hot encoding.
X_train.shape

(51472, 347)

## Part 2: Tuning on train data

### Find optimal n_estimators

In [19]:
# Cross-validate a baseline booster (learning_rate 0.1) for up to 1500 rounds
# with early_stopping_rounds=50, to pick a starting n_estimators.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=1500,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
)

# xgboost.cv works on the native parameter dict and a DMatrix rather than on
# the scikit-learn wrapper.
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(X_train, label=y_train)

xgboost.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=1500,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.456000,0.006321,0.466312,0.001944
1,0.447083,0.004315,0.457803,0.002286
2,0.444509,0.005431,0.454791,0.003364
3,0.439793,0.003425,0.450459,0.003843
4,0.437063,0.003233,0.450905,0.002244
5,0.434644,0.003702,0.448768,0.006003
6,0.433017,0.003748,0.449137,0.004384
7,0.429986,0.003041,0.446165,0.004118
8,0.428160,0.002557,0.443076,0.003589
9,0.425255,0.002746,0.441716,0.004589


### Tuning max_depth and min_child_weight

In [21]:
# Coarse grid over max_depth and min_child_weight. The 5-fold grid search is
# repeated NUM_TRIALS times with differently shuffled folds, and the
# per-candidate accuracy is averaged over the trials.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1 = {
    'max_depth': range(1, 10, 2),
    'min_child_weight': range(1, 500, 50)
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score1 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=267,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1 = GridSearchCV(estimator=xgb,
                            param_grid=param_test1,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch1.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch1.cv_results_)
    if grid_score1.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score1 = trial_results[['params']].copy()
    grid_score1['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch1.best_params_)
    print('Run {} best score: '.format(i), gsearch1.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score1['avg'] = grid_score1.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score1.loc[grid_score1.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'max_depth': 7, 'min_child_weight': 51}
Run 0 best score:  0.596848772147964
Run 1 best param:  {'max_depth': 5, 'min_child_weight': 51}
Run 1 best score:  0.5970624805719614
Run 2 best param:  {'max_depth': 7, 'min_child_weight': 51}
Run 2 best score:  0.5945368355610817
Run 3 best param:  {'max_depth': 7, 'min_child_weight': 51}
Run 3 best score:  0.5965184954926951
Best params:  params               {'max_depth': 7, 'min_child_weight': 51}
mean_test_score_0                                    0.596849
mean_test_score_1                                    0.596285
mean_test_score_2                                    0.594537
mean_test_score_3                                    0.596518
avg                                                  0.596047
Name: 31, dtype: object


In [24]:
# Finer min_child_weight grid (25..70 in steps of 5) across the same max_depth
# values, again repeated NUM_TRIALS times with reshuffled folds.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1b = {
    'max_depth': range(1, 10, 2),
    'min_child_weight': range(25, 75, 5)
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score1b = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=267,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1b = GridSearchCV(estimator=xgb,
                             param_grid=param_test1b,
                             scoring='accuracy', n_jobs=-1,
                             cv=five_folds,
                             return_train_score=False)
    gsearch1b.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch1b.cv_results_)
    if grid_score1b.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score1b = trial_results[['params']].copy()
    grid_score1b['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch1b.best_params_)
    print('Run {} best score: '.format(i), gsearch1b.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score1b['avg'] = grid_score1b.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score1b.loc[grid_score1b.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'max_depth': 7, 'min_child_weight': 65}
Run 0 best score:  0.5980533105377681
Run 1 best param:  {'max_depth': 9, 'min_child_weight': 65}
Run 1 best score:  0.5981893068075848
Run 2 best param:  {'max_depth': 7, 'min_child_weight': 65}
Run 2 best score:  0.5970624805719614
Run 3 best param:  {'max_depth': 5, 'min_child_weight': 65}
Run 3 best score:  0.5966739198010569
Best params:  params               {'max_depth': 7, 'min_child_weight': 65}
mean_test_score_0                                    0.598053
mean_test_score_1                                    0.597257
mean_test_score_2                                    0.597062
mean_test_score_3                                    0.595722
avg                                                  0.597024
Name: 38, dtype: object


In [25]:
# Take a closer look at the neighbouring values: max_depth 6-8 and
# min_child_weight 65-70.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test2 = {
    'max_depth': [6, 7, 8],
    'min_child_weight': range(65, 71)
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score2 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=267,
        max_depth=7,
        min_child_weight=65,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch2 = GridSearchCV(estimator=xgb,
                            param_grid=param_test2,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch2.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch2.cv_results_)
    if grid_score2.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score2 = trial_results[['params']].copy()
    grid_score2['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch2.best_params_)
    print('Run {} best score: '.format(i), gsearch2.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score2['avg'] = grid_score2.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score2.loc[grid_score2.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'max_depth': 6, 'min_child_weight': 68}
Run 0 best score:  0.5985390115013988
Run 1 best param:  {'max_depth': 7, 'min_child_weight': 67}
Run 1 best score:  0.598811004041032
Run 2 best param:  {'max_depth': 7, 'min_child_weight': 67}
Run 2 best score:  0.5971207646875971
Run 3 best param:  {'max_depth': 8, 'min_child_weight': 69}
Run 3 best score:  0.596771059993783
Best params:  params               {'max_depth': 7, 'min_child_weight': 67}
mean_test_score_0                                    0.597373
mean_test_score_1                                    0.598811
mean_test_score_2                                    0.597121
mean_test_score_3                                    0.595819
avg                                                  0.597281
Name: 8, dtype: object


### Tuning gamma

In [26]:
# Grid over gamma (0.0 .. 0.4) with max_depth and min_child_weight fixed.
# NUM_TRIALS is reused from the earlier tuning cells.
param_test3 = {
    'gamma': [i/10.0 for i in range(0, 5)]
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score3 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=267,
        max_depth=7,
        min_child_weight=67,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch3 = GridSearchCV(estimator=xgb,
                            param_grid=param_test3,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch3.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch3.cv_results_)
    if grid_score3.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score3 = trial_results[['params']].copy()
    grid_score3['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch3.best_params_)
    print('Run {} best score: '.format(i), gsearch3.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score3['avg'] = grid_score3.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score3.loc[grid_score3.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'gamma': 0.3}
Run 0 best score:  0.5984807273857631
Run 1 best param:  {'gamma': 0.0}
Run 1 best score:  0.598811004041032
Run 2 best param:  {'gamma': 0.0}
Run 2 best score:  0.5971207646875971
Run 3 best param:  {'gamma': 0.3}
Run 3 best score:  0.5961299347217904
Best params:  params               {'gamma': 0.0}
mean_test_score_0          0.597373
mean_test_score_1          0.598811
mean_test_score_2          0.597121
mean_test_score_3          0.595819
avg                        0.597281
Name: 0, dtype: object


### Recalibrating the n_estimators

In [28]:
# Re-run the boosting-round cross-validation with the tuned max_depth and
# min_child_weight to re-estimate n_estimators.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=267,
    max_depth=7,
    min_child_weight=67,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    seed=0,
)

# xgboost.cv works on the native parameter dict and a DMatrix rather than on
# the scikit-learn wrapper.
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(X_train, label=y_train)

xgboost.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=1500,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.451269,0.006996,0.458094,0.003188
1,0.443703,0.006866,0.449682,0.006224
2,0.440356,0.008308,0.446185,0.007777
3,0.435824,0.003161,0.443620,0.006135
4,0.433911,0.003296,0.442415,0.005501
5,0.432522,0.004231,0.440492,0.004914
6,0.430793,0.002049,0.441094,0.006321
7,0.429355,0.003424,0.440200,0.004762
8,0.428009,0.002957,0.438841,0.004180
9,0.426742,0.002118,0.437247,0.004350


In [29]:
# Coarse grid over n_estimators (100..1400 in steps of 100, plus 271),
# repeated NUM_TRIALS times with reshuffled folds.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(100, 1500, 100)) + [271]
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score8 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=271,
        max_depth=7,
        min_child_weight=67,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch8.cv_results_)
    if grid_score8.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score8 = trial_results[['params']].copy()
    grid_score8['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'n_estimators': 400}
Run 0 best score:  0.5981310226919491
Run 1 best param:  {'n_estimators': 271}
Run 1 best score:  0.5989664283493938
Run 2 best param:  {'n_estimators': 400}
Run 2 best score:  0.5969070562635996
Run 3 best param:  {'n_estimators': 400}
Run 3 best score:  0.5968293441094187
Best params:  params               {'n_estimators': 400}
mean_test_score_0                 0.598131
mean_test_score_1                 0.597257
mean_test_score_2                 0.596907
mean_test_score_3                 0.596829
avg                               0.597281
Name: 3, dtype: object


In [30]:
# Finer grid over n_estimators (350..440 in steps of 10), repeated NUM_TRIALS
# times with reshuffled folds.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(350, 450, 10))
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score8 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=271,
        max_depth=7,
        min_child_weight=67,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch8.cv_results_)
    if grid_score8.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score8 = trial_results[['params']].copy()
    grid_score8['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'n_estimators': 410}
Run 0 best score:  0.5990829965806652
Run 1 best param:  {'n_estimators': 350}
Run 1 best score:  0.5978978862294063
Run 2 best param:  {'n_estimators': 420}
Run 2 best score:  0.5977230338824993
Run 3 best param:  {'n_estimators': 370}
Run 3 best score:  0.597023624494871
Best params:  params               {'n_estimators': 410}
mean_test_score_0                 0.599083
mean_test_score_1                 0.597509
mean_test_score_2                 0.596985
mean_test_score_3                 0.596616
avg                               0.597548
Name: 6, dtype: object


### Tuning the subsample and colsample_bytree

In [31]:
# Grid over subsample and colsample_bytree (0.6 .. 0.9 each).
# NUM_TRIALS is reused from the earlier tuning cells.
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score4 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=410,
        max_depth=7,
        min_child_weight=67,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch4 = GridSearchCV(estimator=xgb,
                            param_grid=param_test4,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch4.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch4.cv_results_)
    if grid_score4.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score4 = trial_results[['params']].copy()
    grid_score4['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch4.best_params_)
    print('Run {} best score: '.format(i), gsearch4.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score4['avg'] = grid_score4.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score4.loc[grid_score4.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 0 best score:  0.5990829965806652
Run 1 best param:  {'colsample_bytree': 0.7, 'subsample': 0.8}
Run 1 best score:  0.5987527199253964
Run 2 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 2 best score:  0.5969847684177806
Run 3 best param:  {'colsample_bytree': 0.7, 'subsample': 0.8}
Run 3 best score:  0.5967322039166926
Best params:  params               {'colsample_bytree': 0.7, 'subsample': 0.8}
mean_test_score_0                                       0.598714
mean_test_score_1                                       0.598753
mean_test_score_2                                       0.596363
mean_test_score_3                                       0.596732
avg                                                      0.59764
Name: 6, dtype: object


In [33]:
# Refine the search in steps of 0.05: subsample 0.75-0.85 and
# colsample_bytree 0.65-0.75. NUM_TRIALS is reused from the earlier tuning cells.
param_test5 = {
    'subsample': [i/100.0 for i in range(75, 86, 5)],
    'colsample_bytree': [i/100.0 for i in range(65, 76, 5)]
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score5 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=410,
        max_depth=7,
        min_child_weight=67,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch5 = GridSearchCV(estimator=xgb,
                            param_grid=param_test5,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch5.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch5.cv_results_)
    if grid_score5.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score5 = trial_results[['params']].copy()
    grid_score5['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch5.best_params_)
    print('Run {} best score: '.format(i), gsearch5.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score5['avg'] = grid_score5.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score5.loc[grid_score5.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'colsample_bytree': 0.65, 'subsample': 0.75}
Run 0 best score:  0.600423531240286
Run 1 best param:  {'colsample_bytree': 0.75, 'subsample': 0.8}
Run 1 best score:  0.5992967050046627
Run 2 best param:  {'colsample_bytree': 0.75, 'subsample': 0.8}
Run 2 best score:  0.59694591234069
Run 3 best param:  {'colsample_bytree': 0.7, 'subsample': 0.85}
Run 3 best score:  0.5975676095741373
Best params:  params               {'colsample_bytree': 0.7, 'subsample': 0.8}
mean_test_score_0                                       0.598714
mean_test_score_1                                       0.598753
mean_test_score_2                                       0.596363
mean_test_score_3                                       0.596732
avg                                                      0.59764
Name: 4, dtype: object


### Tuning Regularization Parameters

In [35]:
# Coarse grid over the reg_alpha regularization parameter.
# NUM_TRIALS is reused from the earlier tuning cells.
param_test6 = {
    'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1, 100]
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score6 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=410,
        max_depth=7,
        min_child_weight=67,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.7,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch6 = GridSearchCV(estimator=xgb,
                            param_grid=param_test6,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch6.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch6.cv_results_)
    if grid_score6.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score6 = trial_results[['params']].copy()
    grid_score6['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch6.best_params_)
    print('Run {} best score: '.format(i), gsearch6.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score6['avg'] = grid_score6.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score6.loc[grid_score6.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'reg_alpha': 0}
Run 0 best score:  0.5987138638483058
Run 1 best param:  {'reg_alpha': 0}
Run 1 best score:  0.5987527199253964
Run 2 best param:  {'reg_alpha': 0}
Run 2 best score:  0.5963630711843332
Run 3 best param:  {'reg_alpha': 1}
Run 3 best score:  0.5972373329188685
Best params:  params               {'reg_alpha': 0}
mean_test_score_0            0.598714
mean_test_score_1            0.598753
mean_test_score_2            0.596363
mean_test_score_3            0.596732
avg                           0.59764
Name: 0, dtype: object


In [18]:
# Finer grid over small reg_alpha values (0 and 1e-6 .. 5e-4), repeated
# NUM_TRIALS times with reshuffled folds.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test7 = {
    'reg_alpha': [0, 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4]
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score7 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=410,
        max_depth=7,
        min_child_weight=67,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.7,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch7 = GridSearchCV(estimator=xgb,
                            param_grid=param_test7,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch7.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch7.cv_results_)
    if grid_score7.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score7 = trial_results[['params']].copy()
    grid_score7['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch7.best_params_)
    print('Run {} best score: '.format(i), gsearch7.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score7['avg'] = grid_score7.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score7.loc[grid_score7.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'reg_alpha': 0}
Run 0 best score:  0.5987138638483058
Run 1 best param:  {'reg_alpha': 0}
Run 1 best score:  0.5987527199253964
Run 2 best param:  {'reg_alpha': 0}
Run 2 best score:  0.5963630711843332
Run 3 best param:  {'reg_alpha': 0}
Run 3 best score:  0.5967322039166926
Best params:  params               {'reg_alpha': 0}
mean_test_score_0            0.598714
mean_test_score_1            0.598753
mean_test_score_2            0.596363
mean_test_score_3            0.596732
avg                           0.59764
Name: 0, dtype: object


### Reduce the learning rate and tune n_estimators

In [20]:
# Cross-validate the tuned configuration at learning_rate 0.01 for up to 5000
# rounds with early_stopping_rounds=50, to size n_estimators for the lower rate.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.01,
    n_estimators=410,
    max_depth=7,
    min_child_weight=67,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.7,
    n_jobs=-1,
    seed=0,
)

# xgboost.cv works on the native parameter dict and a DMatrix rather than on
# the scikit-learn wrapper.
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(X_train, label=y_train)

xgboost.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.449118,0.004318,0.454577,0.004564
1,0.442862,0.002303,0.448904,0.004448
2,0.440686,0.001609,0.446767,0.005154
3,0.440201,0.003466,0.448127,0.004002
4,0.439579,0.003657,0.447253,0.005173
5,0.439001,0.002369,0.447233,0.005898
6,0.440409,0.003817,0.448652,0.005159
7,0.438491,0.003978,0.447661,0.005742
8,0.437184,0.003217,0.444669,0.006240
9,0.437767,0.002869,0.444572,0.005936


In [39]:
# Grid over n_estimators (1000..1900 in steps of 100, plus 1038) at
# learning_rate 0.01, repeated NUM_TRIALS times with reshuffled folds.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(1000, 2000, 100)) + [1038]
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score8 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.01,
        n_estimators=410,
        max_depth=7,
        min_child_weight=67,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.7,
        objective='binary:logistic',
        n_jobs=-1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch8.cv_results_)
    if grid_score8.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score8 = trial_results[['params']].copy()
    grid_score8['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 4 times
Run 0 best param:  {'n_estimators': 1900}
Run 0 best score:  0.5971984768417781
Run 1 best param:  {'n_estimators': 1800}
Run 1 best score:  0.5957996580665216
Run 2 best param:  {'n_estimators': 1900}
Run 2 best score:  0.5957996580665216
Run 3 best param:  {'n_estimators': 1700}
Run 3 best score:  0.5963630711843332
Best params:  params               {'n_estimators': 1900}
mean_test_score_0                  0.597198
mean_test_score_1                    0.5958
mean_test_score_2                    0.5958
mean_test_score_3                  0.596363
avg                                 0.59629
Name: 9, dtype: object


In [None]:
# Grid over n_estimators (1400..1480 in steps of 20, plus 1423) at
# learning_rate 0.01, repeated NUM_TRIALS times with reshuffled folds.
# NOTE(review): the base estimator below (max_depth=5, min_child_weight=3,
# gamma=0.1, subsample=0.95, colsample_bytree=0.6, n_jobs=8) does not match the
# values tuned in the cells above -- confirm whether these settings are intended.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(1400, 1500, 20)) + [1423]
}
# CV results: one row per candidate, one mean_test_score_<trial> column per trial
grid_score8 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.01,
        n_estimators=195,
        max_depth=5,
        min_child_weight=3,
        gamma=0.1,
        subsample=0.95,
        colsample_bytree=0.6,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
    # Only the fold shuffling changes between trials; the model seed is fixed.
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch8.cv_results_)
    if grid_score8.empty:
        # First trial: start the table with the candidate parameter sets.
        grid_score8 = trial_results[['params']].copy()
    grid_score8['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns; summing the whole frame would also
# pull in the non-numeric 'params' column.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_').mean(axis=1)
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

## Part 3: Test on test set

In [40]:
# Refit the tuned configuration NUM_TRIALS times, changing only the model seed,
# and report the accuracy on the held-out test set for each fit and on average.
tuned_params = dict(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=410,
    max_depth=7,
    min_child_weight=67,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.7,
    n_jobs=-1,
)

accuracy_array = []
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(seed=i, **tuned_params)
    model = xgb.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Score the predictions against the test labels
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_array.append(accuracy)
    accuracy_line = 'Accuracy {}: %.2f%%'.format(i)
    print(accuracy_line % (accuracy * 100.0))

mean_accuracy_score = sum(accuracy_array) / NUM_TRIALS
print('Average accuracy is: %.2f%%' % (mean_accuracy_score * 100.0))

Accuracy 0: 59.55%
Accuracy 1: 59.47%
Accuracy 2: 59.37%
Accuracy 3: 59.40%
Average accuracy is: 59.45%
