# Adult dataset

## Part 1: Data encoding

### Import libraries

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import xgboost
import copy
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [10]:
# NOTE(review): hard-coded absolute path on one developer's machine. The relative
# "data/adult/..." paths passed to pd.read_csv below resolve against this directory.
os.chdir('/home/tai/Projects/research-project-Roland/')

### Load train and test data

In [11]:
# Column names are supplied explicitly, so the first CSV row is read as data.
# '?' entries are parsed as missing values.
adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education',
    'education_num', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'income',
]
train = pd.read_csv(
    "data/adult/adult.0.train.csv",
    names=adult_columns,
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

In [12]:
# Column names are supplied explicitly, so the first CSV row is read as data.
# '?' entries are parsed as missing values.
adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education',
    'education_num', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'native_country', 'income',
]
test = pd.read_csv(
    "data/adult/adult.0.test.csv",
    names=adult_columns,
    na_values='?',
    encoding='latin1',
    low_memory=False,
)

### Convert the output to binary

In [13]:
# Binary target: 1 where income is exactly '>50K', 0 otherwise.
# The raw 'income' label column is then dropped.
train = (
    train
    .assign(over_50k=np.where(train['income'] == '>50K', 1, 0))
    .drop(columns=['income'])
)

In [14]:
# Binary target: 1 where income is exactly '>50K', 0 otherwise.
# The raw 'income' label column is then dropped.
test = (
    test
    .assign(over_50k=np.where(test['income'] == '>50K', 1, 0))
    .drop(columns=['income'])
)

### Convert the numeric columns

In [15]:
# Coerce the numeric columns to numeric dtypes; entries that cannot be parsed
# become NaN (errors='coerce').
# The original cell converted 'age' twice and never converted 'education_num'.
# The duplicate is replaced by 'education_num', presumably the intended column
# (it is the only remaining numeric column in the schema).
numeric_downcasts = [
    ('age', 'integer'),
    ('fnlwgt', 'float'),
    ('education_num', 'integer'),
    ('capital_gain', 'float'),
    ('capital_loss', 'float'),
    ('hours_per_week', 'float'),
]
for column, downcast in numeric_downcasts:
    train.loc[:, column] = pd.to_numeric(train[column], downcast=downcast, errors='coerce')

In [16]:
# Coerce the numeric columns to numeric dtypes; entries that cannot be parsed
# become NaN (errors='coerce').
# The original cell converted 'age' twice and never converted 'education_num'.
# The duplicate is replaced by 'education_num', presumably the intended column
# (it is the only remaining numeric column in the schema).
numeric_downcasts = [
    ('age', 'integer'),
    ('fnlwgt', 'float'),
    ('education_num', 'integer'),
    ('capital_gain', 'float'),
    ('capital_loss', 'float'),
    ('hours_per_week', 'float'),
]
for column, downcast in numeric_downcasts:
    test.loc[:, column] = pd.to_numeric(test[column], downcast=downcast, errors='coerce')

In [17]:
# Number of missing values per column in the training split.
train.isnull().sum()

age                  0
workclass         1379
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1384
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     431
over_50k             0
dtype: int64

In [18]:
# Number of missing values per column in the test split.
test.isnull().sum()

age                  0
workclass         1420
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1425
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     426
over_50k             0
dtype: int64

### One hot encoding

In [19]:
# Pin the category set (and its order) of every categorical column so that
# one-hot encoding produces the same dummy columns whichever values happen to
# occur in this split. Values not in a list become NaN.
# `astype('category', categories=...)` is deprecated (and removed in later
# pandas releases); the categories are carried by a CategoricalDtype instead.
category_levels = {
    'education': ['Bachelors', 'Some-college', '11th', 'HS-grad',
                  'Prof-school',
                  'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th',
                  '12th', 'Masters', '1st-4th', '10th',
                  'Doctorate', '5th-6th', 'Preschool'],
    'marital_status': ['Married-civ-spouse', 'Divorced',
                       'Never-married', 'Separated',
                       'Widowed', 'Married-spouse-absent',
                       'Married-AF-spouse'],
    'relationship': ['Wife', 'Own-child', 'Husband',
                     'Not-in-family', 'Other-relative', 'Unmarried'],
    'race': ['White', 'Asian-Pac-Islander',
             'Amer-Indian-Eskimo', 'Other', 'Black'],
    'sex': ['Female', 'Male'],
    'workclass': ['Private', 'Self-emp-not-inc',
                  'Self-emp-inc', 'Federal-gov',
                  'Local-gov', 'State-gov',
                  'Without-pay', 'Never-worked'],
    'occupation': ['Tech-support', 'Craft-repair',
                   'Other-service', 'Sales', 'Exec-managerial',
                   'Prof-specialty', 'Handlers-cleaners',
                   'Machine-op-inspct', 'Adm-clerical',
                   'Farming-fishing', 'Transport-moving',
                   'Priv-house-serv',
                   'Protective-serv', 'Armed-Forces'],
    # Spellings below follow the raw data values and must not be "corrected".
    'native_country': ['United-States',
                       'Cambodia',
                       'England',
                       'Puerto-Rico',
                       'Canada',
                       'Germany',
                       'Outlying-US(Guam-USVI-etc)',
                       'India',
                       'Japan',
                       'Greece',
                       'South',
                       'China',
                       'Cuba',
                       'Iran',
                       'Honduras',
                       'Philippines',
                       'Italy',
                       'Poland',
                       'Jamaica',
                       'Vietnam',
                       'Mexico',
                       'Portugal',
                       'Ireland',
                       'France',
                       'Dominican-Republic',
                       'Laos',
                       'Ecuador',
                       'Taiwan',
                       'Haiti',
                       'Columbia',
                       'Hungary',
                       'Guatemala',
                       'Nicaragua',
                       'Scotland',
                       'Thailand',
                       'Yugoslavia',
                       'El-Salvador',
                       'Trinadad&Tobago',
                       'Peru',
                       'Hong',
                       'Holand-Netherlands'],
}
for column, levels in category_levels.items():
    train[column] = train[column].astype(pd.api.types.CategoricalDtype(categories=levels))

  exec(code_obj, self.user_global_ns, self.user_ns)


In [20]:
# One-hot encode the categorical columns in two passes.
# The second pass (workclass, occupation, native_country) also emits a
# `<column>_nan` indicator for missing values.
plain_dummy_columns = ['education', 'marital_status', 'relationship', 'race', 'sex']
nan_dummy_columns = ['workclass', 'occupation', 'native_country']
train = pd.get_dummies(
    pd.get_dummies(train, columns=plain_dummy_columns),
    columns=nan_dummy_columns,
    dummy_na=True,
)

In [21]:
# Pin the category set (and its order) of every categorical column so that
# one-hot encoding produces the same dummy columns whichever values happen to
# occur in this split. Values not in a list become NaN.
# `astype('category', categories=...)` is deprecated (and removed in later
# pandas releases); the categories are carried by a CategoricalDtype instead.
category_levels = {
    'education': ['Bachelors', 'Some-college', '11th', 'HS-grad',
                  'Prof-school',
                  'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th',
                  '12th', 'Masters', '1st-4th', '10th',
                  'Doctorate', '5th-6th', 'Preschool'],
    'marital_status': ['Married-civ-spouse', 'Divorced',
                       'Never-married', 'Separated',
                       'Widowed', 'Married-spouse-absent',
                       'Married-AF-spouse'],
    'relationship': ['Wife', 'Own-child', 'Husband',
                     'Not-in-family', 'Other-relative', 'Unmarried'],
    'race': ['White', 'Asian-Pac-Islander',
             'Amer-Indian-Eskimo', 'Other', 'Black'],
    'sex': ['Female', 'Male'],
    'workclass': ['Private', 'Self-emp-not-inc',
                  'Self-emp-inc', 'Federal-gov',
                  'Local-gov', 'State-gov',
                  'Without-pay', 'Never-worked'],
    'occupation': ['Tech-support', 'Craft-repair',
                   'Other-service', 'Sales', 'Exec-managerial',
                   'Prof-specialty', 'Handlers-cleaners',
                   'Machine-op-inspct', 'Adm-clerical',
                   'Farming-fishing', 'Transport-moving',
                   'Priv-house-serv',
                   'Protective-serv', 'Armed-Forces'],
    # Spellings below follow the raw data values and must not be "corrected".
    'native_country': ['United-States',
                       'Cambodia',
                       'England',
                       'Puerto-Rico',
                       'Canada',
                       'Germany',
                       'Outlying-US(Guam-USVI-etc)',
                       'India',
                       'Japan',
                       'Greece',
                       'South',
                       'China',
                       'Cuba',
                       'Iran',
                       'Honduras',
                       'Philippines',
                       'Italy',
                       'Poland',
                       'Jamaica',
                       'Vietnam',
                       'Mexico',
                       'Portugal',
                       'Ireland',
                       'France',
                       'Dominican-Republic',
                       'Laos',
                       'Ecuador',
                       'Taiwan',
                       'Haiti',
                       'Columbia',
                       'Hungary',
                       'Guatemala',
                       'Nicaragua',
                       'Scotland',
                       'Thailand',
                       'Yugoslavia',
                       'El-Salvador',
                       'Trinadad&Tobago',
                       'Peru',
                       'Hong',
                       'Holand-Netherlands'],
}
for column, levels in category_levels.items():
    test[column] = test[column].astype(pd.api.types.CategoricalDtype(categories=levels))

In [22]:
# One-hot encode the categorical columns in two passes.
# The second pass (workclass, occupation, native_country) also emits a
# `<column>_nan` indicator for missing values.
plain_dummy_columns = ['education', 'marital_status', 'relationship', 'race', 'sex']
nan_dummy_columns = ['workclass', 'occupation', 'native_country']
test = pd.get_dummies(
    pd.get_dummies(test, columns=plain_dummy_columns),
    columns=nan_dummy_columns,
    dummy_na=True,
)

In [23]:
# Separate the feature matrix from the binary target for each split.
target_column = 'over_50k'
X_train, y_train = train.drop(columns=[target_column]), train[target_column]
X_test, y_test = test.drop(columns=[target_column]), test[target_column]

In [24]:
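# Column alignment is left disabled: train and test are cast to the same explicit
# category lists before get_dummies, so their dummy columns already match.
# X_train, X_test = X_train.align(X_test, join='outer', fill_value=0, axis=1)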

In [25]:
# (rows, columns) of the training feature matrix.
X_train.shape

(24421, 108)

In [26]:
# (rows, columns) of the test feature matrix; the column count should equal X_train's.
X_test.shape

(24421, 108)

In [27]:
# Feature names after one-hot encoding (training split).
X_train.columns

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'education_Bachelors', 'education_Some-college',
       'education_11th', 'education_HS-grad',
       ...
       'native_country_Nicaragua', 'native_country_Scotland',
       'native_country_Thailand', 'native_country_Yugoslavia',
       'native_country_El-Salvador', 'native_country_Trinadad&Tobago',
       'native_country_Peru', 'native_country_Hong',
       'native_country_Holand-Netherlands', 'native_country_nan'],
      dtype='object', length=108)

In [28]:
# Feature names after one-hot encoding (test split); compare with X_train.columns.
X_test.columns

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week', 'education_Bachelors', 'education_Some-college',
       'education_11th', 'education_HS-grad',
       ...
       'native_country_Nicaragua', 'native_country_Scotland',
       'native_country_Thailand', 'native_country_Yugoslavia',
       'native_country_El-Salvador', 'native_country_Trinadad&Tobago',
       'native_country_Peru', 'native_country_Hong',
       'native_country_Holand-Netherlands', 'native_country_nan'],
      dtype='object', length=108)

## Part 2: Tuning on train data

### Find optimal n_estimators

In [34]:
# Starting configuration for sizing n_estimators: up to 5000 boosting rounds
# under stratified 5-fold CV, stopping once the CV error has not improved for
# 50 rounds.
initial_settings = dict(
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=-1,
)
xgb = XGBClassifier(**initial_settings)

# Reuse the sklearn wrapper's settings with the native cross-validation API.
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(X_train, label=y_train)

xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=5000,
    nfold=5,
    metrics=['error'],
    early_stopping_rounds=50,
    stratified=True,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.167069,0.008100,0.172639,0.012472
1,0.144885,0.002585,0.148725,0.004196
2,0.147342,0.002714,0.151468,0.005173
3,0.145203,0.000719,0.148315,0.005444
4,0.144445,0.001555,0.147742,0.004791
5,0.144251,0.001563,0.147455,0.005365
6,0.143708,0.001650,0.147046,0.005066
7,0.143545,0.001366,0.146800,0.004553
8,0.143033,0.001004,0.146268,0.004691
9,0.143381,0.001173,0.146186,0.003970


### Tuning max_depth and min_child_weight

In [21]:
# Repeated grid search over max_depth / min_child_weight (coarse grid).
# Every trial uses a differently shuffled stratified 5-fold split (random_state=i);
# each candidate's CV accuracy is then averaged over the trials.
# The trial count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1 = {
 'max_depth':range(1,10,2),
 'min_child_weight':range(1,300,50)
}
# Mean CV accuracy per candidate, one column per trial
grid_score1 = pd.DataFrame()

# The base estimator is identical for every trial (GridSearchCV clones it before
# fitting), so it is built once outside the loop.
xgb = XGBClassifier(
    learning_rate =0.1,
    n_estimators=159,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    n_jobs=8,
    seed=0)

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1 = GridSearchCV(estimator = xgb,
                            param_grid = param_test1,
                            scoring='accuracy',n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch1.fit(X_train,y_train)
    trial_scores = pd.DataFrame(gsearch1.cv_results_, columns=['params', 'mean_test_score'])
    if grid_score1.empty:
        grid_score1 = trial_scores.rename(columns={'mean_test_score': 'mean_test_score_0'})
    else:
        grid_score1['mean_test_score_{}'.format(i)] = trial_scores['mean_test_score']
    print('Run {} best param: '.format(i), gsearch1.best_params_)
    print('Run {} best score: '.format(i), gsearch1.best_score_)

# Sum only the per-trial score columns. Summing the whole frame would include
# the non-numeric 'params' column, which pandas >= 2.0 no longer skips silently.
score_columns = [name for name in grid_score1.columns if name.startswith('mean_test_score_')]
grid_score1['avg'] = grid_score1[score_columns].sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score1.loc[grid_score1.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 0 best score:  0.8694566152082225
Run 1 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 1 best score:  0.8706441177674952
Run 2 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 2 best score:  0.8706031694033823
Run 3 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 3 best score:  0.8713811883215266
Run 4 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 4 best score:  0.8698251504852381
Run 5 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 5 best score:  0.8696204086646738




Run 6 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 6 best score:  0.8694975635723353
Run 7 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 7 best score:  0.8697842021211253
Run 8 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 8 best score:  0.8706031694033823
Best params:  params               {'max_depth': 5, 'min_child_weight': 1}
mean_test_score_0                                   0.869457
mean_test_score_1                                   0.870644
mean_test_score_2                                   0.870603
mean_test_score_3                                   0.871381
mean_test_score_4                                   0.869825
mean_test_score_5                                    0.86962
mean_test_score_6                                   0.869498
mean_test_score_7                                   0.869784
mean_test_score_8                                   0.870603
avg                                                 0.870157
Name: 12, dtype: object


In [22]:
# Repeated grid search over max_depth / min_child_weight (finer min_child_weight grid).
# Every trial uses a differently shuffled stratified 5-fold split (random_state=i);
# each candidate's CV accuracy is then averaged over the trials.
# The trial count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1b = {
 'max_depth':range(1,10,2),
 'min_child_weight':range(1, 10, 2)
}
# Mean CV accuracy per candidate, one column per trial
grid_score1b = pd.DataFrame()

# The base estimator is identical for every trial (GridSearchCV clones it before
# fitting), so it is built once outside the loop.
xgb = XGBClassifier(
    learning_rate =0.1,
    n_estimators=159,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    n_jobs=8,
    seed=0)

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1b = GridSearchCV(estimator = xgb,
                            param_grid = param_test1b,
                            scoring='accuracy',n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch1b.fit(X_train,y_train)
    trial_scores = pd.DataFrame(gsearch1b.cv_results_, columns=['params', 'mean_test_score'])
    if grid_score1b.empty:
        grid_score1b = trial_scores.rename(columns={'mean_test_score': 'mean_test_score_0'})
    else:
        grid_score1b['mean_test_score_{}'.format(i)] = trial_scores['mean_test_score']
    print('Run {} best param: '.format(i), gsearch1b.best_params_)
    print('Run {} best score: '.format(i), gsearch1b.best_score_)

# Sum only the per-trial score columns. Summing the whole frame would include
# the non-numeric 'params' column, which pandas >= 2.0 no longer skips silently.
score_columns = [name for name in grid_score1b.columns if name.startswith('mean_test_score_')]
grid_score1b['avg'] = grid_score1b[score_columns].sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score1b.loc[grid_score1b.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'max_depth': 5, 'min_child_weight': 3}
Run 0 best score:  0.8699889439416896
Run 1 best param:  {'max_depth': 5, 'min_child_weight': 3}
Run 1 best score:  0.870685066131608
Run 2 best param:  {'max_depth': 5, 'min_child_weight': 5}
Run 2 best score:  0.8714221366856394
Run 3 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 3 best score:  0.8713811883215266
Run 4 best param:  {'max_depth': 5, 'min_child_weight': 3}
Run 4 best score:  0.8704393759469309
Run 5 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 5 best score:  0.8696204086646738
Run 6 best param:  {'max_depth': 5, 'min_child_weight': 5}
Run 6 best score:  0.869866098849351
Run 7 best param:  {'max_depth': 5, 'min_child_weight': 5}
Run 7 best score:  0.8698251504852381
Run 8 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 8 best score:  0.8706031694033823
Best params:  params               {'max_depth': 5, 'min_child_weight': 3}
mean_test_score_0                         

In [23]:
# Look again, more carefully, at the values neighbouring the previous optimum.
# Every trial uses a differently shuffled stratified 5-fold split (random_state=i);
# each candidate's CV accuracy is then averaged over the trials.
# The trial count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test2 = {
 'max_depth':[4, 5, 6],
 'min_child_weight':[2, 3 ,4]
}
# Mean CV accuracy per candidate, one column per trial
grid_score2 = pd.DataFrame()

# The base estimator is identical for every trial (GridSearchCV clones it before
# fitting), so it is built once outside the loop.
xgb = XGBClassifier(
    learning_rate =0.1,
    n_estimators=159,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    n_jobs=8,
    scale_pos_weight=1,
    seed=0)

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch2 = GridSearchCV(estimator = xgb,
                            param_grid = param_test2,
                            scoring='accuracy',n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch2.fit(X_train,y_train)
    trial_scores = pd.DataFrame(gsearch2.cv_results_, columns=['params', 'mean_test_score'])
    if grid_score2.empty:
        grid_score2 = trial_scores.rename(columns={'mean_test_score': 'mean_test_score_0'})
    else:
        grid_score2['mean_test_score_{}'.format(i)] = trial_scores['mean_test_score']
    print('Run {} best param: '.format(i), gsearch2.best_params_)
    print('Run {} best score: '.format(i), gsearch2.best_score_)

# Sum only the per-trial score columns. Summing the whole frame would include
# the non-numeric 'params' column, which pandas >= 2.0 no longer skips silently.
score_columns = [name for name in grid_score2.columns if name.startswith('mean_test_score_')]
grid_score2['avg'] = grid_score2[score_columns].sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score2.loc[grid_score2.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'max_depth': 5, 'min_child_weight': 3}
Run 0 best score:  0.8699889439416896
Run 1 best param:  {'max_depth': 5, 'min_child_weight': 3}
Run 1 best score:  0.870685066131608
Run 2 best param:  {'max_depth': 5, 'min_child_weight': 4}
Run 2 best score:  0.8717087752344294
Run 3 best param:  {'max_depth': 6, 'min_child_weight': 3}
Run 3 best score:  0.8713811883215266
Run 4 best param:  {'max_depth': 5, 'min_child_weight': 3}
Run 4 best score:  0.8704393759469309
Run 5 best param:  {'max_depth': 6, 'min_child_weight': 3}
Run 5 best score:  0.8704393759469309
Run 6 best param:  {'max_depth': 6, 'min_child_weight': 3}
Run 6 best score:  0.8706031694033823
Run 7 best param:  {'max_depth': 6, 'min_child_weight': 2}
Run 7 best score:  0.8708079112239466
Run 8 best param:  {'max_depth': 5, 'min_child_weight': 4}
Run 8 best score:  0.8702346341263667
Best params:  params               {'max_depth': 5, 'min_child_weight': 3}
mean_test_score_0                        

### Tuning gamma

In [30]:
# Repeated grid search over gamma (0.0 to 0.4 in steps of 0.1), with max_depth and
# min_child_weight fixed at 5 and 3.
# Every trial uses a differently shuffled stratified 5-fold split (random_state=i);
# each candidate's CV accuracy is then averaged over the trials.
# The trial count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test3 = {
 'gamma':[step/10.0 for step in range(0,5)]
}
# Mean CV accuracy per candidate, one column per trial
grid_score3 = pd.DataFrame()

# The base estimator is identical for every trial (GridSearchCV clones it before
# fitting), so it is built once outside the loop.
xgb = XGBClassifier(
    learning_rate =0.1,
    n_estimators=159,
    max_depth=5,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    n_jobs=-1,
    scale_pos_weight=1,
    seed=0)

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch3 = GridSearchCV(estimator = xgb,
                            param_grid = param_test3,
                            scoring='accuracy',n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch3.fit(X_train,y_train)
    trial_scores = pd.DataFrame(gsearch3.cv_results_, columns=['params', 'mean_test_score'])
    if grid_score3.empty:
        grid_score3 = trial_scores.rename(columns={'mean_test_score': 'mean_test_score_0'})
    else:
        grid_score3['mean_test_score_{}'.format(i)] = trial_scores['mean_test_score']
    print('Run {} best param: '.format(i), gsearch3.best_params_)
    print('Run {} best score: '.format(i), gsearch3.best_score_)

# Sum only the per-trial score columns. Summing the whole frame would include
# the non-numeric 'params' column, which pandas >= 2.0 no longer skips silently.
score_columns = [name for name in grid_score3.columns if name.startswith('mean_test_score_')]
grid_score3['avg'] = grid_score3[score_columns].sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score3.loc[grid_score3.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'gamma': 0.0}
Run 0 best score:  0.8699889439416896
Run 1 best param:  {'gamma': 0.3}
Run 1 best score:  0.8709307563162851
Run 2 best param:  {'gamma': 0.1}
Run 2 best score:  0.8722001556037836
Run 3 best param:  {'gamma': 0.0}
Run 3 best score:  0.8710536014086238
Run 4 best param:  {'gamma': 0.2}
Run 4 best score:  0.8705622210392695
Run 5 best param:  {'gamma': 0.4}
Run 5 best score:  0.8701936857622538
Run 6 best param:  {'gamma': 0.4}
Run 6 best score:  0.8708898079521723
Run 7 best param:  {'gamma': 0.1}
Run 7 best score:  0.8697023053928995
Run 8 best param:  {'gamma': 0.1}
Run 8 best score:  0.8709307563162851
Best params:  params               {'gamma': 0.0}
mean_test_score_0          0.869989
mean_test_score_1          0.870685
mean_test_score_2          0.871299
mean_test_score_3          0.871054
mean_test_score_4          0.870439
mean_test_score_5          0.869375
mean_test_score_6          0.869702
mean_test_score_7           0.86962
me

### Recalibrating n_estimators and first tuning pass over n_estimators

In [31]:
# Re-run the early-stopped CV with min_child_weight=3 and gamma=0.0 to
# re-estimate how many boosting rounds are needed.
recalibration_settings = dict(
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=3,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=-1,
    scale_pos_weight=1,
)
xgb = XGBClassifier(**recalibration_settings)

# Reuse the sklearn wrapper's settings with the native cross-validation API.
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(X_train, label=y_train)

xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=5000,
    nfold=5,
    metrics=['error'],
    early_stopping_rounds=50,
    stratified=True,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.167499,0.008291,0.172721,0.012682
1,0.145449,0.002578,0.148929,0.004688
2,0.147639,0.002788,0.151509,0.005178
3,0.145571,0.000577,0.148438,0.005468
4,0.144855,0.001569,0.148069,0.004862
5,0.144793,0.001276,0.147619,0.005214
6,0.144046,0.001369,0.147292,0.004879
7,0.143708,0.001086,0.146800,0.004268
8,0.143278,0.000960,0.146063,0.004509
9,0.143585,0.001202,0.146186,0.004032


In [32]:
# Repeated grid search over n_estimators (100..900 in steps of 100, plus 241).
# Every trial uses a differently shuffled stratified 5-fold split (random_state=i);
# each candidate's CV accuracy is then averaged over the trials.
# The trial count is ceil(200000 / number of training rows).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
 'n_estimators':list(range(100, 1000, 100))+[241]
}
# Mean CV accuracy per candidate, one column per trial
grid_score8 = pd.DataFrame()

# The base estimator is identical for every trial (GridSearchCV clones it before
# fitting), so it is built once outside the loop.
xgb = XGBClassifier(
    learning_rate =0.1,
    n_estimators=241,
    max_depth=5,
    min_child_weight=3,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    n_jobs=-1,
    scale_pos_weight=1,
    seed=0)

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator = xgb,
                            param_grid = param_test8,
                            scoring='accuracy',n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train,y_train)
    trial_scores = pd.DataFrame(gsearch8.cv_results_, columns=['params', 'mean_test_score'])
    if grid_score8.empty:
        grid_score8 = trial_scores.rename(columns={'mean_test_score': 'mean_test_score_0'})
    else:
        grid_score8['mean_test_score_{}'.format(i)] = trial_scores['mean_test_score']
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Sum only the per-trial score columns. Summing the whole frame would include
# the non-numeric 'params' column, which pandas >= 2.0 no longer skips silently.
score_columns = [name for name in grid_score8.columns if name.startswith('mean_test_score_')]
grid_score8['avg'] = grid_score8[score_columns].sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'n_estimators': 300}
Run 0 best score:  0.8694156668441095
Run 1 best param:  {'n_estimators': 200}
Run 1 best score:  0.8706031694033823
Run 2 best param:  {'n_estimators': 200}
Run 2 best score:  0.8705622210392695
Run 3 best param:  {'n_estimators': 200}
Run 3 best score:  0.8707260144957208
Run 4 best param:  {'n_estimators': 200}
Run 4 best score:  0.8707260144957208
Run 5 best param:  {'n_estimators': 300}
Run 5 best score:  0.869579460300561
Run 6 best param:  {'n_estimators': 241}
Run 6 best score:  0.8701117890340281
Run 7 best param:  {'n_estimators': 200}
Run 7 best score:  0.8704803243110437
Run 8 best param:  {'n_estimators': 100}
Run 8 best score:  0.8699070472134638
Best params:  params               {'n_estimators': 200}
mean_test_score_0                 0.869252
mean_test_score_1                 0.870603
mean_test_score_2                 0.870562
mean_test_score_3                 0.870726
mean_test_score_4                 0.870726
mean_t

In [35]:
# Finer n_estimators sweep (150..230 in steps of 20) around the coarse optimum
# of 200, again with repeated stratified 5-fold grid search.
# NUM_TRIALS is sized so that (trials x training rows) reaches at least 200000.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(150, 250, 20))
}
# One row per candidate; one 'mean_test_score_<trial>' column per trial.
grid_score8 = pd.DataFrame()

# Each trial reshuffles the folds (random_state=i); the model seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=241,  # starting value only; the grid supplies n_estimators
        max_depth=5,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    if grid_score8.empty:
        # First trial: seed the table with the candidate list and its scores.
        grid_score8 = pd.DataFrame(gsearch8.cv_results_, columns=['params', 'mean_test_score'])
        grid_score8.columns = ['params', 'mean_test_score_0']
    else:
        grid_score8['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch8.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts:
# old pandas silently dropped it from a row-wise sum, but since pandas 2.0
# (numeric_only defaults to False) summing the whole frame raises instead.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_').sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'n_estimators': 150}
Run 0 best score:  0.8699889439416896
Run 1 best param:  {'n_estimators': 210}
Run 1 best score:  0.8708898079521723
Run 2 best param:  {'n_estimators': 190}
Run 2 best score:  0.8713811883215266
Run 3 best param:  {'n_estimators': 170}
Run 3 best score:  0.8717087752344294
Run 4 best param:  {'n_estimators': 190}
Run 4 best score:  0.8708488595880595
Run 5 best param:  {'n_estimators': 150}
Run 5 best score:  0.869866098849351
Run 6 best param:  {'n_estimators': 170}
Run 6 best score:  0.8705622210392695
Run 7 best param:  {'n_estimators': 190}
Run 7 best score:  0.8706441177674952
Run 8 best param:  {'n_estimators': 210}
Run 8 best score:  0.8697432537570124
Best params:  params               {'n_estimators': 170}
mean_test_score_0                 0.869989
mean_test_score_1                 0.870276
mean_test_score_2                 0.871217
mean_test_score_3                 0.871709
mean_test_score_4                 0.870562
mean_t

### Tuning the subsample and colsample_bytree

In [36]:
# Joint sweep of subsample and colsample_bytree over 0.6..0.9 (step 0.1),
# using repeated stratified 5-fold grid search. NUM_TRIALS is the value set
# in an earlier cell.
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
# One row per candidate; one 'mean_test_score_<trial>' column per trial.
grid_score4 = pd.DataFrame()

# Each trial reshuffles the folds (random_state=i); the model seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=170,  # best average from the preceding n_estimators search
        max_depth=5,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,         # starting value only; supplied by the grid
        colsample_bytree=0.8,  # starting value only; supplied by the grid
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch4 = GridSearchCV(estimator=xgb,
                            param_grid=param_test4,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch4.fit(X_train, y_train)
    if grid_score4.empty:
        # First trial: seed the table with the candidate list and its scores.
        grid_score4 = pd.DataFrame(gsearch4.cv_results_, columns=['params', 'mean_test_score'])
        grid_score4.columns = ['params', 'mean_test_score_0']
    else:
        grid_score4['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch4.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch4.best_params_)
    print('Run {} best score: '.format(i), gsearch4.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts:
# old pandas silently dropped it from a row-wise sum, but since pandas 2.0
# (numeric_only defaults to False) summing the whole frame raises instead.
grid_score4['avg'] = grid_score4.filter(like='mean_test_score_').sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score4.loc[grid_score4.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'colsample_bytree': 0.6, 'subsample': 0.8}
Run 0 best score:  0.8710945497727366
Run 1 best param:  {'colsample_bytree': 0.7, 'subsample': 0.6}
Run 1 best score:  0.8713811883215266
Run 2 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 2 best score:  0.8718725686908808
Run 3 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 3 best score:  0.8717087752344294
Run 4 best param:  {'colsample_bytree': 0.9, 'subsample': 0.8}
Run 4 best score:  0.870971704680398
Run 5 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 5 best score:  0.8708079112239466
Run 6 best param:  {'colsample_bytree': 0.7, 'subsample': 0.8}
Run 6 best score:  0.8713811883215266
Run 7 best param:  {'colsample_bytree': 0.7, 'subsample': 0.6}
Run 7 best score:  0.8705212726751567
Run 8 best param:  {'colsample_bytree': 0.8, 'subsample': 0.9}
Run 8 best score:  0.8700708406699152
Best params:  params               {'colsample_bytree': 0.7, 'subsample': 0.9}
m

In [37]:
# Refine around the coarse optimum (subsample 0.9, colsample_bytree 0.7) in
# steps of 0.05: subsample in {0.85, 0.90, 0.95}, colsample_bytree in
# {0.65, 0.70, 0.75}. NUM_TRIALS is the value set in an earlier cell.
param_test5 = {
 'subsample':[i/100.0 for i in range(85,100,5)],
 'colsample_bytree':[i/100.0 for i in range(65,80,5)]
}
# One row per candidate; one 'mean_test_score_<trial>' column per trial.
grid_score5 = pd.DataFrame()

# Each trial reshuffles the folds (random_state=i); the model seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=170,
        max_depth=5,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,         # starting value only; supplied by the grid
        colsample_bytree=0.8,  # starting value only; supplied by the grid
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch5 = GridSearchCV(estimator=xgb,
                            param_grid=param_test5,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch5.fit(X_train, y_train)
    if grid_score5.empty:
        # First trial: seed the table with the candidate list and its scores.
        grid_score5 = pd.DataFrame(gsearch5.cv_results_, columns=['params', 'mean_test_score'])
        grid_score5.columns = ['params', 'mean_test_score_0']
    else:
        grid_score5['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch5.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch5.best_params_)
    print('Run {} best score: '.format(i), gsearch5.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts:
# old pandas silently dropped it from a row-wise sum, but since pandas 2.0
# (numeric_only defaults to False) summing the whole frame raises instead.
grid_score5['avg'] = grid_score5.filter(like='mean_test_score_').sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score5.loc[grid_score5.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 0 best score:  0.8705622210392695
Run 1 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 1 best score:  0.8708898079521723
Run 2 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 2 best score:  0.8718725686908808
Run 3 best param:  {'colsample_bytree': 0.75, 'subsample': 0.9}
Run 3 best score:  0.8713811883215266
Run 4 best param:  {'colsample_bytree': 0.75, 'subsample': 0.95}
Run 4 best score:  0.8708488595880595
Run 5 best param:  {'colsample_bytree': 0.75, 'subsample': 0.95}
Run 5 best score:  0.8713402399574137
Run 6 best param:  {'colsample_bytree': 0.75, 'subsample': 0.85}
Run 6 best score:  0.8713402399574137
Run 7 best param:  {'colsample_bytree': 0.7, 'subsample': 0.95}
Run 7 best score:  0.8710126530445109
Run 8 best param:  {'colsample_bytree': 0.75, 'subsample': 0.9}
Run 8 best score:  0.8701117890340281
Best params:  params               {'colsample_bytree': 0.7, 'subsampl

### Tuning Regularization Parameters

In [39]:
# Coarse, roughly logarithmic sweep of the L1 regularisation term reg_alpha,
# using repeated stratified 5-fold grid search. NUM_TRIALS is the value set
# in an earlier cell.
param_test6 = {
 'reg_alpha':[0, 1e-5, 1e-2, 0.1, 1, 100]
}
# One row per candidate; one 'mean_test_score_<trial>' column per trial.
grid_score6 = pd.DataFrame()

# Each trial reshuffles the folds (random_state=i); the model seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=170,
        max_depth=5,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.9,         # best average from the subsample/colsample search
        colsample_bytree=0.7,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch6 = GridSearchCV(estimator=xgb,
                            param_grid=param_test6,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch6.fit(X_train, y_train)
    if grid_score6.empty:
        # First trial: seed the table with the candidate list and its scores.
        grid_score6 = pd.DataFrame(gsearch6.cv_results_, columns=['params', 'mean_test_score'])
        grid_score6.columns = ['params', 'mean_test_score_0']
    else:
        grid_score6['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch6.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch6.best_params_)
    print('Run {} best score: '.format(i), gsearch6.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts:
# old pandas silently dropped it from a row-wise sum, but since pandas 2.0
# (numeric_only defaults to False) summing the whole frame raises instead.
grid_score6['avg'] = grid_score6.filter(like='mean_test_score_').sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score6.loc[grid_score6.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'reg_alpha': 0}
Run 0 best score:  0.8705622210392695
Run 1 best param:  {'reg_alpha': 0}
Run 1 best score:  0.8708898079521723
Run 2 best param:  {'reg_alpha': 0.01}
Run 2 best score:  0.8722001556037836
Run 3 best param:  {'reg_alpha': 0.1}
Run 3 best score:  0.871258343229188
Run 4 best param:  {'reg_alpha': 1}
Run 4 best score:  0.8706441177674952
Run 5 best param:  {'reg_alpha': 0}
Run 5 best score:  0.8708079112239466
Run 6 best param:  {'reg_alpha': 0.01}
Run 6 best score:  0.8707669628598338
Run 7 best param:  {'reg_alpha': 1}
Run 7 best score:  0.8708079112239466
Run 8 best param:  {'reg_alpha': 0.01}
Run 8 best score:  0.8697432537570124
Best params:  params               {'reg_alpha': 0.01}
mean_test_score_0               0.870521
mean_test_score_1               0.869907
mean_test_score_2                 0.8722
mean_test_score_3               0.871135
mean_test_score_4               0.870521
mean_test_score_5               0.870317
mean_test_s

In [40]:
# Finer reg_alpha sweep between 1e-4 and 1e-2 (the coarse search favoured
# 0.01), using repeated stratified 5-fold grid search. NUM_TRIALS is the
# value set in an earlier cell.
param_test7 = {
 'reg_alpha':[1e-4, 5e-4, 1e-3, 5e-3, 1e-2]
}
# One row per candidate; one 'mean_test_score_<trial>' column per trial.
grid_score7 = pd.DataFrame()

# Each trial reshuffles the folds (random_state=i); the model seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=170,
        max_depth=5,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.9,
        colsample_bytree=0.7,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch7 = GridSearchCV(estimator=xgb,
                            param_grid=param_test7,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch7.fit(X_train, y_train)
    if grid_score7.empty:
        # First trial: seed the table with the candidate list and its scores.
        grid_score7 = pd.DataFrame(gsearch7.cv_results_, columns=['params', 'mean_test_score'])
        grid_score7.columns = ['params', 'mean_test_score_0']
    else:
        grid_score7['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch7.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch7.best_params_)
    print('Run {} best score: '.format(i), gsearch7.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts:
# old pandas silently dropped it from a row-wise sum, but since pandas 2.0
# (numeric_only defaults to False) summing the whole frame raises instead.
grid_score7['avg'] = grid_score7.filter(like='mean_test_score_').sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score7.loc[grid_score7.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'reg_alpha': 0.0005}
Run 0 best score:  0.8707260144957208
Run 1 best param:  {'reg_alpha': 0.0001}
Run 1 best score:  0.8708898079521723
Run 2 best param:  {'reg_alpha': 0.001}
Run 2 best score:  0.8722001556037836
Run 3 best param:  {'reg_alpha': 0.005}
Run 3 best score:  0.8715040334138651
Run 4 best param:  {'reg_alpha': 0.01}
Run 4 best score:  0.8705212726751567
Run 5 best param:  {'reg_alpha': 0.0001}
Run 5 best score:  0.8708079112239466
Run 6 best param:  {'reg_alpha': 0.005}
Run 6 best score:  0.8708488595880595
Run 7 best param:  {'reg_alpha': 0.005}
Run 7 best score:  0.8710945497727366
Run 8 best param:  {'reg_alpha': 0.005}
Run 8 best score:  0.869866098849351
Best params:  params               {'reg_alpha': 0.01}
mean_test_score_0               0.870521
mean_test_score_1               0.869907
mean_test_score_2                 0.8722
mean_test_score_3               0.871135
mean_test_score_4               0.870521
mean_test_score_5        

### Reduce the learning rate and tune n_estimators

In [42]:
# Repeat the early-stopped cross-validation with the tuned hyper-parameters
# at a learning rate of 0.01, to see how many boosting rounds that rate needs.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=3,
    gamma=0.0,
    reg_alpha=0.01,
    subsample=0.9,
    colsample_bytree=0.7,
    scale_pos_weight=1,
    n_jobs=-1,
    seed=0,
)

# Hand the sklearn-wrapper settings to the native xgboost API.
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(data=X_train, label=y_train)

# Final expression of the cell, so the per-round CV error table is displayed.
xgboost.cv(
    params=xgb_param,
    dtrain=xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.167683,0.008088,0.171861,0.012468
1,0.146257,0.002849,0.149503,0.003553
2,0.150465,0.003592,0.153065,0.007127
3,0.146913,0.000984,0.150772,0.004352
4,0.147998,0.003822,0.151468,0.007145
5,0.147957,0.003849,0.151550,0.007860
6,0.147905,0.003402,0.150649,0.005940
7,0.147660,0.003596,0.150281,0.003787
8,0.148090,0.003136,0.150608,0.004905
9,0.147557,0.002717,0.149912,0.003202


In [44]:
# n_estimators sweep at learning_rate=0.01 using repeated stratified 5-fold
# grid search.
# NUM_TRIALS is sized so that (trials x training rows) reaches at least 200000.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    # 100..1500 in steps of 100, plus 164 (presumably the early-stopped round
    # count from the xgboost.cv run above -- TODO confirm).
    'n_estimators': list(range(100, 1501, 100)) + [164]
}
# One row per candidate; one 'mean_test_score_<trial>' column per trial.
grid_score8 = pd.DataFrame()

# Each trial reshuffles the folds (random_state=i); the model seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.01,
        n_estimators=5000,  # starting value only; the grid supplies n_estimators
        max_depth=5,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.9,
        colsample_bytree=0.7,
        objective='binary:logistic',
        n_jobs=-1,
        reg_alpha=0.01,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    if grid_score8.empty:
        # First trial: seed the table with the candidate list and its scores.
        grid_score8 = pd.DataFrame(gsearch8.cv_results_, columns=['params', 'mean_test_score'])
        grid_score8.columns = ['params', 'mean_test_score_0']
    else:
        grid_score8['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch8.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts:
# old pandas silently dropped it from a row-wise sum, but since pandas 2.0
# (numeric_only defaults to False) summing the whole frame raises instead.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_').sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'n_estimators': 1400}
Run 0 best score:  0.8702755824904795
Run 1 best param:  {'n_estimators': 1500}
Run 1 best score:  0.8700708406699152
Run 2 best param:  {'n_estimators': 1500}
Run 2 best score:  0.870398427582818
Run 3 best param:  {'n_estimators': 1500}
Run 3 best score:  0.8717497235985422
Run 4 best param:  {'n_estimators': 1500}
Run 4 best score:  0.869866098849351
Run 5 best param:  {'n_estimators': 1500}
Run 5 best score:  0.8704393759469309
Run 6 best param:  {'n_estimators': 1400}
Run 6 best score:  0.8701527373981409
Run 7 best param:  {'n_estimators': 1500}
Run 7 best score:  0.8708079112239466
Run 8 best param:  {'n_estimators': 1300}
Run 8 best score:  0.8704393759469309
Best params:  params               {'n_estimators': 1500}
mean_test_score_0                  0.870071
mean_test_score_1                  0.870071
mean_test_score_2                  0.870398
mean_test_score_3                   0.87175
mean_test_score_4                  0

In [46]:
# The previous sweep peaked at its upper bound (1500), so extend the
# n_estimators search to 1500..2500 in steps of 100 at learning_rate=0.01.
# NUM_TRIALS is sized so that (trials x training rows) reaches at least 200000.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test8 = {
    'n_estimators': list(range(1500, 2501, 100))
}
# One row per candidate; one 'mean_test_score_<trial>' column per trial.
grid_score8 = pd.DataFrame()

# Each trial reshuffles the folds (random_state=i); the model seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.01,
        n_estimators=5000,  # starting value only; the grid supplies n_estimators
        max_depth=5,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.9,
        colsample_bytree=0.7,
        objective='binary:logistic',
        n_jobs=-1,
        reg_alpha=0.01,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    if grid_score8.empty:
        # First trial: seed the table with the candidate list and its scores.
        grid_score8 = pd.DataFrame(gsearch8.cv_results_, columns=['params', 'mean_test_score'])
        grid_score8.columns = ['params', 'mean_test_score_0']
    else:
        grid_score8['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch8.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average only the per-trial score columns. The 'params' column holds dicts:
# old pandas silently dropped it from a row-wise sum, but since pandas 2.0
# (numeric_only defaults to False) summing the whole frame raises instead.
grid_score8['avg'] = grid_score8.filter(like='mean_test_score_').sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 9 times
Run 0 best param:  {'n_estimators': 2400}
Run 0 best score:  0.8705212726751567
Run 1 best param:  {'n_estimators': 2400}
Run 1 best score:  0.8705622210392695
Run 2 best param:  {'n_estimators': 2200}
Run 2 best score:  0.8713811883215266
Run 3 best param:  {'n_estimators': 1500}
Run 3 best score:  0.8717497235985422
Run 4 best param:  {'n_estimators': 2100}
Run 4 best score:  0.8704803243110437
Run 5 best param:  {'n_estimators': 2300}
Run 5 best score:  0.870971704680398
Run 6 best param:  {'n_estimators': 2000}
Run 6 best score:  0.8715040334138651
Run 7 best param:  {'n_estimators': 2200}
Run 7 best score:  0.8715040334138651
Run 8 best param:  {'n_estimators': 1900}
Run 8 best score:  0.8705622210392695
Best params:  params               {'n_estimators': 2200}
mean_test_score_0                  0.870194
mean_test_score_1                  0.870194
mean_test_score_2                  0.871381
mean_test_score_3                  0.871095
mean_test_score_4                  

## Part 3: Test on test set

In [48]:
# Final evaluation: fit the tuned model on the full training set NUM_TRIALS
# times, changing only the model seed, and score each fit on the test set.
accuracy_array = list()
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.01,
        n_estimators=2200,
        max_depth=5,
        min_child_weight=3,
        gamma=0.0,
        reg_alpha=0.01,
        subsample=0.9,
        colsample_bytree=0.7,
        scale_pos_weight=1,
        n_jobs=-1,
        seed=i,  # the trial index doubles as the model seed
    )
    model = xgb.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Share of test rows whose predicted label matches the true one.
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_array.append(accuracy)
    print('Accuracy {}: %.2f%%'.format(i) % (accuracy * 100.0))

# Plain arithmetic mean over the NUM_TRIALS runs.
mean_accuracy_score = sum(accuracy_array) / NUM_TRIALS
print('Average accuracy is: %.2f%%' % (mean_accuracy_score * 100.0))

Accuracy 0: 87.58%
Accuracy 1: 87.54%
Accuracy 2: 87.56%
Accuracy 3: 87.54%
Accuracy 4: 87.56%
Accuracy 5: 87.61%
Accuracy 6: 87.59%
Accuracy 7: 87.54%
Accuracy 8: 87.55%
Average accuracy is: 87.56%
