# Bank additional dataset

## Part 1: Data encoding

### Import library

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import xgboost
import copy
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [7]:
# Work from the project root so the relative "data/..." paths used when loading
# the CSV files resolve. Set the BANK_PROJECT_ROOT environment variable to point
# at a different checkout; the fallback is the original hard-coded location.
os.chdir(os.environ.get('BANK_PROJECT_ROOT', '/home/tai/Projects/research-project-Roland'))

### Load train and test data

In [8]:
# Column names are supplied explicitly, so pandas treats the first row of the
# file as data. '?' entries are read as missing values.
train_column_names = [
    'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx',
    'euribor3m', 'nr_employed', 'subscribe',
]
train = pd.read_csv(
    "data/bank_additional/bank_additional.0.train",
    encoding='latin1',
    names=train_column_names,
    na_values='?',
    low_memory=False,
)

In [9]:
# Same layout as the training file: explicit column names (first row is data)
# and '?' read as a missing value.
test_column_names = [
    'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx',
    'euribor3m', 'nr_employed', 'subscribe',
]
test = pd.read_csv(
    "data/bank_additional/bank_additional.0.test",
    encoding='latin1',
    names=test_column_names,
    na_values='?',
    low_memory=False,
)

In [10]:
# Widen the pandas display limits so long/wide frames are shown in full.
for option_name, option_value in (
    ('display.max_rows', 2000),
    ('display.max_columns', 2000),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

### Convert the output to binary

In [11]:
# Replace the textual target with a 0/1 flag (1 when subscribe == 'yes').
# The column name 'has_subcribe' (sic) is kept because later cells refer to it.
train = (
    train
    .assign(has_subcribe=np.where(train['subscribe'] == 'yes', 1, 0))
    .drop(columns=['subscribe'])
)

In [12]:
# Replace the textual target with a 0/1 flag (1 when subscribe == 'yes').
# The column name 'has_subcribe' (sic) is kept because later cells refer to it.
test = (
    test
    .assign(has_subcribe=np.where(test['subscribe'] == 'yes', 1, 0))
    .drop(columns=['subscribe'])
)

### Convert numeric columns

In [13]:
# Re-cast the numeric columns to the smallest integer/float dtype that holds
# their values; entries that cannot be parsed become NaN (errors='coerce').
#
# Assign with train[col] = ... rather than train.loc[:, col] = ...: since
# pandas 2.0 the .loc[:, col] form writes into the existing column in place and
# keeps its previous dtype, which would silently discard the downcast.
train_numeric_downcasts = {
    'age': 'integer',
    'duration': 'integer',
    'campaign': 'integer',
    'pdays': 'integer',
    'previous': 'integer',
    'emp_var_rate': 'float',
    'cons_price_idx': 'float',
    'cons_conf_idx': 'float',
    'euribor3m': 'float',
    'nr_employed': 'float',
}
for column_name, downcast_kind in train_numeric_downcasts.items():
    train[column_name] = pd.to_numeric(
        train[column_name], downcast=downcast_kind, errors='coerce'
    )

In [14]:
# Re-cast the numeric columns to the smallest integer/float dtype that holds
# their values; entries that cannot be parsed become NaN (errors='coerce').
#
# Assign with test[col] = ... rather than test.loc[:, col] = ...: since
# pandas 2.0 the .loc[:, col] form writes into the existing column in place and
# keeps its previous dtype, which would silently discard the downcast.
test_numeric_downcasts = {
    'age': 'integer',
    'duration': 'integer',
    'campaign': 'integer',
    'pdays': 'integer',
    'previous': 'integer',
    'emp_var_rate': 'float',
    'cons_price_idx': 'float',
    'cons_conf_idx': 'float',
    'euribor3m': 'float',
    'nr_employed': 'float',
}
for column_name, downcast_kind in test_numeric_downcasts.items():
    test[column_name] = pd.to_numeric(
        test[column_name], downcast=downcast_kind, errors='coerce'
    )

In [15]:
# Per-column count of missing values in the training frame.
train.isna().sum()

age                  0
job                177
marital             37
education          874
default           4313
housing            498
loan               498
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp_var_rate         0
cons_price_idx       0
cons_conf_idx        0
euribor3m            0
nr_employed          0
has_subcribe         0
dtype: int64

In [16]:
# Per-column count of missing values in the test frame.
test.isna().sum()

age                  0
job                153
marital             43
education          857
default           4284
housing            492
loan               492
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp_var_rate         0
cons_price_idx       0
cons_conf_idx        0
euribor3m            0
nr_employed          0
has_subcribe         0
dtype: int64

In [17]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20594 entries, 0 to 20593
Data columns (total 21 columns):
age               20594 non-null int8
job               20417 non-null object
marital           20557 non-null object
education         19720 non-null object
default           16281 non-null object
housing           20096 non-null object
loan              20096 non-null object
contact           20594 non-null object
month             20594 non-null object
day_of_week       20594 non-null object
duration          20594 non-null int16
campaign          20594 non-null int8
pdays             20594 non-null int16
previous          20594 non-null int8
poutcome          20594 non-null object
emp_var_rate      20594 non-null float32
cons_price_idx    20594 non-null float32
cons_conf_idx     20594 non-null float32
euribor3m         20594 non-null float32
nr_employed       20594 non-null float32
has_subcribe      20594 non-null int64
dtypes: float32(5), int16(2), int64(1), int8(3), object

### One hot encoding

In [18]:
# Declare the full list of levels for every categorical column so that the
# one-hot encoding below always produces the same indicator columns, even when a
# level does not occur in this file. Values outside the declared levels become NaN.
#
# Series.astype('category', categories=...) is deprecated (it emits a
# FutureWarning) and was removed in later pandas releases; an explicit
# CategoricalDtype is the supported spelling and yields the same unordered
# categorical.
train_category_levels = {
    # NOTE(review): the public UCI bank-marketing data spells the first job level
    # 'admin.' (trailing period) - confirm these files use 'admin', otherwise
    # those rows silently become NaN.
    'job': [
        'admin',
        'blue-collar',
        'entrepreneur',
        'housemaid',
        'management',
        'retired',
        'self-employed',
        'services',
        'student',
        'technician',
        'unemployed',
    ],
    'marital': ['divorced', 'married', 'single'],
    'education': [
        'basic.4y',
        'basic.6y',
        'basic.9y',
        'high.school',
        'illiterate',
        'professional.course',
        'university.degree',
    ],
    'default': ['no', 'yes'],
    'housing': ['no', 'yes'],
    'loan': ['no', 'yes'],
    'contact': ['cellular', 'telephone'],
    'month': [
        'jan', 'feb', 'mar',
        'apr', 'may', 'jun',
        'jul', 'aug', 'sep',
        'oct', 'nov', 'dec',
    ],
    'day_of_week': ['mon', 'tue', 'wed', 'thu', 'fri'],
    'poutcome': ['failure', 'nonexistent', 'success'],
}
for column_name, levels in train_category_levels.items():
    train[column_name] = train[column_name].astype(
        pd.api.types.CategoricalDtype(categories=levels)
    )
                                                   

  exec(code_obj, self.user_global_ns, self.user_ns)


In [19]:
# One-hot encode in two passes. The first group is encoded with plain indicator
# columns; the second group additionally gets a <column>_nan indicator
# (dummy_na=True) so missing entries are represented explicitly.
train_plain_columns = ['contact', 'month', 'day_of_week', 'poutcome']
train_nan_aware_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan']

train = pd.get_dummies(train, columns=train_plain_columns)
train = pd.get_dummies(train, columns=train_nan_aware_columns, dummy_na=True)

In [20]:
# Declare the full list of levels for every categorical column so that the
# one-hot encoding below always produces the same indicator columns, even when a
# level does not occur in this file. Values outside the declared levels become NaN.
#
# Series.astype('category', categories=...) is deprecated (it emits a
# FutureWarning) and was removed in later pandas releases; an explicit
# CategoricalDtype is the supported spelling and yields the same unordered
# categorical.
test_category_levels = {
    # NOTE(review): the public UCI bank-marketing data spells the first job level
    # 'admin.' (trailing period) - confirm these files use 'admin', otherwise
    # those rows silently become NaN.
    'job': [
        'admin',
        'blue-collar',
        'entrepreneur',
        'housemaid',
        'management',
        'retired',
        'self-employed',
        'services',
        'student',
        'technician',
        'unemployed',
    ],
    'marital': ['divorced', 'married', 'single'],
    'education': [
        'basic.4y',
        'basic.6y',
        'basic.9y',
        'high.school',
        'illiterate',
        'professional.course',
        'university.degree',
    ],
    'default': ['no', 'yes'],
    'housing': ['no', 'yes'],
    'loan': ['no', 'yes'],
    'contact': ['cellular', 'telephone'],
    'month': [
        'jan', 'feb', 'mar',
        'apr', 'may', 'jun',
        'jul', 'aug', 'sep',
        'oct', 'nov', 'dec',
    ],
    'day_of_week': ['mon', 'tue', 'wed', 'thu', 'fri'],
    'poutcome': ['failure', 'nonexistent', 'success'],
}
for column_name, levels in test_category_levels.items():
    test[column_name] = test[column_name].astype(
        pd.api.types.CategoricalDtype(categories=levels)
    )
                                                   

In [21]:
# One-hot encode in two passes. The first group is encoded with plain indicator
# columns; the second group additionally gets a <column>_nan indicator
# (dummy_na=True) so missing entries are represented explicitly.
test_plain_columns = ['contact', 'month', 'day_of_week', 'poutcome']
test_nan_aware_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan']

test = pd.get_dummies(test, columns=test_plain_columns)
test = pd.get_dummies(test, columns=test_nan_aware_columns, dummy_na=True)

In [22]:
# Separate the binary target from the feature matrix for both splits.
y_train = train['has_subcribe']
X_train = train.drop(columns=['has_subcribe'])

y_test = test['has_subcribe']
X_test = test.drop(columns=['has_subcribe'])

In [23]:
# X_train, X_test = X_train.align(X_test, join='outer', fill_value=0, axis=1)

In [24]:
# (rows, columns) of the training feature matrix.
X_train.shape

(20594, 65)

In [25]:
# (rows, columns) of the test feature matrix.
X_test.shape

(20594, 65)

In [26]:
# Feature names of the training matrix after one-hot encoding.
X_train.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'contact_cellular', 'contact_telephone', 'month_jan', 'month_feb', 'month_mar', 'month_apr', 'month_may', 'month_jun', 'month_jul', 'month_aug', 'month_sep', 'month_oct', 'month_nov', 'month_dec', 'day_of_week_mon', 'day_of_week_tue', 'day_of_week_wed', 'day_of_week_thu', 'day_of_week_fri', 'poutcome_failure', 'poutcome_nonexistent', 'poutcome_success', 'job_admin', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_nan', 'marital_divorced', 'marital_married', 'marital_single', 'marital_nan', 'education_basic.4y', 'education_basic.6y', 'education_basic.9y', 'education_high.school', 'education_illiterate', 'education_professional.course', 'education_university.degree', 'education_nan', 'default_no', 'default_yes',
       '

In [27]:
# Feature names of the test matrix after one-hot encoding.
X_test.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'contact_cellular', 'contact_telephone', 'month_jan', 'month_feb', 'month_mar', 'month_apr', 'month_may', 'month_jun', 'month_jul', 'month_aug', 'month_sep', 'month_oct', 'month_nov', 'month_dec', 'day_of_week_mon', 'day_of_week_tue', 'day_of_week_wed', 'day_of_week_thu', 'day_of_week_fri', 'poutcome_failure', 'poutcome_nonexistent', 'poutcome_success', 'job_admin', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_nan', 'marital_divorced', 'marital_married', 'marital_single', 'marital_nan', 'education_basic.4y', 'education_basic.6y', 'education_basic.9y', 'education_high.school', 'education_illiterate', 'education_professional.course', 'education_university.degree', 'education_nan', 'default_no', 'default_yes',
       '

## Part 2: Tuning on train data

### Find optimal n_estimators

In [33]:
# Baseline booster settings. The estimator is built only to obtain its native
# xgboost parameter dict via get_xgb_params().
initial_settings = dict(
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=-1,
)
xgb = XGBClassifier(**initial_settings)
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(X_train, label=y_train)

# Stratified 5-fold CV on classification error, allowing up to 5000 boosting
# rounds with early_stopping_rounds=50. The returned per-round history is the
# cell's displayed output.
xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.081832,0.000812,0.087793,0.004095
1,0.081213,0.003509,0.089346,0.003929
2,0.080412,0.002331,0.087501,0.003577
3,0.080776,0.002076,0.088472,0.002506
4,0.080108,0.002407,0.08789,0.003545
5,0.079234,0.002656,0.086724,0.002608
6,0.080363,0.003373,0.088327,0.003227
7,0.078785,0.002858,0.088375,0.003643
8,0.078554,0.002753,0.087307,0.003098
9,0.077583,0.002305,0.087113,0.002578


### Tuning max_depth and min_child_weight

In [34]:
# Coarse grid search over max_depth / min_child_weight, repeated NUM_TRIALS times.
# Every trial reshuffles the stratified 5-fold split (random_state=i) so each
# candidate's accuracy can be averaged over several different partitions.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1 = {
 'max_depth':range(1,10,2),
 'min_child_weight':range(1,200,40)
}
# One row per parameter combination; one mean_test_score_<i> column per trial.
grid_score1 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    # NOTE(review): n_jobs=8 here combined with n_jobs=-1 in GridSearchCV may
    # oversubscribe the CPU - confirm this is intended.
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=119,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1 = GridSearchCV(estimator=xgb,
                            param_grid=param_test1,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch1.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch1.cv_results_)
    if grid_score1.empty:
        # First trial: start the table with the candidate parameter dicts.
        grid_score1 = trial_results[['params']].copy()
    grid_score1['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch1.best_params_)
    print('Run {} best score: '.format(i), gsearch1.best_score_)

# Average only the numeric per-trial score columns. Reducing the whole frame
# would also cover the object-dtype 'params' column, which current pandas refuses
# to sum (TypeError) instead of silently skipping it.
score_columns = [c for c in grid_score1.columns if c.startswith('mean_test_score_')]
grid_score1['avg'] = grid_score1[score_columns].mean(axis=1)
print('Best params: ', grid_score1.loc[grid_score1.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'max_depth': 3, 'min_child_weight': 1}
Run 0 best score:  0.916577643973973
Run 1 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 1 best score:  0.9154608138292707
Run 2 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 2 best score:  0.9164805283092162
Run 3 best param:  {'max_depth': 3, 'min_child_weight': 1}
Run 3 best score:  0.9161406234825678
Run 4 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 4 best score:  0.9166262018063513
Run 5 best param:  {'max_depth': 5, 'min_child_weight': 81}
Run 5 best score:  0.9155579294940274
Run 6 best param:  {'max_depth': 9, 'min_child_weight': 41}
Run 6 best score:  0.915897834320676
Run 7 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 7 best score:  0.9157521608235408
Run 8 best param:  {'max_depth': 3, 'min_child_weight': 1}
Run 8 best score:  0.9171603379625134
Run 9 best param:  {'max_depth': 5, 'min_child_weight': 1}
Run 9 best score:  0.9159463921530543
Best params:  par

In [35]:
# Second pass over max_depth / min_child_weight with min_child_weight restricted
# to small values (1..9), repeated NUM_TRIALS times with a reshuffled
# stratified 5-fold split per trial (random_state=i).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test1b = {
 'max_depth':range(1,10,2),
 'min_child_weight':range(1, 10, 2)
}
# One row per parameter combination; one mean_test_score_<i> column per trial.
grid_score1b = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=119,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch1b = GridSearchCV(estimator=xgb,
                             param_grid=param_test1b,
                             scoring='accuracy', n_jobs=-1,
                             cv=five_folds,
                             return_train_score=False)
    gsearch1b.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch1b.cv_results_)
    if grid_score1b.empty:
        # First trial: start the table with the candidate parameter dicts.
        grid_score1b = trial_results[['params']].copy()
    grid_score1b['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch1b.best_params_)
    print('Run {} best score: '.format(i), gsearch1b.best_score_)

# Average only the numeric per-trial score columns. Reducing the whole frame
# would also cover the object-dtype 'params' column, which current pandas refuses
# to sum (TypeError) instead of silently skipping it.
score_columns = [c for c in grid_score1b.columns if c.startswith('mean_test_score_')]
grid_score1b['avg'] = grid_score1b[score_columns].mean(axis=1)
print('Best params: ', grid_score1b.loc[grid_score1b.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'max_depth': 5, 'min_child_weight': 3}
Run 0 best score:  0.9175002427891619


Exception in thread QueueManagerThread:
Traceback (most recent call last):
  File "/home/tai/.conda/envs/research/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/home/tai/.conda/envs/research/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tai/.conda/envs/research/lib/python3.7/site-packages/sklearn/externals/joblib/externals/loky/process_executor.py", line 674, in _queue_management_worker
    recursive_terminate(p)
  File "/home/tai/.conda/envs/research/lib/python3.7/site-packages/sklearn/externals/joblib/externals/loky/backend/utils.py", line 28, in recursive_terminate
    _recursive_terminate_without_psutil(process)
  File "/home/tai/.conda/envs/research/lib/python3.7/site-packages/sklearn/externals/joblib/externals/loky/backend/utils.py", line 53, in _recursive_terminate_without_psutil
    _recursive_terminate(process.pid)
  File "/home/tai/.conda/envs/research/lib/python3.7/site-packages/s

KeyboardInterrupt: 

In [52]:
# Take a closer look at neighbouring values: a fine 3x3 grid over
# max_depth in {4, 5, 6} and min_child_weight in {2, 3, 4}, repeated NUM_TRIALS
# times with a reshuffled stratified 5-fold split per trial (random_state=i).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test2 = {
 'max_depth':[4, 5, 6],
 'min_child_weight':[2, 3, 4]
}
# One row per parameter combination; one mean_test_score_<i> column per trial.
grid_score2 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=119,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch2 = GridSearchCV(estimator=xgb,
                            param_grid=param_test2,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch2.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch2.cv_results_)
    if grid_score2.empty:
        # First trial: start the table with the candidate parameter dicts.
        grid_score2 = trial_results[['params']].copy()
    grid_score2['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch2.best_params_)
    print('Run {} best score: '.format(i), gsearch2.best_score_)

# Average only the numeric per-trial score columns. Reducing the whole frame
# would also cover the object-dtype 'params' column, which current pandas refuses
# to sum (TypeError) instead of silently skipping it.
score_columns = [c for c in grid_score2.columns if c.startswith('mean_test_score_')]
grid_score2['avg'] = grid_score2[score_columns].mean(axis=1)
print('Best params: ', grid_score2.loc[grid_score2.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'max_depth': 6, 'min_child_weight': 2}
Run 0 best score:  0.9180343789453239
Run 1 best param:  {'max_depth': 4, 'min_child_weight': 3}
Run 1 best score:  0.9174516849567835
Run 2 best param:  {'max_depth': 5, 'min_child_weight': 3}
Run 2 best score:  0.916577643973973
Run 3 best param:  {'max_depth': 5, 'min_child_weight': 2}
Run 3 best score:  0.9173545692920269
Run 4 best param:  {'max_depth': 4, 'min_child_weight': 3}
Run 4 best score:  0.9170146644653783
Run 5 best param:  {'max_depth': 4, 'min_child_weight': 2}
Run 5 best score:  0.9155579294940274
Run 6 best param:  {'max_depth': 4, 'min_child_weight': 3}
Run 6 best score:  0.9174031271244052
Run 7 best param:  {'max_depth': 4, 'min_child_weight': 2}
Run 7 best score:  0.9164319704768379
Run 8 best param:  {'max_depth': 5, 'min_child_weight': 3}
Run 8 best score:  0.9175002427891619
Run 9 best param:  {'max_depth': 4, 'min_child_weight': 4}
Run 9 best score:  0.9166747596387297
Best params:  para

### Tuning gamma

In [59]:
# Grid search over gamma in {0.0, 0.1, 0.2, 0.3, 0.4} with max_depth=4 and
# min_child_weight=3 fixed, repeated NUM_TRIALS times with a reshuffled
# stratified 5-fold split per trial (random_state=i).
# NUM_TRIALS is computed here (same formula as the other tuning cells) so the
# cell does not depend on another cell having been run first.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
# One row per gamma value; one mean_test_score_<i> column per trial.
grid_score3 = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=119,
        max_depth=4,
        min_child_weight=3,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch3 = GridSearchCV(estimator=xgb,
                            param_grid=param_test3,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch3.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch3.cv_results_)
    if grid_score3.empty:
        # First trial: start the table with the candidate parameter dicts.
        grid_score3 = trial_results[['params']].copy()
    grid_score3['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch3.best_params_)
    print('Run {} best score: '.format(i), gsearch3.best_score_)

# Average only the numeric per-trial score columns. Reducing the whole frame
# would also cover the object-dtype 'params' column, which current pandas refuses
# to sum (TypeError) instead of silently skipping it.
score_columns = [c for c in grid_score3.columns if c.startswith('mean_test_score_')]
grid_score3['avg'] = grid_score3[score_columns].mean(axis=1)
print('Best params: ', grid_score3.loc[grid_score3.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'gamma': 0.0}
Run 0 best score:  0.9174516849567835
Run 1 best param:  {'gamma': 0.0}
Run 1 best score:  0.9174516849567835
Run 2 best param:  {'gamma': 0.1}
Run 2 best score:  0.9167233174711081
Run 3 best param:  {'gamma': 0.1}
Run 3 best score:  0.9172574536272701
Run 4 best param:  {'gamma': 0.0}
Run 4 best score:  0.9170146644653783
Run 5 best param:  {'gamma': 0.1}
Run 5 best score:  0.9157036029911625
Run 6 best param:  {'gamma': 0.0}
Run 6 best score:  0.9174031271244052
Run 7 best param:  {'gamma': 0.0}
Run 7 best score:  0.9159949499854326
Run 8 best param:  {'gamma': 0.2}
Run 8 best score:  0.9166747596387297
Run 9 best param:  {'gamma': 0.4}
Run 9 best score:  0.9164805283092162
Best params:  params               {'gamma': 0.0}
mean_test_score_0          0.917452
mean_test_score_1          0.917452
mean_test_score_2          0.916238
mean_test_score_3          0.916626
mean_test_score_4          0.917015
mean_test_score_5          0.915267
m

### Recalibrating the n_estimators

In [30]:
# Booster settings with max_depth=4, min_child_weight=3 and gamma=0.0. The
# estimator is built only to obtain its native xgboost parameter dict via
# get_xgb_params().
recalibration_settings = dict(
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=4,
    min_child_weight=3,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=-1,
    scale_pos_weight=1,
)
xgb = XGBClassifier(**recalibration_settings)
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(X_train, label=y_train)

# Stratified 5-fold CV on classification error, allowing up to 5000 boosting
# rounds with early_stopping_rounds=50. The returned per-round history is the
# cell's displayed output.
xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.086421,0.001372,0.090949,0.003695
1,0.087623,0.004168,0.092066,0.00446
2,0.085984,0.002774,0.090075,0.003458
3,0.085984,0.002393,0.089103,0.003275
4,0.085207,0.002213,0.089444,0.003296
5,0.085377,0.0036,0.090075,0.002768
6,0.086263,0.002905,0.090318,0.00273
7,0.084794,0.003336,0.090706,0.002615
8,0.084272,0.003305,0.089638,0.002165
9,0.083689,0.002393,0.08988,0.001738


In [29]:
# Coarse search over n_estimators (100..900 in steps of 100, plus 87) with the
# other booster settings fixed, repeated NUM_TRIALS times with a reshuffled
# stratified 5-fold split per trial (random_state=i).
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test_recalib = {
 'n_estimators': list(range(100, 1000, 100)) + [87]
}
# One row per n_estimators value; one mean_test_score_<i> column per trial.
grid_score_recalib = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=87,
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch_calib = GridSearchCV(estimator=xgb,
                                 param_grid=param_test_recalib,
                                 scoring='accuracy', n_jobs=-1,
                                 cv=five_folds,
                                 return_train_score=False)
    gsearch_calib.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch_calib.cv_results_)
    if grid_score_recalib.empty:
        # First trial: start the table with the candidate parameter dicts.
        grid_score_recalib = trial_results[['params']].copy()
    grid_score_recalib['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch_calib.best_params_)
    print('Run {} best score: '.format(i), gsearch_calib.best_score_)

# Average only the numeric per-trial score columns. Reducing the whole frame
# would also cover the object-dtype 'params' column, which current pandas refuses
# to sum (TypeError) instead of silently skipping it.
score_columns = [c for c in grid_score_recalib.columns if c.startswith('mean_test_score_')]
grid_score_recalib['avg'] = grid_score_recalib[score_columns].mean(axis=1)
print('Best params: ', grid_score_recalib.loc[grid_score_recalib.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'n_estimators': 100}
Run 0 best score:  0.9175973584539187
Run 1 best param:  {'n_estimators': 87}
Run 1 best score:  0.9170632222977566
Run 2 best param:  {'n_estimators': 87}
Run 2 best score:  0.9171603379625134
Run 3 best param:  {'n_estimators': 100}
Run 3 best score:  0.9160920656501894
Run 4 best param:  {'n_estimators': 87}
Run 4 best score:  0.9167233174711081
Run 5 best param:  {'n_estimators': 100}
Run 5 best score:  0.916043507817811
Run 6 best param:  {'n_estimators': 87}
Run 6 best score:  0.9175973584539187
Run 7 best param:  {'n_estimators': 100}
Run 7 best score:  0.9155579294940274
Run 8 best param:  {'n_estimators': 100}
Run 8 best score:  0.9170632222977566
Run 9 best param:  {'n_estimators': 100}
Run 9 best score:  0.9166747596387297
Best params:  params               {'n_estimators': 100}
mean_test_score_0                 0.917597
mean_test_score_1                 0.916723
mean_test_score_2                 0.915898
mean_test_score_

In [37]:
# Finer search over n_estimators (100..190 in steps of 10) with the other
# booster settings fixed, repeated NUM_TRIALS times with a reshuffled stratified
# 5-fold split per trial (random_state=i).
# This cell reuses, and therefore overwrites, the param_test_recalib /
# grid_score_recalib / gsearch_calib names of the coarse search.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
param_test_recalib = {
 'n_estimators': list(range(100, 200, 10))
}
# One row per n_estimators value; one mean_test_score_<i> column per trial.
grid_score_recalib = pd.DataFrame()

print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=87,
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch_calib = GridSearchCV(estimator=xgb,
                                 param_grid=param_test_recalib,
                                 scoring='accuracy', n_jobs=-1,
                                 cv=five_folds,
                                 return_train_score=False)
    gsearch_calib.fit(X_train, y_train)
    trial_results = pd.DataFrame(gsearch_calib.cv_results_)
    if grid_score_recalib.empty:
        # First trial: start the table with the candidate parameter dicts.
        grid_score_recalib = trial_results[['params']].copy()
    grid_score_recalib['mean_test_score_{}'.format(i)] = trial_results['mean_test_score']
    print('Run {} best param: '.format(i), gsearch_calib.best_params_)
    print('Run {} best score: '.format(i), gsearch_calib.best_score_)

# Average only the numeric per-trial score columns. Reducing the whole frame
# would also cover the object-dtype 'params' column, which current pandas refuses
# to sum (TypeError) instead of silently skipping it.
score_columns = [c for c in grid_score_recalib.columns if c.startswith('mean_test_score_')]
grid_score_recalib['avg'] = grid_score_recalib[score_columns].mean(axis=1)
print('Best params: ', grid_score_recalib.loc[grid_score_recalib.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'n_estimators': 110}
Run 0 best score:  0.9178401476158105
Run 1 best param:  {'n_estimators': 120}
Run 1 best score:  0.9173545692920269
Run 2 best param:  {'n_estimators': 110}
Run 2 best score:  0.9163834126444595
Run 3 best param:  {'n_estimators': 150}
Run 3 best score:  0.917111780130135
Run 4 best param:  {'n_estimators': 120}
Run 4 best score:  0.9172088957948917
Run 5 best param:  {'n_estimators': 100}
Run 5 best score:  0.916043507817811
Run 6 best param:  {'n_estimators': 120}
Run 6 best score:  0.9174516849567835
Run 7 best param:  {'n_estimators': 160}
Run 7 best score:  0.916043507817811
Run 8 best param:  {'n_estimators': 100}
Run 8 best score:  0.9170632222977566
Run 9 best param:  {'n_estimators': 100}
Run 9 best score:  0.9166747596387297
Best params:  params               {'n_estimators': 110}
mean_test_score_0                  0.91784
mean_test_score_1                 0.917112
mean_test_score_2                 0.916383
mean_test_scor

In [38]:
# Number of repeated grid searches, derived from the training-set row count.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
# Fine sweep of n_estimators: every integer from 100 to 120 inclusive.
param_test_recalib = {
    'n_estimators': list(range(100, 121))
}
# One row per candidate; a 'mean_test_score_<trial>' column is added per trial.
grid_score_recalib = pd.DataFrame()

# Repeat the 5-fold grid search, reshuffling the folds on every trial
# (random_state=i) while the booster seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=87,  # placeholder; replaced by each grid candidate
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch_calib = GridSearchCV(estimator=xgb,
                                 param_grid=param_test_recalib,
                                 scoring='accuracy', n_jobs=-1,
                                 cv=five_folds,
                                 return_train_score=False)
    gsearch_calib.fit(X_train, y_train)
    # The candidate order in cv_results_ is the same on every trial, so the
    # score arrays line up row-by-row with the 'params' column.
    if grid_score_recalib.empty:
        grid_score_recalib = pd.DataFrame({'params': gsearch_calib.cv_results_['params']})
    grid_score_recalib['mean_test_score_{}'.format(i)] = gsearch_calib.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch_calib.best_params_)
    print('Run {} best score: '.format(i), gsearch_calib.best_score_)

# Average over the numeric score columns only: 'params' holds dicts, and
# summing it together with floats raises TypeError on pandas >= 2.0, where
# numeric_only defaults to False.
grid_score_recalib['avg'] = grid_score_recalib.sum(axis=1, numeric_only=True)/NUM_TRIALS
print('Best params: ', grid_score_recalib.loc[grid_score_recalib.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'n_estimators': 110}
Run 0 best score:  0.9178401476158105
Run 1 best param:  {'n_estimators': 118}
Run 1 best score:  0.9175973584539187
Run 2 best param:  {'n_estimators': 110}
Run 2 best score:  0.9163834126444595
Run 3 best param:  {'n_estimators': 117}
Run 3 best score:  0.9166262018063513
Run 4 best param:  {'n_estimators': 120}
Run 4 best score:  0.9172088957948917
Run 5 best param:  {'n_estimators': 101}
Run 5 best score:  0.9162377391473244
Run 6 best param:  {'n_estimators': 101}
Run 6 best score:  0.9175973584539187
Run 7 best param:  {'n_estimators': 115}
Run 7 best score:  0.9159949499854326
Run 8 best param:  {'n_estimators': 117}
Run 8 best score:  0.9172088957948917
Run 9 best param:  {'n_estimators': 100}
Run 9 best score:  0.9166747596387297
Best params:  params               {'n_estimators': 117}
mean_test_score_0                  0.91716
mean_test_score_1                 0.917306
mean_test_score_2                 0.916044
mean_test_s

### Tuning the subsample and colsample_bytree

In [39]:
# Joint sweep of row and column subsampling: 0.6, 0.7, 0.8, 0.9 for each.
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}
# One row per candidate; a 'mean_test_score_<trial>' column is added per trial.
grid_score4 = pd.DataFrame()

# Repeat the 5-fold grid search, reshuffling the folds on every trial
# (random_state=i) while the booster seed stays fixed.
# NUM_TRIALS is expected to be defined by an earlier cell.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=117,
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,         # placeholder; replaced by each grid candidate
        colsample_bytree=0.8,  # placeholder; replaced by each grid candidate
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch4 = GridSearchCV(estimator=xgb,
                            param_grid=param_test4,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch4.fit(X_train, y_train)
    # The candidate order in cv_results_ is the same on every trial, so the
    # score arrays line up row-by-row with the 'params' column.
    if grid_score4.empty:
        grid_score4 = pd.DataFrame({'params': gsearch4.cv_results_['params']})
    grid_score4['mean_test_score_{}'.format(i)] = gsearch4.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch4.best_params_)
    print('Run {} best score: '.format(i), gsearch4.best_score_)

# Average over the numeric score columns only: 'params' holds dicts, and
# summing it together with floats raises TypeError on pandas >= 2.0, where
# numeric_only defaults to False.
grid_score4['avg'] = grid_score4.sum(axis=1, numeric_only=True)/NUM_TRIALS
print('Best params: ', grid_score4.loc[grid_score4.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'colsample_bytree': 0.9, 'subsample': 0.7}
Run 0 best score:  0.917645916286297
Run 1 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 1 best score:  0.9173060114596484
Run 2 best param:  {'colsample_bytree': 0.8, 'subsample': 0.7}
Run 2 best score:  0.9169175488006216
Run 3 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 3 best score:  0.9180829367777023
Run 4 best param:  {'colsample_bytree': 0.7, 'subsample': 0.6}
Run 4 best score:  0.9175973584539187
Run 5 best param:  {'colsample_bytree': 0.6, 'subsample': 0.9}
Run 5 best score:  0.9157521608235408
Run 6 best param:  {'colsample_bytree': 0.7, 'subsample': 0.9}
Run 6 best score:  0.9174516849567835
Run 7 best param:  {'colsample_bytree': 0.7, 'subsample': 0.6}
Run 7 best score:  0.9162862969797029
Run 8 best param:  {'colsample_bytree': 0.7, 'subsample': 0.8}
Run 8 best score:  0.9178401476158105
Run 9 best param:  {'colsample_bytree': 0.7, 'subsample': 0.8}
Run 9 best score

In [40]:
# Carefully search for each neighboring 0.05
param_test5 = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
# Grid search 1 cv result
grid_score5 = pd.DataFrame()

# Loop for each trial
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=117,
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch5 = GridSearchCV(estimator = xgb,
                            param_grid = param_test5,
                            scoring='accuracy',n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch5.fit(X_train,y_train)    
    if grid_score5.empty:
        grid_score5 = pd.DataFrame(gsearch5.cv_results_, columns=['params', 'mean_test_score'])
        grid_score5.columns = ['params', 'mean_test_score_0']
    else:
        grid_score5['mean_test_score_{}'.format(i)] = pd.DataFrame(gsearch5.cv_results_).mean_test_score
    print('Run {} best param: '.format(i), gsearch5.best_params_)
    print('Run {} best score: '.format(i), gsearch5.best_score_)

grid_score5['avg'] = grid_score5.sum(axis=1)/NUM_TRIALS
print('Best params: ', grid_score5.loc[grid_score5.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'colsample_bytree': 0.85, 'subsample': 0.85}
Run 0 best score:  0.9177430319510537
Run 1 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 1 best score:  0.9173060114596484
Run 2 best param:  {'colsample_bytree': 0.8, 'subsample': 0.75}
Run 2 best score:  0.9170632222977566
Run 3 best param:  {'colsample_bytree': 0.8, 'subsample': 0.75}
Run 3 best score:  0.9173545692920269
Run 4 best param:  {'colsample_bytree': 0.8, 'subsample': 0.8}
Run 4 best score:  0.9170146644653783
Run 5 best param:  {'colsample_bytree': 0.75, 'subsample': 0.75}
Run 5 best score:  0.9162862969797029
Run 6 best param:  {'colsample_bytree': 0.75, 'subsample': 0.85}
Run 6 best score:  0.9174031271244052
Run 7 best param:  {'colsample_bytree': 0.85, 'subsample': 0.8}
Run 7 best score:  0.9164805283092162
Run 8 best param:  {'colsample_bytree': 0.75, 'subsample': 0.8}
Run 8 best score:  0.9180343789453239
Run 9 best param:  {'colsample_bytree': 0.8, 'subsample': 0.85}
Run 

### Tuning Regularization Parameters

In [41]:
# Coarse sweep of the L1 regularization term across several orders of magnitude.
param_test6 = {
    'reg_alpha': [0, 1e-5, 1e-2, 0.1, 1, 100]
}
# One row per candidate; a 'mean_test_score_<trial>' column is added per trial.
grid_score6 = pd.DataFrame()

# Repeat the 5-fold grid search, reshuffling the folds on every trial
# (random_state=i) while the booster seed stays fixed.
# NUM_TRIALS is expected to be defined by an earlier cell.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=117,
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch6 = GridSearchCV(estimator=xgb,
                            param_grid=param_test6,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch6.fit(X_train, y_train)
    # The candidate order in cv_results_ is the same on every trial, so the
    # score arrays line up row-by-row with the 'params' column.
    if grid_score6.empty:
        grid_score6 = pd.DataFrame({'params': gsearch6.cv_results_['params']})
    grid_score6['mean_test_score_{}'.format(i)] = gsearch6.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch6.best_params_)
    print('Run {} best score: '.format(i), gsearch6.best_score_)

# Average over the numeric score columns only: 'params' holds dicts, and
# summing it together with floats raises TypeError on pandas >= 2.0, where
# numeric_only defaults to False.
grid_score6['avg'] = grid_score6.sum(axis=1, numeric_only=True)/NUM_TRIALS
print('Best params: ', grid_score6.loc[grid_score6.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'reg_alpha': 1}
Run 0 best score:  0.9172088957948917
Run 1 best param:  {'reg_alpha': 0}
Run 1 best score:  0.9173060114596484
Run 2 best param:  {'reg_alpha': 0.01}
Run 2 best score:  0.9161406234825678
Run 3 best param:  {'reg_alpha': 0.1}
Run 3 best score:  0.9170632222977566
Run 4 best param:  {'reg_alpha': 0.01}
Run 4 best score:  0.917111780130135
Run 5 best param:  {'reg_alpha': 0}
Run 5 best score:  0.9153636981645139
Run 6 best param:  {'reg_alpha': 0}
Run 6 best score:  0.9173545692920269
Run 7 best param:  {'reg_alpha': 0}
Run 7 best score:  0.9158492764882976
Run 8 best param:  {'reg_alpha': 0.1}
Run 8 best score:  0.9178401476158105
Run 9 best param:  {'reg_alpha': 0}
Run 9 best score:  0.9162377391473244
Best params:  params               {'reg_alpha': 0}
mean_test_score_0             0.91716
mean_test_score_1            0.917306
mean_test_score_2            0.916044
mean_test_score_3            0.916626
mean_test_score_4            0.917

In [42]:
# Finer sweep of the L1 regularization term between 0 and 0.08.
param_test7 = {
    'reg_alpha': [0, 1e-4, 1e-3, 1e-2, 5e-2, 8e-2]
}
# One row per candidate; a 'mean_test_score_<trial>' column is added per trial.
grid_score7 = pd.DataFrame()

# Repeat the 5-fold grid search, reshuffling the folds on every trial
# (random_state=i) while the booster seed stays fixed.
# NUM_TRIALS is expected to be defined by an earlier cell.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=117,
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch7 = GridSearchCV(estimator=xgb,
                            param_grid=param_test7,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch7.fit(X_train, y_train)
    # The candidate order in cv_results_ is the same on every trial, so the
    # score arrays line up row-by-row with the 'params' column.
    if grid_score7.empty:
        grid_score7 = pd.DataFrame({'params': gsearch7.cv_results_['params']})
    grid_score7['mean_test_score_{}'.format(i)] = gsearch7.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch7.best_params_)
    print('Run {} best score: '.format(i), gsearch7.best_score_)

# Average over the numeric score columns only: 'params' holds dicts, and
# summing it together with floats raises TypeError on pandas >= 2.0, where
# numeric_only defaults to False.
grid_score7['avg'] = grid_score7.sum(axis=1, numeric_only=True)/NUM_TRIALS
print('Best params: ', grid_score7.loc[grid_score7.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'reg_alpha': 0}
Run 0 best score:  0.9171603379625134
Run 1 best param:  {'reg_alpha': 0.001}
Run 1 best score:  0.9175002427891619
Run 2 best param:  {'reg_alpha': 0.08}
Run 2 best score:  0.9166262018063513
Run 3 best param:  {'reg_alpha': 0.08}
Run 3 best score:  0.9171603379625134
Run 4 best param:  {'reg_alpha': 0.01}
Run 4 best score:  0.917111780130135
Run 5 best param:  {'reg_alpha': 0}
Run 5 best score:  0.9153636981645139
Run 6 best param:  {'reg_alpha': 0}
Run 6 best score:  0.9173545692920269
Run 7 best param:  {'reg_alpha': 0}
Run 7 best score:  0.9158492764882976
Run 8 best param:  {'reg_alpha': 0.01}
Run 8 best score:  0.9173545692920269
Run 9 best param:  {'reg_alpha': 0.05}
Run 9 best score:  0.9169175488006216
Best params:  params               {'reg_alpha': 0}
mean_test_score_0             0.91716
mean_test_score_1            0.917306
mean_test_score_2            0.916044
mean_test_score_3            0.916626
mean_test_score_4        

### Reduce the learning rate and tune n_estimators

In [44]:
# Same tuned configuration as the grid searches above, but with the learning
# rate lowered from 0.1 to 0.01 before re-tuning the number of boosting rounds.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.01,
    n_estimators=117,
    max_depth=4,
    min_child_weight=3,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0,
    scale_pos_weight=1,
    n_jobs=-1,
    seed=0,
)

# Native xgboost inputs: the training data as a DMatrix and the booster
# parameters extracted from the sklearn wrapper.
xgtrain = xgboost.DMatrix(X_train, label=y_train)
xgb_param = xgb.get_xgb_params()

# Stratified 5-fold CV on classification error, allowing up to 5000 rounds and
# stopping after 50 rounds without improvement. Kept as the cell's last
# expression so the per-round results table is displayed.
xgboost.cv(
    xgb_param,
    xgtrain,
    num_boost_round=5000,
    nfold=5,
    stratified=True,
    metrics=['error'],
    early_stopping_rounds=50,
    seed=0,
)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.086421,0.001372,0.090949,0.003695
1,0.087246,0.004478,0.091434,0.004236
2,0.08568,0.002036,0.089881,0.003358
3,0.086493,0.001606,0.090755,0.002669
4,0.086312,0.001774,0.090949,0.002665
5,0.085911,0.002726,0.091726,0.003091
6,0.087938,0.003024,0.09192,0.002995
7,0.087404,0.002272,0.090997,0.003383
8,0.087088,0.002716,0.09158,0.003372
9,0.086166,0.00233,0.091143,0.003035


In [47]:
# Number of repeated grid searches, derived from the training-set row count.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
# Coarse sweep of n_estimators at learning_rate=0.01: 100, 200, ..., 1000,
# plus 16 as an extra candidate.
# NOTE(review): 16 presumably comes from the xgboost.cv early-stopping run; confirm.
param_test8 = {
    'n_estimators': list(range(100, 1100, 100)) + [16]
}
# One row per candidate; a 'mean_test_score_<trial>' column is added per trial.
grid_score8 = pd.DataFrame()

# Repeat the 5-fold grid search, reshuffling the folds on every trial
# (random_state=i) while the booster seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.01,
        n_estimators=16,  # placeholder; replaced by each grid candidate
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch8 = GridSearchCV(estimator=xgb,
                            param_grid=param_test8,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch8.fit(X_train, y_train)
    # The candidate order in cv_results_ is the same on every trial, so the
    # score arrays line up row-by-row with the 'params' column.
    if grid_score8.empty:
        grid_score8 = pd.DataFrame({'params': gsearch8.cv_results_['params']})
    grid_score8['mean_test_score_{}'.format(i)] = gsearch8.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch8.best_params_)
    print('Run {} best score: '.format(i), gsearch8.best_score_)

# Average over the numeric score columns only: 'params' holds dicts, and
# summing it together with floats raises TypeError on pandas >= 2.0, where
# numeric_only defaults to False.
grid_score8['avg'] = grid_score8.sum(axis=1, numeric_only=True)/NUM_TRIALS
print('Best params: ', grid_score8.loc[grid_score8.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'n_estimators': 900}
Run 0 best score:  0.9174516849567835
Run 1 best param:  {'n_estimators': 900}
Run 1 best score:  0.917111780130135
Run 2 best param:  {'n_estimators': 800}
Run 2 best score:  0.917111780130135
Run 3 best param:  {'n_estimators': 800}
Run 3 best score:  0.9173060114596484
Run 4 best param:  {'n_estimators': 1000}
Run 4 best score:  0.916577643973973
Run 5 best param:  {'n_estimators': 500}
Run 5 best score:  0.9156064873264057
Run 6 best param:  {'n_estimators': 700}
Run 6 best score:  0.9180829367777023
Run 7 best param:  {'n_estimators': 800}
Run 7 best score:  0.915897834320676
Run 8 best param:  {'n_estimators': 1000}
Run 8 best score:  0.9174031271244052
Run 9 best param:  {'n_estimators': 600}
Run 9 best score:  0.9164805283092162
Best params:  params               {'n_estimators': 800}
mean_test_score_0                 0.917063
mean_test_score_1                 0.917015
mean_test_score_2                 0.917112
mean_test_sco

In [48]:
# Number of repeated grid searches, derived from the training-set row count.
NUM_TRIALS = int(np.ceil(200000/train.shape[0]))
# Fine sweep of n_estimators at learning_rate=0.01: 750, 760, ..., 850.
param_test9 = {
    'n_estimators': list(range(750, 860, 10))
}
# One row per candidate; a 'mean_test_score_<trial>' column is added per trial.
grid_score9 = pd.DataFrame()

# Repeat the 5-fold grid search, reshuffling the folds on every trial
# (random_state=i) while the booster seed stays fixed.
print('Run {} times'.format(NUM_TRIALS))
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        learning_rate=0.01,
        n_estimators=800,  # placeholder; replaced by each grid candidate
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=-1,
        scale_pos_weight=1,
        seed=0)
    five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    gsearch9 = GridSearchCV(estimator=xgb,
                            param_grid=param_test9,
                            scoring='accuracy', n_jobs=-1,
                            cv=five_folds,
                            return_train_score=False)
    gsearch9.fit(X_train, y_train)
    # The candidate order in cv_results_ is the same on every trial, so the
    # score arrays line up row-by-row with the 'params' column.
    if grid_score9.empty:
        grid_score9 = pd.DataFrame({'params': gsearch9.cv_results_['params']})
    grid_score9['mean_test_score_{}'.format(i)] = gsearch9.cv_results_['mean_test_score']
    print('Run {} best param: '.format(i), gsearch9.best_params_)
    print('Run {} best score: '.format(i), gsearch9.best_score_)

# Average over the numeric score columns only: 'params' holds dicts, and
# summing it together with floats raises TypeError on pandas >= 2.0, where
# numeric_only defaults to False.
grid_score9['avg'] = grid_score9.sum(axis=1, numeric_only=True)/NUM_TRIALS
print('Best params: ', grid_score9.loc[grid_score9.avg.idxmax(), :])

Run 10 times
Run 0 best param:  {'n_estimators': 820}
Run 0 best score:  0.9173545692920269
Run 1 best param:  {'n_estimators': 810}
Run 1 best score:  0.917111780130135
Run 2 best param:  {'n_estimators': 810}
Run 2 best score:  0.9173545692920269
Run 3 best param:  {'n_estimators': 780}
Run 3 best score:  0.9173545692920269
Run 4 best param:  {'n_estimators': 750}
Run 4 best score:  0.916577643973973
Run 5 best param:  {'n_estimators': 750}
Run 5 best score:  0.915509371661649
Run 6 best param:  {'n_estimators': 760}
Run 6 best score:  0.9181314946100806
Run 7 best param:  {'n_estimators': 810}
Run 7 best score:  0.9159463921530543
Run 8 best param:  {'n_estimators': 850}
Run 8 best score:  0.9174031271244052
Run 9 best param:  {'n_estimators': 750}
Run 9 best score:  0.9159463921530543
Best params:  params               {'n_estimators': 810}
mean_test_score_0                 0.917209
mean_test_score_1                 0.917112
mean_test_score_2                 0.917355
mean_test_scor

## Part 3: Test on test set

In [49]:
# Fit the final configuration NUM_TRIALS times, changing only the booster
# seed, and report the test-set accuracy of each fit and their average.
accuracy_array = []
for i in range(NUM_TRIALS):
    xgb = XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.01,
        n_estimators=810,
        max_depth=4,
        min_child_weight=3,
        gamma=0.0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        n_jobs=-1,
        seed=i,  # the only setting that varies between trials
    )
    model = xgb.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Score this trial's predictions against the held-out labels.
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_array.append(accuracy)
    print('Accuracy {}: %.2f%%'.format(i) % (accuracy * 100.0))

# Mean accuracy across all trials.
mean_accuracy_score = sum(accuracy_array) / NUM_TRIALS
print('Average accuracy is: %.2f%%' % (mean_accuracy_score * 100.0))

Accuracy 0: 91.66%
Accuracy 1: 91.67%
Accuracy 2: 91.69%
Accuracy 3: 91.62%
Accuracy 4: 91.71%
Accuracy 5: 91.68%
Accuracy 6: 91.72%
Accuracy 7: 91.71%
Accuracy 8: 91.65%
Accuracy 9: 91.69%
Average accuracy is: 91.68%
