# Fine tuning with GridSearchCV

## XGBoost Tuning on train set

### Import the dependencies

In [20]:
import os
# NOTE(review): hard-coded absolute path — this cell only works on a machine with this
# exact directory layout. Presumably the chdir is what lets `settings` (imported at the
# bottom of this cell) resolve from the project root; confirm before relocating.
os.chdir('/home/tai/Projects/kickstarter_prediction/')
import pandas as pd
from os.path import join

from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
import xgboost
# Star import: DATA_SPLIT_ROOT, used when loading the CSV files, is not defined anywhere
# in this notebook, so it presumably comes from here — TODO confirm and import it explicitly.
from settings import *

### Load the train and test sets

In [3]:
# Paths of the pre-split CSV files (DATA_SPLIT_ROOT is not defined in this notebook;
# presumably it is provided by the `settings` star import — confirm).
train_set, test_set = (join(DATA_SPLIT_ROOT, fname) for fname in ('train.csv', 'test.csv'))

# Read both splits with identical options; the train file is read first, then the test file.
train, test = (
    pd.read_csv(csv_path, encoding='latin1', low_memory=True)
    for csv_path in (train_set, test_set)
)

# The 'success' column is the label; every remaining column is used as a feature.
train_targets, test_targets = train['success'], test['success']
train_features = train.drop(columns=['success'])
test_features = test.drop(columns=['success'])

### Determine the best n_estimators for XGBoost

In [11]:
# Baseline booster configuration. n_estimators is set far above what is expected to be
# needed: the cross-validation below uses early stopping, so 5000 is only an upper bound.
xgb = XGBClassifier(
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    n_jobs=-1,
    seed=0
)

# Hand the classifier's settings to the native xgboost.cv API, which works on a DMatrix.
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(train_features, label=train_targets)

# Stratified 5-fold CV on classification error with early stopping after 10 rounds.
# The history is kept (it was previously discarded) so that the number of rounds
# retained by early stopping can be read off directly instead of from the rendered table.
cv_history = xgboost.cv(xgb_param, xgtrain, num_boost_round=5000, nfold=5, metrics=['error'],
                        early_stopping_rounds=10, stratified=True, seed=0)
print('Boosting rounds kept by early stopping: ', cv_history.shape[0])

# Last expression of the cell: render the per-round train/test error table.
cv_history

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.235662,0.006116,0.236628,0.005942
1,0.229322,0.005648,0.230758,0.005505
2,0.226116,0.003752,0.228014,0.003794
3,0.223574,0.001151,0.225322,0.001181
4,0.223233,0.000892,0.224749,0.001409
5,0.222071,0.001192,0.223875,0.001291
6,0.221235,0.000390,0.223013,0.001188
7,0.220824,0.000828,0.222751,0.001205
8,0.220218,0.000905,0.221988,0.001676
9,0.220205,0.000520,0.221786,0.001556


### Fine-tune min_child_weight and max_depth

In [18]:
# Coarse grid: tree depths 3, 5, 7, 9 crossed with child weights 1, 3, 5.
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 6, 2),
)

# Shuffled, stratified 5-fold splitter; the fixed random_state keeps the folds repeatable.
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator. max_depth and min_child_weight given here are only placeholders:
# the grid search overrides them for every candidate.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=380,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    seed=0,
)

# Exhaustive search over the grid, scored by cross-validated accuracy on all cores.
gsearch1 = GridSearchCV(
    estimator=xgb,
    param_grid=param_test1,
    scoring='accuracy',
    n_jobs=-1,
    cv=five_folds,
)
gsearch1.fit(train_features, train_targets)

print('Best param: ', gsearch1.best_params_)
print('Best score: ', gsearch1.best_score_)

Best param:  {'max_depth': 9, 'min_child_weight': 5}
Best score:  0.812417625016152


In [20]:
# Carefully take a look to neighbor area
param_test2 = {
 'max_depth':[8,9,10],
 'min_child_weight':[4,5,6]
}
xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=380,
        max_depth=9,
        min_child_weight=5,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        scale_pos_weight=1,
        seed=0)
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch2 = GridSearchCV(estimator = xgb,
                        param_grid = param_test2,
                        scoring='accuracy',n_jobs=-1,
                        cv=five_folds)
gsearch2.fit(train_features,train_targets)
print('Run {} best param: ', gsearch2.best_params_)
print('Run {} best score: ', gsearch2.best_score_)

Run {} best param:  {'max_depth': 10, 'min_child_weight': 5}
Run {} best score:  0.8124262393935479


In [21]:
# With max_depth fixed at 10, check whether a larger min_child_weight (up to 12)
# improves on the current best value of 5.
param_test2b = {
    'min_child_weight': [5, 6, 8, 10, 12]
}

# Base estimator; min_child_weight is overridden by the grid.
# NOTE(review): n_jobs=8 here combined with n_jobs=-1 in GridSearchCV may oversubscribe
# the CPU (threads per fit x parallel fits) — confirm on the target machine.
xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=380,
        max_depth=10,
        min_child_weight=5,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)

# Shuffled, stratified 5-fold splitter with a fixed seed so the folds are repeatable.
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch2b = GridSearchCV(estimator=xgb,
                         param_grid=param_test2b,
                         scoring='accuracy', n_jobs=-1,
                         cv=five_folds)
gsearch2b.fit(train_features, train_targets)

# The labels used to contain an unfilled '{}' placeholder that was printed literally;
# the run identifier is now spelled out.
print('Run 2b best param: ', gsearch2b.best_params_)
print('Run 2b best score: ', gsearch2b.best_score_)

Run {} best param:  {'min_child_weight': 5}
Run {} best score:  0.8124262393935479


### Fine tune the gamma

In [5]:
# Candidate gamma values: 0.0, 0.1, 0.2, 0.3, 0.4.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}

# Base estimator with the depth / child-weight values chosen by the earlier searches;
# gamma is overridden by the grid.
# NOTE(review): n_jobs=8 here combined with n_jobs=-1 in GridSearchCV may oversubscribe
# the CPU (threads per fit x parallel fits) — confirm on the target machine.
xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=380,
        max_depth=10,
        min_child_weight=5,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)

# Shuffled, stratified 5-fold splitter with a fixed seed so the folds are repeatable.
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch3 = GridSearchCV(estimator=xgb,
                        param_grid=param_test3,
                        scoring='accuracy', n_jobs=-1,
                        cv=five_folds)
gsearch3.fit(train_features, train_targets)

# The labels used to contain an unfilled '{}' placeholder that was printed literally;
# the run identifier is now spelled out.
print('Run 3 best param: ', gsearch3.best_params_)
print('Run 3 best score: ', gsearch3.best_score_)

Run {} best param:  {'gamma': 0.0}
Run {} best score:  0.8124262393935479


### Recalibrate the n_estimators

In [10]:
# Booster configuration with the tuned max_depth / min_child_weight / gamma values.
# n_estimators is again only an upper bound; early stopping decides the real count.
xgb = XGBClassifier(
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=10,
    min_child_weight=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=8,
    scale_pos_weight=1,
    seed=0)

# Hand the classifier's settings to the native xgboost.cv API, which works on a DMatrix.
xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(train_features, label=train_targets)

# Stratified 5-fold CV on classification error with early stopping after 50 rounds.
# The history is kept (it was previously discarded) so that the number of rounds
# retained by early stopping can be read off directly instead of from the rendered table.
cv_history_tuned = xgboost.cv(xgb_param, xgtrain, num_boost_round=5000, nfold=5, metrics=['error'],
                              early_stopping_rounds=50, stratified=True, seed=0)
print('Boosting rounds kept by early stopping: ', cv_history_tuned.shape[0])

# Last expression of the cell: render the per-round train/test error table.
cv_history_tuned

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.213419,0.003642,0.220864,0.002292
1,0.206908,0.003902,0.214756,0.002054
2,0.204342,0.002058,0.213219,0.000707
3,0.202497,0.001670,0.211242,0.001318
4,0.201245,0.001235,0.210544,0.001329
5,0.200197,0.001212,0.209876,0.001262
6,0.199526,0.001348,0.208912,0.000577
7,0.198771,0.001173,0.208550,0.000778
8,0.197932,0.001041,0.207882,0.000807
9,0.197547,0.000607,0.207710,0.000922


### Fine tune the colsample_bytree and subsample

In [11]:
# Coarse grid over the two sampling ratios: 0.6, 0.7, 0.8, 0.9 for each.
param_test4 = {
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
    'subsample': [i / 10.0 for i in range(6, 10)]
}

# Base estimator; subsample and colsample_bytree are overridden by the grid.
# NOTE(review): n_jobs=8 here combined with n_jobs=16 in GridSearchCV may oversubscribe
# the CPU (threads per fit x parallel fits) — confirm on the target machine.
xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=375,
        max_depth=10,
        min_child_weight=5,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)

# Shuffled, stratified 5-fold splitter with a fixed seed so the folds are repeatable.
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch4 = GridSearchCV(estimator=xgb,
                        param_grid=param_test4,
                        scoring='accuracy', n_jobs=16,
                        cv=five_folds)
gsearch4.fit(train_features, train_targets)

# The labels used to contain an unfilled '{}' placeholder that was printed literally;
# the run identifier is now spelled out.
print('Run 4 best param: ', gsearch4.best_params_)
print('Run 4 best score: ', gsearch4.best_score_)

Run {} best param:  {'colsample_bytree': 0.9, 'subsample': 0.9}
Run {} best score:  0.8125942197527674


In [12]:
# Finer grid around the coarse optimum (0.9 / 0.9) in steps of 0.05.
param_test5 = {
    'colsample_bytree': [0.85, 0.9, 0.95],
    'subsample': [0.85, 0.9, 0.95]
}

# Base estimator; subsample and colsample_bytree are overridden by the grid.
# NOTE(review): n_jobs=8 here combined with n_jobs=16 in GridSearchCV may oversubscribe
# the CPU (threads per fit x parallel fits) — confirm on the target machine.
xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=375,
        max_depth=10,
        min_child_weight=5,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)

# Shuffled, stratified 5-fold splitter with a fixed seed so the folds are repeatable.
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch5 = GridSearchCV(estimator=xgb,
                        param_grid=param_test5,
                        scoring='accuracy', n_jobs=16,
                        cv=five_folds)
gsearch5.fit(train_features, train_targets)

# The labels used to contain an unfilled '{}' placeholder that was printed literally;
# the run identifier is now spelled out.
print('Run 5 best param: ', gsearch5.best_params_)
print('Run 5 best score: ', gsearch5.best_score_)

Run {} best param:  {'colsample_bytree': 0.95, 'subsample': 0.85}
Run {} best score:  0.8129732523581858


In [13]:
# Coarse sweep of the L1 regularisation term (reg_alpha) across several orders of magnitude.
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}

# Base estimator using the sampling ratios picked by the previous search
# (subsample=0.85, colsample_bytree=0.95); reg_alpha is supplied by the grid.
# NOTE(review): n_jobs=8 here combined with n_jobs=16 in GridSearchCV may oversubscribe
# the CPU (threads per fit x parallel fits) — confirm on the target machine.
xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=375,
        max_depth=10,
        min_child_weight=5,
        gamma=0,
        subsample=0.85,
        colsample_bytree=0.95,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)

# Shuffled, stratified 5-fold splitter with a fixed seed so the folds are repeatable.
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch6 = GridSearchCV(estimator=xgb,
                        param_grid=param_test6,
                        scoring='accuracy', n_jobs=16,
                        cv=five_folds)
gsearch6.fit(train_features, train_targets)

# The labels used to contain an unfilled '{}' placeholder that was printed literally;
# the run identifier is now spelled out.
print('Run 6 best param: ', gsearch6.best_params_)
print('Run 6 best score: ', gsearch6.best_score_)

Run {} best param:  {'reg_alpha': 1e-05}
Run {} best score:  0.8130206314338632


In [14]:
# Finer sweep of reg_alpha around the coarse optimum (1e-5), including 0 (no L1 penalty).
param_test7 = {
    'reg_alpha': [0, 1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4]
}

# Same base estimator as the coarse reg_alpha search; reg_alpha is supplied by the grid.
# NOTE(review): n_jobs=8 here combined with n_jobs=16 in GridSearchCV may oversubscribe
# the CPU (threads per fit x parallel fits) — confirm on the target machine.
xgb = XGBClassifier(
        learning_rate=0.1,
        n_estimators=375,
        max_depth=10,
        min_child_weight=5,
        gamma=0,
        subsample=0.85,
        colsample_bytree=0.95,
        objective='binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)

# Shuffled, stratified 5-fold splitter with a fixed seed so the folds are repeatable.
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch7 = GridSearchCV(estimator=xgb,
                        param_grid=param_test7,
                        scoring='accuracy', n_jobs=16,
                        cv=five_folds)
gsearch7.fit(train_features, train_targets)

# The labels used to contain an unfilled '{}' placeholder that was printed literally;
# the run identifier is now spelled out.
print('Run 7 best param: ', gsearch7.best_params_)
print('Run 7 best score: ', gsearch7.best_score_)

Run {} best param:  {'reg_alpha': 1e-05}
Run {} best score:  0.8130206314338632


In [21]:
# Final estimator assembled from the values selected by the searches in this notebook.
xgb = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=375,
    max_depth=10,
    min_child_weight=5,
    gamma=0,
    subsample=0.85,
    colsample_bytree=0.95,
    reg_alpha=1e-5,
    scale_pos_weight=1,
    n_jobs=8,
    seed=0,
)

# Train on the full training split, then predict labels for the held-out test split.
model = xgb.fit(train_features, train_targets)
y_pred = model.predict(test_features)

# Share of test rows whose predicted label matches the true one, reported as a percentage.
accuracy = accuracy_score(test_targets, y_pred)
print('Accuracy %.4f%%' % (accuracy * 100.0))

Accuracy 81.2788%
