# Fine tuning with GridSearchCV

## XGBoost Tuning on train set (remove the post features)

### Import the dependencies

In [22]:
import os
os.chdir('/home/tai/Projects/kickstarter_prediction/')
import pandas as pd
from os.path import join

from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
import xgboost
from settings import *

### Load the train and test sets

In [34]:
train_set = join(DATA_SPLIT_ROOT, 'train.csv')
test_set = join(DATA_SPLIT_ROOT, 'test.csv')

train = pd.read_csv(train_set, encoding='latin1', low_memory=True)
test = pd.read_csv(test_set, encoding='latin1', low_memory=True)

In [35]:
post_features = ['pledged_per_backer', 'required_backers', 'required_backers_per_day']
train = train.drop(columns=['pledged_per_backer','required_backers','required_backers_per_day'])
test = test.drop(post_features, axis=1)

In [36]:
train_features = train.drop(['success'], axis=1)
train_targets = train['success']

test_features = test.drop(['success'], axis=1)
test_targets = test['success']

### Define the best n_estimator for the XGBoost

In [45]:
xgb = XGBClassifier(
    learning_rate =0.1,
    n_estimators=5000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    scale_pos_weight=1,
    n_jobs=-1,
    seed=0
)

xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(train_features, label=train_targets)


xgboost.cv(xgb_param, xgtrain, num_boost_round=5000, nfold=5, metrics=['error'],
     early_stopping_rounds=50, stratified=True, seed=0)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.366554,0.005534,0.368489,0.004747
1,0.354418,0.001786,0.356450,0.001496
2,0.350135,0.002585,0.350782,0.003484
3,0.347629,0.003125,0.348004,0.003272
4,0.346954,0.001595,0.347788,0.003182
5,0.345765,0.002496,0.346668,0.003628
6,0.342791,0.001176,0.343951,0.001896
7,0.341723,0.001115,0.342051,0.001751
8,0.339844,0.000706,0.341284,0.001932
9,0.339009,0.000994,0.340070,0.000903


### Fine tune best min_child_weight and max_depth

In [46]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=411,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        scale_pos_weight=1,
        n_jobs=8,
        seed=0)
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch1 = GridSearchCV(estimator = xgb,
                        param_grid = param_test1,
                        scoring='accuracy', n_jobs=-1,
                        cv=five_folds)
gsearch1.fit(train_features,train_targets)
print('Best param: ', gsearch1.best_params_)
print('Best score: ', gsearch1.best_score_)

Best param:  {'max_depth': 5, 'min_child_weight': 1}
Best score:  0.6962312098893052


In [47]:
# Carefully take a look to neighbor area
param_test2 = {
 'max_depth':[4, 5, 6],
 'min_child_weight':[1, 2, 3]
}
xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=411,
        max_depth=9,
        min_child_weight=5,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        scale_pos_weight=1,
        seed=0)
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch2 = GridSearchCV(estimator = xgb,
                        param_grid = param_test2,
                        scoring='accuracy',n_jobs=-1,
                        cv=five_folds)
gsearch2.fit(train_features,train_targets)
print('Best param: ', gsearch2.best_params_)
print('Best score: ', gsearch2.best_score_)

Best param:  {'max_depth': 6, 'min_child_weight': 2}
Best score:  0.6965499418529526


In [48]:
# Try with the weight greater than 10
param_test2b = {
 'min_child_weight':[10, 11, 12]
}
xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=411,
        max_depth=6,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch2b = GridSearchCV(estimator = xgb,
                        param_grid = param_test2b,
                        scoring='accuracy',n_jobs=-1,
                        cv=five_folds)
gsearch2b.fit(train_features,train_targets)
print('Best param: ', gsearch2b.best_params_)
print('Best score: ', gsearch2b.best_score_)

Best param:  {'min_child_weight': 10}
Best score:  0.6966016281173278


In [42]:
# param_test2c = {
#  'min_child_weight':[9, 10, 11]
# }
# xgb = XGBClassifier(
#         learning_rate =0.1,
#         n_estimators=411,
#         max_depth=6,
#         min_child_weight=5,
#         gamma=0,
#         subsample=0.8,
#         colsample_bytree=0.8,
#         objective= 'binary:logistic',
#         n_jobs=8,
#         scale_pos_weight=1,
#         seed=0)
# five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# gsearch2c = GridSearchCV(estimator = xgb,
#                         param_grid = param_test2c,
#                         scoring='accuracy',n_jobs=-1,
#                         cv=five_folds)
# gsearch2c.fit(train_features,train_targets)
# print('Run {} best param: ', gsearch2c.best_params_)
# print('Run {} best score: ', gsearch2c.best_score_)

Run {} best param:  {'min_child_weight': 9}
Run {} best score:  0.6968169875522247


### Fine tune the gamma

In [49]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=411,
        max_depth=6,
        min_child_weight=10,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch3 = GridSearchCV(estimator = xgb,
                        param_grid = param_test3,
                        scoring='accuracy',n_jobs=-1,
                        cv=five_folds)
gsearch3.fit(train_features,train_targets)
print('Run {} best param: ', gsearch3.best_params_)
print('Run {} best score: ', gsearch3.best_score_)

Run {} best param:  {'gamma': 0.0}
Run {} best score:  0.6966016281173278


### Recalibrate the n_estimators

In [50]:
xgb = XGBClassifier(
    learning_rate =0.1,
    n_estimators=5000,
    max_depth=6,
    min_child_weight=10,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    n_jobs=8,
    scale_pos_weight=1,
    seed=0)

xgb_param = xgb.get_xgb_params()
xgtrain = xgboost.DMatrix(train_features, label=train_targets)
xgboost.cv(xgb_param, xgtrain, num_boost_round=5000, nfold=5, metrics=['error'],
     early_stopping_rounds=50, stratified=True, seed=0)

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.361585,0.004911,0.363906,0.005019
1,0.349196,0.003559,0.351678,0.004441
2,0.343806,0.002168,0.345001,0.002332
3,0.340888,0.001989,0.342210,0.001419
4,0.340704,0.002249,0.342167,0.002021
5,0.339145,0.002321,0.341185,0.001816
6,0.336930,0.001328,0.338816,0.001931
7,0.336177,0.001100,0.337649,0.002068
8,0.335360,0.000757,0.336930,0.001584
9,0.333099,0.000802,0.334716,0.001268


### Fine tune the colsample_bytree and subsample

In [51]:
param_test4 = {
 'colsample_bytree':[i/10.0 for i in range(6,10)],
 'subsample':[i/10.0 for i in range(6,10)] 
}

xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=491,
        max_depth=6,
        min_child_weight=10,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch4 = GridSearchCV(estimator = xgb,
                        param_grid = param_test4,
                        scoring='accuracy',n_jobs=16,
                        cv=five_folds)
gsearch4.fit(train_features,train_targets)
print('Run {} best param: ', gsearch4.best_params_)
print('Run {} best score: ', gsearch4.best_score_)

Run {} best param:  {'colsample_bytree': 0.7, 'subsample': 0.7}
Run {} best score:  0.6974243011586337


In [52]:
param_test5 = {
 'colsample_bytree':[0.65, 0.7, 0.75],
 'subsample':[0.65, 0.7, 0.75] 
}

xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=491,
        max_depth=6,
        min_child_weight=10,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch5 = GridSearchCV(estimator = xgb,
                        param_grid = param_test5,
                        scoring='accuracy',n_jobs=16,
                        cv=five_folds)
gsearch5.fit(train_features,train_targets)
print('Run {} best param: ', gsearch5.best_params_)
print('Run {} best score: ', gsearch5.best_score_)

Run {} best param:  {'colsample_bytree': 0.7, 'subsample': 0.7}
Run {} best score:  0.6974243011586337


In [54]:
param_test6 = {
 'reg_alpha':[0, 1e-5, 1e-2, 0.1, 1, 100]
}

xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=491,
        max_depth=6,
        min_child_weight=10,
        gamma=0.0,
        subsample=0.7,
        colsample_bytree=0.7,
        objective= 'binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch6 = GridSearchCV(estimator = xgb,
                        param_grid = param_test6,
                        scoring='accuracy',n_jobs=-1,
                        cv=five_folds)
gsearch6.fit(train_features,train_targets)
print('Run {} best param: ', gsearch6.best_params_)
print('Run {} best score: ', gsearch6.best_score_)

Run {} best param:  {'reg_alpha': 0}
Run {} best score:  0.6974243011586337


In [56]:
param_test7 = {
 'reg_alpha':[0, 1e-6, 5e-6, 1e-5, 5e-5]
}

xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=491,
        max_depth=6,
        min_child_weight=10,
        gamma=0,
        subsample=0.7,
        colsample_bytree=0.7,
        objective= 'binary:logistic',
        n_jobs=8,
        scale_pos_weight=1,
        seed=0)
five_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gsearch7 = GridSearchCV(estimator = xgb,
                        param_grid = param_test7,
                        scoring='accuracy',n_jobs=16,
                        cv=five_folds)
gsearch7.fit(train_features,train_targets)
print('Run {} best param: ', gsearch7.best_params_)
print('Run {} best score: ', gsearch7.best_score_)

Run {} best param:  {'reg_alpha': 0}
Run {} best score:  0.6974243011586337


In [57]:
xgb = XGBClassifier(
        learning_rate =0.1,
        n_estimators=491,
        max_depth=6,
        min_child_weight=10,
        gamma=0,
        subsample=0.7,
        colsample_bytree=0.7,
        objective= 'binary:logistic',
        n_jobs=8,
        reg_alpha=1e-5,
        scale_pos_weight=1,
        seed=0
    )
model = xgb.fit(train_features, train_targets)
y_pred = model.predict(test_features)
# evaluate predictions
accuracy = accuracy_score(test_targets, y_pred)
print('Accuracy %.4f%%' % (accuracy * 100.0))

Accuracy 69.9353%
