# Overview

In this notebook, a Boosted Tree methods based classifier called XGBoost will be used.

In [1]:
import numpy as np
import pandas as pd

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Sklearn imports
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.utils import resample

In [4]:
import helperFunctions

In [5]:
import xgboost

In [6]:
# Setting the random state for later use
random_state = 565

## Load datasets

In [7]:
X_train, y_train = helperFunctions.load_clean_encode('training.csv', delimiter=';')

In [8]:
X_valid, y_valid = helperFunctions.load_clean_encode('validation.csv', delimiter=';')


Make sure that the train and validation sets have the same columns

In [9]:
X_train, X_valid = helperFunctions.equalizeColumns(X_train, X_valid)

## XGBoost Classification

__Initial Performance__

In [10]:
model = xgboost.XGBClassifier()
eval_set = [(X_valid, y_valid)]
model.fit(X_train, y_train, eval_metric="error", eval_set=eval_set)

[0]	validation_0-error:0.251282
[1]	validation_0-error:0.251282
[2]	validation_0-error:0.25641
[3]	validation_0-error:0.25641
[4]	validation_0-error:0.25641
[5]	validation_0-error:0.25641
[6]	validation_0-error:0.25641
[7]	validation_0-error:0.276923
[8]	validation_0-error:0.271795
[9]	validation_0-error:0.271795
[10]	validation_0-error:0.276923
[11]	validation_0-error:0.282051
[12]	validation_0-error:0.276923
[13]	validation_0-error:0.276923
[14]	validation_0-error:0.271795
[15]	validation_0-error:0.276923
[16]	validation_0-error:0.271795
[17]	validation_0-error:0.271795
[18]	validation_0-error:0.271795
[19]	validation_0-error:0.271795
[20]	validation_0-error:0.271795
[21]	validation_0-error:0.266667
[22]	validation_0-error:0.266667
[23]	validation_0-error:0.266667
[24]	validation_0-error:0.266667
[25]	validation_0-error:0.266667
[26]	validation_0-error:0.266667
[27]	validation_0-error:0.266667
[28]	validation_0-error:0.266667
[29]	validation_0-error:0.266667
[30]	validation_0-error:0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [11]:
# make predictions for test data
y_pred = model.predict(X_valid)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_valid, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 80.51%


__Hyperparameter Tuning__

In [12]:
xgPipe0 = Pipeline(steps = [
    ('imputer', Imputer(strategy='mean', axis=0)),
    ('scaler', StandardScaler()),
    ('clf', xgboost.XGBClassifier(random_state=random_state)),
])

In [21]:
param_grid = [{ 
               'clf__learning_rate': [0.3,0.5,0.7,0.9], 
                'clf__objective': ['reg:linear', 'reg:logistic'], 
               'clf__subsample': [0.3, 0.5, 1],
               'clf__reg_lambda': [1, 5, 10, 50]
              }]

# Using a predefined function for gridSearch in helperFunctions
helperFunctions.gridSearch(xgPipe0, param_grid, X_train, y_train, scoring='neg_log_loss', cv=5)

Best score: -0.056
Best parameters set:
	clf__learning_rate: 0.5
	clf__objective: 'reg:logistic'
	clf__reg_lambda: 1
	clf__subsample: 1


Grid scores:
-0.151 (+/-0.098) for {'clf__learning_rate': 0.3, 'clf__objective': 'reg:linear', 'clf__reg_lambda': 1, 'clf__subsample': 0.3}
-0.142 (+/-0.098) for {'clf__learning_rate': 0.3, 'clf__objective': 'reg:linear', 'clf__reg_lambda': 1, 'clf__subsample': 0.5}
-0.108 (+/-0.083) for {'clf__learning_rate': 0.3, 'clf__objective': 'reg:linear', 'clf__reg_lambda': 1, 'clf__subsample': 1}
-0.137 (+/-0.103) for {'clf__learning_rate': 0.3, 'clf__objective': 'reg:linear', 'clf__reg_lambda': 5, 'clf__subsample': 0.3}
-0.098 (+/-0.075) for {'clf__learning_rate': 0.3, 'clf__objective': 'reg:linear', 'clf__reg_lambda': 5, 'clf__subsample': 0.5}
-0.110 (+/-0.068) for {'clf__learning_rate': 0.3, 'clf__objective': 'reg:linear', 'clf__reg_lambda': 5, 'clf__subsample': 1}
-0.108 (+/-0.068) for {'clf__learning_rate': 0.3, 'clf__objective': 'reg:linear', 'clf__reg

__Final Validation__

In [29]:
xgPipe0.set_params(**{ 
               'clf__learning_rate': 0.5, 
               'clf__objective': 'reg:logistic', 
               'clf__subsample': 1,
               'clf__reg_lambda': 1
              })
xgPipe0 = xgPipe0.fit(X=X_train, y=y_train)
accuracy_score(y_pred=xgPipe0.predict(X=X_valid), y_true=y_valid)

0.83589743589743593

# Summary

The best performance with the XGBoost classifier was about 83.5% accuracy on the validation dataset.
