## Import Statements


---



In [1]:
# Visualization Statements
import matplotlib.pyplot as plt

# Normal Imports
import os
import numpy as np
import pickle
import random
from datetime import datetime

# XGBoost Imports
from xgboost import XGBClassifier

# Data Manipulator/Scaler Imports
from keras.utils import to_categorical
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Upload from Local Drive
from google.colab import files

%matplotlib inline

Using TensorFlow backend.


## Upload from Local Drive



---



In [3]:
uploaded = files.upload()

Saving Xfileupdated to Xfileupdated
Saving Yfileupdated to Yfileupdated


In [0]:
# Contains a 3D Matrix of nsamples*nx*ny (where ny=12 features, nx=232 rows)
X_all = pickle.load(open('Xfileupdated', 'rb'))

# Contains a 2D Matrix of nsamples*2 (where each list contains ['fun',1], ['mother',2] kind of data)
Y_all = pickle.load(open('Yfileupdated', 'rb'))

## Modifying & Scaling the Matrices


---



In [0]:
nsamples, nx, ny = X_all.shape
X_updated_all = X_all.reshape((nsamples,nx*ny))
Y_updated_all = Y_all

In [0]:
scaler = MinMaxScaler()
scaled_X_all = scaler.fit_transform(X_updated_all)

## Creating Train, and Test Data


---



In [0]:
# For scaled
X_train, X_test, y_train, y_test = train_test_split(scaled_X_all, [i[1] for i in Y_updated_all], test_size=0.3)

# For unscaled
#X_train, X_test, y_train, y_test = train_test_split(X_updated_all, [i[1] for i in Y_updated_all], test_size=0.3)

In [13]:
print(X_train.shape, X_test.shape)

(290, 2784) (125, 2784)


## Fitting the model on train data


---



In [14]:
# fit model no training data
model = XGBClassifier(objective='multi:softmax', tree_method='gpu_hist', 
                      n_estimators=600, learning_rate=0.02, gamma=1.5, max_depth=5,
                      subsample=0.6, n_jobs=-1, colsample_bytree=0.8, nthread=4)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1.5,
              learning_rate=0.02, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=600, n_jobs=-1,
              nthread=4, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.6, tree_method='gpu_hist', verbosity=1)

## Model Evaluation


---



In [0]:
predictions = model.predict(X_test)

In [16]:
# 71.2% accuracy - unscaled - normal
# 73.6% accuracy - scaled - normal
# 74.4% accuracy - scaled - learning_rate=0.2, n_estimators=600
# 76.8/79.2% cv accuracy - scaled

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: {0:.2f}".format(accuracy * 100.0))

Accuracy: 83.20%


## Save the model

---



In [0]:
pickle.dump(model, open('xgboost_model','wb'))
files.download('xgboost_model')

## Experiments Done


---

We ran a cross validation with hyper parameterization to find out the best possible parameters of the different decision trees, and boosting parameters. The one above is the best one that the following cells produced.

**Cross Validation** - Using Stratified K-Fold

**Hyper Parameterization** - Using RandomSearchCV


In [0]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [0]:
# A parameter grid for XGBoost
params = {
        'n_estimators':[600, 1000, 1500, 5000],
        'learning_rate': [0.01, 0.02, 0.001, 0.2,0.1]0
        }

xgb = XGBClassifier( min_child_weight=1, objective='multi:softmax', tree_method='gpu_hist',silent=True, 
                    n_jobs=-1, gamma=1.5, colsample_bytree=0.8, max_depth=5, 
                    subsample=0.6)

folds = 5
param_comb = 2

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='accuracy', n_jobs=-1, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )

In [56]:
# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.



 Time taken: 0 hours 23 minutes and 15.73 seconds.


In [57]:
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1.5,
              learning_rate=0.01, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=0.6, tree_method='gpu_hist', verbosity=1)

In [58]:
predictions = random_search.predict(X_test)

accuracy_score(y_test, predictions)*100

0.768