# 3. Modelling and Hyperparameter Tuning

## 3.1 Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold

import pickle

from Classification import Classification
from Ensemble import Ensemble



I have created two .py files, Classification.py and Ensemble.py, whose classes contain functions that simplify the modelling process and keep this modelling notebook tidy.

In [2]:
# Use seaborn's 'poster' context so any figures drawn later use its larger scaling.
sns.set_context('poster')

In [3]:
# Load the training features and the training labels from the Data/ folder.
x_train, y_train = map(
    pd.read_csv,
    ('Data/3.x_train_data.csv', 'Data/3.y_train_data.csv'),
)

In [4]:
# Sanity check: features and labels should have the same number of rows.
print(x_train.shape, y_train.shape, sep='\n')

(7524, 138)
(7524, 1)


## 3.2 Train and Validation Split

I split the data again, this time into train and validation sets, in preparation for grid-search cross-validation. I also chose stratified 5-fold as my cross-validation strategy.

In [5]:
# Hold out 25% of the rows as a validation set (seeded for reproducibility).
# The label passed on is the 'score' column of the loaded y_train frame.
# NOTE(review): this cell rebinds x_train / y_train, so it is not idempotent --
# re-running it without reloading the CSVs will not reproduce this split
# (y_train is no longer a DataFrame with a 'score' column afterwards).
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train['score'],
    test_size=0.25,
    random_state=42,
)

In [6]:
# Shuffled, seeded, stratified 5-fold splitter passed to every get_scores call below.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## 3.3 Decision Tree (Baseline)

### 3.3.1 1st Attempt

In [7]:
# Broad first-pass grid for the decision tree: leaf size and tree depth.
params = dict(
    min_samples_leaf=[3, 5, 10, 15, 30, 50, 100],
    max_depth=[3, 4, 5, 6, 7, 8, 9],
)

In [8]:
# Classification helper labelled 'Decision Tree', bound to the train/validation split.
dec_tree_1 = Classification(
    'Decision Tree',
    x_train, x_val,
    y_train, y_val,
)

In [9]:
# Search the grid in `params` with the `skf` folds; the output reports train/validation
# accuracy, the best hyperparameters, and per-class precision/recall/f1.
dec_tree_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.432926,0.353535,0.07939


The best hyperparameters are:  {'max_depth': 9, 'min_samples_leaf': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.531381,0.256169,0.290323,0.349057,0.495702
recall,0.333333,0.564767,0.095745,0.310056,0.455263
f1-score,0.409677,0.352466,0.144,0.328402,0.474623


### 3.3.2 2nd Attempt

In [10]:
# Narrower second-pass grid around the first attempt's best values
# (max_depth=9, min_samples_leaf=5).
params = dict(
    min_samples_leaf=[3, 4, 5],
    max_depth=[7, 8, 9, 10],
)

In [11]:
# Fresh 'Decision Tree' helper for the second, narrower search.
dec_tree_2 = Classification(
    'Decision Tree',
    x_train, x_val,
    y_train, y_val,
)

In [12]:
# Score the second decision-tree grid with the same stratified folds.
dec_tree_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.432926,0.353535,0.07939


The best hyperparameters are:  {'max_depth': 9, 'min_samples_leaf': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.531381,0.256169,0.290323,0.349057,0.495702
recall,0.333333,0.564767,0.095745,0.310056,0.455263
f1-score,0.409677,0.352466,0.144,0.328402,0.474623


## 3.4 Random Forest

### 3.4.1 1st Attempt

In [13]:
# Broad first-pass grid for the random forest: leaf size and (odd-valued) depths.
params = dict(
    min_samples_leaf=[3, 5, 10, 15, 30, 50, 100],
    max_depth=[3, 5, 7, 9, 11, 13, 15],
)

In [14]:
# Classification helper labelled 'Random Forest', bound to the train/validation split.
ran_for_1 = Classification(
    'Random Forest',
    x_train, x_val,
    y_train, y_val,
)

In [15]:
# Search the first random-forest grid with the `skf` folds and display the scores.
ran_for_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.575403,0.454014,0.121389


The best hyperparameters are:  {'max_depth': 15, 'min_samples_leaf': 10} 



Unnamed: 0,1,2,3,4,5
precision,0.485769,0.408451,0.412,0.381963,0.530474
recall,0.671916,0.300518,0.273936,0.402235,0.618421
f1-score,0.563877,0.346269,0.329073,0.391837,0.571081


### 3.4.2 2nd Attempt

In [16]:
# Second pass: unit steps around the first attempt's best values
# (max_depth=15, min_samples_leaf=10), extending depth upwards.
params = dict(
    min_samples_leaf=list(range(7, 15)),
    max_depth=list(range(13, 19)),
)

In [17]:
# Fresh 'Random Forest' helper for the second search.
ran_for_2 = Classification(
    'Random Forest',
    x_train, x_val,
    y_train, y_val,
)

In [18]:
# Score the second random-forest grid with the same stratified folds.
ran_for_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.586745,0.45933,0.127414


The best hyperparameters are:  {'max_depth': 18, 'min_samples_leaf': 11} 



Unnamed: 0,1,2,3,4,5
precision,0.504854,0.404,0.434028,0.393701,0.510067
recall,0.682415,0.261658,0.332447,0.418994,0.6
f1-score,0.580357,0.31761,0.376506,0.405954,0.551391


### 3.4.3 3rd Attempt

In [19]:
# Third pass: same leaf sizes, depth range shifted up because the previous
# best depth (18) sat on the edge of its grid.
params = dict(
    min_samples_leaf=list(range(7, 15)),
    max_depth=list(range(17, 23)),
)

In [20]:
# Fresh 'Random Forest' helper for the third search.
ran_for_3 = Classification(
    'Random Forest',
    x_train, x_val,
    y_train, y_val,
)

In [21]:
# Score the third random-forest grid with the same stratified folds.
ran_for_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Random Forest,0.647351,0.457735,0.189615


The best hyperparameters are:  {'max_depth': 22, 'min_samples_leaf': 7} 



Unnamed: 0,1,2,3,4,5
precision,0.493952,0.390728,0.423358,0.383632,0.555024
recall,0.643045,0.305699,0.308511,0.418994,0.610526
f1-score,0.558723,0.343023,0.356923,0.400534,0.581454


## 3.5 Logistic Regression

### 3.5.1 1st Attempt

In [22]:
# Broad first-pass grid for logistic regression: penalty type and a wide
# spread of C values (0.01 up to 10).
params = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.05, 0.1, 0.5, 1, 5, 10],
)

In [23]:
# Classification helper labelled 'Logistic Regression', bound to the train/validation split.
log_reg_1 = Classification(
    'Logistic Regression',
    x_train, x_val,
    y_train, y_val,
)

In [24]:
# Search the first logistic-regression grid with the `skf` folds and display the scores.
log_reg_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.554847,0.501329,0.053518


The best hyperparameters are:  {'C': 0.1, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.547619,0.427762,0.508532,0.402439,0.612346
recall,0.603675,0.391192,0.396277,0.460894,0.652632
f1-score,0.574282,0.40866,0.445441,0.429688,0.631847


### 3.5.2 2nd Attempt

In [25]:
# Second pass for logistic regression: an evenly spaced C grid from 0.3 to 0.7.
# NOTE: every C must be strictly positive, and 0.7 has to be written as a single
# float -- a comma in place of the decimal point would silently add C=0 (invalid)
# and C=7 to the grid.
params = {'penalty': ['l1', 'l2'],
          'C': [0.3, 0.4, 0.5, 0.6, 0.7]}

In [26]:
# Fresh 'Logistic Regression' helper for the second search; its best_model is
# reused by the ensemble sections further down.
log_reg_2 = Classification(
    'Logistic Regression',
    x_train, x_val,
    y_train, y_val,
)

In [27]:
# Score the second logistic-regression grid with the same stratified folds.
log_reg_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.562112,0.509304,0.052809


The best hyperparameters are:  {'C': 0.3, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.570025,0.410468,0.5,0.431525,0.617073
recall,0.608924,0.38601,0.417553,0.46648,0.665789
f1-score,0.588832,0.397864,0.455072,0.448322,0.640506


### 3.5.3 3rd Attempt

In [28]:
# Third pass: l2 only (chosen in both earlier passes), with C in 0.01 steps
# around the previous best value of 0.3.
params = dict(
    penalty=['l2'],
    C=[0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32],
)

In [29]:
# Fresh 'Logistic Regression' helper for the third search.
log_reg_3 = Classification(
    'Logistic Regression',
    x_train, x_val,
    y_train, y_val,
)

In [30]:
# Score the third logistic-regression grid with the same stratified folds.
log_reg_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Logistic Regression,0.56229,0.50824,0.054049


The best hyperparameters are:  {'C': 0.29, 'penalty': 'l2'} 



Unnamed: 0,1,2,3,4,5
precision,0.566176,0.410468,0.501597,0.430412,0.616137
recall,0.606299,0.38601,0.417553,0.46648,0.663158
f1-score,0.585551,0.397864,0.455733,0.447721,0.638783


## 3.6 Support Vector Machines

### 3.6.1 1st Attempt

In [31]:
# First SVM grid: polynomial kernel, comparing degree 2 against degree 3.
params = dict(
    kernel=['poly'],
    degree=[2, 3],
)

In [32]:
# Classification helper labelled 'SVM', bound to the train/validation split.
svm_1 = Classification(
    'SVM',
    x_train, x_val,
    y_train, y_val,
)

In [33]:
# Search the polynomial-kernel grid with the `skf` folds and display the scores.
svm_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.810385,0.498139,0.312245


The best hyperparameters are:  {'degree': 2, 'kernel': 'poly'} 



Unnamed: 0,1,2,3,4,5
precision,0.531513,0.411765,0.457227,0.430303,0.643646
recall,0.664042,0.398964,0.412234,0.396648,0.613158
f1-score,0.590432,0.405263,0.433566,0.412791,0.628032


### 3.6.2 2nd Attempt

In [34]:
# Second SVM grid: linear kernel with a few C values.
# NOTE(review): in scikit-learn's SVC, gamma only applies to the rbf/poly/sigmoid
# kernels, so with a linear kernel this axis presumably just doubles the number
# of fits -- confirm which estimator Classification('SVM') wraps.
params = dict(
    C=[0.2, 0.3, 0.4],
    kernel=['linear'],
    gamma=['scale', 'auto'],
)

In [35]:
# Fresh 'SVM' helper for the linear-kernel search.
svm_2 = Classification(
    'SVM',
    x_train, x_val,
    y_train, y_val,
)

In [36]:
# Score the linear-kernel grid with the same stratified folds.
svm_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.559631,0.515152,0.04448


The best hyperparameters are:  {'C': 0.2, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.561033,0.426702,0.482456,0.448468,0.647849
recall,0.627297,0.42228,0.43883,0.449721,0.634211
f1-score,0.592317,0.424479,0.45961,0.449093,0.640957


### 3.6.3 3rd Attempt

In [37]:
# Third SVM grid: C in 0.01 steps around the previous best value of 0.2,
# keeping the linear kernel and gamma='scale'.
params = dict(
    C=[0.18, 0.19, 0.2, 0.21],
    kernel=['linear'],
    gamma=['scale'],
)

In [38]:
# Fresh 'SVM' helper for the third search.
svm_3 = Classification(
    'SVM',
    x_train, x_val,
    y_train, y_val,
)

In [39]:
# Score the third SVM grid with the same stratified folds.
svm_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,SVM,0.559986,0.51143,0.048556


The best hyperparameters are:  {'C': 0.19, 'gamma': 'scale', 'kernel': 'linear'} 



Unnamed: 0,1,2,3,4,5
precision,0.558411,0.422164,0.482558,0.438202,0.644385
recall,0.627297,0.414508,0.441489,0.435754,0.634211
f1-score,0.590853,0.418301,0.461111,0.436975,0.639257


## 3.7 Gaussian Naive Bayes

### 3.7.1 1st Attempt

In [40]:
# First Naive Bayes grid: var_smoothing spread across many orders of magnitude.
params = dict(
    var_smoothing=[1e-09, 1e-06, 1e-03, 1e-01, 1e2],
)

In [41]:
# Classification helper labelled 'Naive Bayes', bound to the train/validation split.
gnb_1 = Classification(
    'Naive Bayes',
    x_train, x_val,
    y_train, y_val,
)

In [42]:
# Search the var_smoothing grid with the `skf` folds and display the scores.
gnb_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Naive Bayes,0.498671,0.484848,0.013822


The best hyperparameters are:  {'var_smoothing': 0.1} 



Unnamed: 0,1,2,3,4,5
precision,0.508681,0.517857,0.550296,0.389333,0.494098
recall,0.769029,0.225389,0.24734,0.407821,0.771053
f1-score,0.61233,0.314079,0.341284,0.398363,0.602261


### 3.7.2 2nd Attempt

In [43]:
# Second Naive Bayes grid: one decade either side of the previous best (0.1).
params = dict(
    var_smoothing=[1e-02, 1e-01, 1],
)

In [44]:
# Fresh 'Naive Bayes' helper for the second search.
gnb_2 = Classification(
    'Naive Bayes',
    x_train, x_val,
    y_train, y_val,
)

In [45]:
# Score the second Naive Bayes grid with the same stratified folds.
gnb_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Naive Bayes,0.498671,0.484848,0.013822


The best hyperparameters are:  {'var_smoothing': 0.1} 



Unnamed: 0,1,2,3,4,5
precision,0.508681,0.517857,0.550296,0.389333,0.494098
recall,0.769029,0.225389,0.24734,0.407821,0.771053
f1-score,0.61233,0.314079,0.341284,0.398363,0.602261


## 3.8 KNN

### 3.8.1 1st Attempt

In [46]:
# Broad first-pass grid for KNN: neighbourhood sizes from 5 up to 300.
params = dict(
    n_neighbors=[5, 10, 50, 100, 200, 300],
)

In [47]:
# Classification helper labelled 'KNN', bound to the train/validation split.
knn_1 = Classification(
    'KNN',
    x_train, x_val,
    y_train, y_val,
)

In [48]:
# Search the n_neighbors grid with the `skf` folds and display the scores.
knn_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.50824,0.49176,0.016481


The best hyperparameters are:  {'n_neighbors': 300} 



Unnamed: 0,1,2,3,4,5
precision,0.503953,0.435976,0.52381,0.410053,0.56875
recall,0.669291,0.370466,0.263298,0.432961,0.718421
f1-score,0.574972,0.40056,0.350442,0.421196,0.634884


### 3.8.2 2nd Attempt

In [49]:
# Second KNN grid: steps of 50 around the previous best value of 300.
params = dict(
    n_neighbors=[250, 300, 350],
)

In [50]:
# Fresh 'KNN' helper for the second search.
knn_2 = Classification(
    'KNN',
    x_train, x_val,
    y_train, y_val,
)

In [51]:
# Score the second KNN grid with the same stratified folds.
knn_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.507,0.501329,0.005671


The best hyperparameters are:  {'n_neighbors': 350} 



Unnamed: 0,1,2,3,4,5
precision,0.511765,0.446108,0.546875,0.408108,0.583158
recall,0.685039,0.38601,0.279255,0.421788,0.728947
f1-score,0.585859,0.413889,0.369718,0.414835,0.647953


### 3.8.3 3rd Attempt

In [52]:
# Third KNN grid: steps of 5 around the previous best value of 350.
params = dict(
    n_neighbors=[340, 345, 350, 355, 360],
)

In [53]:
# Fresh 'KNN' helper for the third search.
knn_3 = Classification(
    'KNN',
    x_train, x_val,
    y_train, y_val,
)

In [54]:
# Score the third KNN grid with the same stratified folds.
knn_3.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,KNN,0.507,0.501329,0.005671


The best hyperparameters are:  {'n_neighbors': 350} 



Unnamed: 0,1,2,3,4,5
precision,0.511765,0.446108,0.546875,0.408108,0.583158
recall,0.685039,0.38601,0.279255,0.421788,0.728947
f1-score,0.585859,0.413889,0.369718,0.414835,0.647953


## 3.9 Adaboost (log_reg_2)

### 3.9.1 1st Attempt

In [55]:
# First AdaBoost grid: learning rate over three orders of magnitude.
params = dict(
    learning_rate=[0.1, 1, 10],
)

In [56]:
# Ensemble helper labelled 'AdaBoost', given log_reg_2.best_model plus the
# train/validation split.
adaboost_1 = Ensemble(
    'AdaBoost',
    log_reg_2.best_model,
    x_train, x_val,
    y_train, y_val,
)

In [57]:
# Search the learning_rate grid with the `skf` folds and display the scores.
adaboost_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,AdaBoost,0.513734,0.503987,0.009747


The best hyperparameters are:  {'learning_rate': 0.1} 



Unnamed: 0,1,2,3,4,5
precision,0.655914,0.411633,0.440501,0.423398,0.687697
recall,0.480315,0.476684,0.56117,0.424581,0.573684
f1-score,0.554545,0.441777,0.493567,0.423989,0.625538


### 3.9.2 2nd Attempt

In [58]:
# Second AdaBoost grid: smaller learning rates, up to the previous best (0.1).
params = dict(
    learning_rate=[0.01, 0.05, 0.1],
)

In [59]:
# Fresh 'AdaBoost' helper for the second search, again given log_reg_2.best_model.
adaboost_2 = Ensemble(
    'AdaBoost',
    log_reg_2.best_model,
    x_train, x_val,
    y_train, y_val,
)

In [60]:
# Score the second AdaBoost grid with the same stratified folds.
adaboost_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,AdaBoost,0.521531,0.499203,0.022329


The best hyperparameters are:  {'learning_rate': 0.01} 



Unnamed: 0,1,2,3,4,5
precision,0.626298,0.440443,0.486726,0.391221,0.622283
recall,0.475066,0.411917,0.43883,0.572626,0.602632
f1-score,0.540299,0.425703,0.461538,0.464853,0.612299


## 3.10 XGBoost (log_reg_2)

### 3.10.1 1st Attempt

In [61]:
# First XGBoost grid: eta and min_child_weight.
params = dict(
    eta=[0.001, 0.005, 0.1, 0.5],
    min_child_weight=[1, 5, 10],
)

In [62]:
# Ensemble helper labelled 'XGBoost', given log_reg_2.best_model plus the
# train/validation split.
xgboost_1 = Ensemble(
    'XGBoost',
    log_reg_2.best_model,
    x_train, x_val,
    y_train, y_val,
)

In [63]:
# Search the eta / min_child_weight grid with the `skf` folds and display the scores.
xgboost_1.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,XGBoost,0.853624,0.484848,0.368775


The best hyperparameters are:  {'eta': 0.001, 'min_child_weight': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.541353,0.387097,0.44152,0.440341,0.607792
recall,0.566929,0.404145,0.401596,0.432961,0.615789
f1-score,0.553846,0.395437,0.420613,0.43662,0.611765


### 3.10.2 2nd Attempt

In [64]:
# Second XGBoost grid: smaller eta values, with min_child_weight fixed at the
# previous best value of 5.
params = dict(
    eta=[0.0001, 0.0005, 0.001],
    min_child_weight=[5],
)

In [65]:
# Ensemble helper labelled 'XGBoost' for the second search. It is given
# log_reg_2.best_model -- the same model as the first attempt and the one named
# in this section's heading -- so the two attempts differ only in their grids.
xgboost_2 = Ensemble(
    'XGBoost',
    log_reg_2.best_model,
    x_train, x_val,
    y_train, y_val,
)

In [66]:
# Score the second XGBoost grid with the same stratified folds.
xgboost_2.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,XGBoost,0.853624,0.484848,0.368775


The best hyperparameters are:  {'eta': 0.0001, 'min_child_weight': 5} 



Unnamed: 0,1,2,3,4,5
precision,0.541353,0.387097,0.44152,0.440341,0.607792
recall,0.566929,0.404145,0.401596,0.432961,0.615789
f1-score,0.553846,0.395437,0.420613,0.43662,0.611765


## 3.11 Voting (Adaboost(log_reg_2)/Logistic Regression)

In [67]:
# Voting grid: compare hard voting against soft voting.
params = dict(
    voting=['hard', 'soft'],
)

In [68]:
# Named (label, model) pairs combined by the voting ensemble:
# adaboost_1.best_model and log_reg_2.best_model.
adaboost_best, log_reg_best = (
    ('ada', adaboost_1.best_model),
    ('lr2', log_reg_2.best_model),
)

estimators = [adaboost_best, log_reg_best]

In [69]:
# Ensemble helper labelled 'Voting', given the list of named estimators plus
# the train/validation split.
voting = Ensemble(
    'Voting',
    estimators,
    x_train, x_val,
    y_train, y_val,
)

In [70]:
# Search hard vs soft voting with the `skf` folds and display the scores.
voting.get_scores(params,skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Voting,0.557859,0.515683,0.042176


The best hyperparameters are:  {'voting': 'soft'} 



Unnamed: 0,1,2,3,4,5
precision,0.592493,0.427441,0.491379,0.42,0.650919
recall,0.580052,0.419689,0.454787,0.469274,0.652632
f1-score,0.586207,0.423529,0.472376,0.443272,0.651774


## 3.11 Stacking (Adaboost(log_reg_2)/Logistic Regression)

In [71]:
# Named (label, model) pairs combined by the stacking ensemble. This rebinds
# adaboost_best / log_reg_best / estimators from the voting section.
# NOTE(review): this uses adaboost_2, whereas the voting ensemble, the
# comparison table and the saved models all use adaboost_1 -- confirm that the
# difference is intentional.
adaboost_best, log_reg_best = (
    ('ada', adaboost_2.best_model),
    ('lr2', log_reg_2.best_model),
)

estimators = [adaboost_best, log_reg_best]

In [72]:
# Ensemble helper labelled 'Stacking', given the list of named estimators plus
# the train/validation split.
stacking = Ensemble(
    'Stacking',
    estimators,
    x_train, x_val,
    y_train, y_val,
)

In [73]:
# Score the stacking ensemble with an empty grid (no hyperparameters are searched,
# so no "best hyperparameters" line appears in the output).
stacking.get_scores({},skf)

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Stacking,0.55467,0.519405,0.035265


Unnamed: 0,1,2,3,4,5
precision,0.594937,0.422652,0.481771,0.45283,0.639566
recall,0.616798,0.396373,0.492021,0.469274,0.621053
f1-score,0.60567,0.409091,0.486842,0.460905,0.630174


## 3.12 All Models Compared

For the majority of models I created, I applied hyperparameter tuning, where I started with a broad range of hyperparameters, and tuned for optimal train accuracy and validation accuracy. 

In [74]:
# Stack the one-row score table of the chosen attempt from each model family
# into a single comparison frame. Each scores_table carries index label 0, so
# ignore_index=True renumbers the rows 0..n-1 instead of leaving ten rows that
# all share the same label.
all_models = pd.concat(
    [
        dec_tree_1.scores_table,
        ran_for_3.scores_table,
        log_reg_2.scores_table,
        svm_3.scores_table,
        gnb_1.scores_table,
        knn_3.scores_table,
        adaboost_1.scores_table,
        xgboost_1.scores_table,
        voting.scores_table,
        stacking.scores_table,
    ],
    axis=0,
    ignore_index=True,
)

In [75]:
# Display the combined comparison table (one row per selected model).
all_models

Unnamed: 0,Model Name,Train Accuracy,Validation Accuracy,Accuracy Difference
0,Decision Tree,0.432926,0.353535,0.07939
0,Random Forest,0.647351,0.457735,0.189615
0,Logistic Regression,0.562112,0.509304,0.052809
0,SVM,0.559986,0.51143,0.048556
0,Naive Bayes,0.498671,0.484848,0.013822
0,KNN,0.507,0.501329,0.005671
0,AdaBoost,0.513734,0.503987,0.009747
0,XGBoost,0.853624,0.484848,0.368775
0,Voting,0.557859,0.515683,0.042176
0,Stacking,0.55467,0.519405,0.035265


In [76]:
# Write the comparison table to the Data/ folder without the index column.
all_models.to_csv('Data/4.all_models.csv',index=False)

Initially, I thought the validation accuracy was low for most of the models I created, but considering that these models are classifying into 5 different classes, a validation accuracy of 0.45 or greater seems very reasonable (random guessing would score about 0.2 if the classes are evenly balanced).

## 3.13 Saving Models

I have saved the selected model from each section above (the same ones shown in the comparison table) using the pickle library's dump function.

In [77]:
# Pickle each selected model wrapper to Models/<model_type>.pkl.
# The `with` block guarantees every file handle is flushed and closed, even if
# pickling one of the objects raises.
# NOTE: pickle files can execute arbitrary code when loaded -- only unpickle
# files from this project's own Models/ folder.
for model in [
    dec_tree_1, ran_for_3, log_reg_2, svm_3, gnb_1,
    knn_3, adaboost_1, xgboost_1, voting, stacking,
]:
    with open(f'Models/{model.model_type}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)