In [1]:
! pip install xgboost
! pip install -U imbalanced-learn
! pip install hyperopt

Requirement already up-to-date: imbalanced-learn in /opt/conda/lib/python3.7/site-packages (0.9.0)


# Environment Setup

In [31]:
# Essential modules for data manipulation
import pandas as pd
import numpy as np

# Custom modules to assist the commom data exploration and preparation tasks
import src.data.sets as datasets

# Custom module to produce Baseline assement
import src.models.null as basemodel

# Custom module to produced Performance metrics
import src.models.performance as perf

# Classifiers
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Metrics
from sklearn.metrics import accuracy_score

# Pytorch Modules
import torch
import torch.nn as nn

# Neural Network Modules
from src.models.pytorch import PytorchDataset
from src.models.pytorch import PytorchMultiClass
from src.models.pytorch import get_device
from src.models.pytorch import train_classification, test_classification

# Classifier Tuning
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin
from sklearn.utils import class_weight
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV,KFold

# Modules to persist classes
import joblib

# Time related modules
from datetime import datetime
import pytz

# Declare variables to store name of timezone
tz_SYD = pytz.timezone('Australia/Sydney')

## Set module auto reload options
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 3. Modelling

## 3.1 Collect processed data

In [3]:
# Load data set(s) into dataframe(s)
X_train, y_train, X_val, y_val, X_test, y_test = datasets.load_sets()

In [4]:
# Print the shape of the loaded datasets to verify that correct data has been loaded
print("Train Dataframe (rows, columns): ", X_train.shape)
print("Validation Dataframe (rows, columns): ", X_val.shape)
print("Test Dataframe (rows, columns): ", X_test.shape)

Train Dataframe (rows, columns):  (467746, 6)
Validation Dataframe (rows, columns):  (155916, 6)
Test Dataframe (rows, columns):  (155916, 6)


## 3.2 Assess Baseline of Training Data Set

In [5]:
# Instantiate a Base Classification Model
base = basemodel.NullModel(target_type="classification")
# Make Predictions on the Model
y_base_preds = base.fit_predict(y_train)
# Score the Base Model
perf.score_null_model(y_train, y_base_preds, "Base", "multiclass")

Unnamed: 0,Set Name,ACC,PREC,RECALL,F1
0,Base,0.060471,0.009615,0.990966,0.001097


>**Observations:**
>* Accuracy score of base model is very low at 6%

## 3.3 Train various models with default parameters

In [9]:
# Build a dictionary of default models to fit and score
models_to_fit = {"XGBoost": xgb.XGBClassifier(random_state=8, use_label_encoder=False, eval_metric='mlogloss'),
                 "KNN": KNeighborsClassifier(),
                 "HistGradientBoosting": HistGradientBoostingClassifier(random_state=8),
                 "BalancedBagging": BalancedBaggingClassifier(base_estimator=HistGradientBoostingClassifier(random_state=8), random_state=8, n_estimators=10, n_jobs=2)}
# Fit and score the models
perf.fit_score_models (models_to_fit, X_train, y_train, X_val, y_val, "multiclass")

*******************************
2022-03-20 19:32:11.995593+11:00 - Start fit and score for model:  XGBoost
2022-03-20 19:45:26.422980+11:00 - End fit for model:  XGBoost
2022-03-20 19:45:26.423247+11:00 - Make train preds for model:  XGBoost
2022-03-20 19:45:45.220909+11:00 - Make val preds for model:  XGBoost
2022-03-20 19:45:52.344902+11:00 - End fit and score for model:  XGBoost
*******************************
                               
*******************************
2022-03-20 19:45:52.346165+11:00 - Start fit and score for model:  KNN
2022-03-20 19:45:53.038193+11:00 - End fit for model:  KNN
2022-03-20 19:45:53.038661+11:00 - Make train preds for model:  KNN
2022-03-20 19:46:17.968723+11:00 - Make val preds for model:  KNN
2022-03-20 19:46:27.710537+11:00 - End fit and score for model:  KNN
*******************************
                               
*******************************
2022-03-20 19:46:27.711307+11:00 - Start fit and score for model:  HistGradientBoosting
20

Unnamed: 0,model,t_ACC,v_ACC,t_PREC,v_PREC,t_RECALL,v_RECALL,t_F1,v_F1
0,XGBoost,0.69136,0.652601,0.744925,0.687157,0.700858,0.640138,0.719348,0.659655
1,KNN,0.680104,0.521698,0.714844,0.54767,0.640714,0.489971,0.668392,0.510908
2,HistGradientBoosting,0.45358,0.443489,0.475392,0.459867,0.426151,0.412508,0.435544,0.420908
3,BalancedBagging,0.281375,0.270761,0.276561,0.265689,0.465464,0.441564,0.285961,0.274208


>**Observations:**
>* Default XGBoost model accuracy is much higher than Base model but is overfitting
>* KNN model accuracy is also mulch higher than the Base model but display a higher degree of overfitting than XGBoost

## 3.4 XGBoost Model tuning

### 3.3.1 Compute balanced class weights for the dataset and train XGBoost with calculated weights

In [10]:
print(datetime.now(tz_SYD))
# Calculate balanced class weights to be used in training XGB
class_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)
# Instantiate XGB Classifier
clf_xgb1 = xgb.XGBClassifier(random_state=8, use_label_encoder=False, eval_metric='mlogloss')
# Fit XGB Classifier with calculated class weights
clf_xgb1.fit(X_train, y_train, sample_weight=class_weights)
print(clf_xgb1)
print(datetime.now(tz_SYD))

2022-03-20 19:56:15.603323+11:00
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=8, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)
2022-03-20 20:08:09.580150+11:00


In [13]:
# Score XGB Classifier
perf.score_models(X_train, y_train, X_val, y_val, pd.DataFrame(), pd.DataFrame(), None, False, "multiclass", clf_xgb1)

Unnamed: 0,Set Name,ACC,PREC,RECALL,F1
0,Train,0.093777,0.64367,0.159429,0.055717
1,Validate,0.091267,0.652071,0.149104,0.053539


>**Observations:**
>* Accuracy is much lower than default XGBoost model but higher than Base model.
>* There is a slight degree of overfitting

### 3.3.2 Tune XGBoost with Hyperopt to find best hyperparameters

In [12]:
# Define search space for XGB Hyperopt tuning
xgbspace = {
    'max_depth' : hp.choice('max_depth', range(5, 10, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.05),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.05),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.05)
}

In [13]:
# Define objective for XGB Hyperopt tuning
def xgbobjective(space):   
    xgboost = xgb.XGBClassifier(
        max_depth = int(space['max_depth']),
        learning_rate = space['learning_rate'],
        min_child_weight = space['min_child_weight'],
        subsample = space['subsample'],
        colsample_bytree = space['colsample_bytree'],
        use_label_encoder=False,
        eval_metric='mlogloss'
    )    
    xgboost.fit(X_train, y_train)
    y_preds = xgboost.predict(X_train)
    acc = accuracy_score(y_train, y_preds)
    return{'loss': 1-acc, 'status': STATUS_OK }

In [14]:
print(datetime.now(tz_SYD))
# Perform Hyperopt search and return best parameters after 5 iterations
xgbbest = fmin(
    fn=xgbobjective,   
    space=xgbspace,       
    algo=tpe.suggest,       
    max_evals=5
)
print(datetime.now(tz_SYD))

2022-03-19 15:37:14.731559+11:00
100%|██████████| 5/5 [1:08:56<00:00, 827.38s/trial, best loss: 0.3398917361131897]
2022-03-19 16:46:11.624516+11:00


In [15]:
print("Best XGB: ", xgbbest)

Best:  {'colsample_bytree': 0.75, 'learning_rate': 0.2, 'max_depth': 4, 'min_child_weight': 4.0, 'subsample': 0.4}


In [18]:
# Instantiate XGB Classifier with best Hyperopt Params
clf_xgb2 = xgb.XGBClassifier(
    max_depth = best['max_depth'],
    learning_rate = best['learning_rate'],
    min_child_weight = best['min_child_weight'],
    subsample = best['subsample'],
    colsample_bytree = best['colsample_bytree'],
    use_label_encoder=False,
    eval_metric='mlogloss'
)

In [19]:
print(datetime.now(tz_SYD))
# Fit XGB Classifier with best Hyperopt Params
clf_xgb2.fit(X_train, y_train)
print(datetime.now(tz_SYD))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.75,
              enable_categorical=False, eval_metric='mlogloss', gamma=0,
              gpu_id=-1, importance_type=None, interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=4,
              min_child_weight=4.0, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=0.4,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)

In [20]:
# Score XGB Classifier
perf.score_models(X_train, y_train, X_val, y_val, X_test, y_test, None, False, "multiclass", clf_xgb2)

Unnamed: 0,Set Name,ACC,MSE,MAE,PREC,RECALL,F1,AUC
0,Train,0.44767,34.295791,19.715239,0.44767,0.44767,0.44767,
1,Validate,0.438121,34.497455,19.988141,0.438121,0.438121,0.438121,
2,Test,0.433515,34.807561,20.250314,0.433515,0.433515,0.433515,


>**Observations:**
>* Accuracy is much lower than default XGBoost model but higher than Base model.
>* This model shows a very slight degree of overfitting

## 3.3.3 Train and assess default XGB Model and save for deployment to production

In [14]:
print(datetime.now(tz_SYD))
# Instantiate XGB Classifier with default Hyperparams
clf_xgb3=xgb.XGBClassifier(random_state=8, use_label_encoder=False, eval_metric='mlogloss')
# Fit XGB Classifier
clf_xgb3.fit(X_train,y_train)
print(clf_xgb3)
print(datetime.now(tz_SYD))

2022-03-20 20:12:33.781737+11:00
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=8, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)
2022-03-20 20:27:08.990302+11:00


In [15]:
# Score XGB Classifier
perf.score_models(X_train, y_train, X_val, y_val, X_test, y_test, None, False, "multiclass", clf_xgb3)

Unnamed: 0,Set Name,ACC,PREC,RECALL,F1
0,Train,0.69136,0.744925,0.700858,0.719348
1,Validate,0.652601,0.687157,0.640138,0.659655
2,Test,0.65155,0.686672,0.638717,0.658627


In [16]:
# Dump the XGB Classifier for use in the API
joblib.dump(clf_xgb3, '../models/xgb_beer_type_prediction.joblib', compress=9)

['../models/xgb_beer_type_prediction.joblib']

## 3.4 KNN Model tuning

### 3.4.1 Tune KNN with Hyperopt to find best hyperparameters

In [17]:
# Define search space for XGB Hyperopt tuning
knnspace = {
    'leaf_size' : hp.choice('leaf_size', range(1, 50, 1)),
    'n_neighbors' : hp.choice('n_neighbors', range(1, 30, 1)),
    'p' : hp.uniform('p', 1, 2)
}

In [18]:
# Define objective for XGB Hyperopt tuning
def knnobjective(space):  
    clf_knn = KNeighborsClassifier (
        leaf_size = int(space['leaf_size']),
        n_neighbors = int(space['n_neighbors']),
        p = space['p']
    )
    clf_knn.fit(X_train, y_train)   
    y_preds = clf_knn.predict(X_train)
    acc = accuracy_score(y_train, y_preds)
    return{'loss': 1-acc, 'status': STATUS_OK }

In [21]:
print(datetime.now(tz_SYD))
# Perform Hyperopt search and return best parameters after 5 iterations
knnbest = fmin(
    fn=knnobjective,   
    space=knnspace,       
    algo=tpe.suggest,       
    max_evals=5
)
print(datetime.now(tz_SYD))

2022-03-20 20:33:58.651071+11:00
100%|██████████| 5/5 [09:48<00:00, 117.75s/trial, best loss: 0.3751202575756928]
2022-03-20 20:43:47.415334+11:00


In [22]:
print("Best KNN: ", knnbest)

Best KNN:  {'leaf_size': 4, 'n_neighbors': 10, 'p': 1.5885049034437508}


In [24]:
# Instantiate KNN Classifier with best Hyperopt Params
clf_knn1 = KNeighborsClassifier(
    leaf_size = knnbest['leaf_size'],
    n_neighbors = knnbest['n_neighbors'],
    p = knnbest['p']
)

In [25]:
print(datetime.now(tz_SYD))
# Fit KNN  Classifier with best Hyperopt Params
clf_knn1.fit(X_train, y_train)
print(clf_knn1)
print(datetime.now(tz_SYD))

2022-03-20 20:47:34.922070+11:00
KNeighborsClassifier(leaf_size=4, n_neighbors=10, p=1.5885049034437508)
2022-03-20 20:47:36.087834+11:00


In [26]:
# Score KNN Classifier
perf.score_models(X_train, y_train, X_val, y_val, X_test, y_test, None, False, "multiclass", clf_knn1)

Unnamed: 0,Set Name,ACC,PREC,RECALL,F1
0,Train,0.631514,0.657662,0.5876,0.614906
1,Validate,0.525283,0.545551,0.486109,0.50875
2,Test,0.523372,0.54763,0.484041,0.508357


>**Observations:**
>* Accuracy is a little lower than default KNN model but higher than Base model.
>* This model is still overfitting

## 3.5 Train and assess a Neural Network

In [41]:
# Make Pytorch compatible datasets from numpy arrays
train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

In [42]:
# Instantiate a the Pytorch Classifier
model = PytorchMultiClass(X_train.shape[1])

In [43]:
# Get the available devices and assign model to the device
device = get_device()
model.to(device)

PytorchMultiClass(
  (layer_1): Linear(in_features=6, out_features=32, bias=True)
  (layer_out): Linear(in_features=32, out_features=105, bias=True)
  (softmax): Softmax(dim=1)
)

In [44]:
# Define the Model Criteria
criterion = nn.CrossEntropyLoss()

In [45]:
# Define the Model Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [46]:
# Define the number of epochs and batch size
N_EPOCHS = 1000
BATCH_SIZE = 32

In [None]:
# Train and test the model
print(datetime.now(tz_SYD))
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset, model=model, criterion=criterion, optimizer=optimizer, batch_size=BATCH_SIZE, device=device)
    valid_loss, valid_acc = test_classification(val_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\t|\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\t|\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')
print(datetime.now(tz_SYD))

2022-03-20 21:07:26.257740+11:00
Epoch: 0
	(train)	|	Loss: 0.1455	|	Acc: 1.5%
	(valid)	|	Loss: 0.1457	|	Acc: 0.9%
Epoch: 1
	(train)	|	Loss: 0.1455	|	Acc: 1.3%
	(valid)	|	Loss: 0.1457	|	Acc: 0.9%
Epoch: 2
	(train)	|	Loss: 0.1455	|	Acc: 1.3%
	(valid)	|	Loss: 0.1457	|	Acc: 0.9%
Epoch: 3
	(train)	|	Loss: 0.1453	|	Acc: 2.1%
	(valid)	|	Loss: 0.1452	|	Acc: 2.3%
Epoch: 4
	(train)	|	Loss: 0.1452	|	Acc: 2.3%
	(valid)	|	Loss: 0.1452	|	Acc: 2.3%
Epoch: 5
	(train)	|	Loss: 0.1452	|	Acc: 2.3%
	(valid)	|	Loss: 0.1452	|	Acc: 2.3%
Epoch: 6
	(train)	|	Loss: 0.1452	|	Acc: 2.3%
	(valid)	|	Loss: 0.1452	|	Acc: 2.3%
Epoch: 7
	(train)	|	Loss: 0.1452	|	Acc: 2.3%
	(valid)	|	Loss: 0.1452	|	Acc: 2.3%
Epoch: 8
	(train)	|	Loss: 0.1452	|	Acc: 2.3%
	(valid)	|	Loss: 0.1452	|	Acc: 2.3%
Epoch: 9
	(train)	|	Loss: 0.1453	|	Acc: 2.2%
	(valid)	|	Loss: 0.1452	|	Acc: 2.3%
Epoch: 10
	(train)	|	Loss: 0.1452	|	Acc: 2.3%
	(valid)	|	Loss: 0.1452	|	Acc: 2.3%
Epoch: 11
	(train)	|	Loss: 0.1452	|	Acc: 2.3%
	(valid)	|	Loss: 0.1452	|	Acc

In [54]:
# Test the model agains the test data set
test_loss, test_acc = test_classification(test_dataset, model=model, criterion=criterion, batch_size=BATCH_SIZE, device=device)
print(f'\t(test)\t|\tLoss: {test_loss:.4f}\t|\tAcc: {test_acc * 100:.1f}%')

	(test)	|	Loss: 0.0103	|	Acc: 0.9%
