Hyper-parameter Optimization
--------------------------------------------

Install Python library Hyperopt using either of the two commands:
- conda install -c conda-forge hyperopt
- pip install hyperopt

### Load required libraries

In [1]:
RAND_SEED = 12345
import numpy as np
np.random.seed(RAND_SEED)
import random
random.seed(RAND_SEED)

import pandas as pd
from hyperopt import fmin, tpe, hp, Trials

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

### Read and prepare data

In [2]:
# import the .csv file as a dataframe - Raw Data File
df = pd.read_csv('./data/Lithotype_Data.csv')
X = df.iloc[:, :-1]
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
y = df.iloc[:, -1]

TEST_FRAC = 0.25
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRAC, random_state=RAND_SEED)

### Create a baseline SVC model

In [3]:
def clf_metrics(test, pred):
    # This function returns basic metrics for classification models.
    print('Classification Accuracy Score:', accuracy_score(test, pred))
    print('Confusion Matrix: \n', confusion_matrix(test, pred))
    print('Classification Report: \n', classification_report(test, pred))

svc = SVC()
svc.fit(X_train, y_train)
# Prediction on test data
y_pred = svc.predict(X_test)
# Accuracy Metrics
clf_metrics(y_test, y_pred)

Classification Accuracy Score: 0.8621700879765396
Confusion Matrix: 
 [[ 59  17   3]
 [ 14 110   3]
 [  2   8 125]]
Classification Report: 
                  precision    recall  f1-score   support

     F-MOUTHBAR       0.79      0.75      0.77        79
    F-TIDAL BAR       0.81      0.87      0.84       127
F-TIDAL CHANNEL       0.95      0.93      0.94       135

       accuracy                           0.86       341
      macro avg       0.85      0.85      0.85       341
   weighted avg       0.86      0.86      0.86       341



### Search for hyper-parameters using Hyperopt

In [4]:
def objective(params):
    svc = SVC(**params)
    return -1. * cross_val_score(svc, X_train, y_train).mean()

kernels = ['rbf','poly','rbf','sigmoid']
space = {'C':hp.lognormal('C', 0, 1),
         'kernel':hp.choice('kernel', kernels),
         'degree':hp.choice('degree', range(1, 15)),
         'gamma':hp.uniform('gamma', 1e-2, 1e2)
        }
 
trials = Trials()
best_svc = fmin(objective, space, algo=tpe.suggest, max_evals=50, trials=trials)
print(best_svc)

100%|██████████| 50/50 [00:11<00:00,  4.39it/s, best loss: -0.9490769966523194]
{'C': 1751.5444736790396, 'degree': 11, 'gamma': 5.899864955354083, 'kernel': 0}


### Test accuracy of the best SVC model

In [5]:
svc = SVC(C=best_svc['C'], 
         kernel=kernels[best_svc['kernel']], 
         degree=best_svc['degree'], 
         gamma=best_svc['gamma'])
svc.fit(X_train, y_train)
# Prediction on test data
y_pred = svc.predict(X_test)
# Accuracy Metrics
clf_metrics(y_test, y_pred)

Classification Accuracy Score: 0.9618768328445748
Confusion Matrix: 
 [[ 78   1   0]
 [  8 117   2]
 [  2   0 133]]
Classification Report: 
                  precision    recall  f1-score   support

     F-MOUTHBAR       0.89      0.99      0.93        79
    F-TIDAL BAR       0.99      0.92      0.96       127
F-TIDAL CHANNEL       0.99      0.99      0.99       135

       accuracy                           0.96       341
      macro avg       0.95      0.96      0.96       341
   weighted avg       0.96      0.96      0.96       341



### Observation
- Hyper-parameter optimization leads to increase in accuracy from 0.86 to 0.96.