# Traditional Machine Learning Models

# Logistic Regression & Support Vector Machines

In [41]:
#import lib
import pandas as pd
import numpy as np
from scipy.signal import welch
from tabulate import tabulate
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.manifold.t_sne import TSNE

np.random.seed(41)

## Import PCA features
As performed in the previous step, we shall be using our PCA features.

In [42]:
# Import PCA features and labels
X = np.loadtxt('./data/PCA_features.csv',delimiter=',')
print(X.shape[0])
print(tabulate(X[0:5],floatfmt = ".2f",headers = [(i) for i in range(15)],tablefmt='psql',numalign='right'))

3810
+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 |    8 |    9 |   10 |   11 |   12 |   13 |   14 |
|------+------+------+------+------+------+------+------+------+------+------+------+------+------+------|
| 2.80 | 0.14 | 0.53 | 0.55 | 0.22 | 0.96 | 0.11 | 0.81 | 0.31 | 0.07 | 1.08 | 0.53 | 2.22 | 1.06 | 0.22 |
| 0.95 | 0.35 | 0.67 | 0.31 | 0.30 | 0.04 | 0.27 | 0.23 | 0.44 | 0.08 | 0.64 | 0.79 | 0.40 | 0.52 | 0.11 |
| 2.66 | 0.14 | 0.74 | 0.44 | 0.54 | 0.18 | 0.63 | 0.12 | 0.35 | 0.47 | 0.30 | 0.26 | 0.43 | 0.98 | 0.07 |
| 1.32 | 0.50 | 1.40 | 1.13 | 4.44 | 0.52 | 1.16 | 0.88 | 0.77 | 1.51 | 0.48 | 0.57 | 0.34 | 0.43 | 0.12 |
| 3.56 | 0.02 | 1.25 | 0.59 | 0.11 | 1.53 | 0.11 | 0.29 | 0.27 | 0.27 | 0.40 | 0.19 | 1.02 | 0.34 | 0.23 |
+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+


## Encode y-labels
We do not have to convert into one-hot encoding as the tools we use directly operates on discrete class integers. However, we still need to encode from strings into class integers.

In [43]:
y_train = pd.read_csv('./data/y_train.csv')
y_encoded_train = np.zeros(y_train.shape[0])
y_labels = list(y_train['surface'].value_counts().index)
i = 0
for surface in y_labels:
    y_encoded_train[y_train['surface']==surface] = i
    i += 1
print(y_labels)
print(y_encoded_train)

['concrete', 'soft_pvc', 'wood', 'tiled', 'fine_concrete', 'hard_tiles_large_space', 'soft_tiles', 'carpet', 'hard_tiles']
[4. 0. 0. ... 4. 3. 1.]


## Normalise features
We use z-scoring to normalise our features distributions.

In [44]:
print('Before scaling:')
print(X[0:5,0:5])
zscore = StandardScaler()
X = zscore.fit_transform(X) #scales each feature column
print('After scaling:')
print(X[0:5,0:5])

Before scaling:
[[2.79885583 0.14018706 0.53344835 0.54947368 0.22398822]
 [0.95024313 0.3537402  0.66782446 0.31184005 0.30197932]
 [2.66298508 0.14100292 0.7353114  0.44199593 0.54238961]
 [1.31882634 0.50290392 1.40031595 1.13372449 4.43853231]
 [3.5550391  0.02075805 1.24775028 0.58754519 0.11419715]]
After scaling:
[[-0.18836596 -0.10746136 -0.40703808 -0.34302846 -0.47355155]
 [-0.56514982 -0.02605698 -0.34516169 -0.4999286  -0.42202239]
 [-0.2160591  -0.10715036 -0.31408587 -0.41399179 -0.26318194]
 [-0.49002527  0.03080278 -0.00787015  0.04272936  2.31102166]
 [-0.03424083 -0.15298654 -0.07812231 -0.31789134 -0.54609113]]


## Split train-test sets
We will split our PCA features/labels into train and test sets, where the test set will be used as a final evaluation later. Train will be further split into validation and train sets during K-Fold Validation.

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded_train, test_size=0.2)

## Get score of model used to classify data

In [46]:
def get_score(model, X_train, X_val, Y_train,Y_val):
    model.fit(X_train,Y_train)
    return model.score(X_val,Y_val)

## Determine each set of hyperparameters for K-fold evaluation
In our models, we shall attempt to find the best of hyperparameters. The tunable hyperparameters are:

Log Reg
- Regularisation parameter C

SVM
- Regularisation parameter C
- Kernel type: 'rbf', 'poly' and 'sigmoid'

In [47]:
regs = [0.01,0.03,0.1,0.3,1,3,10]
kernels = ['rbf','poly']

## 1) Log Reg: Perform Stratified K-Fold Split (k= 5) for each model permutation
Stratified K-Fold is used to ensure that there is a uniform distribution of surfaces in all train and test sets.

In [48]:
n_splits = 5
val_loss_all = {}

for reg in regs:
    print('Model Regularisation = {}'.format(reg))
    score_logreg = []

    kf = StratifiedKFold(n_splits)

    for i,(train_index, val_index) in enumerate(kf.split(X_train, y_train)):
        print("Training on fold " + str(i+1) + "/{}...".format(n_splits))
        score_logreg.append(get_score(LogisticRegression(C=reg,multi_class='ovr',solver='lbfgs'), X_train[train_index,:],X_train[val_index,:], y_encoded_train[train_index], y_encoded_train[val_index]))
    val_loss_all[reg] = np.mean(score_logreg)
    print('Average Validation Loss:{}'.format(np.mean(score_logreg)))
    

Model Regularisation = 0.01
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation Loss:0.18175045796928485
Model Regularisation = 0.03
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation Loss:0.18535650337367882
Model Regularisation = 0.1
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation Loss:0.18108718816265565
Model Regularisation = 0.3
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation Loss:0.18108450151761957
Model Regularisation = 1
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation Loss:0.1810812765000835
Model Regularisation = 3
Training on fold 1/5.

In [49]:
# Get best hyperparameter set based on val loss
val_loss_best,best_hyper = min(val_loss_all.values()),min(val_loss_all,key=val_loss_all.get)
print('The best combinaton belongs to C={}, with a lowest val loss of {}.'.format(best_hyper,val_loss_best))

The best combinaton belongs to C=1, with a lowest val loss of 0.1810812765000835.


## Retrain model with best set of hyperparameter & evaluate final accuracy

In [50]:
model = LogisticRegression(C=best_hyper,multi_class='ovr',solver='lbfgs')
model.fit(X_train,y_train)
predictions = model.predict(X_test)

# evaluate the model
score = accuracy_score(predictions,y_test)
print('Final accuracy score:',score)

Final accuracy score: 0.45013123359580054


Since there are 9 classes to predict on, 42-44% is already better than baseline accuracy obtained via random guessing, i.e. 1/9 chance.

## 2) SVM: Perform Stratified K-Fold Split (k= 5) for each model permutation
Similar to above Log Reg model, we train on different combinations of hyperparameters.

In [51]:
n_splits = 5
val_loss_all = {}

for kernel in kernels:
    for reg in regs:
        print('Model Kernel = {}, Regularisation = {}'.format(kernel,reg))
        score_svc = []

        kf = StratifiedKFold(n_splits)

        for i,(train_index, val_index) in enumerate(kf.split(X_train, y_train)):
            print("Training on fold " + str(i+1) + "/{}...".format(n_splits))
            score_svc.append(get_score(SVC(C=reg,kernel=kernel,gamma='auto',degree=6), X_train[train_index,:],X_train[val_index,:], y_encoded_train[train_index],y_encoded_train[val_index]))
        val_loss_all[(kernel,reg)] = np.mean(score_svc)
        print('Average Validation Loss:{}'.format(np.mean(score_svc)))


Model Kernel = rbf, Regularisation = 0.01
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation Loss:0.16929454502908162
Model Kernel = rbf, Regularisation = 0.03
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation Loss:0.16929454502908162
Model Kernel = rbf, Regularisation = 0.1
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation Loss:0.16929454502908162
Model Kernel = rbf, Regularisation = 0.3
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation Loss:0.1761530250706267
Model Kernel = rbf, Regularisation = 1
Training on fold 1/5...
Training on fold 2/5...
Training on fold 3/5...
Training on fold 4/5...
Training on fold 5/5...
Average Validation L

In [52]:
# Get best hyperparameter set based on val loss
val_loss_best,best_hyper = min(val_loss_all.values()),min(val_loss_all,key=val_loss_all.get)
print('The best combinaton belongs to {}, with a lowest val loss of {}.'.format(best_hyper,val_loss_best))

The best combinaton belongs to ('poly', 1), with a lowest val loss of 0.16338210640916107.


## Retrain model with best set of hyperparameter & evaluate final accuracy

In [53]:
model = SVC(C=best_hyper[1],kernel=best_hyper[0],gamma='auto')
model.fit(X_train,y_train)
predictions = model.predict(X_test)

# evaluate the model
score = accuracy_score(predictions,y_test)
print('Final accuracy score:',score)

Final accuracy score: 0.3766404199475066
