In [80]:
#import lib
import pandas as pd
import numpy as np
from scipy.signal import welch
from tabulate import tabulate
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC


X_pca_train = pd.read_csv('./data/PCA_features.csv',header = None)
print(X_pca_train.shape[0])
y_train = pd.read_csv('./data/y_train.csv')
y_encoded_train = np.zeros(y_train.shape[0])
y_labels = list(y_train['surface'].value_counts().index)
i = 0
for surface in y_labels:
    y_encoded_train[y_train['surface']==surface] = i
    i += 1
print(y_labels)
print(y_encoded_train)
# print(y_train)
print(tabulate(X_pca_train[0:5],floatfmt = ".2f",headers = [(i) for i in range(15)],tablefmt='psql',numalign='right'))

3810
['concrete', 'soft_pvc', 'wood', 'tiled', 'fine_concrete', 'hard_tiles_large_space', 'soft_tiles', 'carpet', 'hard_tiles']
[4. 0. 0. ... 4. 3. 1.]
+----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+
|    |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 |    8 |    9 |   10 |   11 |   12 |   13 |   14 |
|----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------|
|  0 | 2.80 | 0.14 | 0.53 | 0.55 | 0.22 | 0.96 | 0.11 | 0.81 | 0.31 | 0.07 | 1.08 | 0.53 | 2.22 | 1.06 | 0.22 |
|  1 | 0.95 | 0.35 | 0.67 | 0.31 | 0.30 | 0.04 | 0.27 | 0.23 | 0.44 | 0.08 | 0.64 | 0.79 | 0.40 | 0.52 | 0.11 |
|  2 | 2.66 | 0.14 | 0.74 | 0.44 | 0.54 | 0.18 | 0.63 | 0.12 | 0.35 | 0.47 | 0.30 | 0.26 | 0.43 | 0.98 | 0.07 |
|  3 | 1.32 | 0.50 | 1.40 | 1.13 | 4.44 | 0.52 | 1.16 | 0.88 | 0.77 | 1.51 | 0.48 | 0.57 | 0.34 | 0.43 | 0.12 |
|  4 | 3.56 | 0.02 | 1.25 | 0.59 | 0.11 | 1.53 | 0.11 | 0.29 | 0

## Perform Stratified K-Fold Split (k= 5) in Dataset for Cross Validation
Stratified K-Fold is used to ensure that there is a uniform distribution of surfaces in all train and test sets.

In [81]:
kf = StratifiedKFold(n_splits = 5)
print(kf.split(X_pca_train,y_encoded_train))
for train_index, val_index in kf.split(X_pca_train, y_encoded_train):
    print("Train:", train_index, "Val:", val_index)
    print(train_index.shape[0])
    print(val_index.shape[0])
#     X_ktrain, X_ktest = X_pca_train[train_index,:], X_pca_train[test_index,:]
#     y_ktrain, y_ktest = y_train[train_index], y_train[test_index]

<generator object _BaseKFold.split at 0x00000245DE78B5C8>
Train: [ 348  351  357 ... 3807 3808 3809] Val: [   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   14   15   16   17   18   19   20   21   22   23   24   25   26   27
   28   29   30   31   32   33   34   35   36   37   38   39   40   41
   42   43   44   45   46   47   48   49   50   51   52   53   54   55
   56   57   58   59   60   61   62   63   64   65   66   67   68   69
   70   71   72   73   74   75   76   77   78   79   80   81   82   83
   84   85   86   87   88   89   90   91   92   93   94   95   96   97
   98   99  100  101  102  103  104  105  106  107  108  109  110  111
  112  113  114  115  116  117  118  119  120  121  122  123  124  125
  126  127  128  129  130  131  132  133  134  135  136  137  138  139
  140  141  142  143  144  145  146  147  148  149  150  151  152  153
  154  155  156  157  158  159  160  161  162  163  164  165  166  167
  168  169  170  171  172  173  174  175  

## Get score of model used to classify data

In [82]:
def get_score(model, X_train, X_val, Y_train,Y_val):
    model.fit(X_train,Y_train)
    return model.score(X_val,Y_val)

## Models tested: Logistic Regression(multiclass), SVC (add to this to test more models)

In [85]:
score_logreg = []
score_svc = []

for train_index, val_index in kf.split(X_pca_train, y_encoded_train):
    score_logreg.append(get_score(LogisticRegressionCV(), X_pca_train.loc[train_index,:],X_pca_train.loc[val_index,:], y_encoded_train[train_index], y_encoded_train[val_index]))
    score_svc.append(get_score(SVC(), X_pca_train.loc[train_index,:],X_pca_train.loc[val_index,:], y_encoded_train[train_index],y_encoded_train[val_index]))



In [86]:
print(score_logreg)
print(score_svc)

[0.3224543080939948, 0.4091503267973856, 0.4435695538057743, 0.3618421052631579, 0.41743725231175693]
[0.39556135770234985, 0.5045751633986928, 0.5616797900262467, 0.5815789473684211, 0.5759577278731837]
