# 1. Import libraries

In [1]:
import os
import pickle
from collections import defaultdict
from itertools import cycle

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, label_binarize
from sklearn.svm import SVC

from helper import plot_classifier


# 2. Load data, extract data and label

In [2]:
# Folder that holds the pickled training recordings, relative to the current
# working directory of the kernel.
rootdir = os.getcwd() + "/data/training/"

data             = []  # one unpickled dict per recording
data_label       = []  # the 'label' entry of each recording, same order as `data`


for subdir, dirs, files in os.walk(rootdir):
    # os.walk yields entries in arbitrary filesystem order. Sorting makes the
    # row order (and therefore every seeded split further down) reproducible.
    dirs.sort()
    for fl in sorted(files):
        if not fl.endswith('.pkl'):
            continue

        # Join with `subdir` rather than `rootdir`: os.walk recurses, so a file
        # found in a nested folder would otherwise be opened at the wrong path.
        # SECURITY: pickle.load can execute arbitrary code - only load files
        # from a trusted source.
        with open(os.path.join(subdir, fl), 'rb') as infile:
            new_dict = pickle.load(infile, encoding='latin1')

        data.append(new_dict)
        data_label.append(new_dict['label'])

# 3. Feature Generation - Mean, Variance, Maximum and Minimum

In [3]:
# Entries read from every recording dict. The last two ('rightHand', 'label')
# are scalars and are appended separately; everything before them is reduced
# to summary statistics.
keys = [
    'Magnetometer', 
    'Accelerometer', 
    'LinearAccelerometer', 
    'JinsGyroscope', 
    'Gravity', 
    'MSAccelerometer',
    'JinsAccelerometer',
    'MSGyroscope',
    'Gyroscope',
    'JinsEyeMovement',
    # 'JinsBlinkStrength', Excluded because some data has no JinsBlinkStrength data
    'rightHand',
    'label'
]

# Per-channel statistics, in the order they are written into each row.
summary_stats = (np.mean, np.var, np.max, np.min)

# One flat feature row per recording:
#   [mean, var, max, min] for every channel of every sensor, then rightHand, then label.
# NOTE(review): assumes each sensor entry is a 2-D array whose transpose iterates
# over channels - confirm against the pickled data.
ary = []
for sample in data:
    row = [
        stat(channel)
        for sensor in keys[:-2]
        for channel in sample[sensor].T
        for stat in summary_stats
    ]

    # Right hand - truthy becomes 1, anything else 0
    row.append(1 if sample["rightHand"] else 0)
    row.append(sample["label"])

    ary.append(row)

# 4. Add column names to the data frame

In [4]:
# Statistic suffixes, in the same order the feature rows were filled.
stat_names = ('mean', 'var', 'max', 'min')

# Sensors named with X/Y/Z axes: every key except the trailing
# JinsEyeMovement, rightHand and label.
df_columns = [
    '{} {} ({})'.format(key, axis, stat)
    for key in keys[:-3]
    for axis in ('X', 'Y', 'Z')
    for stat in stat_names
]

# JinsEyeMovement (4 dimensions)
df_columns.extend(
    'JinsEyeMovement Dim-{} ({})'.format(dim + 1, stat)
    for dim in range(4)
    for stat in stat_names
)

# The two scalar columns appended at the end of every row.
df_columns.extend(['rightHand', 'label'])

# Build the frame first and assign the names afterwards, so a mismatch between
# the number of names and the row width raises immediately.
df = pd.DataFrame(ary)
df.columns = df_columns

df

Unnamed: 0,Magnetometer X (mean),Magnetometer X (var),Magnetometer X (max),Magnetometer X (min),Magnetometer Y (mean),Magnetometer Y (var),Magnetometer Y (max),Magnetometer Y (min),Magnetometer Z (mean),Magnetometer Z (var),...,JinsEyeMovement Dim-3 (mean),JinsEyeMovement Dim-3 (var),JinsEyeMovement Dim-3 (max),JinsEyeMovement Dim-3 (min),JinsEyeMovement Dim-4 (mean),JinsEyeMovement Dim-4 (var),JinsEyeMovement Dim-4 (max),JinsEyeMovement Dim-4 (min),rightHand,label
0,4.991582,3.368823,9.3125,1.9375,-32.884439,0.634716,-31.1875,-35.2500,26.872449,0.663769,...,0.0,0.0,0.0,0.0,0.009901,0.009803,1.0,0.0,0,0
1,4.999238,7.892133,7.7500,-2.5625,-40.103659,1.256309,-37.8750,-43.4375,-1.536331,3.767433,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0,0
2,8.244621,33.212013,14.1250,-1.7500,-37.640113,1.081339,-35.2500,-40.8125,9.581711,26.765016,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0,0
3,8.289634,0.321695,9.6875,6.6875,-39.676829,0.765797,-37.8750,-41.9375,3.122967,1.000790,...,0.0,0.0,0.0,0.0,0.010000,0.009900,1.0,0.0,0,0
4,18.582317,0.217309,20.0000,17.3750,-39.456047,0.764281,-37.5000,-41.5625,-7.030488,0.467312,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254,57.716972,143.881277,70.9375,23.0625,-8.623476,6.323994,-1.8125,-14.3125,54.860010,121.039457,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0,5
255,60.309959,32.958549,72.0000,45.8750,-9.146341,3.963309,-3.6875,-13.9375,52.580285,101.322569,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0,5
256,62.244157,22.920205,70.1875,49.9375,-10.478659,6.692094,-4.0625,-15.4375,52.757876,97.716036,...,0.0,0.0,0.0,0.0,0.010000,0.009900,1.0,0.0,0,5
257,62.617378,22.964754,72.7500,52.1250,-9.737551,7.747225,-3.3125,-16.5625,51.687246,73.808673,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0,5


# 5. Performing a Principal Component Analysis

In [None]:
# Project the standardized feature matrix onto its first principal components
# and show how much variance each component explains.
X = df.iloc[:, :-1].to_numpy() # All features except label
y = df['label'].to_numpy()

pca = PCA(n_components=10) # CAVE: maybe change the amount of principal components to use

# Standardizing the features (PCA is sensitive to feature scale)
X = StandardScaler().fit_transform(X)

principal_components = pca.fit_transform(X)

principal_df = pd.DataFrame(data=principal_components)

print(pca.explained_variance_ratio_)
# get_params is a method: it has to be called, otherwise only the bound-method
# repr is printed instead of the parameter dict.
print(pca.get_params())
principal_df
# TODO: plot the principal components?

# Feature selections with RandomForestClassifier and SelectFromModel

In [None]:
# Keep X as a DataFrame here (no .to_numpy()) so the selected columns can be
# looked up by name afterwards.
X = df.iloc[:, :-1]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# SelectFromModel fits the forest on the training split and keeps the features
# whose importance passes its threshold.
forest = RandomForestClassifier(n_estimators = 100, random_state=42)
sel = SelectFromModel(forest).fit(X_train, y_train)

# Boolean mask over the training columns: True for every retained feature.
support_mask = sel.get_support()

# Array of columns, that regarded as best features by random forest classifier above
selected_feats = X_train.columns[support_mask]

selected_feats

# RandomForestClassifier without pre-processing

In [None]:
# Baseline: random forest on the raw, unscaled feature matrix.
# (Alternative input that was tried: X = principal_components)
X, y = df.iloc[:, :-1].to_numpy(), df['label'].to_numpy()  # all features except label / labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so construction and training can be chained.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

predicted = clf.predict(X_test)

# Cross fold validation - 10 folds, run on the full data set
scores = cross_val_score(clf, X, y, cv=10)

# Scores value: hold-out accuracy first, then the cross-validation summary
holdout_accuracy = clf.score(X_test, y_test)
print (f"Score: {holdout_accuracy}")
print(f"cross fold validation scores: {scores}")
print(f"cross fold validation score average: {scores.mean()}")
print(f"cross fold validation score standard deviation: {scores.std()}")

# RandomForestClassifier with selected features

In [None]:
# Same random forest as before, restricted to the columns chosen by the
# feature-selection cell.
X, y = df[selected_feats].to_numpy(), df['label'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so construction and training can be chained.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Cross fold validation - 10 folds, run on the full data set
scores = cross_val_score(clf, X, y, cv=10)

# Scores value: hold-out accuracy first, then the cross-validation summary
holdout_accuracy = clf.score(X_test, y_test)
print (f"Score: {holdout_accuracy}")
print(f"cross fold validation scores: {scores}")
print(f"cross fold validation score average: {scores.mean()}")
print(f"cross fold validation score standard deviation: {scores.std()}")

# Logistic Regression with StandardScaler and selected features

In [None]:
# Logistic regression on the selected features.
# Standardizing the features - note the scaler is fitted on all rows, i.e.
# before the train/test split below.
X = StandardScaler().fit_transform(df[selected_feats].to_numpy())
y = df['label'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = LogisticRegression(random_state=0, solver="saga", max_iter=1000)
clf.fit(X_train, y_train)

# Cross fold validation - 10 folds, run on the full (scaled) data set
scores = cross_val_score(clf, X, y, cv=10)

# Scores value: hold-out accuracy first, then the cross-validation summary
holdout_accuracy = clf.score(X_test, y_test)
print(f"Score: {holdout_accuracy}")
print(f"cross fold validation scores: {scores}")
print(f"cross fold validation score average: {scores.mean()}")
print(f"cross fold validation score standard deviation: {scores.std()}")

# SVM with StandardScaler and selected features

In [None]:
# SVM on the standardized selected features.
X = df[selected_feats].to_numpy()
y = df['label'].to_numpy()

# Standardizing the features
X = StandardScaler().fit_transform(X)

# Split inside this cell. Previously the cell silently reused whatever
# X_train / X_test the last executed cell had left in the kernel, so its result
# depended on execution order. test_size / random_state match the split used
# for logistic regression, which is the one a top-to-bottom run used to pick up.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = SVC(gamma='auto', random_state=0).fit(X_train, y_train)

# Cross fold validation - 10 folds, run on the full (scaled) data set
scores = cross_val_score(clf, X, y, cv=10)

# Scores value
print(f"Score: {clf.score(X_test, y_test)}")
print(f"cross fold validation scores: {scores}")
print(f"cross fold validation score average: {scores.mean()}")
print(f"cross fold validation score standard deviation: {scores.std()}")

# KNN with StandardScaler and selected features

In [None]:
# k-nearest neighbours on the standardized selected features.
X = df[selected_feats].to_numpy()
y = df['label'].to_numpy()

# Standardizing the features. KNN is distance based, and without this step the
# cross-validation below ran on unscaled data while the hold-out score used a
# scaled split left over from an earlier cell.
X = StandardScaler().fit_transform(X)

# Split inside this cell instead of relying on X_train / X_test leaked from a
# previously executed cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(X_train, y_train)

# Cross fold validation - 10 folds, run on the full (scaled) data set
scores = cross_val_score(clf, X, y, cv=10)

# Scores value
print(f"Score: {clf.score(X_test, y_test)}")
print(f"cross fold validation scores: {scores}")
print(f"cross fold validation score average: {scores.mean()}")
print(f"cross fold validation score standard deviation: {scores.std()}")

# Naive Bayes with MinMaxScaler on all features (no standardization necessary)

In [None]:
# Multinomial naive Bayes on all features, min-max scaled to be non-negative.
X = df.iloc[:, :-1].to_numpy() # All features except label
y = df['label'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MultinomialNB rejects negative inputs when fitting, hence the min-max scaling.
scaler = MinMaxScaler()

# Fit the scaler on the training split only and reuse it for the test split.
# Re-fitting on the test data would map both splits onto different scales.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf = MultinomialNB()
clf.fit(X_train, y_train)

# Cross fold validation - 10 folds. Previously this cell printed the `scores`
# left over from the preceding classifier. The pipeline re-fits the scaler
# inside every fold, so no information leaks from the validation fold.
scores = cross_val_score(make_pipeline(MinMaxScaler(), MultinomialNB()), X, y, cv=10)

print(f"Score: {clf.score(X_test, y_test)}")
print(f"cross fold validation scores: {scores}")
print(f"cross fold validation score average: {scores.mean()}")
print(f"cross fold validation score standard deviation: {scores.std()}")

# Class membership probabilities for every training sample.
proba = clf.predict_proba(X_train)

np.set_printoptions(suppress=True)
print(f"Probabilites per dataset: {proba}")

# Name the columns after the classes the model actually learned, so the frame
# stays valid if the number of classes in the data changes.
df_proba = pd.DataFrame(proba, columns=[f"Class {cls}" for cls in clf.classes_])
pd.set_option("display.float_format", lambda x: '%.5f' % x)
df_proba

# AUC-ROC Curve (OneVsRest SVM)

In [None]:
# One-vs-rest ROC analysis for an SVM: per-class curves plus micro and macro
# averages, all drawn into a single figure.
X = df[selected_feats].to_numpy()
y = df['label'].to_numpy()

# Standardizing the features
X = StandardScaler().fit_transform(X)

# Binarize the output: one indicator column per class
y = label_binarize(y, classes=[0, 1, 2, 3, 4, 5])
n_classes = y.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit once and score the held-out samples. The model used to be fitted a second
# time just to obtain the scores, which doubled the training cost for no gain.
clf = OneVsRestClassifier(SVC(random_state=42)).fit(X_train, y_train)

y_score = clf.decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all class decisions pooled)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro average, step 1: aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Step 2: interpolate every per-class ROC curve at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Step 3: average over the classes and compute the AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

lw = 2

# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'blueviolet', 'saddlebrown', 'crimson'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC-ROC Curve (SVM)')
plt.legend(loc="lower right")
plt.show()

# AUC-ROC Curve (OneVsRest Logistic Regression)

In [None]:
# One-vs-rest ROC analysis for logistic regression: per-class curves plus micro
# and macro averages, all drawn into a single figure.
X = df[selected_feats].to_numpy()
y = df['label'].to_numpy()

# Standardizing the features
X = StandardScaler().fit_transform(X)

# Binarize the output: one indicator column per class
y = label_binarize(y, classes=[0, 1, 2, 3, 4, 5])
n_classes = y.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit once and score the held-out samples. The model used to be fitted a second
# time just to obtain the scores, which doubled the training cost for no gain.
clf = OneVsRestClassifier(LogisticRegression(random_state=0,solver='lbfgs', max_iter=1000)).fit(X_train, y_train)

y_score = clf.decision_function(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all class decisions pooled)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro average, step 1: aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Step 2: interpolate every per-class ROC curve at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Step 3: average over the classes and compute the AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

lw = 2

# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'blueviolet', 'saddlebrown', 'crimson'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC-ROC Curve (Logistic Regression)')
plt.legend(loc="lower right")
plt.show()

# AUC-ROC Curve (OneVsRest RandomForest)

In [None]:
# One-vs-rest ROC analysis for a random forest: per-class curves plus micro and
# macro averages, all drawn into a single figure.
X = df[selected_feats].to_numpy()
y = df['label'].to_numpy()

# Standardizing the features
X = StandardScaler().fit_transform(X)

# Binarize the output: one indicator column per class
y = label_binarize(y, classes=[0, 1, 2, 3, 4, 5])
n_classes = y.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit once and score the held-out samples. The model used to be fitted a second
# time just to obtain the scores, which doubled the training cost for no gain.
clf = OneVsRestClassifier(RandomForestClassifier(random_state=0)).fit(X_train, y_train)

# Class probabilities serve as the ranking score for this classifier.
y_score = clf.predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all class decisions pooled)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Macro average, step 1: aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Step 2: interpolate every per-class ROC curve at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Step 3: average over the classes and compute the AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

lw = 2

# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'blueviolet', 'saddlebrown', 'crimson'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC-ROC Curve (Random Forest)')
plt.legend(loc="lower right")
plt.show()

In [6]:
from helper import compare_classifier_score

In [8]:
# Run the helper for every combination of standardization (off / on) and
# feature selection ('none', 'randomForest', 'pca'), printing a heading before
# each run.
# NOTE(review): compare_classifier_score lives in helper.py and is not visible
# here. Judging by the recorded output it presumably prints the average
# cross-validation score per classifier - confirm against the helper.

# --- standardization off ---
print('Without standardization and with no feature selection')

compare_classifier_score(df, standardization=False, featureSelection='none')

print('Without standardization and with random forest feature selection')

compare_classifier_score(df, standardization=False, featureSelection='randomForest')

print('Without standardization and with pca feature selection')

compare_classifier_score(df, standardization=False, featureSelection='pca')

# --- standardization on ---
print('With standardization and with no feature selection')

compare_classifier_score(df, standardization=True, featureSelection='none')

print('With standardization and with random forest feature selection')

compare_classifier_score(df, standardization=True, featureSelection='randomForest')

print('With standardization and with pca feature selection')

compare_classifier_score(df, standardization=True, featureSelection='pca')

Without standardization and with no feature selection
cross fold validation score average for Random Forest: 0.9692307692307693
cross fold validation score average for Logistic Regression: 0.7841538461538462
cross fold validation score average for SVM: 0.22430769230769232
cross fold validation score average for Nearest Neighbors: 0.6066153846153847
cross fold validation score average for Naive Bayes: 0.7766153846153846
Without standardization and with random forest feature selection
cross fold validation score average for Random Forest: 0.9692307692307693
cross fold validation score average for Logistic Regression: 0.9343076923076923
cross fold validation score average for SVM: 0.45199999999999996
cross fold validation score average for Nearest Neighbors: 0.8807692307692306
cross fold validation score average for Naive Bayes: 0.8266153846153846
Without standardization and with pca feature selection
cross fold validation score average for Random Forest: 0.7415384615384615
cross fold val