In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import statistics
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
dtype = {'file': str,
         'block': str,
         'tokens': str,
         'subject': str,
         'duration': int,
         'nSFD': float,
         'nFFD': float,
         'nGD': float,
         'nTT': float,
         'PrF': float,
         'Pr1': float,
         'Pr2': float,
         'PrS': float,
         'length': int,
         'wpm': float}

df = pd.read_csv("eight_metrics_block_level.csv", dtype=dtype)

In [9]:
df['subject'] = df['subject'].apply(lambda row: int(row == '001'))

In [11]:
df.head()

Unnamed: 0,file,block,tokens,subject,duration,nSFD,nFFD,nGD,nTT,PrF,Pr1,Pr2,PrS,length,wpm
0,11,4,,1,10447,,57.366304,58.540217,115.437319,0.821429,0.25,0.571429,0.178571,28,160.811716
1,11,2,,1,11356,,60.747985,61.331319,126.949679,0.8125,0.15625,0.65625,0.1875,32,169.073617
2,11,3,,1,17383,,80.019118,80.577941,161.241503,0.944444,0.166667,0.777778,0.055556,36,124.259334
3,11,5,,1,7128,,43.751948,48.321212,76.08566,0.733333,0.3,0.433333,0.266667,30,252.525253
4,11,6,,1,6742,,53.124901,53.124901,71.359127,0.685714,0.4,0.285714,0.314286,35,311.480273


In [12]:
data = df[['nFFD', 'nGD', 'nTT', 'PrF', 'Pr1', 'Pr2', 'PrS', 'wpm']]
target = df['subject']

### Drop High Correlated Features

In [13]:
# Create correlation matrix
corr_matrix = data.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [14]:
data = data.drop(data[to_drop], axis=1)

### Scale Data Using StandardScaler

In [15]:
data = pd.DataFrame(data=StandardScaler().fit_transform(data), columns=data.columns, index=data.index)

In [16]:
data.head()

Unnamed: 0,nFFD,nTT,PrF,Pr1,Pr2,wpm
0,0.069595,0.012498,0.750449,-0.125077,0.731115,-0.635189
1,0.266575,0.251724,0.683876,-0.891346,1.26595,-0.536466
2,1.389101,0.964305,1.667681,-0.806205,2.032234,-1.071961
3,-0.723428,-0.805226,0.093592,0.2836,-0.139634,0.460714
4,-0.177462,-0.903443,-0.261465,1.100953,-1.070434,1.16518


### Train Test Split

In [17]:
train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.2)

In [18]:
print("train data:", train_data.shape)
print("train data:", test_data.shape)
print('train target:', train_target.shape)
print('test target:', test_target.shape)

train data: (8724, 6)
train data: (2181, 6)
train target: (8724,)
test target: (2181,)


### All Classifiers

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import plot_confusion_matrix

from tqdm import tqdm

In [20]:
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(5),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB()]

In [22]:
accuracy, precision, recall, f1, confusion = [], [], [], [], []

for name, clf in tqdm(zip(names, classifiers)):
    train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.25)
    clf.fit(train_data, train_target)
    
    accuracy.append(statistics.mean(cross_val_score(clf, train_data, train_target, cv=10, scoring="accuracy")))
    
    test_predict = cross_val_predict(clf, test_data, test_target, cv=10)
    precision.append(precision_score(test_target, test_predict, pos_label=1))
    recall.append(recall_score(test_target, test_predict, pos_label=1))
    f1.append(f1_score(test_target, test_predict, pos_label=1))
    
    confusion.append(confusion_matrix(test_target, test_predict))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
3it [23:14, 464.93s/it]


KeyboardInterrupt: 

In [None]:
fig, ax = plt.subplots(figsize=(18,8))

ax.plot(names, accuracy, label='Accuracy score')
ax.plot(names, precision, label='Precision score')
ax.plot(names, recall, label='Recall score')
ax.plot(names, f1, label='F1 score')

ax.scatter(names, accuracy)
ax.scatter(names, precision)
ax.legend(loc="lower right", frameon=True)

ax.set_title("Results in Subject Classification with Different Classifiers");