In [1]:
seed = 42
import pandas as pd
import numpy as np
import random
np.random.seed(seed)
random.seed(seed)

Load the data

In [2]:
data = pd.read_csv('data/EEG_data.csv')
data.columns

"""
load subtitle vectors  
"""
sub_vec_path = 'data/subtitles/subtitle_vecs.npy'
sub_vecs = np.load(sub_vec_path)
sub_vec_dim = sub_vecs.shape[1]

"""
Make a dataset of original data combined with sub vecs 
"""
dataset = np.hstack((data.values.astype('float32'), sub_vecs))


PCA to reduce dimension of the word average vectors (might give better results) to reduce training speed

In [3]:
from sklearn.decomposition import PCA

sub_vec_dim = 12
pca = PCA(n_components=sub_vec_dim)
pcad_sub_vecs = pca.fit_transform(sub_vecs)
dataset = np.hstack((data.values.astype('float32'), pcad_sub_vecs))

Define target variable and training variables

In [4]:
# The student's confusion column index is 14, predefined label index is 13
y_col = 14
orig_train_data_cols = list(range(2,y_col))
vector_cols = list(np.arange(sub_vec_dim) + 15) 

train_cols = orig_train_data_cols + vector_cols
n_dims = len(train_cols)

Functions to make interval counts even for all data points

In [5]:
def min_max_rows_per_subject_vid(X):
    VideoID = list(set(X[:,1]))
    SubjectID = list(set(X[:,0]))

    max_intervals = 0 # length of signal
    min_intervals = len(X)

    for subId in SubjectID:
        for vidId in VideoID:
            X_tmp=X[(X[:, 0] == subId) & (X[:, 1] == vidId)]
            max_intervals = max(len(X_tmp), max_intervals)
            min_intervals = min(len(X_tmp), min_intervals)
    print(max_intervals)
    print(min_intervals)
    assert max_intervals == 144
    return min_intervals, max_intervals

min_intervals, max_intervals = min_max_rows_per_subject_vid(dataset)



def zero_pad_data(data, max_intervals, train_cols, y_col):
    X_pad = None
    y = []
    VideoID = list(set(data[:,1]))
    SubjectID = list(set(data[:,0])) 
    for subId in SubjectID:
        for vidId in VideoID:
            data_sv = data[(data[:,0]==subId) & (data[:,1]==vidId)]

            y.append(data_sv[:, y_col].mean())

            X_sv = data_sv[:, train_cols]
            
            pad_len = max_intervals - X_sv.shape[0]
    
            z = np.zeros((pad_len, X_sv.shape[1]), dtype=X_sv.dtype)
            z[:,0] = X_sv[:,0][pad_len]
            z[:,1] = X_sv[:,1][pad_len]
            
            X_sv_pad = np.concatenate((X_sv, z), axis=0)
            X_sv_pad = X_sv_pad.reshape(1, -1)

            X_pad = X_sv_pad if X_pad is None else np.vstack((X_pad, X_sv_pad))
            
    return X_pad, np.array(y)

def truncate_data(data, min_intervals, train_cols, y_col):
    X_trunc = None
    y = []
    VideoID = list(set(data[:,1]))
    SubjectID = list(set(data[:,0]))
    for vidId in VideoID:
        for subId in SubjectID:
            data_sv = data[(data[:,0]==subId) & (data[:,1]==vidId)]
 
            y.append(data_sv[:, y_col].mean())

            X_sv = data_sv[:, train_cols]
            trunc_len = min_intervals
            X_sv_trunc = X_sv[0:trunc_len].reshape(1, -1)
            X_trunc = X_sv_trunc if X_trunc is None else np.vstack((X_trunc, X_sv_trunc))
    return X_trunc, np.array(y)

144
112


In [6]:
"""
Cross-validate
"""
from time import time
from sklearn.metrics import precision_recall_fscore_support as pr_f1, accuracy_score as accur, matthews_corrcoef, \
roc_auc_score
from sklearn.base import clone, BaseEstimator
from time import time
import numpy as np

def get_scores(modelname, y, y_hat):
    acc = accur(y, y_hat)
    matt = matthews_corrcoef(y, y_hat)
    p, r, f1, sup = pr_f1(y, y_hat)
    roc_auc = roc_auc_score(y, y_hat)
    return acc, f1[1], f1[0], roc_auc

def sklearn_cross_validate(modelname, model, data, intervals, even_data, n_test=2):
    """
    even_data: either truncate_data or zero_pad_data to make number of intervals
             even for each data point
    """
    results = []
    start = time()
    for i in range(0, 10, n_test):
        cv_model = clone(model)
        
        
        X_train, y_train = even_data(data[np.in1d(data[:,0], (i, i+1), invert=True)],
                                     intervals, train_cols, y_col)
        X_test, y_test = even_data(data[np.in1d(data[:,0], (i, i+1))],
                                   intervals, train_cols, y_col)
        
        print(X_train.shape)
        #print(y_test)
        cv_model.fit(X_train, y_train)
        
        print('{}-fold cross validation for model {}, iteration {}'
              .format(int(10/n_test), modelname, len(results) +1))
        acc, f1, f1_flip, roc_auc = get_scores('svm', y_test, np.round(cv_model.predict(X_test)))

        results.append({'acc': acc, 'F1': f1, 'F1-flipped': f1_flip, 'roc-auc': roc_auc})
        result_means = {key:np.mean([r[key] for r in results]) for key in results[0].keys()}
        print('current cross-validation mean accuracy: {:.3f}, f1: {:.3f}, f1 flipped: {:.3f} and roc-auc: {:.3f}'.format(
              *result_means.values()))
    print('cross-validation total time: {:.3f} seconds'.format(time() - start))
    return result_means, results


Test cross-validation with Support Vector Classifier

In [7]:
print(dataset.shape)
from sklearn.svm import SVC
model = SVC(C=1, kernel='linear')
result_means, results = sklearn_cross_validate('svc', model, dataset, max_intervals, zero_pad_data)

(12811, 27)
(80, 3456)
5-fold cross validation for model svc, iteration 1
current cross-validation mean accuracy: 0.550, f1: 0.571, f1 flipped: 0.526 and roc-auc: 0.561
(80, 3456)
5-fold cross validation for model svc, iteration 2
current cross-validation mean accuracy: 0.575, f1: 0.586, f1 flipped: 0.563 and roc-auc: 0.580
(80, 3456)
5-fold cross validation for model svc, iteration 3
current cross-validation mean accuracy: 0.567, f1: 0.566, f1 flipped: 0.566 and roc-auc: 0.581
(80, 3456)
5-fold cross validation for model svc, iteration 4
current cross-validation mean accuracy: 0.562, f1: 0.542, f1 flipped: 0.577 and roc-auc: 0.579
(80, 3456)
5-fold cross validation for model svc, iteration 5
current cross-validation mean accuracy: 0.540, f1: 0.552, f1 flipped: 0.492 and roc-auc: 0.561
cross-validation total time: 0.195 seconds




Helper functions to save models in a format that can be used to create a pandas DataFrame

In [8]:
def init_scores():
    return {'Model':[], 'Accuracy':[], 'F1':[], 'F1 flipped':[], 'ROC-AUC':[]}

def save_model_score(scores, name, results):
    scores['Model'].append(name)
    scores['Accuracy'].append(results['acc'])
    scores['F1'].append(results['F1'])
    scores['F1 flipped'].append(results['F1-flipped'])
    scores['ROC-AUC'].append(results['roc-auc'])

In [9]:
class Naive(BaseEstimator):
    def __init__(self):
        self.X = None
        self.y = None
    
    def fit(self, X, y):
        self.X = X
        self.y = y
    
    def predict(self, X):
        return np.zeros(len(X))
    
    def predict_proba(self, X):
        return np.zeros(len(X))
    
    def decision_function(self, X):
        return np.zeros(len(X))

Perform cross-validation on multiple models 

In [10]:
import warnings

def warn(*args, **kwargs):
    pass

old_warn = warnings.warn
warnings.warn = warn

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model as lm
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.model_selection import cross_val_score

scores = init_scores()
sk_scores = init_scores()

seed = np.random.randint(0, 2000)
models = {'Naive': Naive(),
          'Logreg': lm.LogisticRegression(random_state=seed),
          'Ptron': lm.Perceptron(random_state=seed),
          'KNN': KNeighborsClassifier(n_neighbors=5),
          'GNB': GaussianNB(),
          'BNB': BernoulliNB(),
          'RF': RandomForestClassifier(random_state=seed),
          'GBT': GradientBoostingClassifier(n_estimators=777,
                                            random_state=seed),
          'MLP2': MLP(hidden_layer_sizes=((32,)*2), random_state=seed,
                       solver='adam', activation='logistic'),
          'MLP3': MLP(hidden_layer_sizes=((16, 8, 16)), random_state=seed,
                       solver='adam', activation='logistic'),
          'SVC Linear': SVC(C=1, kernel='linear', random_state=seed)}

for name, model in models.items():
    result_means, results = sklearn_cross_validate(name, model, dataset, min_intervals, truncate_data)
    save_model_score(scores, name, result_means)
    X, y = truncate_data(dataset, min_intervals, train_cols, y_col)
    result_means={key:np.mean(cross_val_score(model, X,y, cv=5, n_jobs=-1, scoring=metric))
                  for key, metric in {'F1':'f1', 'acc':'accuracy', 'roc-auc':'roc_auc'}.items()}
    result_means['F1-flipped'] = 0.
    save_model_score(sk_scores, name, result_means)

(80, 2688)
5-fold cross validation for model Naive, iteration 1
current cross-validation mean accuracy: 0.550, f1: 0.000, f1 flipped: 0.710 and roc-auc: 0.500
(80, 2688)
5-fold cross validation for model Naive, iteration 2
current cross-validation mean accuracy: 0.525, f1: 0.000, f1 flipped: 0.688 and roc-auc: 0.500
(80, 2688)
5-fold cross validation for model Naive, iteration 3
current cross-validation mean accuracy: 0.483, f1: 0.000, f1 flipped: 0.649 and roc-auc: 0.500
(80, 2688)
5-fold cross validation for model Naive, iteration 4
current cross-validation mean accuracy: 0.475, f1: 0.000, f1 flipped: 0.642 and roc-auc: 0.500
(80, 2688)
5-fold cross validation for model Naive, iteration 5
current cross-validation mean accuracy: 0.490, f1: 0.000, f1 flipped: 0.656 and roc-auc: 0.500
cross-validation total time: 0.119 seconds


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


(80, 2688)
5-fold cross validation for model Logreg, iteration 1
current cross-validation mean accuracy: 0.700, f1: 0.667, f1 flipped: 0.727 and roc-auc: 0.697
(80, 2688)
5-fold cross validation for model Logreg, iteration 2
current cross-validation mean accuracy: 0.700, f1: 0.697, f1 flipped: 0.697 and roc-auc: 0.698
(80, 2688)
5-fold cross validation for model Logreg, iteration 3
current cross-validation mean accuracy: 0.583, f1: 0.610, f1 flipped: 0.543 and roc-auc: 0.577
(80, 2688)
5-fold cross validation for model Logreg, iteration 4
current cross-validation mean accuracy: 0.637, f1: 0.652, f1 flipped: 0.612 and roc-auc: 0.637
(80, 2688)
5-fold cross validation for model Logreg, iteration 5
current cross-validation mean accuracy: 0.630, f1: 0.649, f1 flipped: 0.601 and roc-auc: 0.633
cross-validation total time: 0.481 seconds
(80, 2688)
5-fold cross validation for model Ptron, iteration 1
current cross-validation mean accuracy: 0.650, f1: 0.533, f1 flipped: 0.720 and roc-auc: 0.63

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


5-fold cross validation for model MLP2, iteration 2
current cross-validation mean accuracy: 0.475, f1: 0.583, f1 flipped: 0.222 and roc-auc: 0.500
(80, 2688)
5-fold cross validation for model MLP2, iteration 3
current cross-validation mean accuracy: 0.433, f1: 0.389, f1 flipped: 0.321 and roc-auc: 0.479
(80, 2688)
5-fold cross validation for model MLP2, iteration 4
current cross-validation mean accuracy: 0.462, f1: 0.469, f1 flipped: 0.241 and roc-auc: 0.484
(80, 2688)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


5-fold cross validation for model MLP2, iteration 5
current cross-validation mean accuracy: 0.430, f1: 0.458, f1 flipped: 0.218 and roc-auc: 0.452
cross-validation total time: 1.150 seconds
(80, 2688)
5-fold cross validation for model MLP3, iteration 1
current cross-validation mean accuracy: 0.450, f1: 0.621, f1 flipped: 0.000 and roc-auc: 0.500
(80, 2688)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


5-fold cross validation for model MLP3, iteration 2
current cross-validation mean accuracy: 0.475, f1: 0.644, f1 flipped: 0.000 and roc-auc: 0.500
(80, 2688)
5-fold cross validation for model MLP3, iteration 3
current cross-validation mean accuracy: 0.450, f1: 0.429, f1 flipped: 0.190 and roc-auc: 0.500
(80, 2688)


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


5-fold cross validation for model MLP3, iteration 4
current cross-validation mean accuracy: 0.488, f1: 0.481, f1 flipped: 0.282 and roc-auc: 0.524
(80, 2688)
5-fold cross validation for model MLP3, iteration 5
current cross-validation mean accuracy: 0.480, f1: 0.509, f1 flipped: 0.225 and roc-auc: 0.519
cross-validation total time: 0.663 seconds


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


(80, 2688)
5-fold cross validation for model SVC Linear, iteration 1
current cross-validation mean accuracy: 0.650, f1: 0.632, f1 flipped: 0.667 and roc-auc: 0.652
(80, 2688)
5-fold cross validation for model SVC Linear, iteration 2
current cross-validation mean accuracy: 0.625, f1: 0.530, f1 flipped: 0.679 and roc-auc: 0.626
(80, 2688)
5-fold cross validation for model SVC Linear, iteration 3
current cross-validation mean accuracy: 0.617, f1: 0.576, f1 flipped: 0.620 and roc-auc: 0.612
(80, 2688)
5-fold cross validation for model SVC Linear, iteration 4
current cross-validation mean accuracy: 0.675, f1: 0.649, f1 flipped: 0.671 and roc-auc: 0.670
(80, 2688)
5-fold cross validation for model SVC Linear, iteration 5
current cross-validation mean accuracy: 0.690, f1: 0.672, f1 flipped: 0.684 and roc-auc: 0.688
cross-validation total time: 0.206 seconds


Plot the results

In [11]:
%matplotlib notebook
from matplotlib import pyplot as plt

scores_df = pd.DataFrame(data=scores)
print(scores_df)
print(scores_df.Model.values)


ax = scores_df.plot(kind='line', x='Model', y=['Accuracy', 'ROC-AUC', 'F1'], xticks=range(len(models) + 1),
                    yticks=np.arange(10) / 10, color=['black', 'orange', 'green'])
ax.set_xticklabels(list(scores_df.Model))
for p in ax.patches:
     ax.annotate('{:.2f}'.format(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.02))
ax.axhline(y=0.7, xmin=0.0, xmax=1.0, color='black', linewidth=.1)
ax.axhline(y=0.6, xmin=0.0, xmax=1.0, color='black', linewidth=.1)
plt.tight_layout()
plt.show()

         Model  Accuracy        F1  F1 flipped   ROC-AUC
0        Naive      0.49  0.000000    0.655628  0.500000
1       Logreg      0.63  0.648573    0.600594  0.632929
2        Ptron      0.58  0.512821    0.605639  0.588308
3          KNN      0.68  0.612605    0.721140  0.696263
4          GNB      0.67  0.684314    0.596215  0.670404
5          BNB      0.57  0.560606    0.570760  0.579621
6           RF      0.56  0.508319    0.595080  0.570985
7          GBT      0.70  0.701445    0.682305  0.707904
8         MLP2      0.43  0.458498    0.217593  0.452146
9         MLP3      0.48  0.508882    0.225397  0.519192
10  SVC Linear      0.69  0.671657    0.683869  0.688182
['Naive' 'Logreg' 'Ptron' 'KNN' 'GNB' 'BNB' 'RF' 'GBT' 'MLP2' 'MLP3'
 'SVC Linear']


<IPython.core.display.Javascript object>

In [12]:
scores_df = pd.DataFrame(data=sk_scores)
print(scores_df)
print(scores_df.Model.values)


ax = scores_df.plot(kind='line', x='Model', y=['Accuracy', 'ROC-AUC', 'F1'], xticks=range(len(models) + 1),
                    yticks=np.arange(10) / 10, color=['black', 'orange', 'green'])
ax.set_xticklabels(list(scores_df.Model))
for p in ax.patches:
     ax.annotate('{:.2f}'.format(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.02))
ax.axhline(y=0.7, xmin=0.0, xmax=1.0, color='black', linewidth=.1)
ax.axhline(y=0.6, xmin=0.0, xmax=1.0, color='black', linewidth=.1)
plt.tight_layout()
plt.show()

         Model  Accuracy        F1  F1 flipped   ROC-AUC
0        Naive  0.490000  0.000000         0.0  0.500000
1       Logreg  0.570501  0.566370         0.0  0.581253
2        Ptron  0.599599  0.610606         0.0  0.668242
3          KNN  0.681654  0.595207         0.0  0.691525
4          GNB  0.600602  0.628480         0.0  0.645697
5          BNB  0.488070  0.366609         0.0  0.415202
6           RF  0.588170  0.548792         0.0  0.608141
7          GBT  0.612130  0.554094         0.0  0.682778
8         MLP2  0.450025  0.571593         0.0  0.435838
9         MLP3  0.510025  0.675431         0.0  0.539020
10  SVC Linear  0.610652  0.593323         0.0  0.628929
['Naive' 'Logreg' 'Ptron' 'KNN' 'GNB' 'BNB' 'RF' 'GBT' 'MLP2' 'MLP3'
 'SVC Linear']


<IPython.core.display.Javascript object>

In [13]:
print(dataset.shape)
print(len(train_cols))

(12811, 27)
24
