In [1]:
import numpy as np
import scipy.io
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
import random
import time 

import os
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, LeaveOneOut, GroupKFold
from sklearn.svm import LinearSVC, SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.metrics import balanced_accuracy_score, f1_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

from sklearn.utils import shuffle

from ..make_classification import *

# Import and prepare dataset

In [2]:
# Read the label table (first CSV column becomes the index) and discard any
# row that contains a missing value, then preview the first rows.
labels = (
    pd.read_csv('../Dataset/labels.csv', sep=",", header=0, index_col=0)
    .dropna()
)
labels.head()

Unnamed: 0_level_0,binary-stress,affect3-class
subject/task,Unnamed: 1_level_1,Unnamed: 2_level_1
2ea4_Breathing,0,0
2ea4_Counting1,1,2
2ea4_Counting2,1,2
2ea4_Counting3,1,2
2ea4_Math,1,2


In [26]:
# Load the HC feature table; the file has no header row and its first column
# is used as the sample index.
x = pd.read_csv('Features/HCfeatures.csv', sep=",", header=None, index_col=0)
# Keep only the part of each index entry before the first '.' (presumably
# dropping a file extension) so the index can be matched against `labels`.
# The index is taken from `x` itself (the original referenced an undefined `X`)
# and the result is reassigned because `set_axis(..., inplace=True)` is not
# supported by recent pandas versions.
x = x.set_axis([i.split('.')[0] for i in list(x.index)], axis='index')

#### For W2V features

In [20]:
from ast import literal_eval


def process_row(row):
    """Parse the Python literals stored as text in columns 1 and 2 and join them with `+`.

    Presumably both columns hold list literals, so `+` concatenates them.
    """
    return literal_eval(row[1]) + literal_eval(row[2])


def process_row_quick(row):
    """Parse only the Python literal stored as text in column 1."""
    return literal_eval(row[1])


# Expand every row of `x` into its parsed values (one value per column, 512
# columns labelled 1..512). A comprehension is used rather than a
# `for idx, row in ...` loop so that the notebook-level `idx` (used by other
# cells for `.loc[idx]`) is not silently overwritten with a single row label.
new_idx = list(x.index)
new_x = pd.DataFrame(
    data=[process_row_quick(row) for _, row in x.iterrows()],
    index=new_idx,
    columns=range(1, 513),
)
x = new_x

# Classification

Feature selection uses either an L1 penalty or Recursive Feature Elimination (RFE). The optimal number of features is determined using RFECV. Several models are tested: Logistic Regression, Decision Trees, Random Forests, K-Nearest Neighbors, AdaBoost, Gradient Boosting, and Multi-Layer Perceptron. 

The models are fitted 8 times, and the average scores over 8 repetitions are reported.

In [27]:
# Index-on-index merge (pandas' default inner join) keeps only the samples
# present in both the feature table and the label table.
idx = x.merge(labels, left_index=True, right_index=True).index.tolist()
# Restrict both tables to those samples, in the same row order.
labels, x = labels.loc[idx], x.loc[idx]

### binary stress

In [28]:
# Target vector for this section, followed by its class distribution.
y = labels.loc[:, 'binary-stress']
y.value_counts()

1    263
0    115
Name: binary-stress, dtype: int64

In [29]:
# Settings consumed by the make_nclassif call for the binary-stress target.
n_splits = 8
feature_selector = "L1"

# Classifiers to evaluate; commented-out entries are currently disabled candidates.
list_classif = [
    LogisticRegression(max_iter=5000, random_state=0),
    # DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
    # KNeighborsClassifier(n_neighbors=3),
    # AdaBoostClassifier(n_estimators=100, random_state=0),
    # GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0),
    # NOTE(review): hidden_layer_sizes=[] differs from the default used in the
    # other sections of this notebook — confirm that this is intended.
    MLPClassifier(max_iter=5000, random_state=0, hidden_layer_sizes=[]),
]

In [30]:
# Evaluate every configured classifier on (x, y). make_nclassif and avg_res are
# presumably provided by the star import from make_classification — confirm.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
# Display the summary produced by avg_res for this run.
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.58559,0.531664,3.558039
MLPClassifier,0.603736,0.548665,4.495873
RandomForestClassifier,0.57271,0.499439,4.416487


In [25]:
# Standard deviation of each reported metric, grouped by classifier, over the
# rows of `res` (presumably one row per split — confirm).
(
    res.groupby(['classifier'])
    .std()
    .loc[:, ['f1-score', 'accuracy', 'time']]
)

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.109353,0.08857,8.32358
MLPClassifier,0.10616,0.075709,10.770314
RandomForestClassifier,0.165949,0.065991,5.950833


### binary relax

In [None]:
# Load the supplementary label table and drop rows with missing values.
labels = pd.read_csv('../Dataset/labels_supplementary.csv', sep=",", header=0, index_col=0).dropna()
# dropna() may remove samples that are still listed in `idx`, and `.loc` with a
# list containing absent labels raises KeyError. Keep only the samples that are
# still labelled, then realign the features so `x` and `labels` share the same
# rows in the same order.
idx = [sample for sample in idx if sample in labels.index]
labels = labels.loc[idx]
x = x.loc[idx]
labels.head()

In [9]:
# Target vector for this section, followed by its class distribution.
y = labels.loc[:, 'binary-relax']
y.value_counts()

0    256
1    115
Name: binary-relax, dtype: int64

In [10]:
# Settings consumed by the make_nclassif call for the binary-relax target.
n_splits = 8
feature_selector = "L1"

# Classifiers to evaluate; commented-out entries are currently disabled candidates.
# NOTE(review): the saved output of this section still lists LogisticRegression
# although it is disabled here — re-run the section or re-enable it.
list_classif = [
    # LogisticRegression(max_iter=5000, random_state=0),
    # DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
    # KNeighborsClassifier(n_neighbors=3),
    # AdaBoostClassifier(n_estimators=100, random_state=0),
    # GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0),
    MLPClassifier(max_iter=5000, random_state=0),
]

In [11]:
# Evaluate every configured classifier on (x, y). make_nclassif and avg_res are
# presumably provided by the star import from make_classification — confirm.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
# Display the summary produced by avg_res for this run.
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.593999,0.522903,4.239737
MLPClassifier,0.629913,0.561345,4.831971
RandomForestClassifier,0.599594,0.523884,5.810274


In [12]:
# Standard deviation of each reported metric, grouped by classifier, over the
# rows of `res` (presumably one row per split — confirm).
(
    res.groupby(['classifier'])
    .std()
    .loc[:, ['f1-score', 'accuracy', 'time']]
)

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.119103,0.076579,1.54954
MLPClassifier,0.101724,0.072423,0.623297
RandomForestClassifier,0.133941,0.062763,2.236864


### binary arousal

In [14]:
# Target vector for this section, followed by its class distribution.
y = labels.loc[:, 'binary-arousal']
y.value_counts()

1.0    314
0.0     57
Name: binary-arousal, dtype: int64

In [15]:
# Settings consumed by the make_nclassif call for the binary-arousal target.
n_splits = 8
feature_selector = "L1"

# Classifiers to evaluate; commented-out entries are currently disabled candidates.
# NOTE(review): the saved output of this section still lists LogisticRegression
# although it is disabled here — re-run the section or re-enable it.
list_classif = [
    # LogisticRegression(max_iter=5000, random_state=0),
    # DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
    # KNeighborsClassifier(n_neighbors=3),
    # AdaBoostClassifier(n_estimators=100, random_state=0),
    # GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0),
    MLPClassifier(max_iter=5000, random_state=0),
]

In [16]:
# Evaluate every configured classifier on (x, y). make_nclassif and avg_res are
# presumably provided by the star import from make_classification — confirm.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
# Display the summary produced by avg_res for this run.
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.797395,0.53071,3.367436
MLPClassifier,0.788262,0.514279,4.245142
RandomForestClassifier,0.789656,0.504464,3.706059


In [17]:
# Standard deviation of each reported metric, grouped by classifier, over the
# rows of `res` (presumably one row per split — confirm).
(
    res.groupby(['classifier'])
    .std()
    .loc[:, ['f1-score', 'accuracy', 'time']]
)

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.093647,0.132637,0.397595
MLPClassifier,0.103546,0.097728,1.359546
RandomForestClassifier,0.138373,0.012627,0.571468


### binary valence

In [23]:
# Target vector for this section, followed by its class distribution.
y = labels.loc[:, 'binary-valence']
y.value_counts()

0.0    214
1.0    157
Name: binary-valence, dtype: int64

In [24]:
# Settings consumed by the make_nclassif call for the binary-valence target.
n_splits = 8
feature_selector = "L1"

# Classifiers to evaluate; commented-out entries are currently disabled candidates.
# NOTE(review): the saved output of this section still lists LogisticRegression
# although it is disabled here — re-run the section or re-enable it.
list_classif = [
    # LogisticRegression(max_iter=5000, random_state=0),
    # DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
    # KNeighborsClassifier(n_neighbors=3),
    # AdaBoostClassifier(n_estimators=100, random_state=0),
    # GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0),
    MLPClassifier(max_iter=5000, random_state=0),
]

In [25]:
# Evaluate every configured classifier on (x, y). make_nclassif and avg_res are
# presumably provided by the star import from make_classification — confirm.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
# Display the summary produced by avg_res for this run.
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.55419,0.561776,5.200261
MLPClassifier,0.535565,0.542059,6.271982
RandomForestClassifier,0.463954,0.491355,5.412028


In [26]:
# Standard deviation of each reported metric, grouped by classifier, over the
# rows of `res` (presumably one row per split — confirm).
(
    res.groupby(['classifier'])
    .std()
    .loc[:, ['f1-score', 'accuracy', 'time']]
)

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.093945,0.078137,2.272301
MLPClassifier,0.091954,0.075929,1.689049
RandomForestClassifier,0.100991,0.046157,2.318038
