In [1]:
import numpy as np
import scipy.io
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
import random
import time 

import os
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, LeaveOneOut, GroupKFold
from sklearn.svm import LinearSVC, SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.metrics import balanced_accuracy_score, f1_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

from sklearn.utils import shuffle

from make_classification import *

# Import and prepare dataset

In [2]:
# Read the label table using the first CSV column as the index, then discard
# every row that contains a missing value before previewing it.
labels_raw = pd.read_csv('../../Dataset/labels.csv', index_col=0, header=0, sep=",")
labels = labels_raw.dropna()
labels.head()

Unnamed: 0_level_0,binary-stress,affect3-class
subject/task,Unnamed: 1_level_1,Unnamed: 2_level_1
2ea4_Breathing,0,0
2ea4_Counting1,1,2
2ea4_Counting2,1,2
2ea4_Counting3,1,2
2ea4_Math,1,2


In [4]:
# Feature table, read with the first CSV column as the index.
features_csv = '../Feature Extraction/Features/all_physiological_features.csv'
X = pd.read_csv(features_csv, index_col=0, header=0, sep=",")

# Classification

Feature selection uses either an L1 penalty or Recursive Feature Elimination (RFE). The optimal number of features is determined using RFECV. Several models are tested: Logistic Regression, Decision Trees, Random Forests, K-Nearest Neighbors, AdaBoost, Gradient Boosting, and Multi-Layer Perceptron.

The models are fitted 8 times, and the average scores over 8 repetitions are reported.

In [5]:
# Inner-join the features and the labels on their index to find the entries
# present in both tables, then restrict both tables to that shared index.
shared = pd.merge(X, labels, left_index=True, right_index=True)
idx = list(shared.index)
x = X.loc[idx]
labels = labels.loc[idx]

### binary stress

In [6]:
# Target for this section: the 'binary-stress' column; show its class balance.
y = labels.loc[:, 'binary-stress']
y.value_counts()

1    367
0    332
Name: binary-stress, dtype: int64

In [10]:
# Feature-selection mode passed on to the classification helpers.
feature_selector = "L1"

# Classifiers evaluated for the binary-stress target (order is preserved).
logreg = LogisticRegression(max_iter=5000, random_state=0)
forest = RandomForestClassifier(max_depth=5, random_state=0)
mlp = MLPClassifier(max_iter=5000, random_state=0)
list_classif = [logreg, forest, mlp]

# Alternatives that are currently disabled:
#   DecisionTreeClassifier(random_state=0)
#   KNeighborsClassifier(n_neighbors=3)
#   AdaBoostClassifier(n_estimators=100, random_state=0)
#   GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)

# Value forwarded as `n_splits` to the classification helpers.
n_splits = 8

In [12]:
# Evaluate every classifier in `list_classif` on the binary-stress target and
# display the averaged results.
# NOTE(review): presumably these splits are grouped by subject (the later
# "Random splits instead of grouped by subject" section suggests so) -- confirm
# against make_classification.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.661655,0.666257,4.31155
MLPClassifier,0.642117,0.640512,6.945
RandomForestClassifier,0.688788,0.693673,5.08013


In [13]:
# Standard deviation of each metric over the rows of `res`, per classifier.
# The metric columns are selected *before* aggregating so that std() is only
# computed on them; aggregating the whole frame first would fail on recent
# pandas versions if `res` carried any non-numeric column.
metric_cols = ['f1-score', 'accuracy', 'time']
res.groupby('classifier')[metric_cols].std()

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.036086,0.03174,0.556832
MLPClassifier,0.051204,0.048536,0.743022
RandomForestClassifier,0.078199,0.065909,0.98978


In [14]:
# Write the binary-stress results to disk. The output directory is created
# first so the export does not fail on a fresh checkout where it is missing.
stress_csv = Path('Results/phys_stress_classif.csv')
stress_csv.parent.mkdir(parents=True, exist_ok=True)
res.to_csv(stress_csv, sep=",", index=True)

#### Random splits instead of grouped by subject

In [15]:
# Same evaluation as above but with random splits. The result is stored under
# its own name so that `res` (and the matching `conf`) from the previous run are
# not silently replaced: re-running the std or CSV-export cells afterwards would
# otherwise operate on the random-split results.
res_random = make_nclassif_random_splits(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
avg_res(res_random)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.681774,0.683807,4.559778
MLPClassifier,0.659564,0.661865,6.067718
RandomForestClassifier,0.721195,0.72333,5.101117


### binary relax

In [12]:
# Replace `labels` with the supplementary label table (relax / valence /
# arousal), restricted to the index entries selected earlier in `idx`.
# NOTE(review): this file is read from '../Dataset/' whereas the main labels
# are read from '../../Dataset/' -- confirm which relative path is correct.
# NOTE(review): rows with any missing value are dropped before `.loc[idx]`;
# if a dropped row is listed in `idx`, `.loc` will raise a KeyError on recent
# pandas versions -- verify against the data.
supplementary = pd.read_csv('../Dataset/labels_supplementary.csv', index_col=0, header=0, sep=",")
labels = supplementary.dropna().loc[idx]
labels.head()

Unnamed: 0_level_0,binary-relax,binary-valence,binary-arousal
subject/task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2ea4_Breathing,1,0.0,0.0
2ea4_Counting1,0,0.0,1.0
2ea4_Counting2,0,0.0,1.0
2ea4_Counting3,0,0.0,1.0
2ea4_Math,1,0.0,1.0


In [13]:
# Target for this section: the 'binary-relax' column; show its class balance.
y = labels.loc[:, 'binary-relax']
y.value_counts()

0    380
1    319
Name: binary-relax, dtype: int64

In [14]:
# Feature-selection mode passed on to the classification helpers.
feature_selector = "L1"

# Classifiers evaluated for the binary-relax target (order is preserved).
logreg = LogisticRegression(max_iter=5000, random_state=12)
forest = RandomForestClassifier(max_depth=5, random_state=0)
mlp = MLPClassifier(max_iter=5000, random_state=0)
list_classif = [logreg, forest, mlp]

# Alternatives that are currently disabled:
#   DecisionTreeClassifier(random_state=0)
#   KNeighborsClassifier(n_neighbors=3)
#   AdaBoostClassifier(n_estimators=100, random_state=0)
#   GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)

# Value forwarded as `n_splits` to the classification helpers.
n_splits = 8

In [15]:
# Evaluate every classifier in `list_classif` on the binary-relax target and
# display the averaged results.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.628639,0.627793,7.16606
MLPClassifier,0.595118,0.591157,9.739649
RandomForestClassifier,0.678512,0.678521,6.81266


In [16]:
# Standard deviation of each metric over the rows of `res`, per classifier.
# The metric columns are selected *before* aggregating so that std() is only
# computed on them; aggregating the whole frame first would fail on recent
# pandas versions if `res` carried any non-numeric column.
metric_cols = ['f1-score', 'accuracy', 'time']
res.groupby('classifier')[metric_cols].std()

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.062545,0.069754,2.754281
MLPClassifier,0.071565,0.072989,3.126904
RandomForestClassifier,0.069942,0.056677,1.741623


In [None]:
# Export of the binary-relax results is currently disabled. Uncomment the line
# below to write `res` to Results/phys_relax_classif.csv (run it while `res`
# still holds the binary-relax results).
#res.to_csv('Results/phys_relax_classif.csv', sep=",", index=True)

### binary arousal

In [18]:
# Target for this section: the 'binary-arousal' column; show its class balance.
# NOTE(review): the counts displayed below add up to 678 whereas the other
# targets add up to 699, and value_counts() skips missing values by default --
# this column may contain NaN; confirm make_nclassif handles that.
y = labels.loc[:, 'binary-arousal']
y.value_counts()

1.0    484
0.0    194
Name: binary-arousal, dtype: int64

In [19]:
# Feature-selection mode passed on to the classification helpers.
feature_selector = "L1"

# Classifiers evaluated for the binary-arousal target (order is preserved).
logreg = LogisticRegression(max_iter=5000, random_state=12)
forest = RandomForestClassifier(max_depth=5, random_state=0)
mlp = MLPClassifier(max_iter=5000, random_state=0)
list_classif = [logreg, forest, mlp]

# Alternatives that are currently disabled:
#   DecisionTreeClassifier(random_state=0)
#   KNeighborsClassifier(n_neighbors=3)
#   AdaBoostClassifier(n_estimators=100, random_state=0)
#   GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)

# Value forwarded as `n_splits` to the classification helpers.
n_splits = 8

In [20]:
# Evaluate every classifier in `list_classif` on the binary-arousal target and
# display the averaged results.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.658597,0.565929,10.785414
MLPClassifier,0.619701,0.523413,12.877882
RandomForestClassifier,0.653728,0.550282,9.115002


In [21]:
# Standard deviation of each metric over the rows of `res`, per classifier.
# The metric columns are selected *before* aggregating so that std() is only
# computed on them; aggregating the whole frame first would fail on recent
# pandas versions if `res` carried any non-numeric column.
metric_cols = ['f1-score', 'accuracy', 'time']
res.groupby('classifier')[metric_cols].std()

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.075452,0.060482,3.499068
MLPClassifier,0.062078,0.035147,3.325026
RandomForestClassifier,0.073278,0.02826,1.418386


In [None]:
# Export of the binary-arousal results is currently disabled. Uncomment the
# line below to write `res` to Results/phys_arousal_classif.csv (run it while
# `res` still holds the binary-arousal results).
#res.to_csv('Results/phys_arousal_classif.csv', sep=",", index=True)

### binary valence

In [22]:
# Target for this section: the 'binary-valence' column; show its class balance.
# NOTE(review): the counts displayed below add up to 678 whereas the
# binary-stress and binary-relax targets add up to 699, and value_counts()
# skips missing values by default -- this column may contain NaN; confirm
# make_nclassif handles that.
y = labels.loc[:, 'binary-valence']
y.value_counts()

0.0    340
1.0    338
Name: binary-valence, dtype: int64

In [23]:
# Feature-selection mode passed on to the classification helpers.
feature_selector = "L1"

# Classifiers evaluated for the binary-valence target.
# Every estimator that accepts a seed now gets one: the MLP is initialised and
# trained stochastically, so without `random_state` the scores reported for
# this section change from run to run (the other sections already seed it).
list_classif = [
    LogisticRegression(max_iter=5000, random_state=0),
    #DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=5, random_state=1),
    KNeighborsClassifier(n_neighbors=3),
    #AdaBoostClassifier(n_estimators=100, random_state=0),
    #GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0),
    MLPClassifier(max_iter=5000, random_state=0),
]

# Value forwarded as `n_splits` to the classification helpers.
n_splits = 8

In [24]:
# Evaluate every classifier in `list_classif` on the binary-valence target and
# display the averaged results.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNeighborsClassifier,0.536511,0.536262,8.844893
LogisticRegression,0.604341,0.603002,9.746196
MLPClassifier,0.544609,0.548216,10.592978
RandomForestClassifier,0.641748,0.651092,10.582728


In [25]:
# Standard deviation of each metric over the rows of `res`, per classifier.
# The metric columns are selected *before* aggregating so that std() is only
# computed on them; aggregating the whole frame first would fail on recent
# pandas versions if `res` carried any non-numeric column.
metric_cols = ['f1-score', 'accuracy', 'time']
res.groupby('classifier')[metric_cols].std()

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNeighborsClassifier,0.025363,0.02292,2.634604
LogisticRegression,0.045139,0.049059,4.397
MLPClassifier,0.059994,0.053753,1.600675
RandomForestClassifier,0.079481,0.072109,5.79774


In [None]:
# Export of the binary-valence results is currently disabled. Uncomment the
# line below to write `res` to Results/phys_valence_classif.csv (run it while
# `res` still holds the binary-valence results).
#res.to_csv('Results/phys_valence_classif.csv', sep=",", index=True)