In [1]:
import numpy as np
import scipy.io
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
import random
import time 

import os
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, LeaveOneOut, GroupKFold
from sklearn.svm import LinearSVC, SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.metrics import balanced_accuracy_score, f1_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

from sklearn.utils import shuffle

from ..make_classification import *

# Import and prepare dataset

In [2]:
# Load the label table (first CSV column becomes the index) and discard
# every row that has a missing value in any label column.
labels = (
    pd.read_csv('../Dataset/labels.csv', sep=",", header=0, index_col=0)
    .dropna()
)
labels.head()

Unnamed: 0_level_0,binary-stress,affect3-class
subject/task,Unnamed: 1_level_1,Unnamed: 2_level_1
2ea4_Breathing,0,0
2ea4_Counting1,1,2
2ea4_Counting2,1,2
2ea4_Counting3,1,2
2ea4_Math,1,2


In [3]:
# Load the feature table; the first CSV column is used as the row index so it
# can be matched against the index of `labels`.
X = pd.read_csv(
    'Features/video11tasks_aus_gaze_mean_std.csv',
    sep=",",
    header=0,
    index_col=0,
)

# Classification of Stress

Feature selection using L1 penalty or Recursive Feature Elimination (RFE). The optimal number of features is determined using RFECV. Several models are tested: Logistic Regression, Decision Trees, Random Forests, K-Nearest Neighbors, AdaBoost, Gradient Boosting, and Multi-Layer Perceptron.

The models are fitted once per split (`n_splits = 8` in the cells below), and the average scores over those splits are reported.

In [4]:
# Keep only the rows whose index appears in both the feature table and the
# label table (inner join on the index), then restrict both tables to them.
idx = X.merge(labels, left_index=True, right_index=True).index.tolist()
x = X.loc[idx]
labels = labels.loc[idx]

### binary stress

In [5]:
# Target for this section: the 'binary-stress' column. Show the class counts.
y = labels.loc[:, 'binary-stress']
y.value_counts()

1    317
0    283
Name: binary-stress, dtype: int64

In [6]:
# Feature-selection strategy and number of splits passed to make_nclassif.
feature_selector = "L1"
n_splits = 8

# Classifiers evaluated in this section.
list_classif = [
    LogisticRegression(max_iter=5000, random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
    # hidden_layer_sizes=[] configures the MLP with no hidden layer.
    MLPClassifier(max_iter=5000, random_state=0, hidden_layer_sizes=[]),
]
# Disabled alternatives, kept for reference:
#   DecisionTreeClassifier(random_state=0)
#   KNeighborsClassifier(n_neighbors=3)
#   AdaBoostClassifier(n_estimators=100, random_state=0)
#   GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)

In [7]:
# Evaluate every classifier in `list_classif` on (x, y), then display the
# summary that avg_res builds from the collected results.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.624522,0.622246,1.35913
MLPClassifier,0.611502,0.607282,2.154424
RandomForestClassifier,0.616471,0.625818,1.869584


In [8]:
# Standard deviation of each metric per classifier, over the rows of `res`.
# The metric columns are selected *before* aggregating so that any other
# (possibly non-numeric) column in `res` can neither break .std() on
# pandas >= 2.0 nor be aggregated needlessly. The table is unchanged.
res.groupby('classifier')[['f1-score', 'accuracy', 'time']].std()

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.048669,0.063066,0.530415
MLPClassifier,0.059227,0.066945,0.616212
RandomForestClassifier,0.069767,0.054361,0.561236


### binary relax

In [9]:
# Load the supplementary labels, drop rows with any missing value, and keep
# the rows listed in `idx` (computed earlier) so they stay aligned with `x`.
# NOTE: this rebinds `labels`; the 'binary-stress' column is no longer
# reachable through this name after the cell has run.
labels = (
    pd.read_csv('../Dataset/labels_supplementary.csv', sep=",", header=0, index_col=0)
    .dropna()
    .loc[idx]
)
labels.head()

Unnamed: 0_level_0,binary-relax,binary-valence,binary-arousal
subject/task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2ea4_Breathing,1,0.0,0.0
2ea4_Counting1,0,0.0,1.0
2ea4_Counting2,0,0.0,1.0
2ea4_Counting3,0,0.0,1.0
2ea4_Math,1,0.0,1.0


In [10]:
# Target for this section: the 'binary-relax' column. Show the class counts.
y = labels.loc[:, 'binary-relax']
y.value_counts()

0    331
1    269
Name: binary-relax, dtype: int64

In [11]:
# Feature-selection strategy and number of splits passed to make_nclassif.
feature_selector = "L1"
n_splits = 8

# Classifiers evaluated in this section.
list_classif = [
    LogisticRegression(max_iter=5000, random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
    # hidden_layer_sizes=[] configures the MLP with no hidden layer.
    MLPClassifier(max_iter=5000, random_state=0, hidden_layer_sizes=[]),
]
# Disabled alternatives, kept for reference:
#   DecisionTreeClassifier(random_state=0)
#   KNeighborsClassifier(n_neighbors=3)
#   AdaBoostClassifier(n_estimators=100, random_state=0)
#   GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)

In [12]:
# Evaluate every classifier in `list_classif` on (x, y), then display the
# summary that avg_res builds from the collected results.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.599269,0.604499,2.343957
MLPClassifier,0.594136,0.602245,3.316446
RandomForestClassifier,0.626486,0.631662,2.865319


In [13]:
# Standard deviation of each metric per classifier, over the rows of `res`.
# The metric columns are selected *before* aggregating so that any other
# (possibly non-numeric) column in `res` can neither break .std() on
# pandas >= 2.0 nor be aggregated needlessly. The table is unchanged.
res.groupby('classifier')[['f1-score', 'accuracy', 'time']].std()

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.067087,0.081106,1.074962
MLPClassifier,0.070685,0.076854,0.931438
RandomForestClassifier,0.106207,0.088714,1.233626


### binary arousal

In [14]:
# Target for this section: the 'binary-arousal' column. Show the class counts.
y = labels.loc[:, 'binary-arousal']
y.value_counts()

1.0    444
0.0    156
Name: binary-arousal, dtype: int64

In [15]:
# Feature-selection strategy and number of splits passed to make_nclassif.
feature_selector = "L1"
n_splits = 8

# Classifiers evaluated in this section.
list_classif = [
    LogisticRegression(max_iter=5000, random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
    # hidden_layer_sizes=[] configures the MLP with no hidden layer.
    MLPClassifier(max_iter=5000, random_state=0, hidden_layer_sizes=[]),
]
# Disabled alternatives, kept for reference:
#   DecisionTreeClassifier(random_state=0)
#   KNeighborsClassifier(n_neighbors=3)
#   AdaBoostClassifier(n_estimators=100, random_state=0)
#   GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)

In [16]:
# Evaluate every classifier in `list_classif` on (x, y), then display the
# summary that avg_res builds from the collected results.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.661152,0.532121,0.889271
MLPClassifier,0.670096,0.542621,1.559993
RandomForestClassifier,0.635788,0.506415,1.194343


In [17]:
# Standard deviation of each metric per classifier, over the rows of `res`.
# The metric columns are selected *before* aggregating so that any other
# (possibly non-numeric) column in `res` can neither break .std() on
# pandas >= 2.0 nor be aggregated needlessly. The table is unchanged.
res.groupby('classifier')[['f1-score', 'accuracy', 'time']].std()

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.103286,0.054883,0.114253
MLPClassifier,0.106482,0.05333,0.215503
RandomForestClassifier,0.134222,0.048105,0.077324


### binary valence

In [18]:
# Target for this section: the 'binary-valence' column. Show the class counts.
y = labels.loc[:, 'binary-valence']
y.value_counts()

0.0    305
1.0    295
Name: binary-valence, dtype: int64

In [19]:
# Feature-selection strategy and number of splits passed to make_nclassif.
feature_selector = "L1"
n_splits = 8

# Classifiers evaluated in this section.
list_classif = [
    LogisticRegression(max_iter=5000, random_state=0),
    RandomForestClassifier(max_depth=5, random_state=0),
    # hidden_layer_sizes=[] configures the MLP with no hidden layer.
    MLPClassifier(max_iter=5000, random_state=0, hidden_layer_sizes=[]),
]
# Disabled alternatives, kept for reference:
#   DecisionTreeClassifier(random_state=0)
#   KNeighborsClassifier(n_neighbors=3)
#   AdaBoostClassifier(n_estimators=100, random_state=0)
#   GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)

In [20]:
# Evaluate every classifier in `list_classif` on (x, y), then display the
# summary that avg_res builds from the collected results.
res, conf = make_nclassif(
    x,
    y,
    n_splits=n_splits,
    feature_selector=feature_selector,
    list_classifiers=list_classif,
)
avg_res(res)

Split  1/ 8
Split  2/ 8
Split  3/ 8
Split  4/ 8
Split  5/ 8
Split  6/ 8
Split  7/ 8
Split  8/ 8


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.4951,0.510027,0.97007
MLPClassifier,0.540867,0.555786,1.5496
RandomForestClassifier,0.501605,0.524724,1.416051


In [21]:
# Standard deviation of each metric per classifier, over the rows of `res`.
# The metric columns are selected *before* aggregating so that any other
# (possibly non-numeric) column in `res` can neither break .std() on
# pandas >= 2.0 nor be aggregated needlessly. The table is unchanged.
res.groupby('classifier')[['f1-score', 'accuracy', 'time']].std()

Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LogisticRegression,0.081312,0.08131,0.071236
MLPClassifier,0.075645,0.067325,0.413847
RandomForestClassifier,0.063344,0.044015,0.179383
