# Classification of Stress from Audio data

In [1]:
import numpy as np
import scipy.io
import scipy.stats as stats
import matplotlib.pyplot as plt
import pandas as pd
import random
import time 

import os
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, LeaveOneOut, GroupKFold
from sklearn.svm import LinearSVC, SVR, SVC
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.metrics import balanced_accuracy_score, f1_score

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler

from sklearn.utils import shuffle

from make_classification import *

#### Import labels

In [2]:
# Read the label table, indexed by the first CSV column (subject/task),
# then discard every row that contains a missing value.
labels_csv = '../../Dataset/labels.csv'
labels = pd.read_csv(labels_csv, index_col=0, header=0, sep=",")
labels = labels.dropna()
labels.head()

Unnamed: 0_level_0,binary-stress,affect3-class
subject/task,Unnamed: 1_level_1,Unnamed: 2_level_1
2ea4_Breathing,0,0
2ea4_Counting1,1,2
2ea4_Counting2,1,2
2ea4_Counting3,1,2
2ea4_Math,1,2


# Classical approaches 

Several models are tested: Random Forests, K-nearest neighbors, SVM, and Multi-Layer Perceptron. All models are fitted 10 times on random splits, and the average scores over the 10 repetitions are reported.

The dimensionality of the feature matrix can be reduced using PCA or feature selection. Feature selection is performed using an L1 penalty or Recursive Feature Elimination (RFE). The optimal number of features is determined using RFECV.

#### Import and prepare dataset

We use handcrafted features for the classical approaches. The two available feature sets are: classical acoustic features, and the mean and standard deviation of Wav2Vec features computed for each task.

In [3]:
# Load the handcrafted features; the file has no header row and its first
# column is used as the row index.
x = pd.read_csv('../Feature Extraction/Features/HCfeatures.csv', sep=",", header=None, index_col=0)
# Keep only the part of each row label that precedes the first '.', so the
# labels can be matched against the index of `labels`.
# Assigning to `.index` replaces `set_axis(..., inplace=True)`: the `inplace`
# argument was deprecated in pandas 1.5 and removed in pandas 2.0.
x.index = [i.split('.')[0] for i in list(x.index)]
x

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,131,132,133,134,135,136,137,138,139,140
chdf_Counting2,-280.338013,124.138573,3.840094,15.627791,-4.523294,-15.956475,-15.634309,-10.193167,-9.976574,-19.588152,...,0.040699,0.046211,0.057499,0.028933,0.057475,0.070104,0.053073,0.041000,0.072966,0.068477
u3v9_Stroop,-332.180664,82.551025,36.539909,13.298847,9.375514,6.349435,0.126900,4.142829,-0.517474,1.666138,...,0.056451,0.054041,0.059267,0.035263,0.040047,0.047146,0.040196,0.061043,0.058365,0.062228
x1q3_Stroop,-362.426636,132.324814,7.044482,4.615066,-15.601804,-8.803574,-15.455957,-13.802106,-14.303585,-23.968596,...,0.060399,0.073943,0.086178,0.039574,0.056021,0.053250,0.062675,0.063904,0.061947,0.041878
cxj0_Counting2,-212.131577,80.600708,-21.619318,-13.967280,-25.350821,-22.545033,-17.179089,-13.269796,-11.334437,-11.906796,...,0.079905,0.060541,0.070319,0.057833,0.062822,0.081051,0.076904,0.059277,0.080670,0.076766
d4n6_Reading,-368.372650,107.842270,28.294743,1.705698,-12.907864,-5.071440,-39.436275,-29.513298,-7.421915,-13.819040,...,0.061729,0.063967,0.068888,0.063583,0.069102,0.071830,0.051946,0.074013,0.066823,0.069891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
j9h8_Math,-318.902069,59.248405,29.819403,16.495792,14.034431,-1.386578,1.671305,2.522140,0.464210,1.041862,...,0.121528,0.090555,0.092629,0.062740,0.078983,0.100332,0.071423,0.080527,0.079014,0.077805
kycf_Counting2,-274.979279,94.682816,8.230233,5.364861,-7.384567,-17.215910,-21.922163,-11.229294,-15.574428,-14.748496,...,0.059797,0.057618,0.047983,0.038825,0.047335,0.047993,0.063589,0.057100,0.050366,0.054505
k67g_Counting1,-353.467926,138.669968,21.895538,14.345483,-5.692627,-3.310135,-11.725992,-25.843159,-15.291170,-17.531143,...,0.061554,0.061031,0.059175,0.056096,0.102160,0.063463,0.058458,0.042741,0.092567,0.105108
cxj0_Math,-253.067215,85.056641,-11.448489,-15.564773,-25.164740,-12.583304,-10.089787,-8.028225,-10.927115,-7.694977,...,0.058189,0.053052,0.048247,0.052107,0.047545,0.063636,0.064138,0.062151,0.069617,0.074496


In [4]:
# Restrict both tables to the rows whose index appears in x and in labels
# (inner merge on the index), so features and labels share the same rows.
common_rows = x.merge(labels, left_index=True, right_index=True)
idx = list(common_rows.index)
x = x.loc[idx]
labels = labels.loc[idx]

### Classification of binary stress

In [5]:
# Target for this section: the binary stress label. Show the class counts.
target_column = 'binary-stress'
y = labels[target_column]
y.value_counts()

1    263
0    115
Name: binary-stress, dtype: int64

In [6]:
# Experiment configuration consumed by the classification runs below.
n_splits = 10
feature_selector = None  # 'PCA', 'RFE', 'L1' or None

# Candidate classifiers, each built with a fixed seed where the estimator takes one.
random_forest = RandomForestClassifier(max_depth=5, random_state=0)
nearest_neighbors = KNeighborsClassifier(n_neighbors=3)
rbf_svm = SVC(gamma='auto', kernel='rbf', random_state=0)
# NOTE(review): hidden_layer_sizes=[] presumably yields a network with no hidden
# layer -- confirm this is intended.
perceptron = MLPClassifier(max_iter=5000, random_state=0, hidden_layer_sizes=[])

list_classif = [random_forest, nearest_neighbors, rbf_svm, perceptron]

In [7]:
# Run every configured classifier on the binary target with the resampling
# variant of the helper, then display the averaged results.
run_options = {
    'n_splits': n_splits,
    'feature_selector': feature_selector,
    'list_classifiers': list_classif,
}
res, conf = make_nclassif_random_splits_resample(x, y, **run_options)
avg_res(res)

Split  1/10
Split  2/10
Split  3/10
Split  4/10
Split  5/10
Split  6/10
Split  7/10
Split  8/10
Split  9/10
Split 10/10


Unnamed: 0_level_0,f1-score,accuracy,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNeighborsClassifier,0.597635,0.624336,7.037865
MLPClassifier,0.644288,0.610884,8.607246
RandomForestClassifier,0.655604,0.58922,6.661447
SVC,0.679749,0.619802,6.03907


In [None]:
# Spread of each metric across the random splits, per classifier.
# The split count is taken from `n_splits` so the message cannot drift from the run.
print(f'Standard Deviations over {n_splits} splits:')
# Select the metric columns before aggregating, so std() is computed only on
# them and cannot fail on any non-numeric column of `res`.
res.groupby(['classifier'])[['f1-score', 'accuracy', 'time']].std()

In [None]:
#res.to_csv('Results/audio_stress_classif.csv', sep=",", index=True)

### Classification of 3-class stress

In [None]:
# Target for this section: the three-class affect label. Show the class counts.
target_column = 'affect3-class'
y = labels[target_column]
y.value_counts()

In [None]:
# Run every configured classifier on the 3-class target (non-resampling helper),
# then display the averaged results.
run_options = {
    'n_splits': n_splits,
    'feature_selector': feature_selector,
    'list_classifiers': list_classif,
}
res, conf = make_nclassif_random_splits(x, y, **run_options)
avg_res(res)

In [None]:
# Spread of each metric across the random splits, per classifier.
# The split count is taken from `n_splits` so the message cannot drift from the run.
print(f'Standard Deviations over {n_splits} splits:')
# Select the metric columns before aggregating, so std() is computed only on
# them and cannot fail on any non-numeric column of `res`.
res.groupby(['classifier'])[['f1-score', 'accuracy', 'time']].std()

In [None]:
#res.to_csv('Results/audio_3stress_classif.csv', sep=",", index=True)

# Deep Learning approach
Wav2Vec (W2V) features are used directly (without aggregation) in the deep learning approaches. The W2V matrices are used as input for a Transformer network. The model is fitted 10 times on random splits, and the average scores over 10 repetitions are reported.

In [None]:
# Load the W2V features; the file has no header row and its first column is
# used as the row index.
x = pd.read_csv('../Feature Extraction/Features/W2Vfeatures.csv', sep=",", header=None, index_col=0)
# Keep only the part of each row label that precedes the first '.', so the
# labels can be matched against the index of `labels`.
# Assigning to `.index` replaces `set_axis(..., inplace=True)`: the `inplace`
# argument was deprecated in pandas 1.5 and removed in pandas 2.0.
x.index = [i.split('.')[0] for i in list(x.index)]

In [None]:
#### For W2V features
from ast import literal_eval

# Number of columns in the expanded feature table; they are labelled 1..N_W2V_FEATURES.
N_W2V_FEATURES = 512


def process_row(row):
    """Parse the literals stored in columns 1 and 2 of ``row`` and join them with ``+``.

    Not used below; kept as the alternative to ``process_row_quick``.
    """
    return literal_eval(row[1]) + literal_eval(row[2])


def process_row_quick(row):
    """Parse only the literal stored in column 1 of ``row``."""
    return literal_eval(row[1])


# Expand the stringified vector of every row into one column per component.
# Building the table in a single pass avoids the manual append loop (and no
# longer rebinds the notebook-level `idx` as a loop variable).
new_x = pd.DataFrame(
    data=[process_row_quick(row) for _, row in x.iterrows()],
    index=list(x.index),
    columns=range(1, N_W2V_FEATURES + 1),
)
x = new_x
x

In [None]:
# Restrict both tables to the rows whose index appears in x and in labels
# (inner merge on the index), so features and labels share the same rows.
common_rows = x.merge(labels, left_index=True, right_index=True)
idx = list(common_rows.index)
x = x.loc[idx]
labels = labels.loc[idx]

### Classification of binary stress

In [None]:
# Target for this section: the binary stress label. Show the class counts.
target_column = 'binary-stress'
y = labels[target_column]
y.value_counts()

In [None]:
# Configuration for the W2V runs: a single MLP classifier and the number of splits.
n_splits = 10
# NOTE(review): hidden_layer_sizes=[] presumably yields a network with no hidden
# layer -- confirm this is intended.
perceptron = MLPClassifier(max_iter=5000, random_state=0, hidden_layer_sizes=[])
list_classif = [perceptron]

In [None]:
# Run the configured classifier(s) on the binary target with the resampling
# variant of the helper (no feature selector is passed here), then display
# the averaged results.
run_options = {
    'n_splits': n_splits,
    'list_classifiers': list_classif,
}
res, conf = make_nclassif_random_splits_resample(x, y, **run_options)
avg_res(res)

In [None]:
# Spread of each metric across the random splits, per classifier.
# The split count is taken from `n_splits` so the message cannot drift from the run.
print(f'Standard Deviations over {n_splits} splits:')
# Select the metric columns before aggregating, so std() is computed only on
# them and cannot fail on any non-numeric column of `res`.
res.groupby(['classifier'])[['f1-score', 'accuracy', 'time']].std()

In [None]:
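# NOTE(review): this is the same path as the commented-out export in the classical
# binary-stress section; enabling both would overwrite that file -- consider a distinct name.
#res.to_csv('Results/audio_stress_classif.csv', sep=",", index=True)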

### Classification of 3-class stress

In [None]:
# Target for this section: the three-class affect label. Show the class counts.
target_column = 'affect3-class'
y = labels[target_column]
y.value_counts()

In [None]:
# Run the configured classifier(s) on the 3-class target (non-resampling helper,
# no feature selector passed), then display the averaged results.
run_options = {
    'n_splits': n_splits,
    'list_classifiers': list_classif,
}
res, conf = make_nclassif_random_splits(x, y, **run_options)
avg_res(res)

In [None]:
# Spread of each metric across the random splits, per classifier.
# The split count is taken from `n_splits` so the message cannot drift from the run.
print(f'Standard Deviations over {n_splits} splits:')
# Select the metric columns before aggregating, so std() is computed only on
# them and cannot fail on any non-numeric column of `res`.
res.groupby(['classifier'])[['f1-score', 'accuracy', 'time']].std()