In [2]:
#import libraries
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler
import json,codecs
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,multilabel_confusion_matrix, confusion_matrix,classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

import xgboost
from xgboost import XGBClassifier
import xlsxwriter
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
import warnings
warnings.filterwarnings('ignore')

In [14]:
#import data
df = pd.read_csv('em_feat_1020.csv').dropna()


In [15]:
#separate biosignals from emotion features
bio_df = df[['eda_mean', 'eda_min', 'eda_max', 'eda_std', 'eda_kurtosis', 'eda_skew',
       'eda_num_peaks', 'eda_amphitude', 'eda_duration', 'hr_mean', 'hr_min',
       'hr_max', 'hr_std', 'hr_rms', 'hr_num_peaks', 'hr_amphitude',
       'hr_duration', 'temp_mean', 'temp_min', 'temp_max', 'temp_mtd',
       'stress', 'user']]
emo_df = df[['Angry', 'Disgust', 'Scared', 'Happy', 'Sad',
             'Surprised', 'Neutral','stress', 'user']]
user_list=df.user.unique()
print(emo_df.shape)

(18756, 9)


In [16]:
#save the results into a single excel file
filename = 'results_1020_over.xlsx'
writer = pd.ExcelWriter(filename,engine='xlsxwriter')

pd.DataFrame(user_list,columns=['user_list']).to_excel(writer,sheet_name = "user_list", index =False)  
writer.save()
a = {'metric':[],'user':[],'score':[]}
with open("confusion.json", "w") as outfile:
        json.dump(a, outfile)
predfilename = 'pred'+filename[8:12]+'.xlsx'
user_list = emo_df.user.unique()
with pd.ExcelWriter(predfilename, engine='openpyxl', mode='w') as writer:
        pd.DataFrame(user_list).to_excel(writer, sheet_name = 'user_list',index=False)


In [17]:
def results2excel(metric,rf_score,dt_score,et_score,xgb_score,user_list,sheet_name):
    

    rf= pd.DataFrame(rf_score)
    dt= pd.DataFrame(dt_score)
    et= pd.DataFrame(et_score)
    xg= pd.DataFrame(xgb_score)
    user_list = pd.DataFrame(user_list)

    results = pd.DataFrame()
    results = pd.concat([user_list,rf,dt,et,xg],axis=1)
    results.columns = ['user','random forest','decision tree','extra trees','xgboost']
    #with pd.ExcelWriter('results.xlsx',mode='a') as writer:  
    #    results.to_excel(writer, sheet_name='sheet_name')  
    with pd.ExcelWriter(filename, engine='openpyxl', mode='a') as writer:
        results.to_excel(writer, sheet_name = sheet_name+'_'+metric,index=False)

        writer.save()


In [18]:
def conf_mtrx2json(metric,score,user,sheetname):
    a = {'metric':[],'user':[],'score':[]}
    a['metric'].append(metric)
    a['user'].append(user)
    a['score'].append(score.tolist())
    print(a)
    #if (user == 'ST') and (sheetname == 'Pearsons1') and (metric == 'xgb'):
    #with open("confusion.json", "a") as outfile:
    #    json.dump(a, outfile)

    #with open("confusion.json", mode='a', encoding='utf-8') as feedsjson:
    
    json.dump(a, codecs.open('confusion.json', 'a', encoding='utf-8'), 
          separators=(',', ':'), 
          sort_keys=True, 
          indent=1)

In [19]:
#machine learning models
def machine_learning(dataframe, sheet_name,strategy):

    xgb_accuracy, et_accuracy, dt_accuracy, rf_accuracy = [], [], [], []
    xgb_precision, et_precision, dt_precision, rf_precision = [], [], [], []
    xgb_recall, et_recall, dt_recall, rf_recall = [], [], [], []
    xgb_f1score, et_f1score, dt_f1score, rf_f1score = [], [], [], []
    user_list = []
    rf_conf, et_conf, dt_conf, xgb_conf = [],[],[],[]
    for user in dataframe.user.unique():

        user_list.append(user)

        
        train_set = dataframe[dataframe['user'] != user]
        over = RandomOverSampler(sampling_strategy=strategy,random_state=42)
        #su = SMOTE(random_state=42,sampling_strategy=strategy)
        #X_train, y_train = over.fit_resample(train_set.drop(columns= ['user','stress']), train_set['stress'])
        #under = RandomUnderSampler(sampling_strategy=strategy)
        X_train, y_train = over.fit_resample(train_set.drop(columns= ['user','stress']),train_set['stress'])
        
        test_set = dataframe[dataframe['user'] == user]
        #print(test_set.stress.value_counts())
        #print(np.unique(y_train,return_counts = True))
        rf = RandomForestClassifier(n_estimators = 100, max_depth=7, min_samples_leaf=5,random_state = 123)
        rf.fit(X_train, y_train)
        predictions = rf.predict(test_set.drop(columns=['user','stress']))
        #print(multilabel_confusion_matrix(predictions,test_set['stress']))
        #print(accuracy_score(predictions,test_set['stress']))
        #print(np.unique(predictions,return_counts=True))
        rf_accuracy.append(accuracy_score(predictions,test_set['stress']))
        
        rf_precision.append(precision_score(predictions,test_set['stress'],average = 'macro'))
        rf_recall.append(recall_score(predictions,test_set['stress'],average = 'macro'))
        rf_f1score.append(f1_score(predictions,test_set['stress'],average = 'macro'))
        rf_conf = multilabel_confusion_matrix(predictions,test_set['stress'])
        print(classification_report(predictions,test_set['stress']))
        conf_mtrx2json('rf_conf',rf_conf,user,sheet_name)

        et = ExtraTreesClassifier(random_state=123, max_depth=7,min_samples_leaf=5)
        et.fit(X_train, y_train)
        predictions = et.predict(test_set.drop(columns=['user','stress']))
        et_accuracy.append(accuracy_score(predictions,test_set['stress']))
        et_precision.append(precision_score(predictions,test_set['stress'],average = 'weighted'))
        et_recall.append(recall_score(predictions,test_set['stress'],average = 'weighted'))
        et_f1score.append(f1_score(predictions,test_set['stress'],average = 'weighted'))
        et_conf = multilabel_confusion_matrix(predictions,test_set['stress'])
        #print(classification_report(predictions,test_set['stress']))
        conf_mtrx2json('et_conf',et_conf,user,sheet_name)

        dt =DecisionTreeClassifier(random_state=123, max_depth=7,min_samples_leaf=5 )
        dt.fit(X_train, y_train)
        predictions = dt.predict(test_set.drop(columns=['user','stress']))
        dt_accuracy.append(accuracy_score(predictions,test_set['stress']))
        dt_precision.append(precision_score(predictions,test_set['stress'],average = 'weighted'))
        dt_recall.append(recall_score(predictions,test_set['stress'],average = 'weighted'))
        dt_f1score.append(f1_score(predictions,test_set['stress'],average = 'weighted'))
        dt_conf = multilabel_confusion_matrix(predictions,test_set['stress'])
        #print(classification_report(predictions,test_set['stress']))
        conf_mtrx2json('dt_conf',dt_conf,user,sheet_name)

        xgb = XGBClassifier()
        xgb.fit(X_train, y_train)
        predictions = xgb.predict(test_set.drop(columns=['user','stress']))
        xgb_accuracy.append(accuracy_score(predictions,test_set['stress']))
        xgb_precision.append(precision_score(predictions,test_set['stress'],average = 'weighted'))
        xgb_recall.append(recall_score(predictions,test_set['stress'],average = 'weighted'))
        xgb_f1score.append(f1_score(predictions,test_set['stress'],average = 'weighted'))
        xgb_conf = multilabel_confusion_matrix(predictions,test_set['stress'])
        #print(classification_report(predictions,test_set['stress']))
        conf_mtrx2json('xgb_conf',xgb_conf,user,sheet_name)

    results2excel('acc',rf_accuracy,dt_accuracy,et_accuracy,xgb_accuracy,user_list,sheet_name) 
    results2excel('pre',rf_precision,dt_precision,et_precision,xgb_precision,user_list,sheet_name) 
    results2excel('rec',rf_recall,dt_recall,et_recall,xgb_recall,user_list,sheet_name)
    results2excel('f1score',rf_f1score,dt_f1score,et_f1score,xgb_f1score,user_list,sheet_name)

In [20]:
#PCA function
def Pca(dataset,target):
    pcas = PCA(n_components=3)
    principalComponents = pcas.fit_transform(dataset)
    ppal_df = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2','PC3'])
    final_df = pd.concat([ppal_df, target], axis = 1)
    return final_df

In [21]:
#3D plot function
def plot3d(dataset, image_size,title):
    fig = plt.figure(figsize=image_size)
    ax = Axes3D(fig)
    ax.scatter(dataset['PC1'], dataset['PC2'], dataset['PC3'], c=dataset['stress'])
        
    # make simple, bare axis lines through space:
    
    # label the axes
    ax.axes.set_xlim3d(left   = dataset['PC1'].min(), right = dataset['PC1'].max())
    ax.axes.set_ylim3d(bottom = dataset['PC2'].min(), top   = dataset['PC2'].max())
    ax.axes.set_ylim3d(bottom = dataset['PC3'].min(), top   = dataset['PC3'].max())
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    ax.set_title(f"PCA on the {title} data set")
    plt.show()

In [22]:
strategy = {0:14000, 1:9000, 2:9000}

machine_learning(emo_df,'Emotions',strategy)

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      1502
           1       0.00      0.00      0.00        86
           2       0.00      0.01      0.01       110

    accuracy                           0.76      1698
   macro avg       0.29      0.29      0.29      1698
weighted avg       0.77      0.76      0.76      1698

{'metric': ['rf_conf'], 'user': ['HT'], 'score': [[[[1, 195], [211, 1291]], [[1612, 0], [86, 0]], [[1377, 211], [109, 1]]]]}
              precision    recall  f1-score   support

           0       0.99      0.88      0.93      1683
           2       0.01      0.13      0.02        15

    accuracy                           0.87      1698
   macro avg       0.50      0.50      0.47      1698
weighted avg       0.98      0.87      0.92      1698

{'metric': ['et_conf'], 'user': ['HT'], 'score': [[[[2, 13], [210, 1473]], [[1473, 210], [13, 2]]]]}
              precision    recall  f1-score   support

           

In [23]:
machine_learning(bio_df,'biometrics',strategy)

              precision    recall  f1-score   support

           0       1.00      0.88      0.93      1697
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         0

    accuracy                           0.87      1698
   macro avg       0.33      0.29      0.31      1698
weighted avg       1.00      0.87      0.93      1698

{'metric': ['rf_conf'], 'user': ['HT'], 'score': [[[[0, 1], [212, 1485]], [[1697, 0], [1, 0]], [[1486, 212], [0, 0]]]]}
              precision    recall  f1-score   support

           0       1.00      0.88      0.93      1698
           2       0.00      0.00      0.00         0

    accuracy                           0.88      1698
   macro avg       0.50      0.44      0.47      1698
weighted avg       1.00      0.88      0.93      1698

{'metric': ['et_conf'], 'user': ['HT'], 'score': [[[[0, 0], [212, 1486]], [[1486, 212], [0, 0]]]]}
              precision    recall  f1-score   support

           0      

In [None]:
machine_learning(df,'full-set',strategy)


In [None]:
from scipy.stats import pearsonr,spearmanr

ndf = df.drop(columns='user')
pvals = pd.DataFrame([pearsonr(ndf[c], ndf['stress'])[1] for c in ndf.columns],
                     index=ndf.columns)
pvals

In [None]:
new_df = df[[ 'Disgust','Scared','Surprised', 
       'eda_kurtosis','eda_skew', 'hr_std', 'hr_rms', 'hr_num_peaks', 'hr_amphitude',
       'hr_duration', 'stress', 'user']] 

In [None]:
machine_learning(new_df,'Pearsons1',strategy)

In [None]:
pca_bio = Pca(bio_df.drop(columns=['user','stress']),bio_df[['user','stress']])
plot3d(pca_bio,(5,5),'bio')
pca_emo = Pca(emo_df.drop(columns=['user','stress']),emo_df[['user','stress']])
plot3d(pca_emo,(5,5),'emotion')
full_pca = Pca(df.drop(columns=['user','stress']),df[['user','stress']])
plot3d(full_pca,(5,5),'whole')

In [None]:
strategy = {0:14029, 1:7000, 2:7000}
machine_learning(pca_bio.fillna(0),'PCA_bio',strategy)
machine_learning(pca_emo.fillna(0),'PCA_emo',strategy)
machine_learning(full_pca.fillna(0),'PCA_full',strategy)

In [None]:
strategy = {0:36000, 1:36000, 2:36000}
print(pd.concat([pca_bio.drop(columns=['user','stress']),pca_emo]).stress.value_counts())
machine_learning(pd.concat([pca_bio.drop(columns=['user','stress']),pca_emo]).fillna(0),'PCA_bio&PCA_emo',strategy)
strategy = {0:36000, 1:7000, 2:7000}
machine_learning(pd.concat([pca_bio.drop(columns=['user','stress']),emo_df]).fillna(0),'PCA_bio&emo',strategy)


In [None]:
strategy = {0:35034, 1:7000, 2:7000}
machine_learning(pd.concat([pca_emo.drop(columns=['user','stress']),bio_df]).fillna(0),'bio&PCA_emo',strategy)

In [None]:
strategy = {0:12891, 1:12891, 2:12891}
user = 'JT'
dataframe = emo_df
train_set = dataframe[dataframe['user'] != user]
over = RandomOverSampler(sampling_strategy=strategy,random_state=42)
su = SMOTE(random_state=42,sampling_strategy=strategy)
#X_train, y_train = over.fit_resample(train_set.drop(columns= ['user','stress']), train_set['stress'])
#under = RandomUnderSampler(sampling_strategy=strategy)
X_train, y_train = su.fit_resample(train_set.drop(columns= ['user','stress']),train_set['stress'])

test_set = dataframe[dataframe['user'] == user]
print(test_set.stress.value_counts())
print(np.unique(y_train,return_counts = True))
rf = RandomForestClassifier(n_estimators = 100, max_depth=5, min_samples_leaf=5,random_state = 123)
rf.fit(X_train, y_train)
predictions = rf.predict(test_set.drop(columns=['user','stress']))
print(multilabel_confusion_matrix(predictions,test_set['stress']))
print('accuracy_score',accuracy_score(predictions,test_set['stress']))
print('predictions',np.unique(predictions,return_counts=True))
print(confusion_matrix(predictions,test_set['stress']))
print('recall_score',recall_score(predictions,test_set['stress']),average = 'macro')
print('f1_score',f1_score(predictions,test_set['stress']),average = 'macro')
print('precision_score',precision_score(predictions,test_set['stress']),average = 'macro')