In [1]:
import copy
import numpy as np
import os
import pandas as pd
import pickle

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [3]:
study_keys = pickle.load(open('../data/study_keys.pkl', 'rb'))

In [4]:
df = pd.read_csv('../data/data_to_model.csv', index_col='Unnamed: 0')
df

Unnamed: 0,Participant_ID,Group,Age,Gender,Nationality,Native_Language,Education,Writing_Proficiency,Daily_Email_Frequency,BFI_Agreeableness,...,NASA_Frustration,Treatment,Task,PP_QC,EDA_QC,BR_QC,Chest_HR_QC,Wrist_HR_QC,RR_QC,Is_Stressed
0,T003,CH,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,...,5.0,RB,Not Applicable,0.003544,0.311345,12.2,83.0,85.78,705.0,1
1,T003,CH,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,...,5.0,RB,Not Applicable,0.003543,0.310063,12.8,82.0,85.80,731.0,1
2,T003,CH,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,...,5.0,RB,Not Applicable,0.003541,0.309743,12.8,81.0,85.83,704.0,1
3,T003,CH,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,...,5.0,RB,Not Applicable,0.003539,0.309743,13.3,79.0,85.83,719.0,1
4,T003,CH,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,...,5.0,RB,Not Applicable,0.003537,0.309743,13.3,82.0,85.83,733.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149090,T176,BL,18.0,2,1.0,1.0,2.0,7.0,7.0,39.0,...,2.0,PR,Not Applicable,0.007014,0.145388,17.0,91.0,111.60,653.0,0
149091,T176,BL,18.0,2,1.0,1.0,2.0,7.0,7.0,39.0,...,2.0,PR,Not Applicable,0.007089,0.147629,17.0,91.0,112.38,637.0,0
149092,T176,BL,18.0,2,1.0,1.0,2.0,7.0,7.0,39.0,...,2.0,PR,Not Applicable,0.007165,0.147309,17.0,90.0,113.22,639.0,0
149093,T176,BL,18.0,2,1.0,1.0,2.0,7.0,7.0,39.0,...,2.0,PR,Not Applicable,0.007251,0.147629,17.0,86.0,114.07,1233.0,0


In [8]:
df.isnull().sum()

Participant_ID                0
Group                         0
Age                           0
Gender                        0
Nationality                   0
Native_Language               0
Education                     0
Writing_Proficiency           0
Daily_Email_Frequency         0
BFI_Agreeableness             0
BFI_Conscientiousness         0
BFI_Extraversion              0
BFI_Neuroticism               0
BFI_Openness                  0
ERQ_Cognitive_Reappraisal     0
ERQ_Expressive_Suppression    0
Perceived_Stress_Scale        0
NASA_Mental_Demand            0
NASA_Physical_Demand          0
NASA_Temporal_Demand          0
NASA_Performance              0
NASA_Effort                   0
NASA_Frustration              0
Treatment                     0
Task                          0
PP_QC                         0
EDA_QC                        0
BR_QC                         0
Chest_HR_QC                   0
Wrist_HR_QC                   0
RR_QC                         0
Is_Stres

In [21]:
df_features = df[df.columns[1:]].join(pd.get_dummies(df[['Treatment', 'Task', 'Group']])).drop(columns=['Treatment', 'Task', 'Group'])
features = [ft for ft in df_features.columns if ft != 'Is_Stressed']

In [22]:
df_features.shape

(37412, 40)

In [23]:
df_features

Unnamed: 0,Age,Gender,Nationality,Native_Language,Education,Writing_Proficiency,Daily_Email_Frequency,BFI_Agreeableness,BFI_Conscientiousness,BFI_Extraversion,...,Treatment_PR,Treatment_RB,Treatment_ST,Task_Email,Task_Not Applicable,Task_Report,Group_BH,Group_BL,Group_CH,Group_CL
0,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,38.0,37.0,...,0,1,0,0,1,0,0,0,1,0
1,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,38.0,37.0,...,0,1,0,0,1,0,0,0,1,0
2,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,38.0,37.0,...,0,1,0,0,1,0,0,0,1,0
3,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,38.0,37.0,...,0,1,0,0,1,0,0,0,1,0
4,23.0,2,1.0,1.0,2.0,7.0,6.0,40.0,38.0,37.0,...,0,1,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149090,18.0,2,1.0,1.0,2.0,7.0,7.0,39.0,35.0,37.0,...,1,0,0,0,1,0,0,1,0,0
149091,18.0,2,1.0,1.0,2.0,7.0,7.0,39.0,35.0,37.0,...,1,0,0,0,1,0,0,1,0,0
149092,18.0,2,1.0,1.0,2.0,7.0,7.0,39.0,35.0,37.0,...,1,0,0,0,1,0,0,1,0,0
149093,18.0,2,1.0,1.0,2.0,7.0,7.0,39.0,35.0,37.0,...,1,0,0,0,1,0,0,1,0,0


In [24]:
weights = dict(df_features.Is_Stressed.value_counts())
weights

{0: 20311, 1: 17101}

In [25]:
df_features.shape

(37412, 40)

In [56]:
X = df_features[features]
y = df_features['Is_Stressed']


X_train, X_test, label_train, label_test = train_test_split(X, y, test_size=0.2, random_state=1)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)

In [60]:
logmodel = LogisticRegression(penalty='none')
logmodel.fit(X_train_scaled, label_train)
print(np.mean(cross_val_score(logmodel, X_test_scaled, label_test)))

1.0


In [61]:
for i in list(zip(X.columns, logmodel.coef_[0])):
    print(i)

('Age', 1.5034019506264817)
('Gender', -0.9933421184156981)
('Nationality', 0.8035514119354253)
('Native_Language', 0.22719623912753611)
('Education', 0.8375592913922028)
('Writing_Proficiency', 0.41440078528864344)
('Daily_Email_Frequency', 1.7723088346635394)
('BFI_Agreeableness', 1.1571504049282915)
('BFI_Conscientiousness', 0.8259524339255906)
('BFI_Extraversion', 1.2765577006348834)
('BFI_Neuroticism', 0.8995824188968236)
('BFI_Openness', -1.3907531797904167)
('ERQ_Cognitive_Reappraisal', 0.14402053490125646)
('ERQ_Expressive_Suppression', 1.3267343166640104)
('Perceived_Stress_Scale', 0.25111861711002914)
('NASA_Mental_Demand', -0.9779315845450728)
('NASA_Physical_Demand', 1.0801449169066983)
('NASA_Temporal_Demand', 0.9936570894992378)
('NASA_Performance', -0.32604756896561476)
('NASA_Effort', 1.63526905411064)
('NASA_Frustration', 0.9832298903096226)
('PP_QC', -0.7048973534915369)
('EDA_QC', 0.14049911563104905)
('BR_QC', 0.0711306556812988)
('Chest_HR_QC', 0.6319933803156792)


In [62]:
logmodel = LogisticRegression(penalty='none', class_weight=weights)
logmodel.fit(X_train_scaled, label_train)
print(np.mean(cross_val_score(logmodel, X_test_scaled, label_test)))

1.0


In [63]:
def remove_big_coef_features(dict_, threshold=4):
    big_k = []

    for k in dict_:
        if abs(dict_[k]) > threshold:
            big_k.append(k)

    for k in big_k:
        dict_.pop(k)
    
    return dict_

In [64]:
baseline = dict(zip(X.columns, logmodel.coef_[0]))
iter_1 = copy.deepcopy(baseline)

iter_1 = remove_big_coef_features(iter_1, 5)
ft = list(iter_1.keys())
ft

['Age',
 'Gender',
 'Nationality',
 'Native_Language',
 'Education',
 'Writing_Proficiency',
 'Daily_Email_Frequency',
 'BFI_Agreeableness',
 'BFI_Conscientiousness',
 'BFI_Extraversion',
 'BFI_Neuroticism',
 'BFI_Openness',
 'ERQ_Cognitive_Reappraisal',
 'ERQ_Expressive_Suppression',
 'Perceived_Stress_Scale',
 'NASA_Mental_Demand',
 'NASA_Physical_Demand',
 'NASA_Temporal_Demand',
 'NASA_Performance',
 'NASA_Effort',
 'NASA_Frustration',
 'PP_QC',
 'EDA_QC',
 'BR_QC',
 'Chest_HR_QC',
 'Wrist_HR_QC',
 'RR_QC',
 'Treatment_DT',
 'Treatment_PM',
 'Treatment_PR',
 'Treatment_RB',
 'Treatment_ST',
 'Task_Email',
 'Task_Not Applicable',
 'Task_Report']

In [65]:
# separate data into label/features
X = df_features[ft]
#y = df_features['Is_Stressed']


# drop features we won't use
#X.drop(columns=drop_fts, inplace=True)


# split data in train/test
X_train, X_test, label_train, label_test = train_test_split(X, y, test_size=0.2, random_state=1)



# standardize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)


# build, fit and calculate accuracy of logistic model
logmodel = LogisticRegression(penalty='none', class_weight=weights)
logmodel.fit(X_train_scaled, label_train)
print(np.mean(cross_val_score(logmodel, X_test_scaled, label_test)))

1.0


In [66]:
iter_2 = dict(zip(X.columns, logmodel.coef_[0]))
iter_2 = remove_big_coef_features(iter_2, 4)
ft = list(iter_2.keys())

In [67]:
# separate data into label/features
X = df_features[ft]
y = df_features['Is_Stressed']



# split data in train/test
X_train, X_test, label_train, label_test = train_test_split(X, y, test_size=0.2, random_state=1)
train_subset = {'X': X_train, 'y': label_train}
test_subset = {'X': X_test, 'y': label_test}



# standardize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)



# build, fit and calculate accuracy of logistic model
logmodel = LogisticRegression(penalty='none', class_weight=weights)
logmodel.fit(X_train_scaled, label_train)
print(np.mean(cross_val_score(logmodel, X_test_scaled, label_test)))

0.7134832945748896


In [69]:
random = RandomForestClassifier()
random.fit(**train_subset)
print(np.mean(cross_val_score(random, **test_subset)))

1.0


In [70]:
estimators = list(range(1, 100))

In [71]:
for i in estimators:
    print(i, "estimators")
    random = RandomForestClassifier(n_estimators=i)
    random.fit(**train_subset)
    accuracy = np.mean(cross_val_score(random, **test_subset))
    print(accuracy)
    if accuracy == 1.0:
        break

1 estimators
0.9879722903918353
2 estimators
0.9886424363879274
3 estimators
0.9975946545497413
4 estimators
0.9987974165800406
5 estimators
0.9985299475957262
6 estimators
0.9989310171144428
7 estimators
1.0


In [72]:
random = RandomForestClassifier(n_estimators=1)
random.fit(**train_subset)
np.mean(cross_val_score(random, **test_subset))

0.9915815588396043

In [75]:
random_fts = list(zip(X.columns, random.feature_importances_))
sorted(random_fts, key=lambda ft: ft[1], reverse=True)

[('Perceived_Stress_Scale', 0.49725794812150226),
 ('RR_QC', 0.1326854776909048),
 ('EDA_QC', 0.10551757051629308),
 ('ERQ_Cognitive_Reappraisal', 0.10042202116393882),
 ('NASA_Performance', 0.08726840840126608),
 ('BR_QC', 0.021236954191630197),
 ('Task_Not Applicable', 0.014317241437538163),
 ('Task_Email', 0.010369251895022445),
 ('Wrist_HR_QC', 0.008675375584998837),
 ('Treatment_ST', 0.008654096957896576),
 ('Treatment_PR', 0.0059814649475019984),
 ('Treatment_PM', 0.0035085849449023538),
 ('Treatment_RB', 0.002725297532397673),
 ('Task_Report', 0.0012935071694915757),
 ('Treatment_DT', 8.679944471504455e-05)]

In [76]:
X.columns

Index(['ERQ_Cognitive_Reappraisal', 'Perceived_Stress_Scale',
       'NASA_Performance', 'EDA_QC', 'BR_QC', 'Wrist_HR_QC', 'RR_QC',
       'Treatment_DT', 'Treatment_PM', 'Treatment_PR', 'Treatment_RB',
       'Treatment_ST', 'Task_Email', 'Task_Not Applicable', 'Task_Report'],
      dtype='object')

In [77]:
ft

['ERQ_Cognitive_Reappraisal',
 'Perceived_Stress_Scale',
 'NASA_Performance',
 'EDA_QC',
 'BR_QC',
 'Wrist_HR_QC',
 'RR_QC',
 'Treatment_DT',
 'Treatment_PM',
 'Treatment_PR',
 'Treatment_RB',
 'Treatment_ST',
 'Task_Email',
 'Task_Not Applicable',
 'Task_Report']

In [78]:
pickle.dump(ft, open('../data/model_features.pkl', 'wb'))