In [None]:
! pip install kaggle --upgrade



In [3]:
! mkdir ~/.kaggle/

In [5]:
! cp kaggle.json ~/.kaggle/

In [6]:
! kaggle datasets download -d yamqwe/depression-anxiety-stress-scales

Downloading depression-anxiety-stress-scales.zip to /content
 63% 5.00M/7.93M [00:01<00:00, 4.35MB/s]
100% 7.93M/7.93M [00:01<00:00, 6.56MB/s]


In [7]:
! mkdir dataset

In [8]:
! unzip depression-anxiety-stress-scales.zip -d dataset

Archive:  depression-anxiety-stress-scales.zip
  inflating: dataset/DASS_data_21.02.19/codebook.txt  
  inflating: dataset/DASS_data_21.02.19/data.csv  
  inflating: dataset/DASS_data_21.02.19/demo1.png  


In [9]:
! ls dataset/

DASS_data_21.02.19


In [10]:
! ls dataset/DASS_data_21.02.19

codebook.txt  data.csv	demo1.png


In [11]:
import matplotlib.pyplot as plt
import numpy as np
import csv
import pandas as pd

In [12]:
np.random.seed(42)

df = pd.read_csv('dataset/DASS_data_21.02.19/data.csv', error_bad_lines=False, warn_bad_lines=False, sep=r'\t')
df = df.sample(frac = 1) # shuffle df rows

  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
df.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,Q4I,Q4E,Q5A,Q5I,Q5E,Q6A,Q6I,Q6E,Q7A,Q7I,Q7E,Q8A,Q8I,Q8E,Q9A,Q9I,Q9E,Q10A,Q10I,Q10E,Q11A,Q11I,Q11E,Q12A,Q12I,Q12E,Q13A,Q13I,Q13E,Q14A,...,TIPI2,TIPI3,TIPI4,TIPI5,TIPI6,TIPI7,TIPI8,TIPI9,TIPI10,VCL1,VCL2,VCL3,VCL4,VCL5,VCL6,VCL7,VCL8,VCL9,VCL10,VCL11,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,engnat,age,screensize,uniquenetworklocation,hand,religion,orientation,race,voted,married,familysize,major
21513,3,16,4447,2,40,4928,1,35,3848,1,9,3962,2,13,3691,2,19,5715,2,6,7017,2,21,7363,2,5,17032,2,27,3570,3,24,3362,2,7,4191,2,18,2450,3,...,6,6,7,7,5,7,3,2,5,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,3,2,2,2,19,1,2,1,10,1,10,2,1,3,political science
1796,2,25,40037,3,35,5262,2,29,10343,1,22,4624,3,32,6223,2,42,8265,3,3,9968,3,15,1515,4,11,11875,4,10,13266,2,7,14688,2,17,10797,3,24,1199,2,...,6,5,6,2,6,6,5,3,2,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,4,2,1,2,24,2,1,2,1,1,60,2,1,2,
21861,2,9,5901,4,20,14174,2,22,10254,1,12,4115,1,42,8455,1,32,6866,2,25,4533,2,30,3793,2,15,19898,1,40,4467,2,28,2179,3,2,21060,2,37,3279,2,...,6,4,5,5,7,6,5,6,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,2,2,19,1,1,1,10,1,10,2,1,2,
26571,3,26,5568,3,40,11086,3,6,6199,4,7,5730,4,28,23855,4,29,4265,4,10,4620,4,15,1907,4,35,6818,4,41,3198,4,21,2900,4,25,1293,4,23,1465,3,...,6,2,7,5,7,7,7,1,7,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,4,3,2,1,23,1,1,2,10,4,70,2,1,11,Accounting
28720,1,31,3259,1,24,2679,1,42,2641,1,37,2258,2,34,2652,2,7,2918,1,20,2757,1,35,1799,2,26,4225,1,2,6396,1,27,2527,2,15,3294,2,32,3046,2,...,5,3,6,5,6,5,6,6,6,1,1,0,1,1,0,0,0,0,1,0,0,0,1,1,1,3,1,2,2,25,1,1,1,10,1,10,2,1,3,Psychology


Guidelines as per [DASS](https://www.psytoolkit.org/survey-library/depression-anxiety-stress-dass.html):

Meaning  | Depression | Anxiety | Stress 
-------------------|------------------|-------------------|------------------
Normal       | 0-9     | 0-7     | 0-14     |
Mild       | 10-13     | 8-9     | 15-18     |
Moderate       | 14-20     | 10-14     | 19-25     |
Severe       | 21-27     | 15-19     | 26-33     |
Extremely Severe       | 28+     | 20+     | 34+     |

In [14]:
dep_cols = ['Q3A', 'Q5A', 'Q10A', 'Q13A', 'Q16A', 'Q17A', 'Q21A', 'Q24A',\
            'Q26A', 'Q31A', 'Q34A', 'Q37A', 'Q38A', 'Q42A']
anx_cols = ['Q2A', 'Q4A', 'Q7A', 'Q9A', 'Q15A', 'Q19A', 'Q20A', 'Q23A',\
            'Q25A', 'Q28A', 'Q30A', 'Q36A', 'Q40A', 'Q41A']
str_cols = ['Q1A', 'Q6A', 'Q8A', 'Q11A', 'Q12A', 'Q14A', 'Q18A', 'Q22A',\
            'Q27A', 'Q29A', 'Q32A', 'Q33A', 'Q35A', 'Q39A']

# source code for columns: https://www.kaggle.com/solvalou/dass-eda
df['Depression Sum'] = df[dep_cols].sum(axis=1)
df['Anxiety Sum'] = df[anx_cols].sum(axis=1)
df['Stress Sum'] = df[str_cols].sum(axis=1)

df.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,Q4I,Q4E,Q5A,Q5I,Q5E,Q6A,Q6I,Q6E,Q7A,Q7I,Q7E,Q8A,Q8I,Q8E,Q9A,Q9I,Q9E,Q10A,Q10I,Q10E,Q11A,Q11I,Q11E,Q12A,Q12I,Q12E,Q13A,Q13I,Q13E,Q14A,...,TIPI5,TIPI6,TIPI7,TIPI8,TIPI9,TIPI10,VCL1,VCL2,VCL3,VCL4,VCL5,VCL6,VCL7,VCL8,VCL9,VCL10,VCL11,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,engnat,age,screensize,uniquenetworklocation,hand,religion,orientation,race,voted,married,familysize,major,Depression Sum,Anxiety Sum,Stress Sum
21513,3,16,4447,2,40,4928,1,35,3848,1,9,3962,2,13,3691,2,19,5715,2,6,7017,2,21,7363,2,5,17032,2,27,3570,3,24,3362,2,7,4191,2,18,2450,3,...,7,5,7,3,2,5,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,3,2,2,2,19,1,2,1,10,1,10,2,1,3,political science,31,33,32
1796,2,25,40037,3,35,5262,2,29,10343,1,22,4624,3,32,6223,2,42,8265,3,3,9968,3,15,1515,4,11,11875,4,10,13266,2,7,14688,2,17,10797,3,24,1199,2,...,2,6,6,5,3,2,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,4,2,1,2,24,2,1,2,1,1,60,2,1,2,,46,33,35
21861,2,9,5901,4,20,14174,2,22,10254,1,12,4115,1,42,8455,1,32,6866,2,25,4533,2,30,3793,2,15,19898,1,40,4467,2,28,2179,3,2,21060,2,37,3279,2,...,5,7,6,5,6,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,2,2,19,1,1,1,10,1,10,2,1,2,,22,26,29
26571,3,26,5568,3,40,11086,3,6,6199,4,7,5730,4,28,23855,4,29,4265,4,10,4620,4,15,1907,4,35,6818,4,41,3198,4,21,2900,4,25,1293,4,23,1465,3,...,5,7,7,7,1,7,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,4,3,2,1,23,1,1,2,10,4,70,2,1,11,Accounting,54,53,53
28720,1,31,3259,1,24,2679,1,42,2641,1,37,2258,2,34,2652,2,7,2918,1,20,2757,1,35,1799,2,26,4225,1,2,6396,1,27,2527,2,15,3294,2,32,3046,2,...,5,6,5,6,6,6,1,1,0,1,1,0,0,0,0,1,0,0,0,1,1,1,3,1,2,2,25,1,1,1,10,1,10,2,1,3,Psychology,19,21,20


Add on the 5 categories to the dataframe. Make 3 separate dataframes for depression, anxiety, and stress one-hot encoding process.

In [15]:
df['Normal'] = 0
df['Mild'] = 0
df['Moderate'] = 0
df['Severe'] = 0
df['Extremely Severe'] = 0

depression_df = df.copy()
anxiety_df = df.copy()
stress_df = df.copy()

In [16]:
depression_df.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,Q4I,Q4E,Q5A,Q5I,Q5E,Q6A,Q6I,Q6E,Q7A,Q7I,Q7E,Q8A,Q8I,Q8E,Q9A,Q9I,Q9E,Q10A,Q10I,Q10E,Q11A,Q11I,Q11E,Q12A,Q12I,Q12E,Q13A,Q13I,Q13E,Q14A,...,TIPI10,VCL1,VCL2,VCL3,VCL4,VCL5,VCL6,VCL7,VCL8,VCL9,VCL10,VCL11,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,engnat,age,screensize,uniquenetworklocation,hand,religion,orientation,race,voted,married,familysize,major,Depression Sum,Anxiety Sum,Stress Sum,Normal,Mild,Moderate,Severe,Extremely Severe
21513,3,16,4447,2,40,4928,1,35,3848,1,9,3962,2,13,3691,2,19,5715,2,6,7017,2,21,7363,2,5,17032,2,27,3570,3,24,3362,2,7,4191,2,18,2450,3,...,5,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,3,2,2,2,19,1,2,1,10,1,10,2,1,3,political science,31,33,32,0,0,0,0,0
1796,2,25,40037,3,35,5262,2,29,10343,1,22,4624,3,32,6223,2,42,8265,3,3,9968,3,15,1515,4,11,11875,4,10,13266,2,7,14688,2,17,10797,3,24,1199,2,...,2,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,4,2,1,2,24,2,1,2,1,1,60,2,1,2,,46,33,35,0,0,0,0,0
21861,2,9,5901,4,20,14174,2,22,10254,1,12,4115,1,42,8455,1,32,6866,2,25,4533,2,30,3793,2,15,19898,1,40,4467,2,28,2179,3,2,21060,2,37,3279,2,...,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,2,2,19,1,1,1,10,1,10,2,1,2,,22,26,29,0,0,0,0,0
26571,3,26,5568,3,40,11086,3,6,6199,4,7,5730,4,28,23855,4,29,4265,4,10,4620,4,15,1907,4,35,6818,4,41,3198,4,21,2900,4,25,1293,4,23,1465,3,...,7,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,4,3,2,1,23,1,1,2,10,4,70,2,1,11,Accounting,54,53,53,0,0,0,0,0
28720,1,31,3259,1,24,2679,1,42,2641,1,37,2258,2,34,2652,2,7,2918,1,20,2757,1,35,1799,2,26,4225,1,2,6396,1,27,2527,2,15,3294,2,32,3046,2,...,6,1,1,0,1,1,0,0,0,0,1,0,0,0,1,1,1,3,1,2,2,25,1,1,1,10,1,10,2,1,3,Psychology,19,21,20,0,0,0,0,0


One-hot-encode depression data.

In [17]:
for index, row in depression_df.iterrows():
    if row['Depression Sum'] <= 9: 
      depression_df.at[index,'Normal'] = 1
    elif row['Depression Sum'] > 9 and row['Depression Sum'] <= 13:
      depression_df.at[index,'Mild'] = 1
    elif row['Depression Sum'] >= 14 and row['Depression Sum'] <= 20:
      depression_df.at[index,'Moderate'] = 1
    elif row['Depression Sum'] >= 21 and row['Depression Sum'] <= 27:
      depression_df.at[index,'Severe'] = 1
    else:
      depression_df.at[index,'Extremely Severe'] = 1

In [18]:
depression_df.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,Q4I,Q4E,Q5A,Q5I,Q5E,Q6A,Q6I,Q6E,Q7A,Q7I,Q7E,Q8A,Q8I,Q8E,Q9A,Q9I,Q9E,Q10A,Q10I,Q10E,Q11A,Q11I,Q11E,Q12A,Q12I,Q12E,Q13A,Q13I,Q13E,Q14A,...,TIPI10,VCL1,VCL2,VCL3,VCL4,VCL5,VCL6,VCL7,VCL8,VCL9,VCL10,VCL11,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,engnat,age,screensize,uniquenetworklocation,hand,religion,orientation,race,voted,married,familysize,major,Depression Sum,Anxiety Sum,Stress Sum,Normal,Mild,Moderate,Severe,Extremely Severe
21513,3,16,4447,2,40,4928,1,35,3848,1,9,3962,2,13,3691,2,19,5715,2,6,7017,2,21,7363,2,5,17032,2,27,3570,3,24,3362,2,7,4191,2,18,2450,3,...,5,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,3,2,2,2,19,1,2,1,10,1,10,2,1,3,political science,31,33,32,0,0,0,0,1
1796,2,25,40037,3,35,5262,2,29,10343,1,22,4624,3,32,6223,2,42,8265,3,3,9968,3,15,1515,4,11,11875,4,10,13266,2,7,14688,2,17,10797,3,24,1199,2,...,2,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,4,2,1,2,24,2,1,2,1,1,60,2,1,2,,46,33,35,0,0,0,0,1
21861,2,9,5901,4,20,14174,2,22,10254,1,12,4115,1,42,8455,1,32,6866,2,25,4533,2,30,3793,2,15,19898,1,40,4467,2,28,2179,3,2,21060,2,37,3279,2,...,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,2,2,19,1,1,1,10,1,10,2,1,2,,22,26,29,0,0,0,1,0
26571,3,26,5568,3,40,11086,3,6,6199,4,7,5730,4,28,23855,4,29,4265,4,10,4620,4,15,1907,4,35,6818,4,41,3198,4,21,2900,4,25,1293,4,23,1465,3,...,7,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,4,3,2,1,23,1,1,2,10,4,70,2,1,11,Accounting,54,53,53,0,0,0,0,1
28720,1,31,3259,1,24,2679,1,42,2641,1,37,2258,2,34,2652,2,7,2918,1,20,2757,1,35,1799,2,26,4225,1,2,6396,1,27,2527,2,15,3294,2,32,3046,2,...,6,1,1,0,1,1,0,0,0,0,1,0,0,0,1,1,1,3,1,2,2,25,1,1,1,10,1,10,2,1,3,Psychology,19,21,20,0,0,1,0,0


Do the same for stress and anxiety levels.

In [19]:
for index, row in anxiety_df.iterrows(): # anxiety
    if row['Anxiety Sum'] <= 7: 
      anxiety_df.at[index,'Normal'] = 1
    elif row['Anxiety Sum'] > 7 and row['Anxiety Sum'] <= 9:
      anxiety_df.at[index,'Mild'] = 1
    elif row['Anxiety Sum'] >= 10 and row['Anxiety Sum'] <= 14:
      anxiety_df.at[index,'Moderate'] = 1
    elif row['Anxiety Sum'] >= 15 and row['Anxiety Sum'] <= 19:
      anxiety_df.at[index,'Severe'] = 1
    else:
      anxiety_df.at[index,'Extremely Severe'] = 1

for index, row in stress_df.iterrows(): # stress
    if row['Stress Sum'] <= 14: 
      stress_df.at[index,'Normal'] = 1
    elif row['Stress Sum'] > 14 and row['Stress Sum'] <= 18:
      stress_df.at[index,'Mild'] = 1
    elif row['Stress Sum'] >= 19 and row['Stress Sum'] <= 25:
      stress_df.at[index,'Moderate'] = 1
    elif row['Stress Sum'] >= 26 and row['Stress Sum'] <= 33:
      stress_df.at[index,'Severe'] = 1
    else:
      stress_df.at[index,'Extremely Severe'] = 1

Check to make sure these worked properly.

In [20]:
anxiety_df.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,Q4I,Q4E,Q5A,Q5I,Q5E,Q6A,Q6I,Q6E,Q7A,Q7I,Q7E,Q8A,Q8I,Q8E,Q9A,Q9I,Q9E,Q10A,Q10I,Q10E,Q11A,Q11I,Q11E,Q12A,Q12I,Q12E,Q13A,Q13I,Q13E,Q14A,...,TIPI10,VCL1,VCL2,VCL3,VCL4,VCL5,VCL6,VCL7,VCL8,VCL9,VCL10,VCL11,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,engnat,age,screensize,uniquenetworklocation,hand,religion,orientation,race,voted,married,familysize,major,Depression Sum,Anxiety Sum,Stress Sum,Normal,Mild,Moderate,Severe,Extremely Severe
21513,3,16,4447,2,40,4928,1,35,3848,1,9,3962,2,13,3691,2,19,5715,2,6,7017,2,21,7363,2,5,17032,2,27,3570,3,24,3362,2,7,4191,2,18,2450,3,...,5,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,3,2,2,2,19,1,2,1,10,1,10,2,1,3,political science,31,33,32,0,0,0,0,1
1796,2,25,40037,3,35,5262,2,29,10343,1,22,4624,3,32,6223,2,42,8265,3,3,9968,3,15,1515,4,11,11875,4,10,13266,2,7,14688,2,17,10797,3,24,1199,2,...,2,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,4,2,1,2,24,2,1,2,1,1,60,2,1,2,,46,33,35,0,0,0,0,1
21861,2,9,5901,4,20,14174,2,22,10254,1,12,4115,1,42,8455,1,32,6866,2,25,4533,2,30,3793,2,15,19898,1,40,4467,2,28,2179,3,2,21060,2,37,3279,2,...,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,2,2,19,1,1,1,10,1,10,2,1,2,,22,26,29,0,0,0,0,1
26571,3,26,5568,3,40,11086,3,6,6199,4,7,5730,4,28,23855,4,29,4265,4,10,4620,4,15,1907,4,35,6818,4,41,3198,4,21,2900,4,25,1293,4,23,1465,3,...,7,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,4,3,2,1,23,1,1,2,10,4,70,2,1,11,Accounting,54,53,53,0,0,0,0,1
28720,1,31,3259,1,24,2679,1,42,2641,1,37,2258,2,34,2652,2,7,2918,1,20,2757,1,35,1799,2,26,4225,1,2,6396,1,27,2527,2,15,3294,2,32,3046,2,...,6,1,1,0,1,1,0,0,0,0,1,0,0,0,1,1,1,3,1,2,2,25,1,1,1,10,1,10,2,1,3,Psychology,19,21,20,0,0,0,0,1


In [21]:
stress_df.head()

Unnamed: 0,Q1A,Q1I,Q1E,Q2A,Q2I,Q2E,Q3A,Q3I,Q3E,Q4A,Q4I,Q4E,Q5A,Q5I,Q5E,Q6A,Q6I,Q6E,Q7A,Q7I,Q7E,Q8A,Q8I,Q8E,Q9A,Q9I,Q9E,Q10A,Q10I,Q10E,Q11A,Q11I,Q11E,Q12A,Q12I,Q12E,Q13A,Q13I,Q13E,Q14A,...,TIPI10,VCL1,VCL2,VCL3,VCL4,VCL5,VCL6,VCL7,VCL8,VCL9,VCL10,VCL11,VCL12,VCL13,VCL14,VCL15,VCL16,education,urban,gender,engnat,age,screensize,uniquenetworklocation,hand,religion,orientation,race,voted,married,familysize,major,Depression Sum,Anxiety Sum,Stress Sum,Normal,Mild,Moderate,Severe,Extremely Severe
21513,3,16,4447,2,40,4928,1,35,3848,1,9,3962,2,13,3691,2,19,5715,2,6,7017,2,21,7363,2,5,17032,2,27,3570,3,24,3362,2,7,4191,2,18,2450,3,...,5,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,3,2,2,2,19,1,2,1,10,1,10,2,1,3,political science,31,33,32,0,0,0,1,0
1796,2,25,40037,3,35,5262,2,29,10343,1,22,4624,3,32,6223,2,42,8265,3,3,9968,3,15,1515,4,11,11875,4,10,13266,2,7,14688,2,17,10797,3,24,1199,2,...,2,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,1,4,2,1,2,24,2,1,2,1,1,60,2,1,2,,46,33,35,0,0,0,0,1
21861,2,9,5901,4,20,14174,2,22,10254,1,12,4115,1,42,8455,1,32,6866,2,25,4533,2,30,3793,2,15,19898,1,40,4467,2,28,2179,3,2,21060,2,37,3279,2,...,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,2,2,19,1,1,1,10,1,10,2,1,2,,22,26,29,0,0,0,1,0
26571,3,26,5568,3,40,11086,3,6,6199,4,7,5730,4,28,23855,4,29,4265,4,10,4620,4,15,1907,4,35,6818,4,41,3198,4,21,2900,4,25,1293,4,23,1465,3,...,7,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,4,3,2,1,23,1,1,2,10,4,70,2,1,11,Accounting,54,53,53,0,0,0,0,1
28720,1,31,3259,1,24,2679,1,42,2641,1,37,2258,2,34,2652,2,7,2918,1,20,2757,1,35,1799,2,26,4225,1,2,6396,1,27,2527,2,15,3294,2,32,3046,2,...,6,1,1,0,1,1,0,0,0,0,1,0,0,0,1,1,1,3,1,2,2,25,1,1,1,10,1,10,2,1,3,Psychology,19,21,20,0,0,1,0,0


Split the data into training and testing data. Predict depression, anxiety, and stress scores based on response times.



In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# for depression scores

dep_r_cols = ['Q3E', 'Q5E', 'Q10E', 'Q13E', 'Q16E', 'Q17E', 'Q21E', 'Q24E',\
            'Q26E', 'Q31E', 'Q34E', 'Q37E', 'Q38E', 'Q42E']

d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(
        depression_df[dep_r_cols], depression_df['Depression Sum'], 
        test_size=0.30, random_state=42)


Perform 5-fold cross validation for depression data.

In [31]:
s = d_X_train.shape[0]

set1 = d_X_train[:int(0.2*s)]
set1_y = d_y_train[:int(0.2*s)]

set2 = d_X_train[int(0.2*s):int(0.4*s)]
set2_y = d_y_train[int(0.2*s):int(0.4*s)]

set3 = d_X_train[int(0.4*s):int(0.6*s)]
set3_y = d_y_train[int(0.4*s):int(0.6*s)]

set4 = d_X_train[int(0.6*s):int(0.8*s)]
set4_y = d_y_train[int(0.6*s):int(0.8*s)]

set5 = d_X_train[int(0.8*s):]
set5_y = d_y_train[int(0.8*s):]

set_list_x = [set1, set2, set3, set4, set5]
set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]

reg_acc = 0

for i in range(5):
    new_set_list_x = [set1, set2, set3, set4, set5]
    new_set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]
        
    X_valid = new_set_list_x.pop(i)
    y_valid = new_set_list_y.pop(i)
        
    X_training = np.concatenate(new_set_list_x)
    y_training = np.concatenate(new_set_list_y)        
      
    reg = LinearRegression()
    reg.fit(X_training, y_training)
    reg_acc += reg.score(X_valid, y_valid)
    
print('R-squared:', reg_acc/5)

R-squared: -0.012196561718376376


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


Repeat the process for anxiety and stress.

In [33]:
anx_r_cols = ['Q2E', 'Q4E', 'Q7E', 'Q9E', 'Q15E', 'Q19E', 'Q20E', 'Q23E',\
            'Q25E', 'Q28E', 'Q30E', 'Q36E', 'Q40E', 'Q41E']

a_X_train, a_X_test, a_y_train, a_y_test = train_test_split(
        anxiety_df[anx_r_cols], anxiety_df['Anxiety Sum'], 
        test_size=0.30, random_state=42)

s = a_X_train.shape[0]

set1 = a_X_train[:int(0.2*s)]
set1_y = a_y_train[:int(0.2*s)]

set2 = a_X_train[int(0.2*s):int(0.4*s)]
set2_y = a_y_train[int(0.2*s):int(0.4*s)]

set3 = a_X_train[int(0.4*s):int(0.6*s)]
set3_y = a_y_train[int(0.4*s):int(0.6*s)]

set4 = a_X_train[int(0.6*s):int(0.8*s)]
set4_y = a_y_train[int(0.6*s):int(0.8*s)]

set5 = a_X_train[int(0.8*s):]
set5_y = a_y_train[int(0.8*s):]

set_list_x = [set1, set2, set3, set4, set5]
set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]

reg_acc = 0

for i in range(5):
    new_set_list_x = [set1, set2, set3, set4, set5]
    new_set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]
        
    X_valid = new_set_list_x.pop(i)
    y_valid = new_set_list_y.pop(i)
        
    X_training = np.concatenate(new_set_list_x)
    y_training = np.concatenate(new_set_list_y)        
      
    reg = LinearRegression()
    reg.fit(X_training, y_training)
    reg_acc += reg.score(X_valid, y_valid)
    
print('R-squared:', reg_acc/5)

R-squared: -0.26102081474706496


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [34]:
str_r_cols = ['Q1E', 'Q6E', 'Q8E', 'Q11E', 'Q12E', 'Q14E', 'Q18E', 'Q22E',\
            'Q27E', 'Q29E', 'Q32E', 'Q33E', 'Q35E', 'Q39E']

s_X_train, s_X_test, s_y_train, s_y_test = train_test_split(
        stress_df[str_r_cols], stress_df['Stress Sum'], 
        test_size=0.30, random_state=42)

s = s_X_train.shape[0]

set1 = s_X_train[:int(0.2*s)]
set1_y = s_y_train[:int(0.2*s)]

set2 = s_X_train[int(0.2*s):int(0.4*s)]
set2_y = s_y_train[int(0.2*s):int(0.4*s)]

set3 = s_X_train[int(0.4*s):int(0.6*s)]
set3_y = s_y_train[int(0.4*s):int(0.6*s)]

set4 = s_X_train[int(0.6*s):int(0.8*s)]
set4_y = s_y_train[int(0.6*s):int(0.8*s)]

set5 = s_X_train[int(0.8*s):]
set5_y = s_y_train[int(0.8*s):]

set_list_x = [set1, set2, set3, set4, set5]
set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]

reg_acc = 0

for i in range(5):
    new_set_list_x = [set1, set2, set3, set4, set5]
    new_set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]
        
    X_valid = new_set_list_x.pop(i)
    y_valid = new_set_list_y.pop(i)
        
    X_training = np.concatenate(new_set_list_x)
    y_training = np.concatenate(new_set_list_y)        
      
    reg = LinearRegression()
    reg.fit(X_training, y_training)
    reg_acc += reg.score(X_valid, y_valid)
    
print('R-squared:', reg_acc/5)

R-squared: -0.0020502059874742073


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


Our calculated R^2 values suggest that there is almost no correlation between response times and depression, anxiety, and stress scores. Next, we will check to see if the following are good predictors of 'Extremely Severe' depression, anxiety, and stress levels:      

*   Education
*   Gender
*   Orientation
*   Religion

We will try a range of classifiers and see which work the best.






In [81]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression

# for depression

dep_r_cols = ['education', 'gender', 'orientation', 'religion']

d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(
        depression_df[dep_r_cols], depression_df['Extremely Severe'], 
        test_size=0.30, random_state=42)

s = d_X_train.shape[0]

set1 = d_X_train[:int(0.2*s)]
set1_y = d_y_train[:int(0.2*s)]

set2 = d_X_train[int(0.2*s):int(0.4*s)]
set2_y = d_y_train[int(0.2*s):int(0.4*s)]

set3 = d_X_train[int(0.4*s):int(0.6*s)]
set3_y = d_y_train[int(0.4*s):int(0.6*s)]

set4 = d_X_train[int(0.6*s):int(0.8*s)]
set4_y = d_y_train[int(0.6*s):int(0.8*s)]

set5 = d_X_train[int(0.8*s):]
set5_y = d_y_train[int(0.8*s):]

set_list_x = [set1, set2, set3, set4, set5]
set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]

clf_list = [('Decision Tree', DecisionTreeClassifier()), \
            ('kNN', KNeighborsClassifier()), \
            ('Perceptron', Perceptron(max_iter = 20)), \
            ('Logistic Regression', LogisticRegression())]

for clf_type in clf_list:
    print('Classifier name:', clf_type[0])
    
    clf = clf_type[1] # instantiate class
    clf_acc = 0 # keep scores stored to average later
    
    for i in range(len(set_list_x)):
        new_set_list_x = [set1, set2, set3, set4, set5]
        new_set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]
        
        X_valid = new_set_list_x.pop(i)
        y_valid = new_set_list_y.pop(i)
        
        X_training = np.concatenate(new_set_list_x)
        y_training = np.concatenate(new_set_list_y)        
        
        clf.fit(X_training.reshape(-1, 4), y_training)
        clf_acc += clf.score(np.array(X_valid).reshape(-1, 4), y_valid)
    
    print('Accuracy:', clf_acc/5)

Classifier name: Decision Tree
Accuracy: 0.6707849332202278
Classifier name: kNN
Accuracy: 0.6185259495297243
Classifier name: Perceptron
Accuracy: 0.5005460234508352
Classifier name: Logistic Regression
Accuracy: 0.6801593720781913


The logistic regression works the best here, yielding an accuracy of 0.68 on predicting whether or not someone is 'Extremely Severely' depressed based on their gender, age, orientation, and religion. We will do the same for 'Extremely Severe' anxiety and stress.


In [80]:
anx_r_cols = ['education', 'gender', 'orientation', 'religion']

a_X_train, a_X_test, a_y_train, a_y_test = train_test_split(
        anxiety_df[anx_r_cols], anxiety_df['Extremely Severe'], 
        test_size=0.30, random_state=42)

s = a_X_train.shape[0]

set1 = a_X_train[:int(0.2*s)]
set1_y = a_y_train[:int(0.2*s)]

set2 = a_X_train[int(0.2*s):int(0.4*s)]
set2_y = a_y_train[int(0.2*s):int(0.4*s)]

set3 = a_X_train[int(0.4*s):int(0.6*s)]
set3_y = a_y_train[int(0.4*s):int(0.6*s)]

set4 = a_X_train[int(0.6*s):int(0.8*s)]
set4_y = a_y_train[int(0.6*s):int(0.8*s)]

set5 = a_X_train[int(0.8*s):]
set5_y = a_y_train[int(0.8*s):]

set_list_x = [set1, set2, set3, set4, set5]
set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]

clf_list = [('Decision Tree', DecisionTreeClassifier()), \
            ('kNN', KNeighborsClassifier()), \
            ('Perceptron', Perceptron(max_iter = 20)), \
            ('Logistic Regression', LogisticRegression())]

for clf_type in clf_list:
    print('Classifier name:', clf_type[0])
    
    clf = clf_type[1] # instantiate class
    clf_acc = 0 # keep scores stored to average later
    
    for i in range(len(set_list_x)):
        new_set_list_x = [set1, set2, set3, set4, set5]
        new_set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]
        
        X_valid = new_set_list_x.pop(i)
        y_valid = new_set_list_y.pop(i)
        
        X_training = np.concatenate(new_set_list_x)
        y_training = np.concatenate(new_set_list_y)        
        
        clf.fit(X_training.reshape(-1, 4), y_training)
        clf_acc += clf.score(np.array(X_valid).reshape(-1, 4), y_valid)
    
    print('Accuracy:', clf_acc/5)

Classifier name: Decision Tree
Accuracy: 0.8205226734922177
Classifier name: kNN
Accuracy: 0.8071255363743879
Classifier name: Perceptron
Accuracy: 0.6690901939719671
Classifier name: Logistic Regression
Accuracy: 0.8254791765995255


The logistic regression works the best here, yielding an accuracy of 0.83 on predicting whether or not someone is 'Extremely Severely' anxious based on their gender, age, orientation, and religion. Finally, we will do this for "Extremely Severe" stress.

In [85]:
str_r_cols = ['education', 'gender', 'orientation', 'religion']

s_X_train, s_X_test, s_y_train, s_y_test = train_test_split(
        stress_df[str_r_cols], stress_df['Extremely Severe'], 
        test_size=0.30, random_state=42)

s = s_X_train.shape[0]

set1 = s_X_train[:int(0.2*s)]
set1_y = s_y_train[:int(0.2*s)]

set2 = s_X_train[int(0.2*s):int(0.4*s)]
set2_y = s_y_train[int(0.2*s):int(0.4*s)]

set3 = s_X_train[int(0.4*s):int(0.6*s)]
set3_y = s_y_train[int(0.4*s):int(0.6*s)]

set4 = s_X_train[int(0.6*s):int(0.8*s)]
set4_y = s_y_train[int(0.6*s):int(0.8*s)]

set5 = s_X_train[int(0.8*s):]
set5_y = s_y_train[int(0.8*s):]

set_list_x = [set1, set2, set3, set4, set5]
set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]

clf_list = [('Decision Tree', DecisionTreeClassifier()), \
            ('kNN', KNeighborsClassifier()), \
            ('Perceptron', Perceptron(max_iter = 20)), \
            ('Logistic Regression', LogisticRegression())]

for clf_type in clf_list:
    print('Classifier name:', clf_type[0])
    
    clf = clf_type[1] # instantiate class
    clf_acc = 0 # keep scores stored to average later
    
    for i in range(len(set_list_x)):
        new_set_list_x = [set1, set2, set3, set4, set5]
        new_set_list_y = [set1_y, set2_y, set3_y, set4_y, set5_y]
        
        X_valid = new_set_list_x.pop(i)
        y_valid = new_set_list_y.pop(i)
        
        X_training = np.concatenate(new_set_list_x)
        y_training = np.concatenate(new_set_list_y)        
        
        clf.fit(X_training.reshape(-1, 4), y_training)
        clf_acc += clf.score(np.array(X_valid).reshape(-1, 4), y_valid)
    
    print('Accuracy:', clf_acc/5)

Classifier name: Decision Tree
Accuracy: 0.5799150430957084
Classifier name: kNN
Accuracy: 0.5441782094228518
Classifier name: Perceptron
Accuracy: 0.5297052985224033
Classifier name: Logistic Regression
Accuracy: 0.5791607134011555


The decision tree classifier works the best here, yielding an accuracy of 0.58 on predicting whether or not someone is 'Extremely Severely' stressed based on their gender, age, orientation, and religion. Similar results can be found on the other one-hot encoded values using such classifiers.

Finally, we will calculate recall, precision, and the f1 score on the logistic regression classifier for depression, anxiety, and stress.

In [86]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# depression
d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(
        depression_df[dep_r_cols], depression_df['Extremely Severe'], 
        test_size=0.30, random_state=42)

d_reg = LogisticRegression().fit(d_X_train, d_y_train)
predictions = d_reg.predict(d_X_test)

print('Recall score:', recall_score(d_y_test, predictions))
print('Precision score:', precision_score(d_y_test, predictions))
print('F1 score:', f1_score(d_y_test, predictions))

Recall score: 0.9993894993894994
Precision score: 0.686545881563496
F1 score: 0.8139419252187748


In [87]:
# anxiety
a_X_train, a_X_test, a_y_train, a_y_test = train_test_split(
        anxiety_df[anx_r_cols], anxiety_df['Extremely Severe'], 
        test_size=0.30, random_state=42)

a_reg = LogisticRegression().fit(a_X_train, a_y_train)
predictions = a_reg.predict(a_X_test)

print('Recall score:', recall_score(a_y_test, predictions))
print('Precision score:', precision_score(a_y_test, predictions))
print('F1 score:', f1_score(a_y_test, predictions))

Recall score: 0.9998990103009493
Precision score: 0.8298549995809237
F1 score: 0.9069756790180002


In [88]:
# stress
s_X_train, s_X_test, s_y_train, s_y_test = train_test_split(
        stress_df[str_r_cols], stress_df['Extremely Severe'], 
        test_size=0.30, random_state=42)

s_reg = LogisticRegression().fit(s_X_train, s_y_train)
predictions = a_reg.predict(s_X_test)

print('Recall score:', recall_score(s_y_test, predictions))
print('Precision score:', precision_score(s_y_test, predictions))
print('F1 score:', f1_score(s_y_test, predictions))

Recall score: 1.0
Precision score: 0.5530969742687117
F1 score: 0.7122504047490555
