In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import sys
sys.path.append("./")

In [23]:
# load INS-W_4 dataset for evaluation
import pandas as pd
import os

directory = "data_raw/INS-W_4/"

In [26]:
# survey data
survey_data_dir = os.path.join(directory, "SurveyData")

# load dep_weekly_new
dep_weekly_new = pd.read_csv(os.path.join(survey_data_dir, "dep_weekly_new.csv"))

print(dep_weekly_new.head())

          pid        date  phq4    dep  BDI2 dep_weekly_subscale  \
0  INS-W_1000  2021-03-28   2.0  False   NaN               False   
1  INS-W_1000  2021-04-04   3.0   True   NaN               False   
2  INS-W_1000  2021-04-11   6.0   True   NaN                True   
3  INS-W_1000  2021-04-18   1.0  False   NaN               False   
4  INS-W_1000  2021-04-25   5.0   True   NaN                True   

  anx_weekly_subscale  dep_weeklysubscale_endterm_merged  
0               False                              False  
1               False                              False  
2               False                               True  
3               False                              False  
4               False                               True  


In [49]:
# calculate the mean of each pid
dep_weekly_new_mean = dep_weekly_new.groupby(
    "pid")['phq4'].mean().reset_index()

# calculate the standard deviation of each pid
dep_weekly_new_std = dep_weekly_new.groupby("pid")['phq4'].std().reset_index()

# rename columns before joining to avoid suffix issues
dep_weekly_new_mean = dep_weekly_new_mean.rename(columns={'phq4': 'phq4_mean'})
dep_weekly_new_std = dep_weekly_new_std.rename(columns={'phq4': 'phq4_std'})

# merge the two dataframes on 'pid'
dep_weekly_mean_std = pd.merge(
    dep_weekly_new_mean, dep_weekly_new_std, on='pid', how='inner')

print(dep_weekly_mean_std.head())

          pid  phq4_mean  phq4_std
0  INS-W_1000   3.400000  2.011080
1  INS-W_1002   1.000000  0.632456
2  INS-W_1003   4.100000  1.728840
3  INS-W_1004   0.636364  0.809040
4  INS-W_1005   7.666667  1.505545


In [50]:
# normalize the phq4_mean and phq4_std columns
dep_weekly_mean_std['phq4_mean'] = (
    dep_weekly_mean_std['phq4_mean'] - dep_weekly_mean_std['phq4_mean'].min()) / (
        dep_weekly_mean_std['phq4_mean'].max() - dep_weekly_mean_std['phq4_mean'].min())
dep_weekly_mean_std['phq4_std'] = (
    dep_weekly_mean_std['phq4_std'] - dep_weekly_mean_std['phq4_std'].min()) / (
        dep_weekly_mean_std['phq4_std'].max() - dep_weekly_mean_std['phq4_std'].min())
print(dep_weekly_mean_std.head())

          pid  phq4_mean  phq4_std
0  INS-W_1000   0.294488  0.431773
1  INS-W_1002   0.086614  0.135786
2  INS-W_1003   0.355118  0.371177
3  INS-W_1004   0.055118  0.173698
4  INS-W_1005   0.664042  0.323236


In [52]:
# add another column, the sum of phq4_mean and phq4_std
dep_weekly_mean_std['phq4_sum'] = dep_weekly_mean_std['phq4_mean'] + \
    dep_weekly_mean_std['phq4_std']
# sort by phq4_sum column
dep_weekly_mean_std = dep_weekly_mean_std.sort_values(by="phq4_sum", ascending=False)
dep_weekly_mean_std.head(n = 10)

Unnamed: 0,pid,phq4_mean,phq4_std,phq4_sum
181,INS-W_984,0.365704,1.0,1.365704
99,INS-W_1222,1.0,0.323668,1.323668
51,INS-W_1055,0.669291,0.651851,1.321142
71,INS-W_1077,0.629921,0.637551,1.267473
94,INS-W_1217,0.637795,0.61684,1.254635
55,INS-W_1059,0.381102,0.853403,1.234506
5,INS-W_1006,0.528346,0.689783,1.218129
83,INS-W_1202,0.866142,0.332607,1.198749
6,INS-W_1007,0.503937,0.677696,1.181633
46,INS-W_1049,0.320472,0.828741,1.149214


In [None]:
# check if they have consistent bi-weekly data

In [33]:
# sort by phq4 column
dep_weekly_new_mean = dep_weekly_new_mean.sort_values(by="phq4", ascending=False)
dep_weekly_new_mean.head()

Unnamed: 0_level_0,phq4,dep,BDI2,dep_weeklysubscale_endterm_merged
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
INS-W_1222,11.545455,1.0,42.0,1.0
INS-W_1202,10.0,1.0,46.0,1.0
INS-W_1025,9.909091,1.0,25.0,1.0
INS-W_928,9.6,1.0,18.0,1.0
INS-W_1209,8.909091,1.0,31.0,0.916667


In [34]:
# sort by phq4 column
dep_weekly_new_std = dep_weekly_new_std.sort_values(by="phq4", ascending=False)
dep_weekly_new_std.head()

Unnamed: 0_level_0,phq4,dep,BDI2,dep_weeklysubscale_endterm_merged
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
INS-W_984,4.65773,0.514929,,0.452267
INS-W_1059,3.974921,0.547723,,0.447214
INS-W_1049,3.860052,0.522233,,0.40452
INS-W_1219,3.308239,0.527046,,0.421637
INS-W_1006,3.212822,0.301511,,0.522233


In [15]:
from data_loader import data_loader_ml

ds_keys = ["INS-W_4"]
dataset_dict_pred = data_loader_ml.data_loader(ds_keys_dict={
    "dep_weekly": ds_keys}, flag_more_feat_types=True, verbose=False)

In [16]:
data_depression_patients = dataset_dict_pred['dep_weekly']['INS-W_4'].datapoints

In [18]:
# show the statistics of depression patients
data_depression_patients.head()

Unnamed: 0,pid,date,X_raw,y_raw,y_allraw,device_type
0,INS-W_1000#INS-W_4,2021-03-28,pid date \ 0 INS-W...,False,pid INS-W_1000#INS-W_4 date 2021...,ios
1,INS-W_1000#INS-W_4,2021-04-04,pid date \ 0 INS-W...,True,pid INS-W_1000#INS-W_4 date 2021...,ios
2,INS-W_1000#INS-W_4,2021-04-11,pid date \ 0 INS-W...,True,pid INS-W_1000#INS-W_4 date 2021...,ios
3,INS-W_1000#INS-W_4,2021-04-18,pid date \ 0 INS-W...,False,pid INS-W_1000#INS-W_4 date 2021...,ios
4,INS-W_1000#INS-W_4,2021-04-25,pid date \ 0 INS-W...,True,pid INS-W_1000#INS-W_4 date 2021...,ios


In [20]:
data_depression_patients.iloc[0]

pid                                           INS-W_1000#INS-W_4
date                                         2021-03-28 00:00:00
X_raw                             pid       date  \
0   INS-W...
y_raw                                                      False
y_allraw       pid         INS-W_1000#INS-W_4
date       2021...
device_type                                                  ios
Name: 0, dtype: object