In [1]:
from src import utils
import pandas as pd

# Data processing: Join the PHQ9 and GAD7 datasets on record_id and redcap_event_name, then attach the ACE dataset by record_id

In [12]:
# Load every table used below in a single call; the result is indexed by data key.
data = utils.data_load(
    data_keys={
        'phq9',
        'generalized_anxiety_disorder_scale_gad7',
        'ace',
        'surveys',
    },
)

In [13]:
# Outer join so that a (record_id, redcap_event_name) pair present in only one of the
# two questionnaires is still kept, with the other questionnaire's columns left as NaN.
outcomes = data['phq9'].merge(
    data['generalized_anxiety_disorder_scale_gad7'],
    how='outer',
    on=['record_id', 'redcap_event_name'],
)

In [15]:
# Attach the ACE answers to every outcome row of the same participant.
overall_df = (
    data['ace']
    # The ACE event name is dropped so the join key is record_id alone.
    .drop(columns=['redcap_event_name'])
    # Discard ACE rows that contain any missing value.
    .dropna()
    # Left join: participants with ACE data are kept even without PHQ9/GAD7 rows.
    .merge(outcomes, how='left', on='record_id')
)

In [16]:
# Widen the pandas display limits so the wide merged frame is shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Preview the first rows of the merged frame.
overall_df.head()

Unnamed: 0,record_id,ace_1,ace_2,ace_3,ace_4,ace_5,ace_6,ace_7,ace_8,ace_9,ace_10,ace_complete,redcap_event_name,phq9_1,phq9_2,phq9_3,phq9_4,phq9_5,phq9_6,phq9_7,phq9_8,phq9_9,phq9_10,phq9_complete,gad_1,gad_2,gad_3,gad_4,gad_5,gad_6,gad_7,gad_8,generalized_anxiety_disorder_scale_gad7_complete
0,4,1,1,0,1.0,1,0,1,1,0,0,2,ci_1_arm_1,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2,1.0,1.0,0.0,2.0,1.0,3.0,0.0,1.0,2.0
1,4,1,1,0,1.0,1,0,1,1,0,0,2,ci_2_arm_1,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2,2.0,1.0,1.0,1.0,1.0,3.0,0.0,1.0,2.0
2,4,1,1,0,1.0,1,0,1,1,0,0,2,ci_3_arm_1,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2,1.0,1.0,0.0,2.0,2.0,3.0,1.0,1.0,2.0
3,4,1,1,0,1.0,1,0,1,1,0,0,2,ci_4_arm_1,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,2.0
4,4,1,1,0,1.0,1,0,1,1,0,0,2,ci_5_arm_1,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,2.0


# Process PHQ9 and GAD7 data by averaging each item over time for each individual, and sum each individual's ACE items

In [38]:
# Build one summary row per participant (record_id):
#   * ace_sum      - sum over the ten ACE items of that participant's per-item mean
#   * phq9_*_mean  - mean of each PHQ9 item over all of the participant's rows
#   * gad_*_mean   - mean of each GAD item over all of the participant's rows
#
# This is done with a single groupby instead of growing a DataFrame row by row:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop is quadratic.

# Item columns of each questionnaire, and the names of the derived mean columns.
ace_lst = [f'ace_{x}' for x in range(1, 11)]
phq9_lst = [f'phq9_{x}' for x in range(1, 11)]
gad_lst = [f'gad_{x}' for x in range(1, 9)]
phq9_mean_lst = [f'{col}_mean' for col in phq9_lst]
gad_mean_lst = [f'{col}_mean' for col in gad_lst]

# sort=False keeps participants in order of first appearance, matching the order
# produced by iterating over overall_df['record_id'].unique().
# mean() skips NaN, so missing answers do not count towards a participant's average.
per_participant_mean = (
    overall_df
    .groupby('record_id', sort=False)[ace_lst + phq9_lst + gad_lst]
    .mean()
)

processed_overall_df = (
    per_participant_mean[phq9_lst + gad_lst]
    .add_suffix('_mean')
    .assign(ace_sum=per_participant_mean[ace_lst].sum(axis=1))
    # record_id moves from the index back to a regular column and keeps its original dtype.
    .reset_index()
    .loc[:, ['record_id', 'ace_sum'] + phq9_mean_lst + gad_mean_lst]
)


In [40]:
# Preview the first five per-participant summary rows.
processed_overall_df.head(n=5)

Unnamed: 0,record_id,ace_sum,phq9_1_mean,phq9_2_mean,phq9_3_mean,phq9_4_mean,phq9_5_mean,phq9_6_mean,phq9_7_mean,phq9_8_mean,phq9_9_mean,phq9_10_mean,gad_1_mean,gad_2_mean,gad_3_mean,gad_4_mean,gad_5_mean,gad_6_mean,gad_7_mean,gad_8_mean
0,4.0,6.0,0.071429,0.0,2.357143,2.785714,0.214286,0.142857,0.5,0.5,0.0,1.0,1.428571,0.785714,0.428571,1.214286,1.0,2.571429,0.142857,1.0
1,5.0,4.0,0.923077,0.692308,0.923077,1.538462,1.307692,0.538462,0.769231,0.5,0.076923,1.384615,1.615385,1.076923,1.153846,1.615385,1.461538,1.538462,0.461538,1.384615
2,6.0,0.0,0.083333,0.166667,0.5,1.583333,0.0,0.083333,0.083333,0.0,0.0,0.454545,0.916667,0.25,0.5,0.083333,0.083333,0.916667,0.5,0.454545
3,7.0,4.0,0.333333,0.666667,2.666667,1.666667,1.0,1.0,0.666667,0.0,0.0,0.333333,2.0,2.333333,2.333333,2.333333,1.0,2.333333,1.333333,0.333333
4,11.0,0.0,0.0,0.0,0.153846,0.384615,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0


# Next: Global survey data (PROMIS quality of life). TODO: I have not yet been able to join this with the overall_df dataframe.

In [8]:
# Keep only the survey rows that answer the quality-of-life question.
promis_survey = data['surveys'].loc[
    lambda surveys: surveys['question_text'].eq(
        'In general, would you say your quality of life is:'
    )
]

In [29]:
# Two-column lookup table pairing each evidation_id with its record_id.
# Several columns must be selected with a list of names (double brackets); a bare
# 'evidation_id', 'record_id' subscript is read as one tuple key and raises KeyError.
# NOTE(review): presumably evidation_id corresponds to user_id in the surveys table,
# which would allow joining the survey data to overall_df via record_id - confirm.
user_ids = utils.data_load(data_keys={'study_ids'})['study_ids'][['evidation_id', 'record_id']]

In [40]:
# Spot-check the quality-of-life answers recorded for a single user (user_id 37).
promis_survey[promis_survey['user_id'].eq(37)]

Unnamed: 0,id,user_id,survey_id,title,event_date,from,to,created_at,updated_at,question_id,question_text,answer_text,date
214098,154994,37,11,Global survey,2021-07-13 00:00:00,2021-07-13 04:00:00,2021-07-14 04:00:00,2021-07-13 04:08:01.137807,2021-07-13 17:58:54.62095,115,"In general, would you say your quality of life...",Excellent,2021-07-13
214099,127778,81,11,Global survey,2021-07-01 00:00:00,2021-07-01 04:00:00,2021-07-02 04:00:00,2021-07-01 04:07:32.412659,2021-07-01 11:54:47.462582,115,"In general, would you say your quality of life...",Excellent,2021-07-01
214100,145441,110,11,Global survey,2021-07-09 00:00:00,2021-07-09 04:00:00,2021-07-10 04:00:00,2021-07-09 04:08:00.721924,2021-07-09 12:01:56.842804,115,"In general, would you say your quality of life...",Excellent,2021-07-09
214101,132847,94,11,Global survey,2021-07-03 00:00:00,2021-07-03 05:00:00,2021-07-04 05:00:00,2021-07-03 05:07:42.444653,2021-07-03 20:32:52.358196,115,"In general, would you say your quality of life...",Excellent,2021-07-03
214102,127773,55,11,Global survey,2021-07-01 00:00:00,2021-07-01 04:00:00,2021-07-02 04:00:00,2021-07-01 04:07:32.076611,2021-07-01 15:03:31.756573,115,"In general, would you say your quality of life...",Excellent,2021-07-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
215354,469941,186,11,Global survey,2021-11-03 00:00:00,2021-11-02 04:00:00,2021-11-03 04:00:00,2021-11-02 04:10:30.989147,2021-11-03 00:48:27.847439,115,"In general, would you say your quality of life...",Fair,2021-11-03
215355,1070054,1745,11,Global survey,2022-03-11 00:00:00,2022-03-11 07:00:00,2022-03-12 06:00:00,2022-03-11 06:41:28.065601,2022-03-11 22:02:47.311421,115,"In general, would you say your quality of life...",Fair,2022-03-11
215356,993378,1699,11,Global survey,2022-02-25 00:00:00,2022-02-24 06:00:00,2022-02-25 05:00:00,2022-02-24 05:31:38.265878,2022-02-25 01:05:00.187846,115,"In general, would you say your quality of life...",Fair,2022-02-25
215357,963407,1021,11,Global survey,2022-02-19 00:00:00,2022-02-18 06:00:00,2022-02-19 05:00:00,2022-02-18 05:31:15.206333,2022-02-19 00:01:35.303646,115,"In general, would you say your quality of life...",Fair,2022-02-19


In [39]:
# Display the study_ids table to see which identifier columns it provides.
utils.data_load(
    data_keys={'study_ids'},
)['study_ids']

Unnamed: 0,record_id,redcap_event_name,participant_id,sema4_id,evidation_id,evidation_id_prior1,evidation_id_prior2,evidation_id_c,study_ids_complete
0,1,enrollment__partic_arm_1,1,EVI-HQYBGEMV,25.0,,,,2
1,2,enrollment__partic_arm_1,2,EVI-UYERWPZG,,,,,2
2,3,enrollment__partic_arm_1,3,EVI-IHVYSQTW,27.0,,,,2
3,4,enrollment__partic_arm_1,4,EVI-AWRFLXIP,28.0,,,,2
4,5,enrollment__partic_arm_1,5,EVI-SHDBCNYF,29.0,,,,2
...,...,...,...,...,...,...,...,...,...
687,893,enrollment__partic_arm_1,893,BUMP-ZCVLKXEF,2171.0,,,,2
688,894,enrollment__partic_arm_1,894,BUMP-CDYVZWGN,2180.0,,,,2
689,895,enrollment__partic_arm_1,895,BUMP-ESHQKOMW,,,,2185.0,2
690,896,enrollment__partic_arm_1,896,BUMP-APIGVLJQ,2209.0,,,,2


In [37]:
# Display the raw PHQ9 table as loaded, before any merging or averaging.
data['phq9']

Unnamed: 0,record_id,redcap_event_name,phq9_1,phq9_2,phq9_3,phq9_4,phq9_5,phq9_6,phq9_7,phq9_8,phq9_9,phq9_10,phq9_complete
0,4,ci_1_arm_1,0.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,2
1,4,ci_2_arm_1,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2
2,4,ci_3_arm_1,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2
3,4,ci_4_arm_1,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2
4,4,ci_5_arm_1,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3840,842,ci_3_arm_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3841,842,ci_4_arm_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
3842,854,ci_1_arm_1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2
3843,862,ci_1_arm_1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2
