# In [1]:
import pandas as pd
import numpy as np

# Display limits: print at most 10 rows, never truncate the column list.
pd.options.display.max_rows = 10
pd.options.display.max_columns = None

# Folder that holds the original (raw) input spreadsheets.
dir_input = 'Raw data/'

# Folder that receives every file written by this script.
dir_output = '12w/'


################################################################################
#Data Extraction STOP-PD
################################################################################
# Sergio Mena Ortega, 2025

#------------- Covariates ----------------------------------------
# OUTPUT dataframe: df_covars
# Subject id and study site, taken from the week-12 rows of pqdem01.
selected_variables = ["src_subject_id", "site"]

# skiprows=[1] drops the sheet row directly below the header row.
df_covars = (
    pd.read_excel(f"{dir_input}pqdem01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

#-----------------------------------------------------------------


#------------- Clinical Global Impression (CGI) ------------------
# OUTPUT dataframe: df_cgi
# Only the subject id, the cgi_si item and the visit week are kept on load.
df_cgi = pd.read_excel(f"{dir_input}cgi01.xlsx", skiprows=[1])
df_cgi = df_cgi.filter(regex="^(src_subject_id$|cgi_si$|week$)")

# Keep the week-12 rows; the week column is not needed afterwards.
is_week_12 = df_cgi["week"] == 12
df_cgi = df_cgi.loc[is_week_12].drop(columns="week")
#-----------------------------------------------------------------

#------------- Movement side eff. (AIMS) -------------------------------
# OUTPUT dataframe: df_move
selected_variables = [
    "src_subject_id",
    "aims_facial1_date1", "aims_facial2_date1", "aims_facial3_date1", "aims_facial4_date1",
    "aims_extrem5_date1", "aims_extrem6_date1", "aims_trunk_score_date1",
    "aims_global8_date1", "aims_global9_date1", "aims_global10_date1",
    "aims_dental11_date1", "aims_dental12_date1", "aimsmed_ttl",
]

# Week-12 rows only, restricted to the columns listed above.
df_move = (
    pd.read_excel(f"{dir_input}aims01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)
#-----------------------------------------------------------------

#------------- Barnes Akathisia Rating Scale  --------------------
# OUTPUT dataframe: df_aka
selected_variables = ["src_subject_id", "bnsa3", "bnsa4", "bnsa1a", "bnsa2a"]

# Week-12 rows only, restricted to the columns listed above.
df_aka = (
    pd.read_excel(f"{dir_input}bns01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

#-----------------------------------------------------------------

#------------- Brief Psychiatric Rating Scale (BPRS)  ------------
# OUTPUT dataframe: df_bprs
selected_variables = [
    "src_subject_id",
    "bprs_somc", "bprs_anxi", "bprs_depr", "bprs_guil", "bprs_host", "bprs_susp",
    "bprs_unus", "bprs_gran", "bprs_hall", "bprs_diso", "bprs_conc", "bprs_exci",
    "bprs_motr", "bprs_blun", "bprs_tens", "bprs_mann", "bprs_unco", "bprs_emot",
    "bprs_total",
]

# Week-12 rows only, restricted to the id, the item scores and the total.
df_bprs = (
    pd.read_excel(f"{dir_input}bprs01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# On the individual items a 0 means "not assessed": blank it out.
# The id (first) and bprs_total (last) columns are left untouched.
df_bprs = df_bprs.replace({item: 0 for item in selected_variables[1:-1]}, np.nan)

#-----------------------------------------------------------------

#------------- Delusion Assessment Scale (DAS)  ------------
# OUTPUT dataframe: df_das
selected_variables = [
    "src_subject_id",
    "das_1", "das_2", "das_3", "das_4", "das_5", "das_6", "das_7",
    "das_8", "das_9", "das_10", "das_11", "das_12", "das_13", "das_14",
    "das_16",
    "das_impact", "das_disorg", "das_convict", "das_bizzare", "das_extension",
]

# Week-12 rows only, restricted to the columns listed above.
df_das = (
    pd.read_excel(f"{dir_input}del01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)
#-----------------------------------------------------------------

#------------- Mattis Dementia Rating Scale (MDRS)  ------------
# OUTPUT dataframe: df_mdrs
selected_variables = [
    "src_subject_id",
    "drse", "drsf", "drsg", "drsh", "drsi", "drsj",
    "drsk", "drsl", "drsm", "drsn", "drso", "drs2",
]

# Week-12 rows only, restricted to the columns listed above.
df_mdrs = (
    pd.read_excel(f"{dir_input}mdrs01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)
#-----------------------------------------------------------------


#------------- Hamilton Depression Rating Scale (HDRS)  ------------
# OUTPUT dataframe: df_hrsd
selected_variables = [
    "src_subject_id",
    "hsoin", "hmnin", "hemin", "hmdsd", "hpanx", "hinsg", "happt", "hwl",
    "hsanx", "hhypc", "hvwsf", "hsuic", "hintr", "hengy", "hslow", "hagit",
    "hsex", "hamd_02", "hamd_04", "hamd_22", "hamd_31", "hamd_32",
    "hamd_33", "hamd_34", "hamd_35", "hamd_36", "hamd_score_24",
]

# Week-12 rows only, restricted to the columns listed above.
df_hrsd = (
    pd.read_excel(f"{dir_input}hrsd01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# -9 codes "not assessed": blank it in every column except the subject id.
df_hrsd = df_hrsd.replace({col: -9 for col in selected_variables[1:]}, np.nan)

#-----------------------------------------------------------------

#------------- Mini-Mental State Examination (MMSE)  ------------
# OUTPUT dataframe: df_mmse
selected_variables = [
    "src_subject_id",
    "mmse01", "mmse02", "mmse03", "mmse04", "mmse05", "mmse06", "mmse07",
    "mmse08", "mmse22", "mmse23", "mmse24", "mmse28", "mmse29", "mmse30",
    "mmse_ts", "mmse20_1", "mmse11_1", "mmse12_1", "mmse13_1", "mmse6_1", "mmse7_1",
]

# Week-12 rows only, restricted to the columns listed above.
df_mmse = (
    pd.read_excel(f"{dir_input}mmse01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# On mmse12_1 an 8 codes "refused": treat it as missing.
df_mmse = df_mmse.assign(mmse12_1=df_mmse["mmse12_1"].replace(8, np.nan))
#-----------------------------------------------------------------


#------------- Cardiovascular Risk Form  ------------
# OUTPUT dataframe: df_cardio
selected_variables = [
    "src_subject_id",
    "cr1", "cr2", "cr2a", "estrogen", "cr3", "cr3a", "cr3b", "cr3c", "cr4",
    "cr5a", "cr5b", "cr5c", "cr6", "cr7", "cr8a", "cr8b", "cr8c", "cr9",
]

# Week-12 rows only, restricted to the columns listed above.
df_cardio = (
    pd.read_excel(f"{dir_input}screen01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# One-hot encode cr1 and cr2a. The source columns are removed and the
# indicator columns (named "<column>_<value>") are appended at the end,
# cr1 indicators first, then cr2a indicators.
df_cardio = pd.get_dummies(df_cardio, columns=["cr1", "cr2a"])

# Missing codes: -7 in estrogen and 9 in cr3 become NaN ...
df_cardio = df_cardio.replace({"estrogen": -7, "cr3": 9}, np.nan)
# ... whereas a 9 in cr4 / cr9 is recoded to 0.
df_cardio = df_cardio.replace({"cr4": 9, "cr9": 9}, 0)

#-----------------------------------------------------------------

#------------- Cumulative Illness Rating Scale-Geriatric (CIRS)  ------------
# OUTPUT dataframe: df_cirs
selected_variables = [
    "src_subject_id",
    "heart", "vsclr", "hema", "eyes", "ugi", "lgi", "renal",
    "genur", "mskl", "neuro", "psych", "respiratory", "liverd", "endod",
    "cirscnt", "cirstot",
]

# Week-12 rows only, restricted to the columns listed above.
df_cirs = (
    pd.read_excel(f"{dir_input}crs01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# -9 codes "not known" on these organ-system items: blank it out.
not_known_cols = ["eyes", "ugi", "lgi", "renal", "genur", "mskl", "neuro", "liverd"]
df_cirs = df_cirs.replace({col: -9 for col in not_known_cols}, np.nan)

#-----------------------------------------------------------------


#------------- Electrocardiogram (ECG)  --------------------------
# OUTPUT dataframe: df_ecg
selected_variables = ["src_subject_id", "ecghr", "ecgpr", "ecgqrs", "ecgqt", "ecgqtc", "crfint"]

# Week-12 rows only, restricted to the columns listed above.
df_ecg = (
    pd.read_excel(f"{dir_input}ecg01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# -9 in crfint is a missing code.
df_ecg = df_ecg.assign(crfint=df_ecg["crfint"].replace(-9, np.nan))
#-----------------------------------------------------------------


#------------- MacArthur Competence Assessment Tool - Clinical Research (MACCAT)  --------------------------
# OUTPUT dataframe: df_maccat
selected_variables = [
    "src_subject_id",
    "matu1a", "matu1b", "matu1c", "matu1d", "matu2",
    "matu3a", "matu3b", "matu3c",
    "matu4a", "matu4b", "matu4c", "matu4d", "matu5", "matu6",
    "mata1", "mata2", "mata3", "mata4",
    "matr1", "matr2", "matr3", "matr4", "matr5",
    "matc1", "maccattot",
]

# Week-12 rows only, restricted to the columns listed above.
df_maccat = (
    pd.read_excel(f"{dir_input}maccomp01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

#-----------------------------------------------------------------

#------------- Physical Examination (PE) -------------------------
# OUTPUT dataframe: df_pe
selected_variables = [
    "src_subject_id",
    "pex003a", "pex004a", "pex005a", "pex007a", "pex010a", "pex012a",
    "hdnkpe", "neurope", "pe_4", "phys9",
]

# Week-12 rows only, restricted to the columns listed above.
df_pe = (
    pd.read_excel(f"{dir_input}pe01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# Exam findings: -9 and 7 become missing, and codes 3-6 are all folded
# into 2 so that every abnormal finding shares a single "abnormal" code.
exam_recode = {-9: np.nan, 7: np.nan, 3: 2, 4: 2, 5: 2, 6: 2}
exam_columns = ["pex003a", "pex004a", "pex005a", "pex007a", "pex010a", "pex012a", "hdnkpe", "neurope"]
for exam_col in exam_columns:
    df_pe[exam_col] = df_pe[exam_col].replace(exam_recode)

# pe_4: both -99 and 0 are treated as "not known".
df_pe["pe_4"] = df_pe["pe_4"].replace([-99, 0], np.nan)
#----------------------------------------------------------------

#------------- Schedule for Affective Disorders and Schizophrenia - Delusional Scale -------------------------
# OUTPUT dataframe: df_ksads
selected_variables = ["src_subject_id", "sadsd1", "sadsd2"]

# Week-12 rows only, restricted to the columns listed above.
df_ksads = (
    pd.read_excel(f"{dir_input}ksads_diagnoses01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# On sadsd2 a 0 means "no sufficient information": treat it as missing.
df_ksads = df_ksads.assign(sadsd2=df_ksads["sadsd2"].replace(0, np.nan))

#-----------------------------------------------------------------

#------------- Simpson Angus Scale -------------------------------
# OUTPUT dataframe: df_sas
selected_variables = [
    "src_subject_id",
    "sas01", "sas02", "sas03", "sas04", "sas05", "sas06", "sas08", "sas09", "sas10",
    "sas_total",
]

# Week-12 rows only, restricted to the columns listed above.
df_sas = (
    pd.read_excel(f"{dir_input}sas01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# On the individual items a 9 means "not ratable": blank it out.
# The id (first) and sas_total (last) columns are left untouched.
df_sas = df_sas.replace({item: 9 for item in selected_variables[1:-1]}, np.nan)

#-----------------------------------------------------------------


#------------- Scale for the Assessment of Positive Symptoms -------------------------------
# OUTPUT dataframe: df_saps

# saps_h1..saps_h6 followed by saps_d1..saps_d12.
symptom_items = [f"saps_h{i}" for i in range(1, 7)] + [f"saps_d{i}" for i in range(1, 13)]

selected_variables = [
    "src_subject_id",
    *symptom_items,
    "saps19", "saps20", "saps21", "saps22", "saps23", "saps_total",
]

# Week-12 rows only, restricted to the columns listed above.
df_saps = (
    pd.read_excel(f"{dir_input}saps_sans01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# Recode -1 as 1 on the h/d items so that a present symptom reads as 1.
df_saps = df_saps.replace({item: -1 for item in symptom_items}, 1)

#-----------------------------------------------------------------

#------------- Structured Clinical Interview for DSM-IV -------------------------------
# OUTPUT dataframe: df_dsm

df_dsm = pd.read_excel(dir_input + 'scid01.xlsx', skiprows=[1])

# Keep only the week-12 visit and drop the visit column.
df_dsm = df_dsm[df_dsm['week'] == 12].drop(columns = 'week')

selected_variables = ["src_subject_id",
    "q001_bpi_life", "q006_bpi_mon", "q010_bpii_life", "q014_bpii_mon", "q018_obp_life", "q019_obp_current",
    "q021_mdd_life", "q023_mdd_season", "q025_mdd_mon", "q026_mdd_current", "q027_mdd_sev", "q028_dysd_current",
    "q031_depnos_life", "q032_depnos_mon", "q034_mdgmc_life", "q036_mdgmc_mon", "q038_simd_life", "q040_simd_mon",
    "q042_sz_life", "q043_sz_mon", "q045_szp_life", "q046_szp_mon", "q048_sza_life", "q049_sza_mon",
    "q051_dd_life", "q052_dd_mon", "q053_bfpd_life", "q054_bfpd_mon", "q055_pdgmc_life", "q057_pdgmc_mon",
    "q060_sipd_life", "q062_sipd_mon", "q065_pdnos_life", "q066_pdnos_mon", "q067_al_life", "q068_al_mon",
    "q069_sha_life", "q070_sha_mon", "q071_can_life", "q072_can_mon", "q073_stim_life", "q074_stim_mon",
    "q075_op_life", "q076_op_mon", "q077_coc_life", "q078_coc_mon", "q079_hal_life", "q080_hal_mon",
    "q081_poly_life", "q082_poly_mon", "q083_othsub_life", "q084_othsub_mon", "q085_panic_life", "q086_panic_agor",
    "q087_panic_mon", "q088_agor_life", "q089_agor_mon", "q090_social_life", "q091_social_mon", "q092_phobia_life",
    "q093_phobia_mon", "q094_ocd_life", "q095_ocd_mon", "q096_ptsd_life", "q097_ptsd_mon", "q098_gad_life",
    "q099_adgmc_life", "q101_adgmc_month", "q103_siad_life", "q105_siad_mon", "q107_adnos_life", "q108_adnos_mon",
    "q109_somat_life", "q110_pain_life", "q111_undiffsom_life", "q112_hypochon_life", "q113_bdd_life", 
    "q114_an_life", "q115_an_mon", "q116_bn_life", "q117_bn_mon", "q118_bed_life", "q119_bed_mon", "q120_adjd_life",
    "q145_gaf", "cfmh_oth_suicideatt", "scids45", "scids35al", "scids35ap", "p2i4", "scids4d1", "scids4d3", 
    "scids4e", "dage", "scids4f"
]

df_dsm = df_dsm[selected_variables].copy()

# Columns that get the generic missing-code cleanup: every selected column
# except the subject id and the four columns that are recoded separately below.
# Deriving the list keeps it in sync with selected_variables.
separately_recoded = {"q023_mdd_season", "q026_mdd_current", "q145_gaf", "scids4f"}
temp_vars = [col for col in selected_variables[1:] if col not in separately_recoded]

# 0 = Inadequate Information; 1 = Absent; 2 = Sub-Threshold; 3 = Threshold/Present;
# -8 = N/A; 999 = Missing; -9 = Missing. -7 is blanked as well.
missing_codes = {0: np.nan, -8: np.nan, 999: np.nan, -9: np.nan, -7: np.nan}
for var in temp_vars:
    df_dsm[var] = df_dsm[var].replace(missing_codes)

# One-hot encode the two categorical MDD descriptors.
# q023_mdd_season: 0 = Without Seasonal Pattern; 1 = With Seasonal Pattern; 2 = not recurrent.
# q026_mdd_current: 0 = Neither Melancholic, Atypical, nor Catatonic; 1 = Melancholic;
#                   2 = Atypical; 3 = Catatonic; 4 = Anxious Distress; 5 = Mixed Features.
# NOTE(review): these two columns are not cleaned for missing codes first, so any
# such code present in the data gets its own indicator column - confirm intended.
for var in ["q023_mdd_season", "q026_mdd_current"]:
    dummy_vars = pd.get_dummies(df_dsm[var], prefix=var)
    df_dsm = pd.concat([df_dsm, dummy_vars], axis=1)
    df_dsm.drop(columns=[var], inplace=True)

# Number of MDD episodes: 1 = 1; 2 = 2-3; 3 = >3; 4 = >1, unknown if >3.
# Collapsed to 1 = one episode, 2 = more than one.
df_dsm["scids4f"] = df_dsm["scids4f"].replace({3: 2, 4: 2})

# GAF ranges 0-100 and 999 marks a missing value. The previous code replaced
# 900 only, which left 999 in the column as if it were a score. 900 is still
# blanked for backward compatibility (it is outside the valid range as well).
df_dsm["q145_gaf"] = df_dsm["q145_gaf"].replace({999: np.nan, 900: np.nan})

#-----------------------------------------------------------------

#------------- Short Form (36) Health Survey -------------------------------
# OUTPUT dataframe: df_sf36
selected_variables = ["src_subject_id",
    "sf3601", "sf3602", "sf3603a", "sf3603b", "sf3603c", "sf3603d", "sf3603e", "sf3603f", "sf3603g", "sf3603h",
    "sf3603i", "sf3603j", "sf3604a", "sf3604b", "sf3604c", "sf3604d", "sf3605a", "sf3605b", "sf3605c",
    "sf3609a", "sf3609b", "sf3609c", "sf3609d", "sf3609e", "sf3609f", "sf3609g", "sf3609h", "sf3609i", "sf3610",
    "sf3611a", "sf3611b", "sf3611c", "sf3611d", "sf36_pf", "sf36_rp", "sf36_re", "sf36_p", "sf36_sf", "sf36_mh",
    "sf36_ghp", "sf3607b", "sf3608b", "socialact", "mosvti", "mospcs", "mosmcs",
]

# Week-12 rows only, restricted to the columns listed above.
df_sf36 = (
    pd.read_excel(f"{dir_input}sf36v201.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# Item-level answers (1 = Excellent; 2 = Very good; 3 = Good; 4 = Fair; 5 = Poor).
# Only these columns are cleaned; the remaining selected columns are left as read.
temp_vars = [
    "sf3601", "sf3602", "sf3603a", "sf3603b", "sf3603c", "sf3603d", "sf3603e", "sf3603f", "sf3603g", "sf3603h",
    "sf3603i", "sf3603j", "sf3604a", "sf3604b", "sf3604c", "sf3604d", "sf3605a", "sf3605b", "sf3605c",
    "sf3609a", "sf3609b", "sf3609c", "sf3609d", "sf3609e", "sf3609f", "sf3609g", "sf3609h", "sf3609i", "sf3610",
    "sf3611a", "sf3611b", "sf3611c", "sf3611d", "sf3607b", "sf3608b",
]

# -9 = Missing value; -97 = Missing Online; -98 = Purposely Skipped Online.
sf36_missing_codes = [-9, -97, -98]
df_sf36 = df_sf36.assign(
    **{item: df_sf36[item].replace(sf36_missing_codes, np.nan) for item in temp_vars}
)

#-----------------------------------------------------------------

#------------- Scale for Suicide Ideation – Worst ----------------
# OUTPUT dataframe: df_ssi

df_ssi = pd.read_excel(dir_input + 'ssi01.xlsx', skiprows=[1])

# Keep only the week-12 visit and drop the visit column.
df_ssi = df_ssi[df_ssi['week'] == 12].drop(columns = 'week')

# The 19 individual items, ssi1 .. ssi19.
ssi_items = [f"ssi{i}" for i in range(1, 20)]

# Final column layout: subject id, the items, then the derived total.
selected_variables = ["src_subject_id", *ssi_items, "ssitot"]

df_ssi = df_ssi[["src_subject_id", *ssi_items]].copy()

# -9 marks an incomplete answer. It has to be blanked BEFORE the total is
# computed; the previous order summed the raw -9 codes into ssitot.
for var in ssi_items:
    df_ssi[var] = df_ssi[var].replace(-9, np.nan)

# Total score = sum of the available items only. The previous code also
# included the ssitot column itself in the sum. min_count=1 makes the total
# NaN (instead of 0) when every item of a subject is missing.
df_ssi["ssitot"] = df_ssi[ssi_items].sum(axis=1, min_count=1)

df_ssi = df_ssi[selected_variables].copy()

#-----------------------------------------------------------------

#------------- Stroop Test ----------------
# OUTPUT dataframe: df_stroop
stroop_tasks = ['stroop1', 'stroop2', 'stroop3']
selected_variables = ["src_subject_id", *stroop_tasks]

# Week-12 rows only, restricted to the id and the three task scores.
df_stroop = (
    pd.read_excel(f"{dir_input}stroop01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# Derived scores.
# SCORING THE STROOP TEST, ARTHUR R. JENSEN, University of California, Berkeley, Calif., USA
t1, t2, t3 = (df_stroop[task] for task in stroop_tasks)
task_scores = df_stroop[stroop_tasks]

df_stroop = df_stroop.assign(
    # Color-naming factor (D)
    stroop_color_naming_score=t2 / (t2 + t1),
    # Interference score (E)
    stroop_interference_score=t3 - t2,
    # Cognitive control index
    stroop_cognitive_control_index=t3 - t1,
    # Facilitation score (H)
    stroop_facilitation_score=t2 - t1,
    # Relative interference score (K)
    stroop_relative_interference_score=(t3 - t2) / t1,
    # Reaction time variability: row-wise std divided by row-wise mean of the three tasks
    stroop_reaction_time_variability=task_scores.std(axis=1) / task_scores.mean(axis=1),
)

# Further Jensen scores, currently not computed:
# (F)
#df_stroop['stroop_F'] = df_stroop['stroop2']/df_stroop['stroop1']
# (G)
#df_stroop['stroop_G'] = df_stroop['stroop2']/df_stroop['stroop3']
# (I)
#df_stroop['stroop_I'] = (df_stroop['stroop2'] - df_stroop['stroop1'])/(df_stroop['stroop2'] + df_stroop['stroop1'])
# (J)
#df_stroop['stroop_J'] = df_stroop['stroop1']/df_stroop['stroop2']

#-----------------------------------------------------------------


#------------- UKU Side Effect Rating Scale  ----------------
# OUTPUT dataframe: df_uku

# Items uku1 .. uku47 followed by uku48a and uku48b.
uku_items = [f"uku{i}" for i in range(1, 48)] + ["uku48a", "uku48b"]
selected_variables = ["src_subject_id", *uku_items]

# Week-12 rows only, restricted to the id and the items.
df_uku = (
    pd.read_excel(f"{dir_input}uku01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# A 9 codes "NA" on every item: blank it out.
df_uku = df_uku.replace({item: 9 for item in uku_items}, np.nan)

#-----------------------------------------------------------------

#------------- Vitals  ----------------
# OUTPUT dataframe: df_vitals

df_vitals = pd.read_excel(dir_input + 'vitals01.xlsx', skiprows=[1])

# Keep only the week-12 visit and drop the visit column.
df_vitals = df_vitals[df_vitals['week'] == 12].drop(columns = 'week')

# Final column layout; vt_bmi is derived below, all others come from the sheet.
selected_variables = ["src_subject_id", "vital_sysbp", "vital_diabp", "vital_pulse", "vtl004a", "vtl004b", "vtl004c", "blwt", "vtl007", "premorbidweight", "vt_bmi"]
measured_columns = selected_variables[1:-1]

df_vitals = df_vitals[["src_subject_id", *measured_columns]].copy()

# Blank the missing codes FIRST. The previous order computed the BMI from the
# raw columns, so a coded weight or height (999, -9, -5, -2) produced a
# meaningless BMI that the later cleanup could not recognise.
missing_codes = {999: np.nan, -9: np.nan, -5: np.nan, -2: np.nan}
for var in measured_columns:
    df_vitals[var] = df_vitals[var].replace(missing_codes)

# BMI = weight / height^2 * 703.
# NOTE(review): 703 is the imperial-unit factor, so this assumes blwt is in
# pounds and vtl007 in inches - confirm against the data dictionary.
# A non-positive height is treated as missing to avoid inf / negative BMI.
height = df_vitals["vtl007"].where(df_vitals["vtl007"] > 0)
df_vitals["vt_bmi"] = df_vitals["blwt"] / (height * height) * 703

df_vitals = df_vitals[selected_variables].copy()

#-----------------------------------------------------------------


#------------- Blood tests  ----------------
# OUTPUT dataframe: df_blood
selected_variables = ["src_subject_id", "rsptc_no", "rsphdl_no", "rspldl_no", "rsptrig_no", "glorres"]

# Week-12 rows only, restricted to the columns listed above.
df_blood = (
    pd.read_excel(f"{dir_input}clinlabtests01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# Derived predictor: rsphdl_no divided by rsptrig_no (HDL / triglycerides).
df_blood = df_blood.assign(hdl_trg_ratio=df_blood["rsphdl_no"] / df_blood["rsptrig_no"])

#-----------------------------------------------------------------

#------------- Antidepressant Treatment History Form  -----------
# OUTPUT dataframe: df_athq

# Only the "a" columns athf1a .. athf10a plus onsetepi are kept. The "b" and
# "c" columns are confidence ratings of the evidence for the medication
# history and are not needed for prediction.
selected_variables = ["src_subject_id", *[f"athf{i}a" for i in range(1, 11)], "onsetepi"]

# This sheet is not filtered by week; instead the first row per subject is kept.
df_athq = pd.read_excel(f"{dir_input}athq01.xlsx", skiprows=[1])
first_row_per_subject = df_athq.drop_duplicates(subset=['src_subject_id'])
df_athq = first_row_per_subject[selected_variables].copy()
#-----------------------------------------------------------------

#------------- Cornell Services Index  -----------
# OUTPUT dataframe: df_suq
categorical_columns = ["servicetype", "providertype", "servicesite", "servicereason"]
selected_variables = ["src_subject_id", "sumhc38", "avghc39", *categorical_columns]

# Week-12 rows only, restricted to the columns listed above.
df_suq = (
    pd.read_excel(f"{dir_input}suq01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# Per subject, add up sumhc38 and avghc39 over all of that subject's rows.
# (The output columns are named "overall_mean_*" but hold sums.)
aggregated = (
    df_suq.groupby("src_subject_id")[["sumhc38", "avghc39"]]
    .sum()
    .rename(columns={"sumhc38": "overall_mean_visits", "avghc39": "overall_mean_time"})
    .reset_index()
)

# One-hot encode service type, provider type, service site and service reason
# (indicator columns are named "<column>_<value>").
dummies = pd.get_dummies(
    df_suq.drop(columns=["sumhc38", "avghc39"]),
    columns=categorical_columns,
)

# A subject can have several rows, so take the per-subject maximum of each
# indicator: it is set if the category occurs on any of the subject's rows.
dummies_aggregated = dummies.groupby("src_subject_id").max().reset_index()

df_suq = aggregated.merge(dummies_aggregated, on="src_subject_id")

#-----------------------------------------------------------------

#------------- Demographics  -----------
# OUTPUT dataframe: df_demo
selected_variables = [
    "src_subject_id", "sex", "ethnicity", "race", "demo_resp_status",
    "interview_age", "educat", "tx_code", "eo4", "live", "entry_status",
]

# Week-12 rows only, restricted to the columns listed above.
df_demo = (
    pd.read_excel(f"{dir_input}pqdem01.xlsx", skiprows=[1])
    .loc[lambda frame: frame["week"] == 12, selected_variables]
    .copy()
)

# A 9 means missing / not applicable on these two columns.
df_demo = df_demo.replace({"demo_resp_status": 9, "eo4": 9}, np.nan)

# One-hot encode the categorical columns. The source columns are removed and
# the indicator columns ("<column>_<value>") are appended in the listed order.
df_demo = pd.get_dummies(
    df_demo,
    columns=["ethnicity", "race", "demo_resp_status", "live", "entry_status"],
)

# Sex as a number: F -> 1, M -> 2.
df_demo["sex"] = df_demo["sex"].replace({"F":1, "M":2})

# Treatment arms as binary flags derived from tx_code; tx_code itself is
# removed from the frame once the flags exist.
tx_code = df_demo.pop("tx_code")
df_demo["treatment_olanzapine"] = 1  # every tx_code value includes Olanzapine
df_demo["treatment_sertraline"] = tx_code.isin([1, 3]).astype("int64")
df_demo["treatment_placebo"] = tx_code.isin([0, 2]).astype("int64")
df_demo["treatment_lithium"] = tx_code.isin([2, 3]).astype("int64")
#-----------------------------------------------------------------

#------------- Protocol violators --------------------------------
# OUTPUT dataframe: df_viol
# Not part of the predictor battery. This sheet is not filtered by week.
selected_variables = ["src_subject_id", "violate", "violationtype", "violationdays"]

df_viol = pd.read_excel(f"{dir_input}viol01.xlsx", skiprows=[1])
df_viol = df_viol.loc[:, selected_variables].copy()
#-----------------------------------------------------------------



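# Stray notebook output, not part of the script (it appears to be the source
# line echoed by a pandas warning raised for the sex recoding above):
#   df_demo["sex"] = df_demo["sex"].replace({"F":1, "M":2})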


# In [3]:
################################################################################
#Storing Data and Merging
################################################################################
# Sergio Mena Ortega, 2025

# Write every individual dataset to its own Excel file.
## IMPORTANT: violators not removed, bear in mind they will have to be removed if using these.
import os

# to_excel does not create missing folders, so make sure the target exists;
# otherwise the first write fails after all the extraction work is done.
individual_dir = dir_output + 'Individual Datasets/'
os.makedirs(individual_dir, exist_ok=True)

# Output file stem -> dataframe, in the order the files are written.
individual_datasets = {
    'covariates': df_covars,
    'violators': df_viol,
    'cgi': df_cgi,
    'move': df_move,
    'aka': df_aka,
    'bprs': df_bprs,
    'das': df_das,
    'mdrs': df_mdrs,
    'hrsd': df_hrsd,
    'mmse': df_mmse,
    'cardio': df_cardio,
    'cirs': df_cirs,
    'ecg': df_ecg,
    'maccat': df_maccat,
    'pe': df_pe,
    'ksads': df_ksads,
    'sas': df_sas,
    'saps': df_saps,
    'dsm': df_dsm,
    'sf36': df_sf36,
    'ssi': df_ssi,
    'stroop': df_stroop,
    'uku': df_uku,
    'vitals': df_vitals,
    'blood': df_blood,
    'athq': df_athq,
    'suq': df_suq,
    'demo': df_demo,
}

# index=False: the row index is not written (same effect as the former index=None).
for file_stem, frame in individual_datasets.items():
    frame.to_excel(individual_dir + file_stem + '.xlsx', index=False)



# In [4]:
##### VARIABLE PRUNING to AVOID CURSE OF DIMENSIONALITY ###############
#df_move = df_move[["src_subject_id" ,"aimsmed_ttl"]].copy()
#df_aka = df_aka[["src_subject_id" ,"bnsa4"]].copy()
#df_dsm = df_dsm[["src_subject_id"]].copy()
#df_bprs = df_bprs.copy()
#df_das = df_das[["src_subject_id", "das_impact", "das_disorg", "das_convict", "das_bizzare", "das_extension"]].copy()
#df_mdrs = df_mdrs[["src_subject_id", "drs2"]].copy()
#df_hrsd = df_hrsd.copy()
#df_mmse = df_mmse[["src_subject_id", "mmse_ts"]].copy()
#df_ksads = df_ksads.copy()
#df_sas = df_sas[["src_subject_id", "sas_total"]].copy()
#df_saps = df_saps.copy()
#df_ssi = df_ssi[["src_subject_id", "ssitot"]].copy()
#df_uku = df_uku.copy()
#df_athq = df_athq.copy()
#df_suq = df_suq[["src_subject_id", "overall_mean_visits", "overall_mean_time"]].copy()
#df_demo = df_demo[["src_subject_id", "sex",	"interview_age", "educat", "eo4", "treatment_sertraline", "treatment_placebo"]].copy()
#df_sui = df_sui.copy()
#df_adverse = df_adverse[["src_subject_id", "AE_BEFORE"]].copy()
#df_concommed = df_concommed[["src_subject_id", "CONCOMMED_ANTIDEPRESSANTS", "CONCOMMED_ANTIPSYCHOTICS"]].copy()
#df_dosage = df_dosage.copy()

#df_vitals = df_vitals[["src_subject_id", "vt_bmi"]].copy()
#df_blood = df_blood.copy()
#df_ecg = df_ecg[["src_subject_id"]].copy()

#df_sf36 = df_sf36.copy()
#df_cardio = df_cardio.copy()
#df_cirs = df_cirs.copy()
#df_pe = df_pe.copy()

# Merge the individual datasets into 4 categories used for model development:
# clinical/sociodemographic, biological, neurocognitive, quality of life/general health.
import os

clin_dataframes = [df_cgi, df_move, df_aka, df_dsm, df_bprs, df_das, df_mdrs, df_hrsd, df_mmse, df_ksads, df_sas, df_saps, df_ssi, df_uku,
                      df_athq, df_suq, df_demo]
bio_dataframes = [df_vitals, df_blood, df_ecg]
qol_dataframes = [df_sf36, df_cardio, df_cirs, df_pe]
neurocog_dataframes = [df_stroop]


######################################################################

# to_excel does not create missing folders, so make sure the target exists.
merged_dir = dir_output + 'Merged Datasets/'
os.makedirs(merged_dir, exist_ok=True)


def _merge_on_subject(frames):
    """Left-join each frame, in order, onto the subject ids of df_demo.

    The result has one row per row of df_demo (more if a joined frame holds
    several rows for the same src_subject_id); subjects missing from a frame
    get NaN in that frame's columns.
    """
    merged = df_demo[["src_subject_id"]].copy()
    for frame in frames:
        merged = pd.merge(merged, frame, on='src_subject_id', how='left')
    return merged


df_clin = _merge_on_subject(clin_dataframes)
df_clin.to_excel(merged_dir + 'clin.xlsx', index = False)

df_bio = _merge_on_subject(bio_dataframes)
df_bio.to_excel(merged_dir + 'bio.xlsx', index = False)

df_qol = _merge_on_subject(qol_dataframes)
df_qol.to_excel(merged_dir + 'qol.xlsx', index = False)

df_neurocog = _merge_on_subject(neurocog_dataframes)
df_neurocog.to_excel(merged_dir + 'neurocog.xlsx', index = False)


# All four category tables side by side, again keyed on src_subject_id.
df_all = _merge_on_subject([df_clin, df_bio, df_qol, df_neurocog])
df_all.to_excel(merged_dir + 'all_data.xlsx', index=False)

# The covariates are stored next to the merged tables (not merged into them).
df_covars.to_excel(merged_dir + 'covariates.xlsx', index=False)