In [1]:
import os

import numpy as np
import pandas as pd

# Notebook display limits: show at most 20 rows, but never truncate columns.
pd.options.display.max_rows = 20
pd.options.display.max_columns = None

# Input folder holding the original (raw) data files.
dir_input = 'Raw data/'

# Output folder where the generated prediction-set files are written.
dir_output = 'Prediction set/'

In [2]:
################################################################################
#Data Extraction
################################################################################
# Sergio Mena Ortega, 2025

# Visits (in weeks) to extract: baseline (week 0) plus the 4- and 12-week follow-ups.
visit_columns = [0, 4, 12]

# Upper bounds (inclusive) on the HDRS total ("hamd_36") used as remission cut-offs.
STOPPD_REMISSION_MAX = 10   # remission as defined in STOP-PD
HAMD_REMISSION_MAX = 7      # stricter HAMD remission cut-off


def _label_or_nan(condition, source):
    """Return the boolean Series `condition` as 1.0/0.0, NaN where `source` is missing.

    Any comparison against NaN evaluates to False, so casting the comparison
    directly would silently turn a missing score into a 0 label. Masking with
    `source.notna()` keeps missing values missing.
    """
    return condition.astype(float).where(source.notna())


#------------- Hamilton Depression Rating Scale (HDRS)  ------------
# OUTPUT dataframe: df_hrsd (one row per baseline subject, one column set per visit)

# Read the workbook once and reuse it for every visit. skiprows=[1] drops the
# row directly below the header (presumably a column-description row -- confirm).
df_hrsd_raw = pd.read_excel(dir_input + 'hrsd01.xlsx', skiprows=[1])

# Start from the subjects that have a baseline (week 0) record; keep only the ID.
df_hrsd = df_hrsd_raw.loc[df_hrsd_raw['week'] == 0, ['src_subject_id']]

# NOTE(review): the left merges below assume at most one row per subject per
# visit; duplicated rows would multiply the subject's rows -- confirm.
for visit in visit_columns:
    suffix = str(visit) + "W"
    df_hrsd_v = df_hrsd_raw[df_hrsd_raw['week'] == visit].drop(columns='week')
    hdrs_total = df_hrsd_v["hamd_36"]

    # Remission of depression as defined in STOP-PD. Here we only consider
    # depressive symptoms and not the positive symptoms of psychotic depression.
    # A missing total score yields a missing label rather than "not remitted".
    df_hrsd_v["DEP_STOPPD_REMISSION_" + suffix] = _label_or_nan(
        hdrs_total <= STOPPD_REMISSION_MAX, hdrs_total)
    df_hrsd_v["DEP_HAMD_REMISSION_" + suffix] = _label_or_nan(
        hdrs_total <= HAMD_REMISSION_MAX, hdrs_total)

    # Absolute HDRS score.
    df_hrsd_v["DEP_TOTAL_" + suffix] = hdrs_total

    selected_variables = [
        "src_subject_id",
        "DEP_STOPPD_REMISSION_" + suffix,
        "DEP_HAMD_REMISSION_" + suffix,
        "DEP_TOTAL_" + suffix,
    ]

    # Left merge: subjects without a record at this visit get NaN in its columns.
    df_hrsd = df_hrsd.merge(df_hrsd_v[selected_variables], on="src_subject_id", how="left")

# Calculate derived variables for the follow-up visits (baseline is the reference).
for visit in visit_columns[1:]:
    suffix = str(visit) + "W"

    # Difference in HDRS score from baseline (negative = lower score than baseline).
    hdrs_diff = df_hrsd["DEP_TOTAL_" + suffix] - df_hrsd["DEP_TOTAL_0W"]
    df_hrsd["DEP_DIFF_" + suffix] = hdrs_diff

    # Improvers (1: score decreased) vs. non-improvers (0: unchanged or increased).
    # Stays NaN when the difference is missing; note that `x == np.nan` is always
    # False, so missing values must be detected with notna()/isna().
    df_hrsd["DEP_DIFF_CLASS_" + suffix] = _label_or_nan(hdrs_diff < 0, hdrs_diff)

#-----------------------------------------------------------------


In [3]:
################################################################################
#Storing Data and Merging
################################################################################
# Sergio Mena Ortega, 2024

# Output sub-folders. Create them up front so a fresh checkout (or a new
# dir_output) does not fail when the Excel files are written.
dir_individual = dir_output + 'Individual Datasets/'
dir_merged = dir_output + 'Merged Datasets/'
for folder in (dir_individual, dir_merged):
    os.makedirs(folder, exist_ok=True)

# Write the individual dataset to Excel (without the DataFrame index).
df_hrsd.to_excel(dir_individual + 'hdrs_pred.xlsx', index=False)

# Every dataframe listed here is merged into the label table below.
list_of_dataframes = [df_hrsd]

# Merge all labels into one dataframe, starting from the subject IDs that have
# a week-0 row in pqdem01.xlsx. skiprows=[1] drops the row directly below the
# header (presumably a column-description row -- confirm).
df_labels = pd.read_excel(dir_input + 'pqdem01.xlsx', skiprows=[1]).filter(regex="^(src_subject_id$|week$)")
df_labels = df_labels[df_labels['week'] == 0].drop(columns='week')

# Left merges keep every subject from pqdem01 even when a dataset has no row for it.
for data in list_of_dataframes:
    df_labels = pd.merge(df_labels, data, on='src_subject_id', how='left')

# Save the merged labels to Excel (without the DataFrame index).
df_labels.to_excel(dir_merged + 'labels.xlsx', index=False)