In [4]:
import pandas as pd
import numpy as np

# Display settings: a value of None lifts pandas' limit, so frames shown in
# this notebook are rendered with all of their rows and columns.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# Input folders (relative paths): baseline predictor tables and the
# prediction-set tables that hold the outcome labels.
dir_input_baseline = 'Baseline/Merged Datasets/'
dir_input_pred = 'Prediction set/Merged Datasets/'

# Folder that receives every file written by this notebook.
dir_output = 'ML Analysis/'

In [5]:
### Load the baseline workbooks (one DataFrame per workbook).

def _read_baseline(file_name):
    """Read a single Excel workbook from the baseline input folder."""
    return pd.read_excel(dir_input_baseline + file_name)


# Predictor tables, one per data domain.
df_clin = _read_baseline('clin.xlsx')
df_bio = _read_baseline('bio.xlsx')
df_qol = _read_baseline('qol.xlsx')
df_neurocog = _read_baseline('neurocog.xlsx')

# Covariates are stored in their own workbook in the same folder.
df_covariates = _read_baseline('covariates.xlsx')


# Columns read from the label workbook: the subject ID (used as merge key)
# followed by every outcome column, listed here grouped by name suffix.
analysis_labels = [
    "src_subject_id",
    # *_0W columns
    "DEP_STOPPD_REMISSION_0W",
    "DEP_HAMD_REMISSION_0W",
    "DEP_TOTAL_0W",
    # *_4W columns
    "DEP_STOPPD_REMISSION_4W",
    "DEP_HAMD_REMISSION_4W",
    "DEP_TOTAL_4W",
    # *_12W columns
    "DEP_STOPPD_REMISSION_12W",
    "DEP_HAMD_REMISSION_12W",
    "DEP_TOTAL_12W",
    # DIFF / DIFF_CLASS columns
    "DEP_DIFF_4W",
    "DEP_DIFF_CLASS_4W",
    "DEP_DIFF_12W",
    "DEP_DIFF_CLASS_12W",
]

# The single outcome column that is attached to each exported data table.
label_to_use = "DEP_DIFF_12W"

# Read the label workbook, restricted to the subject ID and the outcome
# columns named in analysis_labels.
df_labels = pd.read_excel(dir_input_pred + 'labels.xlsx', usecols=analysis_labels)

# Left-join every other table onto the clinical table using the
# 'src_subject_id' column, so all rows of df_clin are retained.
df_merged = df_clin
for _right_table in (df_bio, df_qol, df_neurocog, df_covariates, df_labels):
    df_merged = df_merged.merge(_right_table, on='src_subject_id', how='left')

# Keep only rows in which every column listed in analysis_labels is present.
# Note this covers the subject ID and all outcome columns, not just label_to_use.
df_merged = df_merged.dropna(subset=analysis_labels)

# Missing-data filter: columns first, then rows, both with the same threshold.
threshold_percentage = 20
max_missing_fraction = threshold_percentage / 100

# Drop columns whose fraction of missing values exceeds the threshold.
column_missing_fraction = df_merged.isnull().mean()
sparse_columns = df_merged.columns[column_missing_fraction > max_missing_fraction]
df_merged = df_merged.drop(columns=sparse_columns)

# Drop rows whose fraction of missing values exceeds the threshold; the
# fraction is computed over the columns that survived the previous step.
row_missing_fraction = df_merged.isnull().mean(axis=1)
sparse_rows = df_merged[row_missing_fraction > max_missing_fraction].index
df_merged = df_merged.drop(index=sparse_rows)

# Carve the merged, filtered table back into one frame per data domain.
# Each domain frame keeps the columns of its previously loaded frame that are
# still present in df_merged, plus the label_to_use column.
_is_label_column = df_merged.columns.isin([label_to_use])


def _domain_mask(source_frame):
    """Mask over df_merged.columns: names found in source_frame, or the label column."""
    return df_merged.columns.isin(source_frame.columns) | _is_label_column


df_clin = df_merged[df_merged.columns[_domain_mask(df_clin)]]
df_qol = df_merged[df_merged.columns[_domain_mask(df_qol)]]
df_neurocog = df_merged[df_merged.columns[_domain_mask(df_neurocog)]]
df_bio = df_merged[df_merged.columns[_domain_mask(df_bio)]]

# All domains together: the union of the four domain frames' columns and the
# label column, kept in df_merged's column order.
_in_any_domain = _is_label_column.copy()
for _domain_frame in (df_clin, df_qol, df_neurocog, df_bio):
    _in_any_domain = _in_any_domain | df_merged.columns.isin(_domain_frame.columns)
df_all_data = df_merged[df_merged.columns[_in_any_domain]]

# Covariates are restricted the same way, but the label column is not added.
df_covariates = df_merged[df_merged.columns[df_merged.columns.isin(df_covariates.columns)]]

# Pick readable class names for 0/1-coded outcome columns. Any label whose
# name contains neither "REMISSION" nor "DIFF_CLASS" is left unmapped.
if "REMISSION" in label_to_use:
    label_mapping = {0: 'NoRemission', 1: 'Remission'}
elif "DIFF_CLASS" in label_to_use:
    label_mapping = {0: 'Deteriorator', 1: 'Improver'}
else:
    label_mapping = None

# Replace the numeric codes with the class names in every exported data table.
if label_mapping:

    def _with_named_label(frame):
        """Return a new frame whose label_to_use column holds the mapped names.

        Values that are not keys of label_mapping become NaN (Series.map).
        """
        # The tables are column subsets of df_merged, so writing strings into
        # them with `.loc[:, col] = ...` raises SettingWithCopyWarning and,
        # on pandas >= 2.1, an incompatible-dtype FutureWarning. `assign`
        # builds a fresh frame with the column replaced (same position).
        return frame.assign(**{label_to_use: frame[label_to_use].map(label_mapping)})

    df_clin = _with_named_label(df_clin)
    df_qol = _with_named_label(df_qol)
    df_neurocog = _with_named_label(df_neurocog)
    df_bio = _with_named_label(df_bio)
    df_all_data = _with_named_label(df_all_data)

# Write each table to its own workbook in the output folder, without the
# DataFrame index.
for _file_name, _frame in (
    ('data_clin.xlsx', df_clin),
    ('data_qol.xlsx', df_qol),
    ('data_neurocog.xlsx', df_neurocog),
    ('data_bio.xlsx', df_bio),
    ('all_data.xlsx', df_all_data),
    ('covariates.xlsx', df_covariates),
):
    _frame.to_excel(dir_output + _file_name, index=False)

################ EXPORT ALL ANALYSIS LABELS ########################
# Collect every analysis_labels column that is present in df_merged (the
# subject ID and all outcome variants) for the rows that passed the filters.
# The values are exported exactly as stored in df_merged.
# NOTE(review): an earlier comment here said Neurominer assigns 1 to the first
# label in the dataset and that numeric labels could be generated the same way;
# no such recoding is performed in this cell - confirm whether it is still needed.
_label_columns = df_merged.columns[df_merged.columns.isin(analysis_labels)]
df_labels = df_merged[_label_columns]

# Save the analysis labels.
df_labels.to_excel(dir_output + 'labels.xlsx', index=False)