In [1]:
import os

import numpy as np
import pandas as pd

# Notebook display: show at most 5 rows of a frame, but never truncate columns.
pd.set_option('display.max_rows', 5, 'display.max_columns', None)

# Input folders (relative paths): prediction-set tables and baseline tables.
dir_input_pred = 'Prediction set/Merged Datasets/'
dir_input_baseline = 'Baseline/Merged Datasets/'

# Folder that receives every file written by this notebook.
dir_output = 'ML Analysis/'

In [2]:
### Load the baseline tables.

# One workbook per data domain, read from dir_input_baseline in this order:
# clinical/sociodemographic, lab (health), cognitive, quality of life, covariates.
(df_clin_and_socio,
 df_health,
 df_cognitive,
 df_quality_of_life,
 df_covariates) = (
    pd.read_excel(dir_input_baseline + workbook_name)
    for workbook_name in ('clinical_and_sociodemographic.xlsx',
                          'health.xlsx',
                          'cognitive.xlsx',
                          'quality_of_life.xlsx',
                          'covariates.xlsx')
)

# Columns read from the efficacy-labels workbook: the subject identifier
# followed by every outcome label column.
# (An earlier, shorter definition of this list was immediately overwritten by
# this one and has been removed as dead code.)
analysis_labels = ['src_subject_id', 'M06CD_ABSDIFF', 'M06CD_DIFF_CLASS', 'M06CD_PSD_CLASS',
                   'M12CD_ABSDIFF', 'M12CD_DIFF_CLASS', 'M12CD_PSD_CLASS', 'MAllCD_PSD_CLASS', 'MJust6CD_PSD_CLASS',
                   'M06PANSS_NEG_ABSDIFF', 'M06PANSS_NEG_DIFF_CLASS', 'M06PANSS_MARDER_DIFF_CLASS', 'M12PANSS_NEG_ABSDIFF', 'M12PANSS_NEG_DIFF_CLASS',
                   'M12PANSS_MARDER_DIFF_CLASS', 'MJust6PANSS_NEG_ANDREASEN_CLASS', 'MJust6PANSS_MARDER_REMISSION_ANDREASEN_CLASS']

# Label column that is copied into the per-domain output files.
label_to_use = 'MJust6PANSS_MARDER_REMISSION_ANDREASEN_CLASS'

# Fail fast on a typo: later steps select columns by this name, and it is only
# loaded from the workbook if it appears in analysis_labels.
if label_to_use not in analysis_labels:
    raise ValueError(f"label_to_use {label_to_use!r} is not listed in analysis_labels")

# Read only the analysis_labels columns from the efficacy-labels workbook.
df_labels = pd.read_excel(dir_input_pred + 'efficacy_labels.xlsx', usecols=analysis_labels)

# Left-join every table onto the clinical/sociodemographic frame, keyed on
# 'src_subject_id'; the labels are joined last.
df_merged = (
    df_clin_and_socio
    .merge(df_health, on='src_subject_id', how='left')
    .merge(df_cognitive, on='src_subject_id', how='left')
    .merge(df_quality_of_life, on='src_subject_id', how='left')
    .merge(df_covariates, on='src_subject_id', how='left')
    .merge(df_labels, on='src_subject_id', how='left')
)

# Keep only rows that have a value in every analysis_labels column
# (the subject id and all outcome labels).
# NOTE(review): this requires *all* labels to be present, not only
# label_to_use -- confirm that is the intended cohort.
df_merged = df_merged.dropna(subset=analysis_labels)

# Maximum tolerated share of missing values, in percent.
threshold_percentage = 20
max_missing_share = threshold_percentage / 100

# Drop every column whose share of missing values exceeds the limit.
column_missing_share = df_merged.isna().mean()
df_merged = df_merged.drop(columns=column_missing_share[column_missing_share > max_missing_share].index)

# Then, measured on the remaining columns, drop every row exceeding the same limit.
row_missing_share = df_merged.isna().mean(axis=1)
df_merged = df_merged.drop(index=row_missing_share[row_missing_share > max_missing_share].index)

def _extract_domain_frame(merged, domain_columns, label=None, label_names=None):
    """Return an independent copy of one domain's columns taken from ``merged``.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame holding the columns of every domain plus the label columns.
    domain_columns : iterable of str
        Column names of the original domain table. Only those still present
        in ``merged`` are kept, in ``merged``'s column order.
    label : str, optional
        Label column to keep alongside the domain columns.
    label_names : dict, optional
        Mapping applied to the values of ``label``; values that are not keys
        of the mapping become NaN (``Series.map`` semantics).

    Returns
    -------
    pandas.DataFrame
        A copy, so modifying it never touches ``merged``.
    """
    wanted = merged.columns.isin(domain_columns)
    if label is not None:
        wanted = wanted | merged.columns.isin([label])
    # Explicit copy: assigning into a bare slice of `merged` is chained
    # assignment and raises SettingWithCopyWarning.
    frame = merged[merged.columns[wanted]].copy()
    if label is not None and label_names is not None:
        # Replace the whole column instead of writing through .loc[:, label],
        # which would try to store text in the existing numeric column in place.
        frame[label] = frame[label].map(label_names)
    return frame


# Text names for the class labels: 0 -> BadOutcome, 1 -> GoodOutcome.
label_mapping = {0: 'BadOutcome', 1: 'GoodOutcome'}

# NEEDED IN NM for classification: class labels are written as text.
# Labels whose name does not contain "CLASS" are left as they are.
class_label_names = label_mapping if "CLASS" in label_to_use else None

# Split df_merged back into the original per-domain frames, each carrying the
# selected label; the covariates frame is saved without a label column.
df_clin_and_socio = _extract_domain_frame(df_merged, df_clin_and_socio.columns, label_to_use, class_label_names)
df_health = _extract_domain_frame(df_merged, df_health.columns, label_to_use, class_label_names)
df_cognitive = _extract_domain_frame(df_merged, df_cognitive.columns, label_to_use, class_label_names)
df_quality_of_life = _extract_domain_frame(df_merged, df_quality_of_life.columns, label_to_use, class_label_names)
df_covariates = _extract_domain_frame(df_merged, df_covariates.columns)


# Create the output folder first: to_excel does not create missing
# directories and would fail on a fresh checkout.
os.makedirs(dir_output, exist_ok=True)

# Save the per-domain tables to Excel, without the index column.
df_clin_and_socio.to_excel(dir_output+'clinical_and_sociodemographic.xlsx', index = False)
df_cognitive.to_excel(dir_output+'cognitive.xlsx', index = False)
df_health.to_excel(dir_output+'health.xlsx', index = False)
df_quality_of_life.to_excel(dir_output+'quality_of_life.xlsx', index = False)
df_covariates.to_excel(dir_output+'covariates.xlsx', index = False)

################ GENERATE THE OTHER LABELS ########################
# Collect every analysis_labels column still present in the filtered merged
# frame (subject id plus outcome labels), in df_merged's column order.
# NOTE(review): the original comment here said that NeuroMiner assigns 1 to the
# first label in the dataset and that numeric labels could be generated the
# same way; no such conversion happens in this cell -- the label values are
# saved exactly as they are in df_merged.
label_columns_present = df_merged.columns[df_merged.columns.isin(analysis_labels)]
df_labels = df_merged[label_columns_present]

# Write the label table to Excel, without the index column.
labels_path = dir_output+'labels.xlsx'
df_labels.to_excel(labels_path, index = False)