In [1]:
import warnings

import numpy as np
import pandas as pd

# Notebook display: show at most 5 rows of a DataFrame, but never truncate columns.
pd.set_option("display.max_rows", 5)
pd.set_option("display.max_columns", None)

# Input folders (relative paths; the trailing slash is required because file
# names are appended by plain string concatenation).
dir_input_pred = "Prediction set/Merged Datasets/"    # efficacy label file
dir_input_baseline = "Baseline/Merged Datasets/"      # baseline predictor tables

# Output folder. The empty string means files are written to the current
# working directory.
dir_output = ""

In [2]:
# Load the baseline tables, one DataFrame per data modality.

def _read_baseline(file_name):
    """Read one Excel workbook from the baseline input folder into a DataFrame."""
    return pd.read_excel(dir_input_baseline + file_name)


# Clinical and sociodemographic data
df_clin_and_socio = _read_baseline('clinical_and_sociodemographic.xlsx')

# Lab data
df_blood_data = _read_baseline('lab_data.xlsx')

# MOA mapping data
df_moa_mapping_data = _read_baseline('moa_mapping.xlsx')

# Cognitive data
df_cognitive = _read_baseline('cognitive.xlsx')

# Quality of life data
df_quality_of_life = _read_baseline('quality_of_life.xlsx')

# Prognosis data
df_prognosis = _read_baseline('prognosis.xlsx')

# Covariates data
df_covariates = _read_baseline('covariates.xlsx')

# Columns read from the efficacy label file: the patient key 'crfnr' followed by
# every outcome label that is carried through this notebook.
analysis_labels = ['crfnr','V3CD_ABSDIFF', 'V3CD_DIFF_CLASS', 'V3CD_PSD_CLASS', 'V7CD_ABSDIFF', 'V7CD_DIFF_CLASS', 'V7CD_PSD_CLASS',
                   'V9CD_ABSDIFF', 'V9CD_DIFF_CLASS', 'V9CD_PSD_CLASS', 'V3679CD_PSD_CLASS', 'V367CD_PSD_CLASS', 'V3PANSS_POS_ABSDIFF', 'V3PANSS_POS_DIFF_CLASS', 
                    'V7PANSS_POS_ABSDIFF', 'V7PANSS_POS_DIFF_CLASS', 'V9PANSS_POS_ABSDIFF', 'V9PANSS_POS_DIFF_CLASS', 
                   'V3PANSS_NEG_ABSDIFF', 'V3PANSS_MARDER_ABSDIFF', 'V3PANSS_NEG_DIFF_CLASS', 'V3PANSS_MARDER_DIFF_CLASS', 'V7PANSS_NEG_ABSDIFF', 'V7PANSS_MARDER_ABSDIFF', 'V7PANSS_NEG_DIFF_CLASS',
                   'V7PANSS_MARDER_DIFF_CLASS', 'V9PANSS_NEG_ABSDIFF',  'V9PANSS_MARDER_ABSDIFF', 'V9PANSS_NEG_DIFF_CLASS', 'V9PANSS_MARDER_DIFF_CLASS', 'V367PANSS_NEG_ANDREASEN_CLASS', 'V367PANSS_MARDER_REMISSION_ANDREASEN_CLASS', 'V3GAF_ABSDIFF', 'V3GAF_DIFF_CLASS', 
                   'V7GAF_ABSDIFF', 'V7GAF_DIFF_CLASS', 'V9GAF_ABSDIFF', 'V9GAF_DIFF_CLASS']

# Label that is appended to each per-modality table when the merged data is
# split again further down.
label_to_use = 'V367CD_PSD_REG'

# Only the columns in `analysis_labels` are loaded below. A `label_to_use` that
# is not in that list never reaches the merged data, so the per-modality tables
# would be written without a label column and without any error. Warn instead
# of letting that pass silently.
if label_to_use not in analysis_labels:
    warnings.warn(
        "label_to_use=%r is not listed in analysis_labels; it will not be loaded, "
        "so the per-modality output files will not contain a label column."
        % (label_to_use,)
    )

# Load the outcome labels, restricted to the columns listed above.
df_labels = pd.read_excel(dir_input_pred + 'efficacy_labels.xlsx', usecols = analysis_labels)

# Author's note: violators have already been removed and every dataset was
# created as a left merge to V1, so all tables share the same patient IDs.

# Left-merge every baseline table and the labels onto the clinical table,
# using the patient key 'crfnr'.
df_merged = (
    df_clin_and_socio
    .merge(df_blood_data, on='crfnr', how='left')
    .merge(df_moa_mapping_data, on='crfnr', how='left')
    .merge(df_cognitive, on='crfnr', how='left')
    .merge(df_quality_of_life, on='crfnr', how='left')
    .merge(df_prognosis, on='crfnr', how='left')
    .merge(df_covariates, on='crfnr', how='left')
    .merge(df_labels, on='crfnr', how='left')
)

# Keep only rows in which every column of `analysis_labels` is present
# (a row is dropped if ANY of those columns is NaN).
df_merged = df_merged.dropna(subset=analysis_labels)

threshold_percentage = 20
max_missing_fraction = threshold_percentage / 100

# Step 1: drop columns whose share of missing values exceeds the threshold.
missing_per_column = df_merged.isna().mean()
sparse_columns = df_merged.columns[missing_per_column > max_missing_fraction]
df_merged = df_merged.drop(columns=sparse_columns)

# Step 2: drop rows whose share of missing values exceeds the threshold,
# measured on the columns that survived step 1.
missing_per_row = df_merged.isna().mean(axis=1)
sparse_rows = df_merged.loc[missing_per_row > max_missing_fraction].index
df_merged = df_merged.drop(index=sparse_rows)

# Split the filtered df_merged back into one table per modality. Each table keeps
# the columns of its original (pre-merge) frame that survived the filtering.

def _columns_from(source_frame, with_label=True):
    """Select from df_merged the columns named in `source_frame`.

    When `with_label` is true, the `label_to_use` column is kept as well,
    provided it exists in df_merged.
    """
    keep = df_merged.columns.isin(source_frame.columns)
    if with_label:
        keep = keep | df_merged.columns.isin([label_to_use])
    return df_merged[df_merged.columns[keep]]


df_clin_and_socio = _columns_from(df_clin_and_socio)
df_blood_data = _columns_from(df_blood_data)
df_moa_mapping_data = _columns_from(df_moa_mapping_data)
df_cognitive = _columns_from(df_cognitive)
df_quality_of_life = _columns_from(df_quality_of_life)
df_prognosis = _columns_from(df_prognosis)
# The covariates table is the only one that does not receive the label column.
df_covariates = _columns_from(df_covariates, with_label=False)

# Text names for the class labels: 0 -> 'BadOutcome', 1 -> 'GoodOutcome'.
# Values that are not keys of this dict become NaN when mapped.
# NOTE(review): an earlier comment here described the coding as 1/2 -- confirm
# that the *_CLASS columns are really coded 0/1.
label_mapping = {0: 'BadOutcome', 1: 'GoodOutcome'}


def _with_text_labels(frame):
    """Return a new frame whose `label_to_use` column is recoded via `label_mapping`.

    `DataFrame.assign` builds a new frame instead of writing through
    `.loc[:, col] = ...`. The tables are column selections of df_merged, so the
    in-place form triggers SettingWithCopyWarning, and writing strings into a
    numeric column that way is deprecated by pandas (PDEP-6).
    Raises KeyError if `frame` has no `label_to_use` column.
    """
    return frame.assign(**{label_to_use: frame[label_to_use].map(label_mapping)})


# NEEDED IN NM for classification.
if "CLASS" in label_to_use:
    df_clin_and_socio = _with_text_labels(df_clin_and_socio)
    df_blood_data = _with_text_labels(df_blood_data)
    df_moa_mapping_data = _with_text_labels(df_moa_mapping_data)
    df_cognitive = _with_text_labels(df_cognitive)
    df_quality_of_life = _with_text_labels(df_quality_of_life)
    df_prognosis = _with_text_labels(df_prognosis)

# Write every per-modality table to an Excel file in `dir_output`, without the
# DataFrame index. Note that the MOA table is written as 'moa_mapping_data.xlsx',
# whereas it was read from 'moa_mapping.xlsx'.
for table, file_name in (
    (df_clin_and_socio, 'clinical_and_sociodemographic.xlsx'),
    (df_cognitive, 'cognitive.xlsx'),
    (df_blood_data, 'lab_data.xlsx'),
    (df_moa_mapping_data, 'moa_mapping_data.xlsx'),
    (df_quality_of_life, 'quality_of_life.xlsx'),
    (df_prognosis, 'prognosis.xlsx'),
    (df_covariates, 'covariates.xlsx'),
):
    table.to_excel(dir_output + file_name, index=False)

################ GENERATE THE OTHER LABELS ########################
# Author's note: Neurominer assigns 1 to the first label in the dataset, so numeric
# labels could be generated the same way. No numeric recoding is done here; the
# label columns are exported as they are in df_merged.

# Take the `analysis_labels` columns (including 'crfnr') from the filtered merged
# data, so the label table has the same rows as the per-modality tables.
is_label_column = df_merged.columns.isin(analysis_labels)
df_labels = df_merged[df_merged.columns[is_label_column]]

# Write the analysis labels to Excel without the DataFrame index.
df_labels.to_excel(dir_output + 'labels.xlsx', index=False)