In [4]:
import pandas as pd
import numpy as np

# Notebook display settings: show at most 5 rows, but never truncate columns.
for option_name, option_value in (('display.max_rows', 5), ('display.max_columns', None)):
    pd.set_option(option_name, option_value)

In [5]:
## Directories.
# All paths are relative to the working directory. Each input path must end with '/'
# because file names are appended to it by plain string concatenation.
# Folders with the baseline input workbooks of each study.
dir_input_eufest_baseline = 'EUFEST preproc/Baseline/Merged Datasets/'
dir_input_raise_baseline = 'RAISE preproc/Baseline/Merged Datasets/'
# Folders with the efficacy-label workbooks of each study.
dir_input_eufest_labels = 'EUFEST preproc/Prediction set/Merged Datasets/'
dir_input_raise_labels = 'RAISE preproc/Prediction set/Merged Datasets/'
# Prefix put in front of the output file names; empty means the working directory.
dir_output = ""
## EUFEST

# NOTE: violators have been removed and every data set was created as a left merge
# to V1, so all of them contain the same patient IDs.

# Read the EUFEST baseline workbooks and the efficacy labels.
df_clin_and_socio = pd.read_excel(f"{dir_input_eufest_baseline}clinical_and_sociodemographic.xlsx")
df_blood_data = pd.read_excel(f"{dir_input_eufest_baseline}lab_data.xlsx")
df_moa_mapping_data = pd.read_excel(f"{dir_input_eufest_baseline}moa_mapping.xlsx")
df_cognitive = pd.read_excel(f"{dir_input_eufest_baseline}cognitive.xlsx")
df_quality_of_life = pd.read_excel(f"{dir_input_eufest_baseline}quality_of_life.xlsx")
df_prognosis = pd.read_excel(f"{dir_input_eufest_baseline}prognosis.xlsx")
df_covariates = pd.read_excel(f"{dir_input_eufest_baseline}covariates.xlsx")
df_labels = pd.read_excel(f"{dir_input_eufest_labels}efficacy_labels.xlsx")

# Left-join every other table, in this order, onto the clinical/sociodemographic
# table using the patient ID column 'crfnr'.
df_eufest = df_clin_and_socio
for eufest_table in (
    df_blood_data,
    df_moa_mapping_data,
    df_cognitive,
    df_quality_of_life,
    df_prognosis,
    df_covariates,
    df_labels,
):
    df_eufest = df_eufest.merge(eufest_table, on='crfnr', how='left')

## RAISE
# Read the RAISE baseline workbooks and the efficacy labels.
# The df_* names used for EUFEST are reused here and now refer to RAISE tables.
df_clin_and_socio = pd.read_excel(f"{dir_input_raise_baseline}clinical_and_sociodemographic.xlsx")
df_health = pd.read_excel(f"{dir_input_raise_baseline}health.xlsx")
df_cognitive = pd.read_excel(f"{dir_input_raise_baseline}cognitive.xlsx")
df_quality_of_life = pd.read_excel(f"{dir_input_raise_baseline}quality_of_life.xlsx")
df_covariates = pd.read_excel(f"{dir_input_raise_baseline}covariates.xlsx")
df_labels = pd.read_excel(f"{dir_input_raise_labels}efficacy_labels.xlsx")

# Left-join every other table, in this order, onto the clinical/sociodemographic
# table using the patient ID column 'src_subject_id'.
df_raise = df_clin_and_socio
for raise_table in (df_health, df_cognitive, df_quality_of_life, df_covariates, df_labels):
    df_raise = df_raise.merge(raise_table, on='src_subject_id', how='left')


In [6]:
######################## DIAGNOSIS #############################################
# One indicator column per harmonised diagnosis category, built for both data sets.
# Where a category spans several source columns, the indicator is 1 if any of them
# equals 1 and 0 otherwise (a missing value therefore counts as 0). Categories backed
# by a single source column are copied unchanged.
# The any-equals-1 check is vectorised (eq/any) instead of a row-by-row apply; the
# result is an int64 column of 0/1, as before.

# Schizophrenia diagnosis.
df_eufest['diag_1'] = df_eufest[['code1', 'code2', 'code3', 'code4']].eq(1).any(axis=1).astype('int64')
df_raise['diag_1'] = df_raise['dxbase_1.0']

# Schizophreniform disorder.
df_eufest['diag_2'] = df_eufest['code5']
df_raise['diag_2'] = df_raise[['dxbase_4.0', 'dxbase_5.0']].eq(1).any(axis=1).astype('int64')

# Schizoaffective disorder.
df_eufest['diag_3'] = df_eufest['code6']
df_raise['diag_3'] = df_raise[['dxbase_2.0', 'dxbase_3.0']].eq(1).any(axis=1).astype('int64')

# Other diagnosis (not covered in both data sets).
df_eufest['diag_4'] = df_eufest['code7']
df_raise['diag_4'] = df_raise[['dxbase_6.0', 'dxbase_7.0']].eq(1).any(axis=1).astype('int64')


################################################################################

######################## SUBSTANCE USE #########################################
# EUFEST indicators are 1 when the listed source column (or any of the listed
# columns) equals 1 and 0 otherwise, so missing values count as 0. RAISE indicators
# are copied unchanged from their source column.
# The equals-1 check is vectorised (eq/any) instead of a row-by-row apply; the
# result is an int64 column of 0/1, as before.

# Current alcohol use.
df_eufest['SA_Alcohol'] = df_eufest['V1SAL06'].eq(1).astype('int64')
df_raise['SA_Alcohol'] = df_raise['subus25']

# Cannabis use.
df_eufest['SA_Cannabis'] = df_eufest['V1SAL03'].eq(1).astype('int64')
df_raise['SA_Cannabis'] = df_raise['ca824']

# Previous cannabis use.
# NOTE(review): 'V1SA06' lacks the 'L' of the other V1SAL* columns used in this
# section — confirm this is the intended source column.
df_eufest['SA_Cannabis_prev'] = df_eufest['V1SA06'].eq(1).astype('int64')
df_raise['SA_Cannabis_prev'] = df_raise['subus29']

# Other illicit drug use: any of the listed EUFEST columns equals 1.
other_drug_columns = ['V1SAL08', 'V1SAL11', 'V1SAL14', 'V1SAL17', 'V1SAL20', 'V1SAL23']
df_eufest['SA_Other'] = df_eufest[other_drug_columns].eq(1).any(axis=1).astype('int64')
df_raise['SA_Other'] = df_raise['subus30']

################################################################################

######################## Calgary Depression Scale for Schizophrenia ############
# RAISE item calg<i> is renamed to the EUFEST name V1CD0<i> (i = 1..9);
# calg_ts is renamed to V1CDTOTAL.
column_mapping = {f'calg{item}': f'V1CD0{item}' for item in range(1, 10)}
column_mapping['calg_ts'] = 'V1CDTOTAL'

# Give the RAISE Calgary columns their EUFEST names.
df_raise = df_raise.rename(columns=column_mapping)

################################################################################


#################### Positive and Negative Syndrome Scale (PANSS) ##############

# RAISE item columns in EUFEST order: pos_p1-7, neg_n1-7 and gps_g1-16 are renamed
# to V1pa01 ... V1pa30.
panss_items = (
    [f'pos_p{k}' for k in range(1, 8)]
    + [f'neg_n{k}' for k in range(1, 8)]
    + [f'gps_g{k}' for k in range(1, 17)]
)
column_mapping = {
    item: f'V1pa{position:02d}'
    for position, item in enumerate(panss_items, start=1)
}

# Total-score columns.
column_mapping.update({
    'panss_ptotal': 'V1ptotal',
    'panss_ntotal': 'V1ntotal',
    'panss_gtotal': 'V1gtotal',
    'panss_total': 'V1total',
})

# Give the RAISE PANSS columns their EUFEST names.
df_raise = df_raise.rename(columns=column_mapping)

################################################################################

#################### Clinical Global Impression ################################
# The RAISE column 'cs16' takes the EUFEST name 'V1CGI_V1'.
df_raise = df_raise.rename(columns={'cs16': 'V1CGI_V1'})


################################################################################


######################## DEMOGRAPHICS #########################################
# RAISE column -> EUFEST column.
column_mapping = dict([
    ('interview_age', 'age'),
    ('sex', 'a003'),
    ('trt_set_1.0', 'a015_t_1'),
    ('trt_set_3.0', 'a015_t_2'),
    ('trt_set_2.0', 'a015_t_3'),
    ('trt_set_5.0', 'a015_t_4'),
    ('race_White', 'a022_1'),
    ('race_Black or African American', 'a022_2'),
    ('race_Asian', 'a022_3'),
    ('workyn', 'a030'),
    ('curres_1.0', 'a041'),
])

# Convert the interview age from months to years before it is renamed to 'age'.
df_raise['interview_age'] = df_raise['interview_age'].div(12)

# Give the RAISE demographic columns their EUFEST names.
df_raise = df_raise.rename(columns=column_mapping)




###############################################################################

######################## Blood data ###########################################

# RAISE lab columns and, position by position, their EUFEST names.
raise_lab_columns = ['rsptc_no', 'rsphdl_no', 'rspldl_no', 'rsptrig_no']
eufest_lab_columns = ['V1l05', 'V1l06', 'V1l07', 'V1l09']
column_mapping = dict(zip(raise_lab_columns, eufest_lab_columns))

# Give the RAISE lab columns their EUFEST names.
df_raise = df_raise.rename(columns=column_mapping)

###############################################################################

######################## Health ###############################################

# RAISE vital-sign and BMI columns take their EUFEST names.
column_mapping = dict(
    vital_diabp='ph05',
    vital_sysbp='ph04',
    vital_pulse='ph06',
    bmi='phbmi',
)
df_raise = df_raise.rename(columns=column_mapping)

# Derive 'phbmi' for EUFEST with the imperial formula 703 * pounds / inches**2.
# Assumes ph02 is the weight in kg and ph01 the height in cm — TODO confirm.
weight_lb = df_eufest['ph02'] * 2.20462
height_in = df_eufest['ph01'] / 2.54
df_eufest['phbmi'] = weight_lb / height_in ** 2 * 703

###############################################################################


######################## QUALITY OF LIFE ######################################

# Matching items, position by position: raise_vars[i] is renamed to eufest_vars[i].
eufest_vars = ['V1MA02', 'V1MA06', 'V1MA12', 'V1MA13', 'V1MA14', 'V1MA15', 'V1MA16']
raise_vars = ['qol13', 'qol03', 'qol05', 'qol02', 'qol09', 'sf01', 'srf019']


def _rescale(values, lowest, highest):
    """Linearly rescale scores so that `lowest` maps to 0 and `highest` to 1."""
    return (values - lowest) / (highest - lowest)


# Every score is standardised by the minimum and maximum possible value of its scale.

# MANSA scores (EUFEST) go from 1 to 7.
df_eufest[eufest_vars] = _rescale(df_eufest[eufest_vars], 1, 7)

# QOL scores (RAISE) go from 0 to 6.
qol_items = raise_vars[:5]
df_raise[qol_items] = _rescale(df_raise[qol_items], 0, 6)

# SF-12 scores go from 1 (good) to 5 (poor): rescale, then invert because the
# scale runs in the opposite direction to MANSA.
df_raise['sf01'] = 1 - _rescale(df_raise['sf01'], 1, 5)

# SRF scores go from 1 to 7.
df_raise['srf019'] = _rescale(df_raise['srf019'], 1, 7)

# Give the RAISE quality-of-life columns their EUFEST names.
column_mapping = dict(zip(raise_vars, eufest_vars))
df_raise = df_raise.rename(columns=column_mapping)


###############################################################################


######################## Neurocognitive performance ###########################

# Hand-coordination total score in EUFEST: V1NEU10 + V1NEU11 + V1NEU12.
df_eufest['V1NEUHAND'] = df_eufest['V1NEU10'].add(df_eufest['V1NEU11']).add(df_eufest['V1NEU12'])

# Normalise the executive-functioning variable within each data set
# (subtract the column mean, divide by the column standard deviation).
for frame, column in ((df_eufest, 'V1NEU04MINUS01'), (df_raise, 'bacs_tl_total')):
    scores = frame[column]
    frame[column] = (scores - scores.mean()) / scores.std()

# RAISE column -> EUFEST column.
column_mapping = dict([
    ('bacvm1', 'V1NEU47'),
    ('bacvm2', 'V1NEU50'),
    ('bacvm3', 'V1NEU53'),
    ('bacvm4', 'V1NEU56'),
    ('bacvm5', 'V1NEU59'),
    ('bacs_vmttot', 'V1NESUMCORR'),
    ('bacs_sc_total', 'V1NEU07'),
    ('bactmts', 'V1NEUHAND'),
    ('bacs_tl_total', 'V1NEU04MINUS01'),
])

# Give the RAISE neurocognitive columns their EUFEST names.
df_raise = df_raise.rename(columns=column_mapping)


###############################################################################

######################## TREATMENT ARMS ########################################

# Build the EUFEST-named medication columns in RAISE from its adhetpc1_<code>.0
# columns. Each entry is (target column, source codes): the target is the
# element-wise OR of the listed source columns, or the constant False when no code
# is listed. Entries are processed in the order given.
treatment_sources = [
    # Study arms.
    ('StudyArm_1', (13, 25)),                      # Haloperidol
    ('StudyArm_2', (2,)),                          # Olanzapine
    ('StudyArm_3', (3,)),                          # Quetiapine
    ('StudyArm_4', ()),                            # Amisulpride
    ('StudyArm_5', (5,)),                          # Ziprasidone
    # Other antipsychotics.
    ('OTHERANTIPSY_Risperidone', (4, 22)),
    ('OTHERANTIPSY_Clozapine', (21,)),
    ('OTHERANTIPSY_Olanzapine', (2,)),
    ('OTHERANTIPSY_Quetiapine', (3,)),
    ('OTHERANTIPSY_Sulpiride', ()),
    ('OTHERANTIPSY_Haloperidol', (13, 25)),
    ('OTHERANTIPSY_Zuclopenthixol', ()),
    ('OTHERANTIPSY_Perphenazine', (14,)),
    ('OTHERANTIPSY_Chlorpromazine', (11,)),
    ('OTHERANTIPSY_Levomepromazine', ()),
    ('OTHERANTIPSY_Amisulpride', ()),
    ('OTHERANTIPSY_Prothipendyl', ()),
    ('OTHERANTIPSY_Clotiapine', ()),
    ('OTHERANTIPSY_Gabapentin', ()),
    ('OTHERANTIPSY_Perazine', ()),
    ('OTHERANTIPSY_Flupentixol', ()),
    ('OTHERANTIPSY_Penfluridol', ()),
    ('OTHERANTIPSY_Pimozide', ()),
    ('OTHERANTIPSY_Promazine', ()),
    ('OTHERANTIPSY_Fluphenazine', (12, 24)),
    ('OTHERANTIPSY_Loxapine', (19,)),
    ('OTHERANTIPSY_Ziprasidone', (5,)),
    ('OTHERANTIPSY_Chlorprothixene', ()),
    ('OTHERANTIPSY_Trifluoperazine', ()),
    ('OTHERANTIPSY_Cyamemazine', ()),
    ('OTHERANTIPSY_Pipamperone', ()),
    ('OTHERANTIPSY_Zotepine', ()),
    ('OTHERANTIPSY_Thioridazine', ()),
    ('OTHERANTIPSY_Prochlorperazine', ()),
    # Concomitant psychoanaleptic medication.
    ('CONCOMMED_PSYCHOANALEPTICS', (30, 31, 32, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44)),
]

for target_column, codes in treatment_sources:
    if not codes:
        # No RAISE source column is listed for this medication.
        df_raise[target_column] = False
        continue
    indicator = df_raise[f'adhetpc1_{codes[0]}.0']
    for code in codes[1:]:
        indicator = indicator | df_raise[f'adhetpc1_{code}.0']
    df_raise[target_column] = indicator




###############################################################################

######################## MOVEMENT SIDE EFFECTS (disabled: all code in this section is commented out) ####

#raise_vars = ['eps3', 'eps1', 'eps2', 'eps5', 'aims_facial_score_date1', 'aims_trunk_score_date1', 'aims_extrem_score_date1']
#eufest_vars = ['V1SHRS08',  'V1SHRS11', 'V1SHRS10', 'V1SHRS010203', 'V1SHRS26272829', 'V1SHRS31', 'V1SHRS3233']


#column_mapping = {
    #'eps3': 'V1SHRS08',
#    'eps1': 'V1SHRS11',
#    'eps2': 'V1SHRS10',
#    'eps5': 'V1SHRS010203',
#    'aims_facial_score_date1': 'V1SHRS26272829',
#    'aims_trunk_score_date1': 'V1SHRS31',
#    'aims_extrem_score_date1': 'V1SHRS3233'
#}


#Conglomerate akathisia. 
#df_eufest['V1SHRS010203'] = df_eufest[['V1SHRS01', 'V1SHRS02', 'V1SHRS03']].max(axis=1)
  
#Conglomerate facial.
#df_eufest['V1SHRS26272829'] = df_eufest[['V1SHRS26', 'V1SHRS27', 'V1SHRS28', 'V1SHRS29']].max(axis=1)
                                        
#Conglomerate extremities.
#df_eufest['V1SHRS3233'] = df_eufest[['V1SHRS32', 'V1SHRS33']].max(axis=1)

#Rescale the variables.
#for var in raise_vars:
#    df_raise[var] = (df_raise[var] - 1) / (5-1)
                                    
#for var in eufest_vars:
#    df_eufest[var] = (df_eufest[var] - 0) / (6-0)
                                    
# Rename columns
#df_raise.rename(columns=column_mapping, inplace=True)                                    
###############################################################################
                                    
############################# LABELS ####################################

# RAISE label columns and, position by position, their EUFEST names.
raise_label_columns = [
    'M06CD_ABSDIFF',
    'M06CD_DIFF_CLASS',
    'M12CD_ABSDIFF',
    'M12CD_DIFF_CLASS',
    'MJust6CD_PSD_CLASS',
    'MJust6CD_PSD_REG',
]
eufest_label_columns = [
    'V7CD_ABSDIFF',
    'V7CD_DIFF_CLASS',
    'V9CD_ABSDIFF',
    'V9CD_DIFF_CLASS',
    'V367CD_PSD_CLASS',
    'V367CD_PSD_REG',
]
column_mapping = dict(zip(raise_label_columns, eufest_label_columns))

# Give the RAISE label columns their EUFEST names.
df_raise = df_raise.rename(columns=column_mapping)

###############################################################################

############################# Patient ID ######################################
# The RAISE patient ID column takes the EUFEST name 'crfnr'.
df_raise = df_raise.rename(columns={'src_subject_id': 'crfnr'})
###############################################################################

# Variables that were harmonised (patient ID first), exported for both studies.
# Fix: 'SA_Cannabis' used to be listed twice, which duplicated that column in the
# export and left out 'SA_Cannabis_prev'; the second entry is now 'SA_Cannabis_prev'.
# NOTE(review): OTHERANTIPSY_Olanzapine, OTHERANTIPSY_Quetiapine,
# OTHERANTIPSY_Amisulpride and OTHERANTIPSY_Ziprasidone are created for RAISE but
# are not in this list — confirm the omission is intended.
variables = [
    'crfnr', 'diag_1', 'diag_2', 'diag_3', 'diag_4',
    'SA_Alcohol', 'SA_Cannabis', 'SA_Cannabis_prev', 'SA_Other',
    'V1CD01', 'V1CD02', 'V1CD03', 'V1CD04', 'V1CD05', 'V1CD06', 'V1CD07', 'V1CD08', 'V1CD09', 'V1CDTOTAL',
    'V1pa01', 'V1pa02', 'V1pa03', 'V1pa04', 'V1pa05', 'V1pa06', 'V1pa07', 'V1pa08', 'V1pa09', 'V1pa10',
    'V1pa11', 'V1pa12', 'V1pa13', 'V1pa14', 'V1pa15', 'V1pa16', 'V1pa17', 'V1pa18', 'V1pa19', 'V1pa20',
    'V1pa21', 'V1pa22', 'V1pa23', 'V1pa24', 'V1pa25', 'V1pa26', 'V1pa27', 'V1pa28', 'V1pa29', 'V1pa30',
    'V1ptotal', 'V1ntotal', 'V1gtotal', 'V1total', 'V1CGI_V1',
    'age', 'a003', 'a015_t_1', 'a015_t_2', 'a015_t_3', 'a015_t_4',
    'a022_1', 'a022_2', 'a022_3', 'a030', 'a041',
    'V1l05', 'V1l06', 'V1l07', 'V1l09',
    'ph05', 'ph04', 'ph06', 'phbmi',
    'V1MA02', 'V1MA06', 'V1MA12', 'V1MA13', 'V1MA14', 'V1MA15', 'V1MA16',
    'V1NEU47', 'V1NEU50', 'V1NEU53', 'V1NEU56', 'V1NEU59',
    'V1NESUMCORR', 'V1NEU07', 'V1NEUHAND', 'V1NEU04MINUS01',
    'StudyArm_1', 'StudyArm_2', 'StudyArm_3', 'StudyArm_4', 'StudyArm_5',
    'OTHERANTIPSY_Risperidone', 'OTHERANTIPSY_Clozapine',
    'OTHERANTIPSY_Sulpiride', 'OTHERANTIPSY_Haloperidol', 'OTHERANTIPSY_Zuclopenthixol', 'OTHERANTIPSY_Perphenazine',
    'OTHERANTIPSY_Chlorpromazine', 'OTHERANTIPSY_Levomepromazine', 'OTHERANTIPSY_Prothipendyl', 'OTHERANTIPSY_Clotiapine',
    'OTHERANTIPSY_Gabapentin', 'OTHERANTIPSY_Perazine', 'OTHERANTIPSY_Flupentixol', 'OTHERANTIPSY_Penfluridol',
    'OTHERANTIPSY_Pimozide', 'OTHERANTIPSY_Promazine',
    'OTHERANTIPSY_Fluphenazine', 'OTHERANTIPSY_Loxapine', 'OTHERANTIPSY_Chlorprothixene', 'OTHERANTIPSY_Trifluoperazine',
    'OTHERANTIPSY_Cyamemazine', 'OTHERANTIPSY_Pipamperone', 'OTHERANTIPSY_Zotepine', 'OTHERANTIPSY_Thioridazine',
    'OTHERANTIPSY_Prochlorperazine',
    'CONCOMMED_PSYCHOANALEPTICS',
]

# Outcome label columns (EUFEST names).
labels = ['V7CD_ABSDIFF', 'V7CD_DIFF_CLASS', 'V9CD_ABSDIFF', 'V9CD_DIFF_CLASS', 'V367CD_PSD_CLASS', 'V367CD_PSD_REG']

# Guard against a repeated name: selecting it twice would silently duplicate the
# column in the exported files.
export_columns = variables + labels
duplicated_names = sorted({name for name in export_columns if export_columns.count(name) > 1})
if duplicated_names:
    raise ValueError(f"Duplicate column names in the export list: {duplicated_names}")

# Keep only the harmonised columns, in the same order for both data sets.
# A missing column raises KeyError here.
df_eufest = df_eufest[export_columns].copy()
df_raise = df_raise[export_columns].copy()

# Save to Excel without the row index.
df_eufest.to_excel(dir_output + 'harmonised_eufest_data.xlsx', index=False)
df_raise.to_excel(dir_output + 'harmonised_raise_data.xlsx', index=False)
