In [48]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import seaborn as sns
# Do not truncate wide DataFrames when they are displayed: show every column.
pd.options.display.max_columns = None

### 0. Data Pre-processing and Cleaning

Features that cannot be measured directly and may cause data leakage if included for modeling are typically those that:

1. **Are derived from the target variable**: Any feature that directly or indirectly reflects information about the target variable or the outcome of interest should not be included. For example:
   - `BL_Diagnosis`: This appears to be the baseline diagnosis, which might directly influence the subsequent diagnosis (`Final_Diagnosis`), so it should not be used in the model.
   - `Final_Diagnosis`: This is likely the final diagnosis, which is derived from other diagnostic measures and should not be used for modeling.

2. **Contain future information**: Features that contain information that would not be available at the time of prediction should be excluded. For example:
   - Features related to diagnosis or cognitive scores at later time points (e.g., `ADAS_COG`, `ADCS_ADL` at subsequent months) should not be used because they provide future information that would not be available when making predictions.

3. **Are identifiers**: Features that merely identify a sample (or are unique to it) and carry no meaningful information for prediction should be excluded. For example:
   - `Subject_ID`: This is likely an identifier for each subject and should not be used for modeling.

4. **Contain metadata or batch information**: Features that contain metadata or batch information, such as `Month`, should be excluded if they do not provide meaningful information for prediction.

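Based on the provided dataset, the columns that should not be used as model inputs (to avoid data leakage) are likely the ones listed below. Note that `Final_Diagnosis` and `Subject_ID` are not dropped by the cleaning code in this notebook, so they remain in the exported file and must be excluded from the feature matrix at modeling time: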
- `BL_Diagnosis`
- `Final_Diagnosis`
- `ADAS_COG` (if used for predicting later diagnoses)
- `ADCS_ADL` (if used for predicting later diagnoses)
- `Subject_ID`
- `Month` (if it represents future time points)
- `SNP_batch` (if it contains batch information)

In [116]:
# Load the clinical table from CSV and drop the columns listed in `remov_cols`.
mri_data = pd.read_csv('../data/ANMerge_clinical_under_90.csv')
remov_cols = [
    'Unnamed: 0', 'Sadman_ID', 'Site', 'Max_Visit', 'APOE', 'Sex',
    'Onset_Age', 'Onset_Pattern', 'Progression_Pattern', 'Marital_Status',
    'MMSE', 'Global_Deterioration_Scale', 'Onset_Year', 'Age',
    'BL_Diagnosis', 'Diagnosis', 'SNP_batch', 'ADAS_COG', 'ADCS_ADL', 'Month',
]
# Reassign rather than drop in place, so the cell reads as a plain load -> transform step.
mri_data = mri_data.drop(columns=remov_cols)
# The preview must be the last expression of the cell; anywhere else its result
# is discarded and nothing is displayed.
mri_data.tail()

In [117]:
# Pull the Subject_ID column out of the loaded table as its own Series.
mri_data_subject_id = mri_data.loc[:, 'Subject_ID']

In [118]:
# Collect the distinct subject IDs of the table, then report how many there are
# followed by the table's (rows, columns) shape, one value per line.
common_subject_id = set(mri_data_subject_id)
print(len(common_subject_id), mri_data.shape, sep='\n')

1695
(4448, 25)


In [119]:
# Display all rows recorded for one example subject.
is_example_subject = mri_data['Subject_ID'] == 'LNDADC015'
mri_data.loc[is_example_subject]

Unnamed: 0,Subject_ID,Visit,Final_Diagnosis,CDR_SOB,CDR_Total,CERAD_A_Total,CERAD_B_Total,CERAD_C_Total,CERAD_D_Total,CERAD_E_Correct,CERAD_E_Intrusions,CERAD_F_Total,CERAD_G_Total,Cant_Read_Total,DEMQOL,Father_Dementia,Fulltime_Education_Years,Geriatric_Depression,Hachinski,MOCA,Mother_Dementia,NPI,Sensory_Impairment,TICSM,Webster
2309,LNDADC015,1,AD,6.0,1.0,,,,,,,,,,,Do_not_know,13.0,,4.0,,No,26.0,1.0,,2.0
2310,LNDADC015,10,AD,,,,,,,,,,,,,Do_not_know,13.0,,,,,23.0,,,
2311,LNDADC015,11,AD,,,,,,,,,,,,,Do_not_know,13.0,,,,,19.0,,,
2312,LNDADC015,2,AD,6.0,1.0,,,,,,,,,,Good,Do_not_know,13.0,,0.0,,No,6.0,1.0,,2.0
2313,LNDADC015,3,AD,6.5,1.0,,,,,,,,,,Good,Do_not_know,13.0,,3.0,,No,5.0,0.0,,6.0
2314,LNDADC015,4,AD,6.5,1.0,,,,,,,,,,Fair,Do_not_know,13.0,,3.0,,Do_not_know,9.0,1.0,,6.0
2315,LNDADC015,5,AD,10.0,2.0,,,,,,,,,,Fair,Do_not_know,13.0,,3.0,,Do_not_know,11.0,0.0,,4.0
2316,LNDADC015,6,AD,12.0,2.0,,,,,,,,,,Good,Do_not_know,13.0,,3.0,,Do_not_know,40.0,1.0,,11.0
2317,LNDADC015,7,AD,14.0,2.0,,,,,,,,,,Poor,Do_not_know,13.0,,3.0,,Do_not_know,34.0,1.0,,20.0
2318,LNDADC015,8,AD,15.0,2.0,,,,,,,,,,,Do_not_know,13.0,,3.0,,Do_not_know,55.0,1.0,,24.0


In [124]:
# Display all rows recorded for one example subject.
# NOTE(review): this cell's execution count (124) is higher than that of the
# encoding and sorting cells placed after it (121, 122), and its saved output
# shows numeric dementia codes and visit-ordered rows, so it was run after them.
# On a top-to-bottom run it would show the un-encoded, unsorted rows instead.
is_example_subject = mri_data['Subject_ID'] == 'LNDADC015'
mri_data.loc[is_example_subject]

Unnamed: 0,Subject_ID,Visit,Final_Diagnosis,CDR_SOB,CDR_Total,CERAD_A_Total,CERAD_B_Total,CERAD_C_Total,CERAD_D_Total,CERAD_E_Correct,CERAD_E_Intrusions,CERAD_F_Total,CERAD_G_Total,Cant_Read_Total,DEMQOL,Father_Dementia,Fulltime_Education_Years,Geriatric_Depression,Hachinski,MOCA,Mother_Dementia,NPI,Sensory_Impairment,TICSM,Webster
2309,LNDADC015,1,AD,6.0,1.0,,,,,,,,,,,-1.0,13.0,,4.0,,0.0,26.0,1.0,,2.0
2312,LNDADC015,2,AD,6.0,1.0,,,,,,,,,,Good,-1.0,13.0,,0.0,,0.0,6.0,1.0,,2.0
2313,LNDADC015,3,AD,6.5,1.0,,,,,,,,,,Good,-1.0,13.0,,3.0,,0.0,5.0,0.0,,6.0
2314,LNDADC015,4,AD,6.5,1.0,,,,,,,,,,Fair,-1.0,13.0,,3.0,,0.0,9.0,1.0,,6.0
2315,LNDADC015,5,AD,10.0,2.0,,,,,,,,,,Fair,-1.0,13.0,,3.0,,0.0,11.0,0.0,,4.0
2316,LNDADC015,6,AD,12.0,2.0,,,,,,,,,,Good,-1.0,13.0,,3.0,,0.0,40.0,1.0,,11.0
2317,LNDADC015,7,AD,14.0,2.0,,,,,,,,,,Poor,-1.0,13.0,,3.0,,0.0,34.0,1.0,,20.0
2318,LNDADC015,8,AD,15.0,2.0,,,,,,,,,,,-1.0,13.0,,3.0,,0.0,55.0,1.0,,24.0
2310,LNDADC015,10,AD,,,,,,,,,,,,,-1.0,13.0,,,,0.0,23.0,,,
2311,LNDADC015,11,AD,,,,,,,,,,,,,-1.0,13.0,,,,0.0,19.0,,,


In [121]:
# Encode the parental-dementia answers as numbers: Do_not_know -> -1, No -> 0, Yes -> 1.
# Any other value (including missing) becomes NaN, as with a plain dict-based map.
static_cols = ['Father_Dementia','Mother_Dementia']
dementia_codes = {'Do_not_know':-1,'No':0,'Yes':1}
for column in static_cols:
    # Re-running this cell must not destroy the data: mapping an already-encoded
    # (numeric) column through the string-keyed dict would turn every value into NaN.
    if pd.api.types.is_numeric_dtype(mri_data[column]):
        continue
    mri_data[column] = mri_data[column].map(dementia_codes)

In [122]:
# Columns treated as constant per subject: every row of a subject receives that
# subject's maximum non-missing value. With the -1/0/1 dementia coding applied in
# the previous cell, the maximum prefers Yes over No over Do_not_know.
static_cols = ['Father_Dementia','Fulltime_Education_Years','Mother_Dementia']
# Order rows by subject and then by visit; the index labels are kept as they are.
mri_data = mri_data.sort_values(by=['Subject_ID', 'Visit'])
# Use the built-in 'max' reduction (vectorised) instead of a per-group Python lambda,
# and compute all columns from the same sorted frame before writing any of them back.
subject_max = mri_data.groupby('Subject_ID')[static_cols].transform('max')
for column in static_cols:
    mri_data[column] = subject_max[column]

#### Export Dataset

In [125]:
# Report the number of columns in the cleaned clinical table. The count covers
# every column of the frame, including Subject_ID, Visit and Final_Diagnosis.
# The label now says "Clinical": the frame is loaded from the clinical CSV and
# exported as Clinical_data_0.csv, so the former "Genotype" wording was wrong.
print(f"Number of features in Clinical data: {mri_data.shape[1]}")

Number of features in Genotype data: 25


In [126]:
# Write the cleaned table to disk as CSV; the row index is left out of the file.
export_path = '../data/Clinical_data_0.csv'
mri_data.to_csv(export_path, index=False)