In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import seaborn as sns

### 0. Data Pre-processing and Cleaning

In [18]:
#load the MRI table from disk and preview its last rows
mri_csv_path = '../data/ANMerge_MRI_FS5.3_under_90.csv'
mri_data = pd.read_csv(mri_csv_path)
mri_data.tail()

Unnamed: 0,Key,Visit,Month,Site,Diagnosis,Sex,Age,APOE,MMSE,lh_bankssts_thickness,...,SupraTentorialVol,SupraTentorialVolNotVent,SupraTentorialVolNotVentVox,MaskVol,BrainSegVol-to-eTIV,MaskVol-to-eTIV,lhSurfaceHoles,rhSurfaceHoles,SurfaceHoles,EstimatedTotalIntraCranialVol
1259,TLSMCI607_1,1.0,0.0,Toulouse,MCI,Female,72.0,E3E3,29.0,2.077,...,776741.9467,758853.9467,756265.0,1291996.0,0.694822,1.009026,20.0,14.0,34.0,1280438.726
1260,TLSMCI607_1.5,,,,,,,,,2.128,...,770502.2442,752913.2442,751888.0,1254851.0,0.697605,0.990604,17.0,16.0,33.0,1266753.178
1261,TLSMCI608_1,1.0,0.0,Toulouse,MCI,Male,73.0,E3E3,28.0,2.146,...,822888.9424,799621.9424,797171.0,1410540.0,0.684031,1.032691,35.0,29.0,64.0,1365887.517
1262,TLSMCI608_1.5,,,,,,,,,2.068,...,826240.3269,803357.3269,801129.0,1428442.0,0.685443,1.044901,39.0,34.0,73.0,1367059.954
1263,TLSMCI612_1,1.0,0.0,Toulouse,MCI,Female,79.0,E3E3,28.0,2.48,...,851105.3678,811701.3678,808219.0,1335847.0,0.705282,0.980842,20.0,16.0,36.0,1361939.428


In [19]:
#how many rows carry each MMSE score (rows with a missing MMSE are not counted)
mri_data['MMSE'].value_counts()

30.0    158
29.0    111
28.0    110
26.0    100
25.0     80
27.0     79
24.0     46
21.0     35
16.0     34
22.0     34
19.0     33
20.0     26
17.0     20
23.0     20
18.0     20
15.0     19
12.0     15
14.0     10
13.0      8
11.0      2
6.0       1
10.0      1
9.0       1
Name: MMSE, dtype: int64

In [3]:
#keep the Key column as the identifier series (the MRI table is the only dataset loaded in the cells shown)
mri_data_subject_id = mri_data['Key']

In [4]:
#collect the distinct Key values and report how many there are
common_subject_id = {subject_key for subject_key in mri_data_subject_id}
print(len(common_subject_id))

993


In [5]:
#remove rows that are exact duplicates of an earlier row; done in place, so mri_data itself is modified
mri_data.drop_duplicates(inplace=True)

In [None]:
# NOTE(review): ad-hoc shell listing of an absolute directory; its output is not captured or used by any cell shown here - consider removing.
!ls /l/users/

#### Imputing Missing Values

In [6]:
def impute_missing_values(data):
    """Fill missing values in ``data`` in place and return the same frame.

    Numeric columns are filled with the column mean and object-dtype columns
    with their most frequent value. ``Diagnosis`` is deliberately excluded,
    so its missing entries are still present after this step.

    Columns without a single observed value are left untouched, and a dtype
    group with no usable column is skipped instead of raising.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after imputation.
    """
    continuous_columns = list(data.select_dtypes(include=np.number).columns)
    categorical_columns = [
        column
        for column in data.select_dtypes(include='object').columns
        if column != 'Diagnosis'  # the label column is never imputed
    ]
    column_groups = (
        (continuous_columns, 'mean'),
        (categorical_columns, 'most_frequent'),
    )
    for columns, strategy in column_groups:
        # SimpleImputer discards columns that are entirely missing, which
        # would make the assignment below fail on a shape mismatch.
        fillable = [column for column in columns if data[column].notna().any()]
        if not fillable:
            # SimpleImputer refuses to fit on a selection with no columns.
            continue
        imputer = SimpleImputer(strategy=strategy)
        data[fillable] = imputer.fit_transform(data[fillable])
    return data

In [7]:
#fill the gaps; the function edits the frame in place and hands the same object back
mri_data = impute_missing_values(mri_data)
mri_data

Unnamed: 0,Key,Visit,Month,Site,Diagnosis,Sex,Age,APOE,MMSE,lh_bankssts_thickness,...,SupraTentorialVol,SupraTentorialVolNotVent,SupraTentorialVolNotVentVox,MaskVol,BrainSegVol-to-eTIV,MaskVol-to-eTIV,lhSurfaceHoles,rhSurfaceHoles,SurfaceHoles,EstimatedTotalIntraCranialVol
0,KPOADC001_1,1.000000,0.000000,Kuopio,AD,Female,86.0000,E4E4,20.000000,2.224,...,814643.7818,760261.7818,757058.0,1361906.0,0.664587,0.977432,24.0,13.0,37.0,1393350.752
1,KPOADC001_2,2.000000,3.000000,Kuopio,AD,Female,86.0000,E4E4,16.000000,2.140,...,815475.3006,759630.3006,756220.0,1319352.0,0.659338,0.941138,23.0,17.0,40.0,1401868.831
2,KPOADC001_5,5.000000,12.000000,Kuopio,AD,Female,87.0000,E4E4,21.000000,2.248,...,815311.4707,756965.4707,753356.0,1342921.0,0.660968,0.959201,30.0,24.0,54.0,1400041.258
3,KPOADC002_1,1.000000,0.000000,Kuopio,AD,Female,68.0000,E4E4,26.000000,1.971,...,871693.5631,826027.5631,824742.0,1515148.0,0.652202,0.976865,22.0,21.0,43.0,1551030.924
4,KPOADC002_2,2.000000,3.000000,Kuopio,AD,Female,68.0000,E4E4,25.000000,2.066,...,863403.9697,814547.9697,813231.0,1508917.0,0.643059,0.968317,20.0,9.0,29.0,1558288.189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259,TLSMCI607_1,1.000000,0.000000,Toulouse,MCI,Female,72.0000,E3E3,29.000000,2.077,...,776741.9467,758853.9467,756265.0,1291996.0,0.694822,1.009026,20.0,14.0,34.0,1280438.726
1260,TLSMCI607_1.5,1.850187,5.382022,Kuopio,,Female,74.8775,E3E3,24.967541,2.128,...,770502.2442,752913.2442,751888.0,1254851.0,0.697605,0.990604,17.0,16.0,33.0,1266753.178
1261,TLSMCI608_1,1.000000,0.000000,Toulouse,MCI,Male,73.0000,E3E3,28.000000,2.146,...,822888.9424,799621.9424,797171.0,1410540.0,0.684031,1.032691,35.0,29.0,64.0,1365887.517
1262,TLSMCI608_1.5,1.850187,5.382022,Kuopio,,Female,74.8775,E3E3,24.967541,2.068,...,826240.3269,803357.3269,801129.0,1428442.0,0.685443,1.044901,39.0,34.0,73.0,1367059.954


In [8]:
#count the cells that are still missing (Diagnosis is skipped by impute_missing_values, so its gaps remain)
remaining_missing = mri_data.isnull().sum().sum()
print(remaining_missing)

249


#### Scaling Continuous Values

In [9]:
def scale_data(data):
    """Min-max scale the numeric columns of ``data`` in place.

    ``Age``, ``Visit`` and ``Month`` are left in their original units when
    present; every other numeric column is passed through a ``MinMaxScaler``
    with default settings. The same frame is returned.
    """
    #numeric columns that must not be rescaled
    unscaled = ('Age', 'Visit', 'Month')
    numeric_columns = data.select_dtypes(include=np.number).columns
    continuous_columns = numeric_columns.drop(
        [name for name in unscaled if name in numeric_columns]
    )
    scaler = MinMaxScaler()
    data[continuous_columns] = scaler.fit_transform(data[continuous_columns])
    return data

In [10]:
#min-max scale the numeric columns; scale_data leaves Age, Visit and Month unscaled
mri_data = scale_data(mri_data)

#### Label Encode Categorical Features

In [11]:
def label_encode(data):
    """Integer-encode the object-dtype columns of ``data`` in place.

    Each encoded column is replaced by the codes from ``Series.factorize``.
    The column name and its mapping are printed; the mapping is an Index
    whose positions are the codes, so the encoding can be reversed.
    Missing values receive the code -1 and do not appear in the mapping.

    ``Key`` and ``PC1`` .. ``PC10`` are never encoded; whichever of them
    are present are skipped, and absent ones are simply ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with categorical columns encoded.
    """
    #identifier and PCA columns must keep their original values
    excluded_columns = ['Key'] + [f'PC{number}' for number in range(1, 11)]
    categorical_columns = data.select_dtypes(include='object').columns
    #errors='ignore': a frame holding only some of the excluded columns must not raise KeyError
    categorical_columns = categorical_columns.drop(excluded_columns, errors='ignore')
    #encode categorical data but print the mapping
    for column in categorical_columns:
        print(column)
        data[column], mapping_index = data[column].factorize()
        print(mapping_index)
    return data

In [12]:
#integer-encode the categorical columns; each printed Index lists that column's labels in code order
mri_data = label_encode(mri_data)

Site
Index(['Kuopio', 'Lodz', 'London', 'Perugia', 'Thessaloniki', 'Toulouse'], dtype='object')
Diagnosis
Index(['AD', 'CTL', 'MCI'], dtype='object')
Sex
Index(['Female', 'Male'], dtype='object')
APOE
Index(['E4E4', 'E3E3', 'E3E4', 'E2E3', 'E2E4', 'E2E2'], dtype='object')


#### Export Dataset

In [13]:
#report the column count of the processed MRI table (shape[1] counts every column, including Key and Diagnosis)
print(f"Number of features in MRI data: {mri_data.shape[1]}")

Number of features in Genotype data: 147


In [14]:
#write the processed table to disk without the row index
output_csv_path = '../data/MRI_data_0.csv'
mri_data.to_csv(output_csv_path, index=False)