In [44]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import seaborn as sns

### 0. Data Pre-processing and Cleaning

In [45]:
# Load the raw table from CSV and preview its last five rows.
# NOTE(review): the variable is named `mri_data`, but the file name and the
# retention-time/m/z style column headers suggest serum mass-spec data rather
# than MRI measurements — confirm and consider renaming.
mri_data = pd.read_csv('../data/ALZ_Serum_HPOS_MS_QI.csv')
mri_data.tail()

Unnamed: 0,Sample ID,2.68_380.3754m/z,2.69_915.9888m/z,2.69_388.9683m/z,2.67_379.2146n,2.68_797.9443m/z,2.70_952.9981m/z,2.70_952.4979m/z,2.70_1049.5313m/z,2.70_1049.0311m/z,...,4.13_137.0701m/z,0.83_213.1557m/z,1.02_168.0485m/z,0.65_188.1075n,0.65_334.1403m/z,0.87_197.1651m/z,0.62_431.1429m/z,1.02_111.0548m/z,0.64_439.2432m/z,0.69_228.2312m/z
551,DCR00254_1,-5343.086422,136915.9,689.957566,5048376.0,4062032.0,2022351.0,2339461.0,512672.171474,647137.2,...,369832.752278,41137.109938,18634.223988,32512.361263,14177.802602,228307.401671,23824.293119,14394.008887,9640.101222,8359.332405
552,LNDCTL062_1,-5343.086422,1300256.0,689.957566,3792428.0,2960368.0,606572.4,727584.5,144263.738337,457141.5,...,337662.901949,29664.604791,37861.412011,35236.729431,15554.207816,116256.670254,14979.311038,11739.437286,8159.33843,7855.094706
553,LNDCTL040_1,-5343.086422,2093258.0,689.957566,6520810.0,5065937.0,1868621.0,2095525.0,506866.774738,1243328.0,...,369619.862146,43902.513905,12860.481238,45172.479532,31633.529375,309234.524409,66130.98016,7595.513523,4725.221324,9177.674462
554,PRGMCI008_1,-5343.086422,-26976.52,689.957566,-195835.4,-109459.1,-56900.47,-54450.43,-17877.123983,-22196.27,...,450121.086723,25398.758253,4893.923706,68894.439755,69970.365023,122940.830067,8790.272244,8051.697391,9617.098716,9976.228777
555,THSADC059_1,-5343.086422,-30313.74,689.957566,2538868.0,1997351.0,1132647.0,1337412.0,255750.00717,345051.2,...,172534.429958,22300.871159,23314.149348,103576.257441,21690.175358,228686.09105,166441.547929,12542.942446,602.8607,5281.221739


In [46]:
# (number of rows, number of columns) of the loaded table
mri_data.shape

(556, 2803)

In [47]:
# Keep the 'Sample ID' column as a Series of per-row sample identifiers
mri_data_subject_id = mri_data['Sample ID']

In [48]:
# Collect the distinct sample IDs and print how many there are.
# Only one dataset is loaded in this notebook, so the "common" set is simply
# the set of IDs found in mri_data.
common_subject_id = set(mri_data_subject_id)
print(len(common_subject_id))

556


In [49]:
# Remove rows that are exact duplicates across every column
mri_data = mri_data.drop_duplicates()

#### Imputing Missing Values

In [50]:
# Total number of missing cells across the whole table
mri_data.isna().sum().sum()

0

In [51]:
def impute_missing_values(data):
    """Fill missing values in ``data`` in place and return it.

    Numeric columns are imputed with the column mean; object-dtype columns are
    imputed with their most frequent value. A 'Diagnosis' column found among
    the object columns is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after imputation.
    """
    continuous_columns = list(data.select_dtypes(include=np.number).columns)
    # NOTE(review): identifier columns stored as objects (e.g. 'Sample ID') are
    # also filled with the most frequent value — confirm this is intended.
    categorical_columns = [
        column
        for column in data.select_dtypes(include='object').columns
        if column != 'Diagnosis'
    ]
    # SimpleImputer rejects input with zero columns, so each imputer is only
    # created and run when its column group is non-empty.
    if continuous_columns:
        continuous_imputer = SimpleImputer(strategy='mean')
        data[continuous_columns] = continuous_imputer.fit_transform(data[continuous_columns])
    if categorical_columns:
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        data[categorical_columns] = categorical_imputer.fit_transform(data[categorical_columns])
    return data

In [52]:
# Impute, bind the returned frame explicitly, then display it
mri_data = impute_missing_values(mri_data)
mri_data

Unnamed: 0,Sample ID,2.68_380.3754m/z,2.69_915.9888m/z,2.69_388.9683m/z,2.67_379.2146n,2.68_797.9443m/z,2.70_952.9981m/z,2.70_952.4979m/z,2.70_1049.5313m/z,2.70_1049.0311m/z,...,4.13_137.0701m/z,0.83_213.1557m/z,1.02_168.0485m/z,0.65_188.1075n,0.65_334.1403m/z,0.87_197.1651m/z,0.62_431.1429m/z,1.02_111.0548m/z,0.64_439.2432m/z,0.69_228.2312m/z
0,LDZADC003_1,-5343.086422,4.961608e+05,689.957566,4.165335e+06,3.239187e+06,1.507358e+06,1.661309e+06,323592.593372,4.823460e+05,...,393664.153580,40421.342612,11492.507455,37427.814879,16943.014873,236263.920853,37247.283066,13116.363902,9183.975960,7517.977109
1,LNDCTL019_1,-5343.086422,3.091269e+06,689.957566,4.922810e+06,3.960377e+06,2.063693e+06,2.411039e+06,537330.250341,7.119607e+05,...,293597.842686,27042.054497,4628.020381,86915.265353,64589.239433,124755.384165,42865.306301,4186.451032,14126.296802,6374.237972
2,THSADC057_1,-5343.086422,1.300124e+04,689.957566,3.119642e+06,2.535609e+06,1.437754e+06,1.695825e+06,296889.479747,3.947683e+05,...,63277.410685,20068.142534,5181.682363,37869.375682,15667.651704,114509.226227,13540.556249,5280.825814,10589.255564,8286.479816
3,PRGMCI017_1,-5343.086422,-2.941098e+04,689.957566,-2.007562e+05,-1.121781e+05,-5.667722e+04,-5.433716e+04,-17877.123983,-2.219627e+04,...,438856.964682,11780.319484,2777.435863,32917.842347,27062.824928,92127.443326,14035.076910,3758.132384,10815.570700,6950.781350
4,KPOMCI041_1,-5343.086422,-2.733246e+04,689.957566,-2.008044e+05,-1.116523e+05,-5.690047e+04,-5.424606e+04,-17877.123983,-2.219627e+04,...,475298.395722,20981.667034,7489.280877,51721.062925,113655.969459,112337.126210,28996.179836,7293.390730,15974.144747,15717.884654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551,DCR00254_1,-5343.086422,1.369159e+05,689.957566,5.048376e+06,4.062032e+06,2.022351e+06,2.339461e+06,512672.171474,6.471372e+05,...,369832.752278,41137.109938,18634.223988,32512.361263,14177.802602,228307.401671,23824.293119,14394.008887,9640.101222,8359.332405
552,LNDCTL062_1,-5343.086422,1.300256e+06,689.957566,3.792428e+06,2.960368e+06,6.065724e+05,7.275845e+05,144263.738337,4.571415e+05,...,337662.901949,29664.604791,37861.412011,35236.729431,15554.207816,116256.670254,14979.311038,11739.437286,8159.338430,7855.094706
553,LNDCTL040_1,-5343.086422,2.093258e+06,689.957566,6.520810e+06,5.065937e+06,1.868621e+06,2.095525e+06,506866.774738,1.243328e+06,...,369619.862146,43902.513905,12860.481238,45172.479532,31633.529375,309234.524409,66130.980160,7595.513523,4725.221324,9177.674462
554,PRGMCI008_1,-5343.086422,-2.697652e+04,689.957566,-1.958354e+05,-1.094591e+05,-5.690047e+04,-5.445043e+04,-17877.123983,-2.219627e+04,...,450121.086723,25398.758253,4893.923706,68894.439755,69970.365023,122940.830067,8790.272244,8051.697391,9617.098716,9976.228777


In [53]:
# Re-count missing cells after imputation and print the total
print(mri_data.isnull().sum().sum())

0


#### Scaling Continuous Values

In [54]:
def scale_data(data):
    """Min-max scale the numeric columns of ``data`` in place and return it.

    Numeric columns named 'Age', 'Visit' or 'Month' keep their original values.
    """
    columns_to_scale = data.select_dtypes(include=np.number).columns
    # These columns are numeric but are deliberately left unscaled
    for excluded in ('Age', 'Visit', 'Month'):
        if excluded in columns_to_scale:
            columns_to_scale = columns_to_scale.drop([excluded])
    min_max = MinMaxScaler()
    scaled_values = min_max.fit_transform(data[columns_to_scale])
    data[columns_to_scale] = scaled_values
    return data

In [55]:
# Min-max scale the numeric columns (scale_data also mutates its argument)
mri_data = scale_data(mri_data)

#### Label Encode Categorical Features

In [56]:
def label_encode(data):
    """Integer-encode the object-dtype columns of ``data`` in place.

    'Sample ID' is never encoded. When 'PC1' is an object column, whichever of
    'PC1' .. 'PC10' are present among the object columns are skipped as well.
    Every remaining object column is replaced by the integer codes returned by
    ``Series.factorize``; the column name and the index of unique values (the
    code -> value mapping) are printed for each one.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after encoding.
    """
    pca_columns = [f'PC{i}' for i in range(1, 11)]
    categorical_columns = data.select_dtypes(include='object').columns
    # Identifier and PCA columns must not be turned into category codes
    if 'Sample ID' in categorical_columns:
        categorical_columns = categorical_columns.drop('Sample ID')
    if 'PC1' in categorical_columns:
        # errors='ignore': a frame holding only some of PC1..PC10 as object
        # columns must not raise KeyError for the absent ones.
        categorical_columns = categorical_columns.drop(pca_columns, errors='ignore')
    # Encode each remaining column and print its mapping
    for column in categorical_columns:
        print(column)
        codes, mapping_index = data[column].factorize()
        data[column] = codes
        print(mapping_index)
    return data

In [57]:
# Integer-encode the remaining object columns; the mapping of each encoded
# column is printed by label_encode
mri_data = label_encode(mri_data)

#### Export Dataset

In [59]:
# Print the column count of the processed table. shape[1] counts every column,
# including the 'Sample ID' identifier. The label names the metabolic dataset
# that this notebook reads and exports (it previously said "Genotype").
print(f"Number of features in Metabolic data: {mri_data.shape[1]}")

Number of features in Genotype data: 2803


In [60]:
# Write the processed table to CSV without the DataFrame row index
mri_data.to_csv('../data/Metabolic_data_0.csv', index=False)