In [129]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
import seaborn as sns

### 0. Data Pre-processing and Cleaning

In [147]:
# Load the genotype, proteomics and gene-expression tables from CSV
genotype_data, proteomics_data, gene_expression_data = map(
    pd.read_csv,
    ('data/genotype.csv', 'data/proteomics.csv', 'data/gene-expression.csv'),
)

In [148]:
# Pull the Subject_ID column out of each table
genotype_data_subject_id = genotype_data.loc[:, 'Subject_ID']
proteomics_data_subject_id = proteomics_data.loc[:, 'Subject_ID']
gene_expression_data_subject_id = gene_expression_data.loc[:, 'Subject_ID']

In [149]:
# Subject IDs that occur in all three tables
common_subject_id = set(genotype_data_subject_id).intersection(
    proteomics_data_subject_id,
    gene_expression_data_subject_id,
)
print(len(common_subject_id))

314


In [150]:
# Keep only the rows whose Subject_ID occurs in every dataset.
# .copy() makes each filtered table an independent frame, so the column
# assignments performed by the later preprocessing steps do not write into a
# slice of the originally loaded frame (avoids SettingWithCopyWarning).
genotype_data = genotype_data[genotype_data['Subject_ID'].isin(common_subject_id)].copy()
proteomics_data = proteomics_data[proteomics_data['Subject_ID'].isin(common_subject_id)].copy()
gene_expression_data = gene_expression_data[gene_expression_data['Subject_ID'].isin(common_subject_id)].copy()

#### Imputing Missing Values

In [151]:
def impute_missing_values(data):
    """Fill missing values column-wise and return the imputed frame.

    Numeric columns are filled with the column mean; object-dtype columns are
    filled with the column's most frequent value.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to impute. It is not modified; a copy is imputed instead.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with its numeric and object-dtype columns imputed.
    """
    # Work on a copy so a filtered slice handed in by the caller is never
    # written to in place.
    data = data.copy()
    continuous_columns = data.select_dtypes(include=np.number).columns
    categorical_columns = data.select_dtypes(include='object').columns
    # An imputer cannot be fitted on zero columns, so skip a group entirely
    # when the frame has no column of that kind.
    if len(continuous_columns) > 0:
        continuous_imputer = SimpleImputer(strategy='mean')
        data[continuous_columns] = continuous_imputer.fit_transform(data[continuous_columns])
    if len(categorical_columns) > 0:
        categorical_imputer = SimpleImputer(strategy='most_frequent')
        data[categorical_columns] = categorical_imputer.fit_transform(data[categorical_columns])
    return data

In [152]:
# Impute each table in turn (genotype, proteomics, gene expression)
genotype_data, proteomics_data, gene_expression_data = (
    impute_missing_values(frame)
    for frame in (genotype_data, proteomics_data, gene_expression_data)
)

In [153]:
# Total count of remaining missing cells per table, one number per line
print(
    *(
        frame.isna().sum().sum()
        for frame in (genotype_data, proteomics_data, gene_expression_data)
    ),
    sep='\n',
)

0
0
0


#### Scaling Continuous Values

In [156]:
def scale_data(data):
    """Min-max scale every numeric column except ``Age``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to scale. It is not modified; a copy is scaled instead.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose numeric columns (other than ``Age``) have been
        passed through a default-configured ``MinMaxScaler``.
    """
    # Work on a copy so the caller's frame is never written to in place.
    data = data.copy()
    continuous_columns = data.select_dtypes(include=np.number).columns
    # Age stays on its original scale; errors='ignore' covers tables that
    # have no Age column.
    continuous_columns = continuous_columns.drop(['Age'], errors='ignore')
    # A scaler cannot be fitted on zero columns, so skip scaling when nothing
    # numeric is left.
    if len(continuous_columns) > 0:
        scaler = MinMaxScaler()
        data[continuous_columns] = scaler.fit_transform(data[continuous_columns])
    return data

In [157]:
# Scale the numeric columns of each table
genotype_data = genotype_data.pipe(scale_data)
proteomics_data = proteomics_data.pipe(scale_data)
gene_expression_data = gene_expression_data.pipe(scale_data)

#### Label Encode Categorical Features

In [158]:
def label_encode(data):
    """Integer-encode object-dtype columns and print each column's mapping.

    ``Subject_ID`` and the ``PC1`` .. ``PC10`` columns are never encoded, even
    when they have object dtype. For every encoded column the column name and
    the Index of its unique values are printed; the value at position ``i`` of
    that Index is the one encoded as ``i``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to encode. It is not modified; a copy is encoded instead.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the selected columns replaced by integer codes.
    """
    # Work on a copy so the caller's frame is never written to in place.
    data = data.copy()
    categorical_columns = data.select_dtypes(include='object').columns
    # Identifier and principal-component columns must keep their values.
    # errors='ignore' lets this work when only some (or none) of them are
    # object dtype, instead of raising KeyError for the absent labels.
    excluded_columns = ['Subject_ID'] + [f'PC{i}' for i in range(1, 11)]
    categorical_columns = categorical_columns.drop(excluded_columns, errors='ignore')
    #encode categorical data but print the mapping
    for column in categorical_columns:
        print(column)
        codes, mapping_index = data[column].factorize()
        data[column] = codes
        print(mapping_index)
    return data

In [159]:
# Label-encode the categorical columns of each table, in order
genotype_data, proteomics_data, gene_expression_data = map(
    label_encode,
    (genotype_data, proteomics_data, gene_expression_data),
)

Diagnosis
Index(['AD', 'CTL', 'MCI'], dtype='object')
Sex
Index(['Male', 'Female'], dtype='object')
APOE
Index(['E4E4', 'E3E4', 'E3E3', 'E2E3', 'E2E4'], dtype='object')
Diagnosis
Index(['AD', 'CTL', 'MCI'], dtype='object')
Sex
Index(['Male', 'Female'], dtype='object')
APOE
Index(['E4E4', 'E3E4', 'E3E3', 'E2E3', 'E2E4'], dtype='object')


#### Merge Datasets

In [160]:
# Column names shared by all three tables
common_columns = set(genotype_data.columns).intersection(
    proteomics_data.columns,
    gene_expression_data.columns,
)
print(common_columns)

{'Subject_ID', 'Sex'}


In [161]:
# Report how many columns each table has
print(f"Number of features in Genotype data: {len(genotype_data.columns)}")
print(f"Number of features in Proteomics data: {len(proteomics_data.columns)}")
print(f"Number of features in Gene Expression data: {len(gene_expression_data.columns)}")

Number of features in Genotype data: 12
Number of features in Proteomics data: 1022
Number of features in Gene Expression data: 5218


In [162]:
# "Sex" is kept only in the genotype table; remove it from the other two
proteomics_data = proteomics_data.drop(columns='Sex')
gene_expression_data = gene_expression_data.drop(columns='Sex')

In [163]:
# Join the three tables on Subject_ID (default inner join), genotype first
data = (
    genotype_data
    .merge(proteomics_data, on='Subject_ID')
    .merge(gene_expression_data, on='Subject_ID')
)

In [181]:
# Write the merged table to disk without the index column
data.to_csv(path_or_buf='data/gene_prot_data.csv', index=False)

In [177]:
# Reload the merged table from the CSV written to disk
data = pd.read_csv(filepath_or_buffer='data/gene_prot_data.csv')

In [179]:
# Remove rows that are exact repeats of an earlier row
data = data[~data.duplicated()]

In [175]:
# When a Subject_ID occurs more than once, keep only its first row
data = data[~data.duplicated(subset='Subject_ID', keep='first')]

In [180]:
# Show the current contents of `data` as the cell output
data

Unnamed: 0,Subject_ID,Diagnosis,Age,APOE,MMSE,Sex,PC1,PC2,PC3,PC4,...,xu.S6OENiSSCDruXFc,xu3n8ix_RpCpNPRKhc,xuBm5ckgidWRNTl.gQ,xuNOUeR1JerhIuIV7c,xud4v2r3nXECBua55o,xvHfVEL1fC35XJ7neU,xve7nuGHtAizelOmhE,xvrrv4q_nIDgJej.uU,xws9e3UChad1OnXmXY,xz1S1tKD.sgqfTuesU
0,DCR00025,0,75.000000,0,0.746982,0,0.0088,0.0276,0.0132,0.0005,...,0.010746,0.424735,0.398869,0.066384,0.321479,0.149548,0.095191,0.004099,0.253679,0.000000
1,DCR00028,0,88.000000,1,0.746982,0,0.0114,0.0284,0.0192,-0.0095,...,0.239652,0.297554,0.341023,0.071577,0.429470,0.233205,0.268920,0.158143,0.277040,0.361951
2,DCR00031,0,75.461386,0,0.746982,0,0.0097,28.0000,13.0000,-0.0234,...,0.058596,0.438859,0.216219,0.599177,0.179004,0.193932,0.098406,0.122535,0.141998,0.263429
3,DCR00032,0,75.461386,0,0.746982,1,0.0076,28.0000,9.0000,-0.0011,...,0.287620,0.579192,0.211476,0.200429,0.001667,0.218742,0.263119,0.261860,0.221761,0.162624
4,DCR00037,0,75.461386,2,0.746982,1,0.0097,0.0244,18.0000,-0.0043,...,0.316210,0.422101,0.247895,0.154150,0.334002,0.184145,0.304057,0.309619,0.165162,0.385041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,THSMCI061,2,67.000000,1,0.818182,1,0.0073,-0.0539,0.0073,0.0216,...,0.589071,0.961699,0.622341,0.796417,0.776029,0.823258,0.585026,0.371426,0.803298,0.525950
513,THSMCI064,2,80.000000,1,0.909091,0,0.0135,-0.0574,-0.0044,-2.0000,...,0.663432,0.933129,0.629757,0.826243,0.851986,0.937715,0.710167,0.549367,0.774187,0.639621
514,THSMCI064,0,80.000000,1,0.636364,0,0.0135,-0.0574,-0.0044,-2.0000,...,0.663432,0.933129,0.629757,0.826243,0.851986,0.937715,0.710167,0.549367,0.774187,0.639621
515,THSMCI065,2,76.000000,2,0.954545,0,0.0011,-0.0498,0.0126,0.0303,...,0.659313,0.911345,0.773895,0.843128,0.824156,0.783705,0.709999,0.510799,0.855688,0.716505


In [126]:
# Write the current `data` frame to disk without the index column
data.to_csv(path_or_buf='data/combined_data_subset.csv', index=False)

In [182]:
# Show the current contents of `data` as the cell output
data

Unnamed: 0,Subject_ID,Diagnosis,Age,APOE,MMSE,Sex,PC1,PC2,PC3,PC4,...,xu.S6OENiSSCDruXFc,xu3n8ix_RpCpNPRKhc,xuBm5ckgidWRNTl.gQ,xuNOUeR1JerhIuIV7c,xud4v2r3nXECBua55o,xvHfVEL1fC35XJ7neU,xve7nuGHtAizelOmhE,xvrrv4q_nIDgJej.uU,xws9e3UChad1OnXmXY,xz1S1tKD.sgqfTuesU
0,DCR00025,0,75.000000,0,0.746982,0,0.0088,0.0276,0.0132,0.0005,...,0.010746,0.424735,0.398869,0.066384,0.321479,0.149548,0.095191,0.004099,0.253679,0.000000
1,DCR00028,0,88.000000,1,0.746982,0,0.0114,0.0284,0.0192,-0.0095,...,0.239652,0.297554,0.341023,0.071577,0.429470,0.233205,0.268920,0.158143,0.277040,0.361951
2,DCR00031,0,75.461386,0,0.746982,0,0.0097,28.0000,13.0000,-0.0234,...,0.058596,0.438859,0.216219,0.599177,0.179004,0.193932,0.098406,0.122535,0.141998,0.263429
3,DCR00032,0,75.461386,0,0.746982,1,0.0076,28.0000,9.0000,-0.0011,...,0.287620,0.579192,0.211476,0.200429,0.001667,0.218742,0.263119,0.261860,0.221761,0.162624
4,DCR00037,0,75.461386,2,0.746982,1,0.0097,0.0244,18.0000,-0.0043,...,0.316210,0.422101,0.247895,0.154150,0.334002,0.184145,0.304057,0.309619,0.165162,0.385041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,THSMCI061,2,67.000000,1,0.818182,1,0.0073,-0.0539,0.0073,0.0216,...,0.589071,0.961699,0.622341,0.796417,0.776029,0.823258,0.585026,0.371426,0.803298,0.525950
513,THSMCI064,2,80.000000,1,0.909091,0,0.0135,-0.0574,-0.0044,-2.0000,...,0.663432,0.933129,0.629757,0.826243,0.851986,0.937715,0.710167,0.549367,0.774187,0.639621
514,THSMCI064,0,80.000000,1,0.636364,0,0.0135,-0.0574,-0.0044,-2.0000,...,0.663432,0.933129,0.629757,0.826243,0.851986,0.937715,0.710167,0.549367,0.774187,0.639621
515,THSMCI065,2,76.000000,2,0.954545,0,0.0011,-0.0498,0.0126,0.0303,...,0.659313,0.911345,0.773895,0.843128,0.824156,0.783705,0.709999,0.510799,0.855688,0.716505
