In [1]:
#Importing all packages needed 
#1) Fundamental
import pandas as pd
import numpy as np

#2) Preprocessing 
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
import imblearn

#3) Model 
from sklearn.ensemble import RandomForestClassifier

#4) Validation  
from sklearn.model_selection import LeaveOneOut

#5) Performance metrics  
from sklearn.metrics import roc_auc_score, confusion_matrix,accuracy_score

#6) Custom Functions
#!pip install ipynb
from ipynb.fs.full.Functions import *


import warnings
warnings.filterwarnings("ignore")

# 1. Dataset 1 - Battery Tests

In [2]:
#Reading in the dataset
#The path is handed straight to pandas so that pandas opens and closes the file itself
#(wrapping it in open(...) left the file handle unclosed)
data=pd.read_excel("biomarkers.xlsx")
#Removing the groups that are not needed: only rows with a biodiag code below 5 are kept
data=data[data['biodiag'] < 5]

#Taking the columns that are needed:
#demographics, the group label (biodiag), the biomarkers and the battery tests
data_class=data[[
    #Demographics and group label
    "edad",
    "años_escol",
    "Sexo",
    "biodiag",
    #Biomarkers
    "APOE",
    "t_tau",
    "p_tau",
    "AB_42_TUB_B",
    #Battery tests
    "buschke_AL",
    "buschke_AT",
    "buschke_RDL",
    "buschke_RDT",
    "tam",
    "vis_cerad",
    "paisajes_tot",
    "bnt",
    "flu_anim",
    "compren",
    "ideom",
    "prax_cerad",
    "tdp",
    "VOSP_num",
    "VOSP_letras",
    "tmtA",
    "tmtB",
    "fas_total",
    "Stroop_lect",
    "Stroop_color",
    "Stroop_I",
    "Clave_num",
]]

# A. Creating Baseline Estimate

The two groups in the classification that we are interested in, *Preclinical* vs *Control*, are by definition separated by the values of the biomarkers, namely phosphorylated tau (*p_tau*) and Beta-amyloid (*AB_42_TUB_B*).

Therefore to have a benchmark of the classification, we use these aforementioned variables as features to make the classification between the two groups *Control* and *Preclinical*. 

For completeness we also report the classification of the other groups in this dataset, in particular, *Prodromal(MCI)* vs *Control*.

It seems our model is effective, with an accuracy of 0.971 and an AUC of 0.940. The sensitivity is 0.88, meaning that the model correctly identifies 88% of the patients with preclinical Alzheimer's Disease. The specificity, on the other hand, is 1, meaning that every *Control* subject is correctly classified as a control.

In [26]:
#Baseline dataset: the group label (biodiag) together with the two biomarkers
#phosphorylated tau (p_tau) and Beta-amyloid (AB_42_TUB_B)
data_baseline=data_class[["biodiag","p_tau","AB_42_TUB_B"]]

#One entry per comparison: (biodiag codes to keep, accuracy/auc message, sensitivity/specificity message)
#Group codes: control(1), preclinical(2), prodromal(3)
baseline_comparisons=[
    #1)Preclinical vs Control
    ([1,2],
     "For control vs AD preclinical the accuracy is %.3f and the auc is %.3f",
     "For control vs AD preclinical the sensitivity is %.3f and the specificity is %.3f"),
    #2)Prodromal(MCI) vs Control
    ([1,3],
     "For control vs AD prodromal the accuracy is %.3f and the auc is %.3f",
     "For control vs AD prodromal the sensitivity is %.3f and the specificity is %.3f"),
]

for group_codes,performance_msg,detection_msg in baseline_comparisons:
    #Keeping only the subjects that belong to the two groups being compared
    data_model=data_baseline.loc[data_baseline['biodiag'].isin(group_codes)]
    #Calling the custom function data_prep to further prepare the dataset:
    #it one hot encodes the group variable (biodiag) and splits the data into X and y arrays
    X,y,coulmnNames=data_prep(data_model)
    #Calling the custom function featureimportance to run the model
    classifier=RandomForestClassifier(random_state=1)
    acc, roc_auc, featureImportance,sensitivity,specificity=featureimportance(X,y,classifier,MinMaxScaler,KNNImputer)
    #Reporting the metrics and the importance of each feature
    print(performance_msg % (acc, roc_auc))
    print(detection_msg % (sensitivity, specificity))
    print_importance(featureImportance,coulmnNames)

For control vs AD preclinical the accuracy is 0.971 and the auc is 0.940
For control vs AD preclinical the sensitivity is 0.880 and the specificity is 1.000
       feature  importance
1  AB_42_TUB_B    0.738576
0        p_tau    0.261424
For control vs AD prodromal the accuracy is 1.000 and the auc is 1.000
For control vs AD prodromal the sensitivity is 1.000 and the specificity is 1.000
       feature  importance
1  AB_42_TUB_B    0.682178
0        p_tau    0.317822


# B. Battery Tests

Next we run the model on the battery tests to see if there are any effective tests at differentiating between the groups *Preclinical* and *Control*

We find that with an accuracy of 0.771 and an AUC of 0.561, the model is not successful at distinguishing between *Preclinical* subjects and healthy people. The sensitivity is also very low (0.160), indicating that the features are not sensitive enough to detect *Preclinical* subjects.

The model is however successful for the classification of prodromal/MCI subjects and control, with an accuracy of 0.919 and AUC of 0.894.

In [4]:
#Preparing the data
#The biomarkers are removed so that only the demographics and battery tests are used by this model
feat_select=data_class.drop(columns=["APOE","t_tau","p_tau","AB_42_TUB_B"])

#One entry per comparison:
#(biodiag codes to keep, random forest settings, accuracy/auc message, sensitivity/specificity message)
#Group codes: control(1), preclinical(2), prodromal(3)
#NOTE(review): only the preclinical comparison sets min_samples_leaf=2 - confirm this is intentional
battery_comparisons=[
    #1)Preclinical vs Control
    ([1,2],
     {"random_state":1,"min_samples_leaf":2},
     "For control vs AD preclinical the accuracy is %.3f and the auc is %.3f",
     "For control vs AD preclinical the sensitivity is %.3f and the specificity is %.3f"),
    #2)Prodromal vs Control
    ([1,3],
     {"random_state":1},
     "For control vs AD prodromal the accuracy is %.3f and the auc is %.3f",
     "For control vs AD prodromal the sensitivity is %.3f and the specificity is %.3f"),
]

for group_codes,forest_settings,performance_msg,detection_msg in battery_comparisons:
    #Keeping only the subjects that belong to the two groups being compared
    data_model=feat_select.loc[feat_select['biodiag'].isin(group_codes)]
    #Calling the custom function data_prep to further prepare the dataset:
    #it one hot encodes the group variable (biodiag) and splits the data into X and y arrays
    X,y,coulmnNames=data_prep(data_model)
    #Calling the custom function featureimportance to run the model
    classifier=RandomForestClassifier(**forest_settings)
    acc, roc_auc, featureImportance,sensitivity,specificity=featureimportance(X,y,classifier,MinMaxScaler,KNNImputer)
    #Reporting the metrics and the importance of each feature
    print(performance_msg % (acc, roc_auc))
    print(detection_msg % (sensitivity, specificity))
    print_importance(featureImportance,coulmnNames)

For control vs AD preclinical the accuracy is 0.771 and the auc is 0.561
For control vs AD preclinical the sensitivity is 0.160 and the specificity is 0.963
         feature  importance
22  Stroop_color    0.086022
5    buschke_RDL    0.085814
18          tmtA    0.063972
4     buschke_AT    0.059640
3     buschke_AL    0.059539
10           bnt    0.058305
0           edad    0.056946
7            tam    0.056600
9   paisajes_tot    0.053669
19          tmtB    0.048528
21   Stroop_lect    0.047191
20     fas_total    0.041240
24     Clave_num    0.040516
23      Stroop_I    0.040143
11      flu_anim    0.034117
1     años_escol    0.030248
16      VOSP_num    0.029152
8      vis_cerad    0.029130
6    buschke_RDT    0.028984
17   VOSP_letras    0.015064
15           tdp    0.014456
14    prax_cerad    0.013628
2           Sexo    0.006559
12       compren    0.000537
13         ideom    0.000000
For control vs AD prodromal the accuracy is 0.919 and the auc is 0.894
For control vs AD 

# 2. Dataset 2 - Tapping Features

In [28]:
#Reading in the dataset
data=pd.read_spss("FTT.sav")

#Columns that are needed: group label, biomarkers, tapping features, demographics and battery tests
selected_columns=[
    'groups',
    'P_TAU','AB42',
    'FTT_rate','FTT_variability_total_Log10',
    'age','gender','education',
    'AL','AT','RD','RDT','BNT','ANIMALES','VOSP_NUM',
    'TMT_A','TMT_B','STROOP_COLOR','STROOP_INT','SDMT',
    "PL_FTT_time_interval",
]
#Numbering of the groups, chosen to match dataset 1
group_codes={"grupo control":1, "grupo preclinico":2, "grupo con EA":3}
#Binary coding of gender
gender_codes={"male":1, "female":0}

#The group column is renamed to biodiag so that it matches dataset 1
data_class=data[selected_columns].rename(columns={'groups': 'biodiag'})
#Replacing the group names and the gender labels with their numeric codes
data_class['biodiag']=data_class['biodiag'].replace(group_codes)
data_class['gender']=data_class['gender'].replace(gender_codes)

# A. Creating Baseline Estimate

The two groups in the classification that we are interested in, *Preclinical* vs *Control*, are by definition separated by the values of the biomarkers, namely phosphorylated tau (*P_TAU*) and Beta-amyloid (*AB42*).

Therefore to have a benchmark of the classification, we use these aforementioned variables as features to make the classification between the two groups *Control* and *Preclinical*. 

Unlike the previous dataset 1, we do not have the groups for *Prodromal(MCI)* and thus cannot perform that classification.

It seems our model is effective, with an accuracy of 0.965 and an AUC of 0.961. The sensitivity is 0.95, meaning that the model correctly identifies 95% of the patients with preclinical Alzheimer's Disease. The specificity is higher, at 0.973; this is likely because the dataset is imbalanced, with a greater number of *Control* subjects.

In [29]:
#Preparing the data
#Baseline dataset: the group label (biodiag) together with the two biomarkers
#phosphorylated tau (P_TAU) and Beta-amyloid (AB42)
baseline_columns=["biodiag","P_TAU","AB42"]
data_baseline=data_class[baseline_columns]

#1)Preclinical vs Control
#Only the subjects in the control(1) and preclinical(2) groups are kept
in_compared_groups=data_baseline['biodiag'].isin([1,2])
data_model=data_baseline.loc[in_compared_groups]
#Calling the custom function data_prep to further prepare the dataset:
#it one hot encodes the group variable (biodiag) and splits the data into X and y arrays
X,y,coulmnNames=data_prep(data_model)
#Calling the custom function featureimportance to run the model
classifier=RandomForestClassifier(random_state=1)
model_results=featureimportance(X,y,classifier,MinMaxScaler,KNNImputer)
acc, roc_auc, featureImportance,sensitivity,specificity=model_results
#Reporting the metrics and the importance of each feature
print("For control vs AD preclinical the accuracy is %.3f and the auc is %.3f" % (acc, roc_auc))
print("For control vs AD preclinical the sensitivity is %.3f and the specificity is %.3f" % (sensitivity, specificity))
print_importance(featureImportance,coulmnNames)


For control vs AD preclinical the accuracy is 0.965 and the auc is 0.961
For control vs AD preclinical the sensitivity is 0.950 and the specificity is 0.973
  feature  importance
1    AB42    0.825638
0   P_TAU    0.174362


# B. Battery Tests

Next we run the model on the battery tests to see if there are any effective tests at differentiating between the groups *Preclinical* and *Control*

Similar to dataset 1, we find that battery tests are not sensitive enough at detecting *Preclinical* subjects, with an accuracy of 0.579 and an AUC of 0.480. The sensitivity is also extremely low at 0.150.

In [30]:
#Preparing the data
#The biomarkers and the tapping features are removed, leaving the demographics and the battery tests
non_battery_columns=["P_TAU","AB42","FTT_variability_total_Log10","FTT_rate","PL_FTT_time_interval"]
feat_select=data_class.drop(columns=non_battery_columns)

#1)Preclinical vs Control
#Only the subjects in the control(1) and preclinical(2) groups are kept
in_compared_groups=feat_select['biodiag'].isin([1,2])
data_model=feat_select.loc[in_compared_groups]
#Calling the custom function data_prep to further prepare the dataset:
#it one hot encodes the group variable (biodiag) and splits the data into X and y arrays
X,y,coulmnNames=data_prep(data_model)
#Calling the custom function featureimportance to run the model
classifier=RandomForestClassifier(random_state=1)
model_results=featureimportance(X,y,classifier,MinMaxScaler,KNNImputer)
acc, roc_auc, featureImportance,sensitivity,specificity=model_results
#Reporting the metrics and the importance of each feature
print("For control vs AD preclinical the accuracy is %.3f and the auc is %.3f" % (acc, roc_auc))
print("For control vs AD preclinical the sensitivity is %.3f and the specificity is %.3f" % (sensitivity, specificity))
print_importance(featureImportance,coulmnNames)


For control vs AD preclinical the accuracy is 0.579 and the auc is 0.480
For control vs AD preclinical the sensitivity is 0.150 and the specificity is 0.811
         feature  importance
12  STROOP_COLOR    0.113479
0            age    0.106899
11         TMT_B    0.098043
4             AT    0.076416
14          SDMT    0.075178
10         TMT_A    0.073879
13    STROOP_INT    0.073269
3             AL    0.062477
2      education    0.058677
8       ANIMALES    0.057102
6            RDT    0.055249
5             RD    0.051456
7            BNT    0.048297
9       VOSP_NUM    0.039486
1         gender    0.010092


# C. Tapping Features

Next we run the model on the tapping features to see if there are any effective tests at differentiating between the groups *Preclinical* and *Control*

We see a much better performance, with an accuracy of 0.772 and an AUC of 0.721, indicating that the tapping features are more successful at identifying the *Preclinical* subjects. The sensitivity (0.550) is still low, however, which we attribute to the imbalanced dataset; we need to correct this imbalance to improve the performance. We see that the interval between taps was the most important feature, while the rate was the least important feature.

In [31]:
#Preparing the data
#Only the group label (biodiag) and the three tapping features are used in this model
tapping_columns=["biodiag","FTT_variability_total_Log10","PL_FTT_time_interval","FTT_rate"]
feat_select=data_class[tapping_columns]

#1)Preclinical vs Control
#Only the subjects in the control(1) and preclinical(2) groups are kept
in_compared_groups=feat_select['biodiag'].isin([1,2])
data_model=feat_select.loc[in_compared_groups]
#Calling the custom function data_prep to further prepare the dataset:
#it one hot encodes the group variable (biodiag) and splits the data into X and y arrays
X,y,coulmnNames=data_prep(data_model)
#Calling the custom function featureimportance to run the model
classifier=RandomForestClassifier(random_state=1)
model_results=featureimportance(X,y,classifier,MinMaxScaler,KNNImputer)
acc, roc_auc, featureImportance,sensitivity,specificity=model_results
#Reporting the metrics and the importance of each feature
print("For control vs AD preclinical the accuracy is %.3f and the auc is %.3f" % (acc, roc_auc))
print("For control vs AD preclinical the sensitivity is %.3f and the specificity is %.3f" % (sensitivity, specificity))
print_importance(featureImportance,coulmnNames)


For control vs AD preclinical the accuracy is 0.772 and the auc is 0.721
For control vs AD preclinical the sensitivity is 0.550 and the specificity is 0.892
                       feature  importance
1         PL_FTT_time_interval    0.427960
0  FTT_variability_total_Log10    0.300289
2                     FTT_rate    0.271751
