In [2]:
import pandas as pd
from scipy.io import arff
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from sklearn import preprocessing
import numpy as np

from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
#import xgboost as xgb
from sklearn import metrics
from sklearn.metrics import mean_squared_error

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls


FOLDS =10
%matplotlib inline

In [3]:
data = 'oasis_longitudinal.csv'
df = pd.read_csv (data)
df.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [4]:
df.describe()

Unnamed: 0,Visit,MR Delay,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
count,373.0,373.0,373.0,373.0,354.0,371.0,373.0,373.0,373.0,373.0
mean,1.882038,595.104558,77.013405,14.597855,2.460452,27.342318,0.290885,1488.128686,0.729568,1.195461
std,0.922843,635.485118,7.640957,2.876339,1.134005,3.683244,0.374557,176.139286,0.037135,0.138092
min,1.0,0.0,60.0,6.0,1.0,4.0,0.0,1106.0,0.644,0.876
25%,1.0,0.0,71.0,12.0,2.0,27.0,0.0,1357.0,0.7,1.099
50%,2.0,552.0,77.0,15.0,2.0,29.0,0.0,1470.0,0.729,1.194
75%,2.0,873.0,82.0,16.0,3.0,30.0,0.5,1597.0,0.756,1.293
max,5.0,2639.0,98.0,23.0,5.0,30.0,2.0,2004.0,0.837,1.587


In [5]:
nu = pd.DataFrame(df['Group']=='Nondemented')
nu["Group"].value_counts() 

True     190
False    183
Name: Group, dtype: int64

In [6]:
df['Group'] = df['Group'].replace(['Converted'], ['Demented'])
df.head(3)

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046


In [7]:
df.drop(['Subject ID'], axis = 1, inplace = True, errors = 'ignore')
df.drop(['MRI ID'], axis = 1, inplace = True, errors = 'ignore')
df.drop(['Visit'], axis = 1, inplace = True, errors = 'ignore')
#for this study the CDR we eliminated it
df.drop(['CDR'], axis = 1, inplace = True, errors = 'ignore')
df.head(3)

Unnamed: 0,Group,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF
0,Nondemented,0,M,R,87,14,2.0,27.0,1987,0.696,0.883
1,Nondemented,457,M,R,88,14,2.0,30.0,2004,0.681,0.876
2,Demented,0,M,R,75,12,,23.0,1678,0.736,1.046


In [8]:
# 1 = Demented, 0 = Nondemented
df['Group'] = df['Group'].replace(['Converted'], ['Demented'])

df['Group'] = df['Group'].replace(['Demented', 'Nondemented'], [1,0])    
df.head(3)

Unnamed: 0,Group,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF
0,0,0,M,R,87,14,2.0,27.0,1987,0.696,0.883
1,0,457,M,R,88,14,2.0,30.0,2004,0.681,0.876
2,1,0,M,R,75,12,,23.0,1678,0.736,1.046


In [9]:
df['M/F'] = df['M/F'].replace(['M', 'F'], [1,0])  
df.head(3)

Unnamed: 0,Group,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF
0,0,0,1,R,87,14,2.0,27.0,1987,0.696,0.883
1,0,457,1,R,88,14,2.0,30.0,2004,0.681,0.876
2,1,0,1,R,75,12,,23.0,1678,0.736,1.046


In [10]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
encoder.fit(df.Hand.values)
list(encoder.classes_)
#Transoformamos
encoder.transform(df.Hand.values)
df[['Hand']]=encoder.transform(df.Hand.values)
encoder2=LabelEncoder()
encoder2.fit(df.Hand.values)
list(encoder2.classes_)

[0]

In [11]:
data_na = (df.isnull().sum() / len(df)) * 100
data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Lost proportion (%)' :round(data_na,2)})
missing_data.head(20)

Unnamed: 0,Lost proportion (%)
SES,5.09
MMSE,0.54


In [12]:
from sklearn.impute  import SimpleImputer
# We perform it with the most frequent value 
imputer = SimpleImputer ( missing_values = np.nan,strategy='most_frequent')

imputer.fit(df[['SES']])
df[['SES']] = imputer.fit_transform(df[['SES']])

# We perform it with the median
imputer = SimpleImputer ( missing_values = np.nan,strategy='median')

imputer.fit(df[['MMSE']])
df[['MMSE']] = imputer.fit_transform(df[['MMSE']])

In [13]:
from sklearn.impute  import SimpleImputer
# We perform it with the median
imputer = SimpleImputer ( missing_values = np.nan,strategy='median')

imputer.fit(df[['MMSE']])
df[['MMSE']] = imputer.fit_transform(df[['MMSE']])

In [14]:
from sklearn.preprocessing import StandardScaler
df_norm = df
scaler = StandardScaler()
df_norm[['Age','MR Delay','M/F','Hand','EDUC','SES','MMSE','eTIV','nWBV','ASF']]=scaler.fit_transform(df[['Age','MR Delay','M/F','Hand','EDUC','SES','MMSE','eTIV','nWBV','ASF']])

In [15]:
df_norm.head(3)

Unnamed: 0,Group,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF
0,0,-0.937715,1.153798,0.0,1.308738,-0.208132,-0.394466,-0.095686,2.836059,-0.905169,-2.265742
1,0,-0.217613,1.153798,0.0,1.439787,-0.208132,-0.394466,0.721664,2.932703,-1.309643,-2.316501
2,1,-0.937715,1.153798,0.0,-0.263856,-0.904394,-0.394466,-1.185486,1.079409,0.173429,-1.083784


In [16]:
df.drop(['Hand'], axis = 1, inplace = True, errors = 'ignore')
df.drop(['MR Delay'], axis = 1, inplace = True, errors = 'ignore')

In [17]:
df.head()

Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF
0,0,1.153798,1.308738,-0.208132,-0.394466,-0.095686,2.836059,-0.905169,-2.265742
1,0,1.153798,1.439787,-0.208132,-0.394466,0.721664,2.932703,-1.309643,-2.316501
2,1,1.153798,-0.263856,-0.904394,-0.394466,-1.185486,1.079409,0.173429,-1.083784
3,1,1.153798,-0.132806,-0.904394,-0.394466,0.176764,1.420506,-0.446765,-1.34483
4,1,1.153798,0.391392,-0.904394,-0.394466,-1.457936,1.193108,-0.770344,-1.1708


In [18]:
data_test = df

In [19]:
X = data_test.drop(["Group"],axis=1)
y = data_test["Group"].values
X.head(3)

Unnamed: 0,M/F,Age,EDUC,SES,MMSE,eTIV,nWBV,ASF
0,1.153798,1.308738,-0.208132,-0.394466,-0.095686,2.836059,-0.905169,-2.265742
1,1.153798,1.439787,-0.208132,-0.394466,0.721664,2.932703,-1.309643,-2.316501
2,1.153798,-0.263856,-0.904394,-0.394466,-1.185486,1.079409,0.173429,-1.083784


In [20]:
# We divide our data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state = 0)

In [21]:
print("{0:0.2f}% Train".format((len(X_train)/len(data_test.index)) * 100))
print("{0:0.2f}% Test".format((len(X_test)/len(data_test.index)) * 100))

74.80% Train
25.20% Test


In [22]:
print("Original Demented : {0} ({1:0.2f}%)".format(len(df_norm.loc[df_norm['Group'] == 1]), 100 * (len(df_norm.loc[df_norm['Group'] == 1]) / len(df_norm))))
print("Original Nondemented : {0} ({1:0.2f}%)".format(len(df_norm.loc[df_norm['Group'] == 0]), 100 * (len(df_norm.loc[df_norm['Group'] == 0]) / len(df_norm))))
print("")
print("Training Demented : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), 100 * (len(y_train[y_train[:] == 1]) / len(y_train))))
print("Training Nondemented : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), 100 * (len(y_train[y_train[:] == 0]) / len(y_train))))
print("")
print("Test Demented : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), 100 * (len(y_test[y_test[:] == 1]) / len(y_test))))
print("Test Nondemented : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), 100 * (len(y_test[y_test[:] == 0]) / len(y_test))))

Original Demented : 183 (49.06%)
Original Nondemented : 190 (50.94%)

Training Demented : 139 (49.82%)
Training Nondemented : 140 (50.18%)

Test Demented : 44 (46.81%)
Test Nondemented : 50 (53.19%)


## KNN Algorithm

In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [24]:
model_knn = KNeighborsClassifier()

In [25]:
model_knn.fit(X_train , y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [26]:
model_knn.score(X_test , y_test)

0.7978723404255319

## Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
model_lr = LogisticRegression()

In [29]:
model_lr.fit(X_train , y_train)





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
model_lr.score(X_test , y_test)

0.7978723404255319

## Decision Trees

In [31]:
from sklearn.tree import DecisionTreeClassifier

In [32]:
model_dt = DecisionTreeClassifier()

In [33]:
model_dt.fit(X_train , y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [34]:
model_dt.score(X_test , y_test)

0.7553191489361702

## Naive Bayes

In [35]:
from sklearn.naive_bayes import *

In [36]:
model_nb_g = GaussianNB() 

In [37]:
model_nb_g.fit(X_train , y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [38]:
model_nb_g.score(X_test , y_test)

0.7872340425531915

In [39]:
model_nb_b = BernoulliNB()

In [40]:
model_nb_b.fit(X_train , y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [41]:
model_nb_b.score(X_test , y_test)

0.7553191489361702

## Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
model_rf = RandomForestClassifier(n_estimators=10)

In [44]:
model_rf.fit(X_train , y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [45]:
model_rf.score(X_test , y_test)

0.7872340425531915

### Performnce

In [46]:
Predicted_knn= model_knn.predict(X_test)
Predicted_lr = model_lr.predict(X_test)
Predicted_nb_b = model_nb_b.predict(X_test)
Predicted_nb_g= model_nb_g.predict(X_test)
Predicted_dt = model_dt.predict(X_test)


In [48]:
acc = []

In [49]:
model='KNN'
test_score = cross_val_score(model_knn, X_train, y_train, cv=FOLDS, scoring='accuracy').mean() # Get recall for each parameter setting
test_recall = recall_score(y_test, Predicted_knn, pos_label=1)
fpr, tpr, thresholds = roc_curve(y_test, Predicted_knn, pos_label=1)
test_auc = auc(fpr, tpr)
acc.append([model,test_score, test_recall, test_auc, fpr, tpr, thresholds])

model='Logistic Regression'
test_score = cross_val_score(model_lr, X_train, y_train, cv=FOLDS, scoring='accuracy').mean() # Get recall for each parameter setting
test_recall = recall_score(y_test, Predicted_lr, pos_label=1)
fpr, tpr, thresholds = roc_curve(y_test, Predicted_lr, pos_label=1)
test_auc = auc(fpr, tpr)
acc.append([model, test_score,test_recall, test_auc, fpr, tpr, thresholds])

model='Naive Bayes Bernoulli'
test_score = cross_val_score(model_nb_b, X_train, y_train, cv=FOLDS, scoring='accuracy').mean() # Get recall for each parameter setting
test_recall = recall_score(y_test, Predicted_nb_b, pos_label=1)
fpr, tpr, thresholds = roc_curve(y_test, Predicted_nb_b, pos_label=1)
test_auc = auc(fpr, tpr)
acc.append([model, test_score,test_recall, test_auc, fpr, tpr, thresholds])

model='Naive Bayes Gaussian'
test_score = cross_val_score(model_nb_g, X_train, y_train, cv=FOLDS, scoring='accuracy').mean() # Get recall for each parameter setting
test_recall = recall_score(y_test, Predicted_nb_g, pos_label=1)
fpr, tpr, thresholds = roc_curve(y_test, Predicted_nb_g, pos_label=1)
test_auc = auc(fpr, tpr)
acc.append([model, test_score, test_recall, test_auc, fpr, tpr, thresholds])

model='Decision Tree'
test_score = cross_val_score(model_dt, X_train, y_train, cv=FOLDS, scoring='accuracy').mean() # Get recall for each parameter setting
test_recall = recall_score(y_test, Predicted_dt, pos_label=1)
fpr, tpr, thresholds = roc_curve(y_test, Predicted_dt, pos_label=1)
test_auc = auc(fpr, tpr)
acc.append([model, test_score, test_recall, test_auc, fpr, tpr, thresholds])
























In [54]:
def report_performance(model, a):

    model_test = model.predict(X_test)
    print(a)
    print("Confusion Matrix")
    print("{0}".format(metrics.confusion_matrix(y_test, model_test)))
    print("")
    print("Classification Report")
    print(metrics.classification_report(y_test, model_test))

In [55]:
report_performance(model_knn,'KNN')
report_performance(model_lr,'Logistic Regression')
report_performance(model_nb_b,'Naive Bayes Bernoulli')
report_performance(model_nb_g, 'Naive Bayes Gaussian')
report_performance(model_dt,'Decision Tree')

KNN
Confusion Matrix
[[40 10]
 [ 9 35]]

Classification Report
              precision    recall  f1-score   support

           0       0.82      0.80      0.81        50
           1       0.78      0.80      0.79        44

    accuracy                           0.80        94
   macro avg       0.80      0.80      0.80        94
weighted avg       0.80      0.80      0.80        94

Logistic Regression
Confusion Matrix
[[39 11]
 [ 8 36]]

Classification Report
              precision    recall  f1-score   support

           0       0.83      0.78      0.80        50
           1       0.77      0.82      0.79        44

    accuracy                           0.80        94
   macro avg       0.80      0.80      0.80        94
weighted avg       0.80      0.80      0.80        94

Naive Bayes Bernoulli
Confusion Matrix
[[36 14]
 [ 9 35]]

Classification Report
              precision    recall  f1-score   support

           0       0.80      0.72      0.76        50
           1  

In [56]:
result = pd.DataFrame(acc, columns=['Model', 'Accuracy', 'Recall', 'AUC', 'FPR', 'TPR', 'TH'])
result[['Model', 'Accuracy', 'Recall', 'AUC']]

Unnamed: 0,Model,Accuracy,Recall,AUC
0,KNN,0.746032,0.795455,0.797727
1,Logistic Regression,0.795899,0.818182,0.799091
2,Naive Bayes Bernoulli,0.770767,0.795455,0.757727
3,Naive Bayes Gaussian,0.77791,0.727273,0.783636
4,Decision Tree,0.792063,0.795455,0.757727


## Deep Forest

ModuleNotFoundError: No module named 'deepforest'

In [99]:
! pip install DeepForest

Collecting DeepForest
  Using cached https://files.pythonhosted.org/packages/fa/4b/baea4c084f93671dea2d2aaf3e2625d88a394f8169250c954e0b41f27f3c/deepforest-0.2.10-cp37-cp37m-win_amd64.whl
Collecting tensorflow==1.14.0 (from DeepForest)
  Using cached https://files.pythonhosted.org/packages/f7/08/25e47a53692c2e0dcd2211a493ddfe9007a5cd92e175d6dffa6169a0b392/tensorflow-1.14.0-cp37-cp37m-win_amd64.whl
Collecting tensorboard<1.15.0,>=1.14.0 (from tensorflow==1.14.0->DeepForest)
  Using cached https://files.pythonhosted.org/packages/91/2d/2ed263449a078cd9c8a9ba50ebd50123adf1f8cfbea1492f9084169b89d9/tensorboard-1.14.0-py3-none-any.whl
Collecting wrapt>=1.11.1 (from tensorflow==1.14.0->DeepForest)
Installing collected packages: tensorboard, wrapt, tensorflow, DeepForest
  Found existing installation: tensorboard 1.13.1
    Uninstalling tensorboard-1.13.1:
      Successfully uninstalled tensorboard-1.13.1
  Found existing installation: wrapt 1.10.11


ERROR: tensorboard 1.14.0 has requirement setuptools>=41.0.0, but you'll have setuptools 40.6.3 which is incompatible.
ERROR: Cannot uninstall 'wrapt'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.




## MLP

In [72]:
from keras.layers import *
from keras.models import *
from keras.utils import to_categorical

In [73]:
y_train_c = to_categorical(y_train)
y_test_c = to_categorical(y_test)

In [74]:
deep_mlp = Sequential()
deep_mlp.add(Dense(16 ,activation = "tanh" , input_shape=[8]))
deep_mlp.add(Dense(8 , activation="tanh"))
deep_mlp.add(Dense(8 , activation="tanh"))
deep_mlp.add(Dense(4 , activation="tanh"))
deep_mlp.add(Dense(2 , activation="softmax"))
deep_mlp.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_59 (Dense)             (None, 16)                144       
_________________________________________________________________
dense_60 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_61 (Dense)             (None, 8)                 72        
_________________________________________________________________
dense_62 (Dense)             (None, 4)                 36        
_________________________________________________________________
dense_63 (Dense)             (None, 2)                 10        
Total params: 398
Trainable params: 398
Non-trainable params: 0
_________________________________________________________________


In [75]:
deep_mlp.compile(optimizer="adam" , loss="categorical_crossentropy" , metrics = ["accuracy"])

In [76]:
deep_mlp.fit(X_train , y_train_c , epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x14c5448f1d0>

In [77]:
deep_mlp.evaluate(X_test , y_test_c)



[0.43505545436067783, 0.7872340463577433]