In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, auc, roc_curve
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw CSV into a DataFrame; every later cell derives its data from `file`.
file =pd.read_csv('/kaggle/input/mri-and-alzheimers/oasis_longitudinal.csv')

In [None]:
# Preview the first rows to check that the CSV was parsed as expected.
file.head()

The data set comprises 15 columns. The `Group` column is the label: it divides the population into demented and nondemented groups. Therefore, there are 14 features in total. Of the 15 columns, 5 are categorical (the label, the two ID columns and two categorical features) and 10 are numerical.

The function "sepdatatype" separates these two kinds of columns, categorical and numerical.

In [None]:
def sepdatatype(data):
    """Split a DataFrame into its categorical and numerical columns.

    Columns whose dtype is ``object`` are treated as categorical; every other
    column is treated as numerical. Column order and the row index of ``data``
    are preserved in both results.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. It is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(categorical_data, numerical_data)``, each an independent copy.
    """
    # select_dtypes picks all matching columns in one pass, instead of growing
    # two frames column by column with pd.concat inside a loop (quadratic).
    categorical_data = data.select_dtypes(include='object').copy()
    numerical_data = data.select_dtypes(exclude='object').copy()
    return categorical_data, numerical_data

In [None]:
# Split the raw table into its object-typed (categorical) and remaining (numerical) columns.
cat_data, num_data = sepdatatype(file)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numerical columns.
num_data.describe()

The 'Group' column of the categorical data is identified as Label.

In [None]:
# Keep the 'Group' column on its own, as a one-column DataFrame, to use as the label.
Label = cat_data['Group'].to_frame()

It is clear that identification parameters are not useful for training the model. The IDs and the label are therefore dropped from the categorical data set, which leaves 2 categorical features.

In [None]:
# Remove the two identifier columns and the label from the categorical features.
cat_data = cat_data.drop(columns=['Subject ID','MRI ID','Group'])

In [None]:
# One-hot encode the remaining categorical columns.
onc = OneHotEncoder()
encoded_sparse = onc.fit_transform(cat_data)
# Densify the encoder output and wrap it in a DataFrame (columns get integer labels 0, 1, ...).
cat2num = pd.DataFrame(encoded_sparse.toarray())

One-hot encoding represents each category as a vector in which one unit is enabled and the other units are disabled. Therefore, the number of columns in the encoded data set equals the total number of distinct category values.

In [None]:
# Display the one-hot encoded frame to inspect how many columns each feature produced.
cat2num

The gender column has two distinct values, Male and Female, so it yields two encoded columns. The other feature, hand, yields only one column, which shows that it takes a single value throughout the data set. That means the feature is not useful.
So the column will be dropped.

In [None]:
# Drop encoded column 2 (the single column produced by the constant-valued feature).
cat2num = cat2num.drop(columns=[2])

The function 'olrem' is designed to remove the outliers from a column and return the reduced data. Applied to a whole table, this would leave columns of unequal length, because every column has a different number of outliers. So it is only used to compute the mean of the outlier-removed data, which then replaces the outliers.

In [None]:
def olrem(data):
    """Mask the IQR outliers of a numeric column.

    A value is an outlier when it lies below ``Q1 - 1.5*IQR`` or above
    ``Q3 + 1.5*IQR``, where Q1/Q3 are the 25th/75th percentiles of ``data``.

    Parameters
    ----------
    data : pandas.Series
        Numeric column. It is not modified.

    Returns
    -------
    pandas.Series
        A new Series with the same index and length as ``data`` in which every
        outlier is replaced by NaN, so that NaN-skipping reductions such as
        ``.mean()`` are computed over the non-outlier values only.
    """
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lrange = Q1 - 1.5*IQR
    urange = Q3 + 1.5*IQR

    # Comparisons with NaN are False, so values that are already missing are
    # never flagged as outliers.
    is_outlier = (data < lrange) | (data > urange)

    # Mask with NaN rather than overwriting with 0: a 0 would still be counted
    # by mean() and bias it. mask() returns a new Series, so the caller's data
    # is left untouched, and it works for any index (not only 0..n-1).
    return data.mask(is_outlier)

This function replaces the outlier values in each column with the mean of that column's outlier-removed values.

In [None]:
def repol(data):
    """Replace the IQR outliers of every column with the column's inlier mean.

    For each column, a value is an outlier when it lies below ``Q1 - 1.5*IQR``
    or above ``Q3 + 1.5*IQR``. Each outlier is replaced by the mean of the
    column's non-outlier values. Missing values are neither treated as
    outliers nor filled in.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of numeric columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the outliers replaced. A column that had
        outliers may be upcast to float to hold the mean.
    """
    cleaned = data.copy()
    for col in data.columns:
        values = data[col]

        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lrange = Q1 - 1.5*IQR
        urange = Q3 + 1.5*IQR

        # NaN compares False on both sides, so missing values stay missing.
        is_outlier = (values < lrange) | (values > urange)
        if not is_outlier.any():
            continue

        # Mean over the inliers only (mean() skips NaN). Computed from the
        # untouched input so it is not polluted by placeholder values.
        mn = values[~is_outlier].mean()

        # Vectorised replacement; avoids chained assignment (data[col][j] = ...),
        # which may silently write to a temporary copy.
        cleaned[col] = values.mask(is_outlier, mn)
    return cleaned

In [None]:
# Apply the outlier replacement to every numerical column.
olrnum_data = repol(num_data)

So, olrnum_data is the modified data set, with the outliers replaced by the mean of the outlier-removed data.


In [None]:
# Display the numerical data after outlier replacement.
olrnum_data

Now, we have to check for NAN values in the featureset. 

In [None]:
# Count the missing values in each numerical column.
olrnum_data.isna().sum()

It can be observed that only two features have NAN values, SES has 19 NAN values and MMSE has 2.

To replace the NaN values, an imputer has to be used. Here the KNN imputer is applied: each missing value is filled in from the values of the nearest neighbouring rows.

In [None]:
# Fill missing values with a K-nearest-neighbours imputer (default settings).
imputer = KNNImputer()
# fit_transform returns a plain array, so the column names are lost here ...
imputed = imputer.fit_transform(olrnum_data)
# ... and are restored when wrapping the result back into a DataFrame.
imputed_data = pd.DataFrame(imputed,columns = olrnum_data.columns)

Now, imputed_data is the data set after imputing. We can check whether it is free of NaN values.

In [None]:
# Verify that no missing values remain after imputation.
imputed_data.isna().sum()

Now, the whole feature set is obtained by concatenating the processed numerical data set and the encoded categorical data set.

In [None]:
# Put the processed numerical features, the one-hot columns and the label side
# by side (rows are aligned on the index).
total_data = pd.concat([imputed_data,cat2num,Label],axis = 1)
# The one-hot frame has integer column labels while the other frames use
# strings. Recent scikit-learn versions refuse DataFrames whose column names
# have mixed types, so make every column label a string.
total_data.columns = total_data.columns.astype(str)

In [None]:
# Bar chart of the number of rows per value of 'Group' (before label encoding).
sns.countplot(x='Group',data = total_data)

It can be seen that there are three classes: Demented, Nondemented and Converted. The Converted class is the group of people who were not diseased earlier but have recently been diagnosed with Alzheimer's disease. So, according to their current condition, Converted can be considered Demented. The whole data set is therefore separated into two classes, where Nondemented is labelled 0 and Demented is labelled 1.

In [None]:
# Encode the label: Nondemented -> 0, Demented and Converted -> 1.
# The explicit astype(int) guarantees an integer label column instead of
# relying on replace() silently downcasting the object column (deprecated in
# recent pandas). Re-running this cell is harmless: 0/1 values are left as is.
total_data['Group'] = (
    total_data['Group']
    .replace(['Nondemented','Demented','Converted'],[0,1,1])
    .astype(int)
)

In [None]:
# Bar chart of the number of rows per class after encoding 'Group' as 0/1.
sns.countplot(x='Group',data = total_data)

The data set is checked for imbalance between the classes. As the difference between the classes is not significant, it is better not to apply any class-balancing processing to the data.

In [None]:

# Target vector: the encoded 'Group' column.
y= total_data['Group']


In [None]:
# Feature matrix: every column of total_data except the label.
X = total_data.drop(columns=['Group'])

So, the feature set and the labels are separated into two groups, named X and y respectively. 

Now they are split into train and test sets with a train:test ratio of 70:30.

In [None]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the file name suggests longitudinal data (possibly several rows per subject).
# If so, a row-wise random split can put the same subject in both sets — confirm, and
# consider a subject-grouped split.
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state = 22)

In [None]:
# Display the (unscaled) feature matrix.
X

The feature set has not been scaled yet, so we should scale it. The training set is used to estimate the scaling parameters, namely the minimum and maximum value of each feature, and these are then used to scale both the train and test sets. After scaling, the maximum training value of each feature is 1, the minimum is 0, and the other values lie in between.

In [None]:
# Min-max scale the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaling = MinMaxScaler()
Xs = scaling.fit(X_train)  # fit() returns the scaler itself, so Xs is the fitted scaler
X_train, X_test = Xs.transform(X_train), Xs.transform(X_test)

Now it is time to build and train models for learning the data set.

First, the MLP (Multi-Layer Perceptron) classifier is used. A set of candidate values is assigned to a few hyperparameters. The best hyperparameter values are found by fitting on the data set.
Randomized-search cross-validation is used to find the best hyperparameters.

In [None]:
# Candidate values for the randomized hyperparameter search of the MLP.
L = [0.0001,0.001,0.01,0.1]                # initial learning rates
solver =['lbfgs','adam']
activation =['logistic','tanh','relu']
learning_rate = ['constant','adaptive']    # learning-rate schedule
hidden_layer_sizes=[25,50,100,150]         # width of the single hidden layer

parameters = {
              'learning_rate_init' : L,
              'solver':solver,
              'activation':activation,
              'learning_rate':learning_rate,
              'hidden_layer_sizes':hidden_layer_sizes
}
# Seed the estimator as well as the search: with an unseeded MLP the weight
# initialisation differs on every run, so the search result is not reproducible
# even though the sampled candidates are.
mlp = MLPClassifier(random_state=42)

# Try 100 randomly drawn combinations, each scored with 3-fold cross-validation,
# using all available cores.
mlprandom = RandomizedSearchCV(estimator = mlp,param_distributions = parameters, n_iter = 100, cv=3, n_jobs = -1,random_state=42)


In [None]:
# Run the randomized search on the training split.
mlprandom.fit(X_train, y_train)

As the model has been trained on the data set, we can now get the best parameters found by the random search.

In [None]:
# Hyperparameter combination with the best cross-validated score in the random search.
mlprandom.best_params_

Now we use grid-search cross-validation around the best hyperparameters and their nearby values to get a more accurate idea of the preferable hyperparameters.

In [None]:
# Narrower candidate values for the exhaustive grid search of the MLP.
L = [0.005, 0.009, 0.01, 0.05, 0.09, 0.1]
solver = ['lbfgs', 'adam']
activation = ['logistic', 'tanh']
learning_rate = ['constant', 'adaptive']
hidden_layer_sizes = [80, 100, 120, 140]

parameters = dict(
    learning_rate_init=L,
    solver=solver,
    activation=activation,
    learning_rate=learning_rate,
    hidden_layer_sizes=hidden_layer_sizes,
)

# Evaluate every combination with 3-fold cross-validation on all available cores.
mlpgrid = GridSearchCV(
    estimator=mlp,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)


In [None]:
# Run the grid search on the training split.
mlpgrid.fit(X_train,y_train)

After training, best hyperparameters have been recieved. 

In [None]:
# Hyperparameter combination with the best cross-validated score in the grid search.
mlpgrid.best_params_

Now the MLP classifier has been finally trained with the evaluated best set of parameters. 

In [None]:
# Build the final MLP from the grid-search winner instead of hand-copied values,
# so this cell stays in sync with the search whenever the notebook is re-run.
# random_state makes the final fit repeatable.
mlpfinal = MLPClassifier(**mlpgrid.best_params_, random_state=42)

In [None]:
# Train the final MLP on the scaled training split.
mlpfinal.fit(X_train,y_train)

Now the trained model is ready to predict the test data set and prediction accuracy can be evaluated from that.

In [None]:
# Predict class labels for the held-out test split ...
predictionm = mlpfinal.predict(X_test)
# ... and compute the fraction of test rows classified correctly.
acc = accuracy_score(y_test,predictionm)
acc

Here testing accuracy of 92% is received.

In [None]:
# 5-fold cross-validated accuracy of the final MLP on the training split.
# cross_val_score is imported with the other scikit-learn imports at the top of the notebook.
score = cross_val_score(mlpfinal,X_train,y_train,cv=5)

score

In [None]:
# ROC / AUC of the MLP on the test split.
# The labels are 0/1, so the positive class is 1 (pos_label=4 matched no sample,
# which leaves the true-positive rate undefined). The curve is built from the
# predicted probability of class 1 rather than from hard 0/1 predictions, so it
# has more than a single operating point.
scoresm = mlpfinal.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, scoresm, pos_label=1)
auc(fpr,tpr)

These are 5 fold cross validation accuracy results: 92.5%, 96.2%, 96.2%, 96.2% and 92.3%

Now the same process is applied to the Random Forest classifier.

In [None]:
# Seeded so that the grid-search scores are repeatable.
rfcl = RandomForestClassifier(random_state=42)
n_estimators =[50,100,150,200,250]
criterion = ['gini','entropy']
min_samples_split = [2,3,5,7]
# min_samples_leaf must be an integer >= 1 (a sample count) or a float strictly
# between 0 and 1 (a fraction of the samples). The former 1.5 and 2.5 are
# neither: every fit using them fails, so they are removed.
min_samples_leaf = [0.05,0.08,1,2,3]
# 'auto' is dropped: for classifiers it meant the same as 'sqrt', and it has
# been removed from recent scikit-learn releases.
max_features = ['sqrt','log2']
bootstrap = [True, False]

rfparam = {
           'n_estimators' : n_estimators,
           'criterion' : criterion,
           'min_samples_split' : min_samples_split,
           'min_samples_leaf' : min_samples_leaf,
           'max_features' : max_features,
           'bootstrap' : bootstrap
          }
# Evaluate every combination with 5-fold cross-validation on all available cores.
rfgrid = GridSearchCV(estimator = rfcl, param_grid = rfparam,cv=5, n_jobs = -1)

In [None]:
# Run the random-forest grid search on the training split.
rfgrid.fit(X_train,y_train)

In [None]:
# Hyperparameter combination with the best cross-validated score in the grid search.
rfgrid.best_params_

In [None]:
# Build the final forest from the grid-search winner instead of hand-copied
# values (which included max_features='auto', removed from recent scikit-learn).
# random_state makes the final fit repeatable.
rffinal = RandomForestClassifier(**rfgrid.best_params_, random_state=42)

In [None]:
# Train the final random forest on the scaled training split.
rffinal.fit(X_train,y_train)
# Predict class labels for the held-out test split ...
predictionr = rffinal.predict(X_test)
# ... and compute the fraction of test rows classified correctly.
acc_sc = accuracy_score(y_test,predictionr)
acc_sc

In [None]:
# ROC / AUC of the random forest on the test split.
# The original referenced an undefined name (`prediction`). The curve is built
# from the forest's predicted probability of class 1.
scoresr = rffinal.predict_proba(X_test)[:, 1]
fpr,tpr,th = roc_curve(y_test,scoresr)
auc(fpr,tpr)

From the Random Forest model,the accuracy received is 91%

The 5 fold cross validation results are also checked.

In [None]:
# 5-fold cross-validated accuracy of the final random forest on the training split.
# cross_val_score is imported with the other scikit-learn imports at the top of the notebook.
rffoldscore = cross_val_score(rffinal,X_train,y_train,cv=5)

rffoldscore

In [None]:
# Feature importances of the fitted forest as a single-row frame (one column per feature).
fi = pd.DataFrame(rffinal.feature_importances_).T

In [None]:
# Label each importance with the corresponding feature name (same column order as X).
fi.columns = X.columns

In [None]:
# Bar chart of the importances, one bar per feature.
fi.plot.bar()