In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# For Imputation
from sklearn.preprocessing import LabelEncoder

# For data preprocessing
from sklearn.model_selection import train_test_split, KFold, cross_validate

# For model building
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# For visualizing the decision tree
from sklearn import tree





In [None]:

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
#import the necessary modelling algos.
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

#model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,roc_auc_score,auc
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import (accuracy_score,
                             f1_score,
                             log_loss,
                             roc_auc_score,
                             roc_curve,
                             confusion_matrix)
from sklearn.model_selection import (cross_val_score,
                                     GridSearchCV,
                                     RandomizedSearchCV,
                                     learning_curve,
                                     validation_curve,
                                     train_test_split)

from sklearn.pipeline import make_pipeline # For performing a series of operations

from sklearn.metrics import plot_confusion_matrix

In [None]:
# Path of the OASIS longitudinal CSV that the whole notebook works on
DATA_PATH = '../input/mri-and-alzheimers/oasis_longitudinal.csv'
df = pd.read_csv(DATA_PATH)

## Data Wrangling

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# getting a feel of the data types of the columns

df.info()

In [None]:
df.isnull().sum()

* **SES has 19 missing values**
* **MMSE has 2 missing values**

In [None]:
df.describe() # for numerical cols

In [None]:
# Skewness is only defined for numeric columns. numeric_only=True skips the
# non-numeric ones (e.g. Group, M/F), which otherwise make pandas >= 2.0 raise.
df.skew(numeric_only=True)

## Imputing Missing Values

> For MMSE

In [None]:
# Fill the missing MMSE scores with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the attribute-accessed Series is a chained in-place update, which is
# deprecated and does not modify `df` once pandas copy-on-write is active.
df['MMSE'] = df['MMSE'].fillna(df['MMSE'].median())

> For SES

In [None]:
# Fill the missing SES levels with the column median.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the attribute-accessed Series is a chained in-place update, which is
# deprecated and does not modify `df` once pandas copy-on-write is active.
df['SES'] = df['SES'].fillna(df['SES'].median())

In [None]:
df.isnull().sum()

**No More Missing Values**

### Dropping Hand Column
> It contains only one distinct value and hence isn't useful for our model. Features of this kind are called ***Zero Variance Predictors*** and should be dropped.

In [None]:
# Remove the constant 'Hand' column from the working frame
df = df.drop(columns=['Hand'])

In [None]:
df.head()

In [None]:
df.isnull().sum()

## Reversing the order of SES values
> Like CDR, SES is also a level-based feature. In CDR the values start at 0 and go up to 2, describing the severity of dementia. In SES, however, SES = 1 is the highest status and SES = 5 is the lowest status, which is the opposite of the trend in CDR.

> Therefore we reverse the values of SES so that SES = 1 is the lowest status and SES = 5 is the highest status.

In [None]:
# Flip the SES scale with a lookup table: level k becomes 6 - k
# (5 -> 1, 4 -> 2, 3 -> 3, 2 -> 4, 1 -> 5)
ses_map = {level: 6 - level for level in range(5, 0, -1)}
df['SES'] = df['SES'].map(ses_map)


In [None]:
df.head()

In [None]:
df.SES.value_counts()

**Saving clean data**

In [None]:
# Keep an in-memory snapshot of the cleaned frame before the encoding steps
# further down modify `df`.
df_copy = df.copy()

# index=False: the row index is just the default positional one; writing it
# would add an unnamed column that comes back as "Unnamed: 0" when the file
# is read again.
df.to_csv('oasis_longitude.csv', index=False)

### Encoding M/F - Gender

In [None]:
df.dtypes

In [None]:
# Numeric encoding of the 'M/F' column: male -> 0, female -> 1
gender_map = dict(M=0, F=1)
df = df.assign(Gender=df['M/F'].map(gender_map))

In [None]:
df.tail()

In [None]:
df.dtypes

In [None]:
# 'M/F' is now redundant with the numeric 'Gender' column
df = df.drop(columns=['M/F'])

In [None]:
df.head()

In [None]:
df.Group.value_counts()

In [None]:
# Integer codes for the diagnosis groups: Nondemented=0, Demented=1, Converted=2
target_map = dict(zip(['Nondemented', 'Demented', 'Converted'], range(3)))

df['Group'] = df['Group'].map(target_map)

In [None]:
df.Group.value_counts()

# EDA

In [None]:
# Pairwise Pearson correlation of the numeric columns.
# numeric_only=True is required on pandas >= 2.0, where corr() no longer
# silently drops non-numeric columns and raises on them instead.
corr = df.corr(numeric_only=True)
corr

In [None]:
# Boolean mask that hides the upper triangle (diagonal included) so every
# pair of features is shown only once.
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the supported dtype.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the figure
fig, ax = plt.subplots(figsize=(12, 8))

# Diverging colormap so negative and positive correlations get opposite hues
cmap = sns.diverging_palette(250, 10, s=80, l=55, n=9, as_cmap=True)

# Draw the annotated heatmap on the prepared axes, with the mask applied
sns.heatmap(corr, mask=mask, cmap=cmap, ax=ax, annot=True);
fig.savefig('corr.png')

* **Between ASF and eTIV there is a high negative correlation**
* **Between MR Delay and Visit there is a high positive correlation**

**Hence we will have to drop any one from both sets**

## Data Visualization

> **Checking the distribution of the target variable**

In [None]:
# Bar chart of how many rows fall into each CDR level.
# The column is passed by keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the variable to count.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='CDR', data=df, ax=ax)
ax.set_title('Distribution of CDR Levels')
ax.set_xlabel('CDR LEVEL')
ax.set_ylabel('COUNT')
fig.savefig('CDR_distribution.png')

**The problem we have is a Multi-Class Classification Problem**

### Plotting against Target Variable

In [None]:
# Box plot of SES for every CDR level.
# `factorplot` and its `size` argument were removed from seaborn; `catplot`
# with `height` is the direct replacement.
sns.catplot(x='CDR', y='SES', data=df, kind='box', height=5, aspect=1)

In [None]:
a = df.SES.value_counts()

In [None]:
list(a.index)

In [None]:
# SES levels present in the data, most frequent first
ses_count = df['SES'].value_counts()
ses_indexes = list(ses_count.index)

# One CDR density curve per SES level, all on the same axes
plt.figure(figsize=(12, 8))

for s in ses_indexes:
    # Rows belonging to the current SES level
    subset = df[df['SES'] == s]

    # Density of the CDR scores within this SES level.
    # The deprecated `shade=False` keyword is dropped (unfilled is the default).
    sns.kdeplot(x=subset['CDR'], label=s, alpha=0.8);

# Current seaborn does not draw a legend just because `label` was passed,
# so request it explicitly - otherwise the curves cannot be told apart.
plt.legend(title='SES')

# label the plot
plt.xlabel('CDR Score', size=20);
plt.ylabel('Density', size=20);
plt.title('Density Plot of CDR Scores by SES', size=28);
plt.savefig('SES_CDR.png')

* **High SES group (4) have CDR score 0 as a common value**
* **Low SES group (1) have CDR score 0.5 as a common value**

In [None]:
# One CDR count panel per SES level.
# `factorplot` was removed from seaborn; `catplot` is the direct replacement.
sns.catplot(x='CDR', kind='count', col='SES', data=df)

## Checking if Education has an effect on CDR

In [None]:
df.EDUC.value_counts()

In [None]:
df.dtypes

In [None]:
# Education levels (years) present in the data, most frequent first
edu_count = df['EDUC'].value_counts()
edu_index = list(edu_count.index)

# One CDR density curve per education level, all on the same axes
plt.figure(figsize=(12, 8))

for el in edu_index:
    # Rows belonging to the current education level
    subset = df[df['EDUC'] == el]

    # Density of the CDR scores within this education level.
    # `bw_method` replaces the deprecated `bw` keyword; the deprecated
    # `shade=False` is dropped (unfilled is the default).
    sns.kdeplot(x=subset['CDR'], label=el, alpha=0.8, bw_method=0.5);

# Current seaborn does not draw a legend just because `label` was passed,
# so request it explicitly - otherwise the curves cannot be told apart.
plt.legend(title='EDUC (years)')

# label the plot
plt.xlabel('CDR Score', size=20); plt.ylabel('Density', size=20);
plt.title('Density Plot of CDR Scores by Years of Education', size=28);
#plt.xlim([0,2]);
plt.savefig('EDU_CDR.png')

Not helpful. Let's plot only the top 2 values of EDUC.

In [None]:
# Rows for the two education levels that are compared next: 12 and 16 years
min_edu = df[df['EDUC'].eq(12)]
max_edu = df[df['EDUC'].eq(16)]

# Stack both groups (12-year rows first) into a single frame
edu_concat = pd.concat((min_edu, max_edu))
edu_concat.head()

In [None]:
# Education levels kept in the reduced frame
edu_ = edu_concat['EDUC'].value_counts()
edu_index = list(edu_.index)

# One CDR density curve per remaining education level
plt.figure(figsize=(12, 8))

for el in edu_index:
    # Rows belonging to the current education level
    subset = edu_concat[edu_concat['EDUC'] == el]

    # Density of the CDR scores within this education level.
    # The deprecated `shade=False` keyword is dropped (unfilled is the default).
    sns.kdeplot(x=subset['CDR'], label=el, alpha=0.8);

# Current seaborn does not draw a legend just because `label` was passed,
# so request it explicitly - otherwise the two curves cannot be told apart.
plt.legend(title='EDUC (years)')

# label the plot
plt.xlabel('CDR Score', size=20); plt.ylabel('Density', size=20);
plt.title('Density Plot of CDR Scores by Years of Education', size=28);
#plt.xlim([0,2]);
# Saved under its own name: 'EDU_CDR.png' is already written by the plot over
# all education levels, and reusing it would overwrite that figure.
plt.savefig('EDU_CDR_top2.png')

* Not a remarkable insight, but subjects with 12 years of education have slightly higher CDR scores than subjects with 16 years of education.

## Does Gender have an effect?

In [None]:
# Gender codes present in the data, most frequent first
gender_count = df['Gender'].value_counts()
gender_indicies = list(gender_count.index)

# One CDR density curve per gender, both on the same axes
plt.figure(figsize=(12, 10))

for g in gender_indicies:
    # Rows belonging to the current gender code
    subset = df[df['Gender'] == g]

    # Density of the CDR scores within this gender.
    # The deprecated `shade=False` keyword is dropped (unfilled is the default).
    sns.kdeplot(x=subset['CDR'], label=g, alpha=0.8);

# Current seaborn does not draw a legend just because `label` was passed,
# so request it explicitly; the title spells out the numeric encoding.
plt.legend(title='Gender (0 = M, 1 = F)')

# label the plot
plt.xlabel('CDR Score', size=20); plt.ylabel('Density', size=20);
plt.title('Density Plot of CDR Scores by Gender', size=28);
plt.savefig('Gender_CDR.png')

Females (1) have lower CDR levels than males (0). Females seem to be healthier according to the dataset at hand.

## Age vs CDR

In [None]:
# Age per CDR level, coloured by gender.
# catplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an extra empty figure.
# The size is set through height/aspect (8 x 1.5 = 12 wide, 8 tall).
g = sns.catplot(x='CDR', y='Age', data=df, hue='Gender', height=8, aspect=1.5)
g.savefig('Age_CDR.png')

**Not really insightful, but cdr scores for the age range of 65 -85 vary a lot. But that in no way indicates that age influences CDR score**

## MMSE vs CDR

In [None]:
# MMSE score per CDR level, coloured by gender.
# catplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an extra empty figure.
# The size is set through height/aspect (8 x 1.5 = 12 wide, 8 tall).
g = sns.catplot(x='CDR', y='MMSE', data=df, hue='Gender', height=8, aspect=1.5)
g.savefig('MMSE_CDR')

**Subjects with MMSE scores below 25 are more likely to have a non-zero CDR**
* The Ones with moderate Dementia have MMSE < 25
* The Ones with Mild Dementia have MMSE < 25

## eTIV vs CDR

In [None]:
# eTIV per CDR level.
# catplot creates its own figure, so it is sized through height/aspect
# instead of a separate plt.figure() call that would stay empty.
sns.catplot(x='CDR', y='eTIV', data=df, height=8, aspect=1.5)

### ASF vs CDR

In [None]:
# ASF per CDR level.
# catplot creates its own figure, so it is sized through height/aspect
# instead of a separate plt.figure() call that would stay empty.
sns.catplot(x='CDR', y='ASF', data=df, height=8, aspect=1.5)

## nWBV vs CDR

In [None]:
# nWBV per CDR level.
# catplot creates its own figure, so it is sized through height/aspect
# instead of a separate plt.figure() call that would stay empty.
sns.catplot(x='CDR', y='nWBV', data=df, height=8, aspect=1.5)

* **The eTIV & ASF plots don't signify much**
* **However, nWBV vs CDR, the normal Whole Brain Volume decreases as CDR level increases.**

## Summary of EDA

* **MMSE:** From the plots above we can infer that high MMSE scores relate to low CDR levels. Therefore, MMSE is an important feature for predicting CDR levels.

* **SES:** The plots are not conclusive enough to say that SES has an influence on CDR scores, but I would like to keep it. Going with my intuition!

* **Gender:** The plots suggest that females are healthier than males, hence it is an important feature.

* **ASF:** No idea!

* **eTIV:** No idea!

* **nWBV:** As Normal Whole Brain Volume decreases, CDR increases. nWBV has an influence on CDR.

* **EDUC:** As seen in the above plots, subjects with less education have slightly greater CDR scores than subjects with more education.



### Correlated Features

* **ASF and eTIV - drop one**
* **Visit and MR_Delay - drop one**

## Feature Selection

In [None]:
df.shape

In [None]:
df.head()

* **Drop features that are highly correlated when using Linear Classifiers**
* **We can drop CDR column and keep Group Column as both represent the same thing**
* **We can drop Subject ID and MRI ID as they are irrelevant**

In [None]:
# Modelling frame: identifiers and CDR removed
selected_df = df.drop(columns=['Subject ID', 'MRI ID', 'CDR'])


In [None]:
selected_df.head()

In [None]:
# Give the education and target columns more descriptive names
rename_cols_dict = dict(EDUC='Education', Group='Diagnosis')
selected_df = selected_df.rename(columns=rename_cols_dict)
selected_df.head()

In [None]:
selected_df.dtypes

### Define Target & Predictor feature(s)

In [None]:
# Feature matrix: every column except the label
predictors_df = selected_df.drop(columns='Diagnosis')

# Label vector as a plain NumPy array
target = selected_df['Diagnosis'].values

In [None]:
predictors_df.head()

In [None]:
predictors_df.dtypes

### Splitting into training and testing data

In [None]:
# Bar chart of how many rows fall into each diagnosis class.
# The column is passed by keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the variable to count.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x='Diagnosis', data=selected_df, ax=ax)
ax.set_title('Distribution of Diagnosis')
ax.set_xlabel('Diagnosis')
ax.set_ylabel('COUNT')
fig.savefig('Diagnosis_distribution.png')

Using stratify to maintain the same ratio of target variable values in both the train and test datasets

In [None]:
# 80/20 split, stratified on the label and seeded for reproducibility.
# NOTE(review): the 'Visit' column suggests one row per visit, so the same
# subject may appear in both splits - confirm whether a grouped split is needed.
x_train, x_test, y_train, y_test = train_test_split(
    predictors_df,
    target,
    test_size=0.2,
    stratify=target,
    random_state=1,
)

In [None]:
# Report the shape of every split, one line each
for split_label, split_data in (("Training Data - Predictors", x_train),
                                ("Testing Data - Predictors", x_test),
                                ("Training Data - Target", y_train),
                                ("Testing Data - Target", y_test)):
    print(split_label, split_data.shape)

## Choosing Evaluation Metric

We will be going forward with AUC and Diagnostic Odds Ratio

## Feature Scaling

In [None]:
from sklearn.pipeline import make_pipeline # For performing a series of operations

from sklearn.metrics import plot_confusion_matrix

from sklearn.preprocessing import StandardScaler

In [None]:
# Grid-search a scaled random forest for every training set listed in
# `methods_data` (currently only the untouched training split).
methods_data = {'Original': (x_train, y_train)}

for method, (features, labels) in methods_data.items():
    # Scaling + forest in one pipeline so the scaler is fitted inside each CV fold
    forest = RandomForestClassifier(n_estimators=500,
                                    class_weight="balanced",
                                    random_state=123)
    pip_rf = make_pipeline(StandardScaler(), forest)

    # Candidate values; keys are prefixed with the pipeline step name
    hyperparam_grid = {
        "randomforestclassifier__n_estimators": [10, 50, 100, 500],
        "randomforestclassifier__max_features": ["sqrt", "log2", 0.4, 0.5],
        "randomforestclassifier__min_samples_leaf": [1, 3, 5],
        "randomforestclassifier__criterion": ["gini", "entropy"]}

    # Exhaustive search scored by macro F1 over 10 folds
    gs_rf = GridSearchCV(pip_rf,
                         hyperparam_grid,
                         scoring="f1_macro",
                         cv=10,
                         n_jobs=-1)
    gs_rf.fit(features, labels)

    # Report the winning combination without the step-name prefix
    print("\033[1m" + "\033[0m" + "The best hyperparameters for {} data:".format(method))
    for hyperparam, value in gs_rf.best_params_.items():
        print(hyperparam[hyperparam.find("__") + 2:], ": ", value)

    print("\033[1m" + "\033[94m" + "Best 10-folds CV f1-score: {:.2f}%.".format((gs_rf.best_score_) * 100))

In [None]:
# Refit RF classifier using best params.
# The hyperparameters are taken from the grid search result rather than
# typed in by hand, so this model cannot drift out of sync with what
# `gs_rf` actually selected when the data or the grid changes.
clf_rf = make_pipeline(StandardScaler(),
                       RandomForestClassifier(class_weight="balanced",
                                              n_jobs=-1,
                                              random_state=123))

# best_params_ keys already carry the "randomforestclassifier__" step prefix,
# which is exactly the form Pipeline.set_params expects.
clf_rf.set_params(**gs_rf.best_params_)

clf_rf.fit(x_train, y_train)

In [None]:
# Build Gradient Boosting classifier
# `loss` is left at its default: the old name "deviance" was removed from
# scikit-learn (renamed to "log_loss"), and the default is that same loss in
# both old and new releases.
pip_gb = make_pipeline(StandardScaler(),
                       GradientBoostingClassifier(random_state=123))

# Candidate values; keys are prefixed with the pipeline step name
hyperparam_grid = {"gradientboostingclassifier__max_features": ["log2", 0.5],
                   "gradientboostingclassifier__n_estimators": [100, 300, 500],
                   "gradientboostingclassifier__learning_rate": [0.001, 0.01, 0.1],
                   "gradientboostingclassifier__max_depth": [1, 2, 3]}

# Exhaustive search scored by macro F1 over 10 folds
gs_gb = GridSearchCV(pip_gb,
                      param_grid=hyperparam_grid,
                      scoring="f1_macro",
                      cv=10,
                      n_jobs=-1)

gs_gb.fit(x_train, y_train)

# Report the winning combination without the step-name prefix
print("\033[1m" + "\033[0m" + "The best hyperparameters:")
print("-" * 25)
for hyperparam in gs_gb.best_params_.keys():
    print(hyperparam[hyperparam.find("__") + 2:], ": ", gs_gb.best_params_[hyperparam])

print("\033[1m" + "\033[94m" + "Best 10-folds CV f1-score: {:.2f}%.".format((gs_gb.best_score_) * 100))

In [None]:
# Build logistic model classifier
# solver="saga" handles both the l1 and the l2 penalty searched below; the
# default lbfgs solver rejects l1, so half of the grid failed to fit.
# max_iter is raised so saga has room to converge, and random_state makes
# its shuffling reproducible.
pip_logmod = make_pipeline(StandardScaler(),
                           LogisticRegression(class_weight="balanced",
                                              solver="saga",
                                              max_iter=5000,
                                              random_state=123))

# Inverse regularisation strengths 0.5, 1.0, ..., 20.0
hyperparam_range = np.arange(0.5, 20.1, 0.5)

# Candidate values; keys are prefixed with the pipeline step name
hyperparam_grid = {"logisticregression__penalty": ["l1", "l2"],
                   "logisticregression__C":  hyperparam_range,
                   "logisticregression__fit_intercept": [True, False]
                  }

# Same protocol as the other two searches (macro F1, 10 folds), so the
# "10-folds CV f1-score" printed below is what was actually measured and the
# three models are selected on a comparable basis.
gs_logmodel = GridSearchCV(pip_logmod,
                           hyperparam_grid,
                           scoring="f1_macro",
                           cv=10,
                           n_jobs=-1)

gs_logmodel.fit(x_train, y_train)

# Report the winning combination without the step-name prefix
print("\033[1m" + "\033[0m" + "The best hyperparameters:")
print("-" * 25)
for hyperparam in gs_logmodel.best_params_.keys():
    print(hyperparam[hyperparam.find("__") + 2:], ": ", gs_logmodel.best_params_[hyperparam])

print("\033[1m" + "\033[94m" + "Best 10-folds CV f1-score: {:.2f}%.".format((gs_logmodel.best_score_) * 100))

In [None]:
# Fitted models keyed by a short display name
estimators = {"RF": clf_rf, "LR": gs_logmodel, "GBT": gs_gb}

# Accuracy of each model on the held-out test split, as a percentage
print("The accuracy rate on test data are:")
for estimator, fitted in estimators.items():
    test_accuracy = accuracy_score(y_test, fitted.predict(x_test)) * 100
    print("{}: {:.2f}%".format(estimator, test_accuracy))

In [None]:
predictions = gs_gb.predict(x_test)

In [None]:
predictions.shape

In [None]:
selected_df.Diagnosis.value_counts()

In [None]:
# Display name paired with its model; the two lists below stay index-aligned
named_models = [
    ('RandomForestClassifier', clf_rf),
    ('Logistic Regression', gs_logmodel),
    ('GradientBoostingClassifier', gs_gb),
]
model_names = [label for label, _ in named_models]
models = [estimator_obj for _, estimator_obj in named_models]

In [None]:
def compare_models(model, refit=True):
    """Score `model` on the test split and record its accuracy.

    Uses the notebook-level `x_train`, `y_train`, `x_test` and `y_test`, and
    appends the resulting accuracy to the notebook-level list `acc`.

    Parameters
    ----------
    model : estimator
        Object exposing `fit` and `predict`.
    refit : bool, default True
        When True (the original behaviour) the model is fitted on the training
        split before predicting. Pass False for a model that is already
        fitted; for a GridSearchCV object, calling `fit` again repeats the
        whole search.

    Returns
    -------
    float
        Accuracy on the test split (the same value that was appended to `acc`).
    """
    if refit:
        model.fit(x_train, y_train)
    pred = model.predict(x_test)

    # scikit-learn metrics take (y_true, y_pred). Accuracy happens to be
    # symmetric, but precision/recall/AUC are not, so the order is kept correct
    # here for whenever further metrics are added.
    score = accuracy_score(y_test, pred)
    acc.append(score)
    return score

In [None]:
# Empty result lists, one per metric, filled by the model comparison
acc, prec, rec, auroc = [], [], [], []

In [None]:
# Empty the list first so that re-running this cell cannot leave `acc` with
# more entries than there are models (which would break the results table).
acc.clear()
for model in models:
    compare_models(model)


In [None]:
# Results table: one row per model with its test accuracy
d = dict([('Modelling Algo', model_names), ('Accuracy', acc)])
met_df = pd.DataFrame(data=d)
met_df

* **GBT has better accuracy, but it's not enough**

**Todo - Label Encode Diagnosis column**