In [None]:
# IMPORT LIBRARIES
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, roc_auc_score

import statsmodels

from yellowbrick.classifier import ClassificationReport, ROCAUC

# Notebook-wide display configuration.
plt.style.use('ggplot')  # ggplot look for every matplotlib figure
pd.options.display.float_format = '{:,.2f}'.format  # floats: thousands separator, 2 decimals

# `display` and `HTML` belong to the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the notebook page container to 98% of the browser width.
display(HTML("<style>.container { width:98% !important; }</style>"))

In [None]:
# Load the raw dataset (comma-separated, which is read_csv's default delimiter).
df = pd.read_csv('parkinsons.data')

In [None]:
# Preview the first 10 rows of the raw frame.
df.head(10)

In [None]:
# Summary of the raw frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Automated profiling report of the raw frame.
# NOTE(review): the `profile_report` accessor is presumably registered by the
# `pandas_profiling` import at the top of the notebook - confirm.
df.profile_report()

In [None]:
# Class balance of the target column. The column is passed by keyword because
# seaborn >= 0.12 no longer treats a positional data vector as `x`.
sns.countplot(x='status', data=df);

In [None]:
# Correlation heatmap of the numeric columns (target `status` included).
plt.figure(figsize=(50,20))

# The raw frame still contains the `name` column here (presumably a text
# identifier - confirm). Restricting to numeric dtypes keeps `corr()` from
# failing on it; pandas >= 2.0 no longer drops non-numeric columns silently.
numeric_corr = df.select_dtypes(include='number').corr()

sns.heatmap(numeric_corr,
            annot=True,
            linewidths=.5,
            center=0,
            cbar=True,
            cmap="YlGnBu")

plt.show()

Perform basic data pre-processing (if needed), univariate and bivariate analysis. Use relevant visualizations to understand the features at hand. Which features are strongly correlated to the target variable? - 15

# - spread1, spread2 and PPE are strongly correlated with the target variable `status`

In [None]:
# Number of rows for each level of the target variable `status`
df['status'].value_counts()

In [None]:
# Pairplot of every feature, coloured by class.
# `name` is dropped only for the plot: `df` is neither re-read from disk nor
# overwritten, so it keeps all of its original columns for later cells.
sns.pairplot(df.drop(columns='name'), hue='status');

In [None]:
# Modelling frame without the `name` column. `errors='ignore'` keeps this cell
# runnable (and re-runnable) even when `name` is no longer present in `df`.
df_revised = df.drop(columns=['name'], errors='ignore')

In [None]:
# First rows of the modelling frame.
df_revised.head()

In [None]:
# Columns, non-null counts and dtypes of the modelling frame.
df_revised.info()

# The pairplot below clearly shows the high correlation between several features and `status`

In [None]:
# Pairwise plots for every column of the modelling frame (no class colouring).
sns.pairplot(df_revised)

# The joint plots below clearly show how patients with Parkinson's (status = 1) and without Parkinson's (status = 0) are distributed across different features

In [None]:
# One joint plot of `status` against each selected feature, drawn in a loop so
# the feature list below is the only thing to edit. Each feature appears once.
jointplot_features = [
    "MDVP:Fo(Hz)",
    "HNR",
    "spread1",
    "spread2",
    "PPE",
    "MDVP:Jitter(%)",
    "MDVP:PPQ",
    "MDVP:Shimmer",
    "MDVP:Shimmer(dB)",
    "Shimmer:APQ3",
    "Shimmer:APQ5",
    "MDVP:APQ",
    "Shimmer:DDA",
]

for feature in jointplot_features:
    sns.jointplot(x=feature, y="status", data=df_revised)
    plt.show()  # render each figure before the next one is created

In [None]:
# Hold out 30% of the rows as a test set; the split is seeded so it is repeatable.
from sklearn.model_selection import train_test_split

# Every column except the target is used as a predictor.
features = df_revised.columns.drop('status').tolist()

X_train, X_test, y_train, y_test = train_test_split(
    df_revised[features],
    df_revised['status'],
    test_size=.3,
    random_state=22,
)
X_train.shape, X_test.shape

In [None]:
# Shape of the held-out test features.
X_test.shape

In [None]:
# Unpruned decision tree that chooses splits with the 'entropy' criterion
# (the other option could be the gini index).
# `random_state` is fixed so the fitted tree, and every score derived from it,
# is the same on each run of the notebook.
model_entropy = DecisionTreeClassifier(criterion='entropy', random_state=1)

In [None]:
# Train the unpruned tree on the training split.
model_entropy.fit(X_train, y_train)

In [None]:
# ROC AUC ranks samples by a continuous score, so it is computed from the
# predicted probability of the positive class (status == 1, column 1 of
# predict_proba) instead of the hard 0/1 labels. The `multi_class` / `average`
# arguments are omitted because they do not apply to a binary target.
print("Train AUC: %.2f" % roc_auc_score(y_train, model_entropy.predict_proba(X_train)[:, 1]))  # performance on train data
print("Test AUC: %.2f" % roc_auc_score(y_test, model_entropy.predict_proba(X_test)[:, 1]))  # performance on test data

In [None]:
# The unpruned decision tree is overfitting; to resolve this, the tree is
# pruned to a maximum depth of 3.
# `random_state` is fixed so the pruned tree is reproducible across runs.
clf_pruned = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1)
clf_pruned.fit(X_train, y_train)

In [None]:
# ROC AUC of the pruned tree, computed from the predicted probability of the
# positive class (status == 1, column 1 of predict_proba) rather than from hard
# labels. `multi_class` / `average` are omitted: they do not apply to a binary target.
print("Train AUC: %.2f" % roc_auc_score(y_train, clf_pruned.predict_proba(X_train)[:, 1]))  # performance on train data
print("Test AUC: %.2f" % roc_auc_score(y_test, clf_pruned.predict_proba(X_test)[:, 1]))  # performance on test data

In [None]:
# Class counts in the training split.
y_train.value_counts()

In [None]:
# `sklearn.externals.six` has been removed from scikit-learn, so importing
# StringIO from it raises ImportError; the standard-library class is the
# drop-in replacement.
from io import StringIO

from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus
import graphviz

In [None]:
# Draw the pruned tree with Graphviz.
# `out_file=None` makes export_graphviz return the DOT source as a string, so no
# intermediate text buffer is needed.
dot_data = export_graphviz(clf_pruned, out_file=None,
                           filled=True, rounded=True,
                           special_characters=True,
                           feature_names=features, class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data)

# Render once to disk and display that file inline (instead of rendering twice).
# NOTE(review): the file name mentions "wines" although the data set is
# parkinsons.data - looks like a copy-paste leftover; rename if nothing depends on it.
graph.write_png('wines_pruned.png')
Image(filename='wines_pruned.png')

In [None]:
# Hard class predictions of the pruned tree on both splits.
preds_train = clf_pruned.predict(X_train)
preds_test = clf_pruned.predict(X_test)

# ROC AUC needs a continuous score: use the predicted probability of the
# positive class (status == 1, column 1 of predict_proba), not the hard labels.
# `multi_class` / `average` are omitted because the target is binary.
AUC_DT_train = roc_auc_score(y_train, clf_pruned.predict_proba(X_train)[:, 1])
AUC_DT_test = roc_auc_score(y_test, clf_pruned.predict_proba(X_test)[:, 1])

In [None]:
# Confusion matrix on the test split: actual classes in rows, predicted classes in columns
pd.crosstab(
    index=y_test,
    columns=preds_test,
    rownames=['Actual'],
    colnames=['Predicted'],
)

In [None]:
# Visualize model performance with the yellowbrick library.
# Each visualizer wraps and fits its own depth-3 entropy tree. Both trees are
# seeded so the two charts describe the same tree and stay stable across runs.
viz = ClassificationReport(DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()

roc = ROCAUC(DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1))
roc.fit(X_train, y_train)
roc.score(X_test, y_test)
roc.show()

In [None]:
## Feature importance of the pruned tree

# Un-normalised importances taken directly from the underlying tree object.
# NOTE(review): this variable is not used by any cell visible here.
feat_importance = clf_pruned.tree_.compute_feature_importances(normalize=False)

# Importances reported by the estimator, keyed by feature name.
feat_imp_dict = {
    feature_name: importance
    for feature_name, importance in zip(features, clf_pruned.feature_importances_)
}
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index', columns=['importance'])

# Most important features first.
feat_imp.sort_values('importance', ascending=False)

In [None]:
# Collect the train/test AUC of each model in one frame for the final comparison
resultsDf = pd.DataFrame(
    {
        'Method': ['Decision Tree'],
        'AUC Train': [AUC_DT_train],
        'AUC Test': [AUC_DT_test],
    }
)
resultsDf

In [None]:
# Function to evaluate the performance of the model
def _positive_class_scores(model, X):
    """Return a continuous positive-class score for X, best available source first."""
    if hasattr(model, "predict_proba"):
        # Column 1 is the probability of the larger class label (status == 1).
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    # Last resort for estimators exposing only hard labels.
    return model.predict(X)


def evaluate_model(model):
    """Compute the ROC AUC of a fitted binary classifier on both splits.

    The score is computed on the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` splits. It uses a continuous positive-class score
    (probability or decision value) rather than hard 0/1 predictions, because
    ROC AUC measures how well the model ranks samples.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict_proba``, ``decision_function`` or ``predict``.

    Returns
    -------
    tuple
        ``(auc_train, auc_test)``.
    """
    auc_train = roc_auc_score(y_train, _positive_class_scores(model, X_train))
    auc_test = roc_auc_score(y_test, _positive_class_scores(model, X_test))

    return auc_train, auc_test


# Function to visualize the models
def visClassifierResults(model_w_parameters):
    """Show yellowbrick's classification report, then its ROC/AUC chart.

    Each visualizer wraps ``model_w_parameters``, is fitted on the notebook-level
    training split, scored on the test split and then rendered.
    """
    for visualizer_cls in (ClassificationReport, ROCAUC):
        visualizer = visualizer_cls(model_w_parameters)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.show()

In [None]:
# Reminder of the modelling frame's layout before fitting the ensemble models.
df_revised.head()

In [None]:
# Random forest: 50 trees, seeded
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, random_state=1)
rf.fit(X_train, y_train)

# (train AUC, test AUC) for the fitted forest
scores = evaluate_model(rf)

# Add the model as row 1 of the comparison table
resultsDf.loc[1] = ['Random Forest', *scores]
resultsDf

In [None]:
# Classification report and ROC/AUC chart for the random forest.
visClassifierResults(rf)

In [None]:
# Adaptive boosting: 50 estimators, learning rate 0.1, seeded
from sklearn.ensemble import AdaBoostClassifier

abcl = AdaBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=1)
abcl.fit(X_train, y_train)

# (train AUC, test AUC) for the fitted booster
scores = evaluate_model(abcl)

# Add the model as row 2 of the comparison table
resultsDf.loc[2] = ['AdaBoost', *scores]
resultsDf

In [None]:
# Classification report and ROC/AUC chart for AdaBoost.
visClassifierResults(abcl)

In [None]:
# Bagging: 50 estimators, each trained on 70% of the training rows, seeded
from sklearn.ensemble import BaggingClassifier

bgcl = BaggingClassifier(n_estimators=50, max_samples=0.7, random_state=1)
bgcl.fit(X_train, y_train)

# (train AUC, test AUC) for the fitted ensemble
scores = evaluate_model(bgcl)

# Add the model as row 3 of the comparison table
resultsDf.loc[3] = ['Bagging', *scores]
resultsDf

In [None]:
# Classification report and ROC/AUC chart for the bagging classifier.
visClassifierResults(bgcl)

In [None]:
# Gradient boosting: 50 estimators, learning rate 0.1, seeded
from sklearn.ensemble import GradientBoostingClassifier

gbcl = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=1)
gbcl.fit(X_train, y_train)

# (train AUC, test AUC) for the fitted booster
scores = evaluate_model(gbcl)

# Add the model as row 4 of the comparison table
resultsDf.loc[4] = ['Gradient Boosting', *scores]
resultsDf

In [None]:
# Classification report and ROC/AUC chart for gradient boosting.
visClassifierResults(gbcl)