In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler


import optuna

import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset and explore

In [None]:
# Read the Parkinson's dataset CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/parkinsonsdataset/parkinsons.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print the column dtypes and non-null counts of the DataFrame.
# Author's observations from that output: there are no null values, and the
# number of entries is 195 (it is a tiny dataset).
df.info()

In [None]:
# The heatmap shows that many features are correlated with each other:
# some analysis can be done here to understand which are really useful and which can be dropped.
#
# Only the numeric columns are correlated. The string 'name' column would make
# DataFrame.corr() raise on pandas >= 2.0, where non-numeric columns are no
# longer skipped silently.
sns.heatmap(df.select_dtypes(include='number').corr());

In [None]:
# Show in detail which pairs of features are correlated with each other.

CORR_THRESHOLD = 0.6 # threshold can be changed here

# Identifier and target columns are not candidate features.
NON_FEATURE_COLUMNS = ('name', 'status')

# Correlate numeric columns only: the string 'name' column would make
# DataFrame.corr() raise on pandas >= 2.0.
corr_data = df.select_dtypes(include='number').corr()

feature_names = [col for col in corr_data.columns if col not in NON_FEATURE_COLUMNS]

# The correlation matrix is symmetric, so each unordered pair is visited once
# (earlier column first); no (b, a) duplicate of an (a, b) pair can be produced.
# NOTE: only correlations above +CORR_THRESHOLD are reported; strongly negative
# correlations are not listed (compare corr_data.abs() to include them).
highly_correlated_features = [
    (first, second)
    for position, first in enumerate(feature_names)
    for second in feature_names[position + 1:]
    if corr_data.loc[second, first] > CORR_THRESHOLD
]

highly_correlated_features

In [None]:
# this code can be enabled if you want to remove from the dataset the correlated features computed above

#features_to_be_removed = set()

#for feature_tuple in highly_correlated_features:
#   features_to_be_removed.add(feature_tuple[1])

#df.drop(features_to_be_removed,inplace=True, axis=1)

In [None]:
# Mutual information between each feature and the target.
from sklearn.feature_selection import mutual_info_classif

X = df.drop(['name','status'], axis=1)
y = df['status']

# The mutual-information estimate is randomized; pinning random_state keeps
# the scores (and therefore the plot) identical from one run to the next.
mutual_info = mutual_info_classif(X, y, random_state=0)

# Bar chart with one bar per feature.
from matplotlib.pyplot import figure

figure(figsize=(28, 6), dpi=80)
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them as
# positional arguments.
sns.barplot(x=list(X.columns), y=mutual_info);

In [None]:
# Draw one box plot per feature, grouped by 'status'. Some features show
# outliers: more analysis and trials can be done here.

skipped_columns = ('name', 'status')
for feature in df.columns:
    if feature not in skipped_columns:
        # A fresh figure per feature so the plots do not overwrite each other.
        plt.figure()
        sns.boxplot(data=df, x="status", y=feature)

# First run with a classifier

In [None]:
# Target is the 'status' column; features are every column except 'name' and 'status'.
y = df['status']
X = df.drop(columns=['name', 'status'])

In [None]:
# Scale the features with RobustScaler.
from sklearn.preprocessing import RobustScaler

robustScaler = RobustScaler()
# transform() returns a new array and leaves its input untouched, so the
# result has to be stored; otherwise every later cell keeps using the unscaled
# data. Wrapping it in a DataFrame preserves the column names and row index.
# NOTE: the scaler is fitted on the whole dataset, before the train/test split,
# so test rows contribute to the scaling statistics.
X = pd.DataFrame(robustScaler.fit_transform(X), columns=X.columns, index=X.index)
X.head()

In [None]:
# Shuffled 70/30 train/test split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.3, shuffle=True, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Fit a decision tree with default hyperparameters (fixed seed) and print
# its (train, test) scores.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)
print((train_score, test_score))

In [None]:
# Confusion matrix on the training set, normalized over the true labels.
# NOTE: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2 (its replacement is ConfusionMatrixDisplay.from_estimator).
plot_confusion_matrix(
    classifier,
    X_train,
    y_train,
    display_labels=["Healthy", "Parkinson"],
    normalize='true',
    cmap='Blues',
);

In [None]:
# Confusion matrix on the test set, normalized over the true labels.
#
# The result is quite good and the confusion matrix is balanced.
# Taking "Parkinson" as the positive class (P): since this is an illness, it is
# preferable that the algorithm produced more false positives (FP: a healthy
# subject predicted as "Parkinson") than false negatives (FN: a Parkinson
# subject predicted as "Healthy").
plot_confusion_matrix(
    classifier,
    X_test,
    y_test,
    display_labels=["Healthy", "Parkinson"],
    normalize='true',
    cmap='Blues',
);

# Use Optuna to optimize hyperparameters

In [None]:
def objective(trial):
    """Optuna objective: mean 10-fold cross-validation score of a decision tree.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyperparameters are sampled.

    Returns
    -------
    float
        Mean of the cross-validation scores; the study maximizes this value.
    """
    # Single-choice parameter, kept so the classifier type is still recorded
    # among the trial's parameters; its value is not needed here.
    trial.suggest_categorical("classifier", ["DecisionTree"])
    c_criterion = trial.suggest_categorical("criterion", ["gini","entropy"])
    c_splitter = trial.suggest_categorical("splitter", ["best","random"])
    c_max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
    c_min_samples_split = trial.suggest_int("min_samples_split", 2, 20, log=True)
    classifier_obj = DecisionTreeClassifier(criterion=c_criterion, splitter=c_splitter,
        max_depth=c_max_depth, min_samples_split = c_min_samples_split, random_state=0)

    # Tune on the training split only: cross-validating on the full X, y would
    # let the held-out test rows steer the hyperparameter choice and make the
    # later test-set score optimistic.
    scores = cross_val_score(classifier_obj, X_train, y_train, n_jobs=-1, cv=10)
    return scores.mean()


if __name__ == "__main__":
    # Seed the sampler so the search proposes the same sequence of trials on
    # every run; TPESampler is Optuna's default sampler, here with a fixed seed.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=0),
    )
    study.optimize(objective, n_trials=100)
    print(study.best_trial)

In [None]:
# Show the best hyperparameters found by the study
study.best_params

# Try again with hyperparameters found with Optuna

In [None]:
# Refit the decision tree with the best hyperparameters found by Optuna and
# print its (train, test) scores.
best_params = study.best_params
# Pick only the keys the tree accepts; the study also records a 'classifier' entry.
tuned_kwargs = {
    key: best_params[key]
    for key in ('criterion', 'splitter', 'max_depth', 'min_samples_split')
}
classifier = DecisionTreeClassifier(random_state=0, **tuned_kwargs)
classifier.fit(X_train, y_train)
print((classifier.score(X_train, y_train), classifier.score(X_test, y_test)))

In [None]:
# Best result produced by Optuna's study on the author's PC, hard-coded here.
# (Skip this cell to plot the confusion matrices with the parameters found above.)
reference_params = dict(
    criterion='entropy',
    splitter='best',
    max_depth=28,
    min_samples_split=2,
)
classifier = DecisionTreeClassifier(random_state=0, **reference_params)
classifier.fit(X_train, y_train)
reference_scores = (classifier.score(X_train, y_train), classifier.score(X_test, y_test))
print(reference_scores)

In [None]:
# Confusion matrix on the training set again, for the current classifier,
# normalized over the true labels.
plot_confusion_matrix(
    classifier,
    X_train,
    y_train,
    display_labels=["Healthy", "Parkinson"],
    normalize='true',
    cmap='Blues',
);

In [None]:
# Confusion matrix on the test set again, normalized over the true labels.
# Author's observation: results are slightly better than in the original run.
plot_confusion_matrix(
    classifier,
    X_test,
    y_test,
    display_labels=["Healthy", "Parkinson"],
    normalize='true',
    cmap='Blues',
);