**The dataset** is collected from UCI Machine Learning Repository through the following [link](https://archive.ics.uci.edu/ml/datasets/Parkinson%27s+Disease+Classification#)

extract data with its default name `pd_speech_features.csv` in `__data__` directory

In [33]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint

In [34]:
df = pd.read_csv('./__data__/pd_speech_features.csv')
df.head()

Unnamed: 0,id,gender,PPE,DFA,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1,0.85247,0.71826,0.57227,240,239,0.008064,8.7e-05,0.00218,...,1.562,2.6445,3.8686,4.2105,5.1221,4.4625,2.6202,3.0004,18.9405,1
1,0,1,0.76686,0.69481,0.53966,234,233,0.008258,7.3e-05,0.00195,...,1.5589,3.6107,23.5155,14.1962,11.0261,9.5082,6.5245,6.3431,45.178,1
2,0,1,0.85083,0.67604,0.58982,232,231,0.00834,6e-05,0.00176,...,1.5643,2.3308,9.4959,10.7458,11.0177,4.8066,2.9199,3.1495,4.7666,1
3,1,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,...,3.7805,3.5664,5.2558,14.0403,4.2235,4.6857,4.846,6.265,4.0603,1
4,1,0,0.3279,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,...,6.1727,5.8416,6.0805,5.7621,7.7817,11.6891,8.2103,5.0559,6.1164,1


In [35]:
def reset_random_seed(seed=1917):
    np.random.seed(seed)

In [36]:
X = df.copy()
y = X.pop('class')
ids = X.pop('id')

In [37]:
# Convert to panda dataframes to numpy nd-arrays
X = X.to_numpy()
y = y.values
ids = ids.values
unique_id = np.unique(ids)

## Fit & inform

In [38]:
from collections import defaultdict 
from scipy.stats import mode
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    matthews_corrcoef
)

def fit_inform(model, model_data, metrics=None):
    if (metrics is None): metrics = defaultdict(list)
    (X_train, X_test, y_train, y_test) = model_data

    # fit models
    model.fit(X_train, y_train)
    # predict
    y_pred = model.predict(X_test)
    # majority vote
    test_index_ids = ids[test_index]
    for test_id in unique_test_ids:
        y_indexs = test_index_ids == test_id 
        y_pred[y_indexs] = mode(y_pred[y_indexs]).mode[0]

    y_true = y_test

    metrics["accuracy"].append(accuracy_score(y_true, y_pred))
    metrics["precision"].append(precision_score(y_true, y_pred))
    metrics["recall"].append(recall_score(y_true, y_pred))
    metrics["fMeasure"].append(f1_score(y_true, y_pred))
    metrics["mcc"].append(matthews_corrcoef(y_true, y_pred))


    for metric in metrics:
        cur_metric = metrics[metric]  
        if metric == 'accuracy':
            metrics[metric] = f"N({np.mean(cur_metric):.3}, {np.std(cur_metric):.2})"
        else:
            metrics[metric] = f"{np.mean(cur_metric):.3}"
            
    return metrics

In [39]:
def get_model_name(model):
    return type(model).__name__
def get_hyper_parameter(model):
    return { key:value for (key, value) in model.get_params().items() if value }

def add_extra_metrics(metrics, model, pca):
    metrics[get_model_name(model)] = get_hyper_parameter(model)
    metrics['PCA'] = get_hyper_parameter(pca)
    return metrics

## Model Selection

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

unique_train_ids, unique_test_ids = train_test_split(unique_id, test_size=0.3)

# same person same predict
train_index = np.isin(ids, unique_train_ids)
test_index = np.isin(ids, unique_test_ids)

# test and train data
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# PCA min(n_samples, n_features)=528
pca = PCA(n_components=100)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

model_data = (X_train, X_test, y_train, y_test)

In [41]:
""" Decision Tree """
from sklearn.tree import DecisionTreeClassifier
reset_random_seed()

model = DecisionTreeClassifier(min_weight_fraction_leaf=0.1, criterion="entropy", min_samples_split=5)
metrics = fit_inform(model, model_data)
metrics = add_extra_metrics(metrics, model, pca)
pprint(metrics)

defaultdict(<class 'list'>,
            {'DecisionTreeClassifier': {'criterion': 'entropy',
                                        'min_samples_leaf': 1,
                                        'min_samples_split': 5,
                                        'min_weight_fraction_leaf': 0.1,
                                        'presort': 'deprecated',
                                        'splitter': 'best'},
             'PCA': {'copy': True,
                     'iterated_power': 'auto',
                     'n_components': 100,
                     'svd_solver': 'auto'},
             'accuracy': 'N(0.763, 0.0)',
             'fMeasure': '0.85',
             'mcc': '0.331',
             'precision': '0.785',
             'recall': '0.927'})


In [10]:
""" RandomForestClassifier """
from sklearn.ensemble import RandomForestClassifier
reset_random_seed()
model = RandomForestClassifier(bootstrap=False, n_estimators=90)
metrics = fit_inform(model, model_data)
metrics = add_extra_metrics(metrics, model, pca)
pprint(metrics)

defaultdict(<class 'list'>,
            {'PCA': {'copy': True,
                     'iterated_power': 'auto',
                     'n_components': 100,
                     'svd_solver': 'auto'},
             'RandomForestClassifier': {'criterion': 'gini',
                                        'max_features': 'auto',
                                        'min_samples_leaf': 1,
                                        'min_samples_split': 2,
                                        'n_estimators': 90},
             'accuracy': 'N(0.776, 0.0)',
             'fMeasure': '0.866',
             'mcc': '0.227',
             'precision': '0.809',
             'recall': '0.932'})


In [11]:
""" XGBoost (GradientBoostingClassifier) """
from sklearn.ensemble import GradientBoostingClassifier
reset_random_seed()
model = GradientBoostingClassifier(n_estimators=105)
metrics = fit_inform(model, model_data)
metrics = add_extra_metrics(metrics, model, pca)
pprint(metrics)

defaultdict(<class 'list'>,
            {'GradientBoostingClassifier': {'criterion': 'friedman_mse',
                                            'learning_rate': 0.1,
                                            'loss': 'deviance',
                                            'max_depth': 3,
                                            'min_samples_leaf': 1,
                                            'min_samples_split': 2,
                                            'n_estimators': 105,
                                            'presort': 'deprecated',
                                            'subsample': 1.0,
                                            'tol': 0.0001,
                                            'validation_fraction': 0.1},
             'PCA': {'copy': True,
                     'iterated_power': 'auto',
                     'n_components': 100,
                     'svd_solver': 'auto'},
             'accuracy': 'N(0.842, 0.0)',
             'fMeasure': '0.9',
    

In [12]:
""" SVM """
from sklearn.svm import SVC as SVM
reset_random_seed()
model = SVM(kernel="poly", degree=1)
metrics = fit_inform(model, model_data)
metrics = add_extra_metrics(metrics, model, pca)
pprint(metrics['accuracy'])

'N(0.776, 0.0)'


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


### Metrics

In [29]:
from sklearn.model_selection import KFold

def cross_validation(model, X, y, k=5):
    metrics = defaultdict(list)
    kf = KFold(n_splits=k)
    
    for unique_train_ids, unique_test_ids in kf.split(unique_id):
        # same person same predict
        train_index = np.isin(ids, unique_train_ids)
        test_index = np.isin(ids, unique_test_ids)

        # test and train data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # PCA min(n_samples, n_features)=528
        pca = PCA(n_components=100)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
        
        # fit models
        model.fit(X_train, y_train)
        # predict
        y_pred = model.predict(X_test)
        
        # majority vote
        test_index_ids = ids[test_index]
        for test_id in unique_test_ids:
            y_indexs = test_index_ids == test_id 
            y_pred[y_indexs] = mode(y_pred[y_indexs]).mode[0]

        y_true = y_test

        metrics["accuracy"].append(accuracy_score(y_true, y_pred))
        metrics["precision"].append(precision_score(y_true, y_pred))
        metrics["recall"].append(recall_score(y_true, y_pred))
        metrics["fMeasure"].append(f1_score(y_true, y_pred))
        metrics["mcc"].append(matthews_corrcoef(y_true, y_pred))


    for metric in metrics:
        cur_metric = metrics[metric]  
        if metric == 'accuracy':
            metrics[metric] = f"N({np.mean(cur_metric):.3}, {np.std(cur_metric):.2})"
        else:
            metrics[metric] = f"{np.mean(cur_metric):.3}"

    return metrics

In [58]:
reset_random_seed()
model = DecisionTreeClassifier(min_weight_fraction_leaf=0.1, criterion="entropy", min_samples_split=5)
pprint({'DecisionTreeClassifier': cross_validation(model, X, y)})

reset_random_seed()
model = SVM(kernel="poly", degree=1)
pprint({'SVM': cross_validation(model, X, y)})

reset_random_seed()
model = GradientBoostingClassifier(subsample=0.84, n_estimators=125, min_samples_split=20, max_features='log2')
pprint({'GradientBoostingClassifier': cross_validation(model, X, y)['accuracy']})

reset_random_seed()
model = RandomForestClassifier(bootstrap=False, n_estimators=90)
pprint({'RandomForestClassifier': cross_validation(model, X, y)})

{'DecisionTreeClassifier': defaultdict(<class 'list'>,
                                       {'accuracy': 'N(0.742, 0.088)',
                                        'fMeasure': '0.838',
                                        'mcc': '0.208',
                                        'precision': '0.773',
                                        'recall': '0.916'})}


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


{'SVM': defaultdict(<class 'list'>,
                    {'accuracy': 'N(0.746, 0.062)',
                     'fMeasure': '0.853',
                     'mcc': '0.0',
                     'precision': '0.746',
                     'recall': '1.0'})}
{'GradientBoostingClassifier': 'N(0.806, 0.029)'}
{'RandomForestClassifier': defaultdict(<class 'list'>,
                                       {'accuracy': 'N(0.794, 0.054)',
                                        'fMeasure': '0.872',
                                        'mcc': '0.371',
                                        'precision': '0.803',
                                        'recall': '0.956'})}


|method          | accuracy | f-measure | percision | recall | MCC |
|:------         |:--------:|:---------:|:---------:|:------:|:---:|
|SVM             | 0.732    | 0.841     | 0.751     | 0.959  |  -  |
|SVC          (T)| 0.746    | 0.853     | 0.746     | 1.0    |0.0  |
|Decision tree   | 0.720    | 0.815     | 0.808     | 0.828  |  -  |
|Decision tree(T)| 0.791    | 0.866     | 0.816     | 0.924  |0.358|
|Random Forest   | 0.832    | 0.892     | 0.842     | 0.951  |  -  |
|Random Forest(T)| 0.839    | 0.897     | 0.847     | 0.956  |0.521|
|XGBoost         | 0.841    | 0.896     | 0.857     | 0.939  |  -  |
|XGBoost      (T)|**0.86**  | 0.909     | 0.862     | 0.963  |0.598|
|[paper][link]   | 0.86     | 0.84      |   -       |    -   | 0.59|
|SVC          (PT)| 0.746    | 0.853     | 0.746     | 1.0    |0.0  |
|Decision tree(PT)| 0.746    | 0.84     | 0.776     | 0.916  |0.224|
|Random Forest(PT)| 0.781    | 0.866     | 0.790     | 0.961  |0.322|
|XGBoost      (PT)| 0.802    | 0.875     | 0.828     | 0.932  |0.39|

> (T) means hyper parameter tuned in this version (5-fold)   
> (PT) paper method with hyper parameter tuned (person aggregation + majority vote + 5-fold)


[link]: https://www.sciencedirect.com/science/article/abs/pii/S1568494618305799?via%3Dihub

## Old Method with correlation removal


### Preprocessing Data
- Remove nearly the same data
pearson correlation provided by [pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.corr.html) is used

<p align="cetner">
    <img src="https://github.com/amirHossein-Ebrahimi/leaf-node/raw/671c396db2f81a2352941ec572dd45b04252b7da/applications/doc/images/correlation.jpg">
</p>

<sub>For high quality image, visit [link](https://render.githubusercontent.com/view/kaggle_corr)</sub>

In [None]:
%matplotlib inline
app_LT_entropy_logs = [att for att in X.columns.values if att.startswith('app_LT_entropy_log')]
sns.heatmap(X[app_LT_entropy_logs].corr(), annot=False)

selected_feature = app_LT_entropy_logs[:5]
X['app_LT_entropy_logs[:5]'] = X[selected_feature].mean(numeric_only=True, axis=1)
X.drop(selected_feature, axis=1, inplace=True)

selected_feature = app_LT_entropy_logs[5:7]
X['app_LT_entropy_logs[5:7]'] = X[selected_feature].mean(numeric_only=True, axis=1)
X.drop(selected_feature, axis=1, inplace=True)

selected_feature = app_LT_entropy_logs[7:]
X['app_LT_entropy_logs[7:]'] = X[selected_feature].mean(numeric_only=True, axis=1)
X.drop(selected_feature, axis=1, inplace=True)

category = 'app_det_TKEO_mean'
selected_feature = [att for att in X.columns.values if att.startswith(category)]
selected_feature = selected_feature[3:]
X[f'{category}[3:]'] = X[selected_feature].mean(numeric_only=True, axis=1)
X.drop(selected_feature, axis=1, inplace=True)

category = 'app_TKEO_std'
selected_feature = [att for att in X.columns.values if att.startswith(category)]
selected_feature = selected_feature[3:]
X[f'{category}[3:]'] = X[selected_feature].mean(numeric_only=True, axis=1)
X.drop(selected_feature, axis=1, inplace=True)

category = 'app_LT_TKEO_mean'
selected_feature = [att for att in X.columns.values if att.startswith(category)]
selected_feature = selected_feature[4:]
X[f'{category}[4:]'] = X[selected_feature].mean(numeric_only=True, axis=1)
X.drop(selected_feature, axis=1, inplace=True)

category = 'app_LT_TKEO_std'
selected_feature = [att for att in X.columns.values if att.startswith(category)]
X[f'{category}[3:6]'] = X[selected_feature[3:6]].mean(numeric_only=True, axis=1)
X[f'{category}[6:]'] = X[selected_feature[6:]].mean(numeric_only=True, axis=1)
X.drop(selected_feature[3:], axis=1, inplace=True)