In this example, we will create an ML algorithm to detect Parkinson's disease. The data for this project has been taken from the <a href="https://archive.ics.uci.edu/dataset/174/parkinsons" target="_blank">UCI ML Repository</a>.
This will be done in two different ways:
1) using XGBClassifier from the xgboost library
2) using XGB1, XGB2, GLM, EBM from the PiML (Python Interpretable ML) toolbox

Details about the PiML library can be found  <a href="https://github.com/SelfExplainML/PiML-Toolbox" target="_blank">here</a>.

### Part I

In [1]:
# Uncomment the next line if you have not installed xgboost yet
# %pip install xgboost


In [2]:
import os, sys
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier


In [3]:
# Read the Parkinson's dataset from a file in the current working directory.
df = pd.read_csv(filepath_or_buffer="parkinsons.data")

# Show the first five rows as a sanity check; df.describe() gives summary
# statistics instead.
df.head(n=5)

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [4]:
# Target vector: the `status` column, which the classifiers are trained to predict.
y = df["status"].to_numpy()

# Feature matrix: every column except the label and `name`. `name` looks like a
# pseudo name for the patient and is therefore not a feature.
# Both columns are dropped by name (rather than slicing off "the first column"
# by position) so the selection stays correct if the column order changes, and
# only numeric columns remain, so X is a float array instead of an object array.
X = df.drop(columns=["name", "status"]).to_numpy(dtype=float)

In [5]:
# Print the number of occurrences of each label: count of 0s, then count of 1s.
print(np.count_nonzero(y == 0), np.count_nonzero(y == 1))

48 147


In [6]:
# Map every feature into the range [-1, 1].
# NOTE(review): the scaler is fitted on all of X, i.e. before the train/test
# split in the next cell, so test rows contribute to the fitted min/max.
# Consider fitting on the training rows only.
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit(X).transform(X)

# Alternative kept for reference (disabled):
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the scaled samples for testing; random_state is fixed so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.25,
    random_state=1,
)

In [8]:
# Train an XGBoost classifier on the training split, leaving every
# hyperparameter at its default value.
model = XGBClassifier()
model.fit(X=X_train, y=y_train)

In [9]:
# Predict labels for the held-out test samples with the fitted classifier.
y_pred = model.predict(X_test)

In [10]:
# Score the predictions on the test split: accuracy expressed as a percentage,
# and the mean squared error between the true and predicted labels.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
mse = mean_squared_error(y_test, y_pred)

print("Accuracy Score: ", accuracy_pct)
print("MSE: ", mse)

Accuracy Score:  93.87755102040816
MSE:  0.061224489795918366


### Part II
__Let us use PiML (Python interpretable ML)__
- Provides interpretability for both model development and model diagnostics


In [11]:
# Uncomment the next line if you have not installed PiML yet
# %pip install piml
from piml import Experiment

In [12]:
# Start a PiML experiment and load the `df` dataframe into it, leaving out the
# `name` column so only the measurement columns and `status` are passed in.
exp = Experiment()
exp.data_loader(data=df.drop(columns="name"))

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,119.992,157.302,74.997,0.00784,0.00007,0.00370,0.00554,0.01109,0.04374,0.426,0.02182,0.03130,0.02971,0.06545,0.02211,21.033,1.0,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,122.400,148.650,113.819,0.00968,0.00008,0.00465,0.00696,0.01394,0.06134,0.626,0.03134,0.04518,0.04368,0.09403,0.01929,19.085,1.0,0.458359,0.819521,-4.075192,0.335590,2.486855,0.368674
2,116.682,131.111,111.555,0.01050,0.00009,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,0.03590,0.08270,0.01309,20.651,1.0,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,116.676,137.871,111.366,0.00997,0.00009,0.00502,0.00698,0.01505,0.05492,0.517,0.02924,0.04005,0.03772,0.08771,0.01353,20.644,1.0,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,0.03490,0.04825,0.04465,0.10470,0.01767,19.649,1.0,0.417356,0.823484,-3.747787,0.234513,2.332180,0.410335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,174.188,230.978,94.261,0.00459,0.00003,0.00263,0.00259,0.00790,0.04087,0.405,0.02336,0.02498,0.02745,0.07008,0.02764,19.517,0.0,0.448439,0.657899,-6.538586,0.121952,2.657476,0.133050
191,209.516,253.017,89.488,0.00564,0.00003,0.00331,0.00292,0.00994,0.02751,0.263,0.01604,0.01657,0.01879,0.04812,0.01810,19.147,0.0,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895
192,174.688,240.005,74.287,0.01360,0.00008,0.00624,0.00564,0.01873,0.02308,0.256,0.01268,0.01365,0.01667,0.03804,0.10715,17.883,0.0,0.407567,0.655683,-6.787197,0.158453,2.679772,0.131728
193,198.764,396.961,74.904,0.00740,0.00004,0.00370,0.00390,0.01109,0.02296,0.241,0.01265,0.01321,0.01588,0.03794,0.07223,19.020,0.0,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306


In [13]:
from piml.models import GLMClassifier, ExplainableBoostingClassifier, XGB1Classifier, XGB2Classifier

# Optional data-summary step, left disabled:
# exp.data_summary(silent=True)

# Declare `status` as the target of a classification task.
exp.data_prepare(
    target="status",
    task_type="classification",
    silent=True,
)


HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

__Feature Selection__
- Select the features most relevant to the response
- A large number of features increases the computational burden and the chance of overfitting
- Reducing the number of features can also make the model more interpretable

Feature selection by correlation:
- Compute the Pearson correlation coefficient (a value between -1 and 1) of each feature with the response
- Keep only those features whose correlation magnitude is greater than a chosen threshold (usually set to 0.1)

In [14]:
# Open PiML's feature-selection panel; the widget it renders offers
# 'Correlation', 'Distance Correlation' and further strategies as toggle buttons.
exp.feature_select()

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

HBox(children=(Output(), Output()))

VBox(children=(ToggleButtons(layout=Layout(width='100%'), options=('Correlation', 'Distance Correlation', 'Fea…

In [15]:
# Train the four PiML models in a fixed order, registering each one in the
# experiment under a short name. Each estimator is constructed with default
# settings immediately before it is trained.
piml_model_classes = {
    "GLM": GLMClassifier,
    "EBM": ExplainableBoostingClassifier,
    "XGB1": XGB1Classifier,
    "XGB2": XGB2Classifier,
}
for piml_name, piml_cls in piml_model_classes.items():
    exp.model_train(piml_cls(), name=piml_name)

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

In [16]:
# Open the model-interpretation panel; the model to inspect is picked from a
# dropdown listing the trained models (GLM, EBM, XGB1, XGB2).
exp.model_interpret()

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

VBox(children=(Dropdown(layout=Layout(width='20%'), options=('Select Model', 'GLM', 'EBM', 'XGB1', 'XGB2'), st…

In [17]:
# Examples of calling model_interpret with an explicit model and plot type
# instead of choosing them in the panel (left disabled):
# exp.model_interpret(model="GLM", show="glm_coef_plot", figsize=(5, 4))
# exp.model_interpret(model="EBM", show="local_fi", sample_id=105, centered=False, original_scale=True, figsize=(5, 4))


HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

In [17]:
# Open the model-diagnostics panel; the model to diagnose is picked from a
# dropdown listing the trained models (GLM, EBM, XGB1, XGB2).
exp.model_diagnose()

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

VBox(children=(Dropdown(layout=Layout(width='20%'), options=('Select Model', 'GLM', 'EBM', 'XGB1', 'XGB2'), st…

In [18]:
# Open the model-comparison panel; the models to compare are picked from
# dropdowns listing the trained models.
exp.model_compare()

HTML(value='\n        <style>\n\n        .left-label {\n            width: 30%;\n        }\n\n        .card-pa…

VBox(children=(HBox(children=(Dropdown(layout=Layout(width='30%'), options=('Select Model', 'GLM', 'EBM', 'XGB…