# PARKINSON'S DISEASE PREDICTION
#### This is a binary classification problem: we predict whether a patient has Parkinson's disease from audio/voice measurements.
##### https://www.kaggle.com/datasets/dipayanbiswas/parkinsons-disease-speech-signal-features

In [1]:
# Loading the dependencies
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Read the dataset from a file that must sit in the notebook's working directory.
data_path = 'parkinsons.data'
data = pd.read_csv(data_path)

In [4]:
data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [5]:
data.shape

(195, 24)

In [6]:
data.isnull().sum()

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

#### There are no null values in the dataframe.

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

In [8]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MDVP:Fo(Hz),195.0,154.228641,41.390065,88.333,117.572,148.79,182.769,260.105
MDVP:Fhi(Hz),195.0,197.104918,91.491548,102.145,134.8625,175.829,224.2055,592.03
MDVP:Flo(Hz),195.0,116.324631,43.521413,65.476,84.291,104.315,140.0185,239.17
MDVP:Jitter(%),195.0,0.00622,0.004848,0.00168,0.00346,0.00494,0.007365,0.03316
MDVP:Jitter(Abs),195.0,4.4e-05,3.5e-05,7e-06,2e-05,3e-05,6e-05,0.00026
MDVP:RAP,195.0,0.003306,0.002968,0.00068,0.00166,0.0025,0.003835,0.02144
MDVP:PPQ,195.0,0.003446,0.002759,0.00092,0.00186,0.00269,0.003955,0.01958
Jitter:DDP,195.0,0.00992,0.008903,0.00204,0.004985,0.00749,0.011505,0.06433
MDVP:Shimmer,195.0,0.029709,0.018857,0.00954,0.016505,0.02297,0.037885,0.11908
MDVP:Shimmer(dB),195.0,0.282251,0.194877,0.085,0.1485,0.221,0.35,1.302


## Attribute Information:

## Matrix column entries (attributes):
name - ASCII subject name and recording number
MDVP:Fo(Hz) - Average vocal fundamental frequency
MDVP:Fhi(Hz) - Maximum vocal fundamental frequency
MDVP:Flo(Hz) - Minimum vocal fundamental frequency
MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency
MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude
NHR,HNR - Two measures of ratio of noise to tonal components in the voice
status - Health status of the subject (one) - Parkinson's, (zero) - healthy
RPDE,D2 - Two nonlinear dynamical complexity measures
DFA - Signal fractal scaling exponent
spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation 

In [9]:
data.columns

Index(['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2', 'PPE'],
      dtype='object')

In [10]:
# The Status column is the target column.
data.status

0      1
1      1
2      1
3      1
4      1
      ..
190    0
191    0
192    0
193    0
194    0
Name: status, Length: 195, dtype: int64

#### The values indicate the health status of the subject: (1) - Parkinson's, (0) - healthy

# VISUALIZATION

In [None]:
# Class balance of the target column: one bar per `status` value.
fig, ax = plt.subplots(figsize=(10, 6))
data.status.hist(ax=ax)
ax.set_xlabel('status')
ax.set_ylabel('Frequencies')
# plt.show() renders the figure without echoing a return value into the cell output
# (an argument-less plt.plot() draws nothing and prints an empty list).
plt.show()
# The dataset has a high number of patients affected by Parkinson's disease.

[]

In [None]:
# NHR aggregated per status class, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=data, x="status", y="NHR", ax=ax);

In [None]:
# HNR aggregated per status class, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=data, x="status", y="HNR", ax=ax);

In [None]:
# RPDE aggregated per status class, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=data, x="status", y="RPDE", ax=ax);

### Distribution plot

In [None]:
# One density histogram (with KDE overlay) per numeric column.
# Columns are selected by dtype rather than by position, so the cell works whether or not the
# text column `name` is still present, and the grid is sized from the column count so that no
# column is silently left out.
col = data.select_dtypes(include="number").columns
cols = 6
rows = int(np.ceil(len(col) / cols))
fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(18, 3 * rows), squeeze=False)
flat_axes = ax.ravel()
for axis, column_name in zip(flat_axes, col):
    # histplot(kde=True, stat="density") is the maintained replacement for the deprecated distplot.
    sns.histplot(data[column_name], kde=True, stat="density", ax=axis)
# Hide the grid cells that have no column to show.
for axis in flat_axes[len(col):]:
    axis.set_visible(False)

plt.tight_layout()

In [None]:
# Remove the `name` column: it is the subject/recording identifier, not a voice measurement.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the cell safe to re-run:
# a second execution is a no-op instead of a KeyError.
data = data.drop(columns=['name'], errors='ignore')

In [None]:
# Lets rearrange the columns:

In [None]:
data.columns

In [None]:
# Put the target (`status`) last so features and label can be sliced by position afterwards.
feature_columns = [
    'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)',
    'MDVP:Jitter(%)', 'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
    'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5', 'MDVP:APQ', 'Shimmer:DDA',
    'NHR', 'HNR',
    'RPDE', 'DFA',
    'spread1', 'spread2', 'D2', 'PPE',
]
data = data.loc[:, feature_columns + ['status']]

In [None]:
data.head()

In [None]:
# Feature matrix: every column except the last one, as a NumPy array.
feature_frame = data.iloc[:, :-1]
x = feature_frame.to_numpy()
x.shape

In [None]:
# Target vector: the last column, as a NumPy array.
target_series = data.iloc[:, -1]
y = target_series.to_numpy()
y.shape

In [None]:
sc = StandardScaler()

In [None]:
# Standardise the feature matrix with the scaler created above, overwriting `x` with the result.
# NOTE(review): the scaler is fitted on every row, including rows that later end up in the test
# split, so `x` carries held-out statistics; fit on training rows only for unbiased evaluation.
x = sc.fit_transform(x)

### Splitting the data

In [None]:
# Split the *unscaled* features, then fit the scaler on the training rows only, so that the
# held-out rows never influence preprocessing (fitting the scaler on all rows before splitting
# leaks test-set statistics into training). The same random_state on the same number of rows
# selects the same rows for each split as splitting the pre-scaled matrix would.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    data.iloc[:, :-1].values, y, test_size=0.2, random_state=0
)
sc = StandardScaler()  # from here on `sc` holds training-set statistics only
x_train = sc.fit_transform(x_train_raw)
x_test = sc.transform(x_test_raw)

In [None]:
x_train.shape

In [None]:
x_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

# MODEL BUILDING

### We will use a for loop to fit and train all the algorithms.

In [None]:
# Candidate classifiers, keyed by display name, all with default hyperparameters.
# The tree-based models are randomised, so they get a fixed seed; otherwise their scores
# change on every run and the comparison below is not reproducible.
models = {
    "LogisticRegression" : LogisticRegression(),
    "DecisionTreeClassifier" : DecisionTreeClassifier(random_state=0),
    "RandomForestClassifier" : RandomForestClassifier(random_state=0),
    "KNeighborsClassifier"   : KNeighborsClassifier(),
    "SVC" : SVC()
}

In [None]:
# Fit every candidate on the training split and report accuracy, confusion matrices and the
# number of misclassified test rows.
for name, model in models.items():
    model.fit(x_train, y_train)
    print("model trained with {}".format(name))
    # Predict once per split and reuse the result for every metric below.
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    score_train = accuracy_score(y_train, pred_train)
    score_tst = accuracy_score(y_test, pred_test)
    con_train = confusion_matrix(y_train, pred_train)
    con_test = confusion_matrix(y_test, pred_test)
    print("Model accuracy on train is:: ",score_train)
    print("Model accuracy on test is:: ", score_tst)
    print("confusion_matrix train is:: ", con_train)
    print("confusion_matrix test is:: ", con_test)
    # Total is simply the test-set size; no need to add matches and mismatches back together.
    print("Wrong Predictions made:",(y_test != pred_test).sum(),'/',len(y_test))
    print("*"*60)

#### Looking at the metrics, we can see that KNeighborsClassifier has the best performance overall.

## HYPER PARAMETER TUNING WITH GRIDSEARCHCV

We will tune three hyperparameters: n_neighbors, weights and metric.

n_neighbors: Decide the best k based on the values we have computed earlier.
weights: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
metric: The distance metric to be used when calculating the similarity.

In [None]:
# Search space for the KNN grid search: odd neighbour counts from 5 to 15, both weighting
# schemes, and three distance metrics.
grid_params = dict(
    n_neighbors=list(range(5, 16, 2)),
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)

In [None]:
# Exhaustive search over `grid_params` with 3-fold cross-validation, using all available cores.
gs = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=grid_params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search on the training split; `fit` returns the search object itself,
# so `g_res` is another name for the fitted `gs`.
gs.fit(x_train, y_train)
g_res = gs

In [None]:
# find the best score
g_res.best_score_

In [None]:
# get the hyperparameters with the best score
g_res.best_params_

In [None]:
# Build the final classifier from the grid-search winner rather than hand-copied values, so this
# cell cannot drift from what the search actually found. Brute-force neighbour search is kept.
knn = KNeighborsClassifier(algorithm='brute', **g_res.best_params_)
knn.fit(x_train, y_train)

In [None]:
y_pred_knn_train = knn.predict(x_train)
y_pred_knn_train

In [None]:
y_pred_knn_test = knn.predict(x_test)
y_pred_knn_test

In [None]:
# Accuracy of the tuned KNN on both splits, using the directly imported scorer.
train_accuracy = accuracy_score(y_train, y_pred_knn_train)
test_accuracy = accuracy_score(y_test, y_pred_knn_test)
print('Training set accuracy: ', train_accuracy)
print('Test set accuracy: ', test_accuracy)

In [None]:
print(confusion_matrix(y_test, y_pred_knn_test))

In [None]:
# Count misclassified test rows from the predictions already stored in `y_pred_knn_test`
# instead of calling predict three more times; the denominator is the test-set size.
n_wrong = (y_test != y_pred_knn_test).sum()
print("Wrong Predictions made:", n_wrong, '/', len(y_test))

In [None]:
print(classification_report(y_test, y_pred_knn_test))

In [None]:
# Cross-validate scaling + KNN as one pipeline on the unscaled features, so each fold's scaler is
# fitted on that fold's training part only (scaling all rows up front leaks held-out statistics
# into every fold). cross_val_score clones the pipeline, so the fitted `knn` is left untouched.
cv_model = make_pipeline(StandardScaler(), knn)
scores = cross_val_score(cv_model, data.iloc[:, :-1].values, y, cv=5)

In [None]:
# Average of the per-fold cross-validation scores.
mean_cv_score = scores.mean()
print('Model accuracy: ', mean_cv_score)

### As we see, we have obtained a very good mean cross-validated accuracy of about 0.79. The accuracy might be increased further by tuning more hyperparameters or by using a different model.

In [None]:
y_pred_knn_test,y_test

In [None]:
## Saving the actual and predicted values to a dataframe

In [None]:
# Side-by-side table of the true labels and the KNN predictions for the test split.
# Each array is bound to its column by name, so the ground truth lands under "Actual_state"
# and the model output under "predicted_state" (a positional zip had them swapped).
PREDICTIONS = pd.DataFrame({"Actual_state": y_test, "predicted_state": y_pred_knn_test})

In [None]:
# Swap the numeric class codes for readable labels in every column of the table.
label_names = {1:"Patient", 0:"Healthy"}
PREDICTIONS = PREDICTIONS.replace(label_names)

In [None]:
PREDICTIONS.shape

In [None]:
PREDICTIONS.head(39)