# Parkinson's Disease Classification using Support vector machine
Dataset Source: UCI ML Dataset
> Dataset Link: *https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons*

> Source of the Data: *https://rdcu.be/cqVH5*


## Importing Frameworks and Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle

## Loading the data from the directory

In [2]:
# Read the Parkinson's voice-measurement CSV into a pandas DataFrame.
# NOTE(review): the path is absolute (looks like a Colab layout) — adjust it
# when running the notebook anywhere else.
parkinsons_data = pd.read_csv('/content/data/parkinsons.csv')

## Getting an early insight over the data

In [3]:
# Preview the first five rows to see the column layout and value ranges.
parkinsons_data.head(n=5)

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,0.03134,0.04518,0.04368,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,0.0359,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,0.02924,0.04005,0.03772,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,0.0349,0.04825,0.04465,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


### Checking for null values

In [4]:
parkinsons_data.info() # per-column dtype and non-null count

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

### No. of Rows and Columns

In [5]:
parkinsons_data.shape # (number of rows, number of columns)

(195, 24)

### Checking for missing values

In [6]:
parkinsons_data.isnull().sum() # number of missing values in each column

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

### Checking Data Status Ratio for our Classification Problem
> 0 means NEGATIVE

> 1 means POSITIVE

In [7]:
# Number of rows per class of the 'status' target (absolute counts, not percentages)
# 1 = Positive | 0 = Negative
parkinsons_data['status'].value_counts()

1    147
0     48
Name: status, dtype: int64

In [8]:
# Per-class mean of every numeric feature, to see how the two 'status' groups differ.
# numeric_only=True skips the text column 'name'; without it, pandas >= 2.0
# raises a TypeError when it tries to average strings.
parkinsons_data.groupby('status').mean(numeric_only=True)

Unnamed: 0_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,181.937771,223.63675,145.207292,0.003866,2.3e-05,0.001925,0.002056,0.005776,0.017615,0.162958,0.009504,0.010509,0.013305,0.028511,0.011483,24.67875,0.442552,0.695716,-6.759264,0.160292,2.154491,0.123017
1,145.180762,188.441463,106.893558,0.006989,5.1e-05,0.003757,0.0039,0.011273,0.033658,0.321204,0.017676,0.020285,0.0276,0.053027,0.029211,20.974048,0.516816,0.725408,-5.33342,0.248133,2.456058,0.233828


## Data Preprocessing

> Removed: the `name` column (recording ID)

> Separating Features and Status


In [9]:
# Target: the 'status' label column.
Y = parkinsons_data['status']
# Features: everything except the identifier column 'name' and the target itself.
X = parkinsons_data.drop(columns=['name', 'status'])

print("\nX Variable Status\n")
print(X)
print("\nY Variable Status\n")
print(Y)


X Variable Status

     MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  ...   spread2        D2       PPE
0        119.992       157.302        74.997  ...  0.266482  2.301442  0.284654
1        122.400       148.650       113.819  ...  0.335590  2.486855  0.368674
2        116.682       131.111       111.555  ...  0.311173  2.342259  0.332634
3        116.676       137.871       111.366  ...  0.334147  2.405554  0.368975
4        116.014       141.781       110.655  ...  0.234513  2.332180  0.410335
..           ...           ...           ...  ...       ...       ...       ...
190      174.188       230.978        94.261  ...  0.121952  2.657476  0.133050
191      209.516       253.017        89.488  ...  0.129303  2.784312  0.168895
192      174.688       240.005        74.287  ...  0.158453  2.679772  0.131728
193      198.764       396.961        74.904  ...  0.207454  2.138608  0.123306
194      214.289       260.277        77.973  ...  0.190667  2.555477  0.148569

[195 rows x 22 colu

## Data Splitting | Training Data & Test Data

In [10]:
# Hold out 20% of the rows as test data and train on the remaining 80%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (147 vs 48 in the value_counts output above) — consider stratify=Y; doing so
# would change the scores reported below.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [11]:
# Sanity check: shapes of the full, training and test feature sets, in that order.
print(*(features.shape for features in (X, X_train, X_test)))

(195, 22) (156, 22) (39, 22)


In [12]:
# Sanity check: shapes of the full, training and test label sets, in that order.
print(*(labels.shape for labels in (Y, Y_train, Y_test)))

(195,) (156,) (39,)


## Standardizing the Data

In [13]:
# Standardizer with its default settings written out explicitly:
# centre each feature on its mean and scale it to unit variance.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
# Learn the scaling statistics from the training split only, so nothing from
# the test split influences the transformation.
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [15]:
# Replace X_train with its standardized version.
# NOTE: this overwrites the unscaled data, so re-running this cell on its own
# would scale the already-scaled values a second time.
X_train = scaler.transform(X_train)

In [16]:
# Apply the scaler fitted on the training split to the test split.
# NOTE: X_test is overwritten too — do not re-run this cell in isolation.
X_test = scaler.transform(X_test)

In [17]:
# Inspect the standardized training features.
print(X_train)

[[ 0.63239631 -0.02731081 -0.87985049 ... -0.97586547 -0.55160318
   0.07769494]
 [-1.05512719 -0.83337041 -0.9284778  ...  0.3981808  -0.61014073
   0.39291782]
 [ 0.02996187 -0.29531068 -1.12211107 ... -0.43937044 -0.62849605
  -0.50948408]
 ...
 [-0.9096785  -0.6637302  -0.160638   ...  1.22001022 -0.47404629
  -0.2159482 ]
 [-0.35977689  0.19731822 -0.79063679 ... -0.17896029 -0.47272835
   0.28181221]
 [ 1.01957066  0.19922317 -0.61914972 ... -0.716232    1.23632066
  -0.05829386]]


## Training the Model using SVM

In [18]:
# Support vector classifier with a linear kernel; every other hyperparameter
# is left at the library default.
svm_model = svm.SVC(kernel='linear')

In [19]:
# Train the classifier on the standardized training features and their labels.
svm_model.fit(X_train, Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Model Evaluation

### Train Data Accuracy

In [20]:
# Predict on the data the model was trained on and report accuracy as a percentage.
X_train_predict = svm_model.predict(X_train)
accuracy_train = 100 * accuracy_score(y_true=Y_train, y_pred=X_train_predict)
print(accuracy_train)

88.46153846153845


### Test Data Accuracy

In [21]:
# Predict on the held-out test split and report accuracy as a percentage.
X_test_predict = svm_model.predict(X_test)
accuracy_test = 100 * accuracy_score(y_true=Y_test, y_pred=X_test_predict)
print(accuracy_test)

87.17948717948718


## Saving the Model with Pickle

In [22]:
# Persist the trained classifier so it can be reloaded later without retraining.
# The path gets its own name so that `parkinsons_classifier_model` is not a
# string here and a model object later in the notebook.
parkinsons_classifier_path = "/content/data/parkinsons_classifier_model.pickle"
with open(parkinsons_classifier_path, 'wb') as file:
    pickle.dump(svm_model, file)

# The classifier was trained on standardized features, so it is only usable
# together with the fitted scaler. Store the scaler next to the model so a
# fresh session can transform raw inputs the same way.
parkinsons_scaler_path = "/content/data/parkinsons_scaler.pickle"
with open(parkinsons_scaler_path, 'wb') as file:
    pickle.dump(scaler, file)

## Outcomes: Data Prediction Examples

### **Example: 1**
> Known to us that Person **is affected**.

In [23]:
# Feed in one recording taken from the dataset whose known 'status' is 1 (affected).
# The values must be in the same column order the scaler was fitted on.
input_data = [120.26700,137.24400,114.82000,0.00333,0.00003,0.00155,0.00202,0.00466,0.01608,0.14000,0.00779,0.00937,0.01351,0.02337,0.00607,24.88600,0.596040,0.764112,-5.634322,0.257682,1.854785,0.211756]
input_data_numpy = np.asanyarray(input_data)
input_data_reshape = input_data_numpy.reshape(1, -1) # one sample -> 2-D array with a single row
standard_data = scaler.transform(input_data_reshape) # same scaling as the training data

# Load the pickled model. The file is opened in a `with` block so the handle is
# closed deterministically instead of being left to the garbage collector.
# NOTE: pickle.load executes code from the file — only load pickles you trust
# (this one was written by this notebook).
with open('/content/data/parkinsons_classifier_model.pickle', 'rb') as model_file:
    parkinsons_classifier_model = pickle.load(model_file)

# Confirm the reloaded model scores the same on the test split as the in-memory one.
score = parkinsons_classifier_model.score(X_test, Y_test)
print("Test score: {0:.2f} %".format(100 * score))

predict = parkinsons_classifier_model.predict(standard_data)
print(predict)

# Class 0 is the negative label; anything else is reported as affected.
if (predict[0] == 0):
  print("Person is NOT affected.")

else:
  print("Person is affected.")


Test score: 87.18 %
[1]
Person is affected.


### **Example: 2**
> Known to us that Person **isn't affected**.


In [24]:
# One recording's feature values; they must be in the same column order the
# scaler was fitted on.
input_data = [197.07600,206.89600,192.05500,0.00289,0.00001,0.00166,0.00168,0.00498,0.01098,0.09700,0.00563,0.00680,0.00802,0.01689,0.00339,26.77500,0.422229,0.741367,-7.348300,0.177551,1.743867,0.085569]
input_data_numpy = np.asanyarray(input_data)
# Add a leading axis so the single sample becomes a one-row 2-D array.
input_data_reshape = input_data_numpy[np.newaxis, :]
standard_data = scaler.transform(input_data_reshape)

# Reuses `parkinsons_classifier_model` as loaded from the pickle file in the
# Example 1 cell, so that cell has to be run before this one.
predict = parkinsons_classifier_model.predict(standard_data)
print(predict)

# Class 0 is the negative label; anything else is reported as affected.
verdict = "Person is NOT affected." if predict[0] == 0 else "Person is affected."
print(verdict)

[0]
Person is NOT affected.


### **Example: 3** 
> Final Check.

> Known to us that Person **isn't affected**.



In [25]:
# One recording's feature values; they must be in the same column order the
# scaler was fitted on.
input_data = [128.00100,138.05200,122.08000,0.00436,0.00003,0.00137,0.00166,0.00411,0.02297,0.21000,0.01323,0.01072,0.01677,0.03969,0.00481,24.69200,0.459766,0.766204,-7.072419,0.220434,1.972297,0.119308]
input_data_numpy = np.asanyarray(input_data)
# Add a leading axis so the single sample becomes a one-row 2-D array.
input_data_reshape = input_data_numpy[np.newaxis, :]
standard_data = scaler.transform(input_data_reshape)

# Reuses `parkinsons_classifier_model` as loaded from the pickle file in the
# Example 1 cell, so that cell has to be run before this one.
predict = parkinsons_classifier_model.predict(standard_data)
print(predict)

# Class 0 is the negative label; anything else is reported as affected.
verdict = "Person is NOT affected." if predict[0] == 0 else "Person is affected."
print(verdict)

[0]
Person is NOT affected.
