In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read the Parkinson's dataset CSV (path is relative to the notebook's working
# directory) and preview its first rows.
csv_path = "Datasets/parkinsons - dataset.csv"
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [3]:
# Show the dataset dimensions as (rows, columns).
print(dataset.shape)

(195, 24)


In [4]:
# Summarise every column: its dtype and how many non-null entries it holds.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              195 non-null    object 
 1   MDVP:Fo(Hz)       195 non-null    float64
 2   MDVP:Fhi(Hz)      195 non-null    float64
 3   MDVP:Flo(Hz)      195 non-null    float64
 4   MDVP:Jitter(%)    195 non-null    float64
 5   MDVP:Jitter(Abs)  195 non-null    float64
 6   MDVP:RAP          195 non-null    float64
 7   MDVP:PPQ          195 non-null    float64
 8   Jitter:DDP        195 non-null    float64
 9   MDVP:Shimmer      195 non-null    float64
 10  MDVP:Shimmer(dB)  195 non-null    float64
 11  Shimmer:APQ3      195 non-null    float64
 12  Shimmer:APQ5      195 non-null    float64
 13  MDVP:APQ          195 non-null    float64
 14  Shimmer:DDA       195 non-null    float64
 15  NHR               195 non-null    float64
 16  HNR               195 non-null    float64
 1

In [5]:
# Count missing values per column; the output shows zero nulls in every column,
# so no imputation is needed.
# Per the original author, the dataset can be obtained from the UCI dataset website.
dataset.isnull().sum()

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,...,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,154.228641,197.104918,116.324631,0.00622,4.4e-05,0.003306,0.003446,0.00992,0.029709,0.282251,...,0.046993,0.024847,21.885974,0.753846,0.498536,0.718099,-5.684397,0.22651,2.381826,0.206552
std,41.390065,91.491548,43.521413,0.004848,3.5e-05,0.002968,0.002759,0.008903,0.018857,0.194877,...,0.030459,0.040418,4.425764,0.431878,0.103942,0.055336,1.090208,0.083406,0.382799,0.090119
min,88.333,102.145,65.476,0.00168,7e-06,0.00068,0.00092,0.00204,0.00954,0.085,...,0.01364,0.00065,8.441,0.0,0.25657,0.574282,-7.964984,0.006274,1.423287,0.044539
25%,117.572,134.8625,84.291,0.00346,2e-05,0.00166,0.00186,0.004985,0.016505,0.1485,...,0.024735,0.005925,19.198,1.0,0.421306,0.674758,-6.450096,0.174351,2.099125,0.137451
50%,148.79,175.829,104.315,0.00494,3e-05,0.0025,0.00269,0.00749,0.02297,0.221,...,0.03836,0.01166,22.085,1.0,0.495954,0.722254,-5.720868,0.218885,2.361532,0.194052
75%,182.769,224.2055,140.0185,0.007365,6e-05,0.003835,0.003955,0.011505,0.037885,0.35,...,0.060795,0.02564,25.0755,1.0,0.587562,0.761881,-5.046192,0.279234,2.636456,0.25298
max,260.105,592.03,239.17,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,...,0.16942,0.31482,33.047,1.0,0.685151,0.825288,-2.434031,0.450493,3.671155,0.527367


In [7]:
# Class balance of the target: how many rows carry each `status` value.
# (Bracket access is equivalent to the attribute form `dataset.status`.)
dataset['status'].value_counts()

1    147
0     48
Name: status, dtype: int64

In [8]:
# Per-class mean of every numeric feature, grouped by the patient's status.
# numeric_only=True keeps the non-numeric 'name' column out of the aggregation;
# without it, pandas >= 2.0 raises a TypeError when it tries to average strings.
dataset.groupby('status').mean(numeric_only=True)

Unnamed: 0_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,181.937771,223.63675,145.207292,0.003866,2.3e-05,0.001925,0.002056,0.005776,0.017615,0.162958,...,0.013305,0.028511,0.011483,24.67875,0.442552,0.695716,-6.759264,0.160292,2.154491,0.123017
1,145.180762,188.441463,106.893558,0.006989,5.1e-05,0.003757,0.0039,0.011273,0.033658,0.321204,...,0.0276,0.053027,0.029211,20.974048,0.516816,0.725408,-5.33342,0.248133,2.456058,0.233828


In [19]:
# `status` indicates whether or not a person is suffering from Parkinson's disease.
# Therefore `status` is our dependent variable (y), and the remaining columns, excluding `name`, are our independent variables (x).

In [20]:
# Split the data into the dependent variable (y) and the independent variables (x).
# 'name' is dropped together with the target, so x keeps only the remaining columns.
y = dataset['status']
x = dataset.drop(columns=['name', 'status'])

In [21]:
# Preview the first rows of the independent variables.
x.head()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,...,0.02971,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,...,0.04368,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,...,0.0359,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,...,0.03772,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,...,0.04465,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [23]:
# x has 22 columns, so there are 22 independent variables.
print(x.shape)

(195, 22)


In [24]:
# Preview the first values of the dependent variable (status).
y.head()

0    1
1    1
2    1
3    1
4    1
Name: status, dtype: int64

In [25]:
# y is one-dimensional: there is a single dependent variable.
print(y.shape)

(195,)


In [27]:
# Next we split the dataset into training (90%) and test (10%) data.
# random_state is fixed so that the same split is produced on every run.
split_parts = train_test_split(x, y, test_size=0.1, random_state=2)
x_train, x_test, y_train, y_test = split_parts

# Shapes in order: x_train, x_test, y_train, y_test.
print(*(part.shape for part in split_parts))

(175, 22) (20, 22) (175,) (20,)


In [31]:
# Standardize the features. The scaler's statistics are learned from the training
# split only and then reused for the test split, so no test information leaks in.
# NOTE: x_train and x_test are rebound here; from this point on both names hold
# already-standardized arrays, not the raw feature frames.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [33]:
# First five rows of the standardized training data; after scaling, each training
# column has mean 0 and standard deviation 1.
print(x_train[:5])

[[-9.58291977e-02 -4.57610670e-01  3.50612305e-01 -9.14329703e-01
  -9.59736059e-01 -8.58707217e-01 -8.96939955e-01 -8.58748799e-01
  -1.03450874e+00 -9.70834154e-01 -1.09624829e+00 -1.01513780e+00
  -8.32671832e-01 -1.09657499e+00 -5.59036444e-01  1.81113088e+00
  -1.75079623e+00 -7.85885403e-01 -9.66847658e-01 -7.52513889e-01
  -2.23679545e-01 -1.00898164e+00]
 [ 1.13640163e+00  3.67672636e-01 -6.96108625e-01 -5.23755966e-01
  -6.77555598e-01 -4.87484001e-01 -4.90717149e-01 -4.86446309e-01
  -5.27701546e-01 -5.34637790e-01 -4.55185215e-01 -5.19873398e-01
  -5.38275512e-01 -4.55507475e-01 -1.39789868e-01 -6.88394788e-01
   3.78185771e-01 -1.58573956e+00 -1.94681881e-01 -1.18145065e-01
   2.20486816e-01 -1.92068985e-01]
 [-8.69635042e-01 -7.99289463e-01 -2.59797177e-01 -7.23028280e-01
  -6.77555598e-01 -7.25197113e-01 -7.59177438e-01 -7.24155479e-01
  -1.03555048e+00 -9.85875408e-01 -1.07591245e+00 -1.00524899e+00
  -8.84320309e-01 -1.07591622e+00 -5.00124256e-01  1.01975895e+00
  -4.5

In [34]:
# Create the classifier. This is Logistic Regression (a classification model),
# not linear regression.
model = LogisticRegression()
# Fit the model on the standardized training features and their labels.
# The fitted estimator is the cell's last expression, so it is displayed as output.
model.fit(x_train, y_train)

LogisticRegression()

In [35]:
# Accuracy on the data the model was trained on (seen data).
y_predict_train = model.predict(x_train)
y_predict_train_accuracy = accuracy_score(
    y_true=y_train,
    y_pred=y_predict_train,
)
print('Accuracy on training data: ', y_predict_train_accuracy)

Accuracy on training data:  0.8628571428571429


In [36]:
# Accuracy on the held-out test split (unseen data). The test split holds only
# 20 rows, so each misclassified row moves this score by 5 percentage points.
y_predict_test = model.predict(x_test)
y_predict_test_accuracy = accuracy_score(
    y_true=y_test,
    y_pred=y_predict_test,
)
print('Accuracy on testing data: ', y_predict_test_accuracy)

Accuracy on testing data:  0.85


In [61]:
# Predict the status for a single sample taken from the test split.
#
# x_test was already standardized with `scaler` when the train/test features
# were scaled, so this row must NOT be passed through scaler.transform again:
# standardizing twice would feed the model wrongly scaled features.
input_data = x_test[1:2]                                     # second test row, kept 2-D
input_data_array = np.asarray(input_data)                    # ensure a NumPy array
input_data_reshaped = input_data_array.reshape(1, -1)        # one row; -1 lets NumPy infer the column count
standardized_input_data = input_data_reshaped                # already standardized (see note above)

prediction = model.predict(standardized_input_data)
print(prediction)
if prediction[0] == 0:
    print('The person does not have Parkinsons Disease.')
else:
    print('The person has Parkinsons Disease.')

[1]
The person has Parkinsons Disease.


In [62]:
# True label of the same test row (position 1) that was used for the prediction.
# .iloc makes it explicit that the slice is positional, not label-based.
print(y_test.iloc[1:2])

79    1
Name: status, dtype: int64
