In [1]:
import numpy as np
import pandas as pd
from sklearn import svm,preprocessing,metrics,model_selection

In [2]:
# Read the Parkinson's voice-measurement CSV into a pandas DataFrame
parkinsons_data = pd.read_csv(filepath_or_buffer="parkinsons_disease.csv")

In [3]:
# Preview the first five records to get a feel for the columns and value scales
first_rows = parkinsons_data.head(n=5)
print(first_rows)

             name  MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0  phon_R01_S01_1      119.992       157.302        74.997         0.00784   
1  phon_R01_S01_2      122.400       148.650       113.819         0.00968   
2  phon_R01_S01_3      116.682       131.111       111.555         0.01050   
3  phon_R01_S01_4      116.676       137.871       111.366         0.00997   
4  phon_R01_S01_5      116.014       141.781       110.655         0.01284   

   MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  ...  \
0           0.00007   0.00370   0.00554     0.01109       0.04374  ...   
1           0.00008   0.00465   0.00696     0.01394       0.06134  ...   
2           0.00009   0.00544   0.00781     0.01633       0.05233  ...   
3           0.00009   0.00502   0.00698     0.01505       0.05492  ...   
4           0.00011   0.00655   0.00908     0.01966       0.06425  ...   

   Shimmer:DDA      NHR     HNR  status      RPDE       DFA   spread1  \
0      0.0654

In [4]:
# List every column label in the dataset (for a DataFrame, keys() is the column index)
print(parkinsons_data.columns)

Index(['name', 'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2', 'PPE'],
      dtype='object')


In [5]:
# Report the dataset dimensions as (rows, columns)
n_rows, n_cols = parkinsons_data.shape
print((n_rows, n_cols))

(195, 24)


In [6]:
# Count missing values per column (all zeros means nothing needs imputing)
missing_per_column = parkinsons_data.isna().sum()
print(missing_per_column)

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = parkinsons_data.describe()
print(summary_stats)

       MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
count   195.000000    195.000000    195.000000      195.000000   
mean    154.228641    197.104918    116.324631        0.006220   
std      41.390065     91.491548     43.521413        0.004848   
min      88.333000    102.145000     65.476000        0.001680   
25%     117.572000    134.862500     84.291000        0.003460   
50%     148.790000    175.829000    104.315000        0.004940   
75%     182.769000    224.205500    140.018500        0.007365   
max     260.105000    592.030000    239.170000        0.033160   

       MDVP:Jitter(Abs)    MDVP:RAP    MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
count        195.000000  195.000000  195.000000  195.000000    195.000000   
mean           0.000044    0.003306    0.003446    0.009920      0.029709   
std            0.000035    0.002968    0.002759    0.008903      0.018857   
min            0.000007    0.000680    0.000920    0.002040      0.009540   
25%            0.000

In [8]:
# Class balance of the target column 'status' (number of rows per label)
status_counts = parkinsons_data['status'].value_counts()
print(status_counts)

1    147
0     48
Name: status, dtype: int64


In [9]:
# Per-class feature means, grouped by the target variable 'status'.
# numeric_only=True is required because the frame still contains the string
# column 'name': pandas >= 2.0 no longer silently drops non-numeric columns
# in GroupBy.mean() and raises TypeError instead. Older versions produce the
# same table as before, so the result is unchanged where the cell already ran.
print(parkinsons_data.groupby('status').mean(numeric_only=True))

        MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
status                                                            
0        181.937771    223.636750    145.207292        0.003866   
1        145.180762    188.441463    106.893558        0.006989   

        MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
status                                                                   
0               0.000023  0.001925  0.002056    0.005776      0.017615   
1               0.000051  0.003757  0.003900    0.011273      0.033658   

        MDVP:Shimmer(dB)  ...  MDVP:APQ  Shimmer:DDA       NHR        HNR  \
status                    ...                                               
0               0.162958  ...  0.013305     0.028511  0.011483  24.678750   
1               0.321204  ...  0.027600     0.053027  0.029211  20.974048   

            RPDE       DFA   spread1   spread2        D2       PPE  
status                                                  

In [10]:
# Labels (y): the 'status' column.
# Features (X): every remaining column except the 'name' identifier and the target itself.
y = parkinsons_data['status'].values
X = parkinsons_data.drop(columns=['name', 'status']).values

In [11]:
# Split into training (80%) and held-out test data; the fixed random_state
# makes the shuffle reproducible across runs.
# NOTE(review): the split is not stratified although the classes are
# imbalanced (147 vs 48) - consider stratify=y if results are re-generated.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=2,
)

In [12]:
# Sanity check: shapes of the full feature matrix, the training part and the test part
print(*(features.shape for features in (X, X_train, X_test)))

(195, 22) (156, 22) (39, 22)


In [13]:
# Standardize the features. The scaler learns its statistics from the training
# split only and is then re-used, already fitted, on the test split.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Classifier: support vector machine with a linear kernel
model = svm.SVC(kernel="linear")

In [15]:
# Fit the SVM on the standardized training data
# (fit returns the estimator, which the notebook displays as the cell output)
model.fit(X=X_train, y=y_train)

SVC(kernel='linear')

In [16]:
# Predict labels for the held-out test set and show them next to the true labels
y_actual = y_test
y_predicted = model.predict(X_test)
for caption, labels in (("y_predicted: ", y_predicted), ("y_actual: ", y_actual)):
    print(caption, labels)

y_predicted:  [1 1 1 1 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0
 1 1]
y_actual:  [1 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1 1 0
 1 1]


In [17]:
# Fraction of test samples whose predicted label equals the true label
model_accuracy = metrics.accuracy_score(y_true=y_actual, y_pred=y_predicted)
print(model_accuracy)

0.8717948717948718


In [18]:
# Predictive system: one hand-entered sample made of 22 measurements.
# NOTE(review): the values are presumably in the same column order as the
# training features (dataset columns without 'name' and 'status') - confirm.
user_data = (
    120.55200, 131.16200, 113.78700, 0.00968, 0.00008, 0.00463,
    0.00750, 0.01388, 0.04701, 0.45600, 0.02328, 0.03526,
    0.03243, 0.06985, 0.01222, 21.37800, 0.415564, 0.825069,
    -4.242867, 0.299111, 2.187560, 0.357775,
)

In [19]:
# Turn the tuple of measurements into a 1-D numpy array
user_data_nparray = np.array(user_data)

In [20]:
# Reshape to a single-row 2-D array (1 sample x n features), the layout the
# scaler and the model are called with
nparray_reshaped = np.reshape(user_data_nparray, (1, -1))

In [21]:
# Apply the already-fitted training scaler to the user sample (no re-fitting)
user_data_std_form = scaler.transform(X=nparray_reshaped)

In [22]:
# Classify the standardized user sample; the result is a one-element label array
prediction = model.predict(X=user_data_std_form)
print(prediction)

[1]


In [23]:
# Translate the predicted label into a readable verdict: label 0 prints the
# "does not have" message, any other label prints the "has" message.
if prediction[0] != 0:
    print("Person has parkinsons disease")
else:
    print("Person does not have parkinsons disease")

Person has parkinsons disease
