# Outline

This project predicts whether a person has Parkinson's disease from voice samples. It uses 15 basic voice-tremor parameters, such as jitter, shimmer, HNR, NHR, and frequency, to predict whether the speaker is suffering from Parkinson's disease.

Dataset (UCI ML Repository):

Training data:


https://drive.google.com/file/d/10rpHaDYM76itD22mjQvrksRWEM3pJbwE/view?usp=sharing

Testing data:

https://drive.google.com/file/d/1m7XSSOrH6lqX684SwYiKMyTtG7CDb-3i/view?usp=sharing


In [3]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.decomposition import PCA 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix 
from sklearn.model_selection import GridSearchCV 

In [None]:
# Read the voice-measurement table from the working directory.
df = pd.read_csv('parkinsons.csv')
df.head()

# Remove the 'name' column before any analysis, then re-inspect the frame.
df.drop(columns=['name'], inplace=True)
df.head()
df.columns



In [None]:
# Pairwise scatter plots of every column, coloured by the 'status' label.
grr = pd.plotting.scatter_matrix(
    df,
    c=df['status'],
    figsize=(15, 15),
    marker='o',
    hist_kwds={'bins': 20},
    s=60,
    alpha=0.5,
)

# Quick numeric overview of the data set.
len(df)
df.describe()
df.corr()
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
df['status'].value_counts()

# Separate features from the 'status' label and hold out a test set using
# train_test_split's default test fraction. The fixed seed makes this split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('status', axis=1),
    df['status'],
    random_state=42,
)
y_train.value_counts()

# Standardise the training features with their own per-column mean and std.
X_train_norm = (X_train - X_train.mean()) / X_train.std()

In [None]:
# NOTE(review): this cell repeats the exploration/split cell directly above it
# statement for statement; running it re-creates the train/test split.

# Scatter-matrix of all columns, with points coloured by 'status'.
grr = pd.plotting.scatter_matrix(
    df,
    c=df['status'],
    figsize=(15, 15),
    marker='o',
    hist_kwds={'bins': 20},
    s=60,
    alpha=0.5,
)

len(df)
df.describe()
df.corr()
# pd.value_counts() is deprecated; call value_counts() on the Series instead.
df['status'].value_counts()

# Features are every column except 'status'. A fixed random_state keeps the
# partition identical each time this cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('status', axis=1),
    df['status'],
    random_state=42,
)
y_train.value_counts()

# Z-score the training features using training-set statistics.
X_train_norm = (X_train - X_train.mean()) / X_train.std()

In [None]:
# Scale the test features with the *training* mean/std so both splits share
# the same transformation.
X_test_norm = (X_test - X_train.mean()) / X_train.std()

# Fit a full PCA first to inspect how the explained variance accumulates.
pca = PCA().fit(X_train_norm)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
sum(cumulative_variance <= 0.99)  # number of components at or below 99 %
cumulative_variance

# Refit keeping 7 components and project both splits onto them.
pca = PCA(n_components=7).fit(X_train_norm)
X_train_pca = pca.transform(X_train_norm)
X_test_pca = pca.transform(X_test_norm)
X_train_pca.shape
X_test_pca.shape

In [None]:
# Fit LDA on the standardised training features and labels, then project
# both splits into the discriminant space.
lda = LDA()
X_train_lda = lda.fit(X_train_norm, y_train).transform(X_train_norm)
X_test_lda = lda.transform(X_test_norm)
X_train_lda.shape
X_test_lda.shape

# Model

In [None]:
from sklearn.ensemble import RandomForestClassifier as RF 
from sklearn.linear_model import LogisticRegression as LR 
from sklearn.neighbors import KNeighborsClassifier as KNN 
from sklearn.metrics import classification_report, accuracy_score 
#from sklearn. import  

In [None]:
# One instance of each classifier, all with default hyper-parameters.
rf, lr, knn = RF(), LR(), KNN()

# Random forest on the PCA features: train, predict, and score both splits.
pred = rf.fit(X_train_pca, y_train).predict(X_test_pca)
rf.score(X_train_pca, y_train)
rf.score(X_test_pca, y_test)
print(classification_report(y_test, pred))

# Refit the same forest on the raw (unscaled, un-projected) features and
# look at the per-feature importances.
rf.fit(X_train, y_train)
rf.score(X_train, y_train)
rf.score(X_test, y_test)
rf.feature_importances_

# Logistic regression on the PCA features.
pred = lr.fit(X_train_pca, y_train).predict(X_test_pca)

In [None]:
# Scores of the logistic regression fitted on PCA features:
# training split first, then test split, one per line.
print(lr.score(X_train_pca, y_train), lr.score(X_test_pca, y_test), sep='\n')

# Logistic regression with C=0.1 on the standardised (non-PCA) features.
lr = LR(C=0.1).fit(X_train_norm, y_train)
lr.score(X_train_norm, y_train)
lr.score(X_test_norm, y_test)
lr.coef_

# k-NN on the PCA features: train score, then test score.
pred = knn.fit(X_train_pca, y_train).predict(X_test_pca)
print(knn.score(X_train_pca, y_train), knn.score(X_test_pca, y_test), sep='\n')

In [None]:
# Use the fitted LDA model directly as a classifier on the test split.
pred = lda.predict(X_test_norm)
accuracy_score(y_test, pred)

# k-NN on the raw features, scored on the raw test split.
knn.fit(X_train, y_train).score(X_test, y_test)

# Candidate neighbour counts and the containers for the scores of each one.
neig = [1, 2, 3, 4, 5, 6, 8, 10]
train_ls, test_ls = [], []

In [None]:
# Start from empty score lists on every run. They used to be created only in
# the previous cell, so re-running this cell kept appending to them and
# plt.plot then failed because the scores no longer matched len(neig).
train_ls = []
test_ls = []
for n in neig:
    # Fit a k-NN classifier with n neighbours on the PCA features and record
    # its score on the training and test splits.
    knn = KNN(n)
    knn.fit(X_train_pca, y_train)
    train_ls.append(knn.score(X_train_pca, y_train))
    test_ls.append(knn.score(X_test_pca, y_test))

# Plot both score curves against the number of neighbours.
plt.plot(neig, train_ls, label = 'Training Score')
plt.plot(neig, test_ls, label = 'Test Score')
plt.legend()

The optimal number of neighbours for this problem is 5.

In [None]:
# Final k-NN model with 5 neighbours, trained on the PCA features.
knn = KNN(n_neighbors=5).fit(X_train_pca, y_train)
pred = knn.predict(X_test_pca)
# Labels are flipped (1 - label) before reporting, so the report's classes
# are the complement of the original 0/1 'status' values.
print(classification_report(1 - y_test, 1 - pred))
knn.score(X_test_pca, y_test)
X_train.columns

# Load the separate test file and strip the non-feature columns.
tf = pd.read_csv('test.csv')
tf.head()
tf.drop(columns=['name'], inplace=True)
X_ut = tf.drop(columns=['status'])
print(X_ut)

# Apply the same preprocessing as for the training data: scale with the
# training mean/std, then project with the already-fitted PCA.
X_ut_norm = (X_ut - X_train.mean()) / X_train.std()
print(X_ut_norm)
X_ut_pca = pca.transform(X_ut_norm)
predictions = knn.predict(X_ut_pca)

# References



[1] https://en.wikipedia.org/wiki/Parkinson%27s_disease 

[2] Praat software: Paul Boersma and David Weenink, University of Amsterdam.

[3] http://www.parkinson.org/understanding-parkinsons 