In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the speech-feature table (path is relative to the working directory)
# and preview its first rows.
csv_path = 'pd_speech_features.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
df.shape

In [None]:
df.dtypes


In [None]:
df.info()

In [None]:
df.isnull().sum()/len(df)

In [None]:
df.isnull().sum().sum()

In [None]:
sns.countplot(x="class", data=df)   # we can see that there imbalance in data so model cannot train properly we can do downsampling 

In [None]:
sns.countplot(x="gender", data=df)

In [None]:
pd.crosstab(df["class"], df["gender"])

In [None]:
from sklearn.utils import resample

# Partition the rows by label: class 1 is the majority, class 0 the minority.
is_majority = df['class'].eq(1)
is_minority = df['class'].eq(0)
df_majority = df.loc[is_majority]
df_minority = df.loc[is_minority]
df_minority.shape

In [None]:
# Draw, without replacement, as many majority-class rows as there are
# minority-class rows; the fixed seed keeps the sample repeatable.
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=1234,
)

# Stack the downsampled majority rows on top of the minority rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement and,
# like append's default, keeps each row's original index label.
df = pd.concat([df_majority_downsampled, df_minority])
df.head()

In [None]:
# Slice label for each class code.
# NOTE(review): assumes class 0 = no Parkinson's and class 1 = Parkinson's —
# confirm against the dataset description.
class_labels = {0: "Does not have Parkinson's", 1: "Does have Parkinson's"}
condition = list(class_labels.values())

# value_counts() orders its rows by frequency, not by class code, so pairing
# it positionally with `condition` can attach a label to the wrong class.
# Reindexing by class code pins the counts to the same order as `condition`.
class_counts = df["class"].value_counts().reindex(list(class_labels), fill_value=0)
have_or_not = class_counts.tolist()
values = [have_or_not[0], have_or_not[1]]

fig = px.pie(values = values, names = condition, width = 800, height = 400, color_discrete_sequence = ["skyblue", "violet"], title = "Percentage whether patient has Parkinson's or not")
fig.show()

In [None]:
sns.distplot(df["numPulses"], color = "orange");

In [None]:
sns.distplot(df["meanPeriodPulses"], color = "indigo")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC 
from sklearn.ensemble import BaggingClassifier

In [None]:
x=df.drop('class',axis=1)
y=df['class']

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.30,random_state=42)

In [None]:
ss=StandardScaler()
ss.fit(xtrain)

In [None]:
xtrain=ss.transform(xtrain)
xtest=ss.transform(xtest)

In [None]:
lv=LinearSVC()

In [None]:
lv.fit(xtrain,ytrain)

In [None]:
lv.score(xtrain,ytrain)*100

In [None]:
pred_lv=lv.predict(xtrain)

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,cohen_kappa_score

In [None]:
print('accuracy score:',accuracy_score(ytrain,pred_lv))
print('classification_report:\n' , classification_report(ytrain,pred_lv))

In [None]:
pred_lv_y=lv.predict(xtest)

In [None]:
print('accuracy score:',accuracy_score(ytest,pred_lv_y))
print('classification_report:\n' , classification_report(ytest,pred_lv_y))

In [None]:
LR=LogisticRegression()
LR.fit(xtrain,ytrain)

In [None]:
lr_pred=LR.predict(xtrain)

In [None]:
pred_lr_y=LR.predict(xtest)

In [None]:
print('accuracy score:',accuracy_score(ytrain,lr_pred))
print('classification_report:\n' , classification_report(ytrain,lr_pred))

In [None]:
print('accuracy score:',accuracy_score(ytest,pred_lr_y))
print('classification_report:\n' , classification_report(ytest,pred_lr_y))

In [None]:
RF=RandomForestClassifier()

In [None]:
RF.fit(xtrain,ytrain)

In [None]:
rf_pred=RF.predict(xtrain)

In [None]:
rf_pred_y=RF.predict(xtest)

In [None]:
print('accuracy score:',accuracy_score(ytrain,rf_pred))
print('classification_report:\n' , classification_report(ytrain,rf_pred))

In [None]:
print('accuracy score:',accuracy_score(ytest,rf_pred_y))
print('classification_report:\n' , classification_report(ytest,rf_pred_y))

# Hyperparameter tuning

In [None]:
np.random.seed(42)

RF = RandomForestClassifier(n_estimators = 100)
RF.fit(xtrain,ytrain)

RF.score(xtest, ytest)

In [None]:
rfh_pred=RF.predict(xtrain)

In [None]:
rfh_pred_y=RF.predict(xtest)

In [None]:
print('accuracy score:',accuracy_score(ytest,rfh_pred_y))
print('classification_report:\n' , classification_report(ytest,rfh_pred_y))

In [None]:
BG=BaggingClassifier()
BG.fit(xtrain,ytrain)

In [None]:
bg_pred=BG.predict(xtrain)

In [None]:
bg_pred_y=BG.predict(xtest)

In [None]:
print('accuracy score:',accuracy_score(ytrain,bg_pred))
print('classification_report:\n' , classification_report(ytrain,bg_pred))

In [None]:
print('accuracy score:',accuracy_score(ytest,bg_pred_y))
print('classification_report:\n' , classification_report(ytest,bg_pred_y))

In [None]:
np.random.seed(42)

bgh = BaggingClassifier()
bgh.fit(xtrain, ytrain)


In [None]:
bgh_pred=BG.predict(xtrain)

In [None]:
bgh_pred_y=BG.predict(xtest)

In [None]:
print('accuracy score:',accuracy_score(ytrain,bgh_pred))
print('classification_report:\n' , classification_report(ytrain,bgh_pred))

In [None]:
print('accuracy score:',accuracy_score(ytest,bgh_pred_y))
print('classification_report:\n' , classification_report(ytest,bgh_pred_y))