In [None]:
# import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow
%matplotlib inline
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides any convergence or deprecation warnings raised by
# the estimators fitted further down - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
#to split the train test data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

 # Exploratory Data Analysis

In [None]:
# Read the speech-feature table: the first row of the file is skipped and the
# first column is used as the row index. Then preview the first 20 rows.
csv_path = '../input/parkinsons-disease-classification/pd_speech_features.csv'
df = pd.read_csv(csv_path, skiprows=1, index_col=0, delimiter=',')
df.head(20)

In [None]:
# Keep only the first occurrence of any repeated column label.
df = df.loc[:, ~df.columns.duplicated()]
# Shuffle the rows and renumber them 0..n-1. The shuffle is seeded so that
# Restart & Run All reproduces the same row order (and therefore the same
# splits and scores) on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)



In [None]:
# Print the frame summary (index range, column dtypes, memory usage).
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

 # Data Visualization and Exploratory Data Analysis


In [None]:
# Heatmap of the null mask: a quick visual check for missing values.
import seaborn as sns

null_mask = df.isnull()
_, null_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False, ax=null_ax)

There are no null values in this dataset.

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()


# Target Class Distribution

In [None]:
# Bar chart of the number of rows in each target class.
_, count_ax = plt.subplots(figsize=(10, 10))
sns.countplot(x='class', data=df, palette='plasma', ax=count_ax)


In [None]:
# Pie chart of the class shares, followed by the raw per-class counts.
sns.set(rc={'figure.figsize': (12, 10)})

class_counts = df['class'].value_counts()
pie_ax = class_counts.plot.pie(autopct='%1.1f%%', textprops={'fontsize': 12})
pie_ax.set_title("class distribution")
class_counts


In [None]:
# Report how many rows are exact duplicates of an earlier row, in absolute
# terms and as a share of the whole frame.
n_duplicates = df.duplicated().sum()
print(f'Duplicates in the dataset: {n_duplicates}')
print(f'Percentage of duplicates: {n_duplicates/len(df)*100}%')


In [None]:
# #get the correlation
# plt.figure(figsize=(12,10))
# cor = df.corr()
# sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
# plt.show()

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# Plot the per-class distribution of a single feature.

def corr_map(feature, size=(10, 7.0), binwidth=1, data=None):
  """Draw a histogram with a KDE overlay of one feature, coloured by `class`.

  Despite its name, this helper does not compute a correlation; it shows how
  the values of `feature` are distributed within each target class.

  Parameters
  ----------
  feature : str
      Name of the column to plot on the x axis.
  size : tuple of float, default (10, 7.0)
      Figure size in inches, passed to `plt.figure`.
  binwidth : float or None, default 1
      Width of the histogram bins. The default reproduces the original
      behaviour; pass a smaller value (or None to let seaborn choose) for
      features whose whole range is narrower than 1, which would otherwise
      collapse into a single bin.
  data : DataFrame or None, default None
      Frame to plot from. When omitted, the notebook-level `df` is used.

  Notes
  -----
  `sns.set_context` changes the global seaborn context, so later plots in
  the session are affected as well.
  """
  frame = df if data is None else data

  # Figure size
  plt.figure(figsize=size)
  sns.set_context('poster', font_scale=1)

  # Histogram, one hue per class
  sns.histplot(data=frame, x=feature, hue='class', binwidth=binwidth, kde=True)

  # Aesthetics
  plt.title(f'{feature} distribution')
  plt.xlabel(f'{feature} Value')


In [None]:
# Per-class distribution of tqwt_kurtosisValue_dec_28, drawn on a wide (35 x 10 inch) figure.
corr_map('tqwt_kurtosisValue_dec_28', (35, 10))


In [None]:
# Per-class distribution of numPulses.
corr_map('numPulses')


In [None]:
# Per-class distribution of PPE.
corr_map('PPE')


In [None]:
# Per-class distribution of DFA.
corr_map('DFA')


In [None]:
# Per-class distribution of RPDE.
corr_map('RPDE')


In [None]:
# One box plot per feature (first 49 columns), split by target class,
# laid out on a 10 x 5 grid.
plt.figure(figsize=(60, 60))
for position, column in enumerate(df.columns[:49], start=1):
    plt.subplot(10, 5, position)
    plt.title(f"Boxplot of {column}", fontsize=16)
    plt.yticks(fontsize=12)
    plt.xticks(fontsize=12)
    sns.boxplot(y=df[column], x=df['class'])
plt.show()


# Data Splitting

In [None]:
# Separate the label column from the feature matrix.
dataY = df['class']
dataX = df.drop(columns='class')


In [None]:
# Hold out 15% of the rows for testing. Stratifying on the label keeps the
# class proportions of the train and test parts equal to those of the full
# data set, consistent with split_data() used later in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    dataX, dataY, test_size=0.15, random_state=42, stratify=dataY
)


In [None]:
# Sanity-check the dimensions of the four parts of the split.
for label, part in (('X_train', X_train), ('X_test', X_test),
                    ('y_train', y_train), ('y_test', y_test)):
    print(label, part.shape)


In [None]:
# Pretty-print a fractional score as a percentage.
def score_format(model):
    """Print `model` (a score given as a fraction, e.g. 0.87) as a percentage.

    The value is multiplied by 100 and rounded to two decimals before printing.
    Despite the parameter name, the argument is a number, not an estimator.
    """
    percent = round(model * 100, 2)
    print(f'Accuracy: {percent} %')

# Baseline Models
# Use simple models on all features (no feature removal) to establish a baseline accuracy.


# K-Nearest Neighbors

In [None]:
# Baseline k-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier().fit(X_train, y_train)
print('K-Nearest Neighbors')
knn_test_accuracy = knn.score(X_test, y_test)
score_format(knn_test_accuracy)


# Random Forest


In [None]:
# Baseline random forest with default hyper-parameters.
# random_state is pinned so the reported accuracy is the same on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
print('Random Forest')
score_format(rf.score(X_test, y_test))


# Gradient Boosting

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Baseline gradient-boosting classifier with default hyper-parameters.
# random_state is pinned so the reported accuracy is the same on every run.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
print('*Gradient Boosting')
score_format(gb.score(X_test, y_test))


In [None]:
# Number of feature columns in the test matrix.
n_test_rows, dims = X_test.shape
print(dims, 'dims')


In [None]:
# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both parts, so the test rows do not influence the statistics.
# Note: X_train / X_test are overwritten, so this cell should run once per split.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)


In [None]:
def plot_roc_(false_positive_rate,true_positive_rate,roc_auc):
    """Plot a ROC curve on a new 5 x 5 inch figure and show it.

    The curve is drawn in red and labelled with `roc_auc` (two decimals);
    a dashed diagonal from (0, 0) to (1, 1) is added as the chance line.
    """
    _, roc_ax = plt.subplots(figsize=(5, 5))
    roc_ax.set_title('Receiver Operating Characteristic')
    roc_ax.plot(false_positive_rate, true_positive_rate,
                color='red', label='AUC = %0.2f' % roc_auc)
    roc_ax.legend(loc='lower right')
    # Reference line for a classifier that guesses at random.
    roc_ax.plot([0, 1], [0, 1], linestyle='--')
    roc_ax.axis('tight')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_xlabel('False Positive Rate')
    plt.show()


In [None]:
from sklearn.neighbors  import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_curve, auc


# Part 1: ML Algorithms with PCA

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto 50 principal components. The
# projection is learned from the training rows only and then applied to the
# test rows. random_state is fixed because PCA may select a randomized SVD
# solver, which would otherwise give slightly different components per run.
# Note: X_train / X_test are overwritten; re-running this cell applies PCA twice.
pca = PCA(n_components=50, random_state=42)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)


In [None]:
# Confirm the feature matrices now have the reduced number of columns.
for label, part in (('X_train', X_train), ('X_test', X_test),
                    ('y_train', y_train), ('y_test', y_test)):
    print(label, part.shape)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score,accuracy_score

# L2-regularised logistic regression on the PCA-reduced features.
lr=LogisticRegression(C=0.1,penalty='l2',random_state=42)
lr.fit(X_train,y_train)

y_pred=lr.predict(X_test)
y_proba=lr.predict_proba(X_test)

# ROC curve built from the predicted probability in column 1 of predict_proba.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y_proba[:,1])
roc_auc = auc(false_positive_rate, true_positive_rate)
plot_roc_(false_positive_rate,true_positive_rate,roc_auc)

cm=confusion_matrix(y_test,y_pred)
print(cm)
# fmt='d' annotates the cells with plain integers; the default format ('.2g')
# would render counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cm,annot=True,fmt='d')
plt.show()
# (disabled) error-rate printout based on r2_score(y_test, y_pred)
print('Accurancy Oranı :',accuracy_score(y_test, y_pred))
# Train vs. test accuracy, to spot over-fitting.
print("Logistic TRAIN score with ",format(lr.score(X_train, y_train)))
print("Logistic TEST score with ",format(lr.score(X_test, y_test)))
print()


In [None]:
from sklearn.metrics import r2_score,accuracy_score

# k-nearest-neighbours (k=22) on the PCA-reduced features.
knn=KNeighborsClassifier(n_jobs=2, n_neighbors=22)
knn.fit(X_train,y_train)

y_pred=knn.predict(X_test)
y_proba=knn.predict_proba(X_test)

# ROC curve built from the predicted probability in column 1 of predict_proba.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y_proba[:,1])
roc_auc = auc(false_positive_rate, true_positive_rate)
plot_roc_(false_positive_rate,true_positive_rate,roc_auc)

cm=confusion_matrix(y_test,y_pred)
print(cm)
# fmt='d' annotates the cells with plain integers; the default format ('.2g')
# would render counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cm,annot=True,fmt='d')
plt.show()
print('Accurancy Oranı :',accuracy_score(y_test, y_pred))
# Train vs. test accuracy, to spot over-fitting.
print("KNN TRAIN score with ",format(knn.score(X_train, y_train)))
print("KNN TEST score with ",format(knn.score(X_test, y_test)))
print()



# Part 2: ML Algorithms without PCA


In [None]:
# Start again from the original features (no PCA) with a fresh 85/15 split.
# Stratifying on the label keeps the class proportions of the train and test
# parts equal to those of the full data set, consistent with split_data().
X_train, X_test, y_train, y_test = train_test_split(
    dataX, dataY, test_size=0.15, random_state=42, stratify=dataY
)


In [None]:
# Standardise the re-split features; the statistics come from the training rows only.
# Note: X_train / X_test are overwritten, so this cell should run once per split.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)


In [None]:
# Class-weighted, L2-penalised logistic regression fitted on the standardised
# training features (all columns, no PCA).
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)
classifier.fit(X_train, y_train)


In [None]:
from sklearn.metrics import confusion_matrix

# Predict the held-out rows and tabulate true labels against predictions.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("confusion_matrix", cm)


In [None]:
from sklearn.metrics import classification_report,accuracy_score

# Per-class precision/recall/F1, overall accuracy in percent, then the raw predictions.
report = classification_report(y_test, y_pred)
accuracy_pct = accuracy_score(y_test, y_pred)*100
print(report)
print("Accuracy:", accuracy_pct)

print(y_pred)


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


In [None]:
# Display names and estimators, kept in matching order so they can be zipped.
names = ["MLP-Neural Net", "Naive Bayes", "QDA"]

classifiers = [
    # The MLP starts from randomly initialised weights; pinning the seed makes
    # its cross-validation scores repeatable between runs.
    MLPClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation scores on the training part, keyed by classifier name.
results = {
    name: cross_val_score(clf, X_train, y_train, cv=5)
    for name, clf in zip(names, classifiers)
}


In [None]:
# One line per classifier: mean accuracy and a +/- two-standard-deviation band, in percent.
for name, scores in results.items():
    mean_pct = 100*scores.mean()
    spread_pct = 100*scores.std() * 2
    print("%20s | Accuracy: %0.2f%% (+/- %0.2f%%)" % (name, mean_pct, spread_pct))


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for a linear-kernel SVM.
candidate_c = [.01, .1, 1, 10]
param_grid = [{'C': candidate_c, 'kernel': ['linear']}]

# Exhaustive search over the grid using GridSearchCV's default cross-validation.
clf = SVC(kernel="linear")
grid = GridSearchCV(estimator=clf, param_grid=param_grid).fit(X_train, y_train)
print(grid)


In [None]:
# Report the best mean cross-validated score and the C value that achieved it.
best_score_pct = 100*grid.best_score_
best_c = grid.best_estimator_.C
print("Best score: %0.2f%%" % best_score_pct)
print("Best estimator for parameter C: %f" % best_c)


# Part 3 for Algorithms


In [None]:
# Shared random seed for the estimators and the CV splitter created in Part 3.
seed = 42

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier,\
                            BaggingClassifier,VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline


In [None]:
# Stratified train/test split followed by standard scaling.
def split_data(X, Y, seed=42, train_size=0.8):
    """Split `X`/`Y` into train and test parts and standardise the features.

    The split is stratified on `Y`, uses `seed` as its random state, and puts a
    `train_size` fraction of the rows into the training part. Both feature
    parts are then passed through `preprocess`, which fits the scaler on the
    training part only.

    Returns a tuple `(xtrain, xtest, ytrain, ytest)`.
    """
    parts = train_test_split(X, Y, train_size=train_size, random_state=seed, stratify=Y)
    features_train, features_test, target_train, target_test = parts
    features_train, features_test = preprocess(features_train, features_test)
    return (features_train, features_test, target_train, target_test)

# preprocess the data for training
def preprocess(x1, x2=None):
    """Standardise feature frames with a scaler fitted on `x1` only.

    Parameters
    ----------
    x1 : DataFrame
        Frame used to fit the `StandardScaler`; it is also transformed.
    x2 : DataFrame or None, default None
        Optional second frame, transformed with the statistics learned
        from `x1` (it never influences the fit).

    Returns
    -------
    DataFrame or tuple of DataFrame
        The scaled `x1` when `x2` is None, otherwise `(scaled_x1, scaled_x2)`.
        Column labels and row indices of the inputs are preserved, so the
        results stay aligned with the corresponding target Series after a
        shuffled split.
    """
    sc = StandardScaler()
    # Pass the original index through: rebuilding the frame without it would
    # renumber the rows 0..n-1 and break label alignment with the targets.
    x1_scaled = pd.DataFrame(sc.fit_transform(x1), columns=x1.columns, index=x1.index)
    if x2 is None:
        return x1_scaled
    x2_scaled = pd.DataFrame(sc.transform(x2), columns=x2.columns, index=x2.index)
    return (x1_scaled, x2_scaled)
# for model evaluation and training
def _auc_input(prob):
    """Return class probabilities in the layout `roc_auc_score` expects."""
    prob = np.asarray(prob)
    # Binary targets need a 1-D score for the class with the greater label
    # (column 1); passing both columns raises an error. Multiclass targets take
    # the full (n_samples, n_classes) matrix.
    return prob[:, 1] if prob.ndim == 2 and prob.shape[1] == 2 else prob


def eval_model(model, X, Y, seed=1):
    """Fit `model` on a stratified split of `X`/`Y` and print train/test metrics.

    Parameters
    ----------
    model : estimator
        Must provide `fit`, `predict` and `predict_proba`.
    X, Y : features and target, forwarded to `split_data`.
    seed : int, default 1
        Random state of the train/test split. It is now forwarded to
        `split_data`; previously the argument was accepted but ignored.

    Prints the ROC AUC and the classification report for the training part
    and for the test part. Works for both binary and multiclass targets.
    Returns None.
    """
    xtrain, xtest, ytrain, ytest = split_data(X, Y, seed=seed)
    model.fit(xtrain, ytrain)

    trainpred = model.predict(xtrain)
    trainpred_prob = model.predict_proba(xtrain)
    testpred = model.predict(xtest)
    testpred_prob = model.predict_proba(xtest)

    print("Train ROC AUC : %.4f"%roc_auc_score(ytrain, _auc_input(trainpred_prob), multi_class='ovr'))
    print("\nTrain classification report\n",classification_report(ytrain, trainpred))

    # TODO: add a bar chart showing, for each class, which other classes its
    # misclassified samples were assigned to.

    print("\nTest ROC AUC : %.4f"%roc_auc_score(ytest, _auc_input(testpred_prob), multi_class='ovr'))
    print("\nTest classification report\n",classification_report(ytest, testpred))
    
def plot_importance(columns, importance):
    """Show a bar chart with one bar per entry of `columns`, with heights taken from `importance`."""
    plt.bar(x=columns, height=importance)
    plt.show()



# Feature Extraction, Importance & Splitting


In [None]:
# Target vector and feature matrix for the cross-validated model comparison.
Y = df['class']
X = df.drop(columns=['class'])


In [None]:
# Standardise every feature column; the scaler is fitted on all rows of X.
# NOTE(review): because the statistics come from the whole data set, any later
# cross-validation on X_sc lets held-out folds influence the scaling.
X_sc = preprocess(X)
X_sc

# Creating array of models


In [None]:
# Instantiate the candidate models. Every estimator that accepts a random
# state is given the shared seed; n_jobs=-1 is set where the constructor takes it.
model_logr = LogisticRegression(random_state=seed,n_jobs=-1)
model_nb = GaussianNB()
model_dt = DecisionTreeClassifier(random_state=seed)
# Bagging ensemble built on top of the decision tree defined above.
model_dt_bag = BaggingClassifier(model_dt, random_state=seed, n_jobs=-1)
model_ada = AdaBoostClassifier(random_state=seed)
model_gbc = GradientBoostingClassifier(random_state=seed)
model_rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
model_xgb = XGBClassifier(random_state=seed)
model_lgbm = LGBMClassifier(random_state=seed, n_jobs=-1)
model_knn = KNeighborsClassifier(n_jobs=-1)


In [None]:
# (short name, estimator) pairs, in the order they are evaluated and plotted.
models = [
    ('LR', model_logr),
    ('NB', model_nb),
    ('DT', model_dt),
    ('Bag', model_dt_bag),
    ('Ada', model_ada),
    ('GBC', model_gbc),
    ('RF', model_rf),
    ('XGB', model_xgb),
    ('LGBM', model_lgbm),
    ('KNN', model_knn),
]


 # Running the algorithms


In [None]:
# Compare all candidate models with the same shuffled, stratified 5-fold split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

results = []   # per-model arrays of fold scores (input for the box plot)
names = []     # matching short model names

for name, model in models:
    # Scale inside the pipeline so the scaler is re-fitted on the training folds
    # of every split; scaling the whole data set up front would let the held-out
    # fold leak into the preprocessing statistics.
    pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(pipeline, X, Y, scoring='f1_weighted', cv=cv, n_jobs=-1)
    mean_f1 = scores.mean()
    std = scores.std()
    # The metric is the weighted F1 score (see `scoring`), so label it as such.
    print(f"{name} : Mean weighted F1 {mean_f1} STD:({std})")
    results.append(scores)
    names.append(name)


In [None]:
# Box plot of the per-fold scores, one box per model.
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(1, 1, 1)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()
