# Parkinson's Disease Dataset
Attribute Information:
----------------------
Matrix column entries (attributes):

name - ASCII subject name and recording number

MDVP:Fo(Hz) - Average vocal fundamental frequency

MDVP:Fhi(Hz) - Maximum vocal fundamental frequency

MDVP:Flo(Hz) - Minimum vocal fundamental frequency

MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency

MDVP:Shimmer,

MDVP:Shimmer(dB),

Shimmer:APQ3,

Shimmer:APQ5,

MDVP:APQ,

Shimmer:DDA - Several measures of variation in amplitude

NHR,HNR - Two measures of ratio of noise to tonal components in the voice

status - Health status of the subject (one) - Parkinson's, (zero) - healthy

RPDE,D2 - Two nonlinear dynamical complexity measures

DFA - Signal fractal scaling exponent

spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation 


# 1. Importing Libraries for ML

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")


ModuleNotFoundError: No module named 'imblearn'

# 2. Importing Data

In [None]:
# Load the dataset from 'parkinsons.csv' (path relative to the working directory)
# and preview the first three rows.
df = pd.read_csv('parkinsons.csv')
df.head(3)

# 3.  Data Cleaning
1. Finding the null values and handling them in proper way.
2. Drop unnecessary data columns.
3. Find outliers and remove them.

In [None]:
# Preview the first three rows, restricted to the first ten columns.
df.iloc[:3,:10]

In [None]:
# Preview the first three rows, restricted to the first fourteen columns.
df.iloc[:3,:14]

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

## 3.1 Finding the null values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## 3.2 Drop the "name" column which is unnecessary in predictive modelling.

In [None]:
# Drop the "name" column in place; df is modified, nothing is returned.
df.drop(["name"], axis=1, inplace=True)

## 3.3 Finding outliers and removing them.

#### Create functions to
- plot the outliers using boxplots and distplots
- find outliers in skewed and normally distributed data.
- remove outliers in skewed and normally distributed data.

In [None]:
def plot_distribution(df, col):
    """Show a distribution plot and a boxplot of one column side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to plot.
    col : str
        Name of the column in ``df`` to plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Pass figsize to subplots() itself. Calling plt.figure(figsize=...) after
    # subplots() would open a second, empty figure and leave these axes at the
    # default size.
    fig, ax = plt.subplots(1, 2, figsize=(40, 10))
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases in
    # favour of histplot/displot; kept here so the rendered plot is unchanged.
    sns.distplot(df[col], ax=ax[0])
    df.boxplot(column=col, ax=ax[1])
    plt.show()

def find_outliers_normal(data, col):
    """Print the 3-sigma outlier bounds and outlier counts for one column.

    The bounds are ``mean ± 3 * std`` of ``data[col]``. Values strictly above
    the upper bound are counted as upper outliers and values strictly below
    the lower bound as lower outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to inspect; it is not modified.
    col : str
        Name of the column in ``data``.

    Returns
    -------
    None
        The report is written to standard output.
    """
    upper_bound = data[col].mean() + 3*data[col].std()
    lower_bound = data[col].mean() - 3*data[col].std()

    upper_outliers = len(data.loc[data[col]>upper_bound])
    # Lower outliers lie *below* the lower bound. This used to repeat the
    # upper-bound comparison, so it always reported the upper count.
    lower_outliers = len(data.loc[data[col]<lower_bound])

    print("Outliers in '",col,"' Column")
    print("Upper Bound - ",upper_bound)
    print("Lower Bound - ",lower_bound)
    print("Number of Upper Outliers - ",upper_outliers)
    print("Number of Lower Outliers - ",lower_outliers)
    print("")

def find_outliers_skewed(df, col):
    """Print the IQR-based outlier fences and outlier counts for one column.

    The fences are ``Q3 + 1.5 * IQR`` and ``Q1 - 1.5 * IQR``. Values strictly
    outside a fence are counted as outliers on that side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect; it is not modified.
    col : str
        Name of the column in ``df``.

    Returns
    -------
    None
        The report is written to standard output.
    """
    values = df[col]
    third_quartile = values.quantile(0.75)
    first_quartile = values.quantile(0.25)
    IQR = third_quartile - first_quartile

    upper_bound = third_quartile + (IQR*1.5)
    lower_bound = first_quartile - (IQR*1.5)

    # Summing a boolean mask counts the rows that satisfy the comparison.
    upper_outliers = int((values > upper_bound).sum())
    lower_outliers = int((values < lower_bound).sum())

    report = (
        ("Outliers in '", col, "' Column"),
        ("Upper Bound - ", upper_bound),
        ("Lower Bound - ", lower_bound),
        ("Number of Upper Outliers - ", upper_outliers),
        ("Number of Lower Outliers - ", lower_outliers),
        ("",),
    )
    for parts in report:
        print(*parts)


    
def remove_outliers_normal(df, col):
    """Return a copy of ``df`` with one column capped at ``mean ± 3 * std``.

    Despite the name, no rows are dropped: values at or beyond a bound are
    replaced by that bound. The input frame is left untouched because the
    work is done on a deep copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    col : str
        Name of the column to cap.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` with ``col`` capped.
    """
    capped = df.copy(deep=True)

    # Both bounds are fixed before any value is overwritten.
    ceiling = capped[col].mean() + 3*capped[col].std()
    floor = capped[col].mean() - 3*capped[col].std()

    too_high = capped[col] >= ceiling
    capped.loc[too_high, col] = ceiling

    too_low = capped[col] <= floor
    capped.loc[too_low, col] = floor

    return capped


def remove_outliers_skewed(df, col):
    """Cap one column of ``df`` at its IQR fences, modifying ``df`` in place.

    The fences are ``Q3 + 1.5 * IQR`` and ``Q1 - 1.5 * IQR``. Despite the
    name, no rows are dropped: values at or beyond a fence are replaced by
    that fence. The frame passed in is itself modified and then returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    col : str
        Name of the column to cap.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after capping.
    """
    first_quartile = df[col].quantile(0.25)
    third_quartile = df[col].quantile(0.75)
    spread = third_quartile - first_quartile

    ceiling = third_quartile + (spread*1.5)
    floor = first_quartile - (spread*1.5)

    above = df[col] >= ceiling
    df.loc[above, col] = ceiling

    below = df[col] <= floor
    df.loc[below, col] = floor

    return df

In [None]:
# All column names except the target "status"; these are the predictor columns
# iterated over by the plotting and outlier cells below.
features_list = list(df.columns)
features_list.remove("status")
print(features_list)

In [None]:
# Plot a distribution plot and a boxplot for every column except "status".
for i in features_list:
    plot_distribution(df, i)

In [None]:
# Report the IQR-based outlier bounds and counts for every feature column.
for i in features_list:
    find_outliers_skewed(df, i)

In [None]:
# Cap the outliers in each column: values beyond the IQR fences are replaced by
# the fence value (no rows are dropped).
# Rebind df to the returned frame instead of relying on the helper mutating its
# argument in place, so the loop stays correct whichever way the helper works.
for i in features_list:
    df = remove_outliers_skewed(df, i)

In [None]:
# Check whether any outliers still exist in the data after capping.
for i in features_list:
    find_outliers_skewed(df, i)

# 4. Feature Engineering
## 4.1 Selecting features using a correlation heatmap and removing highly correlated features
   - Heatmap for plotting correlation matrix.
   - Create function to remove highly correlated features


In [None]:
# Annotated heatmap of the pairwise correlation matrix of all remaining columns.
plt.figure(figsize=(20,20))
sns.heatmap(df.corr(), annot=True)

In [None]:
def highly_correlated_columns(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    A column is reported when the absolute value of its correlation with at
    least one column positioned before it in ``dataset`` is strictly greater
    than ``threshold``. The first column can therefore never be reported.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are evaluated.
    threshold : float
        Absolute correlation above which a column is reported.

    Returns
    -------
    set
        Names of the reported columns.
    """
    # Only the magnitude of the coefficient matters, not its sign.
    strength = dataset.corr().abs()

    flagged = set()
    for position, column in enumerate(strength.columns):
        # Compare against earlier columns only (lower triangle of the matrix).
        earlier = strength.iloc[position, :position]
        if (earlier > threshold).any():
            flagged.add(column)
    return flagged

In [None]:
# Highly correlated features, computed on the predictor columns only.
# The target "status" is excluded so that it can never be flagged for removal
# itself, and so that a predictor is never discarded merely because it
# correlates strongly with the target.
corr_features = highly_correlated_columns(df.drop(columns=["status"]), 0.85)
corr_features

In [None]:
# Remove the columns flagged as highly correlated (in place).
df.drop(corr_features, axis=1, inplace=True)

In [None]:
# Check whether any other features are still highly correlated.
plt.figure(figsize=(12,12))
sns.heatmap(df.corr(), annot=True)

## 4.2 Handling Imbalance data

As we can see, there is a large imbalance in the data: there are 48 records for the negative class and 147 for the positive class.

In [None]:
# Split the data into features (x) and target (y).
x = df.drop(["status"], axis=1)
y = df["status"]

# countplot() to see the class imbalance.
# The Series is passed explicitly as `x`: newer seaborn releases treat the
# first positional argument as `data`, so a bare positional Series is not
# plotted as the categorical variable.
sns.countplot(x=y)

In [None]:
# See how many records there are for each class.
y.value_counts()

###  Oversampling for imbalance dataset

In [None]:
# Seed the sampler so the resampled dataset (and every score computed from it)
# is reproducible between runs.
over_sampler = RandomOverSampler(random_state=42)

# NOTE(review): the whole dataset is oversampled before the train/test split
# further down, so duplicated minority-class rows can land in both the training
# and the test set, which inflates the test scores. Consider splitting first
# and resampling only the training portion.
x, y = over_sampler.fit_resample(x, y)

In [None]:
# Class distribution after oversampling. The Series is passed explicitly as `x`
# because newer seaborn releases treat the first positional argument as `data`.
sns.countplot(x=y)

In [None]:
# Number of records per class after oversampling.
y.value_counts()

## 4.3 Splitting Data into Training and Test Sets

In [None]:
# Hold out half of the rows for testing. stratify=y keeps the class proportions
# identical in both halves instead of leaving them to chance.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.5, stratify=y)

# 5. Predictive Modelling
###  Finding the best `n_estimators` value for RandomForest Classifier

Compute the score of a RandomForestClassifier for each `n_estimators` value and see which value gives the highest score.

In [None]:
# Candidate n_estimators values, defined once so the loop and the plot cannot drift apart.
estimator_grid = range(40,150,2)

scores = []

for i in estimator_grid:
    # A fixed seed makes the comparison between candidates reflect n_estimators
    # rather than run-to-run randomness of the forest.
    model = RandomForestClassifier(n_estimators=i, bootstrap=True, random_state=42)

    # Score each candidate with 5-fold cross-validation on the training data.
    # Choosing the hyperparameter by its test-set score would leak the test set
    # into model selection and make the final test score optimistic.
    scores.append(cross_val_score(model, x_train, y_train, cv=5).mean())

plt.plot(estimator_grid, scores)

In [None]:
# Pick the n_estimators value whose score is highest (first one wins on ties).
estimators = np.arange(40,150,2)
high_score_index = int(np.argmax(scores))
n_est = estimators[high_score_index]

### Define RandomForest Classifier

In [None]:
# Train a random forest with the selected number of trees on the training split.
# The same `rf` object is later passed as the estimator to RandomizedSearchCV.
rf = RandomForestClassifier(n_estimators=n_est)
rf.fit(x_train,y_train)

In [None]:
# Predicted class labels for the test split.
y_pred = rf.predict(x_test)

### Evaluation of RandomForest Classifier

In [None]:
# Classification report (per-class precision, recall, F1 and support)
# for the random forest classifier on the test split.
cr = classification_report(y_test, y_pred)
print(cr)

In [None]:
# Confusion matrix on the test split (rows: true labels, columns: predicted labels).
confusion_matrix(y_test, y_pred)

In [None]:
# Mean accuracy of the model on the test split.
rf.score(x_test, y_test)

# 6. Hyperparameter Tuning 

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]

# Number of features to consider at every split.
# 'auto' is intentionally not listed: for classifiers it was an alias of 'sqrt'
# and recent scikit-learn releases reject it, which would make every sampled
# candidate using it fail to fit.
max_features = ['sqrt','log2']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5,50,10)]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]

# Parameter dictionary for RandomizedSearchCV.
# 'n_estimators' is included so the tree counts defined above are actually
# searched; previously the list was built but never used.
params = { 'n_estimators': n_estimators,
           'max_features': max_features,
           'max_depth': max_depth,
           'min_samples_split': min_samples_split,
           'min_samples_leaf': min_samples_leaf,
           'criterion':['entropy','gini']}


# random_state fixes which 100 parameter combinations are sampled, so the search
# can be reproduced.
rf_randomcv = RandomizedSearchCV(estimator=rf, param_distributions=params, n_iter=100, cv=5, verbose=2, random_state=42)

rf_randomcv.fit(x_train, y_train)

In [None]:
# Predicted class labels for the test split from the fitted search object.
y_prediction  = rf_randomcv.predict(x_test)

In [None]:
# Classification report for the random forest classifier after hyperparameter tuning
cr = classification_report(y_test, y_prediction)
print(cr)

In [None]:
# Confusion matrix after hyperparameter tuning
confusion_matrix(y_test, y_prediction)

In [None]:
# Score of the fitted search object on the test split, after hyperparameter tuning
rf_randomcv.score(x_test, y_test)

In [None]:
# Best hyperparameter values found by the randomized search
rf_randomcv.best_params_

# Thank you !!!