In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the dataset CSV (path is relative to the notebook's working directory) into a DataFrame.
park = pd.read_csv("../input/parkinson-disease-detection/Parkinsson disease.csv")

In [None]:
# Preview the first rows to check that the file was parsed as expected.
park.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
park.shape

In [None]:
# Column dtypes and non-null counts.
park.info()

In [None]:
park["status"].value_counts()
# Class balance of the target: 147 datapoints are labelled as having the disease and
# 48 as not having it, so the two classes are imbalanced.

                                                       CHALLENGES:
                                                        
* In the target column, there are 48 healthy people and 147 people with Parkinson's disease, i.e., one of the two classes is under-represented (the dataset is imbalanced), so accuracy at the model level can be misleading. We therefore need to consider performance at the class level, i.e., recall computed from the confusion matrix.
* 'name' is an object (string) column and does not contribute to model building, so it has to be removed from the dataset.
* There is a large set of attributes, so building and analysing a pair plot is difficult.
* In the 'status' attribute's pair plots, the data points overlap over most of the region, so distinguishing between the classes is difficult.

In [None]:
# Select every row that has at least one null value in any column.
# (Original observation: no missing/null data.)
park.loc[park.isna().any(axis="columns")]

In [None]:
# Univariate analysis of the target: bar chart of how many rows fall in each 'status' class.
sns.countplot(data=park, x="status")

In [None]:
# Grid of pairwise plots across the columns of park, used for the bivariate observations below.
sns.pairplot(park)

                                                OBSERVATIONS FROM PAIR PLOT:

* In the 'status' column's pair plots, the data points of the two classes overlap over most of the region, so distinguishing between the classes is difficult.
* A few of the columns, such as HNR, look approximately normally distributed.
* A few pairs of columns are positively correlated, such as Jitter:DDP and MDVP:Shimmer.

In [None]:
# Drop the 'name' column because it does not contribute to model building.
# errors="ignore" keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises a KeyError.
park = park.drop(columns="name", errors="ignore")

In [None]:
# Box plots for the four columns drawn together on one shared axis.
wide_range_cols = ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'HNR']
fig, ax = plt.subplots(figsize=(15, 5))
park.boxplot(column=wide_range_cols, ax=ax)

In [None]:
# Collect, in column order, every column that holds at least one value strictly
# between 0 and 1. The comparison is done column-wise (vectorised) instead of
# looping over every individual cell in Python; the selected columns are the same.
k = [col for col in park.columns if ((park[col] > 0) & (park[col] < 1)).any()]

# Draw the selected columns together on one wide axis.
fig, ax = plt.subplots(figsize=(15, 5))
park.boxplot(column=k, ax=ax)

                                                OBSERVATIONS FROM BOXPLOTS:

* From the box plots above, we can see that there are outliers or long tails or skewness in almost all the columns except MDVP:Fo(Hz), RPDE and DFA columns.
* In the columns which have outliers, most of them are positively skewed except HNR which is negatively skewed.
* In the column 'spread2', we can see the tails or outliers being present on both the sides.

In [None]:
# Summary statistics, transposed so that each column of park becomes one row.
park.describe().transpose()

                                            OBSERVATIONS FROM FIVE POINT SUMMARY:

* A low standard deviation indicates that the data points tend to be close to the mean of the data set, while a high standard deviation indicates that the data points are spread out over a wider range of values.
* So, from the above we can infer that, except for MDVP:Fo(Hz), MDVP:Fhi(Hz) and MDVP:Flo(Hz), the rest of the columns have a spread close to the mean. Keep in mind that standard deviation is scale-dependent, so it should be read relative to each column's own range of values.

In [None]:
# Target vector: the 'status' column.
y = park["status"]
# Feature matrix: every remaining column.
X = park.drop(columns="status")

In [None]:
# Hold out 30% of the rows for testing. stratify=y keeps the class ratio of the
# imbalanced 'status' target the same in both partitions, so the small class is
# not under-sampled in the test set by chance; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=9, stratify=y
)
print(X_train.shape)
print(X_test.shape)

### Created a decision tree model using the “entropy” criterion (splits chosen to reduce entropy) and fitted it to the training data.

In [None]:
# Unregularised decision tree using the entropy split criterion.
# random_state is fixed so that the fitted tree and its scores are reproducible
# across runs.
model = DecisionTreeClassifier(criterion="entropy", random_state=9)

In [None]:
# Fit the decision tree on the training partition.
model.fit(X_train,y_train)

### Tested the model on the test data and computed the accuracy achieved. Captured the predicted values and built a crosstab of actual vs. predicted classes.

In [None]:
# Predict the class of every test row and display the predictions.
preds = model.predict(X_test)
preds

In [None]:
# Accuracy of the model on the training data.
model.score(X_train,y_train)
#accuracy of the model obtained for the train data

In [None]:
# Accuracy of the model on the held-out test data.
accuracy_score(y_test, preds)

In [None]:
# Confusion matrix of actual vs. predicted classes. Explicit row/column names
# replace the auto-generated 'col_0' label so the table can be read on its own.
pd.crosstab(y_test, preds, rownames=["actual"], colnames=["predicted"])

### Used the regularization parameters max_depth and min_samples_leaf to recreate the model and checked their impact on the model accuracy.

In [None]:
# Regularised decision tree: depth capped at 10 and at least 20 samples per leaf.
# random_state is fixed so that the fitted tree and its scores are reproducible
# across runs.
model_reg = DecisionTreeClassifier(
    criterion="entropy", max_depth=10, min_samples_leaf=20, random_state=9
)

In [None]:
# Fit the regularised tree on the training partition.
model_reg.fit(X_train,y_train)

In [None]:
# Predictions of the regularised tree for the test rows.
preds_reg = model_reg.predict(X_test)

In [None]:
# Training accuracy of the regularised tree.
model_reg.score(X_train,y_train)

In [None]:
# Test accuracy of the regularised tree.
accuracy_score(y_test, preds_reg)

                                                        OBSERVATIONS:

* After regularizing, the model accuracy has decreased (for test data).
* But the model without regularization was an overfit model, as the train accuracy was 100% and there was a significant drop in test accuracy.
* After regularization, however, the train and test accuracies are at about the same level, so the model is no longer overfit.

In [None]:
# Random forest of 100 trees, each limited to depth 15.
# random_state is fixed because bootstrap sampling and feature sub-sampling are
# random; without it the scores change on every run.
rfcl = RandomForestClassifier(n_estimators=100, max_depth=15, random_state=9)

In [None]:
# Fit the random forest on the training partition.
rfcl.fit(X_train,y_train)

In [None]:
# Predictions of the random forest for the test rows.
preds_rfcl = rfcl.predict(X_test)

In [None]:
# Training accuracy of the random forest.
rfcl.score(X_train,y_train)

In [None]:
# Test accuracy of the random forest.
accuracy_score(y_test, preds_rfcl)

In [None]:
# Choose the number of trees by 5-fold cross-validation on the TRAINING data only.
# Selecting n_estimators by test-set accuracy would tune the model on the hold-out
# data and make the reported test score optimistically biased. A fixed random_state
# makes the candidates comparable, so differences are not just run-to-run noise.
z = 0  # best mean cross-validated accuracy found so far
b = 0  # n_estimators value that produced z
for i in np.arange(10, 150):
    candidate = RandomForestClassifier(n_estimators=i, max_depth=15, random_state=9)
    cv_acc = cross_val_score(
        candidate, X_train, y_train, cv=5, scoring="accuracy", n_jobs=-1
    ).mean()
    if cv_acc > z:
        z = cv_acc
        b = i

# Refit the selected configuration on the whole training set and score the
# untouched test set exactly once.
rfcl = RandomForestClassifier(n_estimators=b, max_depth=15, random_state=9)
rfcl.fit(X_train, y_train)
preds_rfcl = rfcl.predict(X_test)
acc = accuracy_score(y_test, preds_rfcl)
print("For", b, "number of trees,cross-validated accuracy is", z, "and test accuracy is", acc)

                                                    OBSERVATIONS:

* After using Random forest classifier, we can see a drastic increase in test accuracy score.
* We have used a 'for' loop to determine the optimal number of trees that gives the best result and it is shown above.