In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the dataset CSV from the Kaggle input directory and preview its first rows.
DATASET_PATH = '../input/parkinson-disease-detection/Parkinsson disease.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

In [None]:
# Remove the 'name' column before computing correlations.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned in place, so on a
# second execution 'name' is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['name'], errors='ignore')

# Pairwise correlation matrix of the remaining columns (pandas default method).
df_corr = df.corr()

In [None]:
# Preview the first rows of the correlation matrix.
df_corr.head()

In [None]:
# Find redundant (highly correlated) features and drop them.
#
# Features are visited in column order. A feature is dropped when its correlation
# with any feature kept so far exceeds CORRELATION_THRESHOLD, so every dropped
# feature still has a retained feature it is highly correlated with. Scanning the
# full symmetric matrix instead would flag both members of each correlated pair
# and remove the whole group.
#
# NOTE: only positive correlations above the threshold are treated as redundant;
# strongly negative correlations are left alone.
CORRELATION_THRESHOLD = 0.67
TARGET_COLUMN = 'status'

higly_correlated_features = set()
kept_features = []

for candidate in df_corr.columns:
    # The target is compared by column *name*: it must never be dropped itself,
    # and correlation with the target must never cause a feature to be dropped.
    if candidate == TARGET_COLUMN:
        continue

    # Label-based scalar lookup; avoids chained `iloc[i][j]`, which relies on the
    # deprecated positional fallback of Series.__getitem__ with integer keys.
    is_redundant = any(
        df_corr.loc[kept, candidate] > CORRELATION_THRESHOLD
        for kept in kept_features
    )
    if is_redundant:
        higly_correlated_features.add(candidate)
    else:
        kept_features.append(candidate)

print(higly_correlated_features)

# errors='ignore' keeps the cell re-runnable: `df` is reassigned in place, so on a
# second execution the flagged columns are already gone.
df = df.drop(columns=list(higly_correlated_features), errors='ignore')

In [None]:
# Preview the first rows of the cleaned-up dataset.
df.head()

In [None]:
# check for tuning parameters (looking for accuracy > 0.98 and difference between train and test < 0.5 to avoid most overfitting configurations)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Hold-out evaluation of a RandomForest grid over n_estimators and max_depth.
#
# X holds every column except 'status'; y is the 'status' column (the target).
X = df.drop(['status'], axis=1).values
y = df['status'].values

# 70/30 shuffled split; the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=0)

# Reporting thresholds. With a test-accuracy floor of 0.98 the train/test gap can
# never reach 0.02, so the gap check only becomes effective if the floor is lowered.
MIN_TEST_ACCURACY = 0.98
MAX_TRAIN_TEST_GAP = 0.5

for est in range(5, 60, 5):
    for depth in range(5, 20, 2):
        # Seed the forest too: without random_state every run trains a different
        # forest, so the set of configurations printed below changes between runs.
        classifier = RandomForestClassifier(n_estimators=est, max_depth=depth, random_state=0)
        classifier.fit(X_train, y_train)
        y_train_pred = classifier.predict(X_train)
        y_test_pred = classifier.predict(X_test)

        accuracy_train = accuracy_score(y_train, y_train_pred)
        accuracy_test = accuracy_score(y_test, y_test_pred)

        # Report only configurations that clear the accuracy floor without a
        # large train/test gap.
        if accuracy_test > MIN_TEST_ACCURACY and accuracy_train - accuracy_test < MAX_TRAIN_TEST_GAP:
            print('est: ' + str(est) + ', depth: ' + str(depth))
            print('Accuracy\t\ttrain: %.4f , test: %.4f' % (accuracy_train, accuracy_test))

In [None]:
# do the same as above, but with k-fold cross-validation: note that the average performance is lower,
# but probably closer to reality

from sklearn.model_selection import StratifiedKFold

# Stratified k-fold cross-validation of a RandomForest grid over n_estimators and
# max_depth. For every configuration the mean and standard deviation of the
# out-of-fold accuracy are printed.
n_fold = 5

# Shuffled, seeded splitter: the same folds are produced on every run.
fold = StratifiedKFold(n_splits=n_fold, random_state=0, shuffle=True)

for est in range(5, 60, 5):
    for depth in range(5, 20, 2):

        scores = []

        # Iterate over each fold
        for train_index, test_index in fold.split(X, y):

            # Fold-local names, so the hold-out split stored in
            # X_train / X_test / y_train / y_test is not overwritten.
            X_fold_train, y_fold_train = X[train_index], y[train_index]
            X_fold_test, y_fold_test = X[test_index], y[test_index]

            # Fresh model per fold; seeded so the scores are reproducible.
            classifier = RandomForestClassifier(n_estimators=est, max_depth=depth, random_state=0)

            # Fit on the training part of the fold
            classifier.fit(X_fold_train, y_fold_train)

            # Predict and score the out-of-fold (test) part
            pred = classifier.predict(X_fold_test)
            scores.append(accuracy_score(y_fold_test, pred))

        scores = np.array(scores)
        print('est: ' + str(est) + ', depth: ' + str(depth))
        print("Accuracy average on test set: %.4f std.dev.: %.4f" % (scores.mean(), scores.std()))