# Decision Tree & Random Forest Classification Notebook
---------
We will use the `diabetes.csv` dataset in this notebook.

The dataset contains medical measurements for a number of patients, together with the label to predict (`Outcome`, our Y): whether the patient has diabetes or not.

We will also use cross-validation and grid search, as in the previous notebooks.

## Step 1: Importing Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV

## Step 2: Reading Data

In [2]:
# Load the diabetes dataset; the relative path is resolved against the
# notebook's current working directory. The bare `df` on the last line
# makes the frame the cell's displayed output.
df = pd.read_csv(filepath_or_buffer='Data/diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


## Step 3: Cleaning Data if needed

In [3]:
# Inspect each column's dtype (useful for spotting columns that were not
# parsed as numbers).
# NOTE(review): the preview above shows 0 values in columns such as
# SkinThickness and Insulin; these may be placeholders for missing
# measurements — confirm before treating the data as clean.
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

## Step 4: Separating Data

In [4]:
# Features: every column except the last one.
X = df.iloc[:, :-1]
# Target: the last column only (a Series).
Y = df.iloc[:, -1]
# Sanity-check the resulting shapes.
print(X.shape, Y.shape)

(768, 8) (768,)


## Step 5: Build the Model & evaluate

### 5.1: Decision Tree Classifier

In [5]:
# Use an exhaustive grid search, scored with 10-fold cross-validation, to pick
# the tree depth. `max_depth=None` places no limit on the depth.
#
# random_state is fixed so the search is reproducible: scikit-learn's decision
# tree randomly permutes the features at each split, so without a seed, ties
# between equally good splits can be broken differently from run to run and
# the reported best score / best depth can change between executions.
grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=dict(max_depth=[2, 3, 4, 5, 6, 7, 8, None]),
    cv=10,
)
grid = grid.fit(X, Y)

In [6]:
# Show the best mean cross-validated score as a percentage (one decimal),
# followed by the hyperparameters that produced it.
print("{:.1f}%".format(grid.best_score_ * 100))
print(grid.best_params_)

74.0%
{'max_depth': 2}


In [8]:
# Re-evaluate a fresh tree built with the hyperparameters chosen by the search.
# Unpacking `best_params_` keeps this cell in sync with whatever the grid tuned,
# and the fixed random_state makes the reported accuracy repeatable.
#
# NOTE: the splitter here is a plain, unshuffled KFold, whereas an integer `cv`
# passed to GridSearchCV uses stratified folds for classifiers, so this number
# is not directly comparable to `grid.best_score_`. It is also an optimistic
# estimate, because the same data was used to select the hyperparameters.
model = DecisionTreeClassifier(random_state=42, **grid.best_params_)
accuracy = cross_val_score(model, X, Y, cv=KFold(n_splits=10))
print('Accuracy = %.2f %%' % (accuracy.mean() * 100))

Accuracy = 75.51 %


### 5.2: Random Forest Classifier

In [9]:
# Use an exhaustive grid search, scored with 10-fold cross-validation, to tune
# the forest's depth, number of trees and split criterion.
max_depth_options = [2, 3, 4, 5, 6, 7, 8, None]
n_estimators_options = [50, 100, 150, 200, 250]
criterion_options = ['gini', 'entropy']

# random_state is fixed because a random forest is stochastic (bootstrap
# samples and random feature subsets); without a seed the best score and the
# selected hyperparameters change from run to run.
# n_jobs=-1 spreads the 8 * 5 * 2 = 80 candidates x 10 folds = 800 fits over
# all available cores; it does not affect the results.
# Note: this rebinds `grid`, replacing the decision-tree search object.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=dict(
        max_depth=max_depth_options,
        n_estimators=n_estimators_options,
        criterion=criterion_options,
    ),
    cv=10,
    n_jobs=-1,
)

In [10]:
# Run the search. `fit` returns the fitted search object itself, so no
# re-assignment is needed; the trailing semicolon keeps the cell from
# echoing that object as output.
grid.fit(X, Y);

In [11]:
# Best mean cross-validated score of the search (percentage, one decimal)
# and the hyperparameter combination that achieved it.
print(format(grid.best_score_ * 100, ".1f") + "%")
print(grid.best_params_)

77.7%
{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 150}


In [13]:
# Re-evaluate a fresh forest built with the hyperparameters chosen by the search.
# Unpacking `best_params_` avoids re-listing each tuned parameter by hand, and
# the fixed random_state makes the reported accuracy repeatable.
#
# NOTE: the splitter here is a plain, unshuffled KFold, whereas an integer `cv`
# passed to GridSearchCV uses stratified folds for classifiers, so this number
# is not directly comparable to `grid.best_score_`. It is also an optimistic
# estimate, because the same data was used to select the hyperparameters.
model = RandomForestClassifier(random_state=42, **grid.best_params_)
accuracy = cross_val_score(model, X, Y, cv=KFold(n_splits=10))
print('Accuracy = %.2f %%' % (accuracy.mean() * 100))

Accuracy = 77.60 %
