In [1]:
import pandas as pd
import numpy as np
import plotly_express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
col_name = ['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age', 'outcome']
df = pd.read_csv('pima-indians-diabetes-2.data', names = col_name)

In [3]:
df.shape

(768, 9)

In [4]:
df.head()

Unnamed: 0,preg,glu,bp,sft,ins,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df['outcome'].value_counts()

0    500
1    268
Name: outcome, dtype: int64

In [6]:
# Replacing 0 as 'Healthy' and 1 as 'Diabetic'

df['outcome'] = df['outcome'].replace({0:'Healthy', 1:'Diabetic'})

In [7]:
# Converting 'outcome' to category type

df['outcome'] = df.outcome.astype('category')

In [8]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc

In [9]:
# Separating dependent and independent variables

x = df[['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age']]
y = df['outcome']

In [10]:
# Splitting the data for train and test

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.30, random_state = 1)

In [11]:
# Converting x and y to array

X_train = np.array(x_train)
X_test = np.array(x_test)
Y_train = np.array(y_train)
Y_test = np.array(y_test)

In [12]:
# Training the model

mo = DecisionTreeClassifier(criterion = 'entropy')
mo.fit(X_train, Y_train)

DecisionTreeClassifier(criterion='entropy')

In [13]:
# Prediction

pred = mo.predict(X_test)

In [14]:
# Checking the accuracy

acc = metrics.accuracy_score(Y_test, pred)
print ('Accuracy:', acc * 100)

Accuracy: 66.91449814126395


In [15]:
# Classification report

print (metrics.classification_report(Y_test, pred))

              precision    recall  f1-score   support

    Diabetic       0.52      0.49      0.50       184
     Healthy       0.74      0.76      0.75       354

    accuracy                           0.67       538
   macro avg       0.63      0.63      0.63       538
weighted avg       0.66      0.67      0.67       538



In [16]:
# Confusion matrix

print (metrics.confusion_matrix(Y_test, pred))

[[ 90  94]
 [ 84 270]]


In [17]:
# Importance of columns for the outcome

print (pd.DataFrame(mo.feature_importances_, columns = ['Imp'], index = x_train.columns))

           Imp
preg  0.021339
glu   0.195956
bp    0.072435
sft   0.022428
ins   0.033724
bmi   0.244415
dpf   0.090737
age   0.318964


In [18]:
# Setting max_depth for the decision tree

mo2 = DecisionTreeClassifier(criterion = 'entropy', max_depth = 5)
mo2.fit(X_train, Y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=5)

In [19]:
new_pred = mo2.predict(X_test)

In [21]:
# New accuracy

new_acc = metrics.accuracy_score(Y_test, new_pred)
print ('Accuracy:', new_acc * 100)

Accuracy: 70.26022304832715


Accuracy improved.

In [22]:
print(metrics.confusion_matrix(Y_test, new_pred))

[[109  75]
 [ 85 269]]


In [23]:
print(metrics.classification_report(Y_test, new_pred))

              precision    recall  f1-score   support

    Diabetic       0.56      0.59      0.58       184
     Healthy       0.78      0.76      0.77       354

    accuracy                           0.70       538
   macro avg       0.67      0.68      0.67       538
weighted avg       0.71      0.70      0.70       538



In [25]:
# Exporting the tree diagram

from IPython.display import Image
from sklearn import tree
from os import system

tree_diag = open('decision_tree_diabetes.dot', 'w')
dot = tree.export_graphviz(mo2, out_file = tree_diag, feature_names = list(x_train), class_names = list(y_train))
tree_diag.close()

In [26]:
# To check flow diagram of the the decision tree
# http://graphviz.it/