# ---- Diabetes Detection -----
### Algorithm : Decision Tree Algorithm
###### Created by - Shreyas Mane 

##### Introduction :
Diabetes is a metabolic disorder that affects millions of individuals worldwide.   Diabetes-related problems in several key organs of the body can be deadly if left untreated. Diabetes identification is critical for timely treatment, which can prevent the condition from advancing to severe consequences.  
The main aim of this project is to predict the possible presence of diabetes at an early stage using machine learning technique.

In [1]:
#Import required Libraries and read Data
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#Reading the dataset
df=pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Outcome
0,50,6,148,72,35,0,33.6,0.627,yes
1,31,1,85,66,29,0,26.6,0.351,no
2,32,8,183,64,0,0,23.3,0.672,yes
3,21,1,89,66,23,94,28.1,0.167,no
4,33,0,137,40,35,168,43.1,2.288,yes


In [3]:
#Total Rows and Columns
df.shape

(768, 9)

#### Preprocessing and Feature Extraction

In [4]:
#Select only required features
mdf=df.drop(["DiabetesPedigreeFunction","Pregnancies","SkinThickness"],axis="columns")
mdf.columns

Index(['Age', 'Glucose', 'BloodPressure', 'Insulin', 'BMI', 'Outcome'], dtype='object')

In [5]:
#Detect missing values
mdf.isna().sum()

Age              0
Glucose          0
BloodPressure    0
Insulin          0
BMI              0
Outcome          0
dtype: int64

In [6]:
#Required dataset features
mdf.head()

Unnamed: 0,Age,Glucose,BloodPressure,Insulin,BMI,Outcome
0,50,148,72,0,33.6,yes
1,31,85,66,0,26.6,no
2,32,183,64,0,23.3,yes
3,21,89,66,94,28.1,no
4,33,137,40,168,43.1,yes


In [7]:
#convert input into numbers 
from sklearn.preprocessing import LabelEncoder
le_Outcomes=LabelEncoder()
mdf['Outcome_n'] = le_Outcomes.fit_transform(mdf['Outcome'])
mdf.head()

Unnamed: 0,Age,Glucose,BloodPressure,Insulin,BMI,Outcome,Outcome_n
0,50,148,72,0,33.6,yes,1
1,31,85,66,0,26.6,no,0
2,32,183,64,0,23.3,yes,1
3,21,89,66,94,28.1,no,0
4,33,137,40,168,43.1,yes,1


In [8]:
#Removing old outcome column
nmdf=mdf.drop(['Outcome'],axis=1)
nmdf

Unnamed: 0,Age,Glucose,BloodPressure,Insulin,BMI,Outcome_n
0,50,148,72,0,33.6,1
1,31,85,66,0,26.6,0
2,32,183,64,0,23.3,1
3,21,89,66,94,28.1,0
4,33,137,40,168,43.1,1
...,...,...,...,...,...,...
763,63,101,76,180,32.9,0
764,27,122,70,0,36.8,0
765,30,121,72,112,26.2,0
766,47,126,60,0,30.1,1


In [9]:
#Dataset Description
nmdf.describe()

Unnamed: 0,Age,Glucose,BloodPressure,Insulin,BMI,Outcome_n
count,768.0,768.0,768.0,768.0,768.0,768.0
mean,33.240885,120.894531,69.105469,79.799479,31.992578,0.348958
std,11.760232,31.972618,19.355807,115.244002,7.88416,0.476951
min,21.0,0.0,0.0,0.0,0.0,0.0
25%,24.0,99.0,62.0,0.0,27.3,0.0
50%,29.0,117.0,72.0,30.5,32.0,0.0
75%,41.0,140.25,80.0,127.25,36.6,1.0
max,81.0,199.0,122.0,846.0,67.1,1.0


In [10]:
#checking unique entities in age feature
nmdf.Age.unique()

array([50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 34, 57, 59, 51, 27, 41, 43,
       22, 38, 60, 28, 45, 35, 46, 56, 37, 48, 40, 25, 24, 58, 42, 44, 39,
       36, 23, 61, 69, 62, 55, 65, 47, 52, 66, 49, 63, 67, 72, 81, 64, 70,
       68], dtype=int64)

#### Test-Train Data Split


In [11]:
x=nmdf.drop("Outcome_n",axis="columns")
y=nmdf['Outcome_n']

from sklearn.model_selection import train_test_split
#Splitting dataset into 70-30 ratio
x_train,x_test,y_train,y_test=train_test_split(x,y, test_size=0.3)

In [12]:
# Check the size of data after split
print(x.shape)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(768, 5)
(537, 5)
(231, 5)
(537,)
(231,)


#### Decision Tree Classifier

In [13]:
# Building Decision Tree Classifier
from sklearn import tree
model = tree.DecisionTreeClassifier(criterion='entropy')

In [14]:
# Train the model
model.fit(x_train,y_train)

DecisionTreeClassifier(criterion='entropy')

In [27]:
#Prediction of Diabetes 
print(model.predict([[66,66,0,266,854]]))
print(model.predict([[50,148,72,0,33.6]]))

[0]
[1]


In [26]:
# Calculating y Prediction
y_pred=model.predict(x_test)
y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1])

#### Model Accuracy

In [17]:
# Check Model Score
model.score(x_test,y_test)

0.7272727272727273

In [18]:
# Measure Accuarcy
from sklearn.metrics import confusion_matrix, average_precision_score
from sklearn.metrics import precision_recall_fscore_support
cm=confusion_matrix(y_pred,y_test)
aps=average_precision_score(y_pred,y_test)
print(precision_recall_fscore_support(y_pred,y_test, average='macro'))
print("--------")
print("Confusion Matrix is -")
print(cm)
print("--- Average Precision Score is ----")
print(aps)

(0.6938775510204082, 0.7046153846153846, 0.6979263964132263, None)
--------
Confusion Matrix is -
[[120  36]
 [ 27  48]]
--- Average Precision Score is ----
0.4825974025974026


In [19]:
#Acccuracy of model
print("Accuracy of Model is :",model.score(x_test,y_test)*100)

Accuracy of Model is : 72.72727272727273


In [20]:
#importing pickle file
import pickle
pickle_out = open('classifier.pkl','wb')
pickle.dump(model,pickle_out)
pickle_out.close()