In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px
%matplotlib inline

In [81]:
col_name = ['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age', 'outcome']
df = pd.read_csv('pima-indians-diabetes-2.data', names = col_name)

In [82]:
df.shape

(768, 9)

In [83]:
df.head()

Unnamed: 0,preg,glu,bp,sft,ins,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [84]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
preg,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
glu,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
bp,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
sft,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
ins,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
bmi,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
dpf,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [85]:
df.isnull().sum()

preg       0
glu        0
bp         0
sft        0
ins        0
bmi        0
dpf        0
age        0
outcome    0
dtype: int64

In [86]:
df.dtypes

preg         int64
glu          int64
bp           int64
sft          int64
ins          int64
bmi        float64
dpf        float64
age          int64
outcome      int64
dtype: object

# Checking for variation in each column

In [87]:
df['preg'].nunique()

17

In [88]:
df['glu'].nunique()

136

In [89]:
df['bp'].nunique()

47

In [90]:
df['sft'].nunique()

51

In [91]:
df['ins'].nunique()

186

In [92]:
df['bmi'].nunique()

248

In [93]:
df['dpf'].nunique()

517

In [94]:
df['age'].nunique()

52

In [95]:
df['outcome'].nunique()

2

In [96]:
# 'dpf' is having many unique values --> higher variation
# Dropping 'dpf'

df = df.drop('dpf', axis = 1)

In [97]:
df.columns

Index(['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'age', 'outcome'], dtype='object')

# Checking for outliers

In [98]:
px.box(df['preg'])

In [99]:
# Using IQR to remove outliers from 'preg'

q1 = df.preg.quantile(0.25)
q3 = df.preg.quantile(0.75)
IQR = q3 - q1
low = q1 - 1.5 * IQR
high = q3 + 1.5 * IQR
df.drop(df[df.preg < low].index, inplace = True)
df.drop(df[df.preg > high].index, inplace = True)

In [100]:
px.box(df['preg'])

In [101]:
px.box(df['glu'])

In [102]:
# Using IQR to remove outliers from 'glu'

q1 = df.glu.quantile(0.25)
q3 = df.glu.quantile(0.75)
IQR = q3 - q1
low = q1 - 1.5 * IQR
high = q3 + 1.5 * IQR
df.drop(df[df.glu < low].index, inplace = True)
df.drop(df[df.glu > high].index, inplace = True)

In [103]:
px.box(df['glu'])

In [104]:
px.box(df['bp'])

In [105]:
# Using IQR to remove outliers from 'bp'

q1 = df.bp.quantile(0.25)
q3 = df.bp.quantile(0.75)
IQR = q3 - q1
low = q1 - 1.5 * IQR
high = q3 + 1.5 * IQR
df.drop(df[df.bp < low].index, inplace = True)
df.drop(df[df.bp > high].index, inplace = True)

In [106]:
px.box(df['bp'])

In [107]:
px.box(df['sft'])

In [108]:
# Using IQR to remove outliers from 'sft'

q1 = df.sft.quantile(0.25)
q3 = df.sft.quantile(0.75)
IQR = q3 - q1
low = q1 - 1.5 * IQR
high = q3 + 1.5 * IQR
df.drop(df[df.sft < low].index, inplace = True)
df.drop(df[df.sft > high].index, inplace = True)

In [109]:
px.box(df['sft'])

In [110]:
px.box(df['ins'])

In [111]:
# Using IQR to remove outliers from 'ins'

q1 = df.ins.quantile(0.25)
q3 = df.ins.quantile(0.75)
IQR = q3 - q1
low = q1 - 1.5 * IQR
high = q3 + 1.5 * IQR
df.drop(df[df.ins < low].index, inplace = True)
df.drop(df[df.ins > high].index, inplace = True)

In [112]:
px.box(df['ins'])

In [113]:
px.box(df['bmi'])

In [114]:
# Using IQR to remove outliers from 'bmi'

q1 = df.bmi.quantile(0.25)
q3 = df.bmi.quantile(0.75)
IQR = q3 - q1
low = q1 - 1.5 * IQR
high = q3 + 1.5 * IQR
df.drop(df[df.bmi < low].index, inplace = True)
df.drop(df[df.bmi > high].index, inplace = True)

In [115]:
px.box(df['bmi'])

In [116]:
px.box(df['age'])

In [117]:
# Using IQR to remove outliers from 'age'

q1 = df.age.quantile(0.25)
q3 = df.age.quantile(0.75)
IQR = q3 - q1
low = q1 - 1.5 * IQR
high = q3 + 1.5 * IQR
df.drop(df[df.age < low].index, inplace = True)
df.drop(df[df.age > high].index, inplace = True)

In [118]:
px.box(df['age'])

In [119]:
px.box(df['outcome'])

# Categorizing the data (binning)

In [120]:
df.columns

Index(['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'age', 'outcome'], dtype='object')

In [121]:
df['preg'].value_counts()

1     120
0      92
2      90
3      68
4      61
5      50
6      45
7      38
8      31
9      25
10     22
11     10
12      9
13      8
Name: preg, dtype: int64

In [122]:
#Dividing 'preg' into 3 groups i.e 0 - 5, 5 - 10, and 10 - 15

def preg_cat(x):
    if x >= 0 and x <= 5:
        return '0 - 5'
    elif x >= 6 and x <= 10:
        return '5 - 10'
    else:
        return '10 - 15'

In [123]:
df['preg'] = df['preg'].apply(preg_cat)

In [124]:
df['preg'].value_counts()

0 - 5      481
5 - 10     161
10 - 15     27
Name: preg, dtype: int64

In [125]:
df['glu'].value_counts()

100    15
99     15
106    14
111    14
108    13
       ..
65      1
62      1
61      1
57      1
199     1
Name: glu, Length: 133, dtype: int64

In [126]:
df['glu'].describe()

count    669.000000
mean     119.762332
std       29.622708
min       44.000000
25%       99.000000
50%      114.000000
75%      138.000000
max      199.000000
Name: glu, dtype: float64

In [127]:
df['glu'].unique()

array([148,  85, 183,  89, 137, 116,  78, 110, 168, 139, 166, 118, 107,
       115, 126,  99, 196, 119, 143, 125, 147,  97, 145, 117, 109, 158,
        88,  92, 122, 103, 138, 102,  90, 111, 180, 133, 106, 159, 146,
        71, 101, 176,  73, 187, 100, 105,  44, 141, 114,  95, 129,  79,
        62, 112, 113,  83,  80, 123,  81, 134, 142, 144,  93, 163, 151,
       171,  76, 160, 124, 120, 173, 170,  84,  96, 128, 108, 154,  57,
       136, 156, 188, 131, 104,  75, 179, 130,  87, 194, 162, 184, 140,
        74, 181, 164,  91,  86, 155, 191, 161,  77, 150, 182, 152, 157,
       165, 178,  61, 189,  98, 127,  82, 193,  72, 197,  94, 135, 132,
       195,  68, 186, 198, 121,  67, 174, 167, 199,  56, 169, 149, 175,
        65, 153, 190], dtype=int64)

In [128]:
# Dividing 'glu' into 4 categories based on glucose levels

def glu_cat(x):
    if x >= 40 and x <= 90:
        return 'Low'
    elif x >= 90 and x <= 120:
        return 'Normal'
    elif x >= 120 and x <= 160:
        return 'Medium to high'
    else:
        return 'High'

In [129]:
df['glu'] = df['glu'].apply(glu_cat)

In [130]:
df['glu'].value_counts()

Normal            280
Medium to high    210
Low               101
High               78
Name: glu, dtype: int64

In [131]:
df['bp'].value_counts()

70     51
74     49
72     43
78     42
68     41
64     40
76     37
80     36
60     34
62     31
66     29
82     25
88     23
86     21
84     21
58     21
90     19
56     12
50     11
54     11
52     10
92      8
75      7
65      7
94      6
85      6
48      4
44      4
96      3
106     3
55      2
98      2
100     2
104     2
40      1
61      1
95      1
46      1
102     1
38      1
Name: bp, dtype: int64

In [132]:
df.bp.describe()

count    669.000000
mean      72.125561
std       11.407363
min       38.000000
25%       64.000000
50%       72.000000
75%       80.000000
max      106.000000
Name: bp, dtype: float64

In [133]:
# Dividing 'bp' into 3 categories based on diastolic pressure

def bp_cat(x):
    if x >= 30 and x <= 80:
        return 'Normal'
    elif x >= 80 and x <= 89:
        return 'Medium to high'
    else:
        return 'High'

In [134]:
df['bp'] = df['bp'].apply(bp_cat)

In [135]:
df['bp'].value_counts()

Normal            526
Medium to high     96
High               47
Name: bp, dtype: int64

In [136]:
df['sft'].value_counts()

0     183
32     29
30     25
27     22
28     20
18     19
31     18
23     18
39     17
19     17
40     16
22     16
29     16
25     15
33     15
26     15
17     14
37     14
15     14
36     13
41     12
35     12
20     11
13     10
21      9
24      9
34      8
42      8
12      7
46      7
11      6
43      6
45      5
10      5
38      5
16      5
14      5
47      4
50      3
48      3
54      2
52      2
8       2
49      2
44      2
7       2
60      1
Name: sft, dtype: int64

In [137]:
df['sft'].describe()

count    669.000000
mean      20.735426
std       15.247959
min        0.000000
25%        0.000000
50%       23.000000
75%       32.000000
max       60.000000
Name: sft, dtype: float64

In [138]:
# Dividing 'sft' (solitary fibrous tumour) into 2 categories 

def sft_cat(x):
    if x >= 0 and x <= 30:
        return 'Safe'
    else:
        return 'Not safe'

In [139]:
df['sft'] = df['sft'].apply(sft_cat)

In [140]:
df['sft'].value_counts()

Safe        465
Not safe    204
Name: sft, dtype: int64

In [141]:
df['ins'].value_counts()

0      317
105     11
140      9
130      8
120      8
      ... 
112      1
108      1
99       1
91       1
330      1
Name: ins, Length: 156, dtype: int64

In [142]:
df['ins'].describe()

count    669.000000
mean      69.204783
std       83.890228
min        0.000000
25%        0.000000
50%       42.000000
75%      125.000000
max      330.000000
Name: ins, dtype: float64

In [143]:
# Dividing 'ins' (insulin secretion) into 3 categories based on insulin level in blood

def ins_cat(x):
    if x <= 100:
        return 'Normal'
    elif x > 100 and x <= 125:
        return 'Medium'
    else:
        return 'High'

In [144]:
df['ins'] = df['ins'].apply(ins_cat)

In [145]:
df['ins'].value_counts()

Normal    460
High      164
Medium     45
Name: ins, dtype: int64

In [146]:
df['bmi'].value_counts()

32.0    12
31.6    12
31.2    11
33.3    10
32.8     9
        ..
24.9     1
46.3     1
40.9     1
45.7     1
48.3     1
Name: bmi, Length: 230, dtype: int64

In [147]:
df['bmi'].describe()

count    669.000000
mean      32.010762
std        6.372051
min       18.200000
25%       27.300000
50%       32.000000
75%       36.100000
max       49.600000
Name: bmi, dtype: float64

In [148]:
def bmi_cat(x):
    if x <= 19:
        return 'Underweight'
    elif x > 19 and x <= 25:
        return 'Normal'
    elif x > 25 and x <= 30:
        return 'Overweight'
    else:
        return 'Obese'

In [149]:
df['bmi'] = df['bmi'].apply(bmi_cat)

In [150]:
df['bmi'].value_counts()

Obese          405
Overweight     162
Normal          98
Underweight      4
Name: bmi, dtype: int64

In [151]:
df['age'].value_counts()

22    64
21    55
24    42
25    40
23    33
28    31
27    30
26    28
29    26
31    19
41    19
30    18
42    17
33    16
32    15
45    15
37    15
36    14
38    14
39    12
40    12
43    11
34    11
35    10
46    10
51     8
50     8
58     7
52     7
57     5
44     5
49     5
48     5
47     5
53     4
54     4
55     4
60     4
63     4
66     4
56     3
62     3
59     2
61     2
65     2
64     1
Name: age, dtype: int64

In [152]:
df['age'].describe()

count    669.000000
mean      32.863976
std       11.148603
min       21.000000
25%       24.000000
50%       29.000000
75%       40.000000
max       66.000000
Name: age, dtype: float64

In [153]:
# Dividing 'age' into 4 categories 

def age_cat(x):
    if x <= 15:
        return 'Child'
    elif x > 15 and x <= 24:
        return 'Youth'
    elif x > 24 and x <= 64:
        return 'Adult'
    else:
        return 'Senior'

In [154]:
df['age'] = df['age'].apply(age_cat)

In [155]:
df['age'].value_counts()

Adult     469
Youth     194
Senior      6
Name: age, dtype: int64

In [156]:
df.head()

Unnamed: 0,preg,glu,bp,sft,ins,bmi,age,outcome
0,5 - 10,Medium to high,Normal,Not safe,Normal,Obese,Adult,1
1,0 - 5,Low,Normal,Safe,Normal,Overweight,Adult,0
2,5 - 10,High,Normal,Safe,Normal,Normal,Adult,1
3,0 - 5,Low,Normal,Safe,Normal,Overweight,Youth,0
4,0 - 5,Medium to high,Normal,Not safe,High,Obese,Adult,1


# Encoding the data

In [157]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['preg_new'] = le.fit_transform(df['preg'])
df['glu_new'] = le.fit_transform(df['glu'])
df['bp_new'] = le.fit_transform(df['bp'])
df['sft_new'] = le.fit_transform(df['sft'])
df['ins_new'] = le.fit_transform(df['ins'])
df['bmi_new'] = le.fit_transform(df['bmi'])
df['age_new'] = le.fit_transform(df['age'])

df['outcome'] = df['outcome'].replace({0:'Healthy', 1:'Diabetic'})
df['outcome'] = df.outcome.astype('category')

In [158]:
df.head()

Unnamed: 0,preg,glu,bp,sft,ins,bmi,age,outcome,preg_new,glu_new,bp_new,sft_new,ins_new,bmi_new,age_new
0,5 - 10,Medium to high,Normal,Not safe,Normal,Obese,Adult,Diabetic,2,2,2,0,2,1,0
1,0 - 5,Low,Normal,Safe,Normal,Overweight,Adult,Healthy,0,1,2,1,2,2,0
2,5 - 10,High,Normal,Safe,Normal,Normal,Adult,Diabetic,2,0,2,1,2,0,0
3,0 - 5,Low,Normal,Safe,Normal,Overweight,Youth,Healthy,0,1,2,1,2,2,2
4,0 - 5,Medium to high,Normal,Not safe,High,Obese,Adult,Diabetic,0,2,2,0,0,1,0


# Using the data in the model

In [238]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc

x = df[['preg_new', 'glu_new', 'bp_new', 'sft_new', 'ins_new', 'bmi_new', 'age_new']]
y = df['outcome']

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.30, random_state = 1)

X_train = np.array(x_train)
X_test = np.array(x_test)
Y_train = np.array(y_train)
Y_test = np.array(y_test)

mo = DecisionTreeClassifier(criterion = 'entropy', max_depth = 7, splitter = 'best', max_leaf_nodes = 2)
mo.fit(X_train, Y_train)

pred = mo.predict(X_test)

acc = metrics.accuracy_score(Y_test, pred)
print ('Accuracy: {}%'.format(round(acc * 100), 2))

Accuracy: 76%


In [239]:
# Printing the confusion matrix

from sklearn.metrics import confusion_matrix, classification_report

print ('Confusion matrix:')
print (metrics.confusion_matrix(Y_test, pred))

Confusion matrix:
[[ 46 105]
 [  9 309]]


In [240]:
# Printing the classification report

print ('Classification report:')
print (metrics.classification_report(Y_test, pred))

Classification report:
              precision    recall  f1-score   support

    Diabetic       0.84      0.30      0.45       151
     Healthy       0.75      0.97      0.84       318

    accuracy                           0.76       469
   macro avg       0.79      0.64      0.65       469
weighted avg       0.78      0.76      0.72       469

