In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

In [4]:
# Load the diabetes dataset from a CSV file in the working directory.
data = pd.read_csv('diabetes.csv')

In [5]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Columns in which a value of 0 is replaced by the column mean.
# NOTE(review): the mean is computed over the column as loaded, so the zeros
# being replaced are themselves included in it — confirm this is intended.
l = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for i in l:
    column_mean = data[i].mean()
    data[i] = data[i].replace(0, column_mean)

In [7]:
# Show the DataFrame after the zero-replacement step to inspect the imputed values.
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.000000,79.799479,33.6,0.627,50,1
1,1,85.0,66.0,29.000000,79.799479,26.6,0.351,31,0
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1,89.0,66.0,23.000000,94.000000,28.1,0.167,21,0
4,0,137.0,40.0,35.000000,168.000000,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.000000,180.000000,32.9,0.171,63,0
764,2,122.0,70.0,27.000000,79.799479,36.8,0.340,27,0
765,5,121.0,72.0,23.000000,112.000000,26.2,0.245,30,0
766,1,126.0,60.0,20.536458,79.799479,30.1,0.349,47,1


In [8]:
# Features: the first eight columns by position; target: the 'Outcome' column.
# NOTE(review): the positional slice assumes 'Outcome' is not among the first
# eight columns — confirm against the CSV's column order.
x = data.iloc[:, 0:8]
y = data.loc[:, 'Outcome']

## Decision Tree classifier

It is a classification problem where we need to classify if the person is diabetic or not. We will be using the decision tree classifier for predicting the same.

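A decision tree classifier splits the training data so as to minimise an impurity measure, either Gini impurity or entropy.
In sklearn, decision trees are built with an optimised version of the CART algorithm.
At each node the algorithm searches over the features and candidate thresholds and selects the split that gives the largest reduction in impurity, i.e. the highest gain (information gain when entropy is used). Splitting continues until the leaves are pure (zero entropy/Gini) or a stopping criterion is reached.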

In [23]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=0
)

## Gini as a Criterion

In [24]:
# Decision tree that uses Gini impurity as the split criterion; apart from the
# fixed seed, every other hyperparameter is left at its default.
model = DecisionTreeClassifier(criterion='gini', random_state=0)

In [25]:
# Fit the Gini tree on the training split.
model.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [26]:
# Hard class predictions for the held-out test rows.
y_pred = model.predict(x_test)

In [27]:
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,classification_report
from sklearn.model_selection import cross_val_score

In [28]:
# ROC AUC on the test split for the Gini tree with default hyperparameters,
# i.e. with no regularisation applied.
# The score is computed from the predicted probability of the positive class
# (column 1 of predict_proba) rather than from hard 0/1 labels: roc_auc_score
# ranks samples by score, and thresholded labels collapse the ROC curve to a
# single operating point.
# NOTE(review): for a fully grown tree the leaf probabilities are typically 0
# or 1, so the value is expected to be close to the one obtained from hard
# labels — re-run to refresh the recorded output.
roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])

0.724823549664314

In [29]:
# Confusion matrix of the true test labels against the Gini tree's predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[130,  27],
       [ 28,  46]], dtype=int64)

In [19]:
# Per-class precision, recall and F1 for the test-split predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.83       157
           1       0.63      0.62      0.63        74

    accuracy                           0.76       231
   macro avg       0.73      0.72      0.73       231
weighted avg       0.76      0.76      0.76       231



## Entropy as a Criterion

In [30]:
# Decision tree that uses entropy as the split criterion; apart from the
# criterion and the fixed seed, the hyperparameters stay at their defaults.
model = DecisionTreeClassifier(criterion='entropy', random_state=0)

In [31]:
# Fit the entropy tree on the training split.
model.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [32]:
# Hard class predictions of the entropy tree for the held-out test rows.
y_pred = model.predict(x_test)

In [33]:
# ROC AUC on the test split for the entropy tree.
# Computed from the predicted probability of the positive class (column 1 of
# predict_proba) instead of hard 0/1 labels, because roc_auc_score ranks
# samples by score and thresholded labels reduce the curve to a single point.
# NOTE(review): re-run to refresh the recorded output.
roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])

0.674728868996385

In [34]:
# Confusion matrix of the true test labels against the entropy tree's predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[127,  30],
       [ 34,  40]], dtype=int64)

### Cross validation

In [36]:
# A single train/test split gives a score that changes with the split's random
# state, so the model is also evaluated with cross-validation.
# cv=5 divides the data into 5 folds; scoring='roc_auc' makes every entry of
# `score` the ROC AUC measured on one held-out fold (not an accuracy).
score = cross_val_score(model, x, y, cv=5, scoring='roc_auc')

# One ROC AUC value per fold.
print(score)


[0.68351852 0.69685185 0.60722222 0.70688679 0.69188679]


In [37]:
# Mean ROC AUC across the 5 cross-validation folds.
print(np.mean(score))

0.6772732354996506


# Estimating Class Probabilities

In [99]:
# Class-membership probabilities for every row of x, one column per class.
# NOTE(review): x contains the rows the model was trained on as well as the
# test rows, so these probabilities are not a held-out estimate.
model.predict_proba(x)

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])