In [1]:
#importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
#reading file using pandas
df = pd.read_csv("updated_diabetes.csv")#use the path of your file after downloading from Dataset folder

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66.0,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64.0,20.536458,30.5,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    float64
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(6), int64(3)
memory usage: 54.1 KB


In [5]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.681605,72.254807,26.606479,94.652344,32.450805,0.471876,33.240885,0.348958
std,3.369578,30.436016,12.115932,9.631241,105.547598,6.875374,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,20.536458,30.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,31.25,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
#segregating the independent and dependent variables
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [7]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.000000,30.5,33.6,0.627,50
1,1,85.0,66.0,29.000000,30.5,26.6,0.351,31
2,8,183.0,64.0,20.536458,30.5,23.3,0.672,32
3,1,89.0,66.0,23.000000,94.0,28.1,0.167,21
4,0,137.0,40.0,35.000000,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.000000,180.0,32.9,0.171,63
764,2,122.0,70.0,27.000000,30.5,36.8,0.340,27
765,5,121.0,72.0,23.000000,112.0,26.2,0.245,30
766,1,126.0,60.0,20.536458,30.5,30.1,0.349,47


In [8]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [9]:
#sepeating data into train and test
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((576, 8), (192, 8), (576,), (192,))

I am not scaling the data because, **Decision tree** is not sensitive to the scale of features since it make decisions based on relative feature comparisons.

In [10]:
#calculating cross validation score using KFold with 10 splits for the training data
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
cv = KFold(n_splits=10)
CV_score = cross_val_score(DecisionTreeClassifier(),X_train,y_train,cv=cv)
print("Cross Valdation score for 10 splits:",CV_score)

Cross Valdation score for 10 splits: [0.68965517 0.62068966 0.68965517 0.74137931 0.65517241 0.65517241
 0.56140351 0.59649123 0.70175439 0.63157895]


In [11]:
print("Average Cross Validation score:",np.mean(CV_score))

Average Cross Validation score: 0.6542952208106474


In [12]:
import warnings
warnings.filterwarnings("ignore")

In [13]:
#Hyperparameter tuning using GridSearch CV
parameters = {
    'criterion': ['gini', 'entropy','log_loss'],
    'splitter':['best','random'],
    'max_depth': [None, 1,2,3,4,5,6,7,8,9,10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}
estimator = DecisionTreeClassifier()
grid_clf = GridSearchCV(estimator,param_grid=parameters,cv=5,scoring = "accuracy")
grid_clf.fit(X_train,y_train)

In [14]:
grid_clf.best_estimator_

In [15]:
grid_clf.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'max_features': 'log2',
 'min_samples_leaf': 4,
 'min_samples_split': 5,
 'splitter': 'best'}

In [16]:
grid_clf.best_score_

0.7517541229385307

CV score got increased around 10% after using Grid Search CV

In [17]:
# After using GridSearch CV we got the best parameters to use in Decision Tree Classifier
classifier = DecisionTreeClassifier(criterion = 'entropy',max_depth = 7,max_features = 'log2',min_samples_leaf = 4,
                                    min_samples_split = 5,splitter='best')
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
y_pred

array([1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0], dtype=int64)

In [18]:
y_pred.shape

(192,)

In [19]:
#Checking performance
print("Confusion_matrix :")
print(confusion_matrix(y_test,y_pred))
print("Accuracy :",accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

Confusion_matrix :
[[103  27]
 [ 25  37]]
Accuracy : 0.7291666666666666
              precision    recall  f1-score   support

           0       0.80      0.79      0.80       130
           1       0.58      0.60      0.59        62

    accuracy                           0.73       192
   macro avg       0.69      0.69      0.69       192
weighted avg       0.73      0.73      0.73       192



The accuracy of the test and predicted data is high (73%), surpassing the cross-validation score of the training data obtained through k-fold cross-validation (65%). This suggests that the model generalizes well to new, unseen data. The higher accuracy on the test set indicates the effectiveness of the trained model in making accurate predictions on real-world scenarios, instilling confidence in its predictive capabilities.