In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



In [2]:
df = pd.read_csv("E:\Data Science\Datasets\diabetes.csv")
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
X = df.drop('Outcome', axis = 1)
Y = df.Outcome

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 123)

In [4]:
# Defining function to do a grid search CV of multiple models

models = ['Decision Tree', 'Random Forest', 'KNeighbors Classifier', 'Logistic Regression']  

classifiers = {DecisionTreeClassifier(random_state = 104) : {'criterion' : ('gini', 'entropy'),
                                                             'splitter' : ('best', 'random'),
                                                             #'max_depth' : [2, 4, 6, 8, 10],
                                                             #'min_samples_split' : [2,3,4,5,6],
                                                             #"min_samples_leaf": [1,2,3,4,5],
                                                             #"max_features": [4,5,6,"sqrt"],
                                                             #"max_leaf_nodes": [1,2,3,4,5]}         
                                                            },
              RandomForestClassifier(random_state = 100) : {#"max_depth": [2,3,4,5,6,7,8,9,10,11,12],
                                                             #"min_samples_split" :[2,3,4,5,6],
                                                             "n_estimators" : [100, 300, 500],
                                                             #"min_samples_leaf": [1,2,3,4,5],
                                                             #"max_features": [4,5,6,"sqrt"],
                                                             "criterion": ['gini','entropy']},
               
               KNeighborsClassifier() : {'n_neighbors' : range(15,30),
                                         'p' : [1,2],
                                         'weights': ['uniform','distance']},
               
               #GaussianNB() : {'priors' : [0.1, 0.9]},
               
               LogisticRegression(random_state = 102) : {'C' : [0.001,0.01,0.1,1,10],
                                                            'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                                            'max_iter' : [100, 300, 500],
                                                            'multi_class' : ['auto', 'ovr', 'multinomial'],
                                                            }
              }

In [5]:
for model, classifier, classifier_param in zip(models, classifiers.keys(), classifiers.values()):
    
    # Defining the grid search
    clf = GridSearchCV(estimator = classifier,
                       param_grid = classifier_param,
                       cv = 10, 
                       scoring = "roc_auc", 
                       n_jobs = -1)
    
    #fitting the data
    clf.fit(X_train, y_train)
    
    #printing model and its best score
    print('For %s, Best Score is : %.5f'%(model, clf.best_score_)) 
    
    #printing model and its best score
    print('\nFor %s, Best Parameters are : %s'%(model, clf.best_params_)) 
    
    #Making predictions
    y_pred = clf.predict(X_test)
    
    #printing accuracy score of model
    print('\nFor %s, The accuracy score : %.5f'%(model, accuracy_score(y_pred,y_test)))
    
    #confusion matrix
    print("\nConfusion Matrix :: \n")
    matrix = confusion_matrix(y_pred,y_test)
    print(matrix)
    print('\nClassification Report for %s is \n%s'%(model, classification_report(y_pred,y_test)))
    print('*' * 80)

For Decision Tree, Best Score is : 0.69095

For Decision Tree, Best Parameters are : {'criterion': 'entropy', 'splitter': 'best'}

For Decision Tree, The accuracy score : 0.68831

Confusion Matrix :: 

[[72 24]
 [24 34]]

Classification Report for Decision Tree is 
              precision    recall  f1-score   support

           0       0.75      0.75      0.75        96
           1       0.59      0.59      0.59        58

    accuracy                           0.69       154
   macro avg       0.67      0.67      0.67       154
weighted avg       0.69      0.69      0.69       154

********************************************************************************
For Random Forest, Best Score is : 0.82266

For Random Forest, Best Parameters are : {'criterion': 'entropy', 'n_estimators': 300}

For Random Forest, The accuracy score : 0.79221

Confusion Matrix :: 

[[82 18]
 [14 40]]

Classification Report for Random Forest is 
              precision    recall  f1-score   support

    