In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
data = pd.read_csv("./diabetes.csv")

In [3]:
data.shape

(768, 9)

In [4]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
data.isnull().values.any()

False

In [6]:
list(data.columns)

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [7]:
data.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [8]:
data.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [9]:
from sklearn.model_selection import train_test_split
features = ['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [10]:
X = data[features].values
y = data['Outcome'].values

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=1) 

In [12]:
for i in list(features):
    print("Missing values in {0}: {1}".format(i, len(data.loc[data[i] ==0])))

Missing values in Pregnancies: 111
Missing values in Glucose: 5
Missing values in BloodPressure: 35
Missing values in SkinThickness: 227
Missing values in Insulin: 374
Missing values in BMI: 11
Missing values in DiabetesPedigreeFunction: 0
Missing values in Age: 0


In [13]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=0, strategy='mean')

X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

In [14]:
# Using logistic regression 

from sklearn.linear_model import LogisticRegression 

lr_model = LogisticRegression(max_iter=10000)
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_test)

In [15]:
#Accuracy of Logistic regression 
from sklearn import metrics

print("Logistic Regression Metrics:")
print("Accuracy = ", metrics.accuracy_score(y_test, lr_y_pred))
print("Precision = ",metrics.precision_score(y_test, lr_y_pred))
print("Recall = ",metrics.recall_score(y_test, lr_y_pred))

lr_cnf_matrix = metrics.confusion_matrix(y_test, lr_y_pred)
print("Confusion matrix : ")
lr_cnf_matrix

Logistic Regression Metrics:
Accuracy =  0.7878787878787878
Precision =  0.7727272727272727
Recall =  0.6
Confusion matrix : 


array([[131,  15],
       [ 34,  51]], dtype=int64)

In [16]:
# Using naive Bayes theorem 

from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_y_pred = nb_model.predict(X_test)


In [17]:
print("Naive Bayes Metrics:")
print("Accuracy = ", metrics.accuracy_score(y_test, nb_y_pred))
print("Precision = ",metrics.precision_score(y_test, nb_y_pred))
print("Recall = ",metrics.recall_score(y_test, nb_y_pred))

nb_cnf_matrix = metrics.confusion_matrix(y_test, nb_y_pred)
print("Confusion matrix : ")
nb_cnf_matrix

Naive Bayes Metrics:
Accuracy =  0.7705627705627706
Precision =  0.7051282051282052
Recall =  0.6470588235294118
Confusion matrix : 


array([[123,  23],
       [ 30,  55]], dtype=int64)