In [8]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

In [9]:
# Read the dataset from a CSV file located in the current working directory.
df = pd.read_csv('diabetes.csv')
# Preview the first rows to check the feature columns and the 'Outcome' label.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
# Features: every column except the label.
x = df.drop(columns=['Outcome'])
# Target: the binary 'Outcome' label column.
y = df['Outcome']

In [11]:
# Hold out a test set. stratify=y keeps the class ratio of the label the same in
# both parts, and random_state=42 makes the split reproducible across runs.
# No test_size is passed, so scikit-learn's default split proportion applies.
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)

In [12]:
# Summary statistics of the training features only.
# NOTE(review): Glucose, BloodPressure, SkinThickness, Insulin and BMI all show a
# minimum of 0 in the output below — presumably 0 encodes a missing measurement;
# confirm, and consider imputing before estimating per-class means and stds.
x_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0
mean,3.831597,120.767361,69.170139,20.723958,77.899306,32.064583,0.4802,33.536458
std,3.312864,31.77138,18.699887,15.877307,107.415003,7.861032,0.333188,11.878752
min,0.0,0.0,0.0,0.0,0.0,0.0,0.084,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.6,0.24575,24.0
50%,3.0,116.5,72.0,23.0,40.0,32.4,0.384,30.0
75%,6.0,141.0,80.0,32.0,129.25,36.525,0.64625,41.0
max,17.0,199.0,122.0,99.0,744.0,67.1,2.329,81.0


In [13]:
# Split the training features by class once, then take the per-feature mean and
# standard deviation of each group. These parameterise the class-conditional
# Gaussians used by predict(). Note: pandas .std() defaults to the sample
# standard deviation (ddof=1).
positives, negatives = x_train[y_train == 1], x_train[y_train == 0]
train_mean_pos, train_std_pos = positives.mean(), positives.std()
train_mean_neg, train_std_neg = negatives.mean(), negatives.std()

In [14]:
from math import sqrt
from math import pi
from math import exp

def cond_probability(x, mean, std):
    """Evaluate the Gaussian density with the given ``mean`` and ``std`` at ``x``.

    Serves as the per-feature class-conditional likelihood p(x | class).

    Parameters
    ----------
    x : number
        Observed feature value.
    mean : number
        Mean of the distribution.
    std : number
        Standard deviation of the distribution; must be non-zero, since it is
        used as a divisor.

    Returns
    -------
    number
        Density value at ``x``. This is a density, not a probability, so it may
        exceed 1 when ``std`` is small.
    """
    # Bell-curve term: exp(-(x - mean)^2 / (2 * std^2)).
    scaled_sq_dist = (x - mean) ** 2 / (2 * std ** 2)
    bell = exp(-scaled_sq_dist)
    # Normalising constant 1 / (sqrt(2*pi) * std) makes the curve integrate to 1.
    norm_const = 1 / (sqrt(2 * pi) * std)
    return norm_const * bell

In [15]:
def _class_score(row, prior, means, stds):
    """Return ``prior`` multiplied by the Gaussian likelihood of each value in ``row``."""
    score = prior
    for position, value in enumerate(row):
        # .iloc selects by position explicitly. Plain ``means[position]`` on a
        # Series indexed by column names relies on an integer-key fallback that
        # pandas has deprecated.
        score = score * cond_probability(value, means.iloc[position], stds.iloc[position])
    return score


def predict(row):
    """Score one sample under the positive and negative class models.

    For each class the score is the class prior (its share of the training
    rows) multiplied by the Gaussian density of every feature value, using the
    per-class means and standard deviations estimated from the training split
    (naive independence assumption between features).

    Parameters
    ----------
    row : sequence of numbers
        Feature values in the same order as the columns of ``x_train``.

    Returns
    -------
    list
        ``[prob_pos, prob_neg]`` — unnormalised joint scores for class 1 and
        class 0. They are not normalised, so they do not sum to 1; only their
        relative size is meaningful.

    Notes
    -----
    The scores are products of many small densities and shrink quickly as the
    number of features grows. Summing log-densities would be more robust
    against underflow, but would change the values returned to callers.
    """
    n_train = len(x_train)
    # Count class members directly instead of building a filtered DataFrame on
    # every call; the resulting priors are identical.
    prior_pos = int((y_train == 1).sum()) / n_train
    prior_neg = int((y_train == 0).sum()) / n_train

    return [
        _class_score(row, prior_pos, train_mean_pos, train_std_pos),
        _class_score(row, prior_neg, train_mean_neg, train_std_neg),
    ]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
683,4,125,80,0,0,32.3,0.536,27
586,8,143,66,0,0,34.9,0.129,41
43,9,171,110,24,240,45.4,0.721,54
26,7,147,76,0,0,39.4,0.257,43
11,10,168,74,0,0,38.0,0.537,34
...,...,...,...,...,...,...,...,...
218,5,85,74,22,0,29.0,1.224,32
207,5,162,104,0,0,37.7,0.151,52
308,0,128,68,19,180,30.5,1.391,25
676,9,156,86,0,0,24.8,0.230,53


In [30]:
# Score every held-out sample; each entry is the [prob_pos, prob_neg] pair from predict().
predictions_raw = [predict(sample) for sample in x_test.values.tolist()]

In [31]:
# Inspect the [prob_pos, prob_neg] pair for the first test row. These are
# unnormalised products of a prior and per-feature densities, so they are tiny
# and do not sum to 1.
predictions_raw[0]

[1.6299028206157718e-14, 1.0044068228290291e-14]

In [32]:
# Choose the class with the larger score; ties fall to the negative class (0).
predictions = [1 if scores[0] > scores[1] else 0 for scores in predictions_raw]

In [34]:
# Fraction of test rows where the hand-written classifier's label matches the true label.
accuracy_score(y_test.tolist(),predictions)

0.71354166666666663

In [36]:
# Error breakdown for the hand-written classifier (true labels first, predictions second).
confusion_matrix(y_test.tolist(),predictions)

array([[96, 29],
       [26, 41]])

In [37]:
# Fit scikit-learn's Gaussian Naive Bayes on the same training split, as a
# reference implementation to compare the hand-written classifier against.
model = GaussianNB()
model.fit(x_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [39]:
# Confusion matrix of the scikit-learn model on the same test set, for
# comparison with the hand-written classifier's matrix.
confusion_matrix(y_test,model.predict(x_test))

array([[96, 29],
       [26, 41]])