In [2]:
filename = 'diabetes.csv'
import pandas as pd
import numpy as np

df = pd.read_csv(filename)
df = df.astype(float)

train = df.sample(frac=0.8,random_state=105)
test = df.drop(train.index)

In [3]:
print('Training Dataset:')
print(train)
print('Test Dataset:')
print(test)

Training Dataset:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
535          4.0    132.0            0.0            0.0      0.0  32.9   
336          0.0    117.0            0.0            0.0      0.0  33.8   
718          1.0    108.0           60.0           46.0    178.0  35.5   
691         13.0    158.0          114.0            0.0      0.0  42.3   
260          3.0    191.0           68.0           15.0    130.0  30.9   
..           ...      ...            ...            ...      ...   ...   
622          6.0    183.0           94.0            0.0      0.0  40.8   
277          0.0    104.0           64.0           23.0    116.0  27.8   
308          0.0    128.0           68.0           19.0    180.0  30.5   
148          5.0    147.0           78.0            0.0      0.0  33.7   
339          7.0    178.0           84.0            0.0      0.0  39.9   

     DiabetesPedigreeFunction   Age  Outcome  
535                     0.302  23.0      1.0  

In [4]:
outcome_group = train.groupby(df.columns[-1])
n_attr = len(df.columns)-1
summaries = {}
for classValue, instances in outcome_group:
    print("============", classValue)
    print("============", instances)
    attr = []
    mean = list(instances.mean(axis=0).values)
    stdev = list(instances.std(axis=0).values)
    for i in range(n_attr):
        attr.append([mean[i],stdev[i]])
    summaries[classValue] = attr

336          0.0    117.0            0.0            0.0      0.0  33.8   
718          1.0    108.0           60.0           46.0    178.0  35.5   
260          3.0    191.0           68.0           15.0    130.0  30.9   
353          1.0     90.0           62.0           12.0     43.0  27.2   
361          5.0    158.0           70.0            0.0      0.0  29.8   
..           ...      ...            ...            ...      ...   ...   
305          2.0    120.0           76.0           37.0    105.0  39.7   
554          1.0     84.0           64.0           23.0    115.0  36.9   
622          6.0    183.0           94.0            0.0      0.0  40.8   
277          0.0    104.0           64.0           23.0    116.0  27.8   
148          5.0    147.0           78.0            0.0      0.0  33.7   

     DiabetesPedigreeFunction   Age  Outcome  
336                     0.932  44.0      0.0  
718                     0.415  24.0      0.0  
260                     0.299  34.0      0.0

In [5]:
import math

def calculateProb(x, mean, stdev):
    exponent = math.exp(-math.pow(x-mean,2)/2*math.pow(stdev,2))
    return (1 / (math.sqrt(2*math.pi)*math.pow(stdev,2))) * exponent

def calculateClassProb(summaries, X_vec):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = X_vec[i]
            probabilities[classValue] *= calculateProb(x, mean, stdev)
    return probabilities

def predict(summaries, X_vec):
    prob = calculateClassProb(summaries, X_vec)
    bestLabel, bestProb = None, -1
    for classValue, probability in prob.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [6]:
predictions = []
testSet = test.values.tolist()
for i in range(len(testSet)):
    result = predict(summaries, testSet[i])
    predictions.append(result)

def getAccuracy(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test.iloc[i,-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

accuracy = getAccuracy(test, predictions)
print(f'Split {len(df)} rows into train={len(train)} and test={len(test)}')
print(f'Accuracy: {accuracy}')

Split 768 rows into train=614 and test=154
Accuracy: 70.77922077922078


In [7]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
data_train = train.iloc[:,:-1]
target_train = train.iloc[:,-1]
gnb.fit(data_train, target_train)

data_test = test.iloc[:,:-1]
y_pred = gnb.predict(data_test)

from sklearn import metrics
print(f'Split {len(df)} rows into train={len(data_train)} and test={len(data_test)}')
print(f'Accuracy: {metrics.accuracy_score(test.iloc[:,-1], y_pred)*100}')

Split 768 rows into train=614 and test=154
Accuracy: 77.92207792207793
