In [2]:
filename = 'pima-indians-diabetes.csv'
import pandas as pd
import numpy as np

df= pd.read_csv(filename)
df = df.astype(float) #converting all columns to float type

train=df.sample(frac=0.8,random_state=105) #random state is a seed value
# print(train.index) #train.index gives the index of all the data-rows included in train
test=df.drop(train.index) #dropping those indices which are in train (keeping the remaining in test)
# print(test) 
# columns ['Pregnancies','Glucose','BP','SkinThickness','Insulin','BMI','DiabetesPedigree','Age','Outcome']

In [4]:
# Train model
# group by outcomes, in the train group i.e all ones together & zeroes together
outcome_group = train.groupby(df.columns[-1])
print(outcome_group.first()) #displays the first row(as tags)
n_attr = len(df.columns) -1
summaries = {}
#summarize by outcome, find mean and std deviation of each outcome.
for classValue, instances in outcome_group:
    attr_mv=[]
    mean=list(instances.mean(axis=0).values)
    stdev=list(instances.std(axis=0).values)
    for i in range(n_attr):
        attr_mv.append([mean[i],stdev[i]])
        
    summaries[classValue]=attr_mv


       6    148    72    35     0  33.6  0.627    50
1                                                   
0.0  2.0   95.0  54.0  14.0  88.0  26.1  0.748  22.0
1.0  3.0  141.0   0.0   0.0   0.0  30.0  0.761  27.0


In [None]:
import math
def calculateProb(x, mean, stdev):
    exponent = math.exp(-math.pow(x-mean,2)/(2*math.pow(stdev,2)))
    return (1 / (math.sqrt(2*math.pi)*math.pow(stdev,2))) * exponent
  

def calculateClassProb(summaries, X_vec):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = X_vec[i]
            probabilities[classValue] *= calculateProb(x, mean, stdev)
            
    return probabilities
    
def predict(summaries, X_vec):
    prob = calculateClassProb(summaries, X_vec)
    bestLabel, bestProb = None, -1
    for classValue, probability in prob.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel


In [None]:
# test model
predictions = []
testSet=test.values.tolist()
for i in range(len(testSet)):
    result = predict(summaries, testSet[i])
    predictions.append(result)

In [None]:
def getAccuracy(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test.iloc[i,-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

accuracy = getAccuracy(test, predictions)
print(f'Split {len(df)} rows into train={len(train)} and test={len(test)}')
print(f'Accuracy: {accuracy}')


Split 767 rows into train=614 and test=153
Accuracy: 74.50980392156863


<h2>Using sci-kit-learn Gaussian NB</h2>

In [None]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
data_train = train.iloc[:,:-1]
target_train  = train.iloc[:,-1]
gnb.fit(data_train, target_train)

data_test = test.iloc[:,:-1]
y_pred = gnb.predict(data_test)


from sklearn import metrics

#Model Accuracy, how often is the classifier correct?
print(f'Split {len(df)} rows into train={len(data_train)} and test={len(data_test)}')
print("Accuracy:",(metrics.accuracy_score(test.iloc[:,-1], y_pred)*100))


Split 767 rows into train=614 and test=153
Accuracy: 74.50980392156863
