In [17]:
filename = 'pima-indians-diabetes.csv'
# filename = 'test.csv'
import pandas as pd
import numpy as np

df= pd.read_csv(filename)
df = df.astype(float) #converting all columns to float type

train=df.sample(frac=0.8,random_state=105) #random state is a seed value
print(train) #train.index gives the index of all the data-rows included in train
test=df.drop(train.index) #dropping those indices which are in train (keeping the remaining in test)
# print(test) 
# columns ['Pregnancies','Glucose','BP','SkinThickness','Insulin','BMI','DiabetesPedigree','Age','Outcome']

       6    148    72    35      0  33.6  0.627    50    1
564  2.0   95.0  54.0  14.0   88.0  26.1  0.748  22.0  0.0
470  0.0  137.0  70.0  38.0    0.0  33.2  0.170  22.0  0.0
719  4.0   83.0  86.0  19.0    0.0  29.3  0.317  34.0  0.0
632  1.0  128.0  82.0  17.0  183.0  27.5  0.115  22.0  0.0
260  3.0  141.0   0.0   0.0    0.0  30.0  0.761  27.0  1.0
..   ...    ...   ...   ...    ...   ...    ...   ...  ...
277  5.0  114.0  74.0   0.0    0.0  24.9  0.744  57.0  0.0
308  2.0  124.0  68.0  28.0  205.0  32.9  0.875  30.0  1.0
148  2.0   90.0  70.0  17.0    0.0  27.3  0.085  22.0  0.0
339  1.0  130.0  70.0  13.0  105.0  25.9  0.472  22.0  0.0
673  8.0   91.0  82.0   0.0    0.0  35.6  0.587  68.0  0.0

[614 rows x 9 columns]


In [18]:
# Train model
# group by outcomes, in the train group i.e all ones together & zeroes together
from pprint import pprint
outcome_group = train.groupby(df.columns[-1])
# print(outcome_group.size()) #displays the first row(as tags)
n_attr = len(df.columns) - 1 #-1 to remove the count of 0 & 1 (result)

summaries = {}
#summarize by outcome, find mean and std deviation of each outcome.
for classValue, instances in outcome_group:
    #class Value is 0 or 1, instances are - all instances in each group
    #loop runs only twice, once of 1 and once for 0
    pprint(instances)
    pprint(list(instances.mean(axis=0)))
    attr_mv=[]
    mean=list(instances.mean(axis=0).values)
    stdev=list(instances.std(axis=0).values)

    #for loop because all instances(columns) are together, can't separate them
    for i in range(n_attr):
        attr_mv.append([mean[i],stdev[i]]) #for each attribute value we have an array consisting of std dev & mean
        
    summaries[classValue]=attr_mv


        6    148    72    35      0  33.6  0.627    50    1
564   2.0   95.0  54.0  14.0   88.0  26.1  0.748  22.0  0.0
470   0.0  137.0  70.0  38.0    0.0  33.2  0.170  22.0  0.0
719   4.0   83.0  86.0  19.0    0.0  29.3  0.317  34.0  0.0
632   1.0  128.0  82.0  17.0  183.0  27.5  0.115  22.0  0.0
633  10.0   92.0  62.0   0.0    0.0  25.9  0.167  31.0  0.0
..    ...    ...   ...   ...    ...   ...    ...   ...  ...
621   6.0  183.0  94.0   0.0    0.0  40.8  1.461  45.0  0.0
277   5.0  114.0  74.0   0.0    0.0  24.9  0.744  57.0  0.0
148   2.0   90.0  70.0  17.0    0.0  27.3  0.085  22.0  0.0
339   1.0  130.0  70.0  13.0  105.0  25.9  0.472  22.0  0.0
673   8.0   91.0  82.0   0.0    0.0  35.6  0.587  68.0  0.0

[404 rows x 9 columns]
[3.2896039603960396,
 110.16831683168317,
 67.97524752475248,
 19.849009900990097,
 69.43069306930693,
 30.350000000000023,
 0.4308465346534652,
 31.094059405940595,
 0.0]
        6    148     72    35      0  33.6  0.627    50    1
260   3.0  141.0    0.0

In [19]:
import math
def calculateProb(x, mean, stdev):
    exponent = math.exp(-math.pow(x-mean,2)/(2*math.pow(stdev,2)))
    return (1 / (math.sqrt(2*math.pi)*math.pow(stdev,2))) * exponent
  

def calculateClassProb(summaries, X_vec):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        
        # print('class summaries\n', classSummaries)
        # print('length\n', len(classSummaries))

        #calculating probablity of each attribute and multiplying them. 
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = X_vec[i]
            probabilities[classValue] *= calculateProb(x, mean, stdev)
            
    return probabilities
    
def predict(summaries, X_vec):
    prob = calculateClassProb(summaries, X_vec)
    bestLabel, bestProb = None, -1

    # print('Probablitites-----------class\n')
    # pprint(prob)                 

    #in this case we could have directly compared. But, when there are more than 2 classes(yes/no/maybe?) we need to select the best/largest 
    for classValue, probability in prob.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel


In [20]:
# test model
predictions = []
testSet=test.values.tolist()
# pprint(summaries)
for i in range(len(testSet)):
    result = predict(summaries, testSet[i]) #summaries - mean & std dev of test dataset
    predictions.append(result)


In [21]:
def getAccuracy(test, predictions):
    correct = 0
    # pprint(test)
    # print(test.iloc[1, -1])
    for i in range(len(test)):
        if test.iloc[i,-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

accuracy = getAccuracy(test, predictions)
print(f'Split {len(df)} rows into train={len(train)} and test={len(test)}')
print(f'Accuracy: {accuracy}')


Split 767 rows into train=614 and test=153
Accuracy: 74.50980392156863


<h2>Using sci-kit-learn Gaussian NB</h2>

In [22]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
data_train = train.iloc[:,:-1]
target_train  = train.iloc[:,-1]
gnb.fit(data_train, target_train)

data_test = test.iloc[:,:-1]
target_test = test.iloc[:, -1]
y_pred = gnb.predict(data_test)


from sklearn import metrics

#Model Accuracy, how often is the classifier correct?
print(f'Split {len(df)} rows into train={len(data_train)} and test={len(data_test)}')
print("Accuracy:",(metrics.accuracy_score(test.iloc[:,-1], y_pred)*100))


Split 767 rows into train=614 and test=153
Accuracy: 74.50980392156863


In [23]:
print(train)
data_train = train.iloc[:,:-1]
print('data\n',data_train)
target_train  = train.iloc[:,-1]
print('target\n',target_train)


       6    148    72    35      0  33.6  0.627    50    1
564  2.0   95.0  54.0  14.0   88.0  26.1  0.748  22.0  0.0
470  0.0  137.0  70.0  38.0    0.0  33.2  0.170  22.0  0.0
719  4.0   83.0  86.0  19.0    0.0  29.3  0.317  34.0  0.0
632  1.0  128.0  82.0  17.0  183.0  27.5  0.115  22.0  0.0
260  3.0  141.0   0.0   0.0    0.0  30.0  0.761  27.0  1.0
..   ...    ...   ...   ...    ...   ...    ...   ...  ...
277  5.0  114.0  74.0   0.0    0.0  24.9  0.744  57.0  0.0
308  2.0  124.0  68.0  28.0  205.0  32.9  0.875  30.0  1.0
148  2.0   90.0  70.0  17.0    0.0  27.3  0.085  22.0  0.0
339  1.0  130.0  70.0  13.0  105.0  25.9  0.472  22.0  0.0
673  8.0   91.0  82.0   0.0    0.0  35.6  0.587  68.0  0.0

[614 rows x 9 columns]
data
        6    148    72    35      0  33.6  0.627    50
564  2.0   95.0  54.0  14.0   88.0  26.1  0.748  22.0
470  0.0  137.0  70.0  38.0    0.0  33.2  0.170  22.0
719  4.0   83.0  86.0  19.0    0.0  29.3  0.317  34.0
632  1.0  128.0  82.0  17.0  183.0  27.5  0.11