In [2]:
# filename = 'pima-indians-diabetes.csv'
filename = 'test.csv'
import pandas as pd
import numpy as np

df= pd.read_csv(filename)
df = df.astype(float) #converting all columns to float type

train=df.sample(frac=0.8,random_state=105) #random state is a seed value
print(train) #train.index gives the index of all the data-rows included in train
test=df.drop(train.index) #dropping those indices which are in train (keeping the remaining in test)
# print(test) 
# columns ['Pregnancies','Glucose','BP','SkinThickness','Insulin','BMI','DiabetesPedigree','Age','Outcome']

      6    148    72    35      0  33.6  0.627    50    1
6  10.0  115.0   0.0   0.0    0.0  35.3  0.134  29.0  0.0
2   1.0   89.0  66.0  23.0   94.0  28.1  0.167  21.0  0.0
3   0.0  137.0  40.0  35.0  168.0  43.1  2.288  33.0  1.0
7   2.0  197.0  70.0  45.0  543.0  30.5  0.158  53.0  1.0
4   5.0  116.0  74.0   0.0    0.0  25.6  0.201  30.0  0.0
1   8.0  183.0  64.0   0.0    0.0  23.3  0.672  32.0  1.0


In [3]:
# Train model
# group by outcomes, in the train group i.e all ones together & zeroes together
from pprint import pprint
outcome_group = train.groupby(df.columns[-1])
# print(outcome_group.size()) #displays the first row(as tags)
n_attr = len(df.columns) - 1 #-1 to remove the count of 0 & 1 (result)

summaries = {}
#summarize by outcome, find mean and std deviation of each outcome.
for classValue, instances in outcome_group:
    #class Value is 0 or 1, instances are - all instances in each group
    #loop runs only twice, once of 1 and once for 0
    pprint(instances)
    pprint(list(instances.mean(axis=0)))
    attr_mv=[]
    mean=list(instances.mean(axis=0).values)
    stdev=list(instances.std(axis=0).values)

    #for loop because all instances(columns) are together, can't separate them
    for i in range(n_attr):
        attr_mv.append([mean[i],stdev[i]]) #for each attribute value we have an array consisting of std dev & mean
        
    summaries[classValue]=attr_mv


      6    148    72    35     0  33.6  0.627    50    1
6  10.0  115.0   0.0   0.0   0.0  35.3  0.134  29.0  0.0
2   1.0   89.0  66.0  23.0  94.0  28.1  0.167  21.0  0.0
4   5.0  116.0  74.0   0.0   0.0  25.6  0.201  30.0  0.0
[5.333333333333333,
 106.66666666666667,
 46.666666666666664,
 7.666666666666667,
 31.333333333333332,
 29.666666666666668,
 0.16733333333333333,
 26.666666666666668,
 0.0]
     6    148    72    35      0  33.6  0.627    50    1
3  0.0  137.0  40.0  35.0  168.0  43.1  2.288  33.0  1.0
7  2.0  197.0  70.0  45.0  543.0  30.5  0.158  53.0  1.0
1  8.0  183.0  64.0   0.0    0.0  23.3  0.672  32.0  1.0
[3.3333333333333335,
 172.33333333333334,
 58.0,
 26.666666666666668,
 237.0,
 32.3,
 1.0393333333333332,
 39.333333333333336,
 1.0]


In [4]:
import math
def calculateProb(x, mean, stdev):
    exponent = math.exp(-math.pow(x-mean,2)/(2*math.pow(stdev,2)))
    return (1 / (math.sqrt(2*math.pi)*math.pow(stdev,2))) * exponent
  

def calculateClassProb(summaries, X_vec):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        
        # print('class summaries\n', classSummaries)
        # print('length\n', len(classSummaries))

        #calculating probablity of each attribute and multiplying them. 
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = X_vec[i]
            probabilities[classValue] *= calculateProb(x, mean, stdev)
            
    return probabilities
    
def predict(summaries, X_vec):
    prob = calculateClassProb(summaries, X_vec)
    bestLabel, bestProb = None, -1

    # print('Probablitites-----------class\n')
    # pprint(prob)

    #in this case we could have directly compared. But, when there are more than 2 classes(yes/no/maybe?) we need to select the best/largest 
    for classValue, probability in prob.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel


In [5]:
# test model
predictions = []
testSet=test.values.tolist()
# pprint(summaries)
for i in range(len(testSet)):
    result = predict(summaries, testSet[i]) #summaries - mean & std dev of test dataset
    predictions.append(result)
    # print(predictions)


Probablitites-----------class

{0.0: 1.834672604102135e-24, 1.0: 1.1814708198750518e-24}
Probablitites-----------class

{0.0: 1.953179164869056e-19, 1.0: 6.571345077991297e-25}


In [6]:
def getAccuracy(test, predictions):
    correct = 0
    # pprint(test)
    # print(test.iloc[1, -1])
    for i in range(len(test)):
        if test.iloc[i,-1] == predictions[i]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

accuracy = getAccuracy(test, predictions)
print(f'Split {len(df)} rows into train={len(train)} and test={len(test)}')
print(f'Accuracy: {accuracy}')


Split 8 rows into train=6 and test=2
Accuracy: 50.0


<h2>Using sci-kit-learn Gaussian NB</h2>

In [7]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
data_train = train.iloc[:,:-1]
target_train  = train.iloc[:,-1]
gnb.fit(data_train, target_train)

data_test = test.iloc[:,:-1]
y_pred = gnb.predict(data_test)


from sklearn import metrics

#Model Accuracy, how often is the classifier correct?
print(f'Split {len(df)} rows into train={len(data_train)} and test={len(data_test)}')
print("Accuracy:",(metrics.accuracy_score(test.iloc[:,-1], y_pred)*100))


ModuleNotFoundError: No module named 'sklearn'

In [12]:
print(train)
data_train = train.iloc[:,:-1]
print('data\n',data_train)
target_train  = train.iloc[:,-1]
print('target\n',target_train)


      6    148    72    35      0  33.6  0.627    50    1
6  10.0  115.0   0.0   0.0    0.0  35.3  0.134  29.0  0.0
2   1.0   89.0  66.0  23.0   94.0  28.1  0.167  21.0  0.0
3   0.0  137.0  40.0  35.0  168.0  43.1  2.288  33.0  1.0
7   2.0  197.0  70.0  45.0  543.0  30.5  0.158  53.0  1.0
4   5.0  116.0  74.0   0.0    0.0  25.6  0.201  30.0  0.0
1   8.0  183.0  64.0   0.0    0.0  23.3  0.672  32.0  1.0
data
       6    148    72    35      0  33.6  0.627    50
6  10.0  115.0   0.0   0.0    0.0  35.3  0.134  29.0
2   1.0   89.0  66.0  23.0   94.0  28.1  0.167  21.0
3   0.0  137.0  40.0  35.0  168.0  43.1  2.288  33.0
7   2.0  197.0  70.0  45.0  543.0  30.5  0.158  53.0
4   5.0  116.0  74.0   0.0    0.0  25.6  0.201  30.0
1   8.0  183.0  64.0   0.0    0.0  23.3  0.672  32.0
target
 6    0.0
2    0.0
3    1.0
7    1.0
4    0.0
1    1.0
Name: 1, dtype: float64
