# NAIVE BAYES CLASSIFIER

In [1]:
import csv
import math
import random


def mean(numbers):
    """Return the arithmetic mean of *numbers*.

    Args:
        numbers: A non-empty sized collection of numeric values
            (list or tuple).

    Returns:
        The sum of the values divided by how many there are, as a float.

    Raises:
        ZeroDivisionError: If *numbers* is empty.
    """
    # Divide by the full count; the n - 1 correction belongs only to the
    # sample variance, not to the mean.
    return sum(numbers) / float(len(numbers))
def load_csv(filename):
    """Load a headerless, all-numeric CSV file into a list of float rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV row; each entry is a list
        of floats in column order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If any field cannot be converted to float.
    """
    # The context manager closes the handle even if a conversion fails;
    # newline="" is what the csv module expects from its input file.
    with open(filename, "r", newline="") as handle:
        # Blank lines come back from csv.reader as empty lists; skip them
        # so later code can safely read row[-1] as the class label.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

def split_dataset(dataset, split_ratio):
    """Randomly partition *dataset* into a training part and a remainder.

    Rows are drawn without replacement from a shallow copy of *dataset*
    until the training part holds at least ``len(dataset) * split_ratio``
    rows; the input list itself is not modified.

    Args:
        dataset: List of rows to split.
        split_ratio: Fraction of the rows that should go to training.

    Returns:
        A two-element list ``[training_rows, remaining_rows]``.
    """
    target_size = len(dataset) * split_ratio
    remaining = list(dataset)
    chosen = []
    # target_size may be fractional, so compare lengths rather than count.
    while len(chosen) < target_size:
        pick = random.randrange(len(remaining))
        chosen.append(remaining.pop(pick))
    return [chosen, remaining]

def diff_by_class(dataset):
    """Group the rows of *dataset* by their last element.

    Args:
        dataset: List of rows; the final item of each row is used as the
            grouping key (the class label).

    Returns:
        A dict mapping each distinct label to the list of rows carrying
        it, with labels and rows in order of first appearance.
    """
    grouped = {}
    for row in dataset:
        grouped.setdefault(row[-1], []).append(row)
    return grouped


def std_dev(numbers):
    """Return the spread of *numbers* around ``mean(numbers)``.

    The result is the square root of the summed squared deviations from
    ``mean(numbers)`` divided by ``len(numbers) - 1``.

    Args:
        numbers: Sized collection of numeric values.

    Returns:
        The computed deviation as a float.

    Raises:
        ZeroDivisionError: If *numbers* holds exactly one value, because
            the ``len(numbers) - 1`` divisor is then zero.
    """
    centre = mean(numbers)
    squared_deviations = [pow(value - centre, 2) for value in numbers]
    divisor = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / divisor)

def summarize(dataset):
    """Compute per-column ``(mean, std_dev)`` pairs for *dataset*.

    Every column is summarised, then the entry for the last column is
    discarded because that column holds the class label rather than a
    feature.

    Args:
        dataset: List of equally long rows.

    Returns:
        A list of ``(mean, std_dev)`` tuples, one per column except the
        last.
    """
    column_stats = []
    for column in zip(*dataset):
        column_stats.append((mean(column), std_dev(column)))
    # Drop the statistics of the trailing label column.
    del column_stats[-1]
    return column_stats

def summarize_by_class(dataset):
    """Build per-class column summaries for *dataset*.

    Args:
        dataset: List of rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the list returned by
        ``summarize`` for the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in diff_by_class(dataset).items()
    }


def calc_probability(x, mean_val, std_dev_val):
    """Evaluate the Gaussian probability density at *x*.

    Args:
        x: Point at which to evaluate the density.
        mean_val: Mean of the distribution.
        std_dev_val: Standard deviation of the distribution.

    Returns:
        The density of a normal distribution with the given mean and
        standard deviation, evaluated at *x*.

    Raises:
        ZeroDivisionError: If *std_dev_val* is zero.
    """
    squared_distance = math.pow(x - mean_val, 2)
    twice_variance = 2 * math.pow(std_dev_val, 2)
    decay = math.exp(-(squared_distance / twice_variance))
    normaliser = math.sqrt(2 * math.pi) * std_dev_val
    return (1 / normaliser) * decay

def calc_class_probability(summaries,input_vector):
    """Score *input_vector* against every class in *summaries*.

    For each class, the densities returned by ``calc_probability`` for
    each summarised attribute are multiplied together, starting from 1.

    Args:
        summaries: Dict mapping a class label to a list of
            ``(mean, std_dev)`` pairs, one per attribute.
        input_vector: Sequence of attribute values, indexed in the same
            order as the pairs in *summaries*.

    Returns:
        A dict mapping each class label to the product of its
        per-attribute densities.
    """
    scores = {}
    for label, attribute_stats in summaries.items():
        likelihood = 1
        for position, (mean_val, std_dev_val) in enumerate(attribute_stats):
            likelihood *= calc_probability(
                input_vector[position], mean_val, std_dev_val
            )
        scores[label] = likelihood
    return scores



In [2]:

def predict(summaries,input_vec):
    """Return the class label with the highest score for *input_vec*.

    Args:
        summaries: Per-class attribute summaries, as consumed by
            ``calc_class_probability``.
        input_vec: Attribute values of the row to classify.

    Returns:
        The label whose score is largest; on ties the label encountered
        first wins. ``None`` when *summaries* has no classes.
    """
    scores = calc_class_probability(summaries, input_vec)
    winner, top_score = None, 1
    for label, score in scores.items():
        # Keep the current winner unless this label strictly beats it.
        if winner is not None and not (top_score < score):
            continue
        winner, top_score = label, score
    return winner

def get_predictions(summaries,test_set):
    """Classify every row of *test_set*.

    Args:
        summaries: Per-class attribute summaries passed on to ``predict``.
        test_set: List of rows to classify.

    Returns:
        A list with the result of ``predict`` for each row, in row order.
    """
    return [predict(summaries, row) for row in test_set]

def get_accuracies(test_set, predictions):
    """Return the percentage of rows whose label was predicted correctly.

    Args:
        test_set: List of rows; the last element of each row is the true
            class label.
        predictions: Predicted labels, indexed in the same order as the
            rows of *test_set*.

    Returns:
        The share of matching labels as a float between 0 and 100.
        An empty *test_set* yields 0.0.

    Raises:
        IndexError: If *predictions* has fewer entries than *test_set*.
    """
    if not test_set:
        # Nothing to score (e.g. the split put every row into training):
        # report 0% rather than dividing by zero.
        return 0.0
    correct = sum(
        1 for i, row in enumerate(test_set) if row[-1] == predictions[i]
    )
    return (correct / float(len(test_set))) * 100

In [3]:
def main(filename="../data/diabetesdata.csv", split_ratio=.99):
    """Train the classifier on a CSV file and print its test accuracy.

    Loads the data, splits it randomly into training and test rows,
    summarises the training rows per class, classifies the test rows and
    prints the split sizes and the resulting accuracy.

    Args:
        filename: Path of the numeric CSV file whose last column is the
            class label. Defaults to the diabetes data set used so far.
        split_ratio: Fraction of rows used for training. The default of
            0.99 leaves only about 1% of the rows for testing, so the
            printed accuracy is based on very few samples.
    """
    dataset = load_csv(filename)
    training_set, test_set = split_dataset(dataset, split_ratio)
    print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset),len(training_set),len(test_set)))
    summaries = summarize_by_class(training_set)
    predictions = get_predictions(summaries, test_set)
    accuracy = get_accuracies(test_set, predictions)
    print("accuracy of the classifier is {0}".format(accuracy))

# Run the demo only when executed directly (script or notebook cell), not
# when this module is imported for its functions.
if __name__ == "__main__":
    main()

Split 768 rows into train=761 and test=7 rows
accuracy of the classifier is 100.0
