# Naive Bayes Classifier

In [79]:
import csv
import random
import math
import numpy as np

In [80]:
def loadCsv(filename):
    """Parse comma-separated numeric data into a NumPy array.

    Args:
        filename: Source handed unchanged to ``np.genfromtxt`` (typically a
            path to a CSV file).

    Returns:
        The array built by ``np.genfromtxt`` with ``','`` as the delimiter.
    """
    parsed = np.genfromtxt(filename, delimiter=',')
    return parsed

In [81]:
def splitDataset(dataset, splitRatio):
    """Randomly partition the rows of a 2-D array into training and test sets.

    Args:
        dataset: Array indexed as ``dataset[rows, :]``.
        splitRatio: Fraction of rows assigned to the training set; the
            training size is ``int(row_count * splitRatio)``.

    Returns:
        A ``(training, test)`` tuple of arrays. Rows are drawn in the order
        given by ``np.random.permutation``, so the result depends on NumPy's
        global random state.
    """
    row_count = dataset.shape[0]
    cutoff = int(row_count * splitRatio)

    # One shuffled ordering of all row indices; the first `cutoff` go to
    # training and the remainder to test.
    shuffled = np.random.permutation(row_count)

    train_rows = dataset[shuffled[:cutoff], :]
    test_rows = dataset[shuffled[cutoff:], :]
    return train_rows, test_rows

In [82]:
def separateByClass(dataset):
    """Group the rows of a 2-D array by the label stored in the last column.

    Only the labels ``1`` and ``0`` are collected, in that order.

    Args:
        dataset: 2-D array whose last column holds the class label.

    Returns:
        A dict with keys ``1`` and ``0``. Each value has shape
        ``(1, n_rows_in_class, n_columns)``: the matching rows wrapped in a
        leading axis of length one.
    """
    labels = dataset[:, -1]
    grouped = {}
    for classValue in (1, 0):
        members = dataset[labels == classValue]
        # Keep the leading singleton axis that callers reduce over axis 1.
        grouped[classValue] = members[np.newaxis, ...]
    return grouped


In [83]:
def summarize(dataset):
    """Compute per-column mean and sample standard deviation, minus the last column.

    Args:
        dataset: Array reduced along axis 1; only element 0 along the leading
            axis is used, i.e. an input shaped ``(1, n_rows, n_columns)``.

    Returns:
        A ``(means, stds)`` tuple of 1-D arrays covering every column except
        the last one. The standard deviation uses ``ddof=1``.
    """
    column_means = np.mean(dataset, axis=1)
    column_stds = np.std(dataset, axis=1, ddof=1)

    # Row 0 holds the statistics; the trailing column is dropped.
    return column_means[0, :-1], column_stds[0, :-1]

In [84]:
def summarizeByClass(dataset):
    """Build the per-class feature statistics for a dataset.

    Args:
        dataset: Array passed to ``separateByClass``.

    Returns:
        A dict mapping each class value produced by ``separateByClass`` to the
        result of ``summarize`` on that class's rows.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

In [85]:
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Computes ``exp(-(x - mean)**2 / (2 * stdev**2)) / (sqrt(2 * pi) * stdev)``.
    All arguments may be scalars or NumPy arrays that broadcast together, in
    which case the density is evaluated element-wise.

    Args:
        x: Value(s) at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value(s), with the broadcast shape of the inputs.
    """
    exponent = np.exp(-(x - mean) ** 2 / (2 * stdev ** 2))
    # The normalizer is sqrt(2*pi) * stdev: stdev must sit outside the square
    # root, otherwise the density is only correct when stdev == 1.
    return exponent / (np.sqrt(2 * np.pi) * stdev)

In [86]:
def calculateClassProbabilities(summaries, inputVector):
    """Score an input row against every class summary.

    Args:
        summaries: Dict mapping a class value to a sequence whose element 0
            holds the feature means and element 1 the feature standard
            deviations.
        inputVector: Row whose last element is excluded from scoring.

    Returns:
        A dict mapping each class value to the product of the per-feature
        values returned by ``calculateProbability``.
    """
    return {
        # Multiply the corresponding per-feature probabilities together.
        label: np.prod(calculateProbability(inputVector[:-1], stats[0], stats[1]))
        for label, stats in summaries.items()
    }

In [87]:
def predict(summaries, inputVector):
    """Pick the class with the highest score for an input row.

    Args:
        summaries: Per-class statistics forwarded to
            ``calculateClassProbabilities``.
        inputVector: Row to classify.

    Returns:
        The class value with the largest score. On a tie the class that comes
        first in iteration order wins. Returns ``None`` when there are no
        classes to choose from.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    # max() keeps the first key unless a later one is strictly greater, and
    # default=None covers an empty score table.
    return max(scores, key=scores.get, default=None)

In [88]:
def getPredictions(summaries, testSet):
    """Classify every row of a test set.

    Args:
        summaries: Per-class statistics forwarded to ``predict``.
        testSet: Sequence of rows to classify.

    Returns:
        A list with one ``predict`` result per row, in row order.
    """
    return [predict(summaries, row) for row in testSet]

In [89]:
def getAccuracy(testSet, predictions):
    """Compute the percentage of rows whose label matches its prediction.

    Args:
        testSet: Sequence of rows; the last element of each row is compared
            with the prediction at the same position.
        predictions: Sequence of predicted labels, indexed by row position.

    Returns:
        The share of matching rows as a float percentage (0.0 to 100.0).

    Raises:
        ZeroDivisionError: If ``testSet`` is empty.
    """
    hits = sum(
        1 for idx, row in enumerate(testSet) if row[-1] == predictions[idx]
    )
    total = float(len(testSet))
    return (hits / total) * 100.0

In [90]:
def main(filename='pima-indians-diabetes.csv', splitRatio=0.67):
    """Run the whole pipeline: load, split, fit, predict and report accuracy.

    Args:
        filename: CSV file to load. Defaults to the file this notebook was
            written against.
        splitRatio: Fraction of rows used for training. Defaults to 0.67.

    Returns:
        None. The split sizes and the accuracy are printed.
    """
    # Load dataset as a NumPy array
    dataset = loadCsv(filename)

    # Random train/test partition
    trainingSet, testSet = splitDataset(dataset, splitRatio)

    # Log row amounts
    print('Split {0} rows into train={1} and test={2} rows'.format(len(dataset), len(trainingSet), len(testSet)))

    # Prepare model: per-class feature statistics from the training rows
    summaries = summarizeByClass(trainingSet)

    # Test model on the held-out rows
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)

    print('Accuracy: {0}%'.format(accuracy))

In [91]:
# Run the pipeline defined in main(): load the CSV, split it, build the
# per-class summaries, and print the split sizes and accuracy.
main()

Split 768 rows into train=514 and test=254 rows


NameError: name 'mean' is not defined