In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import hashlib
import math
import timeit

In [2]:
"""
Read csv file into DataFrame
Parameters: Path of csv file
Returns: Pandas DataFrame 
"""  
def load_data(csv_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters:
        csv_path: location of the CSV file; handed unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(csv_path)
    return frame

In [3]:
# Load the iris CSV (path is relative to the notebook's working directory)
# into a DataFrame via the load_data helper.
iris_data = load_data("dataset/iris.csv")

# Preview the first rows to sanity-check the columns that were read.
iris_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Turn the existing row index into a regular column named "index";
# that column is used below as the stable identifier for the hash-based split.
# NOTE(review): this reassigns iris_data in place, so re-running the cell
# applies reset_index() again to the already-modified frame.
iris_data = iris_data.reset_index()

# Preview the frame to confirm the new "index" column is present.
iris_data.head()

Unnamed: 0,index,sepal_length,sepal_width,petal_length,petal_width,species
0,0,5.1,3.5,1.4,0.2,setosa
1,1,4.9,3.0,1.4,0.2,setosa
2,2,4.7,3.2,1.3,0.2,setosa
3,3,4.6,3.1,1.5,0.2,setosa
4,4,5.0,3.6,1.4,0.2,setosa


In [5]:
#method to split data into training and test set
def test_set_check(identifier,test_ratio,hash):
    return bytearray(hash(np.int64(identifier)).digest())[-1]  < 51

def split_train_test_by_id(data,test_ratio,id_column,hash):
    """Build a per-row test-set membership mask from an id column.

    Parameters:
        data: DataFrame containing the identifier column.
        test_ratio: forwarded to ``test_set_check`` for every row.
        id_column: name of the column in ``data`` holding the row ids.
        hash: hash constructor forwarded to ``test_set_check``.

    Returns:
        The result of applying ``test_set_check`` to each id, i.e. a Series
        aligned with ``data``. Note that this is the mask, not the split
        frames themselves; callers index ``data`` with it.
    """
    def _belongs_to_test(row_id):
        return test_set_check(row_id, test_ratio, hash)

    return data[id_column].apply(_belongs_to_test)

In [6]:
# Compute a boolean mask over iris_data marking the rows assigned to the test set.
# NOTE(review): verify that 0.66 is the intended test-set fraction and that
# test_set_check honours it.
test_set_index = split_train_test_by_id(iris_data, 0.66, "index", hashlib.md5)

# Select each side of the split and drop the helper 'index' column, which was
# only needed as the hashing key and must not be used as a feature.
test_set = iris_data[test_set_index].drop('index', axis=1)
train_set = iris_data[~test_set_index].drop('index', axis=1)

In [7]:
# Convert train_set into a NumPy array (the species label stays in the last
# column, which the training helpers rely on).
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# that works on both old and new pandas versions.
train_set_matrix = train_set.values

# Keep a labelled copy of the test set; it is used later to check accuracy.
test_set_copy = test_set.copy()

# Remove the species column (i.e. the label to be predicted) from the test set
# so the classifier only sees the features.
del test_set['species']

# Convert the feature-only test set into a NumPy array.
test_set_matrix = test_set.values

In [8]:
#Methods for training Naive Bayes Classifier
def separateByClass(dataset):
    """Group the rows of a dataset by their class label.

    Parameters:
        dataset: sequence of rows; the last element of each row is the label.

    Returns:
        dict mapping each label to the list of full rows (label included)
        carrying that label, in the order they were encountered.
    """
    grouped = {}
    for row in dataset:
        label = row[-1]
        grouped.setdefault(label, []).append(row)
    return grouped

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises ZeroDivisionError when ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count
 
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    Uses the (n - 1) denominator, so a single-element input raises
    ZeroDivisionError.
    """
    center = mean(numbers)
    squared_deviations = [pow(value - center, 2) for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)

def summarize(dataset):
    """Compute (mean, stdev) for every feature column of ``dataset``.

    Parameters:
        dataset: sequence of rows whose last element is the class label.

    Returns:
        list of ``(mean, stdev)`` tuples, one per feature column, in column
        order. The label column is excluded.
    """
    # Transpose rows into columns, then discard the trailing label column.
    columns = list(zip(*dataset))
    columns.pop()
    return [(mean(column), stdev(column)) for column in columns]

def summarizeByClass(dataset):
    """Compute per-class feature statistics for a labelled dataset.

    Parameters:
        dataset: sequence of rows whose last element is the class label.

    Returns:
        dict mapping each class label to the list of ``(mean, stdev)``
        tuples returned by ``summarize`` for that class's rows.
    """
    return {
        label: summarize(rows)
        for label, rows in separateByClass(dataset).items()
    }

In [9]:
# Store the current time in start_time so the training duration can be measured.
start_time = timeit.default_timer()

# Train the Naive Bayes classifier: "training" here means computing the
# per-class (mean, stdev) of every feature.
summary = summarizeByClass(train_set_matrix)
print(summary)

# Compute the elapsed training time (note: it also includes the print above).
end_time = timeit.default_timer()
training_time = end_time - start_time

print("\nTraining Time: ", training_time, "seconds")

{'setosa': [(5.002380952380952, 0.361222919645362), (3.4119047619047613, 0.39706384628363195), (1.452380952380952, 0.16854918257117807), (0.24047619047619045, 0.10605917310524214)], 'versicolor': [(5.899999999999998, 0.51720402163943), (2.7731707317073173, 0.29497829601845493), (4.246341463414634, 0.4909671888098073), (1.3195121951219513, 0.19133676096756994)], 'virginica': [(6.595121951219512, 0.5970557844591222), (3.012195121951219, 0.3163819858582498), (5.517073170731706, 0.4949254686437092), (2.017073170731707, 0.28098259645785023)]}

Training Time:  0.002717418001338956 seconds


In [10]:
#Methods to find predicted labels
def calculateProbability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Parameters:
        x: value at which to evaluate the density.
        mean: mean of the normal distribution.
        stdev: standard deviation of the normal distribution; a value of 0
            raises ZeroDivisionError.

    Returns:
        The density value as a float.
    """
    squared_distance = math.pow(x-mean,2)
    spread = 2*math.pow(stdev,2)
    exponent = math.exp(-(squared_distance/spread))
    normaliser = 1 / (math.sqrt(2*math.pi) * stdev)
    return normaliser * exponent

def calculateClassProbabilities(summaries, inputVector):
    """Score an input vector against every class.

    For each class the Gaussian densities of the individual features are
    multiplied together (the naive independence assumption). No class prior
    is applied.

    Parameters:
        summaries: dict mapping class label to a list of ``(mean, stdev)``
            tuples, one per feature.
        inputVector: indexable feature values, positionally aligned with the
            per-class summaries.

    Returns:
        dict mapping each class label to the product of its feature densities.
    """
    probabilities = {}
    for label, feature_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(feature_stats):
            feature_value = inputVector[position]
            likelihood *= calculateProbability(feature_value, mu, sigma)
        probabilities[label] = likelihood
    return probabilities

def predict(summaries, inputVector):
    """Return the class label with the highest score for ``inputVector``.

    Parameters:
        summaries: per-class feature statistics, as consumed by
            ``calculateClassProbabilities``.
        inputVector: feature values of the sample to classify.

    Returns:
        The best-scoring class label, or None when ``summaries`` is empty.
        On ties the first class encountered wins.
    """
    scores = calculateClassProbabilities(summaries, inputVector)
    winner, winning_score = None, -1
    for label, score in scores.items():
        # Skip candidates that do not strictly beat the current best,
        # unless nothing has been selected yet.
        if winner is not None and not (score > winning_score):
            continue
        winner, winning_score = label, score
    return winner

def getPredictions(summaries, testSet):
    """Predict a class label for every row of ``testSet``.

    Parameters:
        summaries: per-class feature statistics used by ``predict``.
        testSet: indexable collection of feature vectors.

    Returns:
        list of predicted labels, in the same order as the rows of
        ``testSet``.
    """
    return [predict(summaries, testSet[row]) for row in range(len(testSet))]

In [11]:
# Predict a label for every row of the feature-only test matrix, using the
# per-class statistics computed during training.
predictions = getPredictions(summary, test_set_matrix)

In [12]:
"""
calculate accuracy of the model
Parameters:
testSet - test_set matrix with output labels
predictions - label predicted by model
Returns: accuracy %
"""
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label was predicted correctly.

    Parameters:
        testSet: indexable collection of rows whose last element is the
            true label.
        predictions: predicted labels, positionally aligned with ``testSet``.

    Returns:
        Accuracy as a float percentage in the range 0.0 - 100.0.
        Raises ZeroDivisionError when ``testSet`` is empty.
    """
    total = len(testSet)
    hits = sum(
        1 for row in range(total) if testSet[row][-1] == predictions[row]
    )
    return (hits/float(total)) * 100.0

In [13]:
# Convert the labelled copy of the test set into a NumPy array.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# that works on both old and new pandas versions.
test_set_mat = test_set_copy.values

# Compare the true labels (last column) with the predictions and report the
# percentage that match.
accuracy = getAccuracy(test_set_mat, predictions)
print("Accuracy: ", accuracy)

Accuracy:  88.46153846153845
