In [60]:
# Example of Naive Bayes implemented from Scratch in Python
import csv
import random
import math
import matplotlib.pyplot as plt
import numpy as np

def loadCsv(filename):
	lines = csv.reader(open(filename, "rb"))
	dataset = list(lines)
	for i in range(len(dataset)):
		dataset[i] = [float(x) for x in dataset[i]]
	return dataset

def splitDataset(dataset, splitRatio):
	trainSize = int(len(dataset) * splitRatio)
	trainSet = []
	copy = list(dataset)
	while len(trainSet) < trainSize:
		index = random.randrange(len(copy))
		trainSet.append(copy.pop(index))
	return [trainSet, copy]

def separateByClass(dataset):
	separated = {}
	for i in range(len(dataset)):
		vector = dataset[i]
		if (vector[-1] not in separated):
			separated[vector[-1]] = []
		separated[vector[-1]].append(vector)
	return separated

def mean(numbers):
	return sum(numbers)/float(len(numbers))

def stdev(numbers):
	avg = mean(numbers)
	variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
	return math.sqrt(variance)

def summarize(dataset):
	summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
	del summaries[-1]
	return summaries

def summarizeByClass(dataset):
	separated = separateByClass(dataset)
	summaries = {}
	for classValue, instances in separated.iteritems():
		summaries[classValue] = summarize(instances)
	return summaries

def calculateProbability(x, mean, stdev):
	exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
	return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
	probabilities = {}
	for classValue, classSummaries in summaries.iteritems():
		probabilities[classValue] = 1
		for i in range(len(classSummaries)):
			mean, stdev = classSummaries[i]
			x = inputVector[i]
			probabilities[classValue] *= calculateProbability(x, mean, stdev)
	return probabilities
			
    
def calculateClassProbability(summaries, inputVector):
	probabilities = {}
	for classValue, classSummaries in summaries.iteritems():
		probabilities[classValue] = 1
		for i in range(len(classSummaries)):
			mean, stdev = classSummaries[i]
			x = inputVector[i]
			probabilities= calculateProbability(x, mean, stdev)
	return probabilities
			
    
def getProbabilities(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = calculateClassProbability(summaries, testSet[i])
		predictions.append(result)
	return predictions
    
def predict(summaries, inputVector):
	probabilities = calculateClassProbabilities(summaries, inputVector)
	bestLabel, bestProb = None, -1
	for classValue, probability in probabilities.iteritems():
		if bestLabel is None or probability > bestProb:
			bestProb = probability
			bestLabel = classValue
	return bestLabel

def getPredictions(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = predict(summaries, testSet[i])
		predictions.append(result)
	return predictions

def getAccuracy(testSet, predictions):
	correct = 0
	for i in range(len(testSet)):
		if testSet[i][-1] == predictions[i]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0

def main():
    filename = 'data-test.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    probabilities = getProbabilities(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%').format(accuracy)
    print min(probabilities)
    print max(probabilities)
    #plt.hist(probabilities)
    #plt.show()
main()

Split 768 rows into train=514 and test=254 rows
Accuracy: 100.0%
1.05374764676e-05
0.0335675605148


In [61]:
filename = 'data-test.csv'
splitRatio = 0.67
dataset = loadCsv(filename)
trainingSet, testSet = splitDataset(dataset, splitRatio)
print('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet))

Split 768 rows into train=514 and test=254 rows


In [62]:
# prepare model
summaries = summarizeByClass(trainingSet)
# test model
initialprobabilities = getProbabilities(summaries, trainingSet)
print min(initialprobabilities)
print max(initialprobabilities)
print len(initialprobabilities)
#plt.hist(probabilities)
#plt.show()

0.000145846863945
0.0342252425253
514


In [63]:
import numpy as np
threshold = np.percentile(initialprobabilities, 15) # return 50th percentile, e.g median.
threshold = min(initialprobabilities)
print threshold

0.000145846863945


In [64]:
testprobabilities = getProbabilities(summaries, testSet)
positiveclass = []
for i in range(len(testprobabilities)) :
    if testprobabilities[i] > threshold:
        positiveclass.append(1)
    else:
        positiveclass.append(0)

positivepercentage = float(sum(positiveclass)) / len(positiveclass)
positivepercentage

0.9960629921259843

In [None]:
##################################FROM HERE IS THE REAL DEAL############################

In [65]:
filenorth = 'SC_import_north.csv'
trainingSet = loadCsv(filenorth)

# prepare model
summaries = summarizeByClass(trainingSet)

summaries

{1.0: [(0.44523714863829716, 0.15210032925551442),
  (0.28338794945271867, 0.06116894251779305),
  (0.04748898241843975, 0.04593169825569141),
  (0.10724664711702125, 0.05682590830973934),
  (0.5304386308877065, 0.045187730805163556),
  (2.5865315516193825, 0.3755209120378424),
  (0.039358275943262466, 0.04969265455887662),
  (0.40413534640543675, 0.07760065011852355)]}

In [66]:
# test model
initialprobabilities = getProbabilities(summaries, trainingSet)
print min(initialprobabilities)
print max(initialprobabilities)
print len(initialprobabilities)
#plt.hist(probabilities)
#plt.show()

0.000249737392038
5.14095955404
846


In [67]:
import numpy as np
threshold = np.percentile(initialprobabilities, 15) # return 50th percentile, e.g median.
threshold = min(initialprobabilities)
print threshold

0.000249737392038


In [68]:
filesouth = 'SC_import_south.csv'
testSet = loadCsv(filesouth)

testprobabilities = getProbabilities(summaries, testSet)
positiveclass = []
for i in range(len(testprobabilities)) :
    if testprobabilities[i] > threshold:
        positiveclass.append(1)
    else:
        positiveclass.append(0)

positivepercentage = float(sum(positiveclass)) / len(positiveclass)
positivepercentage

1.0