In [65]:
# Example of Naive Bayes implemented from Scratch in Python
import csv
import random
import math
import matplotlib.pyplot as plt

def loadCsv(filename):
	lines = csv.reader(open(filename, "rb"))
	dataset = list(lines)
	for i in range(len(dataset)):
		dataset[i] = [float(x) for x in dataset[i]]
	return dataset

def splitDataset(dataset, splitRatio):
	trainSize = int(len(dataset) * splitRatio)
	trainSet = []
	copy = list(dataset)
	while len(trainSet) < trainSize:
		index = random.randrange(len(copy))
		trainSet.append(copy.pop(index))
	return [trainSet, copy]

def separateByClass(dataset):
	separated = {}
	for i in range(len(dataset)):
		vector = dataset[i]
		if (vector[-1] not in separated):
			separated[vector[-1]] = []
		separated[vector[-1]].append(vector)
	return separated

def mean(numbers):
	return sum(numbers)/float(len(numbers))

def stdev(numbers):
	avg = mean(numbers)
	variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
	return math.sqrt(variance)

def summarize(dataset):
	summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
	del summaries[-1]
	return summaries

def summarizeByClass(dataset):
	separated = separateByClass(dataset)
	summaries = {}
	for classValue, instances in separated.iteritems():
		summaries[classValue] = summarize(instances)
	return summaries

def calculateProbability(x, mean, stdev):
	exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
	return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
	probabilities = {}
	for classValue, classSummaries in summaries.iteritems():
		probabilities[classValue] = 1
		for i in range(len(classSummaries)):
			mean, stdev = classSummaries[i]
			x = inputVector[i]
			probabilities[classValue] *= calculateProbability(x, mean, stdev)
	return probabilities
			
def predict(summaries, inputVector):
	probabilities = calculateClassProbabilities(summaries, inputVector)
	bestLabel, bestProb = None, -1
	for classValue, probability in probabilities.iteritems():
		if bestLabel is None or probability > bestProb:
			bestProb = probability
			bestLabel = classValue
	return bestLabel

def getPredictions(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = predict(summaries, testSet[i])
		predictions.append(result)
	return predictions

def getAccuracy(testSet, predictions):
	correct = 0
	for i in range(len(testSet)):
		if testSet[i][-1] == predictions[i]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0

def main():
    filename = 'pima-indians-diabetes-data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%').format(accuracy) 
    
main()


Split 768 rows into train=514 and test=254 rows
Accuracy: 74.0157480315%


In [70]:
# Example of Naive Bayes implemented from Scratch in Python
import csv
import random
import math
import matplotlib.pyplot as plt

def loadCsv(filename):
	lines = csv.reader(open(filename, "rb"))
	dataset = list(lines)
	for i in range(len(dataset)):
		dataset[i] = [float(x) for x in dataset[i]]
	return dataset

def splitDataset(dataset, splitRatio):
	trainSize = int(len(dataset) * splitRatio)
	trainSet = []
	copy = list(dataset)
	while len(trainSet) < trainSize:
		index = random.randrange(len(copy))
		trainSet.append(copy.pop(index))
	return [trainSet, copy]

def separateByClass(dataset):
	separated = {}
	for i in range(len(dataset)):
		vector = dataset[i]
		if (vector[-1] not in separated):
			separated[vector[-1]] = []
		separated[vector[-1]].append(vector)
	return separated

def mean(numbers):
	return sum(numbers)/float(len(numbers))

def stdev(numbers):
	avg = mean(numbers)
	variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
	return math.sqrt(variance)

def summarize(dataset):
	summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
	del summaries[-1]
	return summaries

def summarizeByClass(dataset):
	separated = separateByClass(dataset)
	summaries = {}
	for classValue, instances in separated.iteritems():
		summaries[classValue] = summarize(instances)
	return summaries

def calculateProbability(x, mean, stdev):
	exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
	return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
	probabilities = {}
	for classValue, classSummaries in summaries.iteritems():
		probabilities[classValue] = 1
		for i in range(len(classSummaries)):
			mean, stdev = classSummaries[i]
			x = inputVector[i]
			probabilities[classValue] *= calculateProbability(x, mean, stdev)
	return probabilities
			
def predict(summaries, inputVector):
	probabilities = calculateClassProbabilities(summaries, inputVector)
	bestLabel, bestProb = None, -1
	for classValue, probability in probabilities.iteritems():
		if bestLabel is None or probability > bestProb:
			bestProb = probability
			bestLabel = classValue
	return bestLabel

def getPredictions(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = predict(summaries, testSet[i])
		predictions.append(result)
	return predictions

def getProbabilities(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = calculateClassProbabilities(summaries, testSet[i])
		predictions.append(result)
	return predictions

def getAccuracy(testSet, predictions):
	correct = 0
	for i in range(len(testSet)):
		if testSet[i][-1] == predictions[i]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0

def main():
    filename = 'pima-indians-diabetes-data.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    probabilities = getProbabilities(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%').format(accuracy)
    print probabilities
main()

Split 768 rows into train=514 and test=254 rows
Accuracy: 73.6220472441%
[{0.0: 2.778134757427176e-13, 1.0: 7.565553077138689e-13}, {0.0: 1.079036984224474e-14, 1.0: 7.025272975348757e-14}, {0.0: 2.674090033087424e-12, 1.0: 6.299452050763295e-14}, {0.0: 6.549185366955833e-17, 1.0: 1.0140841758737226e-15}, {0.0: 8.356362769942293e-21, 1.0: 1.2118457318337072e-16}, {0.0: 1.5239931984501297e-14, 1.0: 2.311182103131133e-13}, {0.0: 5.113719745790031e-14, 1.0: 4.1998072759586635e-14}, {0.0: 1.8338588894387962e-13, 1.0: 9.886247867824941e-14}, {0.0: 1.110899881645214e-15, 1.0: 8.191217477543102e-14}, {0.0: 6.131628329374151e-13, 1.0: 2.752460431898608e-13}, {0.0: 5.1012423271643476e-15, 1.0: 1.0687990227072784e-13}, {0.0: 3.924620741797153e-13, 1.0: 5.093429403513013e-13}, {0.0: 2.6540090030204473e-12, 1.0: 7.488025626655367e-14}, {0.0: 9.109219960308435e-16, 1.0: 7.98718782457069e-15}, {0.0: 9.641483026115555e-13, 1.0: 4.1170991124122323e-13}, {0.0: 2.751892682160192e-13, 1.0: 1.392823902514

In [101]:
# Example of Naive Bayes implemented from Scratch in Python
import csv
import random
import math
import matplotlib.pyplot as plt
import numpy as np

def loadCsv(filename):
	lines = csv.reader(open(filename, "rb"))
	dataset = list(lines)
	for i in range(len(dataset)):
		dataset[i] = [float(x) for x in dataset[i]]
	return dataset

def splitDataset(dataset, splitRatio):
	trainSize = int(len(dataset) * splitRatio)
	trainSet = []
	copy = list(dataset)
	while len(trainSet) < trainSize:
		index = random.randrange(len(copy))
		trainSet.append(copy.pop(index))
	return [trainSet, copy]

def separateByClass(dataset):
	separated = {}
	for i in range(len(dataset)):
		vector = dataset[i]
		if (vector[-1] not in separated):
			separated[vector[-1]] = []
		separated[vector[-1]].append(vector)
	return separated

def mean(numbers):
	return sum(numbers)/float(len(numbers))

def stdev(numbers):
	avg = mean(numbers)
	variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
	return math.sqrt(variance)

def summarize(dataset):
	summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
	del summaries[-1]
	return summaries

def summarizeByClass(dataset):
	separated = separateByClass(dataset)
	summaries = {}
	for classValue, instances in separated.iteritems():
		summaries[classValue] = summarize(instances)
	return summaries

def calculateProbability(x, mean, stdev):
	exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
	return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
	probabilities = {}
	for classValue, classSummaries in summaries.iteritems():
		probabilities[classValue] = 1
		for i in range(len(classSummaries)):
			mean, stdev = classSummaries[i]
			x = inputVector[i]
			probabilities[classValue] *= calculateProbability(x, mean, stdev)
	return probabilities
			
    
def calculateClassProbability(summaries, inputVector):
	probabilities = {}
	for classValue, classSummaries in summaries.iteritems():
		probabilities[classValue] = 1
		for i in range(len(classSummaries)):
			mean, stdev = classSummaries[i]
			x = inputVector[i]
			probabilities= calculateProbability(x, mean, stdev)
	return probabilities
			
    
def getProbabilities(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = calculateClassProbability(summaries, testSet[i])
		predictions.append(result)
	return predictions
    
def predict(summaries, inputVector):
	probabilities = calculateClassProbabilities(summaries, inputVector)
	bestLabel, bestProb = None, -1
	for classValue, probability in probabilities.iteritems():
		if bestLabel is None or probability > bestProb:
			bestProb = probability
			bestLabel = classValue
	return bestLabel

def getPredictions(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = predict(summaries, testSet[i])
		predictions.append(result)
	return predictions

def getAccuracy(testSet, predictions):
	correct = 0
	for i in range(len(testSet)):
		if testSet[i][-1] == predictions[i]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0

def main():
    filename = 'data-test.csv'
    splitRatio = 0.67
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet))
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    probabilities = getProbabilities(summaries, testSet)
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%').format(accuracy)
    print min(probabilities)
    print max(probabilities)
    print probabilities
    #plt.hist(probabilities)
    #plt.show()
main()

Split 768 rows into train=514 and test=254 rows
Accuracy: 100.0%
0.000613012124813
0.0335439675464
[0.012607611139157067, 0.03333908663135438, 0.03351211721125657, 0.0032814596978605276, 0.03290180100966713, 0.033543967546386014, 0.02907706863332204, 0.021247859331224053, 0.0046503284939702376, 0.030309433958466362, 0.02078864754511945, 0.033543967546386014, 0.007439757144393006, 0.021247859331224053, 0.03224128560398044, 0.03333908663135438, 0.02619849415367057, 0.02907706863332204, 0.030309433958466362, 0.027306225949859934, 0.02769810917782115, 0.02460533269935501, 0.021247859331224053, 0.03274589440304311, 0.03202760108325996, 0.02907706863332204, 0.02769810917782115, 0.024165208515657902, 0.02460533269935501, 0.02907706863332204, 0.01953656520215092, 0.03202760108325996, 0.02619849415367057, 0.02999436237669548, 0.03110417377394416, 0.02619849415367057, 0.02294609880777888, 0.0018445589095506074, 0.03224128560398044, 0.021247859331224053, 0.030309433958466362, 0.03110417377394416,

In [109]:
filename = 'data-test.csv'
splitRatio = 0.67
dataset = loadCsv(filename)
trainingSet, testSet = splitDataset(dataset, splitRatio)
print('Split {0} rows into train={1} and test={2} rows').format(len(dataset), len(trainingSet), len(testSet))

Split 768 rows into train=514 and test=254 rows


In [113]:
# prepare model
summaries = summarizeByClass(trainingSet)
# test model
initialprobabilities = getProbabilities(summaries, trainingSet)
print('Accuracy: {0}%').format(accuracy)
print min(initialprobabilities)
print max(initialprobabilities)
print len(initialprobabilities)
#plt.hist(probabilities)
#plt.show()

Accuracy: 100.0%
0.000187988456662
0.0336260773981
514


In [127]:
import numpy as np
threshold = np.percentile(initialprobabilities, 15) # return 50th percentile, e.g median.
print threshold
3.0

0.0187997629886


3.0

In [128]:
testprobabilities = getProbabilities(summaries, testSet)
positiveclass = []
for i in range(len(testprobabilities)) :
    if testprobabilities[i] > threshold:
        positiveclass.append(1)
    else:
        positiveclass.append(0)

positivepercentage = float(sum(positiveclass)) / len(positiveclass)
positivepercentage

0.7992125984251969

In [121]:
a = range(10)
# [0,1,2,3,4,5,6,7,8,9]
b = sum(a)
print b
# prints 45

45
