In [5]:
import csv
import random
import math

In [6]:
def loadCsv(filename):
    lines = csv.reader(open(filename,"rb"))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

In [7]:
filename = "../datasets/pima-indians-diabetes.data.csv"
dataset = loadCsv(filename)
print("Loaded data file {0} with {1} rows".format(filename,len(dataset)))

Loaded data file ../datasets/pima-indians-diabetes.data.csv with 768 rows


In [8]:
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset)*splitRatio)
    trainSet = []
    copy = list(dataset)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet,copy]

In [9]:
dataset = [[1],[2],[3],[4],[5]]
splitRatio = 0.67
train, test = splitDataset(dataset, splitRatio)
print("split {0} rows into train with {1} and test with {2}").format(len(dataset), train, test)

split 5 rows into train with [[1], [5], [2]] and test with [[3], [4]]


In [10]:
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

In [11]:
dataset = [[1,20,1],[2,21,0],[3,22,1]]
separated = separateByClass(dataset)
print("Separated instances: {0}").format(separated)

Separated instances: {0: [[2, 21, 0]], 1: [[1, 20, 1], [3, 22, 1]]}


In [12]:
def mean(numbers):
    return sum(numbers) / float(len(numbers))

def standardDeviation(numbers):
    avg = mean(numbers)
    variance = sum(pow(x-avg,2) for x in numbers) / float(len(numbers)-1)
    return math.sqrt(variance)

In [13]:
numbers = [1,2,3,4,5]
print("Summary of {0} mean: {1} | std-dev: {2}").format(numbers, mean(numbers), standardDeviation(numbers))

Summary of [1, 2, 3, 4, 5] mean: 3.0 | std-dev: 1.58113883008


In [14]:
# zip(*dataset) unzips the list into items
def summarize(dataset):
    summaries = [(mean(attribute), standardDeviation(attribute))for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

In [15]:
dataset = [[1,2,0], [2,10,1], [3,22,0]]
summary = summarize(dataset)
print("Attribute Summaries: {0}").format(summary)

Attribute Summaries: [(2.0, 1.0), (11.333333333333334, 10.066445913694334)]


In [16]:
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    for classValue, instances in separated.iteritems():
        print instances
        summaries[classValue] = summarize(instances)
    return summaries

In [17]:
dataset = [[1,20,1], [2,21,0], [3,22,1], [4,22,0]]
summary = summarizeByClass(dataset)
print("Summary by class value: {}").format(summary)

[[2, 21, 0], [4, 22, 0]]
[[1, 20, 1], [3, 22, 1]]
Summary by class value: {0: [(3.0, 1.4142135623730951), (21.5, 0.7071067811865476)], 1: [(2.0, 1.4142135623730951), (21.0, 1.4142135623730951)]}


In [18]:
# 3 MAKE PREDICTIONS
# calculate Gaussian Probability
# calculate class probabilities
# make a prediction
# estimating accuracy

In [19]:
# understand the formula ???????
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1 / (math.sqrt(2*math.pi) * stdev)) * exponent

In [20]:
x = 71.5
mean = 73 
stdev = 6.2
probability = calculateProbability(x, mean, stdev)
print("probability of belonging to class: {0}").format(probability)

probability of belonging to class: 0.0624896575937


In [21]:
#??????????????
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.iteritems():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x,mean,stdev)
    return probabilities

In [22]:
summaries = {0:[(1, 0.5)], 1:[(20, 5.0)]}
inputVector = [1.1, "?"]
probabilities = calculateClassProbabilities(summaries, inputVector)
print("probabilities for each class: {0}").format(probabilities)

probabilities for each class: {0: 0.7820853879509118, 1: 6.298736258150442e-05}


In [23]:
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    
    for classValue, probability in probabilities.iteritems():
        if bestLabel is None or probability > bestProb :
            bestProb = probability
            bestLabel = classValue
    return bestLabel

In [39]:
summaries = {"A":[(1,0.5)], "B":[(20,5.0)]}

inputVector = [3.0, "?"] 
# why the ?
# plus the value is changing at the value = 3.0, why is that??

result = predict(summaries,inputVector)
print("The output class prediction is {0}").format(result)

The output class prediction is A


In [47]:
def get_predictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries,testSet[i])
        predictions.append(result)
        
    return predictions

In [48]:
summaries = {"A":[(1,0.5)], "B":[(20,5.0)]}
inputVector = [[3.2, "?"],[1.1,"?"]] 
result = get_predictions(summaries,inputVector)
print("The output class prediction is {0}").format(result)

The output class prediction is ['B', 'A']


In [55]:
def get_accuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1    
    return str((correct / float(len(testSet))) * 100) + "%"

In [58]:
testSet = [[1,1,1,'b'], [2,2,2,'a'], [3,3,3,'b']]
predictions = ['a', 'a', 'a']
accuracy = get_accuracy(testSet, predictions)
print('Accuracy: {0}').format(accuracy)

Accuracy: 33.3333333333%


In [None]:
# Future improvements
# - log probabilities
# - different density function bernoulli / multinomial / gaussian
# - create implementation for nominal values as well