In [47]:
import csv
import random
import math
def loadcsv(filename):
    """Read a purely numeric CSV file into a list of rows of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV row; each entry is a list
        holding the row's fields converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If any field cannot be converted to ``float`` (for
            example a header row or a non-numeric cell).
    """
    # The context manager closes the file even if a conversion fails;
    # newline="" is what the csv module asks for on files given to csv.reader.
    with open(filename, "r", newline="") as handle:
        # csv.reader yields [] for blank lines; skip those so that callers
        # indexing row[-1] do not hit an IndexError on a trailing empty line.
        return [[float(field) for field in row]
                for row in csv.reader(handle) if row]

def splitdataset(dataset,splitratio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        splitratio: Fraction of rows to move into the training part; the
            training size is ``int(len(dataset) * splitratio)``.

    Returns:
        A two-element list ``[trainset, rest]`` where ``trainset`` holds the
        randomly drawn rows (in draw order) and ``rest`` holds the rows that
        were not drawn, in their original relative order.
    """
    wanted = int(len(dataset) * splitratio)
    remaining = list(dataset)
    drawn = []
    # One random draw (without replacement) per training row.
    for _ in range(wanted):
        pick = random.randrange(len(remaining))
        drawn.append(remaining.pop(pick))
    return [drawn, remaining]

In [48]:
def seperatebyclass(dataset):
    """Group the rows of ``dataset`` by their last element.

    Args:
        dataset: Sequence of rows; the last element of each row is used as
            the grouping key.

    Returns:
        A dict mapping each distinct last element to the list of rows that
        end with it, in the order the rows appear in ``dataset``.
    """
    grouped = {}
    for row in dataset:
        # setdefault creates the bucket the first time a key is seen.
        grouped.setdefault(row[-1], []).append(row)
    return grouped

In [49]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before taking the square root.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements.
    """
    count = len(numbers)
    # Same expression as mean(): sum divided by the float element count.
    centre = sum(numbers) / float(count)
    squared_spread = sum((value - centre) ** 2 for value in numbers)
    return math.sqrt(squared_spread / float(count - 1))

def summarize(dataset):
    """Compute ``(mean, stdev)`` for every column of ``dataset`` but the last.

    Args:
        dataset: Sequence of equal-length rows; ``zip(*dataset)`` is used to
            walk it column by column.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per column, with the entry
        for the final column removed.
    """
    column_stats = []
    for column in zip(*dataset):
        column_stats.append((mean(column), stdev(column)))
    # Discard the statistics of the final column.
    column_stats.pop()
    return column_stats

In [50]:
def summarizebyclass(dataset):
    """Build per-class column summaries for ``dataset``.

    The rows are grouped with ``seperatebyclass`` and each group is passed
    to ``summarize``.

    Returns:
        A dict mapping each class value to the list returned by
        ``summarize`` for the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in seperatebyclass(dataset).items()
    }
def calculateprobability(x,mean,stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squared_distance = math.pow(x - mean, 2)
    spread = 2 * math.pow(stdev, 2)
    decay = math.exp(-(squared_distance / spread))
    normaliser = math.sqrt(2 * math.pi) * stdev
    return (1 / normaliser) * decay
def calculateclassprobabilities(summaries,inputvector):
    """Score ``inputvector`` against every class in ``summaries``.

    Args:
        summaries: Dict mapping a class value to a list of
            ``(mean, stdev)`` pairs, one pair per attribute.
        inputvector: Sequence whose i-th element is matched with the i-th
            ``(mean, stdev)`` pair of each class.

    Returns:
        A dict mapping each class value to the product of
        ``calculateprobability`` over all of that class's attribute pairs
        (``1`` when a class has no pairs).
    """
    scores = {}
    for label, attribute_stats in summaries.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(attribute_stats):
            likelihood *= calculateprobability(inputvector[position], mu, sigma)
        scores[label] = likelihood
    return scores

In [51]:
def predict(summaries,inputvector):
    """Return the class whose score for ``inputvector`` is highest.

    Scores come from ``calculateclassprobabilities``. The first class seen
    is always taken as the initial winner; later classes replace it only
    when their score is strictly greater, so ties keep the earlier class.

    Returns:
        The winning class value, or ``None`` when ``summaries`` is empty.
    """
    scores = calculateclassprobabilities(summaries, inputvector)
    winner, top_score = None, -1
    for label in scores:
        score = scores[label]
        # Skip candidates that do not beat the current winner.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner
def getpredictions(summaries,testset):
    """Run ``predict`` on every row of ``testset``.

    Returns:
        A list with one predicted class value per row, in row order.
    """
    return [predict(summaries, row) for row in testset]

In [52]:
def getaccuracy(testset,prediction):
    """Return the percentage of rows whose last element equals its prediction.

    Args:
        testset: Sequence of rows; the last element of each row is compared
            with the prediction at the same index.
        prediction: Sequence of predicted values, indexed like ``testset``.

    Returns:
        A float in the range 0.0 to 100.0.

    Raises:
        ZeroDivisionError: If ``testset`` is empty.
    """
    total = len(testset)
    hits = sum(1 for i in range(total) if testset[i][-1] == prediction[i])
    return (hits / float(total)) * 100.0

def main(filename='/home/mite/Desktop/cs089/data5.csv',splitratio=0.67):
    """Train and evaluate the classifier on a CSV file, printing each step.

    Loads the data with ``loadcsv``, splits it with ``splitdataset``, builds
    the per-class summaries with ``summarizebyclass``, predicts the held-out
    rows with ``getpredictions`` and prints the result of ``getaccuracy``.

    Args:
        filename: Path of the CSV file to load. The default is the path the
            notebook was originally run against.
        splitratio: Fraction of rows used for training.
    """
    dataset=loadcsv(filename)
    print("\n the length of the data set=",len(dataset))
    print("\n the data set splitting into training and testing \n")
    trainingset,testset=splitdataset(dataset,splitratio)
    print("\n number of rows in training set:{0}rows".format(len(trainingset)))
    print("\n number of rows in testing set:{0}rows".format(len(testset)))
    # Slicing shows at most five rows and, unlike indexing 0..4, does not
    # raise IndexError when a split holds fewer than five rows.
    print("\n first five rows of training set:\n")
    for row in trainingset[:5]:
        print(row,"\n")
    print("\n first five rows of testing set:\n")
    for row in testset[:5]:
        print(row,"\n")
    summaries=summarizebyclass(trainingset)
    print("\n model summaries :\n",summaries)
    predictions=getpredictions(summaries,testset)
    print("\n predictions:\n",predictions)
    accuracy=getaccuracy(testset,predictions)
    print("\n Accuracy:{0}%".format(accuracy))
main()
    


 the length of the data set= 768

 the data set splitting into training and testing 


 number of rows in training set:514rows

 number of rows in testing set:254rows

 first five rows of training set:

[9.0, 154.0, 78.0, 30.0, 100.0, 30.9, 0.164, 45.0, 0.0] 

[8.0, 143.0, 66.0, 0.0, 0.0, 34.9, 0.129, 41.0, 1.0] 

[8.0, 154.0, 78.0, 32.0, 0.0, 32.4, 0.443, 45.0, 1.0] 

[10.0, 101.0, 86.0, 37.0, 0.0, 45.6, 1.136, 38.0, 1.0] 

[0.0, 165.0, 76.0, 43.0, 255.0, 47.9, 0.259, 26.0, 0.0] 


 first five rows of testing set:

[8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0] 

[1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 0.0] 

[2.0, 197.0, 70.0, 45.0, 543.0, 30.5, 0.158, 53.0, 1.0] 

[1.0, 103.0, 30.0, 38.0, 83.0, 43.3, 0.183, 33.0, 0.0] 

[1.0, 115.0, 70.0, 30.0, 96.0, 34.6, 0.529, 32.0, 1.0] 


 model summaries :
 {0.0: [(3.3294117647058825, 2.9990686274757996), (108.87352941176471, 25.885244926610927), (68.54411764705883, 17.398702884152467), (19.894117647058824, 15.165769194681632),