In [1]:
import pandas as pd
import numpy as np

In [26]:
def parseMultinomialBayes(txtfile, numClasses = 2, linesplit = "\r\n"):
    with open(txtfile) as f:
        f_read = f.read()
    instanceList = f_read.split(linesplit)
    del instanceList[-1]

    classStats = [{} for i in range(numClasses)]
    wordCount = [0 for i in range(numClasses)]
    numInstances = len(instanceList)
    frequencies = [0 for i in range(numClasses)]

    for instance in instanceList:
        features = instance.split(" ")
        classification = int(features[0])
        
        #for sentimental analysis classification of -1 will just be 0 for simplicity
        if (classification == -1):
            classification = 0
            
        frequencies[classification] += 1.0
        del features[0]
        for feature in features:
            pair = feature.split(":")
            if not pair[0] in classStats[0]:
                classStats[0][pair[0]] = 0

            if not pair[0] in classStats[1]:
                classStats[1][pair[0]] = 0

            classStats[classification][pair[0]] += int(pair[1])
            wordCount[classification] += 1
            
    for i in range(len(frequencies)):
        frequencies[i] = frequencies[i]/numInstances
    
    return classStats, wordCount, frequencies

def parseBernoulliBayes(txtfile, numClasses = 2, linesplit = "\r\n"):
    with open(txtfile) as f:
        f_read = f.read()
    instanceList = f_read.split(linesplit)
    del instanceList[-1]

    classStats = [{} for i in range(numClasses)]
    wordCount = [0 for i in range(numClasses)]
    numInstances = 0
    frequencies = [0 for i in range(numClasses)]

    for instance in instanceList:
        features = instance.split(" ")
        classification = int(features[0])
        
        #for sentimental analysis classification of -1 will just be 0 for simplicity
        if (classification == -1):
            classification = 0
            
        frequencies[classification] += 1.0
        del features[0]
        for feature in features:
            pair = feature.split(":")
            if not pair[0] in classStats[0]:
                classStats[0][pair[0]] = 0
                numInstances += 1

            if not pair[0] in classStats[1]:
                classStats[1][pair[0]] = 0

            classStats[classification][pair[0]] += int(pair[1])
            wordCount[classification] = 1
            
    for i in range(len(frequencies)):
        frequencies[i] = frequencies[i]/numInstances
    
    return classStats, wordCount, frequencies

def findLikelihood(stats, wordCount, laplaceSmoothFactor, numClasses = 2):
    likelihood = [{} for i in range(numClasses)]
    for i in range(numClasses):
        numUnique = 0
        #Smooth the parameters using Laplacian smoothing
        for word in stats[i]:
            if stats[i][word] <= laplaceSmoothFactor:
                numUnique += 1
                likelihood[i][word] = 0 
                
            else:
                likelihood[i][word] = np.log(float(stats[i][word])/wordCount[i])
            
        for word in stats[i]:
            if likelihood[i][word] == 0:
                likelihood[i][word] = np.log(float(stats[i][word]+1)/(wordCount[i]+numUnique))
                
    return likelihood


In [32]:
class baysianClassifier():
    def __init__(self, likelihood, frequencies):
        self.likelihood = likelihood
        self.frequencies = frequencies
    
    def test(self, txtfile, numClasses = 2 , linesplit = "\r\n"):
        with open(txtfile) as f:
            f_read = f.read()
        instanceList = f_read.split(linesplit)
        del instanceList[-1]
        
        accuracy = 0

        for instance in instanceList:
            score = [0 for i in range(numClasses)]
            classification = 0
            features = instance.split(" ")
            classification = int(features[0]) 
            
            #for sentimental analysis classification of -1 will just be 0 for simplicity
            if (classification == -1):
                classification = 0
            
            del features[0]
            for feature in features:
                pair = feature.split(":")
                
                if pair[0] in self.likelihood[0]:
                    for i in range(numClasses):
                        score[i] += self.likelihood[i][pair[0]]
            
            for i in range(numClasses):
                score[i] = score[i]*frequencies[i]
            newclass = score.index(max(score))
            
            if classification == newclass:
                accuracy += 1
            else:
                accuracy += 0
        
        return float(accuracy)/len(instanceList)
    

In [34]:
classStats, wordCount, frequencies = parseMultinomialBayes("spam_detection/train_email.txt")
likelihood = findLikelihood(classStats, wordCount, 0)
bayes = baysianClassifier(likelihood, frequencies)
print 'train: ', bayes.test("spam_detection/train_email.txt")
print 'test: ', bayes.test("spam_detection/test_email.txt")

train:  0.995714285714
test:  0.969230769231


In [33]:
classStats, wordCount, frequencies = parseMultinomialBayes("sentiment/rt-train.txt", linesplit = '\n')
likelihood = findLikelihood(classStats, wordCount, 0)
bayes = baysianClassifier(likelihood, frequencies)
print 'train: ', bayes.test("sentiment/rt-train.txt", linesplit = '\n')
print 'test: ', bayes.test("sentiment/rt-test.txt", linesplit = '\n')

train:  0.91675
test:  0.753
