In [67]:
def parseMultinomialBayes(txtfile, numClasses = 2, linesplit = "\r\n"):
    #parse file
    with open(txtfile) as f:
        f_read = f.read()
    instanceList = f_read.split(linesplit)
    del instanceList[-1]

    classStats = [{} for i in range(numClasses)]
    wordCount = [0 for i in range(numClasses)] #counts the number of words associated with that particular class
    numInstances = len(instanceList)
    frequencies = [0 for i in range(numClasses)] #the frequency in which a class appears

    for instance in instanceList:
        features = instance.split(" ")
        classification = int(features[0])
        
        #for sentimental analysis classification of -1 will just be 0 for simplicity
        if (classification == -1):
            classification = 0
            
        frequencies[classification] += 1.0
        del features[0]
        for feature in features:
            pair = feature.split(":")
            if not pair[0] in classStats[0]:
                classStats[0][pair[0]] = 0

            if not pair[0] in classStats[1]:
                classStats[1][pair[0]] = 0

            classStats[classification][pair[0]] += int(pair[1]) #add the number of instances you have seen the word to the total
            wordCount[classification] += 1
            
    for i in range(len(frequencies)):
        frequencies[i] = frequencies[i]/numInstances
    
    return classStats, wordCount, frequencies

def parseBernoulliBayes(txtfile, numClasses = 2, linesplit = "\r\n"):
    
    #parse file
    with open(txtfile) as f:
        f_read = f.read()
    instanceList = f_read.split(linesplit)
    del instanceList[-1]

    classStats = [{} for i in range(numClasses)]
    docCount = [0 for i in range(numClasses)]
    numInstances = len(instanceList)
    frequencies = [0 for i in range(numClasses)]

    for instance in instanceList:
        features = instance.split(" ")
        classification = int(features[0])
        
        #for sentimental analysis classification of -1 will just be 0 for simplicity
        if (classification == -1):
            classification = 0
            
        frequencies[classification] += 1.0
        del features[0]
        for feature in features:
            pair = feature.split(":")
            if not pair[0] in classStats[0]:
                classStats[0][pair[0]] = 0

            if not pair[0] in classStats[1]:
                classStats[1][pair[0]] = 0

            classStats[classification][pair[0]] += 1 #for every document in which the feature appears add one
        docCount[classification] = numInstances
            
    for i in range(len(frequencies)):
        frequencies[i] = frequencies[i]/numInstances
    
    return classStats, docCount, frequencies

def findLikelihood(stats, wordCount, laplaceSmoothFactor, numClasses = 2):
    likelihood = [{} for i in range(numClasses)]
    for i in range(numClasses):
        numUnique = 0
        #Smooth the parameters using Laplacian smoothing
        for word in stats[i]:
            if stats[i][word] <= laplaceSmoothFactor:
                numUnique += 1
                likelihood[i][word] = 0 
                
            else:
                likelihood[i][word] = np.log(float(stats[i][word])/wordCount[i])
            
        for word in stats[i]:
            if likelihood[i][word] == 0:
                likelihood[i][word] = np.log(float(stats[i][word]+1)/(wordCount[i]+numUnique))
                
    return likelihood

def pickKeysWithMaxValues(A, n =20):
    #picks the keys in a dictionary that correspond to n maximum values
    newlist = []
    v=list(A.values())
    k=list(A.keys())
    for i in range(n):
        idx = v.index(max(v))
        newlist.append(k[idx])
        del k[idx]
        del v[idx]
    return newlist

In [68]:
class baysianClassifier():
    def __init__(self, likelihood, frequencies):
        self.likelihood = likelihood
        self.frequencies = frequencies
    
    def test(self, txtfile, numClasses = 2 , linesplit = "\r\n"):
        with open(txtfile) as f:
            f_read = f.read()
        instanceList = f_read.split(linesplit)
        del instanceList[-1]
        
        accuracy = 0

        for instance in instanceList:
            score = [0 for i in range(numClasses)]
            classification = 0
            features = instance.split(" ")
            classification = int(features[0]) 
            
            #for sentimental analysis classification of -1 will just be 0 for simplicity
            if (classification == -1):
                classification = 0
            
            del features[0]
            for feature in features:
                pair = feature.split(":")
                
                if pair[0] in self.likelihood[0]:
                    for i in range(numClasses):
                        score[i] += self.likelihood[i][pair[0]]
            
            for i in range(numClasses):
                score[i] = score[i]*frequencies[i]
            newclass = score.index(max(score))
            
            if classification == newclass:
                accuracy += 1
            else:
                accuracy += 0
        
        return float(accuracy)/len(instanceList)
    

In [69]:
classStats, wordCount, frequencies = parseMultinomialBayes("spam_detection/train_email.txt")
likelihood = findLikelihood(classStats, wordCount, 0)
print pickKeysWithMaxValues(likelihood[0], n = 20)
print pickKeysWithMaxValues(likelihood[1], n = 20)
bayes = baysianClassifier(likelihood, frequencies)
print 'train: ', bayes.test("spam_detection/train_email.txt")
print 'test: ', bayes.test("spam_detection/test_email.txt")

['language', 'university', 's', 'linguistic', 'de', 'information', 'conference', 'workshop', 'email', 'paper', 'e', 'english', 'one', 'please', 'include', 'edu', 'http', 'research', 'abstract', 'address']
['email', 's', 'order', 'report', 'our', 'address', 'mail', 'program', 'send', 'free', 'money', 'list', 'receive', 'name', 'business', 'one', 'd', 'work', 'com', 'nt']
train:  0.995714285714
test:  0.969230769231


In [70]:
classStats, wordCount, frequencies = parseMultinomialBayes("sentiment/rt-train.txt", linesplit = '\n')
likelihood = findLikelihood(classStats, wordCount, 0)
print pickKeysWithMaxValues(likelihood[0], n = 20)
print pickKeysWithMaxValues(likelihood[1], n = 20)
bayes = baysianClassifier(likelihood, frequencies)
print 'train: ', bayes.test("sentiment/rt-train.txt", linesplit = '\n')
print 'test: ', bayes.test("sentiment/rt-test.txt", linesplit = '\n')

['movie', 'film', 'like', 'one', '--', 'bad', 'story', 'much', 'time', 'even', 'good', 'characters', 'little', 'would', 'comedy', 'never', 'nothing', 'makes', 'plot', 'make']
['film', 'movie', '--', 'one', 'like', 'story', 'good', 'comedy', 'way', 'even', 'time', 'best', 'much', 'performances', 'funny', 'make', 'life', 'us', 'makes', 'characters']
train:  0.91675
test:  0.753


In [71]:
classStats, wordCount, frequencies = parseBernoulliBayes("spam_detection/train_email.txt")
likelihood = findLikelihood(classStats, wordCount, 0)
print pickKeysWithMaxValues(likelihood[0], n = 20)
print pickKeysWithMaxValues(likelihood[1], n = 20)
bayes = baysianClassifier(likelihood, frequencies)
print 'train: ', bayes.test("spam_detection/train_email.txt")
print 'test: ', bayes.test("spam_detection/test_email.txt")

['language', 'university', 's', 'information', 'linguistic', 'http', 'email', 'please', 'e', 'follow', 'fax', 'include', 'one', 'english', 'call', 'research', 'www', 'word', 'address', 'interest']
['our', 's', 'free', 'please', 'email', 'mail', 'one', 'address', 'list', 'com', 'receive', 'http', 'us', 'send', 'day', 'information', 'remove', 'here', 'over', 'want']
train:  1.0
test:  0.980769230769


In [72]:
classStats, wordCount, frequencies = parseBernoulliBayes("sentiment/rt-train.txt", linesplit = '\n')
likelihood = findLikelihood(classStats, wordCount, 0)
print pickKeysWithMaxValues(likelihood[0], n = 20)
print pickKeysWithMaxValues(likelihood[1], n = 20)
bayes = baysianClassifier(likelihood, frequencies)
print 'train: ', bayes.test("sentiment/rt-train.txt", linesplit = '\n')
print 'test: ', bayes.test("sentiment/rt-test.txt", linesplit = '\n')

['movie', 'film', 'like', 'one', 'story', 'much', '--', 'bad', 'time', 'even', 'characters', 'little', 'good', 'would', 'comedy', 'nothing', 'makes', 'plot', 'never', 'make']
['film', 'movie', 'one', 'like', '--', 'story', 'comedy', 'way', 'even', 'good', 'best', 'time', 'much', 'performances', 'funny', 'makes', 'life', 'make', 'characters', 'work']
train:  0.9545
test:  0.752
