In [1]:
import os
import pickle
import math
import gc

from Parser import *
from Classifier import *

In [2]:
class DictionaryUtils:
    #Append new data to existing dictionary/frequency arrays
    @staticmethod
    def createDictionary(dictionary, frequency, files):
        tmpDict = []
        tmpFreq = []
        tmpAppends = []
                
        #Create small dict
        for file in files:
            for word in file.split(" "):
                if word in tmpDict:
                    tmpFreq[tmpDict.index(word)] += 1
                else:
                    tmpDict.append(word)
                    tmpFreq.append(1)

        #Merge old and new dict
        for i in xrange(len(tmpDict)):
            if tmpDict[i] in dictionary:
                frequency[dictionary.index(tmpDict[i])] += tmpFreq[i]
            else:
                tmpAppends.append(tmpDict[i])
                frequency.append(tmpFreq[i])
                
        return dictionary + tmpAppends, frequency
    
    #Cut the extremes off the dictionary array
    @staticmethod
    def simplifyDictionary(dictionary, frequency, percentageUpperBound = 0.95, minOccurences = 200):
        newDict = []
        maximum = max(frequency)
        for i in xrange(len(dictionary)):
            if frequency[i] <= maximum * percentageUpperBound and frequency[i] >= minOccurences:
                newDict.append(dictionary[i])
        return newDict

In [3]:
class PickledDataProcessing:
    #Expand the database from a given pickled file. Merge every mergeAfter messages.
    @staticmethod
    def crawlMails(dictionary, frequency, dump, breakAfter = 10000, mergeAfter = 10000):
        i = 0
        j = 0
        data = pickle.load(open( dump, "rb" ))
        files = []
        for text in data:
            content = (Parser.stripHeaders(text)).lower()
            files.append(content)
            if i == breakAfter:
                break;
            if j == mergeAfter:
                dictionary, frequency = DictionaryUtils.createDictionary(dictionary, frequency, files)
                del(files)
                files = []
                j = -1
            i+=1        
            j+=1
        dictionary, frequency = DictionaryUtils.createDictionary(dictionary, frequency, files)
        return dictionary, frequency
    
    #Load file from given databases. Return it as an input data for classifier parsed with a given parser.
    @staticmethod
    def loadMails(parser, pickledFile, count = -1):
        msgs = []
        ctr = 0
        for i in pickle.load(open( pickledFile, "rb" )):
            msgs.append(parser.parseEmail(i))
            if (count != -1):
                if ctr == count:
                    break
                ctr += 1
                
        return msgs
    
    #Serialize all mails in given directory into one pickle file of the same name.
    @staticmethod
    def picklizeDirectory(rootDir):
        msgs = []
        for filename in os.listdir(rootDir):
            with open(rootDir + "/" + filename) as f:
                msgs.append(f.read().lower())
        print "total # of emails: " + str(len(msgs))
        pickle.dump( msgs, open( rootDir + ".p", "wb" ))
        
    #Build a dictionary (array) containing all words from given datasets.
    @staticmethod
    def extractDictionary(pickledData, amount = 2000, dictionarySize = 15000):
        d, f = [], []
        for x in pickledData:
            d, f = PickledDataProcessing.crawlMails(d, f, x, breakAfter = amount)

        topBarrier = 1
        botBarrier = 1
        sim = DictionaryUtils.simplifyDictionary(d, f, topBarrier, botBarrier)
        while len(sim) > dictionarySize and dictionarySize != -1:
            topBarrier -= 0.0025
            botBarrier += 1
            sim = DictionaryUtils.simplifyDictionary(d, f, topBarrier, botBarrier)
        return sim

In [4]:
#Remove words that have no effect on the classification process for a given classifier.
def optimizeDictionary(dictionary, classifier):
    newDict = []
    for i in xrange(len(dictionary)):
        if classifier.a[i, 0] != 0:
            newDict.append(dictionary[i])
    return newDict

#Create a classifier from two given pickled arrays - ham and spam.
def buildClassifier(pickledHam, pickledSpam, parser, amount = 2000):
    spam = PickledDataProcessing.loadMails(parser, pickledSpam, amount)
    ham = PickledDataProcessing.loadMails(parser, pickledSpam, amount)
    
    classifier = Classifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    return classifier

#Create a classifier from two given pickled arrays - ham and spam. This classifier has all zero-valued words cut off.
def buildOptimizedClassifier(pickledHam, pickledSpam, amount):
    print "******************************"

    dictionary = PickledDataProcessing.extractDictionary([pickledHam, pickledSpam], amount = amount, dictionarySize = -1)    
    newDictionary = optimizeDictionary(dictionary, buildClassifier(pickledHam, pickledSpam, Parser(dictionary), amount = amount))
    parser = Parser(newDictionary)
    classifier = buildClassifier(pickledHam, pickledSpam, parser, amount = amount)
    
    print "Classifier stats:"
    print "HAM"
    evaluateDataset(classifier, Parser(newDictionary), pickledHam, amount)
    print "SPAM"
    evaluateDataset(classifier, Parser(newDictionary), pickledSpam, amount)
    
    print "DONE"
    print "******************************"

    return classifier, newDictionary

#Evaluate the quality of a given classifier on several saved datasets.
def evaluateDataset(classifier, parser, pickledDataset, amount = -1):
    i = 0
    ctr = 0
    dataset = pickle.load(open(pickledDataset, "rb"))
    for x in dataset:
        if classifier.evaluate(parser.parseEmail(x)) > 1.0 / 2:
            i += 1
        ctr += 1
        if amount == ctr:
            break
    print str(100 * i / ctr) + "% of given datased was classified as positive!"
    
def testClassifier(classifier, dictionary, label = "", amount = -1):
    print "***************"+ label +"***************"
    print "Spamassassin HAM"
    evaluateDataset(classifier, Parser(dictionary), "ham.p", amount)
    gc.collect()
    print "Spamassassin SPAM"
    evaluateDataset(classifier, Parser(dictionary), "spam.p", amount)
    gc.collect()
    print "Enron HAM"
    evaluateDataset(classifier, Parser(dictionary), "Enron/ham.p", amount)
    gc.collect()
    print "Enron SPAM"
    evaluateDataset(classifier, Parser(dictionary), "Enron/spam.p", amount)
    gc.collect()
    print "SPAM dataset"
    evaluateDataset(classifier, Parser(dictionary), "SPAM/01.p")
    gc.collect()
    print "Classifier evaluation DONE"
    print "***************"+ label +"***************"

In [5]:
import sys
import resource

soft, hard = 5.0 * 10**9, 5.0 * 10**9
resource.setrlimit(resource.RLIMIT_AS,(soft, hard))

In [None]:
amount = 200
class1, dict1 = buildOptimizedClassifier("Enron/ham.p", "Enron/spam.p", amount)
class2, dict2 = buildOptimizedClassifier("ham.p", "spam.p", amount)

pickle.dump([class1, dict1], open("classifier1.p", "wb"))
pickle.dump([class2, dict2], open("classifier2.p", "wb"))

******************************
Classifier stats:
HAM
37% of given datased was classified as positive!
SPAM
95% of given datased was classified as positive!
DONE
******************************
******************************
Classifier stats:
HAM
79% of given datased was classified as positive!
SPAM
64% of given datased was classified as positive!
DONE
******************************


In [None]:
class1, dic1 = pickle.load(open("classifier1.p", "rb"))
class2, dic2 = pickle.load(open("classifier2.p", "rb"))

testClassifier(class1, dic1, "Enron dataset", 1000)
testClassifier(class2, dic2, "Spamassassin dataset", 1000)

***************Enron dataset***************
Spamassassin HAM
84% of given datased was classified as positive!
Spamassassin SPAM
84% of given datased was classified as positive!
Enron HAM
37% of given datased was classified as positive!
Enron SPAM
52% of given datased was classified as positive!
SPAM dataset
