In [1]:
import os
import pickle
import math
import gc

from Parser import *
from Classifier import *

In [7]:
class DictionaryUtils:
    #Append new data to existing dictionary/frequency arrays
    @staticmethod
    def createDictionary(dictionary, frequency, files):
        tmpDict = []
        tmpFreq = []
        tmpAppends = []
                
        #Create small dict
        for file in files:
            for word in file.split(" "):
                if word in tmpDict:
                    tmpFreq[tmpDict.index(word)] += 1
                else:
                    tmpDict.append(word)
                    tmpFreq.append(1)

        #Merge old and new dict
        for i in xrange(len(tmpDict)):
            if tmpDict[i] in dictionary:
                frequency[dictionary.index(tmpDict[i])] += tmpFreq[i]
            else:
                tmpAppends.append(tmpDict[i])
                frequency.append(tmpFreq[i])
                
        return dictionary + tmpAppends, frequency
    
    #Cut the extremes off the dictionary array
    @staticmethod
    def simplifyDictionary(dictionary, frequency, percentageUpperBound = 0.95, minOccurences = 200):
        newDict = []
        maximum = max(frequency)
        for i in xrange(len(dictionary)):
            if frequency[i] <= maximum * percentageUpperBound and frequency[i] >= minOccurences:
                newDict.append(dictionary[i])
        return newDict
    
    @staticmethod
    def mutualInformationSummand(fSel, fOpp, tSel, tOpp):
        p_xy = 1.0 * fSel / (tSel + tOpp)
        p_x = (fSel + fOpp) * 1.0 / (tSel + tOpp)
        p_y = tSel * 1.0 / (tSel + tOpp)
        if p_xy / (p_x * p_y) > 0:
            return p_xy * math.log(p_xy / (p_x * p_y))
        else: 
            return 0
    
    @staticmethod
    def mutualInformation(dSpam, fSpam, dHam, fHam):
        mSpam = []
        for i in xrange(len(dSpam)):
            if dSpam[i] in dHam:
                fOpp = fHam[dHam.index(dSpam[i])]
            else:
                fOpp = 0
            MI = DictionaryUtils.mutualInformationSummand(fSpam[i], fOpp, len(dSpam), len(dHam))
            MI += DictionaryUtils.mutualInformationSummand(fOpp, fSpam[i], len(dHam), len(dSpam))
            mSpam += [{"word" : dSpam[i], "MI" : MI}]
        for i in xrange(len(dHam)):
            if dHam[i] not in dSpam:
                MI = DictionaryUtils.mutualInformationSummand(fHam[i], 0, len(dHam), len(dSpam))
                mSpam += [{"word" : dHam[i], "MI" : MI}]
        return mSpam

In [3]:
class PickledDataProcessing:
    #Expand the database from a given pickled file. Merge every mergeAfter messages.
    @staticmethod
    def crawlMails(dictionary, frequency, dump, breakAfter = -1, mergeAfter = 10000):
        i = 0
        j = 0
        data = pickle.load(open( dump, "rb" ))
        files = []
        for text in data:
            content = (Parser.stripHeaders(text)).lower()
            files.append(content)
            if i == breakAfter:
                break;
            if j == mergeAfter:
                dictionary, frequency = DictionaryUtils.createDictionary(dictionary, frequency, files)
                del(files)
                files = []
                j = -1
            i+=1        
            j+=1
        dictionary, frequency = DictionaryUtils.createDictionary(dictionary, frequency, files)
        return dictionary, frequency
    
    #Load file from given databases. Return it as an input data for classifier parsed with a given parser.
    @staticmethod
    def loadMails(parser, pickledFile, count = -1):
        msgs = []
        ctr = 0
        for i in pickle.load(open( pickledFile, "rb" )):
            msgs.append(parser.parseEmail(i))
            if (count != -1):
                if ctr == count:
                    break
                ctr += 1
                
        return msgs
    
    #Serialize all mails in given directory into one pickle file of the same name.
    @staticmethod
    def picklizeDirectory(rootDir):
        msgs = []
        for filename in os.listdir(rootDir):
            with open(rootDir + "/" + filename) as f:
                msgs.append(f.read().lower())
        print "total # of emails: " + str(len(msgs))
        pickle.dump( msgs, open( rootDir + ".p", "wb" ))
        
    #Build a dictionary (array) containing all words from given datasets.
    @staticmethod
    def extractDictionary(pickledData, amount = 2000, dictionarySize = 15000):
        d, f = [], []
        for x in pickledData:
            d, f = PickledDataProcessing.crawlMails(d, f, x, breakAfter = amount)
        
        if dictionarySize == -1:
            return d, f
        
        topBarrier = 1
        botBarrier = 1
        sim = DictionaryUtils.simplifyDictionary(d, f, topBarrier, botBarrier)
        while len(sim) > dictionarySize and dictionarySize != -1:
            topBarrier -= 0.0025
            botBarrier += 1
            sim = DictionaryUtils.simplifyDictionary(d, f, topBarrier, botBarrier)
        return sim

In [4]:
#Remove words that have no effect on the classification process for a given classifier.
def optimizeDictionary(dictionary, classifier):
    newDict = []
    for i in xrange(len(dictionary)):
        if classifier.a[i, 0] != 0:
            newDict.append(dictionary[i])
    return newDict

#Create a classifier from two given pickled arrays - ham and spam.
def buildClassifier(pickledHam, pickledSpam, parser, amount = 2000):
    spam = PickledDataProcessing.loadMails(parser, pickledSpam, amount)
    ham = PickledDataProcessing.loadMails(parser, pickledHam, amount)
    
    classifier = Classifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    return classifier

#Evaluate the quality of a given classifier on several saved datasets.
def evaluateDataset(classifier, parser, pickledDataset, amount = -1):
    i = 0
    ctr = 0
    dataset = pickle.load(open(pickledDataset, "rb"))
    for x in dataset:
        if classifier.evaluate(parser.parseEmail(x)) > 1.0 / 2:
            i += 1
        ctr += 1
        if amount == ctr:
            break
    print str(100 * i / ctr) + "% of given datased was classified as positive!"
    
def testClassifier(classifier, parser, label = "", amount = -1):
    print "***************"+ label +"***************"
#    print "Spamassassin HAM"
#    evaluateDataset(classifier, parser, "ham.p", amount)
#    gc.collect()
#    print "Spamassassin SPAM"
#    evaluateDataset(classifier, parser, "spam.p", amount)
#    gc.collect()
    print "Enron HAM"
    evaluateDataset(classifier, parser, "Enron/ham.p", amount)
    gc.collect()
    print "Enron SPAM"
    evaluateDataset(classifier, parser, "Enron/spam.p", amount)
    gc.collect()
    print "SPAM dataset"
    evaluateDataset(classifier, parser, "SPAM/01.p")
    gc.collect()
    print "Classifier evaluation DONE"
    print "***************"+ label +"***************"

In [5]:
import sys
import resource

soft, hard = 5.0 * 10**9, 5.0 * 10**9
resource.setrlimit(resource.RLIMIT_AS,(soft, hard))

In [11]:
dHam, fHam = PickledDataProcessing.extractDictionary(["Enron/ham.p"], 500, -1)
dSpam, fSpam = PickledDataProcessing.extractDictionary(["Enron/spam.p"], 500, -1)
print "Frequencies/Dictionaries loaded"

MI = DictionaryUtils.mutualInformation(dSpam, fSpam, dHam, fHam)
sMI = sorted(MI, key=lambda k: -k['MI']) 
print "Dictionary Impact-sorting done"
sDict = [i['word'] for i in sMI]
print "Dictionary Export done, length " + str(len(sDict))

Frequencies/Dictionaries loaded
Dictionary Impact-sorting done
Dictionary Export done, length 26661


In [12]:
optimziedDictionary = sDict[0:250]
parser = Parser(optimziedDictionary)
#classifier = buildClassifier("Enron/ham.p", "Enron/spam.p", parser, amount = 1000)

In [14]:
spam = PickledDataProcessing.loadMails(parser, "Enron/spam.p", 500)
ham = PickledDataProcessing.loadMails(parser, "Enron/ham.p", 500)
print "Prepared spam/ham"
classifier = Classifier(spam + ham, [1] * len(spam) + [0] * len(ham))
testClassifier(classifier, parser, "Enron dataset", 10000)

Prepared spam/ham
***************Enron dataset***************
Enron HAM
11% of given datased was classified as positive!
Enron SPAM
89% of given datased was classified as positive!
SPAM dataset
96% of given datased was classified as positive!
Classifier evaluation DONE
***************Enron dataset***************
