In [1]:
import os
import pickle
import math
import gc

from Parser import *
from Classifier import *

In [2]:
class DictionaryUtils:
    #Append new data to existing dictionary/frequency arrays
    @staticmethod
    def createDictionary(dictionary, frequency, files):
        tmpDict = []
        tmpFreq = []
        tmpAppends = []
                
        #Create small dict
        for file in files:
            for word in file.split(" "):
                if word in tmpDict:
                    tmpFreq[tmpDict.index(word)] += 1
                else:
                    tmpDict.append(word)
                    tmpFreq.append(1)

        #Merge old and new dict
        for i in xrange(len(tmpDict)):
            if tmpDict[i] in dictionary:
                frequency[dictionary.index(tmpDict[i])] += tmpFreq[i]
            else:
                tmpAppends.append(tmpDict[i])
                frequency.append(tmpFreq[i])
                
        return dictionary + tmpAppends, frequency
        
    @staticmethod
    def mutualInformationSummand(fSel, fOpp, tSel, tOpp):
        p_xy = 1.0 * fSel / (tSel + tOpp)
        p_x = (fSel + fOpp) * 1.0 / (tSel + tOpp)
        p_y = tSel * 1.0 / (tSel + tOpp)
        if p_xy / (p_x * p_y) > 0:
            return p_xy * math.log(p_xy / (p_x * p_y))
        else: 
            return 0
    
    @staticmethod
    def mutualInformation(dSpam, fSpam, dHam, fHam):
        mSpam = []
        for i in xrange(len(dSpam)):
            if dSpam[i] in dHam:
                fOpp = fHam[dHam.index(dSpam[i])]
            else:
                fOpp = 0
            MI = DictionaryUtils.mutualInformationSummand(fSpam[i], fOpp, len(dSpam), len(dHam))
            MI += DictionaryUtils.mutualInformationSummand(fOpp, fSpam[i], len(dHam), len(dSpam))
            mSpam += [{"word" : dSpam[i], "MI" : MI}]
        for i in xrange(len(dHam)):
            if dHam[i] not in dSpam:
                MI = DictionaryUtils.mutualInformationSummand(fHam[i], 0, len(dHam), len(dSpam))
                mSpam += [{"word" : dHam[i], "MI" : MI}]
        return mSpam
    
    @staticmethod
    def dictionaryMI(dSpam, fSpam, dHam, fHam):
        MI = DictionaryUtils.mutualInformation(dSpam, fSpam, dHam, fHam)
        sMI = sorted(MI, key=lambda k: -k['MI']) 
        return [i['word'] for i in sMI]
        

In [3]:
class PickledDataProcessing:
    #Expand the database from a given pickled file. Merge every mergeAfter messages.
    @staticmethod
    def crawlMails(dictionary, frequency, dump, breakAfter = -1, mergeAfter = 10000):
        i = 0
        j = 0
        data = pickle.load(open( dump, "rb" ))
        files = []
        for text in data:
            content = (Parser.stripHeaders(text)).lower()
            files.append(content)
            if i == breakAfter:
                break;
            if j == mergeAfter:
                dictionary, frequency = DictionaryUtils.createDictionary(dictionary, frequency, files)
                del(files)
                files = []
                j = -1
            i+=1        
            j+=1
        dictionary, frequency = DictionaryUtils.createDictionary(dictionary, frequency, files)
        return dictionary, frequency
    
    #Load file from given databases. Return it as an input data for classifier parsed with a given parser.
    @staticmethod
    def loadMails(parser, pickledFile, count = -1):
        msgs = []
        ctr = 0
        for i in pickle.load(open( pickledFile, "rb" )):
            msgs.append(parser.parseEmail(i))
            if (count != -1):
                if ctr == count:
                    break
                ctr += 1
                
        return msgs
        
    #Build a dictionary (array) containing all words from given datasets.
    @staticmethod
    def extractDictionary(pickledData, amount = 2000):
        d, f = [], []
        for x in pickledData:
            d, f = PickledDataProcessing.crawlMails(d, f, x, breakAfter = amount)
        
        return d, f

In [None]:
#Evaluate the quality of a given classifier on several saved datasets.
def evaluateDataset(classifier, parser, pickledDataset, amount = -1):
    i = 0
    ctr = 0
    dataset = pickle.load(open(pickledDataset, "rb"))
    for x in dataset:
        if classifier.evaluate(parser.parseEmail(x)) > 1.0 / 2:
            i += 1
        ctr += 1
        if amount == ctr:
            break
    if amount == -1:
        print "Dataset length: " + str(len(dataset))
    else:
        print "Dataset length: " + str(min(len(dataset), amount))
    print str(100 * i / ctr) + "% of given datased was classified as positive!\n\r"
    
def testClassifier(classifier, parser, label = "", amount = -1):
    print "***************"+ label +"***************"
    print "Spamassassin HAM"
    evaluateDataset(classifier, parser, "../ham.p", amount)
    gc.collect()
    print "Spamassassin SPAM"
    evaluateDataset(classifier, parser, "../spam.p", amount)
    gc.collect()
    print "Enron HAM"
    evaluateDataset(classifier, parser, "../Enron/ham.p", amount)
    gc.collect()
    print "Enron SPAM"
    evaluateDataset(classifier, parser, "../Enron/spam.p", amount)
    gc.collect()
    print "SPAM dataset"
    evaluateDataset(classifier, parser, "../SPAM/01.p")
    gc.collect()
    print "Classifier evaluation DONE"
    print "***************"+ label +"***************"

In [5]:
import sys
import resource

soft, hard = 5.0 * 10**9, 5.0 * 10**9
resource.setrlimit(resource.RLIMIT_AS,(soft, hard))

In [6]:
def buildTesting(pHam, pSpam, learningDataset = 500, label = "", dictLength = 250):    
    dHam, fHam = PickledDataProcessing.extractDictionary([pHam], learningDataset)
    dSpam, fSpam = PickledDataProcessing.extractDictionary([pSpam], learningDataset)
    print "Frequencies/Dictionaries loaded"

    optimziedDictionary = DictionaryUtils.dictionaryMI(dSpam, fSpam, dHam, fHam)[0:dictLength]
    parser = Parser(optimziedDictionary)

    spam = PickledDataProcessing.loadMails(parser, pSpam, learningDataset)
    ham = PickledDataProcessing.loadMails(parser, pHam, learningDataset)
    print "Prepared spam/ham"
    classifier = Classifier(spam + ham, [1] * len(spam) + [0] * len(ham))
    testClassifier(classifier, parser, label, 10000)
    return classifier, parser

In [10]:
buildTesting("../ham.p", "../spam.p", 500, "Spamassassin dataset", 250)
buildTesting("../Enron/ham.p", "../Enron/spam.p", 500, "Enron dataset", 250)

Frequencies/Dictionaries loaded
Prepared spam/ham
***************Spamassassin dataset***************
Spamassassin HAM
Dataset length: 2752
17% of given datased was classified as positive!

Spamassassin SPAM
Dataset length: 501
99% of given datased was classified as positive!

Enron HAM
Dataset length: 10000
77% of given datased was classified as positive!

Enron SPAM
Dataset length: 4502
88% of given datased was classified as positive!

SPAM dataset
Dataset length: 35600
62% of given datased was classified as positive!

Classifier evaluation DONE
***************Spamassassin dataset***************
Frequencies/Dictionaries loaded
Prepared spam/ham
***************Enron dataset***************
Spamassassin HAM
Dataset length: 2752
58% of given datased was classified as positive!

Spamassassin SPAM
Dataset length: 501
80% of given datased was classified as positive!

Enron HAM
Dataset length: 10000
11% of given datased was classified as positive!

Enron SPAM
Dataset length: 4502
89% of given

(<Classifier.Classifier instance at 0x7f79a0b59248>,
 <Parser.Parser instance at 0x7f79a0b59998>)