In [1]:
import re
import numpy
from cvxpy import *
import email
import os
import pickle
import math

In [2]:
class Parser:
    dictionary = None
    
    #stripHeaders helper conjoining multiple substitutions into one regex
    @staticmethod
    def multiRegexSubstitute(regexArray, text, substitute = ""):
        rules = "|".join(regexArray)
        pattern = re.compile(rules)
        return pattern.sub(substitute, text)
    
    #Parse the standard email structure into a plaintext (strip headers, footers and section separators)
    @staticmethod  
    def stripHeaders(mail):
        text = ""
        for msg in email.message_from_string(mail).walk():
            try:
                text += msg.get_payload() + " "
            except:
                continue
                
        if "delivered-to" in text:
            text = text[re.search("\n\n", text).start()+1:]
       
        #REGEX punctuation mark removal
        text = Parser.multiRegexSubstitute(["\.", "\?", ";", ",", "\"", "'", "=", "#", "[0-9]", "\*", "!", "%"], text) # non-expanding substitutions
        text = Parser.multiRegexSubstitute(["-", "\(", "\)", "\n", "\t", "&nbsp", "_", "&", "$", "@", ":", "\[", "\]"], text, " ") # expanding substitutions

        text = re.sub("<[^<>]*>", " ", text) #remove HTML/XML tags
        text = re.sub(" +" ," ", text) #finisher whitespace removal
        return text
    
    def __init__(self, dictionary):
        self.dictionary = dictionary

    #Translate a given email according into a word vector according to the preset dictionary
    def parseEmail(self, mail):
        parsedData = [0] * len(self.dictionary)
        plaintext = Parser.stripHeaders(mail).split(" ")
        for x in plaintext:
            if x in self.dictionary:
                parsedData[self.dictionary.index(x)] += 1
        return parsedData

In [3]:
class FilterSPAM:
    a = None
    b = None
    
    #CVXPY convex optimization solver, given vectorized emailes it tries to find optimal parameter values for logistic regression curve.
    @staticmethod
    def getModelParams(data, labels, size):
        a = Variable(size, 1)
        b = Variable()
        
        spam = sum(labels)

        testing = numpy.matrix(labels)
            
        logLogistic = sum([logistic(data[i] * a + b) for i in xrange(len(data))])
        positive = testing * data * a  + b * spam
        problem = Problem(Maximize(positive - logLogistic))
        
        result = problem.solve()    
        return a.value, b.value
    
    #Classify sample
    def evaluate(self, sample):
        if self.a is None or self.b is None:
            raise ValueError('Model parameters not set!')
        return math.e**(sample * self.a + self.b)[0, 0] / (1 + math.e**(sample * self.a + self.b)[0, 0])
    
    def __init__(self, data, labels):
        self.a, self.b = FilterSPAM.getModelParams(numpy.matrix(data), labels, len(data[0]))

In [4]:
class PickledDataProcessing:
    #Append new data to existing dictionary/frequency database
    @staticmethod
    def createDictionary(dictionary, frequency, files):
        tmpDict = []
        tmpFreq = []
        tmpAppends = []
                
        #Create small dict
        for file in files:
            for word in file.split(" "):
                if word in tmpDict:
                    tmpFreq[tmpDict.index(word)] += 1
                else:
                    tmpDict.append(word)
                    tmpFreq.append(1)

        #Merge old and new dict
        for i in xrange(len(tmpDict)):
            if tmpDict[i] in dictionary:
                frequency[dictionary.index(tmpDict[i])] += tmpFreq[i]
            else:
                tmpAppends.append(tmpDict[i])
                frequency.append(tmpFreq[i])
                
        return dictionary + tmpAppends, frequency

    #Expand the database from a given pickled file. Merge every mergeAfter messages.
    @staticmethod
    def crawlMails(dictionary, frequency, dump, breakAfter = 10000, mergeAfter = 10000):
        i = 0
        j = 0
        spam = pickle.load(open( dump, "rb" ))
        files = []
        for text in spam:
            content = (Parser.stripHeaders(text)).lower()
            files.append(content)
            if i == breakAfter:
                break;
            if j == mergeAfter:
                dictionary, frequency = PickledDataProcessing.createDictionary(dictionary, frequency, files)
                del(files)
                files = []
                j = -1
            i+=1        
            j+=1
        dictionary, frequency = PickledDataProcessing.createDictionary(dictionary, frequency, files)
        return dictionary, frequency
    
    #Load SPAM/HAM files from two pickled databases. Return them as an input data for classifier parsed with a given parser.
    @staticmethod
    def loadMails(parser, pickledFile, count = -1):
        msgs = []
        ctr = 0
        for i in pickle.load(open( pickledFile, "rb" )):
            msgs.append(parser.parseEmail(i))
            if (count != -1):
                if ctr == count:
                    break
                ctr += 1
                
        return msgs

    #Cut the extremes off the dictionary
    @staticmethod
    def simplifyDictionary(dictionary, frequency, percentageUpperBound = 0.95, minOccurences = 200):
        newDict = []
        maximum = max(frequency)
        for i in xrange(len(dictionary)):
            if frequency[i] <= maximum * percentageUpperBound and frequency[i] >= minOccurences:
                newDict.append(dictionary[i])
        return newDict
    
    #Serialize all mails in given directory into one pickle file of the same name.
    @staticmethod
    def picklizeDirectory(rootDir):
        msgs = []
        for filename in os.listdir(rootDir):
            with open(rootDir + "/" + filename) as f:
                msgs.append(f.read().lower())
        print "total # of emails: " + str(len(msgs))
        pickle.dump( msgs, open( rootDir + ".p", "wb" ))

In [5]:
def buildDictionary(pickledHam, pickledSpam, amount = 2000, dictionarySize = 15000):
    d, f = PickledDataProcessing.crawlMails([], [], pickledHam, breakAfter = amount)
    d, f = PickledDataProcessing.crawlMails(d, f, pickledSpam, breakAfter = amount)
    
    topBarrier = 1
    botBarrier = 0
    sim = PickledDataProcessing.simplifyDictionary(d, f, topBarrier, botBarrier)
    while len(sim) > dictionarySize and dictionarySize != -1:
        topBarrier -= 0.0025
        botBarrier += 1
        sim = PickledDataProcessing.simplifyDictionary(d, f, topBarrier, botBarrier)
    return sim

def buildClassifier(pickledHam, pickledSpam, parser, amount = 2000):
    spam = PickledDataProcessing.loadMails(parser, pickledSpam, amount)
    ham = PickledDataProcessing.loadMails(parser, pickledSpam, amount)
    
    classifier = FilterSPAM(spam + ham, [1] * len(spam) + [0] * len(ham))
    return classifier

def optimizeDictionary(dictionary, classifier):
    newDict = []
    for i in xrange(len(dictionary)):
        if classifier.a[i, 0] != 0:
            newDict.append(dictionary[i])
    return newDict

def testDataset(classifier, dataset):
    i = 0
    for x in dataset:
        if classifier.evaluate(x) > 1.0 / 2:
            i += 1
    print str(100 * i / len(dataset)) + "% of given datased were classified as positive!"


In [6]:
import gc

def test(label, pickledHam, pickledSpam, amount):
    print "******************************"
    print label +" START"

    dictionary = buildDictionary(pickledHam, pickledSpam, amount = amount, dictionarySize = -1)    
    #dictionary = pickle.load(open("dictionary.p","rb"))
    newDictionary = optimizeDictionary(dictionary, buildClassifier(pickledHam, pickledSpam, Parser(dictionary), amount = amount))
    parser = Parser(newDictionary)
    classifier = buildClassifier(pickledHam, pickledSpam, parser, amount = amount)
    
    print "Classifier stats:"
    print "HAM"
    testDataset(classifier, PickledDataProcessing.loadMails(parser, pickledHam, amount))
    print "SPAM"
    testDataset(classifier, PickledDataProcessing.loadMails(parser, pickledSpam, amount))
    
    print "Spamassassin ham"
    testDataset(classifier, PickledDataProcessing.loadMails(parser, "ham.p", 10000))
    gc.collect()
    print "Spamassassin easy_ham"
    testDataset(classifier, PickledDataProcessing.loadMails(parser, "easy_ham.p", 10000))
    gc.collect()
    print "Spamassassin hard_ham"
    testDataset(classifier, PickledDataProcessing.loadMails(parser, "hard_ham.p", 10000))
    gc.collect()
    print "Spamassassin spam"
    testDataset(classifier, PickledDataProcessing.loadMails(parser, "spam.p", 10000))
    gc.collect()
    print "SPAM spam"
    testDataset(classifier, PickledDataProcessing.loadMails(parser, "SPAM/01.p", 10000))
    gc.collect()
    print "Enron ham"
    testDataset(classifier, PickledDataProcessing.loadMails(parser, "Enron/ham.p", 10000))
    gc.collect()
    print "Enron spam"
    testDataset(classifier, PickledDataProcessing.loadMails(parser, "Enron/spam.p", 10000))
    gc.collect()
    print label + " END" 
    print "******************************"

In [7]:
import sys
import resource

soft, hard = 4.8 * 10**9, 5 * 10**9
resource.setrlimit(resource.RLIMIT_AS,(soft, hard))

In [None]:
def mergePickledFiles(files, output):
    result = []
    for f in files:
        result += pickle.load(open( f, "rb" ))
    result.sort()
    pickle.dump(result, open( output + ".p", "wb" ))

In [None]:
dataset = 500

test("TEST ENRON", "Enron/ham.p", "Enron/spam.p", dataset)
test("TEST SPAMASSASSIN", "ham.p", "spam.p", dataset)
test("TEST E/S1", "Enron/ham.p", "SPAM/01.p", dataset)
test("TEST A/S1", "ham.p", "SPAM/01.p", dataset)
test("TEST E/S2", "Enron/ham.p", "SPAM/02.p", dataset)
test("TEST A/S2", "ham.p", "SPAM/02.p", dataset)

******************************
TEST ENRON START
Classifier stats:
HAM
48% of given datased were classified as positive!
SPAM
94% of given datased were classified as positive!
Spamassassin ham
53% of given datased were classified as positive!
Spamassassin easy_ham
53% of given datased were classified as positive!
Spamassassin hard_ham
43% of given datased were classified as positive!
Spamassassin spam
60% of given datased were classified as positive!
SPAM spam
10% of given datased were classified as positive!
Enron ham
44% of given datased were classified as positive!
Enron spam
50% of given datased were classified as positive!
TEST ENRON END
******************************
******************************
TEST SPAMASSASSIN START
Classifier stats:
HAM
64% of given datased were classified as positive!
SPAM
99% of given datased were classified as positive!
Spamassassin ham
55% of given datased were classified as positive!
Spamassassin easy_ham
54% of given datased were classified as positive