In [1]:
import re
import numpy
from cvxpy import *
import email
import os
import pickle

In [2]:
class Parser:
    dictionary = None
    
    #stripHeaders helper conjoining multiple substitutions into one regex
    @staticmethod
    def multiRegexSubstitute(regexArray, text, substitute = ""):
        rules = "|".join(regexArray)
        pattern = re.compile(rules)
        return pattern.sub(substitute, text)
    
    #Parse the standard email structure into a plaintext (strip headers, footers and section separators)
    @staticmethod  
    def stripHeaders(mail):
        text = ""
        for msg in email.message_from_string(mail).walk():
            try:
                text += msg.get_payload() + " "
            except:
                continue

        #REGEX diacritics removal
        text = re.sub("<[^<>]*>", " ", text) #remove HTML/XML tags
        
        text = Parser.multiRegexSubstitute(["\.", "\?", ";", ",", "\"", "'", "=", "#", "[0-9]", "\*", "!", "%"], text) # non-expanding substitutions
        text = Parser.multiRegexSubstitute(["-", "\(", "\)", "\n", "\t", "&nbsp", "_", "&", "$", "@", ":"], text, " ") # expanding substitutions
        
        text = re.sub(" +" ," ", text) #finisher whitespace removal
        
        return text
    
    def __init__(self, dictionary):
        self.dictionary = dictionary

    #Translate a given email according into a word vector according to the preset dictionary
    def parseEmail(self, mail):
        parsedData = [0] * len(self.dictionary)
        plaintext = Parser.stripHeaders(mail).split(" ")
        for x in plaintext:
            if x in self.dictionary:
                parsedData[self.dictionary.index(x)] += 1
        return parsedData

In [3]:
class FilterSPAM:
    a = None
    b = None
    
    #CVXPY convex optimization solver, given vectorized emailes it tries to find optimal parameter values for logistic regression curve.
    @staticmethod
    def getModelParams(data, labels, size):
        a = Variable(size, 1)
        b = Variable()
        spam = labels.index(False)
        
        testing = numpy.zeros((len(data), 1))
        for i in range(len(labels)):
            if not labels[i]:
                break
            testing[i, 0] = 1
            
        logLogistic = sum([logistic(data[i] * a + b) for i in range(len(data))])
        positive = testing.T * data * a  + b * spam
        problem = Problem(Maximize(positive - logLogistic))
        
        result = problem.solve()    
        return a.value, b.value
    
    #Classify sample
    def evaluate(self, sample):
        if self.a is None or self.b is None:
            raise ValueError('Model parameters not set!')
        return 1.0 / (1 + math.e**(sample * self.a + self.b)[0, 0])
    
    def __init__(self, data, labels):
        self.a, self.b = FilterSPAM.getModelParams(numpy.matrix(data), labels, len(data[0]))

In [6]:
class PickledDataProcessing:
    #Append new data to existing dictionary/frequency database
    @staticmethod
    def createDictionary(dictionary, frequency, files):
        tmpDict = []
        tmpFreq = []
        tmpAppends = []

        #Create small dict
        for file in files:
            for word in file.split(" "):
                if word in tmpDict:
                    tmpFreq[tmpDict.index(word)] += 1
                else:
                    tmpDict.append(word)
                    tmpFreq.append(1)

        #Merge old and new dict
        for i in range(len(tmpDict)):
            if tmpDict[i] in dictionary:
                frequency[dictionary.index(tmpDict[i])] += tmpFreq[i]
            else:
                tmpAppends.append(word)
                frequency.append(tmpFreq[i])
        return dictionary + tmpAppends, frequency

    #Expand the database from a given pickled file. Merge every mergeAfter messages.
    @staticmethod
    def crawlMails(dictionary, frequency, dump, mergeAfter = 10000):
        i = 0
        spam = pickle.load(open( dump, "rb" ))
        files = []
        for text in spam:
            content = (Parser.stripHeaders(text)).lower()
            files.append(content)
            if i==mergeAfter:
                i = -1
                dictionary, frequency = PickledDataProcessing.createDictionary(dictionary, frequency, files)
                del(files)
                files = []
                break
            i+=1        
        return dictionary, frequency
    
    #Load SPAM/HAM files from two pickled databases. Return them as an input data for classifier parsed with a given parser.
    @staticmethod
    def loadMails(parser, spamFile, hamFile, count = -1):
        spam = []
        msgs = []
        
        ctr = 0
        print "Load SPAM"
        for i in pickle.load(open( spamFile, "rb" )):
            spam.append(parser.parseEmail(i))
            if (count != -1):
                if ctr == count:
                    ctr = 0
                    break
                ctr += 1

        print "Load HAM"
        for i in pickle.load(open( hamFile, "rb" )):
            msgs.append(parser.parseEmail(i))
            if (count != -1):
                if ctr == count:
                    ctr = 0
                    break
                ctr += 1

        return spam, msgs

    #Cut the extremes off the dictionary
    @staticmethod
    def simplifyDictionary(dictionary, frequency, percentageUpperBound = 0.95, minOccurences = 200):
        newDict = []
        maximum = max(frequency)
        for i in range(len(dictionary)):
            if frequency[i] < maximum * percentageUpperBound and frequency[i] > minOccurences:
                newDict.append(dictionary[i])
        return newDict

In [11]:
d, f = PickledDataProcessing.crawlMails([], [], "SPAM.p", 2000)
d, f = PickledDataProcessing.crawlMails(d, f, "HAM.p", 2000)
sim = PickledDataProcessing.simplifyDictionary(d, f, 0.995, 5)
print len(sim)
parser = Parser(sim)
s, m = PickledDataProcessing.loadMails(parser, "SPAM.p", "HAM.p", 2000)
print "DONE preprocessing"

fil = FilterSPAM(s + m, [True] * len(s) + [False] * len(m))
print fil.a
print fil.b

print "DONE learning"

8242
Load SPAM
Load HAM
DONE preprocessing
[[ 3.85362574]
 [ 0.        ]
 [ 0.        ]
 ..., 
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
-4.48312629543
DONE learning


[[ 3.85362574]
 [ 0.        ]
 [ 0.        ]
 ..., 
 [ 0.        ]
 [ 0.        ]
 [ 0.        ]]
-4.48312629543


In [38]:
class CustomDataPickling:
    @staticmethod
    def picklizeSPAM():  
        msgs = []
        for directory in os.listdir("./SPAM"):
            for filename in os.listdir("./SPAM/"+directory):
                with open("./SPAM/"+directory+"/"+filename) as f:
                    msgs.append(f.read().lower())
            print directory + " with total # of emails: " + str(len(msgs))
        pickle.dump( msgs, open( "SPAM.p", "wb" ))

    @staticmethod
    def picklizeEnron():  
        msgs = []

        for directory in os.listdir("./Enron"):
            if "inbox" not in os.listdir("./Enron/"+directory):
                continue
            for filename in os.listdir("./Enron/"+directory+"/inbox"):
                try:
                    with open("./Enron/"+directory+"/inbox/"+filename) as f:
                        msgs.append(f.read().lower())
                except:
                    None
            print directory + " with total # of emails: " + str(len(msgs))
        pickle.dump( msgs, open( "HAM.p", "wb" ))
    
    @staticmethod
    def picklizeHAM():
        msgs = []
        for filename in os.listdir("./HAM"):
            with open("./HAM/"+filename) as f:
                msgs.append(f.read().lower())
        print "total # of emails: " + str(len(msgs))
        pickle.dump( msgs, open( "HAM.p", "wb" ))

In [12]:
import math
i = 0
for x in s:
    test = fil.evaluate(x)
    if not test < 1.0 / 2:
        i += 1
print str(100 * i / len(s)) + "% of SPAM passed through filter"

i = 0
for x in m:
    test = fil.evaluate(x)
    if test < 1.0 / 2:
        i += 1
print str(100 * i / len(m)) + "% of MESSAGES were filtered"

52% of SPAM passed through filter
1% of MESSAGES were filtered
