In [7]:
import math, random, re, pathlib, os
from collections import defaultdict, Counter

def get_data():
    """Collect ``(subject, is_spam)`` pairs from the ``emails`` directory tree.

    Walks ``<current working directory>/emails`` and reads every file found
    as bytes, decoding each line as UTF-8 and dropping undecodable bytes.
    Every line that starts with ``Subject:`` produces one record, so a file
    with several such lines contributes several records.

    A record is labelled spam when any directory component *below*
    ``emails`` contains the substring ``spam``.

    Returns:
        list[tuple[str, bool]]: the subject text (header name removed,
        surrounding whitespace stripped) and its spam label.
    """
    prefix = 'Subject:'
    data = []
    emailsdir = pathlib.Path.cwd() / 'emails'
    for dirpath, dirs, files in os.walk(emailsdir, topdown=False):
        # Look only at the components below ``emails``. Testing the whole
        # absolute path would label everything as spam whenever the working
        # directory itself happens to contain "spam" (e.g. ~/spamfilter/).
        subdirs = pathlib.Path(dirpath).relative_to(emailsdir).parts
        is_spam = any('spam' in part for part in subdirs)
        for file in files:
            with open(pathlib.Path(dirpath) / file, 'rb') as fin:
                for line in fin:
                    line = line.decode('utf-8', 'ignore')
                    if line.startswith(prefix):
                        # Drop only the leading header name; a later
                        # "Subject:" inside the text is part of the subject.
                        subject = line[len(prefix):].strip()
                        data.append((subject, is_spam))
    return data

def split(data, split_fraction=0.7, seed=None):
    """Randomly partition ``data`` into two lists.

    The input sequence is not modified; a shuffled copy is cut at
    ``int(split_fraction * len(data))``.

    Args:
        data: sequence of items to split.
        split_fraction: fraction of the items placed in the first list.
        seed: optional seed. When given, a private ``random.Random(seed)``
            does the shuffle so the same partition is produced on every
            call; when ``None`` the module-level ``random`` generator is
            used, as before.

    Returns:
        tuple[list, list]: ``(first_part, second_part)``.
    """
    if seed is None:
        shuffled_data = random.sample(data, k=len(data))
    else:
        # A dedicated generator keeps the split reproducible without
        # touching the global random state.
        shuffled_data = random.Random(seed).sample(data, k=len(data))
    split_ix = int(split_fraction * len(data))
    return shuffled_data[:split_ix], shuffled_data[split_ix:]

def counts(data):
    """Tally the labels of ``(item, label)`` pairs.

    Returns:
        tuple[int, int]: number of pairs labelled ``True`` followed by the
        number labelled ``False``.
    """
    tally = Counter()
    for _, label in data:
        tally[label] += 1
    return tally[True], tally[False]

def tokenize(message):
    """Return the set of distinct lower-cased words in ``message``.

    A word is a maximal run of the characters ``a-z``, ``0-9`` and the
    apostrophe, matched after the message has been lower-cased.
    """
    lowered = message.lower()
    return {word for word in re.findall(r"([a-z0-9']+)", lowered)}

def word_counts(data):
    """Count, per word, how many spam and non-spam messages contain it.

    Args:
        data: iterable of ``(message, is_spam)`` pairs.

    Returns:
        defaultdict: maps each word to ``[spam_count, nonspam_count]``.
        ``tokenize`` yields a set, so a word is counted at most once per
        message.
    """
    tallies = defaultdict(lambda: [0, 0])
    for text, spam_flag in data:
        # Column 0 holds spam counts, column 1 holds non-spam counts.
        column = 0 if spam_flag else 1
        for token in tokenize(text):
            tallies[token][column] += 1
    return tallies

def word_probabilities(wcounts, tot_spam, tot_nonspam, k=0.5):
    """Turn per-word counts into smoothed per-class probabilities.

    Args:
        wcounts: mapping of word to ``[spam_count, nonspam_count]``.
        tot_spam: total used in the spam denominator.
        tot_nonspam: total used in the non-spam denominator.
        k: smoothing constant added to each count (and twice to each total).

    Returns:
        list[tuple]: one ``(word, p_spam, p_nonspam)`` triple per word, where
        ``p_spam = (spam_count + k) / (tot_spam + 2*k)`` and ``p_nonspam`` is
        computed the same way from the non-spam figures.
    """
    triples = []
    for word, (spam_hits, nonspam_hits) in wcounts.items():
        p_spam = (spam_hits + k)/(tot_spam + 2*k)
        p_nonspam = (nonspam_hits + k)/(tot_nonspam + 2*k)
        triples.append((word, p_spam, p_nonspam))
    return triples

def nbclassify(wprobs, message):
    """Estimate the probability that ``message`` is spam.

    For every ``(word, pspam, pnonspam)`` triple in ``wprobs`` the word's
    probability is used when the word occurs in the message and its
    complement when it does not. The two classes are compared directly, with
    no class prior applied.

    Args:
        wprobs: iterable of ``(word, pspam, pnonspam)`` triples. Each
            probability must lie strictly between 0 and 1, otherwise
            ``math.log`` raises ``ValueError``.
        message: text to classify; it is split into words with ``tokenize``.

    Returns:
        float: ``P(spam) / (P(spam) + P(nonspam))``, a value in ``[0, 1]``.
    """
    log_prob_spam = log_prob_nonspam = 0.0
    message_words = tokenize(message)
    for word, pspam, pnonspam in wprobs:
        if word in message_words:
            log_prob_spam += math.log(pspam)
            log_prob_nonspam += math.log(pnonspam)
        else:
            log_prob_spam += math.log(1-pspam)
            log_prob_nonspam += math.log(1-pnonspam)
    # Exponentiating each log-likelihood on its own underflows to 0.0 for a
    # large vocabulary, which turned the final ratio into 0/0. The ratio
    # p_s / (p_s + p_n) equals the logistic function of the log-odds, which
    # can be evaluated without ever forming the tiny individual likelihoods.
    log_odds = log_prob_spam - log_prob_nonspam
    if log_odds >= 0:
        # exp() receives a non-positive argument, so it cannot overflow.
        return 1.0 / (1.0 + math.exp(-log_odds))
    odds = math.exp(log_odds)
    return odds / (1.0 + odds)

def run_test(data, wprobs):
    """Classify every labelled message and tally the outcomes.

    Args:
        data: iterable of ``(message, is_spam)`` pairs.
        wprobs: word probabilities passed through to ``nbclassify``.

    Returns:
        Counter: keyed by ``(predicted_spam, actual_spam)``, where a message
        is predicted spam when its spam probability exceeds 0.5.
    """
    return Counter(
        (nbclassify(wprobs, text) > 0.5, actual)
        for text, actual in data
    )


In [8]:
# End-to-end run: load subjects, split, train on one part, evaluate on the other.
# Load (subject, is_spam) pairs from the ``emails`` directory under the cwd.
data = get_data()
# Random split with split()'s default fraction (0.7 goes to ``train``). No seed
# is set in this cell, so the partition and the counts shown below can differ
# from run to run.
train, test = split(data)
# Per-word [spam, non-spam] message counts, computed from the training part only.
wcounts = word_counts(train)
# Number of spam and non-spam training messages, used as the denominators.
tot_spam, tot_nonspam = counts(train)
# Smoothed per-word probabilities (default k=0.5).
wprobs = word_probabilities(wcounts, tot_spam, tot_nonspam)
# Displayed result is a Counter keyed by (predicted_spam, actual_spam).
run_test(test, wprobs)

Counter({(True, True): 95,
         (False, False): 838,
         (False, True): 66,
         (True, False): 28})