In [9]:
import math, os, pathlib, random, re
from collections import Counter, defaultdict

def get_data():
    """Collect labelled subject lines from the ``emails`` directory tree.

    Walks ``<cwd>/emails`` recursively and, for every file found, keeps each
    line that starts with ``Subject:`` (after decoding as UTF-8 with
    undecodable bytes dropped, and stripping surrounding whitespace).

    A file is labelled spam when the path of its directory *relative to the
    emails root* contains ``'spam'``.  Only the relative part is inspected so
    that an ancestor directory whose name happens to contain "spam" (for
    example a checkout under ``~/spam-filter/``) does not mark every message
    as spam.

    Returns:
        list[tuple[str, bool]]: ``(subject_line, is_spam)`` pairs.
    """
    data = []
    emailsdir = pathlib.Path.cwd() / 'emails'
    for dirpath, _dirs, files in os.walk(emailsdir, topdown=False):
        if not files:
            continue
        # Label from the sub-path below emailsdir, never from its ancestors.
        is_spam = 'spam' in os.path.relpath(dirpath, emailsdir)
        for file in files:
            fp = pathlib.Path(dirpath) / file
            # Read bytes and decode leniently: mail files are not reliably UTF-8.
            with open(fp, 'rb') as fin:
                for raw_line in fin:
                    line = raw_line.decode('utf-8', 'ignore').strip()
                    if line.startswith('Subject:'):
                        data.append((line, is_spam))
    return data

def split(data, split_fraction=0.7, seed=None):
    """Shuffle ``data`` and split it into two lists.

    Args:
        data: Sequence of items to partition; it is not modified.
        split_fraction: Fraction of the items placed in the first list. The
            cut index is ``int(len(data) * split_fraction)``.
        seed: Optional seed for a private random generator, making the
            shuffle reproducible without touching global random state. When
            ``None`` (the default) the module-level ``random`` generator is
            used, as before.

    Returns:
        tuple[list, list]: ``(first_part, second_part)`` of the shuffled data.
    """
    rng = random if seed is None else random.Random(seed)
    # Sampling all k=len(data) items yields a shuffled copy of the input.
    shuffled_data = rng.sample(data, k=len(data))
    split_ix = int(len(data) * split_fraction)
    return shuffled_data[:split_ix], shuffled_data[split_ix:]

def ctspam(data):
    """Count spam and non-spam examples.

    Args:
        data: Iterable of ``(message, label)`` pairs.

    Returns:
        tuple[int, int]: ``(number labelled True, number labelled False)``.
    """
    tally = Counter(flag for _message, flag in data)
    n_spam = tally[True]
    n_nonspam = tally[False]
    return n_spam, n_nonspam

def tokenize(message):
    """Return the set of distinct lowercase tokens in ``message``.

    A token is a maximal run of the characters ``a-z``, ``0-9`` and the
    apostrophe, found after lowercasing the message.
    """
    lowered = message.lower()
    return {match.group() for match in re.finditer(r"[a-z0-9']+", lowered)}

def word_counts(data):
    """Count, per word, how many spam and non-spam messages contain it.

    Args:
        data: Iterable of ``(message, is_spam)`` pairs.

    Returns:
        defaultdict: Maps each word to a two-item list
        ``[spam_count, nonspam_count]``. A word is counted at most once per
        message because ``tokenize`` returns a set.
    """
    spam_slot, nonspam_slot = 0, 1
    tallies = defaultdict(lambda: [0, 0])
    for text, spam_flag in data:
        slot = spam_slot if spam_flag else nonspam_slot
        for token in tokenize(text):
            tallies[token][slot] += 1
    return tallies

def word_probabilities(wcounts, tot_spam, tot_nonspam, k=0.5):
    """Turn per-word message counts into smoothed conditional probabilities.

    Args:
        wcounts: Mapping of word to ``[spam_count, nonspam_count]``.
        tot_spam: Total number of spam messages.
        tot_nonspam: Total number of non-spam messages.
        k: Smoothing pseudo-count added to each count; ``2*k`` is added to
            each total.

    Returns:
        list[tuple]: ``(word, p_word_given_spam, p_word_given_nonspam)``
        triples, one per entry of ``wcounts``.
    """
    spam_denominator = tot_spam + 2*k
    nonspam_denominator = tot_nonspam + 2*k
    smoothed = []
    for word, (n_spam, n_nonspam) in wcounts.items():
        p_given_spam = (n_spam + k) / spam_denominator
        p_given_nonspam = (n_nonspam + k) / nonspam_denominator
        smoothed.append((word, p_given_spam, p_given_nonspam))
    return smoothed

def naivebayes_classify(wprobs, message):
    """Return the naive Bayes estimate that ``message`` is spam.

    Every vocabulary word contributes to both class likelihoods: its
    probability when it occurs in the message, one minus its probability
    when it does not. No class prior is applied, so the two classes are
    weighted equally.

    Args:
        wprobs: Iterable of ``(word, p_word_given_spam, p_word_given_nonspam)``
            triples. Each probability must lie strictly between 0 and 1.
        message: Text to classify; it is split into words with ``tokenize``.

    Returns:
        float: Value in ``[0, 1]``; higher means more likely spam. An empty
        ``wprobs`` gives 0.5.
    """
    log_prob_spam = log_prob_nonspam = 0.0
    message_words = tokenize(message)
    for word, pspam, pnonspam in wprobs:
        if word in message_words:
            log_prob_spam += math.log(pspam)
            log_prob_nonspam += math.log(pnonspam)
        else:
            log_prob_spam += math.log(1 - pspam)
            log_prob_nonspam += math.log(1 - pnonspam)

    # e^s / (e^s + e^n) == 1 / (1 + e^(n - s)).  Working with the difference
    # avoids exponentiating the two sums separately, which underflows to
    # 0.0 / 0.0 (ZeroDivisionError) once the vocabulary is large.
    log_ratio = log_prob_nonspam - log_prob_spam
    if log_ratio > 0:
        # Exponentiate only non-positive values so math.exp cannot overflow.
        ratio = math.exp(-log_ratio)
        return ratio / (1.0 + ratio)
    return 1.0 / (1.0 + math.exp(log_ratio))

def run_test(wprobs, test):
    """Classify every test message and tally the outcomes.

    Args:
        wprobs: Word probability triples passed to ``naivebayes_classify``.
        test: Iterable of ``(message, is_spam)`` pairs.

    Returns:
        Counter: Keyed by ``(actual_is_spam, predicted_is_spam)``, where a
        message is predicted spam when its score is at least 0.5.
    """
    outcomes = (
        (actual, naivebayes_classify(wprobs, text) >= 0.5)
        for text, actual in test
    )
    return Counter(outcomes)


In [12]:
# Load every (subject line, is_spam) pair found under ./emails.
data = get_data()
# Shuffle and split; the default split_fraction puts 70% of the data in train.
train, test = split(data)
# Class totals and per-word counts come from the training portion only.
tot_spam, tot_nonspam = ctspam(train)
wcounts = word_counts(train)
# Smoothed (word, P(word|spam), P(word|non-spam)) triples, default k=0.5.
wprobs = word_probabilities(wcounts, tot_spam, tot_nonspam)
# Displayed as the cell's result: a Counter keyed by
# (actual is_spam, predicted is_spam) over the held-out messages.
run_test(wprobs, test)

Counter({(False, False): 839,
         (True, True): 99,
         (False, True): 40,
         (True, False): 51})