In [20]:
import math, random, re, os, pathlib
from collections import defaultdict, Counter

def get_data():
    data = []
    emaildir = pathlib.Path.cwd() / 'emails'
    for dirpath, dirs, files in os.walk(emaildir, topdown=False):
        if files:
            is_spam = 'spam' in dirpath
            for file in files:
                filepath = pathlib.Path(dirpath) / file
                with open(filepath, 'rb') as fin:
                    for line in fin:
                        line = line.decode('utf-8', 'ignore')
                        if line.startswith('Subject:'):
                            line = line.replace('Subject:', '')
                            line = line.strip()
                            data.append((line, is_spam))
    return data

def split(data, split_fraction=0.7):
    shuffled_data = random.sample(data, k=len(data))
    split_ix = int(split_fraction * len(data))
    return shuffled_data[:split_ix], shuffled_data[split_ix:]

def count_spams(data):
    ctspam = sum([1 for msg, label in data if label == True])
    ctnonspam = sum([1 for msg, label in data if label == False])
    return ctspam, ctnonspam

def tokenize(message):
    message = message.lower()
    words = re.findall(r"[a-z0-9']+", message)
    return set(words)

def word_counts(data):
    wcounts = defaultdict(lambda: [0,0])
    for message, is_spam in data:
        words = tokenize(message)
        for word in words:
            wcounts[word][0 if is_spam else 1] += 1
    return wcounts

def word_probabilities(wcounts, tot_spams, tot_nonspams, k=0.5):
    return [
        (w, (ct_spam + k)/(tot_spams + 2*k), (ct_nonspam + k)/(tot_nonspams + 2*k))
            for w, [ct_spam, ct_nonspam] in wcounts.items()
    ]

def naivebayes_classify(message, wprobs):
    log_prob_spam = log_prob_nonspam = 0.0
    message_words = tokenize(message)
    for word, pspam, pnonspam in wprobs:
        if word in message_words:
            log_prob_spam += math.log(pspam)
            log_prob_nonspam += math.log(pnonspam)
        else:
            log_prob_spam += math.log(1 - pspam)
            log_prob_nonspam += math.log(1 - pnonspam)
    prob_spam = math.exp(log_prob_spam)
    prob_nonspam = math.exp(log_prob_nonspam)
    return prob_spam / (prob_spam + prob_nonspam)

def run_test(data, wprobs):
    results = []
    for msg, is_spam in data:
        pspam = naivebayes_classify(msg, wprobs)
        results.append((is_spam, pspam >= 0.5))
    return Counter(results)


In [21]:
data = get_data()
train, test = split(data)
tot_spam, tot_nonspam = count_spams(train)
wcounts = word_counts(train)
wprobs = word_probabilities(wcounts, tot_spam, tot_nonspam)
run_test(test, wprobs)

Counter({(False, False): 839,
         (True, True): 94,
         (True, False): 61,
         (False, True): 33})