In [1]:
import string
import math
from collections import defaultdict, Counter

In [2]:
# Load the labelled messages: one "text,label" record per line. The label is
# whatever follows the LAST comma, so commas inside the message text are kept.
data = []
with open("messages.data", "r") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank lines carry no record; skip them rather than crash on unpacking.
            continue
        if "," not in line:
            raise ValueError(
                f"messages.data line {line_no}: expected 'text,label', got {line!r}"
            )
        text, label = line.rsplit(",", 1)
        # Strip both fields: whitespace after the comma would otherwise end up
        # inside the label (e.g. ' spam' instead of 'spam').
        data.append((text.strip(), label.strip()))

In [3]:
def preprocess(text):
    """Tokenise a message: lowercase it, delete ASCII punctuation, split on whitespace.

    Punctuation characters are removed outright (not replaced by a space), so
    "don't" becomes the single token "dont".

    Args:
        text: the raw message string.

    Returns:
        A list of lowercase tokens; empty if nothing but punctuation/whitespace remains.
    """
    # Mapping every punctuation character to None makes translate() delete it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = text.lower()
    return lowered.translate(drop_punctuation).split()

In [4]:
# Gather the training statistics:
#   class_counts[label]   -> number of messages carrying that label
#   word_counts[label][w] -> occurrences of token w across that label's messages
#   vocab                 -> every distinct token seen in any message
class_counts = Counter(label for _, label in data)
word_counts = defaultdict(Counter)
vocab = set()

for message, label in data:
    tokens = preprocess(message)
    vocab.update(tokens)
    per_class = word_counts  # alias only; counts are stored per label below
    for token in tokens:
        per_class[label][token] += 1

total_docs = len(data)
classes = list(class_counts)  # labels in order of first appearance
print(len(vocab))

29


In [5]:
# Class prior: the fraction of training messages that carry each label.
priors = dict(
    (label_name, class_counts[label_name] / total_docs)
    for label_name in classes
)
print(priors)

{' spam': 0.5, ' ham': 0.5}


In [6]:
# Per-class word likelihoods with additive smoothing:
#   likelihood[c][w] = (count of w in class c + alpha) / (tokens in class c + alpha * V)
alpha = 1  # smoothing
V = len(vocab)
likelihood = {}
for c in classes:
    counts_for_class = word_counts[c]
    # The denominator is the same for every word of this class, so build it once.
    denominator = sum(counts_for_class.values()) + alpha * V
    smoothed = {}
    for w in vocab:
        smoothed[w] = (counts_for_class[w] + alpha) / denominator
    likelihood[c] = smoothed
    print(c, " : ", likelihood[c])

 spam  :  {'lunch': 0.023809523809523808, 'medicines': 0.047619047619047616, 'win': 0.047619047619047616, 'offer': 0.047619047619047616, 'to': 0.023809523809523808, 'lets': 0.023809523809523808, 'now': 0.047619047619047616, 'prize': 0.047619047619047616, 'the': 0.023809523809523808, 'catch': 0.023809523809523808, 'easily': 0.047619047619047616, 'me': 0.023809523809523808, 'for': 0.047619047619047616, 'over': 0.023809523809523808, 'up': 0.023809523809523808, 'call': 0.023809523809523808, 'submit': 0.023809523809523808, 'report': 0.023809523809523808, 'when': 0.023809523809523808, 'just': 0.047619047619047616, 'forget': 0.023809523809523808, 'cheap': 0.047619047619047616, 'buy': 0.047619047619047616, 'dont': 0.023809523809523808, 'get': 0.023809523809523808, 'you': 0.047619047619047616, 'time': 0.023809523809523808, 'limited': 0.047619047619047616, 'cash': 0.047619047619047616}
 ham  :  {'lunch': 0.043478260869565216, 'medicines': 0.021739130434782608, 'win': 0.021739130434782608, 'offer

In [7]:
def predict_nb(text):
    """Predict the label of ``text`` with the trained Naive Bayes model.

    Each class in ``classes`` is scored as ``log(priors[c])`` plus the sum of
    ``log(likelihood[c][w])`` over the message's tokens; the best-scoring
    class is returned.

    Tokens that are not keys of ``likelihood[c]`` (i.e. not in the training
    vocabulary) are ignored. Giving them a fallback probability of
    ``alpha / (tokens_in_class + alpha * V)`` would make the penalty depend
    only on how many tokens each class has, so every unknown word would tilt
    the result toward the class with fewer training tokens regardless of the
    message's content.

    Args:
        text: raw message string; tokenised with ``preprocess``.

    Returns:
        The highest-scoring label. On a tie (e.g. a message with no known
        tokens and equal priors) the class that comes first in ``classes`` wins.
    """
    words = preprocess(text)
    scores = {}
    for c in classes:
        word_probs = likelihood[c]  # looked up once per class, not once per word
        score = math.log(priors[c])
        for w in words:
            pw = word_probs.get(w)
            if pw is None:
                # Out-of-vocabulary token: carries no evidence for any class.
                continue
            score += math.log(pw)
        scores[c] = score
    return max(scores, key=scores.get)

In [8]:
# Smoke test: classify one sample message and show the predicted label.
test_msg = "My name is kn"
predicted_label = predict_nb(test_msg)
print(f"Message: “{test_msg}” → Predicted: {predicted_label}\n")

Message: “My name is kn” → Predicted:  spam

