In [9]:
import string
import math
from collections import defaultdict, Counter


In [10]:
# Load the labelled corpus into `data` as (text, label) tuples.
# Each record is one line of the form "<text>,<label>"; the label is whatever
# follows the LAST comma, so commas inside the message text are preserved.
data = []
with open("messages.data", "r") as f:
    for line_no, line in enumerate(f, start=1):
        record = line.strip()
        if not record:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; without this guard the two-way unpack below would fail.
            continue
        parts = record.rsplit(",", 1)
        if len(parts) != 2:
            # Same exception type the bare unpack raised, but it now says where.
            raise ValueError(
                f"messages.data line {line_no}: expected '<text>,<label>', got {record!r}"
            )
        text, label = parts
        data.append((text, label))

In [11]:
def preprocess(text):
    """Tokenise a message: lowercase it, drop ASCII punctuation, split on whitespace.

    Returns a list of word strings (empty when nothing but punctuation or
    whitespace remains).
    """
    # Map every character in string.punctuation to None so translate() deletes it.
    strip_punct = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = text.lower()
    cleaned = lowered.translate(strip_punct)
    return cleaned.split()

In [12]:
# Training statistics gathered in a single pass over `data`:
#   vocab        - every distinct token seen in any message
#   class_counts - number of messages per label
#   word_counts  - per-label token frequencies
vocab = set()
class_counts = Counter()
word_counts = defaultdict(Counter)

for msg, label in data:
    tokens = preprocess(msg)
    vocab |= set(tokens)
    class_counts.update([label])
    if tokens:
        # Only touch word_counts[label] when there is something to count, so a
        # token-less message does not create an empty per-class counter here.
        word_counts[label].update(tokens)

total_docs = len(data)
classes = list(class_counts)

In [13]:
# Class prior P(c): fraction of training messages carrying label c.
priors = dict((c, class_counts[c] / total_docs) for c in classes)

In [14]:
# Smoothed per-class word likelihoods:
#   P(w | c) = (count(w, c) + alpha) / (tokens in c + alpha * |vocab|)
alpha = 1  # smoothing
V = len(vocab)
likelihood = {}
for c in classes:
    class_word_counts = word_counts[c]
    total_words = sum(class_word_counts.values())
    denominator = total_words + alpha * V
    per_word = {}
    for w in vocab:
        # Counter returns 0 for words never seen in this class.
        per_word[w] = (class_word_counts[w] + alpha) / denominator
    likelihood[c] = per_word

In [15]:
def predict_nb(text):
    """Return the class label with the highest Naive Bayes log-score for `text`.

    The message is tokenised with `preprocess`. For each class the score is
    log(prior) plus the sum of the log-likelihoods of the tokens, and the
    highest-scoring class is returned.

    Reads the module-level `classes`, `priors`, `likelihood`, `word_counts`,
    `alpha` and `V` built by the earlier cells.

    Raises:
        ValueError: if `classes` is empty (nothing has been trained yet).
    """
    if not classes:
        raise ValueError("predict_nb: no classes available; build the model before predicting")

    words = preprocess(text)
    scores = {}
    for c in classes:
        word_probs = likelihood[c]
        # Loop-invariant per class: summing the class token total once here
        # avoids re-summing it for every word of the message.
        unseen_denominator = sum(word_counts[c].values()) + alpha * V

        # start with log-prior
        score = math.log(priors[c])
        for w in words:
            pw = word_probs.get(w)
            if pw is None:
                # likelihood[c] already has a smoothed entry for every word in
                # the vocabulary, so this branch is only reached for tokens that
                # are absent from it; they get the zero-count smoothed value.
                # NOTE(review): this value differs per class, so such tokens
                # still shift the ranking between classes - confirm intended.
                pw = alpha / unseen_denominator
            score += math.log(pw)
        scores[c] = score
    return max(scores, key=scores.get)

In [16]:
# Smoke test: classify one hand-written message and show the predicted label.
test_msg = "Get your cash prize now"
predicted_label = predict_nb(test_msg)
print(f"Message: “{test_msg}” → Predicted: {predicted_label}\n")

Message: “Get your cash prize now” → Predicted: spam

