In [None]:
# Toy training corpus. Every message is written next to its class and the
# pairs are then transposed by zip into the two parallel lists used below,
# so a message and its label cannot drift out of alignment.
emails, labels = (
    list(column)
    for column in zip(
        ("win money now", "spam"),
        ("cheap money loans", "spam"),
        ("meeting schedule today", "ham"),
        ("win money prizes", "spam"),
        ("project schedule update", "ham"),
        ("cheap loans win money", "spam"),
    )
)


In [None]:
from collections import defaultdict
from pprint import pprint

def build_vocab(corpus):
    """Assign a column index to every distinct whitespace-separated token.

    Indices are handed out in order of first appearance, starting at 0.

    Args:
        corpus: iterable of strings.

    Returns:
        dict mapping token -> integer index.
    """
    index_of = {}
    tokens = (tok for text in corpus for tok in text.split())
    for tok in tokens:
        # setdefault leaves already-seen tokens untouched; the default is the
        # current size of the mapping, i.e. the next free index.
        index_of.setdefault(tok, len(index_of))
    return index_of

# Word -> column-index mapping for the toy corpus. Kept as a notebook-level
# name because the code below reads `vocab` directly.
vocab = build_vocab(emails)

def vectorize(email, vocab):
    """Encode an email as a binary presence vector over the vocabulary.

    Args:
        email: string; it is split on whitespace into tokens.
        vocab: mapping token -> column index.

    Returns:
        list of len(vocab) ints, 1 where the token at that index occurs in
        the email and 0 elsewhere. Tokens missing from vocab are ignored.
    """
    bits = [0] * len(vocab)
    known_columns = (vocab[tok] for tok in email.split() if tok in vocab)
    for col in known_columns:
        # Presence only: repeated tokens still yield a single 1.
        bits[col] = 1
    return bits

# Binary presence vector for each training email, in corpus order.
vectors = list(map(lambda text: vectorize(text, vocab), emails))


# Sanity check: the vector width should equal the vocabulary size.
print(["Words", vocab.keys()])
print(["Unique Word Count", len(vocab)])
print(["Vector Column Count", len(vectors[0])])

# Show each vector beside the email it encodes.
ve = [(vec, text) for vec, text in zip(vectors, emails)]
pprint(["Vectors:", ve])

['Words', dict_keys(['win', 'money', 'now', 'cheap', 'loans', 'meeting', 'schedule', 'today', 'prizes', 'project', 'update'])]
['Unique Word Count', 11]
['Vector Column Count', 11]
['Vectors:',
 [([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], 'win money now'),
  ([0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0], 'cheap money loans'),
  ([0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0], 'meeting schedule today'),
  ([1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0], 'win money prizes'),
  ([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1], 'project schedule update'),
  ([1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0], 'cheap loans win money')]]


In [None]:
import math

def train_naive_bayes(vectors, labels, vocab=None):
    """Fit a binary-feature (Bernoulli) Naive Bayes spam/ham model.

    Args:
        vectors: non-empty sequence of equal-length 0/1 feature lists, one
            per training email.
        labels: sequence with the same length as ``vectors``. Entries equal
            to "spam" or "ham" are counted; any other value is ignored.
        vocab: word -> column-index mapping stored in the returned model
            (``predict_proba`` reads it to vectorize new emails). When
            omitted, the notebook-level ``vocab`` is used so that existing
            two-argument calls keep working.

    Returns:
        dict with keys "p_spam" and "p_ham" (class priors), "p_w_spam" and
        "p_w_ham" (per-column smoothed P(word present | class)), and "vocab".

    Raises:
        ValueError: if ``vectors`` is empty or its length differs from
            ``labels``.
        NameError: if ``vocab`` is omitted and no notebook-level ``vocab``
            is defined.
    """
    n = len(vectors)
    if n == 0:
        raise ValueError("train_naive_bayes needs at least one training vector")
    if len(labels) != n:
        # zip() below would silently drop the surplus while the priors are
        # still divided by n, so reject the mismatch up front.
        raise ValueError(
            "got {} vectors but {} labels; they must pair up one-to-one".format(
                n, len(labels)
            )
        )

    if vocab is None:
        # Backward compatibility: the vocabulary used to be read implicitly
        # from the notebook's global namespace.
        try:
            vocab = globals()["vocab"]
        except KeyError:
            raise NameError(
                "no vocab passed and no notebook-level 'vocab' is defined"
            ) from None

    vocab_size = len(vectors[0])

    spam_count = sum(1 for label in labels if label == "spam")
    ham_count = sum(1 for label in labels if label == "ham")

    p_spam = spam_count / n
    p_ham = ham_count / n

    # Per-column count of emails of each class that contain the word.
    spam_word_count = [0] * vocab_size
    ham_word_count = [0] * vocab_size

    for vec, label in zip(vectors, labels):
        if label == "spam":
            counts = spam_word_count
        elif label == "ham":
            counts = ham_word_count
        else:
            continue
        for i, bit in enumerate(vec):
            if bit == 1:
                counts[i] += 1

    # Laplace smoothing for binary NB: +1 to the count and +2 to the class
    # size (one pseudo-email with the word, one without), so no estimate is
    # ever exactly 0 or 1.
    p_word_given_spam = [(c + 1) / (spam_count + 2) for c in spam_word_count]
    p_word_given_ham = [(c + 1) / (ham_count + 2) for c in ham_word_count]

    return {
        "p_spam": p_spam,
        "p_ham":  p_ham,
        "p_w_spam": p_word_given_spam,
        "p_w_ham":  p_word_given_ham,
        "vocab": vocab
    }

# Fit the spam/ham model on the toy corpus vectors and labels defined above.
model = train_naive_bayes(vectors, labels)


In [None]:
def _log_prob(p):
    """Return log(p), mapping a probability of zero (or less) to -inf."""
    return math.log(p) if p > 0 else float("-inf")


def predict_proba(email, model):
    """Return the posterior probabilities (P(spam), P(ham)) for an email.

    Args:
        email: string to classify; vectorized with ``model["vocab"]``.
        model: dict with keys "p_spam", "p_ham", "p_w_spam", "p_w_ham" and
            "vocab", as returned by ``train_naive_bayes``.

    Returns:
        tuple (p_spam, p_ham) of floats that sum to 1.

    Raises:
        ValueError: if both classes have zero probability for this email.
    """
    vec = vectorize(email, model["vocab"])

    log_spam = _log_prob(model["p_spam"])
    log_ham = _log_prob(model["p_ham"])

    for i, bit in enumerate(vec):
        pw_spam = model["p_w_spam"][i]
        pw_ham = model["p_w_ham"][i]

        if bit == 1:
            log_spam += _log_prob(pw_spam)
            log_ham += _log_prob(pw_ham)
        else:
            # Bernoulli NB: the absence of a word is evidence too.
            log_spam += _log_prob(1 - pw_spam)
            log_ham += _log_prob(1 - pw_ham)

    # Normalise relative to the larger log score. Exponentiating the raw
    # scores underflows to 0.0 for large vocabularies, which would make the
    # division below fail with ZeroDivisionError.
    top = max(log_spam, log_ham)
    if top == float("-inf"):
        raise ValueError("both classes have zero probability for this email")
    spam_score = math.exp(log_spam - top)
    ham_score = math.exp(log_ham - top)
    total = spam_score + ham_score
    return spam_score / total, ham_score / total


In [None]:
# Unseen phrases to score. Words that are not in the training vocabulary
# (e.g. "a", "million", "dollars", "available", "prize") are skipped by
# vectorize, so only the known words, and the absence of the remaining
# vocabulary words, influence the scores.
tests = [
    "win a million dollars",
    "schedule meeting update",
    "cheap loans available",
    "money prize now"
]

for t in tests:
    # predict_proba returns (P(spam), P(ham)); both are shown rounded to 4 places.
    p_spam, p_ham = predict_proba(t, model)
    print(t, "→ spam=", round(p_spam, 4), "ham=", round(p_ham, 4))


win a million dollars → spam= 0.9146 ham= 0.0854
schedule meeting update → spam= 0.0047 ham= 0.9953
cheap loans available → spam= 0.9414 ham= 0.0586
money prize now → spam= 0.9757 ham= 0.0243


## Getting the dataset

In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI repository dataset id 94; the metadata printed below names it
# "Spambase". Presumably this downloads over the network - confirm before
# running offline.
spambase = fetch_ucirepo(id=94)

# Features and targets (as pandas dataframes). Per the printed metadata there
# are 4601 instances and 57 features, and the target column is 'Class'.
x = spambase.data.features
y = spambase.data.targets

# Dataset-level metadata (name, URLs, instance/feature counts, creators, ...)
print(spambase.metadata)

# Per-variable information
print(spambase.variables)

{'uci_id': 94, 'name': 'Spambase', 'repository_url': 'https://archive.ics.uci.edu/dataset/94/spambase', 'data_url': 'https://archive.ics.uci.edu/static/public/94/data.csv', 'abstract': 'Classifying Email as Spam or Non-Spam', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 4601, 'num_features': 57, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1999, 'last_updated': 'Mon Aug 28 2023', 'dataset_doi': '10.24432/C53G6X', 'creators': ['Mark Hopkins', 'Erik Reeber', 'George Forman', 'Jaap Suermondt'], 'intro_paper': None, 'additional_info': {'summary': 'The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography...\n\nThe classification task for this dataset is to determine whether a given email is spam or not.\n\t\nOur collecti

In [None]:
# Plain-text dump of the feature frame; the output elides the middle rows
# ("...") and wraps the wide column set across several blocks.
print(x)

      word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0               0.00               0.64           0.64           0.0   
1               0.21               0.28           0.50           0.0   
2               0.06               0.00           0.71           0.0   
3               0.00               0.00           0.00           0.0   
4               0.00               0.00           0.00           0.0   
...              ...                ...            ...           ...   
4596            0.31               0.00           0.62           0.0   
4597            0.00               0.00           0.00           0.0   
4598            0.30               0.00           0.30           0.0   
4599            0.96               0.00           0.00           0.0   
4600            0.00               0.00           0.65           0.0   

      word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0              0.32            0.00              0.00     