# Exercise for Unit 4.1 Na√Øve Bayes

In [None]:
import re
from collections import defaultdict, Counter

documents = [
    ("Free money now!!!", "SPAM"),
    ("Hi mom, how are you?", "HAM"),
    ("Lowest price for your meds", "SPAM"),
    ("Are we still on for dinner?", "HAM"),
    ("Win a free iPhone today", "SPAM"),
    ("Let's catch up tomorrow at the office", "HAM"),
    ("Meeting at 3 PM tomorrow", "HAM"),
    ("Get 50% off, limited time!", "SPAM"),
    ("Team meeting in the office", "HAM"),
    ("Click here for prizes!", "SPAM"),
    ("Can you send the report?", "HAM")
]

#text preprocess
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text.split()

#build vocab
def build_vocab(documents):
    vocab = set()
    for text, _ in documents:
        words = preprocess(text)
        vocab.update(words)
    return list(vocab)

def bag_of_words(documents, vocab):
    word_counts = {
        "HAM": defaultdict(int),
        "SPAM": defaultdict(int),
    }
    class_counts = {"HAM": 0, "SPAM": 0}

    for text, label in documents:
        class_counts[label] += 1
        words = preprocess(text)
        for word in words:
            word_counts[label][word] += 1

    return word_counts, class_counts

#naive bays train
def train_naive_bays(documents):
    vocab = build_vocab(documents)
    word_counts, class_counts = bag_of_words(documents, vocab)

    total_docs = len(documents)
    priors = {
        c: class_counts[c] / total_docs
        for c in class_counts
    }

    likelihoods = {"HAM": {}, "SPAM": {}}
    vocab_size = len (vocab)

    for c in ["HAM", "SPAM"]:
        total_words = sum(word_counts[c].values())
        for word in vocab:
            likelihoods[c][word] = (
                word_counts[c][word] + 1
            ) / (total_words + vocab_size)

    return priors, likelihoods, vocab
    
def predict(text, priors, likelihoods, vocab):
    words = preprocess(text)
    scores = {}

    for c in ["HAM", "SPAM"]:
        score = priors[c]
        for word in words:
            if word in vocab:
                score *= likelihoods[c][word]
            scores[c] = score
    
    return max(scores, key=scores.get)



In [18]:
priors, likelihoods, vocab = train_naive_bays(documents)

print("Priors: ")
print(priors)

test_sentences = [
    "Limited offer, click here!",
    "Meeting at 2 PM with the manager."
]


for sentence in test_sentences:
    prediction = predict(sentence, priors, likelihoods, vocab)
    print(f"\nSentence: {sentence}\n")
    print("Predicted Class:", prediction)
    print(f"Likelihoods: {likelihoods}")

Priors: 
{'HAM': 0.5454545454545454, 'SPAM': 0.45454545454545453}

Sentence: Limited offer, click here!

Predicted Class: SPAM
Likelihoods: {'HAM': {'lets': 0.025974025974025976, '50': 0.012987012987012988, 'click': 0.012987012987012988, 'now': 0.012987012987012988, 'the': 0.05194805194805195, 'iphone': 0.012987012987012988, 'tomorrow': 0.03896103896103896, 'report': 0.025974025974025976, 'pm': 0.025974025974025976, 'can': 0.025974025974025976, 'money': 0.012987012987012988, 'price': 0.012987012987012988, 'catch': 0.025974025974025976, 'meeting': 0.03896103896103896, 'limited': 0.012987012987012988, 'off': 0.012987012987012988, 'free': 0.012987012987012988, 'today': 0.012987012987012988, 'lowest': 0.012987012987012988, 'office': 0.03896103896103896, 'for': 0.025974025974025976, 'mom': 0.025974025974025976, 'how': 0.025974025974025976, 'win': 0.012987012987012988, 'you': 0.03896103896103896, 'we': 0.025974025974025976, 'dinner': 0.025974025974025976, 'hi': 0.025974025974025976, 'your': 

In [None]:
# vocab = build_vocab(documents)
# print(bag_of_words(documents, vocab))
# print(train_naive_bays(documents))


({'HAM': defaultdict(<class 'int'>, {'hi': 1, 'mom': 1, 'how': 1, 'are': 2, 'you': 2, 'we': 1, 'still': 1, 'on': 1, 'for': 1, 'dinner': 1, 'lets': 1, 'catch': 1, 'up': 1, 'tomorrow': 2, 'at': 2, 'the': 3, 'office': 2, 'meeting': 2, '3': 1, 'pm': 1, 'team': 1, 'in': 1, 'can': 1, 'send': 1, 'report': 1}), 'SPAM': defaultdict(<class 'int'>, {'free': 2, 'money': 1, 'now': 1, 'lowest': 1, 'price': 1, 'for': 2, 'your': 1, 'meds': 1, 'win': 1, 'a': 1, 'iphone': 1, 'today': 1, 'get': 1, '50': 1, 'off': 1, 'limited': 1, 'time': 1, 'click': 1, 'here': 1, 'prizes': 1})}, {'HAM': 6, 'SPAM': 5})
({'HAM': 0.5454545454545454, 'SPAM': 0.45454545454545453}, {'HAM': {'lets': 0.025974025974025976, '50': 0.012987012987012988, 'click': 0.012987012987012988, 'now': 0.012987012987012988, 'the': 0.05194805194805195, 'iphone': 0.012987012987012988, 'tomorrow': 0.03896103896103896, 'report': 0.025974025974025976, 'pm': 0.025974025974025976, 'can': 0.025974025974025976, 'money': 0.012987012987012988, 'price': 0.

# Part 2 Using Scikit-Learn MultinomialDB

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

texts = [
    "Free money now!!!",
    "Hi mom, how are you?",
    "Lowest price for your meds",
    "Are we still on for dinner?",
    "Win a free iPhone today",
    "Let's catch up tomorrow at the office",
    "Meeting at 3 PM tomorrow",
    "Get 50% off, limited time!",
    "Team meeting in the office",
    "Click here for prizes!",
    "Can you send the report?"
]

labels = [
    "SPAM", "HAM", "SPAM", "HAM", "SPAM",
    "HAM", "HAM", "SPAM", "HAM", "SPAM", "HAM"
]

#convert text to feature vectors
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

#model
model = MultinomialNB()
model.fit(X, labels)

test_sentences = [
    "Limited offer, click here!",
    "Meeting at 2 PM with the manager."
]

X_test = vectorizer.transform(test_sentences)
predictions = model.predict(X_test)

for sentence, pred in zip(test_sentences, predictions):
    print(f"\nSentence: {sentence}")
    print("Predicted Class:", pred)


Sentence: Limited offer, click here!
Predicted Class: SPAM

Sentence: Meeting at 2 PM with the manager.
Predicted Class: HAM
