In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter, defaultdict
from math import log, exp
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: seaborn grid theme plus a wider default figure size
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")


Libraries imported successfully!


## 1. Load and Prepare Data


In [None]:
# Read the train and test CSVs (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/train.csv',
        '../../data/processed/test.csv',
    )
)

# Report the size of each split and the class balance of the training set
print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"\nLabel distribution:")
print(train_df['label'].value_counts())

# Last expression of the cell: preview of the first training rows
train_df.head()


Training samples: 4457
Test samples: 1115

Label distribution:
label
ham     3859
spam     598
Name: count, dtype: int64


Unnamed: 0,label,message
0,ham,Going on nothing great.bye
1,ham,I wont. So wat's wit the guys
2,ham,Ok k..sry i knw 2 siva..tats y i askd..
3,ham,Where are you ? What do you do ? How can you s...
4,ham,Have you not finished work yet or something?


## 2. Text Preprocessing (Simple Tokenization)


In [None]:
def tokenize(text):
    """
    Lowercase the input and return its word tokens in order of appearance.

    The input is passed through str() first, so non-string values are
    tokenized by their string form. A token is a maximal run of regex
    word characters (letters, digits and underscore).
    """
    lowered = str(text).lower()
    return [match.group() for match in re.finditer(r'\b\w+\b', lowered)]


## 3. Naive Bayes Classifier Implementation

### Key Components:
1. **Prior probabilities**: P(spam) and P(ham)
2. **Word probabilities**: P(word | spam) and P(word | ham)
3. **Laplace smoothing**: Prevents zero probabilities
4. **Log probabilities**: Avoids numerical underflow


In [None]:
class NaiveBayesClassifier:
    """
    Simplified multinomial Naive Bayes classifier for spam/ham text.

    Assumptions:
    - Words are conditionally independent given the class (naive assumption)
    - Uses multinomial model (counts word occurrences)
    - Laplace smoothing to handle words unseen in one of the classes

    Messages are split into tokens with the module-level ``tokenize``
    function. Tokens that never appeared in the training data are ignored
    at prediction time.
    """

    def __init__(self, alpha=1.0):
        """
        Parameters:
        -----------
        alpha : float
            Smoothing parameter (Laplace smoothing).
            alpha=1.0 means add-one smoothing. Must be non-negative. With
            alpha=0 a word missing from one class gets probability zero
            for that class, and taking its logarithm during prediction
            raises ValueError.

        Raises:
        -------
        ValueError
            If alpha is negative.
        """
        if alpha < 0:
            raise ValueError(f"alpha must be non-negative, got {alpha!r}")

        self.alpha = alpha
        self.vocab = set()
        self.word_counts_spam = defaultdict(int)
        self.word_counts_ham = defaultdict(int)
        self.total_words_spam = 0
        self.total_words_ham = 0
        self.prior_spam = 0.0
        self.prior_ham = 0.0
        self.vocab_size = 0

    def fit(self, messages, labels):
        """
        Train the classifier.

        Any state from a previous call is discarded, so fitting twice does
        not accumulate counts. Messages and labels are paired with zip(),
        and both are consumed in a single pass.

        Parameters:
        -----------
        messages : list or pd.Series
            Training messages
        labels : list or pd.Series
            Training labels ('spam' or 'ham')

        Raises:
        -------
        ValueError
            If a label is neither 'spam' nor 'ham', or if there are no
            (message, label) pairs to train on.
        """
        print("Training Naive Bayes classifier...")

        # Accumulate into locals and commit at the end, so a failed fit
        # never leaves the model half-updated.
        vocab = set()
        word_counts = {'spam': defaultdict(int), 'ham': defaultdict(int)}
        total_words = {'spam': 0, 'ham': 0}
        label_counts = Counter()

        # Build vocabulary and count words
        for message, label in zip(messages, labels):
            if label not in word_counts:
                # Word counts and priors must agree on which class a sample
                # belongs to, so an unknown label is rejected outright.
                raise ValueError(
                    f"Unknown label {label!r}; expected 'spam' or 'ham'"
                )

            tokens = tokenize(message)
            vocab.update(tokens)

            class_counts = word_counts[label]
            for token in tokens:
                class_counts[token] += 1
            total_words[label] += len(tokens)
            label_counts[label] += 1

        total_samples = sum(label_counts.values())
        if total_samples == 0:
            raise ValueError("Cannot fit on an empty training set")

        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.word_counts_spam = word_counts['spam']
        self.word_counts_ham = word_counts['ham']
        self.total_words_spam = total_words['spam']
        self.total_words_ham = total_words['ham']

        # Calculate prior probabilities
        self.prior_spam = label_counts['spam'] / total_samples
        self.prior_ham = label_counts['ham'] / total_samples

        print(f"Vocabulary size: {self.vocab_size}")
        print(f"Prior P(spam) = {self.prior_spam:.4f}")
        print(f"Prior P(ham) = {self.prior_ham:.4f}")
        print(f"Total words in spam: {self.total_words_spam}")
        print(f"Total words in ham: {self.total_words_ham}")

    def _check_is_fitted(self):
        """Raise ValueError when no class has a positive prior yet."""
        if self.prior_spam <= 0 and self.prior_ham <= 0:
            raise ValueError(
                "NaiveBayesClassifier is not fitted yet; call fit() first"
            )

    @staticmethod
    def _log_prior(prior):
        """Return log(prior), or -inf for a class absent from training."""
        return log(prior) if prior > 0 else float('-inf')

    def get_word_probability(self, word, class_label):
        """
        Calculate P(word | class) with Laplace smoothing.

        Formula: P(word | class) = (count(word, class) + alpha) / (total_words_in_class + alpha * vocab_size)

        Parameters:
        -----------
        word : str
            Token to look up.
        class_label : str
            'spam' selects the spam counts; any other value selects ham.

        Returns:
        --------
        float : smoothed probability of the word under the class
        """
        if class_label == 'spam':
            counts = self.word_counts_spam
            total = self.total_words_spam
        else:  # ham
            counts = self.word_counts_ham
            total = self.total_words_ham

        # .get() instead of indexing: indexing a defaultdict would insert
        # the missing word, so a read-only query would mutate the model.
        count = counts.get(word, 0)

        # Laplace smoothing
        numerator = count + self.alpha
        denominator = total + self.alpha * self.vocab_size

        return numerator / denominator

    def predict_log_probability(self, message):
        """
        Calculate the unnormalized log score of both classes for a message.

        Uses log probabilities to avoid numerical underflow.

        Formula: score(class) = log P(class) + sum(log P(word | class))

        This is log P(class, message) under the model; it differs from
        log P(class | message) only by a term shared by both classes, so
        the scores can be compared directly. Words outside the training
        vocabulary are skipped. A class that had no training samples gets
        a score of -inf.

        Returns:
        --------
        tuple : (log_prob_spam, log_prob_ham)

        Raises:
        -------
        ValueError
            If the classifier has not been fitted.
        """
        self._check_is_fitted()

        # Start with log prior
        log_prob_spam = self._log_prior(self.prior_spam)
        log_prob_ham = self._log_prior(self.prior_ham)

        # Add log probabilities for each word
        for word in tokenize(message):
            if word in self.vocab:  # Only consider words in vocabulary
                log_prob_spam += log(self.get_word_probability(word, 'spam'))
                log_prob_ham += log(self.get_word_probability(word, 'ham'))

        return log_prob_spam, log_prob_ham

    def predict(self, message):
        """
        Predict class for a message.

        Returns:
        --------
        str : 'spam' or 'ham' (ties go to 'ham')
        """
        log_prob_spam, log_prob_ham = self.predict_log_probability(message)

        # Class with higher log probability wins
        return 'spam' if log_prob_spam > log_prob_ham else 'ham'

    def predict_proba(self, message):
        """
        Predict class probabilities.

        Returns:
        --------
        dict : {'spam': probability, 'ham': probability}
        """
        log_prob_spam, log_prob_ham = self.predict_log_probability(message)

        # Convert log scores to probabilities (using log-sum-exp trick).
        # Subtracting the maximum makes the larger term exp(0) == 1, so the
        # sum below can never underflow to zero.
        max_log_prob = max(log_prob_spam, log_prob_ham)
        prob_spam = exp(log_prob_spam - max_log_prob)
        prob_ham = exp(log_prob_ham - max_log_prob)

        # Normalize
        total = prob_spam + prob_ham
        prob_spam /= total
        prob_ham /= total

        return {'spam': prob_spam, 'ham': prob_ham}

    def get_word_contributions(self, message):
        """
        Get contribution of each word to the classification decision.
        Useful for interpretability!

        One entry is produced per in-vocabulary token occurrence, in
        message order. A positive contribution favours spam, a negative
        one favours ham.

        Returns:
        --------
        list : List of dicts with keys 'word', 'spam_log_prob',
            'ham_log_prob' and 'contribution'

        Raises:
        -------
        ValueError
            If the classifier has not been fitted.
        """
        self._check_is_fitted()

        contributions = []

        for word in tokenize(message):
            if word in self.vocab:
                word_log_prob_spam = log(self.get_word_probability(word, 'spam'))
                word_log_prob_ham = log(self.get_word_probability(word, 'ham'))

                contributions.append({
                    'word': word,
                    'spam_log_prob': word_log_prob_spam,
                    'ham_log_prob': word_log_prob_ham,
                    # Contribution = difference in log probabilities
                    'contribution': word_log_prob_spam - word_log_prob_ham,
                })

        return contributions

print("NaiveBayesClassifier class defined!")


NaiveBayesClassifier class defined!


## 4. Train the Classifier


In [None]:
# Build an add-one (Laplace) smoothed classifier and fit it on the training split
nb = NaiveBayesClassifier(alpha=1.0)
nb.fit(messages=train_df['message'], labels=train_df['label'])


Training Naive Bayes classifier...
Vocabulary size: 7742
Prior P(spam) = 0.1342
Prior P(ham) = 0.8658
Total words in spam: 15480
Total words in ham: 56439


## 5. Evaluate on Test Set


In [None]:
# Make predictions
predictions = []
probabilities = []

# Keep both the hard label and the class-probability dict for every test message
for message in test_df['message']:
    pred = nb.predict(message)
    proba = nb.predict_proba(message)
    predictions.append(pred)
    probabilities.append(proba)

# Calculate accuracy (fraction of predictions equal to the true label)
accuracy = sum(p == t for p, t in zip(predictions, test_df['label'])) / len(test_df)
print(f"Accuracy: {accuracy:.4f}")

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

# One explicit class order shared by the matrix and the report, so the
# 'Ham'/'Spam' display names are always attached to the right class, even
# if one class is missing from the labels or the predictions.
class_labels = ['ham', 'spam']

cm = confusion_matrix(test_df['label'], predictions, labels=class_labels)
print("\nConfusion Matrix:")
print("                 Predicted")
print("              Ham    Spam")
print(f"Actual Ham   {cm[0,0]:4d}   {cm[0,1]:4d}")
print(f"       Spam  {cm[1,0]:4d}   {cm[1,1]:4d}")

print("\nClassification Report:")
print(classification_report(
    test_df['label'],
    predictions,
    labels=class_labels,
    target_names=['Ham', 'Spam'],
))


Accuracy: 0.9848

Confusion Matrix:
                 Predicted
              Ham    Spam
Actual Ham    961      5
       Spam    12    137

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       966
        Spam       0.96      0.92      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



## 6. Most Spam-Indicative and Ham-Indicative Words

In [None]:
# Calculate word probabilities for all words in vocabulary
word_scores = []

for word in nb.vocab:
    spam_prob = nb.get_word_probability(word, 'spam')
    ham_prob = nb.get_word_probability(word, 'ham')

    # Ratio: how much more likely in spam vs ham
    ratio = spam_prob / ham_prob if ham_prob > 0 else float('inf')

    # The log of the ratio is undefined at both extremes, so it is clamped
    # to +/-100. Both extremes arise only when a smoothed probability is
    # exactly zero, i.e. when the classifier was built with alpha=0.
    if ratio == float('inf'):
        log_ratio = 100
    elif ratio > 0:
        log_ratio = log(ratio)
    else:
        log_ratio = -100

    word_scores.append({
        'word': word,
        'spam_prob': spam_prob,
        'ham_prob': ham_prob,
        'ratio': ratio,
        'log_ratio': log_ratio
    })

word_scores_df = pd.DataFrame(word_scores)

# Top spam words (highest spam/ham ratio)
print("Top 20 Spam-Indicative Words:")
print("-" * 80)
top_spam = word_scores_df.nlargest(20, 'ratio')
for idx, row in top_spam.iterrows():
    print(f"{row['word']:20s} | P(spam)={row['spam_prob']:.6f} | P(ham)={row['ham_prob']:.6f} | Ratio={row['ratio']:.2f}")

print("\n" + "=" * 80)

# Top ham words (lowest spam/ham ratio)
print("Top 20 Ham-Indicative Words:")
print("-" * 80)
top_ham = word_scores_df.nsmallest(20, 'ratio')
for idx, row in top_ham.iterrows():
    print(f"{row['word']:20s} | P(spam)={row['spam_prob']:.6f} | P(ham)={row['ham_prob']:.6f} | Ratio={row['ratio']:.2f}")


Top 20 Spam-Indicative Words:
--------------------------------------------------------------------------------
claim                | P(spam)=0.004091 | P(ham)=0.000016 | Ratio=262.56
prize                | P(spam)=0.002928 | P(ham)=0.000016 | Ratio=187.94
å                    | P(spam)=0.010249 | P(ham)=0.000062 | Ratio=164.45
150p                 | P(spam)=0.002368 | P(ham)=0.000016 | Ratio=152.01
tone                 | P(spam)=0.002024 | P(ham)=0.000016 | Ratio=129.90
18                   | P(spam)=0.001809 | P(ham)=0.000016 | Ratio=116.08
cs                   | P(spam)=0.001723 | P(ham)=0.000016 | Ratio=110.55
guaranteed           | P(spam)=0.001636 | P(ham)=0.000016 | Ratio=105.02
500                  | P(spam)=0.001636 | P(ham)=0.000016 | Ratio=105.02
1000                 | P(spam)=0.001464 | P(ham)=0.000016 | Ratio=93.97
uk                   | P(spam)=0.002756 | P(ham)=0.000031 | Ratio=88.44
150ppm               | P(spam)=0.001249 | P(ham)=0.000016 | Ratio=80.15
awarded         

## 7. Key Insights

### What This Implementation Shows:

1. **Probabilistic Reasoning**: We can see exactly how each word contributes to the final decision
2. **Transparency**: Every prediction can be explained by showing word probabilities
3. **Interpretability**: Unlike neural networks, we can inspect and understand every step
4. **Mathematical Foundation**: Based on Bayes' theorem and conditional independence assumption

### Limitations:

1. **Naive Assumption**: Assumes words are conditionally independent given the class (rarely true in real text)
2. **No Feature Engineering**: Simple tokenization, no TF-IDF weighting
3. **Performance**: Not optimized like scikit-learn's implementation
4. **Vocabulary**: Limited to words seen in training data; unseen words are ignored at prediction time

### Why This Matters for Your Project:

- **Interpretability**: You can show exactly why a message was classified as spam/ham
- **Human-like Reasoning**: The probabilistic approach mirrors how humans might reason about spam
- **Contrast with Neural Networks**: Highlights the trade-off between interpretability and flexibility
