# Case Study: Python Demo - Implementing Gaussian Naive Bayes-Assisted Spam Email Filtering

In [None]:
# Import libraries (the dataset itself is created in a later cell)
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Create the experimental dataset for spam/non-spam email

In [None]:
# Build the toy spam dataset as a pandas DataFrame.
# The three lists are parallel: position i in each list describes the same email.
word_counts = [350, 200, 500, 150, 300, 450, 600, 250, 550, 400,
               700, 100, 450, 370, 650, 230, 510, 330, 600, 220]  # words per email
cap_frequencies = [2, 1, 3, 0, 4, 2, 3, 1, 3, 0,
                   5, 0, 2, 3, 4, 1, 3, 2, 4, 1]  # capital letter frequency per email
spam_labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
               1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # target: 1 = Spam, 0 = Not Spam

data = {
    'EmailID': np.arange(1, len(spam_labels) + 1),  # sequential IDs 1..20
    'WordCount': word_counts,
    'CapFrequency': cap_frequencies,
    'Spam': spam_labels,
}

df = pd.DataFrame(data)

In [None]:
# Show the DataFrame as this cell's output so the dataset can be inspected
df

# Define Features and Labels, then Split the Data into Training and Test Sets

In [None]:
# Pick the predictor columns and the label column out of the DataFrame
feature_columns = ['WordCount', 'CapFrequency']
X = df[feature_columns]  # predictors: word count and capital-letter frequency
y = df['Spam']           # label: 1 = Spam, 0 = Not Spam

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of each split
print(f"Training Set Size: {X_train.shape}")
print(f"Test Set Size: {X_test.shape}")

# Train the Model

In [None]:
# Create a Gaussian Naive Bayes classifier and fit it on the training split
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Show what the model learned: theta_ holds per-class feature means and
# var_ holds per-class feature variances (row 0 -> label 0, row 1 -> label 1).
print("Model Parameters (Mean and Variance):")
class_means, class_variances = gnb.theta_, gnb.var_
print(f"Class 0 (Not Spam) - Mean: {class_means[0]}, Variance: {class_variances[0]}")
print(f"Class 1 (Spam) - Mean: {class_means[1]}, Variance: {class_variances[1]}")

# Make Predictions

In [None]:
# Predict labels for the held-out test emails
y_pred = gnb.predict(X_test)

# Print the predicted and actual labels side by side for a quick visual check
print("\nPredictions:", y_pred)
print("Actual Labels:", y_test.values)

# Evaluate the model: fraction of test emails classified correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")

# Confusion matrix (rows = actual, columns = predicted).
# labels=[0, 1] pins the layout to 2x2 in the order [Not Spam, Spam]; without it
# the matrix shrinks when the small test split and the predictions happen to
# contain only one of the two classes.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("\nConfusion Matrix:")
print(cm)

# Functions for Unseen Input Prediction

In [None]:
def predict_single_email(word_count, cap_frequency, model=None):
    """Predict whether a single email is spam or not based on its features.

    Parameters
    ----------
    word_count : int or float
        Word count of the email.
    cap_frequency : int or float
        Capital-letter frequency of the email.
    model : fitted classifier, optional
        Any object with a ``predict`` method. Defaults to the notebook-level
        ``gnb`` model when omitted.

    Returns
    -------
    str
        "Spam" if the model predicts label 1, otherwise "Not Spam".
    """
    clf = gnb if model is None else model
    row = np.array([[word_count, cap_frequency]])  # one sample, two features

    # A model fitted on a DataFrame remembers its column names and warns when it
    # is later given a bare array. Re-attach those names so the input matches
    # what the model was trained on; the values are assumed to be in the same
    # order as the training columns (WordCount, CapFrequency).
    trained_names = getattr(clf, "feature_names_in_", None)
    if trained_names is None:
        features = row
    else:
        features = pd.DataFrame(row, columns=list(trained_names))

    prediction = clf.predict(features)[0]
    return "Spam" if prediction == 1 else "Not Spam"

In [None]:
# Try the single-email helper on one example: 400 words, 2 capital letters
sample_email = (400, 2)
single_result = predict_single_email(*sample_email)
print(f"\nSingle Email Prediction: {single_result}")

In [None]:
def predict_batch_emails(email_list, model=None):
    """Predict spam status for a batch of emails.

    Parameters
    ----------
    email_list : sequence of (word_count, cap_frequency) pairs
        One pair per email. An empty sequence is allowed.
    model : fitted classifier, optional
        Any object with a ``predict`` method. Defaults to the notebook-level
        ``gnb`` model when omitted.

    Returns
    -------
    list of str
        "Spam" or "Not Spam" for each email, in input order. Returns an empty
        list when ``email_list`` is empty.
    """
    rows = np.asarray(email_list)

    # An empty batch converts to a 1-D array, which predict() rejects; there is
    # nothing to classify, so return early instead of raising.
    if rows.size == 0:
        return []

    clf = gnb if model is None else model

    # A model fitted on a DataFrame remembers its column names and warns when it
    # is later given a bare array. Re-attach those names so the input matches
    # what the model was trained on; each pair is assumed to be in the same
    # order as the training columns (WordCount, CapFrequency).
    trained_names = getattr(clf, "feature_names_in_", None)
    if trained_names is None:
        features = rows
    else:
        features = pd.DataFrame(rows, columns=list(trained_names))

    predictions = clf.predict(features)
    return ["Spam" if pred == 1 else "Not Spam" for pred in predictions]

In [None]:
# Try the batch helper on three example emails given as (word count, capital frequency)
sample_batch = [(150, 1), (700, 5), (300, 2)]
print("\nBatch Email Predictions:")
batch_labels = predict_batch_emails(sample_batch)
print(batch_labels)
