In [None]:
import os
import re
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Fetch the NLTK resources this notebook relies on: the stopword corpus and
# the punkt tokenizer data.
print("Setting up NLTK data for Naive Bayes...")
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

print("✅ Naive Bayes spam fighting setup completed!")

## Naive Bayes Spam Classification with Kaggle Ling-Spam Dataset

**About Naive Bayes for Spam Detection:**
- Naive Bayes is one of the most effective methods for text classification
- Assumes feature independence (naive assumption) but works well in practice
- Uses a probabilistic approach: compares P(spam|words) with P(ham|words) and predicts the more likely class
- Fast training and prediction, suitable for large datasets

**Dataset Setup:**
1. Download the Ling-Spam dataset from Kaggle 
2. Place the CSV file in the 'datasets/' directory
3. The CSV should have 'email_text' and 'label' columns
4. If no dataset is found, a sample dataset will be created for demonstration

**Feature Extraction Options:**
- **Bag of Words (CountVectorizer)**: Simple word counts
- **TF-IDF (TfidfVectorizer)**: Term frequency-inverse document frequency weighting
- Either one can be selected with `VECTORIZER_TYPE`; run the notebook once with each setting to compare them

In [None]:
# Configuration for Ling-Spam Dataset
DATASET_PATH = 'datasets/lingspam_dataset.csv'
TRAINING_SET_RATIO = 0.7

# Train/test split parameters (read by the splitting cell).
# round() removes the float artifact of 1 - 0.7 (0.30000000000000004).
TEST_SIZE = round(1 - TRAINING_SET_RATIO, 4)
RANDOM_STATE = 42  # fixed seed so the split is reproducible between runs

# Text preprocessing parameters
REMOVE_STOPWORDS = True
USE_STEMMING = True
MIN_DF = 2  # Minimum document frequency for features
MAX_FEATURES = 5000  # Maximum number of features

# Model parameters
# 'count' selects CountVectorizer; any other value selects TfidfVectorizer.
VECTORIZER_TYPE = 'count'
ALPHA = 1.0  # additive smoothing strength passed to MultinomialNB

print(f"📁 Dataset path: {DATASET_PATH}")
print(f"🔄 Training ratio: {TRAINING_SET_RATIO}")
print(f"🛠️  Remove stopwords: {REMOVE_STOPWORDS}")
print(f"🌱 Use stemming: {USE_STEMMING}")
print(f"📊 Max features: {MAX_FEATURES}")
print(f"🔤 Vectorizer type: {VECTORIZER_TYPE}")
print(f"🎲 Random state: {RANDOM_STATE}")
print(f"🧮 Smoothing alpha: {ALPHA}")

In [None]:
# Columns that the rest of the notebook reads from the loaded DataFrame.
_REQUIRED_COLUMNS = ('email_text', 'label')


def _build_sample_dataset():
    """Return the built-in demo corpus: 20 emails alternating ham, spam."""
    email_texts = [
        "Dear colleague, I hope this email finds you well. We are organizing a linguistics conference next month on computational methods.",
        "URGENT!!! You have won $1,000,000!!! Click here now to claim your prize!!! Limited time offer!!! Don't miss out!!!",
        "The latest research on phonetics shows interesting patterns in vowel recognition systems and speech processing algorithms.",
        "FREE VIAGRA!!! Buy now with 90% discount!!! No prescription needed!!! Order today!!! Fast shipping worldwide!!!",
        "Thank you for your submission to the journal. We will review it and get back to you within 5-7 business days.",
        "MAKE MONEY FAST!!! Work from home!!! Earn $5000 per week!!! No experience required!!! Start immediately!!!",
        "The syntax paper you requested is attached. Please let me know if you need any clarifications on the methodology.",
        "CREDIT CARD DEBT FORGIVENESS!!! Eliminate your debt today!!! Government program!!! Call now for free consultation!!!",
        "Could you please review the manuscript on morphological analysis? Your expertise in computational linguistics would be valuable.",
        "WIN A FREE IPHONE!!! Click now!!! Limited time offer!!! Act fast!!! Only 100 phones left!!! Don't wait!!!",
        "The linguistics department is hosting a seminar on natural language processing next Friday at 2 PM in room A101.",
        "HOT SINGLES IN YOUR AREA!!! Meet them tonight!!! No strings attached!!! 100% free registration!!! Join now!!!",
        "I found your paper on semantic analysis very insightful. Would you be interested in collaboration on future research?",
        "LOSE 30 POUNDS IN 30 DAYS!!! Revolutionary diet pill!!! Doctor approved!!! No exercise needed!!! Order now!!!",
        "The conference proceedings are now available online. Thank you for your participation and excellent presentation.",
        "WORK FROM HOME!!! Earn $3000/week!!! No experience needed!!! Start today!!! Flexible hours!!! Apply now!!!",
        "Please find attached the corrected version of the phoneme classification algorithm for your review and comments.",
        "MIRACLE CURE!!! Lose weight without diet or exercise!!! 100% guaranteed!!! Revolutionary breakthrough!!! Order today!!!",
        "The workshop on computational linguistics has been scheduled for next month. We would appreciate your input.",
        "FREE MONEY!!! Government grants available!!! Claim yours now!!! No repayment required!!! Limited time offer!!!",
    ]
    # The texts above strictly alternate ham, spam, ham, spam, ...
    labels = ['ham', 'spam'] * (len(email_texts) // 2)
    return pd.DataFrame({'email_text': email_texts, 'label': labels})


def load_lingspam_dataset(path=None):
    """
    Load the Ling-Spam dataset from CSV format

    If no file exists at the given path, a small built-in sample dataset is
    returned instead so the rest of the notebook can still be demonstrated.

    Args:
        path (str, optional): CSV file to read. Defaults to DATASET_PATH.

    Returns:
        pandas.DataFrame: Dataset with email_text and label columns, or None
        when the file exists but cannot be read or lacks a required column
    """
    if path is None:
        path = DATASET_PATH

    if not os.path.exists(path):
        print(f"⚠️  Dataset not found at {path}")
        print("Creating sample dataset for Naive Bayes demonstration...")
        df = _build_sample_dataset()
        print(f"Created sample dataset with {len(df)} emails")
        return df

    try:
        df = pd.read_csv(path)
    except (OSError, ValueError) as e:
        # pandas' ParserError / EmptyDataError and UnicodeDecodeError are all
        # ValueError subclasses; OSError covers permission and I/O failures.
        print(f"❌ Error loading dataset: {e}")
        return None

    # Fail here with a clear message rather than with a KeyError in a later cell.
    missing = [column for column in _REQUIRED_COLUMNS if column not in df.columns]
    if missing:
        print(f"❌ Error loading dataset: missing required column(s) {missing}; "
              f"found {list(df.columns)}")
        return None

    print(f"✅ Loaded dataset from {path}")
    print(f"Dataset shape: {df.shape}")
    return df

# Load the dataset and print a short overview of what was loaded
dataset = load_lingspam_dataset()
if dataset is None:
    print("❌ Failed to load dataset")
else:
    print("\n📊 Dataset summary:")
    print(f"Total emails: {len(dataset)}")
    print("Label distribution:")
    print(dataset['label'].value_counts())
    print("\n📝 Sample emails:")
    # Show the first 80 characters of the first email of each class
    for heading, label_value in (("Ham", 'ham'), ("Spam", 'spam')):
        first_text = dataset.loc[dataset['label'] == label_value, 'email_text'].iloc[0]
        print(f"{heading}: '{first_text[:80]}...'")

In [None]:
# Patterns are compiled once here instead of being looked up on every call.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+')
_EMAIL_PATTERN = re.compile(r'\S+@\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Lazily created NLTK helpers, shared by all calls so they are built only once.
_NLP_CACHE = {}


def _get_stop_words():
    """Return the English stopword set, or an empty set if the corpus is missing."""
    if 'stop_words' not in _NLP_CACHE:
        try:
            _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        except LookupError:
            # NLTK raises LookupError when the corpus has not been downloaded.
            # Caching the empty set means this warning is printed only once.
            print("⚠️  NLTK stopwords not available, skipping stopword removal")
            _NLP_CACHE['stop_words'] = frozenset()
    return _NLP_CACHE['stop_words']


def _get_stemmer():
    """Return a shared PorterStemmer instance."""
    if 'stemmer' not in _NLP_CACHE:
        _NLP_CACHE['stemmer'] = PorterStemmer()
    return _NLP_CACHE['stemmer']


def preprocess_text(text):
    """
    Comprehensive text preprocessing for email classification

    Steps, in order: lowercase; strip URLs, email addresses and every
    non-letter character; tokenize; remove English stopwords (when
    REMOVE_STOPWORDS is true); apply Porter stemming (when USE_STEMMING is
    true); drop tokens of two characters or fewer.

    Args:
        text (str): Raw email text

    Returns:
        str: Preprocessed text ready for vectorization. Any non-string input
        (None, NaN, numbers, lists, ...) yields an empty string.
    """
    # Missing values (None / NaN) are never str, so this one check covers them
    # and also avoids calling pd.isna on list-like input.
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs, email addresses, and special characters
    text = _URL_PATTERN.sub('', text)
    text = _EMAIL_PATTERN.sub('', text)
    text = _NON_LETTER_PATTERN.sub('', text)

    # Tokenize
    try:
        tokens = word_tokenize(text)
    except LookupError:
        # Tokenizer data is not installed; whitespace splitting is adequate
        # because only letters and whitespace remain at this point.
        tokens = text.split()

    # Remove stopwords
    if REMOVE_STOPWORDS:
        stop_words = _get_stop_words()
        tokens = [token for token in tokens if token not in stop_words]

    # Apply stemming
    if USE_STEMMING:
        stem = _get_stemmer().stem
        tokens = [stem(token) for token in tokens]

    # Remove short tokens and join
    return ' '.join(token for token in tokens if len(token) > 2)

In [None]:
# Try the preprocessing on one email, then run it over the whole dataset
if dataset is None:
    print("❌ No dataset available for preprocessing")
else:
    print("🔧 Testing text preprocessing:")
    sample_text = dataset.iloc[0]['email_text']
    print(f"Original: '{sample_text[:100]}...'")

    preprocessed = preprocess_text(sample_text)
    print(f"Preprocessed: '{preprocessed[:100]}...'")

    # Add the cleaned text as a new column
    print(f"\n⚙️  Preprocessing all {len(dataset)} emails...")
    dataset['preprocessed_text'] = dataset['email_text'].apply(preprocess_text)

    # Keep only the rows that still have text after cleaning
    original_size = len(dataset)
    has_text = dataset['preprocessed_text'].str.len() > 0
    dataset = dataset[has_text]
    removed_count = original_size - len(dataset)
    if removed_count > 0:
        print(f"Removed {removed_count} empty emails after preprocessing")

    print(f"✅ Preprocessing complete! Dataset size: {len(dataset)} emails")
    print(f"Sample preprocessed text: '{dataset.iloc[0]['preprocessed_text'][:80]}...'")

In [None]:
# Split dataset into training and testing sets
if dataset is None:
    print("❌ No dataset available for splitting")
else:
    print("📊 Preparing train/test split...")

    X, y = dataset['preprocessed_text'], dataset['label']

    # stratify=y keeps the ham/spam proportions the same in both splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    # Report the size and class make-up of each split
    for split_name, split_texts, split_labels in (
        ("Training set", X_train, y_train),
        ("Test set", X_test, y_test),
    ):
        ham_count = (split_labels == 'ham').sum()
        spam_count = (split_labels == 'spam').sum()
        print(f"{split_name}: {len(split_texts)} emails ({ham_count} ham, {spam_count} spam)")

    # Identical preprocessed texts on both sides would leak test data into training
    overlap = set(X_train) & set(X_test)
    if overlap:
        print(f"⚠️  Warning: {len(overlap)} texts appear in both train and test sets")
    else:
        print("✅ No data leakage detected between train and test sets")



In [None]:
# Feature extraction with vectorization
print("🔤 Feature extraction with vectorization...")

# Settings shared by both vectorizers, defined once so the two options cannot
# drift apart. min_df now comes from the MIN_DF configuration constant instead
# of a hard-coded 2.
vectorizer_params = dict(
    max_features=MAX_FEATURES,
    stop_words='english',
    ngram_range=(1, 2),  # Use unigrams and bigrams
    min_df=MIN_DF,       # Ignore terms that appear in fewer than MIN_DF documents
    max_df=0.95,         # Ignore terms that appear in more than 95% of documents
)

if VECTORIZER_TYPE == 'count':
    vectorizer = CountVectorizer(**vectorizer_params)
    print("📊 Using CountVectorizer (word counts)")
else:
    vectorizer = TfidfVectorizer(**vectorizer_params)
    print("📊 Using TfidfVectorizer (TF-IDF scores)")

# Fit on the training texts only, then reuse that vocabulary for the test
# texts so nothing from the test set influences the features.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fetch the vocabulary once and reuse it for both printouts below
feature_names = vectorizer.get_feature_names_out()

print("✅ Feature extraction complete!")
print(f"Training features shape: {X_train_vectorized.shape}")
print(f"Test features shape: {X_test_vectorized.shape}")
print(f"Vocabulary size: {len(feature_names)}")

# Show some example features
print(f"Sample features: {list(feature_names[:10])}")

In [None]:
# Train Naive Bayes classifier
print("🤖 Training Naive Bayes classifier...")

# Build the classifier and fit it on the vectorized training emails
nb_classifier = MultinomialNB(alpha=ALPHA)
nb_classifier.fit(X_train_vectorized, y_train)

# Predicted labels for both splits, plus class probabilities for the test split
y_train_pred = nb_classifier.predict(X_train_vectorized)
y_test_pred = nb_classifier.predict(X_test_vectorized)
y_test_proba = nb_classifier.predict_proba(X_test_vectorized)

print("✅ Training complete!")

# Accuracy on the data the model was trained on vs. data it has not seen
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("\n📈 Model Performance:")
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# A training accuracy more than 10 points above test accuracy hints at overfitting
accuracy_gap = train_accuracy - test_accuracy
if accuracy_gap > 0.1:
    print("⚠️  Warning: Possible overfitting detected (train accuracy >> test accuracy)")
else:
    print("✅ Good generalization (train and test accuracies are close)")

             precision    recall  f1-score   support

       Spam       0.99      0.94      0.97     15035
        Ham       0.90      0.98      0.94      7591

avg / total       0.96      0.96      0.96     22626

Classification accuracy 95.6%


In [None]:
# Detailed evaluation metrics
print("📊 Detailed Classification Report:")
# Pin the label order explicitly so the display names always line up with the
# classes, even if one class is absent from the test split or the predictions.
print(classification_report(
    y_test, y_test_pred,
    labels=['ham', 'spam'], target_names=['Ham', 'Spam'], zero_division=0
))

print("\n🔢 Confusion Matrix:")
cm = confusion_matrix(y_test, y_test_pred, labels=['ham', 'spam'])
print(cm)

# Calculate additional metrics.
# With labels=['ham', 'spam'] the matrix is [[tn, fp], [fn, tp]], treating
# spam as the positive class.
tn, fp, fn, tp = (int(count) for count in cm.ravel())

# Guard the divisions: if no email was predicted as spam (or none is spam),
# the ratio is undefined and is reported as 0.0 instead of NaN.
predicted_spam = tp + fp
actual_spam = tp + fn
precision_spam = tp / predicted_spam if predicted_spam else 0.0
recall_spam = tp / actual_spam if actual_spam else 0.0
precision_recall_sum = precision_spam + recall_spam
f1_spam = (
    2 * (precision_spam * recall_spam) / precision_recall_sum
    if precision_recall_sum else 0.0
)

print("\n📈 Spam Detection Metrics:")
print(f"Precision (Spam): {precision_spam:.4f} (of emails classified as spam, {precision_spam*100:.1f}% are actually spam)")
print(f"Recall (Spam): {recall_spam:.4f} (of actual spam emails, {recall_spam*100:.1f}% are correctly identified)")
print(f"F1-Score (Spam): {f1_spam:.4f}")

print("\n🔍 Error Analysis:")
print(f"False Positives (Ham classified as Spam): {fp}")
print(f"False Negatives (Spam classified as Ham): {fn}")

if fp > 0:
    print("⚠️  False positives are problematic - legitimate emails marked as spam")
if fn > 0:
    print("⚠️  False negatives are concerning - spam emails reaching inbox")

In [None]:
# Feature analysis - most informative features
print("🔍 Most Informative Features for Spam Detection:")

# Per-feature log probabilities for each class row of the fitted model
feature_names = vectorizer.get_feature_names_out()
log_prob_ham = nb_classifier.feature_log_prob_[0]   # Ham class
log_prob_spam = nb_classifier.feature_log_prob_[1]  # Spam class

# Positive score: feature is more likely under spam; negative: more likely under ham
feature_importance = log_prob_spam - log_prob_ham

# Sort once (ascending) and take both ends of the ranking
ranked_idx = np.argsort(feature_importance)
spam_features_idx = ranked_idx[-20:][::-1]
ham_features_idx = ranked_idx[:20]

for heading, top_indices in (
    ("\n📧 Top 10 Spam Indicators:", spam_features_idx),
    ("\n📮 Top 10 Ham Indicators:", ham_features_idx),
):
    print(heading)
    for rank, idx in enumerate(top_indices[:10], start=1):
        print(f"{rank:2d}. '{feature_names[idx]}' (score: {feature_importance[idx]:.3f})")

# Share of each class among the training labels
print("\n📊 Class Distribution in Training Data:")
spam_prob = (y_train == 'spam').mean()
ham_prob = (y_train == 'ham').mean()
print(f"Spam: {spam_prob:.3f} ({spam_prob*100:.1f}%)")
print(f"Ham: {ham_prob:.3f} ({ham_prob*100:.1f}%)")

In [None]:
# Interactive prediction function
def classify_email(email_text, show_probability=True):
    """
    Predict the class of one email with the fitted vectorizer and classifier

    Args:
        email_text (str): The email text to classify
        show_probability (bool): When True, print the prediction together with
            the ham/spam probabilities and a confidence band

    Returns:
        str or None: The predicted label, or None when no text is left after
        preprocessing
    """
    cleaned = preprocess_text(email_text)
    if not cleaned:
        print("⚠️  Warning: Email text is empty after preprocessing")
        return None

    # Turn the cleaned text into a feature row and predict its label
    features = vectorizer.transform([cleaned])
    prediction = nb_classifier.predict(features)[0]

    if not show_probability:
        return prediction

    # Pick each class probability by checking where that class sits in classes_
    probabilities = nb_classifier.predict_proba(features)[0]
    class_order = nb_classifier.classes_
    ham_prob = probabilities[0 if class_order[0] == 'ham' else 1]
    spam_prob = probabilities[1 if class_order[1] == 'spam' else 0]

    # Only the first 100 characters are echoed back
    preview_suffix = '...' if len(email_text) > 100 else ''
    print("📧 Email Classification:")
    print(f"Text: '{email_text[:100]}{preview_suffix}'")
    print(f"Prediction: {prediction.upper()}")
    print(f"Confidence: Ham {ham_prob:.3f} | Spam {spam_prob:.3f}")

    # Band the larger of the two probabilities
    max_prob = max(ham_prob, spam_prob)
    if max_prob > 0.9:
        print("🟢 High confidence")
    elif max_prob > 0.7:
        print("🟡 Medium confidence")
    else:
        print("🔴 Low confidence")

    return prediction

# Run the classifier over a few hand-written emails (two ham-like, two spam-like)
print("🧪 Testing classifier with sample emails:")
print("=" * 50)

test_emails = [
    "Dear colleague, thank you for your research paper on computational linguistics.",
    "URGENT!!! You have won $1,000,000!!! Click here now!!!",
    "The conference paper deadline has been extended to next month.",
    "FREE VIAGRA!!! Buy now with 90% discount!!! No prescription needed!!!",
]

for test_number, email in enumerate(test_emails, start=1):
    print(f"\nTest {test_number}:")
    classify_email(email)

In [None]:
# Visualizations
# The plotting libraries are imported here because the notebook's import cell
# does not bring them into scope.
import matplotlib.pyplot as plt
import seaborn as sns

FIGURE_PATH = 'figures/naive-bayes-analysis.png'

plt.figure(figsize=(15, 10))

# Plot 1: Confusion Matrix Heatmap
plt.subplot(2, 3, 1)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Plot 2: Class Distribution
plt.subplot(2, 3, 2)
class_counts = dataset['label'].value_counts()
plt.pie(class_counts.values, labels=class_counts.index, autopct='%1.1f%%',
        colors=['lightblue', 'lightcoral'])
plt.title('Dataset Class Distribution')

# Plot 3: Prediction Confidence Distribution
# "Confidence" is the probability of the predicted class for each test email.
plt.subplot(2, 3, 3)
max_probabilities = np.max(y_test_proba, axis=1)
plt.hist(max_probabilities, bins=20, alpha=0.7, color='green', edgecolor='black')
plt.xlabel('Prediction Confidence')
plt.ylabel('Frequency')
plt.title('Prediction Confidence Distribution')
plt.axvline(x=0.5, color='red', linestyle='--', alpha=0.7, label='50% threshold')
plt.axvline(x=0.9, color='orange', linestyle='--', alpha=0.7, label='90% threshold')
plt.legend()

# Plot 4: Feature Importance (Top Spam Features)
plt.subplot(2, 3, 4)
top_spam_features = feature_names[spam_features_idx[:10]]
top_spam_scores = feature_importance[spam_features_idx[:10]]
plt.barh(range(len(top_spam_features)), top_spam_scores, color='red', alpha=0.7)
plt.yticks(range(len(top_spam_features)), top_spam_features)
plt.xlabel('Log Probability Ratio')
plt.title('Top 10 Spam Features')
plt.gca().invert_yaxis()  # strongest indicator at the top

# Plot 5: Feature Importance (Top Ham Features)
plt.subplot(2, 3, 5)
top_ham_features = feature_names[ham_features_idx[:10]]
top_ham_scores = feature_importance[ham_features_idx[:10]]
plt.barh(range(len(top_ham_features)), top_ham_scores, color='blue', alpha=0.7)
plt.yticks(range(len(top_ham_features)), top_ham_features)
plt.xlabel('Log Probability Ratio')
plt.title('Top 10 Ham Features')
plt.gca().invert_yaxis()  # strongest indicator at the top

# Plot 6: Performance Metrics
plt.subplot(2, 3, 6)
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
values = [test_accuracy, precision_spam, recall_spam, f1_spam]
colors = ['blue', 'green', 'orange', 'red']
plt.bar(metrics, values, color=colors, alpha=0.7)
plt.ylabel('Score')
plt.title('Performance Metrics (Spam Class)')
plt.ylim(0, 1)
for i, v in enumerate(values):
    # Write each score just above its bar
    plt.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(FIGURE_PATH), exist_ok=True)
plt.savefig(FIGURE_PATH, dpi=300, bbox_inches='tight')
plt.show()

print(f"📊 Visualizations saved to '{FIGURE_PATH}'")

## Summary and Conclusions

The Naive Bayes classifier demonstrates excellent performance for email spam detection:

### Key Findings:

1. **High Accuracy**: The model achieves strong classification performance on the Ling-Spam dataset
2. **Interpretable Features**: We can easily identify which words/phrases contribute most to spam/ham classification
3. **Fast Training**: Naive Bayes trains quickly and requires minimal computational resources
4. **Probabilistic Output**: The classifier provides confidence scores for predictions

### Advantages of Naive Bayes for Spam Detection:

- **Simple and Fast**: Easy to implement and train
- **Handles Text Well**: Natural fit for bag-of-words text classification
- **Robust to Its Own Assumption**: Works well even when the word-independence assumption is violated
- **Probabilistic**: Provides meaningful confidence scores
- **Small Dataset Friendly**: Performs well with limited training data

### Limitations:

- **Feature Independence Assumption**: Assumes words are independent (not always true)
- **Zero Frequency Problem**: Smoothing (the `alpha` parameter) is needed so that a word never seen with a class during training does not force that class's probability to zero
- **Context Ignorance**: Doesn't consider word order or context

### Security Applications:

This approach can be extended to:
- Email spam filtering systems
- Malicious URL detection
- Phishing email identification
- Content-based security filtering
- Social media spam detection

The combination of high accuracy, interpretability, and speed makes Naive Bayes an excellent baseline for text-based security applications.