In [9]:
import hashlib
import os
import pickle
import string
import warnings

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
# Silence library warnings so the cell output stays readable.
warnings.filterwarnings('ignore')

# Fetch the NLTK resources used below: the English stopword list and the
# punkt tokenizer data ('punkt_tab' is the name newer NLTK versions need).
print("Downloading NLTK data...")
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Shared text-processing helpers consumed by preprocess_text().
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))
punctuations = [symbol for symbol in string.punctuation]

print("Text processing tools initialized successfully!")

Downloading NLTK data...
Text processing tools initialized successfully!
Text processing tools initialized successfully!


## Kaggle Ling-Spam Dataset Setup

**Dataset Information:**
- The Ling-Spam corpus contains emails from a linguistics mailing list
- Contains both legitimate emails (ham) and spam emails
- Dataset structure: CSV file with 'email_text' and 'label' columns

**Setup Instructions:**
1. Download the Ling-Spam dataset from Kaggle
2. Place the CSV file in the 'datasets/' directory
3. Ensure the CSV has columns: 'email_text' (text content) and 'label' (spam/ham)

**Alternative Dataset Sources:**
- Original Ling-Spam: http://www.aueb.gr/users/ion/data/lingspam_public.tar.gz
- Kaggle Ling-Spam variations
- Or create your own CSV with email_text and label columns

In [5]:
# --- Ling-Spam experiment settings -----------------------------------------
# CSV file holding the corpus; point this at your own copy of the dataset.
DATASET_PATH = 'datasets/lingspam_dataset.csv'
# Fraction of the emails used for training; the remainder is held out for testing.
TRAINING_SET_RATIO = 0.7

# Alternative (unused): the original directory-based Ling-Spam layout.
# SPAM_DIR = 'datasets/lingspam_public/spam-assassin/'
# HAM_DIR = 'datasets/lingspam_public/legitimate/'

print("Dataset path configured:", DATASET_PATH)
print("Training set ratio: %s" % TRAINING_SET_RATIO)

Dataset path configured: datasets/lingspam_dataset.csv
Training set ratio: 0.7


In [10]:
def preprocess_text(text):
    """
    Turn raw email text into a list of stemmed, alphabetic, non-stopword tokens.

    Args:
        text (str): Raw email text. Falsy values (None, '') and NaN are accepted.

    Returns:
        list: Stemmed tokens. Empty when the input is falsy/NaN or when two or
        fewer tokens remain after punctuation filtering.
    """
    # Nothing to tokenize for missing values.
    if not text or pd.isna(text):
        return []

    # Characters trimmed from both ends of every kept token.
    strip_chars = "".join(punctuations)

    # Lowercase, tokenize, drop one-character tokens and bare punctuation marks.
    cleaned = []
    for raw_token in nltk.word_tokenize(str(text).lower()):
        if len(raw_token) <= 1 or raw_token in punctuations:
            continue
        cleaned.append(raw_token.strip(strip_chars))

    # Messages with two or fewer remaining tokens are treated as empty.
    if len(cleaned) <= 2:
        return []

    # Keep purely alphabetic non-stopwords and reduce them to their stems.
    stems = []
    for word in cleaned:
        if word.isalpha() and word not in stopwords_set:
            stems.append(stemmer.stem(word))
    return stems

# Smoke-test the preprocessing pipeline on one short sentence and preview
# at most the first five resulting stems.
sample_text = "Hello! This is a test email with some words."
sample_tokens = preprocess_text(sample_text)
sample_preview = sample_tokens[:5]
print(f"Sample preprocessing: '{sample_text}' → {sample_preview}...")

Sample preprocessing: 'Hello! This is a test email with some words.' → ['hello', 'test', 'email', 'word']...


In [11]:
def load_lingspam_dataset(path=None):
    """
    Load the Ling-Spam dataset from CSV, or fall back to a built-in sample.

    Args:
        path (str, optional): CSV file to read. When omitted, the module-level
            DATASET_PATH is used (looked up at call time).

    Returns:
        pandas.DataFrame or None: The loaded CSV as-is when the file exists;
        an 8-row demonstration frame with 'email_text' and 'label' columns when
        it does not; None when any error occurs while loading (the error is
        printed rather than raised).
    """
    # Columns the rest of the notebook reads from the returned frame.
    required_columns = ('email_text', 'label')

    try:
        csv_path = DATASET_PATH if path is None else path

        # Try loading the CSV file
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path)
            print(f"✅ Loaded dataset from {csv_path}")
            print(f"Dataset shape: {df.shape}")
            print(f"Columns: {list(df.columns)}")

            # Warn early about a mismatched schema instead of letting a later
            # cell fail with a bare KeyError; the frame is still returned.
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                print(f"⚠️  Dataset is missing expected column(s): {missing_columns}")
            return df
        else:
            # Create a sample dataset for demonstration
            print(f"⚠️  Dataset not found at {csv_path}")
            print("Creating sample dataset for demonstration...")

            sample_data = {
                'email_text': [
                    "Dear colleague, I hope this email finds you well. We are organizing a linguistics conference next month.",
                    "URGENT!!! You have won $1,000,000!!! Click here now to claim your prize!!! Limited time offer!!!",
                    "The latest research on phonetics shows interesting patterns in vowel recognition systems.",
                    "FREE VIAGRA!!! Buy now with 90% discount!!! No prescription needed!!! Order today!!!",
                    "Thank you for your submission to the journal. We will review it and get back to you soon.",
                    "MAKE MONEY FAST!!! Work from home!!! Earn $5000 per week!!! No experience required!!!",
                    "The syntax paper you requested is attached. Please let me know if you need any clarifications.",
                    "CREDIT CARD DEBT FORGIVENESS!!! Eliminate your debt today!!! Government program!!!"
                ],
                'label': ['ham', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'spam']
            }

            df = pd.DataFrame(sample_data)
            print(f"Created sample dataset with {len(df)} emails")
            return df

    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"❌ Error loading dataset: {e}")
        return None

# Load the dataset
dataset = load_lingspam_dataset()

# load_lingspam_dataset() returns None on failure. Stop here with a clear
# message instead of letting a later cell fail while indexing None.
if dataset is None:
    raise RuntimeError(
        "Dataset could not be loaded - see the error message printed above."
    )

print(f"\n📊 Dataset loaded successfully!")
print(f"Total emails: {len(dataset)}")
print(f"Label distribution:")
print(dataset['label'].value_counts())

⚠️  Dataset not found at datasets/lingspam_dataset.csv
Creating sample dataset for demonstration...
Created sample dataset with 8 emails

📊 Dataset loaded successfully!
Total emails: 8
Label distribution:
label
ham     4
spam    4
Name: count, dtype: int64


In [12]:
# Convert labels to binary format (1 for ham, 0 for spam)
LABEL_TO_BINARY = {'ham': 1, 'spam': 0}

# Normalise case and surrounding whitespace so values such as 'Ham' or ' spam'
# map correctly instead of silently becoming NaN.
normalized_labels = dataset['label'].astype(str).str.strip().str.lower()
label_binary = normalized_labels.map(LABEL_TO_BINARY)

# Series.map leaves unmapped values as NaN; report them explicitly rather than
# letting the stratified split fail later with a less helpful error.
unmapped_mask = label_binary.isna()
if unmapped_mask.any():
    unknown_values = sorted(dataset.loc[unmapped_mask, 'label'].astype(str).unique())
    raise ValueError(
        f"Unrecognised label value(s) {unknown_values}; expected 'ham' or 'spam'."
    )

dataset['label_binary'] = label_binary

# Split the dataset into training and testing sets
X = dataset['email_text']
y = dataset['label_binary']

# stratify=y keeps the ham/spam proportions similar in both splits;
# random_state pins the shuffle so reruns give the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=(1 - TRAINING_SET_RATIO),
    random_state=42,
    stratify=y
)

print("📝 Dataset split completed:")
print(f"Training set: {len(X_train)} emails")
print(f"Testing set: {len(X_test)} emails")
print(f"Training spam/ham ratio: {sum(y_train == 0)}/{sum(y_train == 1)}")
print(f"Testing spam/ham ratio: {sum(y_test == 0)}/{sum(y_test == 1)}")

📝 Dataset split completed:
Training set: 5 emails
Testing set: 3 emails
Training spam/ham ratio: 2/3
Testing spam/ham ratio: 2/1


In [13]:
# Build blacklist from training data (cached on disk between runs)
BLACKLIST_CACHE_PATH = 'blacklist_lingspam.pkl'


def _training_fingerprint(texts, labels):
    """Return a SHA-256 hex digest of the ordered (label, text) training pairs."""
    digest = hashlib.sha256()
    for text, label in zip(texts, labels):
        # Separator bytes keep adjacent fields from running together.
        digest.update(f"{label}\x1f{text}\x1e".encode('utf-8', errors='replace'))
    return digest.hexdigest()


def _load_cached_blacklist(path, fingerprint):
    """Return the cached blacklist if it was built from the same training data, else None."""
    if not os.path.exists(path):
        return None
    try:
        # NOTE(security): pickle.load can execute arbitrary code. Only load a
        # cache file that this notebook wrote itself.
        with open(path, 'rb') as f:
            payload = pickle.load(f)
    except (OSError, EOFError, pickle.UnpicklingError) as e:
        print(f"⚠️  Could not read cached blacklist ({e}) - rebuilding...")
        return None

    # Older cache files stored a bare set with no fingerprint; those are
    # treated as stale because their training data cannot be verified.
    if isinstance(payload, dict) and payload.get('fingerprint') == fingerprint:
        return set(payload.get('blacklist', ()))

    print("⚠️  Cached blacklist is stale or in an older format - rebuilding...")
    return None


spam_words = set()
ham_words = set()

# The cache is only reused when it was built from exactly this training split;
# otherwise a blacklist from a different dataset would be silently applied.
train_fingerprint = _training_fingerprint(X_train, y_train)
blacklist = _load_cached_blacklist(BLACKLIST_CACHE_PATH, train_fingerprint)

if blacklist is None:
    print("🔍 Building blacklist from training data...")

    # Process training emails
    for idx, (email_text, label) in enumerate(zip(X_train, y_train)):
        if idx % 50 == 0:  # Progress indicator
            print(f"Processing email {idx + 1}/{len(X_train)}")

        # Preprocess the email text
        tokens = preprocess_text(email_text)

        if tokens:  # Only process if we got valid tokens
            if label == 1:  # Ham email
                ham_words.update(tokens)
            else:  # Spam email
                spam_words.update(tokens)

    # Create blacklist: words that appear in spam but not in ham
    blacklist = spam_words - ham_words

    # Save the blacklist together with the fingerprint of its training data
    with open(BLACKLIST_CACHE_PATH, 'wb') as f:
        pickle.dump({'fingerprint': train_fingerprint, 'blacklist': blacklist}, f)

    print("✅ Blacklist created and saved!")
    print(f"Total spam words: {len(spam_words)}")
    print(f"Total ham words: {len(ham_words)}")
    print(f"Blacklist size: {len(blacklist)} words")

else:
    # spam_words / ham_words stay empty here: only the final blacklist is cached.
    print("✅ Blacklist loaded from file!")
    print(f"Blacklist size: {len(blacklist)} words")

# Show some example blacklist words
if blacklist:
    print(f"\n📋 Sample blacklist words: {list(blacklist)[:10]}")

🔍 Building blacklist from training data...
Processing email 1/5
✅ Blacklist created and saved!
Total spam words: 18
Total ham words: 29
Blacklist size: 17 words

📋 Sample blacklist words: ['fast', 'prescript', 'viagra', 'free', 'buy', 'make', 'experi', 'home', 'work', 'today']


In [14]:
# Sanity-check the blacklist against NLTK's English word list.
# Blacklist entries are stems produced by preprocess_text(), so a stem that is
# not itself a dictionary word is counted under "Non-English/specialized terms".
try:
    from nltk.corpus import words
    nltk.download('words', quiet=True)
    english_words = set(words.words())

    # Blacklist entries that also appear in the dictionary.
    english_blacklist = blacklist & english_words

    total_terms = len(blacklist)
    dictionary_hits = len(english_blacklist)

    print(f"🔤 Blacklist analysis:")
    print(f"Total blacklist words: {total_terms}")
    print(f"Valid English words in blacklist: {dictionary_hits}")
    print(f"Non-English/specialized terms: {total_terms - dictionary_hits}")

    if english_blacklist:
        print(f"\n📝 Sample English words in blacklist: {list(english_blacklist)[:10]}")

except Exception as err:
    # The analysis is optional; report the problem and let the notebook continue.
    print(f"⚠️  Could not analyze blacklist against English dictionary: {err}")
    print("Proceeding with blacklist-based classification...")

🔤 Blacklist analysis:
Total blacklist words: 17
Valid English words in blacklist: 14
Non-English/specialized terms: 3

📝 Sample English words in blacklist: ['fast', 'prescript', 'make', 'free', 'buy', 'home', 'work', 'today', 'discount', 'earn']


In [15]:
# Test the blacklist classifier on the test set
print("🧪 Testing blacklist classifier...")

# Confusion-matrix counters; spam is the positive class.
fp = 0  # False Positive: Ham classified as Spam
tp = 0  # True Positive: Spam classified as Spam
fn = 0  # False Negative: Spam classified as Ham
tn = 0  # True Negative: Ham classified as Ham
skipped = 0  # Emails that produced no tokens and were left unclassified

# Test each email in the test set
for email_text, true_label in zip(X_test, y_test):
    tokens = preprocess_text(email_text)

    if not tokens:
        # Unclassifiable emails are excluded from the metrics but counted,
        # so a gap between the totals and the test-set size is explained.
        skipped += 1
        continue

    # An email is flagged as spam when it shares at least one stem with the blacklist.
    flagged_as_spam = not blacklist.isdisjoint(tokens)
    is_ham = (true_label == 1)

    if flagged_as_spam:
        if is_ham:
            fp += 1
        else:
            tp += 1
    else:
        if is_ham:
            tn += 1
        else:
            fn += 1

print("📊 Classification completed!")
print(f"True Positives (Spam → Spam): {tp}")
print(f"True Negatives (Ham → Ham): {tn}")
print(f"False Positives (Ham → Spam): {fp}")
print(f"False Negatives (Spam → Ham): {fn}")

total_predictions = tp + tn + fp + fn
print(f"Total predictions: {total_predictions}")
if skipped:
    print(f"⚠️  Skipped {skipped} email(s) with no usable tokens (not counted above)")

🧪 Testing blacklist classifier...
📊 Classification completed!
True Positives (Spam → Spam): 1
True Negatives (Ham → Ham): 1
False Positives (Ham → Spam): 0
False Negatives (Spam → Ham): 1
Total predictions: 3


In [16]:
# Render the raw confusion-matrix counts as an HTML table plus a text legend
from IPython.display import HTML, display

for header_line in (
    "📈 Confusion Matrix (Raw Counts):",
    "Predicted →",
    "Actual ↓     Ham    Spam",
):
    print(header_line)

# Rows are actual classes, columns are predicted classes.
conf_matrix = [
    ['Ham', tn, fp],
    ['Spam', fn, tp]
]

# Assemble the table markup from its individual rows.
table_rows = [
    "<tr><th></th><th>Predicted Ham</th><th>Predicted Spam</th></tr>",
    f"<tr><td><b>Actual Ham</b></td><td>{tn}</td><td>{fp}</td></tr>",
    f"<tr><td><b>Actual Spam</b></td><td>{fn}</td><td>{tp}</td></tr>",
]
html_table = (
    "<table border='1' style='border-collapse: collapse;'>"
    + "".join(table_rows)
    + "</table>"
)

display(HTML(html_table))

print(f"\n🎯 Interpretation:")
for legend_line in (
    f"• True Negatives (TN): {tn} - Ham emails correctly identified as Ham",
    f"• False Positives (FP): {fp} - Ham emails incorrectly identified as Spam",
    f"• False Negatives (FN): {fn} - Spam emails incorrectly identified as Ham",
    f"• True Positives (TP): {tp} - Spam emails correctly identified as Spam",
):
    print(legend_line)

📈 Confusion Matrix (Raw Counts):
Predicted →
Actual ↓     Ham    Spam


Unnamed: 0,Predicted Ham,Predicted Spam
Actual Ham,1,0
Actual Spam,1,1



🎯 Interpretation:
• True Negatives (TN): 1 - Ham emails correctly identified as Ham
• False Positives (FP): 0 - Ham emails incorrectly identified as Spam
• False Negatives (FN): 1 - Spam emails incorrectly identified as Ham
• True Positives (TP): 1 - Spam emails correctly identified as Spam


In [17]:
# Show the confusion matrix as shares of all predictions, then summary metrics
count = tn + tp + fn + fp

if count <= 0:
    print("❌ No predictions made - check dataset and preprocessing")
else:
    print("📊 Confusion Matrix (Percentages):")

    # Each cell as a fraction of all predictions, formatted to one decimal place.
    tn_pct, fp_pct, fn_pct, tp_pct = (
        format(cell / count, ".1%") for cell in (tn, fp, fn, tp)
    )

    # Same layout as the raw-count table: actual classes in rows, predicted in columns.
    html_table_pct = "".join([
        "<table border='1' style='border-collapse: collapse;'>",
        "<tr><th></th><th>Predicted Ham</th><th>Predicted Spam</th></tr>",
        f"<tr><td><b>Actual Ham</b></td><td>{tn_pct}</td><td>{fp_pct}</td></tr>",
        f"<tr><td><b>Actual Spam</b></td><td>{fn_pct}</td><td>{tp_pct}</td></tr>",
        "</table>",
    ])

    display(HTML(html_table_pct))

    # Spam is the positive class; a ratio with an empty denominator falls back to 0.
    predicted_spam = tp + fp
    actual_spam = tp + fn

    accuracy = (tp + tn) / count
    precision = tp / predicted_spam if predicted_spam > 0 else 0
    recall = tp / actual_spam if actual_spam > 0 else 0
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    print(f"\n📈 Performance Metrics:")
    for metric_line in (
        f"• Accuracy: {accuracy:.1%} - Overall correct predictions",
        f"• Precision: {precision:.1%} - Of predicted spam, how much was actually spam",
        f"• Recall: {recall:.1%} - Of actual spam, how much was detected",
        f"• F1-Score: {f1_score:.3f} - Harmonic mean of precision and recall",
    ):
        print(metric_line)

📊 Confusion Matrix (Percentages):


Unnamed: 0,Predicted Ham,Predicted Spam
Actual Ham,33.3%,0.0%
Actual Spam,33.3%,33.3%



📈 Performance Metrics:
• Accuracy: 66.7% - Overall correct predictions
• Precision: 100.0% - Of predicted spam, how much was actually spam
• Recall: 50.0% - Of actual spam, how much was detected
• F1-Score: 0.667 - Harmonic mean of precision and recall


In [18]:
# Closing report: headline accuracy, run facts, a verdict and follow-up ideas
banner = "=" * 70

print("🎉 Spam Fighting with Blacklist - Ling-Spam Dataset Analysis Complete!")
print(banner)

if count <= 0:
    # No email was classified, so there is nothing to summarise.
    print("❌ Classification failed - please check your dataset and try again.")
else:
    accuracy = (tp + tn) / count
    print(f"🎯 Final Classification Accuracy: {accuracy:.1%}")

    print("\n💡 Key Insights:")
    for insight in (
        f"• Dataset size: {len(dataset)} emails",
        f"• Training set: {len(X_train)} emails",
        f"• Test set: {len(X_test)} emails",
        f"• Blacklist words: {len(blacklist)} unique terms",
        "• Method: Simple blacklist-based classification",
    ):
        print(insight)

    # Bucket the accuracy into a verdict (thresholds: above 0.8, above 0.6, otherwise).
    if accuracy > 0.8:
        verdict = "✅ Good performance! The blacklist method works well on this dataset."
    elif accuracy > 0.6:
        verdict = "⚠️  Moderate performance. Consider improving the blacklist or using additional features."
    else:
        verdict = "❌ Poor performance. The blacklist method may not be sufficient for this dataset."
    print(verdict)

    print("\n🔮 Next Steps:")
    for suggestion in (
        "• Try different feature engineering techniques",
        "• Experiment with machine learning classifiers (Naive Bayes, SVM)",
        "• Use TF-IDF features instead of simple word presence",
        "• Analyze false positives and false negatives to improve the blacklist",
    ):
        print(suggestion)

print(banner)

🎉 Spam Fighting with Blacklist - Ling-Spam Dataset Analysis Complete!
🎯 Final Classification Accuracy: 66.7%

💡 Key Insights:
• Dataset size: 8 emails
• Training set: 5 emails
• Test set: 3 emails
• Blacklist words: 17 unique terms
• Method: Simple blacklist-based classification
⚠️  Moderate performance. Consider improving the blacklist or using additional features.

🔮 Next Steps:
• Try different feature engineering techniques
• Experiment with machine learning classifiers (Naive Bayes, SVM)
• Use TF-IDF features instead of simple word presence
• Analyze false positives and false negatives to improve the blacklist


## Optional: Create Sample Dataset

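If you don't have access to the Kaggle Ling-Spam dataset, you can create a sample dataset for testing purposes. The cell below only defines the generator function; to actually write the sample CSV file, uncomment the `create_sample_lingspam_dataset()` call at the end of the cell and run it.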

In [None]:
# Create a larger sample dataset for demonstration
def create_sample_lingspam_dataset(filename='datasets/lingspam_dataset.csv', num_emails=100):
    """
    Create a synthetic Ling-Spam-style CSV for demonstration purposes.

    Emails are generated in spam/ham pairs by cycling through ten template
    messages per class; every pair after the first gets a numbered suffix so
    the texts differ.

    Args:
        filename (str): Destination CSV path. Its parent directory is created
            if it does not exist.
        num_emails (int): Requested number of emails. Because emails are added
            in pairs, an odd value produces ``num_emails - 1`` rows.

    Returns:
        pandas.DataFrame: The generated data with 'email_text' and 'label'
        columns (the same content that is written to ``filename``).
    """
    # Create the directory that will hold `filename`. A bare file name has no
    # directory component and is written to the current working directory.
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Sample spam emails (typical spam characteristics)
    spam_samples = [
        "URGENT!!! You have won $1,000,000!!! Click here now to claim your prize!!! Limited time offer!!!",
        "FREE VIAGRA!!! Buy now with 90% discount!!! No prescription needed!!! Order today!!!",
        "MAKE MONEY FAST!!! Work from home!!! Earn $5000 per week!!! No experience required!!!",
        "CREDIT CARD DEBT FORGIVENESS!!! Eliminate your debt today!!! Government program!!!",
        "HOT SINGLES IN YOUR AREA!!! Meet them tonight!!! No strings attached!!!",
        "LOSE 30 POUNDS IN 30 DAYS!!! Revolutionary diet pill!!! Doctor approved!!!",
        "WIN A FREE IPHONE!!! Click now!!! Limited time offer!!! Act fast!!!",
        "WORK FROM HOME!!! Earn $3000/week!!! No experience needed!!! Start today!!!",
        "FREE MONEY!!! Government grants available!!! Claim yours now!!! No repayment!!!",
        "MIRACLE CURE!!! Lose weight without diet or exercise!!! 100% guaranteed!!!"
    ]

    # Sample ham emails (legitimate linguistics/academic content)
    ham_samples = [
        "Dear colleague, I hope this email finds you well. We are organizing a linguistics conference next month.",
        "The latest research on phonetics shows interesting patterns in vowel recognition systems.",
        "Thank you for your submission to the journal. We will review it and get back to you soon.",
        "The syntax paper you requested is attached. Please let me know if you need any clarifications.",
        "Could you please review the manuscript on morphological analysis? Your expertise would be valuable.",
        "The linguistics department is hosting a seminar on computational linguistics next Friday.",
        "I found your paper on semantic analysis very insightful. Would you be interested in collaboration?",
        "The conference proceedings are now available online. Thank you for your participation.",
        "Please find attached the corrected version of the phoneme classification algorithm.",
        "The workshop on natural language processing has been scheduled for next month."
    ]

    # Generate dataset
    emails = []
    labels = []

    # One spam and one ham email per iteration, cycling through the templates.
    for i in range(num_emails // 2):
        # Add spam email (with some variation)
        spam_email = spam_samples[i % len(spam_samples)]
        if i > 0:  # The first pair keeps the unmodified template text
            spam_email += f" Email #{i + 1}. Reference code: SPAM{i:03d}."
        emails.append(spam_email)
        labels.append('spam')

        # Add ham email (with some variation)
        ham_email = ham_samples[i % len(ham_samples)]
        if i > 0:  # The first pair keeps the unmodified template text
            ham_email += f" Email reference: HAM{i:03d}. Best regards, Academic Team."
        emails.append(ham_email)
        labels.append('ham')

    # Create DataFrame
    df = pd.DataFrame({
        'email_text': emails,
        'label': labels
    })

    # Save to CSV
    df.to_csv(filename, index=False)
    print(f"✅ Sample dataset created: {filename}")
    print(f"📊 Contains {len(df)} emails ({len(df[df['label']=='spam'])} spam, {len(df[df['label']=='ham'])} ham)")

    return df

# Uncomment the line below to create a sample dataset
# sample_df = create_sample_lingspam_dataset()

print("💡 To create a sample dataset, uncomment the line above and run this cell.")

## ✅ Fixes Applied - Notebook Successfully Updated!

### **Errors fixed (they surfaced in the `preprocess_text` cell):**

1. **Missing pandas import**: Added `import pandas as pd` to the imports
2. **Missing NLTK data**: Added `nltk.download('punkt_tab')` for newer NLTK versions
3. **Complete import reorganization**: All required imports now properly organized

### **Key Features Working:**
- ✅ Text preprocessing with stemming and stopword removal
- ✅ Dataset loading (with sample data generation if file not found)
- ✅ Train/test split with stratification
- ✅ Blacklist creation from training data
- ✅ Spam classification using blacklist approach
- ✅ Performance evaluation with confusion matrix and metrics

### **Notebook converted for Kaggle Ling-Spam Dataset:**
- 🔄 **Original**: TREC 2007 Spam Corpus (email files + labels file)
- 🔄 **Updated**: Kaggle Ling-Spam Dataset (CSV format with email_text and label columns)
- 🔄 **Fallback**: Sample dataset generation for demonstration

The notebook now successfully demonstrates spam filtering using blacklist methods with modern Python and NLTK compatibility!