In [None]:
# Prompt for the dataset through Colab's upload helper. The cell output below
# shows the file being saved as email_classification_dataset.csv.
from google.colab import files
uploaded = files.upload()

Saving email_classification_dataset.csv to email_classification_dataset.csv


In [None]:
# SPAM EMAIL DETECTION - SECTION 1: IMPORTS AND DATA LOADING

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# 1. DATA LOADING AND INSPECTION
DATA_PATH = 'email_classification_dataset.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Only ask for an upload when the CSV is not in the working directory yet.
    # Uploading again unconditionally makes Colab store a second copy under a
    # "(1)" suffix while read_csv keeps loading the first file, so the new
    # upload would be silently ignored.
    from google.colab import files
    uploaded = files.upload()
    df = pd.read_csv(DATA_PATH)

df.head()

Saving email_classification_dataset.csv to email_classification_dataset (1).csv


Unnamed: 0,id,email,label
0,2685,From: support@legitcompany.com\nSubject: Regar...,ham
1,5857,From: noreply@softwareupdates.com\nSubject: We...,ham
2,2399,From: noreply@softwareupdates.com\nSubject: Im...,ham
3,3244,From: info@customerservice.co\nSubject: Team S...,ham
4,2844,From: info@customerservice.co\nSubject: Team S...,ham


In [None]:
# SPAM EMAIL DETECTION - SECTION 2: DATA QUALITY CHECK

def basic_data_quality_check(df):
    """
    Perform comprehensive data quality checks
    """
    print("DATA QUALITY ANALYSIS")
    print("-" * 30)

    print(f"Total records: {len(df)}")
    print(f"Total features: {df.shape[1]}")

    # Missing values
    print(f"\nMissing Values:")
    missing_summary = df.isnull().sum()
    for col, missing_count in missing_summary.items():
        missing_pct = (missing_count / len(df)) * 100
        print(f"  {col}: {missing_count} ({missing_pct:.2f}%)")

    # Data types
    print(f"\nData Types:")
    for col, dtype in df.dtypes.items():
        print(f"  {col}: {dtype}")

    # Label distribution
    print(f"\nLabel Distribution:")
    label_counts = df['label'].value_counts()
    for label, count in label_counts.items():
        pct = (count / len(df)) * 100
        print(f"  {label}: {count} ({pct:.1f}%)")

    # Check for duplicates
    duplicates = df.duplicated().sum()
    print(f"\nDuplicate records: {duplicates}")

    # Sample data preview
    print(f"\nSample Records:")
    print(df.head(3).to_string())

    return {
        'missing_values': missing_summary,
        'label_distribution': label_counts,
        'duplicates': duplicates
    }

# Run the quality checks on the loaded dataset; the returned dict (missing
# values, label distribution, duplicate count) is kept in quality_report.
quality_report = basic_data_quality_check(df)

DATA QUALITY ANALYSIS
------------------------------
Total records: 10000
Total features: 3

Missing Values:
  id: 0 (0.00%)
  email: 0 (0.00%)
  label: 0 (0.00%)

Data Types:
  id: int64
  email: object
  label: object

Label Distribution:
  ham: 8500 (85.0%)
  spam: 1500 (15.0%)

Duplicate records: 0

Sample Records:
     id                                                                                                                                                                                                                                                                                                  email label
0  2685                                       From: support@legitcompany.com\nSubject: Regarding Your Recent Inquiry\n\nThank you for reaching out regarding [your inquiry]. We have reviewed your request and will get back to you within 24 hours with a detailed response. Sincerely, Customer Service Team   ham
1  5857  From: noreply@softwareupdates.com\nSubject: Weekly N

In [None]:
# SPAM EMAIL DETECTION - SECTION 3: FEATURE ENGINEERING

def extract_email_features(email_text):
    """
    Extract key features from a raw email string.

    The text is expected to contain a ``From:`` line and a ``Subject:`` line
    followed by the body. A header that cannot be found yields an empty string
    for that field; without a ``Subject:`` line the whole text counts as body.

    Parameters
    ----------
    email_text : str or missing value
        Raw email text. Missing values (None / NaN) are treated as an empty
        email; any other non-string value is converted with ``str()``.

    Returns
    -------
    dict
        'sender_domain', 'subject', 'body'            -> str
        'email_length', 'subject_length',
        'body_length', 'num_links', 'num_caps_words'  -> int
        'has_urgency_words', 'has_financial_words'    -> bool
    """
    if not isinstance(email_text, str):
        # An empty string yields exactly the all-empty / zero / False record
        # expected for a missing email, so no separate default dict is needed.
        email_text = '' if pd.isna(email_text) else str(email_text)

    # Extract sender domain. Text on the From line before the address (display
    # name, "<") is skipped, so "From: Jane <jane@corp.org>" is handled, and the
    # domain stops at the first character that cannot belong to a host name.
    sender_match = re.search(r'From:[^\n@]*?[^\s@<]+@([\w.-]+)', email_text)
    sender_domain = sender_match.group(1).rstrip('.') if sender_match else ''

    # Extract subject. Only spaces/tabs are skipped after the colon so that an
    # empty Subject line does not swallow the first line of the body.
    subject_match = re.search(r'Subject:[ \t]*([^\n]*)', email_text)
    subject = subject_match.group(1).strip() if subject_match else ''

    # Extract body (everything after the subject line)
    body_match = re.search(r'Subject:.*?\n\s*(.*)', email_text, re.DOTALL)
    body = body_match.group(1).strip() if body_match else email_text

    # Calculate lengths
    email_length = len(email_text)
    subject_length = len(subject)
    body_length = len(body)

    # Count hyperlinks. The optional "www." after the scheme is consumed with it
    # so that "http://www.site.com" counts as one link instead of two.
    num_links = len(re.findall(r'https?://(?:www\.)?|www\.|\[link', email_text, re.IGNORECASE))

    # Lower-case once; both keyword checks are case-insensitive substring tests
    lowered = email_text.lower()

    # Check for urgency words
    urgency_words = ('urgent', 'immediately', 'asap', 'expire', 'deadline', 'limited time', 'act now')
    has_urgency = any(word in lowered for word in urgency_words)

    # Check for financial words
    financial_words = ('payment', 'invoice', 'billing', 'account', 'credit', 'bank', 'money', '$')
    has_financial = any(word in lowered for word in financial_words)

    # Count words in ALL CAPS (potential spam indicator); two letters minimum
    num_caps_words = len(re.findall(r'\b[A-Z]{2,}\b', email_text))

    return {
        'sender_domain': sender_domain,
        'subject': subject,
        'body': body,
        'email_length': email_length,
        'subject_length': subject_length,
        'body_length': body_length,
        'num_links': num_links,
        'has_urgency_words': has_urgency,
        'has_financial_words': has_financial,
        'num_caps_words': num_caps_words
    }

def feature_engineering(df):
    """
    Apply feature engineering to the dataset.

    Runs ``extract_email_features`` on every value of the 'email' column and
    appends the resulting columns to a copy of ``df``. Progress is printed
    every 1000 emails.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with an 'email' column. Any index is accepted (for example a
        filtered or shuffled frame).

    Returns
    -------
    pandas.DataFrame
        ``df`` plus one column per extracted feature, with the same index and
        row order as ``df``.
    """
    print("FEATURE ENGINEERING")
    print("-" * 30)

    print("Extracting email features...")
    total = len(df)
    features_list = []

    # Count by position, not by index label: labels need not be consecutive
    # integers (or integers at all) once the frame has been filtered.
    for position, email in enumerate(df['email']):
        if position % 1000 == 0:
            print(f"  Processed {position}/{total} emails...")
        features_list.append(extract_email_features(email))

    # Convert to DataFrame, reusing df's index so the column-wise concat below
    # pairs each feature row with its own email. With a fresh RangeIndex,
    # concat would align on labels and produce NaN-padded, misaligned rows.
    features_df = pd.DataFrame(features_list, index=df.index)

    # Combine with original data
    enhanced_df = pd.concat([df, features_df], axis=1)

    print(f"Feature engineering complete!")
    print(f"New dataset shape: {enhanced_df.shape}")
    print(f"New features added: {list(features_df.columns)}")

    return enhanced_df

# Build the feature-enriched frame from the loaded dataset and preview its first rows.
enhanced_df = feature_engineering(df)
enhanced_df.head()

FEATURE ENGINEERING
------------------------------
Extracting email features...
  Processed 0/10000 emails...
  Processed 1000/10000 emails...
  Processed 2000/10000 emails...
  Processed 3000/10000 emails...
  Processed 4000/10000 emails...
  Processed 5000/10000 emails...
  Processed 6000/10000 emails...
  Processed 7000/10000 emails...
  Processed 8000/10000 emails...
  Processed 9000/10000 emails...
Feature engineering complete!
New dataset shape: (10000, 13)
New features added: ['sender_domain', 'subject', 'body', 'email_length', 'subject_length', 'body_length', 'num_links', 'has_urgency_words', 'has_financial_words', 'num_caps_words']


Unnamed: 0,id,email,label,sender_domain,subject,body,email_length,subject_length,body_length,num_links,has_urgency_words,has_financial_words,num_caps_words
0,2685,From: support@legitcompany.com\nSubject: Regar...,ham,legitcompany.com,Regarding Your Recent Inquiry,Thank you for reaching out regarding [your inq...,253,29,182,0,False,False,0
1,5857,From: noreply@softwareupdates.com\nSubject: We...,ham,softwareupdates.com,Weekly Newsletter - Latest Updates,Please find attached your invoice for the serv...,290,34,211,0,False,True,1
2,2399,From: noreply@softwareupdates.com\nSubject: Im...,ham,softwareupdates.com,Important: Software Update Notification,Thank you for your order #6789. Your items wil...,269,39,185,1,False,False,0
3,3244,From: info@customerservice.co\nSubject: Team S...,ham,customerservice.co,Team Stand-up at 10 AM,Please find attached your invoice for the serv...,274,22,211,0,False,True,2
4,2844,From: info@customerservice.co\nSubject: Team S...,ham,customerservice.co,Team Stand-up at 10 AM,Here's your weekly dose of news and updates fr...,241,22,178,1,False,False,1


In [None]:
# SPAM EMAIL DETECTION - SECTION 4: EXPLORATORY DATA ANALYSIS

def basic_statistics_analysis(df):
    """
    Print summary statistics for the feature-enriched email dataset.

    Sections, in order: mean/median/std/min/max per numerical feature, the ten
    most frequent sender domains, how often each boolean flag is set, and the
    spam vs ham mean of each numerical feature. Numerical and boolean columns
    that are absent from ``df`` are skipped; 'sender_domain' is read
    unconditionally and 'label' is read for every numerical column present.
    Nothing is returned.
    """
    numeric_cols = ('email_length', 'subject_length', 'body_length', 'num_links', 'num_caps_words')
    flag_cols = ('has_urgency_words', 'has_financial_words')
    row_count = len(df)

    def class_mean(label_value, col):
        # Mean of `col` restricted to the rows carrying the given label
        return df[df['label'] == label_value][col].mean()

    print("BASIC STATISTICS ANALYSIS")
    print("-" * 30)

    # --- descriptive statistics per numerical column ---
    print("Numerical Features Summary:")
    for col in numeric_cols:
        if col not in df.columns:
            continue
        values = df[col]
        print(f"\n{col}:")
        print(f"  Mean: {values.mean():.2f}")
        print(f"  Median: {values.median():.2f}")
        print(f"  Std: {values.std():.2f}")
        print(f"  Min: {values.min()}")
        print(f"  Max: {values.max()}")

    # --- most frequent sender domains ---
    print("\nTop 10 Sender Domains:")
    top_domains = df['sender_domain'].value_counts().head(10)
    for domain, occurrences in top_domains.items():
        print(f"  {domain}: {occurrences}")

    # --- how often each boolean flag is set ---
    print("\nBoolean Features:")
    for col in flag_cols:
        if col not in df.columns:
            continue
        hits = df[col].sum()
        share = (hits / row_count) * 100
        print(f"  {col}: {hits} ({share:.1f}%)")

    # --- per-class means of the numerical columns ---
    print("\nSPAM vs HAM Comparison:")
    for col in numeric_cols:
        if col not in df.columns:
            continue
        spam_avg = class_mean('spam', col)
        ham_avg = class_mean('ham', col)
        print(f"  {col}: Spam={spam_avg:.2f}, Ham={ham_avg:.2f}")

def compare_spam_ham_features(df):
    """
    Print a side-by-side comparison of spam and ham emails.

    Splits ``df`` on the 'label' column ('spam' / 'ham') and reports the class
    sizes, the percentage of emails with each boolean flag set (plus the gap in
    percentage points), and the mean and median of selected numerical features
    (plus the gap between the means). Nothing is returned.
    """
    print("\nDETAILED SPAM vs HAM ANALYSIS")
    print("-" * 30)

    labels = df['label']
    spam_rows = df[labels == 'spam']
    ham_rows = df[labels == 'ham']

    print(f"Spam emails: {len(spam_rows)}")
    print(f"Ham emails: {len(ham_rows)}")

    # Boolean flags: the mean of a True/False column is the share of True values
    for flag in ('has_urgency_words', 'has_financial_words'):
        spam_share = spam_rows[flag].mean() * 100
        ham_share = ham_rows[flag].mean() * 100
        gap = spam_share - ham_share
        print(f"\n{flag}:")
        print(f"  Spam: {spam_share:.1f}%")
        print(f"  Ham: {ham_share:.1f}%")
        print(f"  Difference: {gap:.1f} percentage points")

    # Numerical features: central tendency per class
    for col in ('email_length', 'subject_length', 'num_links', 'num_caps_words'):
        spam_values = spam_rows[col]
        ham_values = ham_rows[col]
        spam_avg, ham_avg = spam_values.mean(), ham_values.mean()
        spam_mid, ham_mid = spam_values.median(), ham_values.median()

        print(f"\n{col}:")
        print(f"  Spam - Mean: {spam_avg:.1f}, Median: {spam_mid:.1f}")
        print(f"  Ham - Mean: {ham_avg:.1f}, Median: {ham_mid:.1f}")
        print(f"  Mean difference: {spam_avg - ham_avg:.1f}")

def analyze_sender_domains(df):
    """
    Print sender-domain patterns for spam and ham emails.

    Lists the ten most frequent values of 'sender_domain' for each class and
    then, for a fixed set of keywords, how many spam and ham sender domains
    contain the keyword (case-insensitive; missing domains count as no match).
    Nothing is returned.
    """
    print("\nSENDER DOMAIN ANALYSIS")
    print("-" * 30)

    # Sender-domain column of each class
    spam_senders = df[df['label'] == 'spam']['sender_domain']
    ham_senders = df[df['label'] == 'ham']['sender_domain']

    # Ten most frequent domains per class
    top_spam = spam_senders.value_counts().head(10)
    top_ham = ham_senders.value_counts().head(10)

    print("Top 10 Spam Domains:")
    for domain, occurrences in top_spam.items():
        print(f"  {domain}: {occurrences}")

    print("\nTop 10 Ham Domains:")
    for domain, occurrences in top_ham.items():
        print(f"  {domain}: {occurrences}")

    def matches(senders, keyword):
        # Number of domains in `senders` that contain `keyword`, ignoring case
        return senders.str.contains(keyword, case=False, na=False).sum()

    # Keyword occurrence inside the domain names, per class
    for keyword in ('noreply', 'no-reply', 'updates', 'info'):
        spam_hits = matches(spam_senders, keyword)
        ham_hits = matches(ham_senders, keyword)
        print(f"\nDomains containing '{keyword}':")
        print(f"  Spam: {spam_hits}")
        print(f"  Ham: {ham_hits}")

# Run the three EDA reports on the feature-enriched frame; each one prints its findings.
basic_statistics_analysis(enhanced_df)
compare_spam_ham_features(enhanced_df)
analyze_sender_domains(enhanced_df)

BASIC STATISTICS ANALYSIS
------------------------------
Numerical Features Summary:

email_length:
  Mean: 246.84
  Median: 248.00
  Std: 38.44
  Min: 148
  Max: 342

subject_length:
  Mean: 31.85
  Median: 29.00
  Std: 6.68
  Min: 22
  Max: 51

body_length:
  Mean: 173.93
  Median: 178.00
  Std: 40.57
  Min: 86
  Max: 258

num_links:
  Mean: 0.56
  Median: 1.00
  Std: 0.50
  Min: 0
  Max: 1

num_caps_words:
  Mean: 0.35
  Median: 0.00
  Std: 0.53
  Min: 0
  Max: 2

Top 10 Sender Domains:
  softwareupdates.com: 915
  homemail.net: 881
  personalmail.net: 875
  info.org: 867
  billingcorp.com: 865
  projectmanagement.com: 862
  example.com: 843
  retailfeedback.com: 812
  legitcompany.com: 800
  customerservice.co: 780

Boolean Features:
  has_urgency_words: 689 (6.9%)
  has_financial_words: 3213 (32.1%)

SPAM vs HAM Comparison:
  email_length: Spam=188.65, Ham=257.11
  subject_length: Spam=40.78, Ham=30.27
  body_length: Spam=106.49, Ham=185.83
  num_links: Spam=0.90, Ham=0.50
  num_c