In [1]:
import os
import tarfile
import urllib.request

# Function to download and extract datasets
def fetch_spam_datasets():
    # URL of the datasets
    url = "https://spamassassin.apache.org/old/publiccorpus/"

    # Directory to save the datasets
    data_dir = "spam_datasets"

    # Create directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    # List of datasets to download
    datasets = [
        "20021010_easy_ham.tar.bz2",
        "20021010_hard_ham.tar.bz2",
        "20021010_spam.tar.bz2",
    ]

    # Download and extract datasets
    for dataset in datasets:
        dataset_url = url + dataset
        dataset_path = os.path.join(data_dir, dataset)
        urllib.request.urlretrieve(dataset_url, dataset_path)
        with tarfile.open(dataset_path, "r:bz2") as tar:
            tar.extractall(data_dir)

# Step 1: Download and extract datasets
fetch_spam_datasets()

# Step 2: Load and preprocess datasets
def load_emails(data_dir):
    emails = []
    labels = []
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, "rb") as f:
                content = f.read().decode(errors="ignore")
                emails.append(content)
                labels.append(1 if "spam" in root else 0)
    return emails, labels

data_dir = "spam_datasets"
emails, labels = load_emails(data_dir)

# Step 3: Split datasets into training and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(emails, labels, test_size=0.2, random_state=42)

# Step 4: Data Preparation Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

data_prep_pipeline = Pipeline([
    ('vectorizer', CountVectorizer(
        lowercase=True,
        stop_words='english',
        strip_accents='unicode',
        token_pattern=r'\b\w\w+\b',  # Match words containing 2 or more alphanumeric characters
        max_df=0.95,  # Ignore terms that appear in more than 95% of the documents
        min_df=2,  # Ignore terms that appear in less than 2 documents
        max_features=1000  # Limit the number of features to 1000
    ))
])

# Step 5: Feature Vector Representation
X_train_features = data_prep_pipeline.fit_transform(X_train)
X_test_features = data_prep_pipeline.transform(X_test)

# Step 6: Classifier Selection and Training
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_features, y_train)

# Step 7: Evaluation
y_pred = naive_bayes_classifier.predict(X_test_features)
print(classification_report(y_test, y_pred))

# Step 8: Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.01, 0.1, 1.0]}  # Example hyperparameters for MultinomialNB
grid_search = GridSearchCV(naive_bayes_classifier, param_grid, cv=5)
grid_search.fit(X_train_features, y_train)
best_classifier = grid_search.best_estimator_
y_pred_tuned = best_classifier.predict(X_test_features)
print(classification_report(y_test, y_pred_tuned))


              precision    recall  f1-score   support

           1       1.00      1.00      1.00       661

    accuracy                           1.00       661
   macro avg       1.00      1.00      1.00       661
weighted avg       1.00      1.00      1.00       661

              precision    recall  f1-score   support

           1       1.00      1.00      1.00       661

    accuracy                           1.00       661
   macro avg       1.00      1.00      1.00       661
weighted avg       1.00      1.00      1.00       661

