# Spam Classification using Encoder LLMs with Linear Probing 

**Dataset:** Enron Spam Dataset



In [1]:
# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
# AdamW is now imported from torch.optim
from torch.optim import AdamW
from torch import nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Load the Enron Spam dataset from Hugging Face
dataset = load_dataset("SetFit/enron_spam")

# Convert the 'train' and 'test' splits to pandas DataFrames
train_df = pd.DataFrame(dataset["train"])
test_df = pd.DataFrame(dataset["test"])

train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Verify dataset structure
print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Test set size:", len(test_df))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/176 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl:   0%|          | 0.00/101M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/6.27M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31716 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training set size: 28544
Validation set size: 3172
Test set size: 2000


In [None]:
# Load pre-trained encoder LLM (e.g., DistilBERT, BartEncoder)
chosen_llm_1 = "distilbert-base-uncased"
chosen_llm_2 = "huawei-noah/TinyBERT_General_4L_312D"

print(f"Chosen Encoder LLM 1: {chosen_llm_1}")
print(f"Chosen Encoder LLM 2: {chosen_llm_2}")


Chosen Encoder LLM 1: distilbert-base-uncased
Chosen Encoder LLM 2: huawei-noah/TinyBERT_General_4L_312D


In [6]:
from transformers import AutoTokenizer, AutoModel # Import AutoTokenizer from transformers
tokenizer_1 = AutoTokenizer.from_pretrained(chosen_llm_1)
tokenizer_2 = AutoTokenizer.from_pretrained(chosen_llm_2)

def tokenize_data(df, tokenizer):
    """Tokenize text data with padding and truncation."""
    return tokenizer(df['text'].tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt")

# Tokenize datasets for both LLMs
train_encodings_1 = tokenize_data(train_df, tokenizer_1)
val_encodings_1 = tokenize_data(val_df, tokenizer_1)
test_encodings_1 = tokenize_data(test_df, tokenizer_1)

train_encodings_2 = tokenize_data(train_df, tokenizer_2)
val_encodings_2 = tokenize_data(val_df, tokenizer_2)
test_encodings_2 = tokenize_data(test_df, tokenizer_2)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [7]:
import torch

In [8]:
import torch.nn as nn

In [None]:

from torch.utils.data import Dataset # Import Dataset from torch.utils.data

class SpamDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset_1 = SpamDataset(train_encodings_1, train_df['label'].tolist())
val_dataset_1 = SpamDataset(val_encodings_1, val_df['label'].tolist())
test_dataset_1 = SpamDataset(test_encodings_1, test_df['label'].tolist())

train_dataset_2 = SpamDataset(train_encodings_2, train_df['label'].tolist())
val_dataset_2 = SpamDataset(val_encodings_2, val_df['label'].tolist())
test_dataset_2 = SpamDataset(test_encodings_2, test_df['label'].tolist())


class SpamClassifier(nn.Module):
    def __init__(self, llm_name):
        super(SpamClassifier, self).__init__()
        self.llm = AutoModel.from_pretrained(llm_name)
        # Freeze LLM weights
        for param in self.llm.parameters():
            param.requires_grad = False
        # MLP classifier head
        self.classifier = nn.Sequential(
            nn.Linear(self.llm.config.hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.1),  # Add dropout for regularization
            nn.Linear(256, 2)  # Binary classification: spam (1) or not spam (0)
        )

    def forward(self, input_ids, attention_mask):
        # Get LLM outputs
        outputs = self.llm(input_ids=input_ids, attention_mask=attention_mask)
        # Use [CLS] token or pooler output
        if hasattr(outputs, 'last_hidden_state'):
            cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        else:
            cls_output = outputs.pooler_output  # For DistilBERT
        logits = self.classifier(cls_output)
        return logits

In [10]:
from torch.utils.data import DataLoader

In [11]:
from torch.optim import AdamW

In [12]:
from sklearn.metrics import accuracy_score

In [None]:

def train_model(model, train_dataset, val_dataset, epochs=3, batch_size=16, learning_rate=1e-3):
    """Train the classifier head while keeping LLM weights frozen."""
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    optimizer = AdamW(model.classifier.parameters(), lr=learning_rate)  # Optimize only classifier
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        val_labels = []
        val_preds = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask)
                preds = torch.argmax(outputs, dim=1)
                val_labels.extend(labels.cpu().numpy())
                val_preds.extend(preds.cpu().numpy())

        val_accuracy = accuracy_score(val_labels, val_preds)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}, Val Accuracy: {val_accuracy:.4f}")

    return model

epochs = 3
batch_size = 16
learning_rate = 1e-3

# Train model 1 (DistilBERT)
model_1 = SpamClassifier(chosen_llm_1)
print(f"\nTraining {chosen_llm_1}...")
model_1 = train_model(model_1, train_dataset_1, val_dataset_1, epochs, batch_size, learning_rate)

# Train model 2 (TinyBERT)
model_2 = SpamClassifier(chosen_llm_2)
print(f"\nTraining {chosen_llm_2}...")
model_2 = train_model(model_2, train_dataset_2, val_dataset_2, epochs, batch_size, learning_rate)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


Training distilbert-base-uncased...
Epoch 1/3, Loss: 0.1152, Val Accuracy: 0.9710
Epoch 2/3, Loss: 0.0755, Val Accuracy: 0.9827
Epoch 3/3, Loss: 0.0667, Val Accuracy: 0.9830


pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]


Training huawei-noah/TinyBERT_General_4L_312D...


model.safetensors:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Epoch 1/3, Loss: 0.3879, Val Accuracy: 0.8897
Epoch 2/3, Loss: 0.3299, Val Accuracy: 0.9158
Epoch 3/3, Loss: 0.3118, Val Accuracy: 0.9155


In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # Import necessary metrics

In [15]:
### ADD YOUR CODE HERE ###
# Evaluate the trained model on the test set
# Calculate and report accuracy, precision, recall, and F1-score
def evaluate_model(model, test_dataset, batch_size=16):
    """Evaluate model on test set and return metrics."""
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()
    test_labels = []
    test_preds = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)
            test_labels.extend(labels.cpu().numpy())
            test_preds.extend(preds.cpu().numpy())

    accuracy = accuracy_score(test_labels, test_preds)
    precision = precision_score(test_labels, test_preds)
    recall = recall_score(test_labels, test_preds)
    f1 = f1_score(test_labels, test_preds)
    return accuracy, precision, recall, f1

# Evaluate both models
print("\nEvaluating models...")
metrics_1 = evaluate_model(model_1, test_dataset_1)
metrics_2 = evaluate_model(model_2, test_dataset_2)

# Display results
print(f"\n{chosen_llm_1} Results:")
print(f"Accuracy: {metrics_1[0]:.4f}, Precision: {metrics_1[1]:.4f}, Recall: {metrics_1[2]:.4f}, F1: {metrics_1[3]:.4f}")

print(f"\n{chosen_llm_2} Results:")
print(f"Accuracy: {metrics_2[0]:.4f}, Precision: {metrics_2[1]:.4f}, Recall: {metrics_2[2]:.4f}, F1: {metrics_2[3]:.4f}")


Evaluating models...

distilbert-base-uncased Results:
Accuracy: 0.9880, Precision: 0.9843, Recall: 0.9921, F1: 0.9881

huawei-noah/TinyBERT_General_4L_312D Results:
Accuracy: 0.9100, Precision: 0.8670, Recall: 0.9702, F1: 0.9157


In [None]:
print("\n### Performance Comparison and Trend Discussion ###")
print(f"- {chosen_llm_1}:")
print(f"  - Accuracy: {metrics_1[0]:.4f}")
print(f"  - Precision: {metrics_1[1]:.4f}")
print(f"  - Recall: {metrics_1[2]:.4f}")
print(f"  - F1: {metrics_1[3]:.4f}")
print(f"- {chosen_llm_2}:")
print(f"  - Accuracy: {metrics_2[0]:.4f}")
print(f"  - Precision: {metrics_2[1]:.4f}")
print(f"  - Recall: {metrics_2[2]:.4f}")
print(f"  - F1: {metrics_2[3]:.4f}")

print("\nDiscussion:")
print(f"- {chosen_llm_1} (DistilBERT) has ~66M parameters, offering a balance between size and performance.")
print(f"- {chosen_llm_2} (TinyBERT) has ~14M parameters, making it more efficient but potentially less expressive.")
print("- Trends: DistilBERT may outperform TinyBERT due to its larger capacity, but TinyBERT could be faster and use less memory.")
print("- Actual performance depends on the dataset; adjust hyperparameters if needed.")


# Performance vs. Expected Metrics Discussion


expected_metrics = {"Accuracy": 0.85, "F1": 0.85, "Precision": 0.85, "Recall": 0.82}

# Determine best model
best_model = chosen_llm_1 if metrics_1[3] > metrics_2[3] else chosen_llm_2  # Based on F1 score
best_metrics = metrics_1 if best_model == chosen_llm_1 else metrics_2

print("\n### Performance vs. Expected Metrics Discussion ###")
print(f"Best Model: {best_model}")
print(f"- Accuracy: {best_metrics[0]:.4f} (Expected: >{expected_metrics['Accuracy']})")
print(f"- F1: {best_metrics[3]:.4f} (Expected: >{expected_metrics['F1']})")
print(f"- Precision: {best_metrics[1]:.4f} (Expected: >{expected_metrics['Precision']})")
print(f"- Recall: {best_metrics[2]:.4f} (Expected: >{expected_metrics['Recall']})")

meets_criteria = (best_metrics[0] > expected_metrics["Accuracy"] and
                  best_metrics[3] > expected_metrics["F1"] and
                  best_metrics[1] > expected_metrics["Precision"] and
                  best_metrics[2] > expected_metrics["Recall"])

print(f"\nMeets Expected Metrics: {'Yes' if meets_criteria else 'No'}")
print("Discussion:")
if meets_criteria:
    print("- The best model exceeds all expected performance thresholds, indicating effective feature extraction and classification.")
else:
    print("- The model falls short of some metrics. Consider increasing epochs, tuning learning rate, or using a larger LLM if memory allows.")


### Performance Comparison and Trend Discussion ###
- distilbert-base-uncased:
  - Accuracy: 0.9880
  - Precision: 0.9843
  - Recall: 0.9921
  - F1: 0.9881
- huawei-noah/TinyBERT_General_4L_312D:
  - Accuracy: 0.9100
  - Precision: 0.8670
  - Recall: 0.9702
  - F1: 0.9157

Discussion:
- distilbert-base-uncased (DistilBERT) has ~66M parameters, offering a balance between size and performance.
- huawei-noah/TinyBERT_General_4L_312D (TinyBERT) has ~14M parameters, making it more efficient but potentially less expressive.
- Trends: DistilBERT may outperform TinyBERT due to its larger capacity, but TinyBERT could be faster and use less memory.
- Actual performance depends on the dataset; adjust hyperparameters if needed.

### Performance vs. Expected Metrics Discussion ###
Best Model: distilbert-base-uncased
- Accuracy: 0.9880 (Expected: >0.85)
- F1: 0.9881 (Expected: >0.85)
- Precision: 0.9843 (Expected: >0.85)
- Recall: 0.9921 (Expected: >0.82)

Meets Expected Metrics: Yes
Discussion:
- 

In [18]:
print("\n### References ###")
print("- Hugging Face Transformers: https://huggingface.co/transformers/")
print("- PyTorch Documentation: https://pytorch.org/docs/stable/index.html")
print("- Scikit-learn Metrics: https://scikit-learn.org/stable/modules/model_evaluation.html")


### References ###
- Hugging Face Transformers: https://huggingface.co/transformers/
- PyTorch Documentation: https://pytorch.org/docs/stable/index.html
- Scikit-learn Metrics: https://scikit-learn.org/stable/modules/model_evaluation.html
