# Spam Classification using Encoder LLMs with Linear Probing [5 points]
In this part, we will use encoder Large Language Models (LLMs) for spam classification. We will leverage the rich features of pre-trained LLMs without fine-tuning them. Instead, we will freeze the LLM weights and train a lightweight classifier head (MLP) on top for spam classification.

**Dataset:** Enron Spam Dataset

**Expected Performance (Best Model):** {Accuracy: >85%, F1: >85%, Precision: >85%, Recall: >82%}

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig
#!pip install datasets
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

MODEL_NAMES = ['distilbert-base-uncased', 'huawei-noah/TinyBERT_General_4L_312D']  # DistilBERT and TinyBERT
MAX_LENGTH = 256
BATCH_SIZE = 16
LEARNING_RATE = 2e-4
EPOCHS = 4

def tokenize_data(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def preprocess(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=MAX_LENGTH)

    raw_dataset = load_dataset("SetFit/enron_spam")
    raw_train_test = raw_dataset['train'].train_test_split(test_size=0.1, seed=42)
    dataset = DatasetDict({
        'train': raw_train_test['train'],
        'validation': raw_train_test['test'],
        'test': raw_dataset['test']
    })
    dataset = dataset.map(preprocess, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return dataset, tokenizer

class SpamClassifier(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        for param in self.base_model.parameters():
            param.requires_grad = False  # freeze base model

        hidden_size = self.base_model.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, 2)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        return self.classifier(cls_output)

def train_and_evaluate(model_name):
    dataset, tokenizer = tokenize_data(model_name)
    train_loader = DataLoader(dataset['train'], batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(dataset['validation'], batch_size=BATCH_SIZE)
    test_loader = DataLoader(dataset['test'], batch_size=BATCH_SIZE)
    model = SpamClassifier(model_name).to(device)
    optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=LEARNING_RATE)
    loss_fn = nn.CrossEntropyLoss()

    # Training 
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader):.4f}")

    # Evaluation
    def evaluate(loader):
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        acc = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds)
        recall = recall_score(all_labels, all_preds)
        f1 = f1_score(all_labels, all_preds)
        return acc, precision, recall, f1

    acc, precision, recall, f1 = evaluate(test_loader)
    print(f"Evaluation for {model_name}")
    print(f"Accuracy:  {acc * 100:.2f}%")
    print(f"Precision: {precision * 100:.2f}%")
    print(f"Recall:    {recall * 100:.2f}%")
    print(f"F1 Score:  {f1 * 100:.2f}%")

    return {
        'model': model_name,
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


Using device: cuda


In [8]:
results = []
for model_name in MODEL_NAMES:
    results.append(train_and_evaluate(model_name))

print("\nModel Comparison:")
for res in results:
    print(f"{res['model']}: Acc={res['accuracy']:.4f}, Prec={res['precision']:.4f}, Recall={res['recall']:.4f}, F1={res['f1']:.4f}")


Repo card metadata block was not found. Setting CardData to empty.
Map: 100%|██████████| 2000/2000 [00:00<00:00, 2101.42 examples/s]
Training Epoch 1: 100%|██████████| 1784/1784 [07:37<00:00,  3.90it/s]


Epoch 1 Loss: 0.1302


Training Epoch 2: 100%|██████████| 1784/1784 [07:55<00:00,  3.75it/s]


Epoch 2 Loss: 0.0869


Training Epoch 3: 100%|██████████| 1784/1784 [07:33<00:00,  3.93it/s]


Epoch 3 Loss: 0.0746


Training Epoch 4: 100%|██████████| 1784/1784 [07:33<00:00,  3.94it/s]


Epoch 4 Loss: 0.0679
Evaluation for distilbert-base-uncased
Accuracy:  98.45%
Precision: 97.75%
Recall:    99.21%
F1 Score:  98.47%


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Repo card metadata block was not found. Setting CardData to empty.
Map: 100%|██████████| 28544/28544 [00:18<00:00, 1518.66 examples/s]
Map: 100%|██████████| 3172/3172 [00:02<00:00, 1489.70 examples/s]
Map: 100%|██████████| 2000/2000 [00:01<00:00, 1487.84 examples/s]
Training Epoch 1: 100%|██████████| 1784/1784 [01:02<00:00, 28.40it/s]


Epoch 1 Loss: 0.4393


Training Epoch 2: 100%|██████████| 1784/1784 [01:02<00:00, 28.33it/s]


Epoch 2 Loss: 0.3548


Training Epoch 3: 100%|██████████| 1784/1784 [01:02<00:00, 28.45it/s]


Epoch 3 Loss: 0.3337


Training Epoch 4: 100%|██████████| 1784/1784 [01:02<00:00, 28.40it/s]


Epoch 4 Loss: 0.3208
Evaluation for huawei-noah/TinyBERT_General_4L_312D
Accuracy:  90.05%
Precision: 86.02%
Recall:    95.83%
F1 Score:  90.66%

Model Comparison:
distilbert-base-uncased: Acc=0.9845, Prec=0.9775, Recall=0.9921, F1=0.9847
huawei-noah/TinyBERT_General_4L_312D: Acc=0.9005, Prec=0.8602, Recall=0.9583, F1=0.9066


# References:
1. https://huggingface.co/docs/transformers/en/model_doc/distilbert
2. https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D
3. https://www.kdnuggets.com/distilbert-resource-efficient-natural-language-processing
