In [1]:
"""
SST-2 BERT Baseline Replication
================================

This notebook trains a BERT-base model on the SST-2 sentiment classification dataset.

Key Notes:
- The model is fine-tuned for 1 epoch on SST-2.
- Fixed-length tokenization (max_length=64) is used to ensure stable batching.
- Validation accuracy is reported and used for analysis.
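- Test-set evaluation is intentionally skipped: in Google Colab it fails with a
  CUDA/Accelerate runtime error during post-training evaluation on GPU.
  A likely cause is that the GLUE SST-2 test split is distributed with
  placeholder labels (-1), which are not valid class indices for the loss,
  so a local test accuracy would not be meaningful in any case.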

The following test evaluation code is therefore commented out:

    # trainer.model.to("cpu")
    # test_results = trainer.evaluate(tokenized_dataset["test"])
    # print(test_results)

Validation results are sufficient for baseline replication and
subsequent interpretability (attention vs gradients) analysis.
"""

# =========================
# 1. Imports
# =========================

import numpy as np
import torch

from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)

import evaluate


# =========================
# 2. Load Dataset (SST-2)
# =========================

# Fetch the SST-2 configuration of the GLUE benchmark.
dataset = load_dataset(path="glue", name="sst2")

# Display the splits together with their columns and row counts.
print(dataset)


# =========================
# 3. Load Tokenizer
# =========================

# Tokenizer of the same "bert-base-uncased" checkpoint used throughout this notebook.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)


# =========================
# 4. Tokenization Function
# =========================
# Fixed-length padding avoids tensor shape mismatches during training/evaluation

def tokenize_function(example, max_length=64):
    """Tokenize SST-2 sentences into fixed-length model inputs.

    Args:
        example: Mapping with a "sentence" entry. The function is applied via
            ``dataset.map(..., batched=True)``, so this is a batch and
            ``example["sentence"]`` holds a list of strings.
        max_length: Length that every sequence is truncated or padded to.
            Defaults to 64, the value used for this baseline; exposed as a
            parameter so other lengths can be tried without editing the body.

    Returns:
        The output of the module-level ``tokenizer`` for the given sentences.
    """
    return tokenizer(
        example["sentence"],
        truncation=True,       # cut sequences longer than max_length
        padding="max_length",  # pad shorter ones so every row has equal length
        max_length=max_length,
    )


# Tokenize every split in batches, then tidy the columns in a single chain:
# drop the raw text and the example index, and expose the target as "labels".
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["sentence", "idx"])
    .rename_column("label", "labels")
)

# Return PyTorch tensors when the dataset is indexed.
tokenized_dataset.set_format("torch")


# =========================
# 5. Data Collator
# =========================

# Every sequence is already padded to a fixed length at tokenization time, so
# this collator mainly assembles the examples into batched tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# =========================
# 6. Load Model
# =========================

# The classification head is not part of the pretrained checkpoint and is
# randomly initialized. Trainer applies TrainingArguments.seed only when the
# Trainer object is constructed, which happens after the model is created, so
# PyTorch is seeded here to make the head's initial weights (and therefore the
# reported accuracy) repeatable across kernel restarts.
# 42 mirrors the default TrainingArguments seed.
torch.manual_seed(42)

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,  # two sentiment classes
)


# =========================
# 7. Metrics Function (Robust)
# =========================
# Handles different Hugging Face output formats safely

# Metric object used by compute_metrics below.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Accepts either a plain ``(logits, labels)`` tuple or an object exposing
    ``predictions`` and ``label_ids`` attributes. If the logits themselves
    arrive wrapped in a tuple, only the first element is used.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row.
    """
    if not isinstance(eval_pred, tuple):
        raw_outputs, gold_labels = eval_pred.predictions, eval_pred.label_ids
    else:
        raw_outputs, gold_labels = eval_pred

    # Keep only the leading element when several outputs were returned.
    scores = raw_outputs[0] if isinstance(raw_outputs, tuple) else raw_outputs

    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted_classes, references=gold_labels)


# =========================
# 8. Training Arguments
# =========================

# Hyperparameters for the single-epoch SST-2 baseline.
training_args = TrainingArguments(
    output_dir="./sst2_results",
    eval_strategy="epoch",           # evaluate at the end of every epoch
    save_strategy="no",              # do not write checkpoints
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=100,               # log the training loss every 100 steps
    report_to="none",                # no external experiment trackers
)


# =========================
# 9. Trainer
# =========================

# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer,
# which emits a FutureWarning and is scheduled for removal.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


# =========================
# 10. Train Model
# =========================

# Run fine-tuning with the arguments given to the Trainer.
trainer.train()


# =========================
# 11. Validation Results
# =========================

# Score the fine-tuned model on the validation split (the same split the
# Trainer was configured with) and print the resulting metrics.
val_results = trainer.evaluate(eval_dataset=tokenized_dataset["validation"])
print("SST-2 Validation Results:", val_results)


# =========================
# 12. Test Evaluation (INTENTIONALLY SKIPPED)
# =========================
# See explanation at top of notebook.
# NOTE(review): the GLUE test split is presumably unlabeled (placeholder
# labels), so evaluating on it would not give a usable accuracy -- confirm
# before re-enabling the lines below.

# trainer.model.to("cpu")
# test_results = trainer.evaluate(tokenized_dataset["test"])
# print("SST-2 Test Accuracy:", test_results)


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1535,0.249545,0.926606


SST-2 Validation Results: {'eval_loss': 0.24954521656036377, 'eval_accuracy': 0.926605504587156, 'eval_runtime': 3.2122, 'eval_samples_per_second': 271.462, 'eval_steps_per_second': 17.122, 'epoch': 1.0}
