# Assignment 2: Transformer-Based Models

In [None]:
# required libraries
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

## Task 1: Load and Inspect a Transformer Model



In [None]:
# Load tokenizer and model
# Both are resolved from the same checkpoint name. `model` here is the AutoModel
# variant used for inspection in Task 1; the name `model` is rebound to an
# AutoModelForSequenceClassification instance later in the notebook.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
print(f"MODEL: {model_name}")

# Total number of scalar entries across every parameter tensor of the model
total_params = sum(weights.numel() for weights in model.parameters())

print(f"\nTotal Parameters: {total_params:,}")

In [None]:
# Model size in MB, estimated as 4 bytes per parameter divided by 1024**2.
# NOTE(review): the factor 4 presumably corresponds to float32 weights — confirm the model dtype.
# Relies on `total_params` computed in the previous cell.
model_size_mb = (total_params * 4) / (1024 ** 2)
print(f"Model Size: {model_size_mb:.2f} MB")

# Print model configuration
# The attribute names below (n_layers, dim, n_heads) are read directly from this
# model's config object; other architectures may expose different names — verify
# before reusing this cell with another checkpoint.
config = model.config
print(f"\nNumber of Layers: {config.n_layers}")
print(f"Hidden Size: {config.dim}")
print(f"Attention Heads: {config.n_heads}")
print(f"Max Sequence Length: {config.max_position_embeddings}")

In [None]:
# Tokenizer information
# Reports the vocabulary size and the padding token exposed by the tokenizer loaded above.
print(f"\nVocabulary Size: {tokenizer.vocab_size:,}")
print(f"Padding Token: {tokenizer.pad_token}")

In [None]:
# Example tokenization
# `tokens` holds the string pieces from tokenize(); `token_ids` holds the integer ids from encode().
example_text = "The transformer architecture revolutionized NLP!"
tokens = tokenizer.tokenize(example_text)
token_ids = tokenizer.encode(example_text)

print(f"\nExample Text: {example_text}")
print(f"Tokens: {tokens}")
print(f"Token IDs: {token_ids}")
# The count below is len(tokens), not len(token_ids).
# NOTE(review): encode() may add special tokens, so the two lengths can differ — confirm.
print(f"Number of Tokens: {len(tokens)}")

## Task 2: Load Dataset and Build Classification Pipeline


In [None]:
# Load IMDb dataset
# The result is indexed by split name below ("train" and "test").
dataset = load_dataset("imdb")
print(f"\nTrain samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

In [None]:
# Work on a small shuffled subset of each split so fine-tuning finishes faster
train_size, test_size = 5000, 1000

# Shuffle with a fixed seed, then keep the first `train_size` / `test_size` rows
train_dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(train_size))
)
test_dataset = (
    dataset["test"]
    .shuffle(seed=42)
    .select(range(test_size))
)

print(f"\nUsing {train_size} training samples")
print(f"Using {test_size} test samples")

### 1. Tokenize dataset



In [None]:
# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the raw review text(s).

    Returns:
        Whatever the module-level `tokenizer` returns when called with
        padding="max_length", truncation=True and max_length=256.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

In [None]:
# Apply tokenize_function to both subsets; batched=True hands it batches of rows
# rather than single rows. The names train_dataset / test_dataset are rebound to
# the mapped results, so this cell is not meant to be re-run on its own output.
print("\nTokenizing dataset...")
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

In [None]:
# Rename label column
# The training loop later reads batch["labels"], so the "label" column is renamed to match.
# Running this cell twice will fail on the second run because "label" no longer exists.
train_dataset = train_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

In [None]:
# Set format for PyTorch
# Only the three columns consumed by the training / evaluation loops are exposed.
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

### 2. Build PyTorch DataLoader

In [None]:
# Create dataloaders
# Only the training loader shuffles; the test loader keeps the dataset order.
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# len(loader) is the number of batches, not the number of samples
print(f"\nBatch Size: {batch_size}")
print(f"Training Batches: {len(train_loader)}")
print(f"Test Batches: {len(test_loader)}")

In [None]:
# Load model for classification
# Rebinds `model` (previously the AutoModel from Task 1) to a sequence-classification
# model built from the same checkpoint name, with two output labels.
num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [None]:
# Pick the compute device: use a CUDA GPU when PyTorch can see one, otherwise
# fall back to the CPU so the notebook still runs everywhere.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assigning the result keeps the model repr from being dumped as the cell output.
model = model.to(device)
print(f"Using device: {device}")

### 3. Fine-tune the transformer

In [None]:
# Training hyperparameters
# learning_rate is passed to the optimizer; epochs bounds the training loop
# and sets the x-axis range of the training-curve plots.
learning_rate = 2e-5
epochs = 3

In [None]:
# Optimizer
# AdamW over all parameters of the classification model, using the learning rate set above.
# Must be created after `model` is rebound to the classification model so it tracks the right parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

### 4. Log training loss, validation accuracy, and epoch time


In [None]:
# Per-epoch history filled in by the training loop:
# average training loss, validation accuracy, and wall-clock seconds.
train_losses, val_accuracies, epoch_times = [], [], []

In [None]:
# Training loop: one optimisation pass over train_loader per epoch, followed by
# an accuracy check on test_loader. Per-epoch results are appended to
# train_losses, val_accuracies and epoch_times.
for epoch in range(epochs):
    epoch_start = time.time()

    # Training phase
    model.train()
    total_train_loss = 0

    for batch_idx, batch in enumerate(train_loader):
        # Move batch to device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass; labels are passed in so the loss is available as outputs.loss
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Print progress every 50 batches
        if batch_idx % 50 == 0:
            print(f"Epoch {epoch+1}/{epochs} | Batch {batch_idx}/{len(train_loader)} | Loss: {batch_loss:.4f}")

    # Calculate average training loss (mean of the per-batch losses)
    avg_train_loss = total_train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation phase (accuracy on test_loader, no gradient tracking)
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=-1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    val_accuracy = correct / total
    val_accuracies.append(val_accuracy)

    # Epoch time covers both the training and the validation phase
    epoch_time = time.time() - epoch_start
    epoch_times.append(epoch_time)

    print(f"\nEpoch {epoch+1} Summary:")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Val Accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")
    print(f"  Time: {epoch_time:.2f}s\n")

Epoch 1/3 | Batch 0/313 | Loss: 0.7069
Epoch 1/3 | Batch 50/313 | Loss: 0.5330
Epoch 1/3 | Batch 100/313 | Loss: 0.4596
Epoch 1/3 | Batch 150/313 | Loss: 0.3216
Epoch 1/3 | Batch 200/313 | Loss: 0.4301


## Task 3: Evaluate & Interpret

In [None]:
# Final evaluation pass: gather every prediction / label pair and keep a record
# of each misclassified review for later inspection.
model.eval()

all_predictions, all_labels, misclassified = [], [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        predictions = logits.argmax(dim=-1)

        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # Positions within this batch where the prediction disagrees with the label
        wrong_rows = torch.nonzero(predictions != labels).flatten().tolist()
        for row in wrong_rows:
            # Rebuild readable text from the token ids; only the first 200 characters are kept
            decoded = tokenizer.decode(input_ids[row], skip_special_tokens=True)
            misclassified.append({
                "text": decoded[:200],
                "true_label": labels[row].item(),
                "predicted_label": predictions[row].item()
            })

In [None]:
# Calculate metrics
# All three are computed from the label / prediction lists gathered in the evaluation cell,
# with true labels passed first and predictions second.
accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)
conf_matrix = confusion_matrix(all_labels, all_predictions)

print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"F1 Score: {f1:.4f}")

# NOTE(review): presumably rows are true labels and columns are predictions — confirm against sklearn docs.
print(f"\nConfusion Matrix:")
print(conf_matrix)

In [None]:
# Print up to five of the misclassified reviews collected during evaluation
print(f"\nMisclassified Examples (First 5):")
for number, sample in enumerate(misclassified[:5], start=1):
    print(f"\nExample {number}:")
    print(f"Text: {sample['text']}...")
    print(f"True Label: {sample['true_label']} | Predicted: {sample['predicted_label']}")

### Plot Training Curves


In [None]:
# Two side-by-side panels: training loss (left) and validation accuracy (right).
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Loss curve
# The x-axis is sized from the recorded history rather than from `epochs`, so the
# plot still works when training was stopped before all epochs completed.
loss_ax.plot(range(1, len(train_losses) + 1), train_losses, 'b-o')
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training Loss over Epochs")
loss_ax.grid(True)

# Accuracy curve
acc_ax.plot(range(1, len(val_accuracies) + 1), val_accuracies, 'g-o')
acc_ax.set_xlabel("Epoch")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_title("Validation Accuracy over Epochs")
acc_ax.grid(True)

fig.tight_layout()
plt.show()

## Task 4: Model Efficiency Analysis


In [None]:
# Load second model for comparison
# Built with the same num_labels as the fine-tuned model, moved to the same device,
# and switched to eval mode. It is not fine-tuned anywhere in this notebook, so it
# is only used for the size / speed comparison below.
comparison_model_name = "bert-base-uncased"
comparison_model = AutoModelForSequenceClassification.from_pretrained(comparison_model_name, num_labels=num_labels)
comparison_model.to(device)
comparison_model.eval()

In [None]:
# Function to measure inference time
def measure_inference_time(model, dataloader, num_batches=20, device=None):
    """Return the mean forward-pass time per sample, in seconds.

    Args:
        model: Callable module accepting `input_ids` and `attention_mask` keyword arguments.
        dataloader: Iterable of batches; each batch is a mapping with
            "input_ids" and "attention_mask" tensors.
        num_batches: Maximum number of batches to time.
        device: Device to move each batch to. When None, the device of the
            model's first parameter is used (CPU if the model has no parameters).

    Returns:
        Mean over the timed batches of (batch forward time / batch size).
        NaN when no batch was timed (empty dataloader or num_batches <= 0).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    else:
        device = torch.device(device)
    on_cuda = device.type == "cuda"

    inference_times = []

    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            if idx >= num_batches:
                break

            # Host-to-device transfer happens before the timer starts
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # CUDA kernels run asynchronously; synchronize so the timer brackets the real work
            if on_cuda:
                torch.cuda.synchronize(device)
            start_time = time.perf_counter()
            model(input_ids=input_ids, attention_mask=attention_mask)
            if on_cuda:
                torch.cuda.synchronize(device)
            inference_time = time.perf_counter() - start_time

            # Normalise by batch size so a smaller final batch is comparable
            inference_times.append(inference_time / len(input_ids))

    if not inference_times:
        return float("nan")
    return np.mean(inference_times)

In [None]:
# Time both models on the same test batches (DistilBERT first, then BERT)
print("\nMeasuring inference times...")

distilbert_time, bert_time = (
    measure_inference_time(net, test_loader) for net in (model, comparison_model)
)

# Parameter counts, in the same model order
distilbert_params, bert_params = (
    sum(weights.numel() for weights in net.parameters())
    for net in (model, comparison_model)
)

# Sizes in MB, estimated at 4 bytes per parameter
distilbert_size = distilbert_params * 4 / (1024 ** 2)
bert_size = bert_params * 4 / (1024 ** 2)

In [None]:
# Print comparison
# Fixed-width table: a 30-character metric column followed by 20-character model columns.
print(f"\nModel Comparison:")
print(f"{'Metric':<30} {'DistilBERT':<20} {'BERT':<20}")
# Spec "<20," left-aligns in 20 columns with thousands separators
# (",<20" would instead left-align and pad the field with commas).
print(f"{'Parameters':<30} {distilbert_params:<20,} {bert_params:<20,}")
print(f"{'Model Size (MB)':<30} {distilbert_size:<20.2f} {bert_size:<20.2f}")
# Times are stored in seconds per sample; convert to milliseconds for display
print(f"{'Inference Time (ms)':<30} {distilbert_time*1000:<20.2f} {bert_time*1000:<20.2f}")
# Format the ratio together with its "x" suffix before padding, so the suffix stays next to the number
speedup = f"{bert_time / distilbert_time:.2f}x"
print(f"{'Speedup':<30} {speedup:<20}")
