# Assignment 2: Transformer-Based Models

In [1]:
# required libraries
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from datasets import load_dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

## Task 1: Load and Inspect a Transformer Model



In [2]:
# Load the tokenizer and the model from one shared checkpoint name so the
# vocabulary and the weights always come from the same pretrained release.
model_name = "distilbert-base-uncased"
tokenizer, model = (
    loader.from_pretrained(model_name) for loader in (AutoTokenizer, AutoModel)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [5]:
print(f"MODEL: {model_name}")

# Total parameter count: add up the element count of every parameter tensor.
total_params = sum(tensor.numel() for tensor in model.parameters())

print(f"\nTotal Parameters: {total_params:,}")

MODEL: distilbert-base-uncased

Total Parameters: 66,362,880


In [6]:
# Model size in MB.
# Sum (element count * bytes per element) over every parameter tensor instead
# of assuming 4 bytes per parameter, so the figure stays correct if the model
# is loaded in a different dtype. This also keeps the cell independent of the
# `total_params` variable computed in another cell.
model_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
model_size_mb = model_size_bytes / (1024 ** 2)
print(f"Model Size: {model_size_mb:.2f} MB")

# Print model configuration
# NOTE(review): `n_layers`, `dim` and `n_heads` are the attribute names read
# from this model's config; other architectures may name them differently.
config = model.config
print(f"\nNumber of Layers: {config.n_layers}")
print(f"Hidden Size: {config.dim}")
print(f"Attention Heads: {config.n_heads}")
print(f"Max Sequence Length: {config.max_position_embeddings}")

Model Size: 253.15 MB

Number of Layers: 6
Hidden Size: 768
Attention Heads: 12
Max Sequence Length: 512


In [7]:
# Tokenizer information: vocabulary size and the token used for padding.
print(
    f"\nVocabulary Size: {tokenizer.vocab_size:,}",
    f"Padding Token: {tokenizer.pad_token}",
    sep="\n",
)


Vocabulary Size: 30,522
Padding Token: [PAD]


In [8]:
# Example tokenization of a single sentence.
example_text = "The transformer architecture revolutionized NLP!"

# `tokenize` yields the word pieces; `encode` yields the integer ids.
# In the recorded output the id list is two entries longer than the piece
# list (it starts with 101 and ends with 102), so the count printed below
# refers to the word pieces only.
tokens = tokenizer.tokenize(example_text)
token_ids = tokenizer.encode(example_text)

print(
    f"\nExample Text: {example_text}",
    f"Tokens: {tokens}",
    f"Token IDs: {token_ids}",
    f"Number of Tokens: {len(tokens)}",
    sep="\n",
)


Example Text: The transformer architecture revolutionized NLP!
Tokens: ['the', 'transform', '##er', 'architecture', 'revolution', '##ized', 'nl', '##p', '!']
Token IDs: [101, 1996, 10938, 2121, 4294, 4329, 3550, 17953, 2361, 999, 102]
Number of Tokens: 9


## Task 2: Load Dataset and Build Classification Pipeline


In [9]:
# Load the IMDb dataset and report how many examples each split holds.
dataset = load_dataset("imdb")

train_split = dataset["train"]
print(f"\nTrain samples: {len(train_split)}")
test_split = dataset["test"]
print(f"Test samples: {len(test_split)}")

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]


Train samples: 25000
Test samples: 25000


In [10]:
# Work on a smaller subset of each split for faster training.
train_size, test_size = 5000, 1000

# Shuffle with a fixed seed first so the selected head of each split is a
# reproducible random sample rather than the first rows of the raw data.
shuffled_train = dataset["train"].shuffle(seed=42)
train_dataset = shuffled_train.select(range(train_size))

shuffled_test = dataset["test"].shuffle(seed=42)
test_dataset = shuffled_test.select(range(test_size))

print(f"\nUsing {train_size} training samples")
print(f"Using {test_size} test samples")


Using 5000 training samples
Using 1000 test samples


In [11]:
# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to a fixed length of 256 tokens
    so all rows come out the same size.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw texts.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(examples["text"], **encode_options)

In [12]:
print("\nTokenizing dataset...")
# Apply the tokenizer batch-wise to the training split first, then the test
# split; each name is rebound to its tokenized version.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)


Tokenizing dataset...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
# Rename label column
train_dataset = train_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

In [14]:
# Set format for PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [15]:
# Create dataloaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

print(f"\nBatch Size: {batch_size}")
print(f"Training Batches: {len(train_loader)}")
print(f"Test Batches: {len(test_loader)}")


Batch Size: 16
Training Batches: 313
Test Batches: 63
