In [11]:
#CHANGES DONE

# Added GPT-2 (tiny) model training loop
# Added stable average validation loss calculation
# Added train/validation split with short-text filtering for cleaner data
# Added model checkpoint saving and reloading for reproducibility

In [12]:
# Install dependencies
!pip install torch transformers datasets evaluate tqdm



In [13]:
# Imports
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling
)
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm.auto import tqdm
import math, torch, evaluate

In [14]:
#Load dataset (added train/val split + filtering)
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
train_ds = dataset["train"].filter(lambda x: len(x["text"].strip()) > 10)
val_ds = dataset["validation"]

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [15]:
# Tokenize and group text
def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=False)

train_ds = train_ds.map(tokenize_function, batched=True, remove_columns=["text"])
val_ds   = val_ds.map(tokenize_function, batched=True, remove_columns=["text"])

block_size = 128
def group_texts(examples):
    concatenated_inputs = sum(examples["input_ids"], [])
    concatenated_masks = sum(examples["attention_mask"], [])
    total_len = (len(concatenated_inputs) // block_size) * block_size
    concatenated_inputs = concatenated_inputs[:total_len]
    concatenated_masks = concatenated_masks[:total_len]
    result_input_ids = [concatenated_inputs[i:i+block_size] for i in range(0, total_len, block_size)]
    result_masks = [concatenated_masks[i:i+block_size] for i in range(0, total_len, block_size)]
    return {"input_ids": result_input_ids, "attention_mask": result_masks}

train_ds = train_ds.map(group_texts, batched=True, batch_size=100)
val_ds   = val_ds.map(group_texts, batched=True, batch_size=100)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, collate_fn=data_collator)
val_loader   = DataLoader(val_ds, batch_size=2, collate_fn=data_collator)


In [16]:
# Load and train GPT-2 (tiny)
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
model.resize_token_embeddings(len(tokenizer))
optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.train()
for step, batch in enumerate(tqdm(train_loader, total=20, desc="Training")):
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
print("✅ Training complete")

Training:   0%|          | 0/20 [00:00<?, ?it/s]

✅ Training complete


In [17]:
# Evaluate perplexity
model.eval()
total_loss = 0
for step, batch in enumerate(tqdm(val_loader, total=10, desc="Evaluating")):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        loss = model(**batch).loss
    total_loss += loss.item()

avg_loss = total_loss / 10
print(f"Average Validation Loss: {avg_loss:.4f}")

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

Average Validation Loss: 893.0775


In [18]:
# Save and reload checkpoint
save_path = "./checkpoints/tiny_gpt2_lab1"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
reloaded_model = AutoModelForCausalLM.from_pretrained(save_path)
print("✅ Model reloaded successfully!")

✅ Model reloaded successfully!
