# CS288 Assignment 2 - Part 4: Training & Evaluation

## Complete Pipeline for Part 4A & 4B

This notebook will guide you through:
- **Part 4A**: Pre-training a language model and fine-tuning for multiple-choice QA
- **Part 4B**: Using prompting to solve MCQA

### Required Deliverables:
- `finetuned_predictions.json` (Part 4A)
- `prompting_predictions.json` (Part 4B)

### Instructions:
1. Push your code to a GitHub repository and set your username and repo name in the setup cell (Section 1)
2. Run all cells in order
3. Download the generated JSON files at the end

---

**Estimated Runtime:**
- Quick config: ~5-10 minutes
- Small config: ~15-30 minutes  
- Medium config: ~1-2 hours

**GPU Recommendation:** Enable GPU via `Runtime → Change runtime type → GPU (T4)`

## 1. Setup: Clone Your Code from GitHub

**Before running this cell:**
1. Push your code to GitHub, then set `GITHUB_USERNAME` and `REPO_NAME` in the cell below
2. Make your repo public OR generate a personal access token for private repos

In [None]:
import os

# CHANGE THIS to your GitHub username and repo name
GITHUB_USERNAME = "shanayamalik"
REPO_NAME = "cs288-sp26-a2"

# Clone your repository
!git clone https://github.com/{GITHUB_USERNAME}/{REPO_NAME}.git /content/cs288-sp26-a2

# Change to project directory
%cd /content/cs288-sp26-a2

print("‚úÖ Repository cloned successfully!")
!ls -la

## 2. Install Dependencies

In [None]:
!pip install -q torch tiktoken datasets

import sys
import torch
print(f"Python: {sys.version}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 3. Test Imports

In [None]:
# Import smoke test: pull in every project module the later cells rely on, so
# a missing or broken part fails here rather than midway through training.
import sys
# Put the cloned repository first on the module search path so the
# part1..part4 packages below resolve to the checkout.
sys.path.insert(0, '/content/cs288-sp26-a2')

# Part 1: BPE tokenizer training and tokenizer construction
from part1.tokenizer import get_tokenizer
from part1.train_bpe import train_bpe

# Part 2: Transformer language model
from part2.model import TransformerLM

# Part 3: loss / optimization utilities
from part3.nn_utils import cross_entropy, gradient_clipping, token_accuracy, perplexity

# Part 4: dataloaders, trainer, sampling, QA model and prompting pipeline
from part4.datasets import create_pretraining_dataloader, create_qa_dataloader
from part4.trainer import Trainer, TrainingConfig
from part4.sampling import generate_text, greedy_decode, top_k_decode
from part4.qa_model import TransformerForMultipleChoice, evaluate_qa_model
from part4.prompting import PromptTemplate, PromptingPipeline, evaluate_prompting

# Reached only if every import above succeeded.
print("‚úÖ All imports successful!")

## 4. Configuration

**Adjust these settings based on your compute budget:**

- **quick**: Fast testing (~5-10 min) - Small model, small data
- **small**: Basic training (~15-30 min) - 10k stories, ~10M params
- **medium**: Better quality (~1-2 hours) - 10k stories, ~50M params

For submission, recommend using at least **small** configuration.

In [None]:
from pathlib import Path

# Choose configuration: "quick", "small", or "medium"
CONFIG_NAME = "small"  # <-- CHANGE THIS

# Hyperparameter presets keyed by name. Key order is kept stable because the
# selected preset is printed entry by entry below.
CONFIGS = {
    "quick": dict(
        vocab_size=512,
        d_model=128,
        num_layers=4,
        num_heads=4,
        d_ff=512,
        context_length=256,
        pretrain_epochs=2,
        finetune_epochs=5,
        batch_size=32,
        lr=1e-3,
        num_stories=1000,  # For TinyStories download
    ),
    "small": dict(
        vocab_size=4096,
        d_model=256,
        num_layers=6,
        num_heads=8,
        d_ff=1024,
        context_length=512,
        pretrain_epochs=3,
        finetune_epochs=10,
        batch_size=32,
        lr=3e-4,
        num_stories=10000,
    ),
    "medium": dict(
        vocab_size=8192,
        d_model=512,
        num_layers=8,
        num_heads=8,
        d_ff=2048,
        context_length=512,
        pretrain_epochs=5,
        finetune_epochs=15,
        batch_size=16,
        lr=1e-4,
        num_stories=10000,
    ),
}

# Active preset and compute device used by every later cell.
CONFIG = CONFIGS[CONFIG_NAME]
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Rough size estimate in millions: 12 * d_model^2 per layer. The formula uses
# only d_model and num_layers, so it is an approximation, not an exact count.
approx_params_millions = CONFIG['d_model']**2 * CONFIG['num_layers'] * 12 / 1e6

print(f"Using configuration: {CONFIG_NAME}")
print(f"Device: {DEVICE}")
print(f"Model will have ~{approx_params_millions:.1f}M parameters")
print("\nConfig details:")
for setting, setting_value in CONFIG.items():
    print(f"  {setting}: {setting_value}")

## 5. Download TinyStories Dataset

In [None]:
from datasets import load_dataset
import os

# Make sure the fixtures directory exists before anything is written into it.
os.makedirs('/content/cs288-sp26-a2/part4/fixtures', exist_ok=True)

print("Downloading TinyStories dataset...")
num_stories = CONFIG["num_stories"]
# The split string requests the first `num_stories` examples of the train split.
dataset = load_dataset("roneneldan/TinyStories", split=f"train[:{num_stories}]")
total_stories = len(dataset)
print(f"‚úÖ Loaded {total_stories:,} stories")

# Dump every story into one text file, each followed by an <|endoftext|>
# separator on its own line.
PRETRAIN_DATA = Path('/content/cs288-sp26-a2/part4/fixtures/tinystories_train.txt')
with open(PRETRAIN_DATA, 'w', encoding='utf-8') as f:
    for written, story in enumerate(dataset, start=1):
        f.write(story['text'] + '\n<|endoftext|>\n')
        # Progress line every 1000 stories.
        if written % 1000 == 0:
            print(f"  Processed {written}/{total_stories} stories...")

# Report the size of the file just written, in MiB.
file_size_mb = PRETRAIN_DATA.stat().st_size / 1024 / 1024
print(f"‚úÖ Saved to {PRETRAIN_DATA}")
print(f"   File size: {file_size_mb:.1f} MB")

## 6. Train BPE Tokenizer (Part 1)

In [None]:
rule = "=" * 60
print(rule)
print("STEP 1: Training BPE Tokenizer")
print(rule)

# Special tokens handed to both BPE training and the tokenizer itself.
SPECIAL_TOKENS = ["<|endoftext|>", "<|pad|>"]
target_vocab_size = CONFIG['vocab_size']

print(f"Training on: {PRETRAIN_DATA}")
print(f"Target vocab size: {target_vocab_size}")
print(f"Special tokens: {SPECIAL_TOKENS}")
print()

# Learn the vocabulary and merge list from the pre-training text, then build
# the tokenizer from them.
bpe_args = {
    "input_path": PRETRAIN_DATA,
    "vocab_size": target_vocab_size,
    "special_tokens": SPECIAL_TOKENS,
}
vocab, merges = train_bpe(**bpe_args)
tokenizer = get_tokenizer(vocab, merges, SPECIAL_TOKENS)

print(f"\n‚úÖ Tokenizer trained!")
print(f"   Vocab size: {len(vocab)}")
print(f"   Num merges: {len(merges)}")

# Round-trip a sample sentence through encode/decode as a sanity check.
test_text = "Once upon a time, there was a little girl."
tokens = tokenizer.encode(test_text)
decoded = tokenizer.decode(tokens)

print(f"\nüìù Test encoding:")
print(f"   Input:   '{test_text}'")
print(f"   Tokens:  {len(tokens)} tokens ‚Üí {tokens[:10]}...")
print(f"   Decoded: '{decoded}'")

## 7. Create TransformerLM Model (Part 2)

In [None]:
rule = "=" * 60
print(rule)
print("STEP 2: Creating Transformer Language Model")
print(rule)

# Architecture settings come straight from the active preset; the vocabulary
# size comes from the trained tokenizer.
lm_vocab_size = len(tokenizer.vocab)
arch_settings = {
    name: CONFIG[name]
    for name in ("context_length", "d_model", "num_layers", "num_heads", "d_ff")
}
model = TransformerLM(vocab_size=lm_vocab_size, **arch_settings).to(DEVICE)

# Count parameters and estimate memory at 4 bytes per parameter (fp32).
num_params = 0
for param in model.parameters():
    num_params += param.numel()
model_size_mb = num_params * 4 / 1024 / 1024  # fp32

print(f"\n‚úÖ Model created!")
print(f"\nüìä Model Architecture:")
print(f"   Vocab size: {lm_vocab_size:,}")
print(f"   Context length: {CONFIG['context_length']}")
print(f"   d_model: {CONFIG['d_model']}")
print(f"   Layers: {CONFIG['num_layers']}")
print(f"   Attention heads: {CONFIG['num_heads']}")
print(f"   FFN dimension: {CONFIG['d_ff']}")
print(f"\nüíæ Model Size:")
print(f"   Parameters: {num_params:,}")
print(f"   Memory (fp32): ~{model_size_mb:.1f} MB")

## 8. Pre-train Model (Language Modeling)

In [None]:
rule = "=" * 60
print(rule)
print("STEP 3: Pre-training Language Model")
print(rule)

# Build the language-modeling dataloader. The stride is half the context
# length, i.e. consecutive windows overlap by 50%.
window = CONFIG["context_length"]
train_dataloader = create_pretraining_dataloader(
    file_path=PRETRAIN_DATA,
    tokenizer=tokenizer,
    batch_size=CONFIG["batch_size"],
    max_length=window,
    stride=window // 2,
    shuffle=True,
)
batches_per_epoch = len(train_dataloader)

print(f"\nüìö Training Data:")
print(f"   Sequences: {len(train_dataloader.dataset):,}")
print(f"   Batches/epoch: {batches_per_epoch}")
print(f"   Tokens/epoch: ~{batches_per_epoch * CONFIG['batch_size'] * window:,}")

# Optimisation settings: warmup is capped at 100 steps or a fifth of an
# epoch, whichever is smaller; logging happens about five times per epoch.
train_config = TrainingConfig(
    num_epochs=CONFIG["pretrain_epochs"],
    learning_rate=CONFIG["lr"],
    weight_decay=0.01,
    warmup_steps=min(100, batches_per_epoch // 5),
    max_grad_norm=1.0,
    batch_size=CONFIG["batch_size"],
    device=DEVICE,
    log_interval=max(1, batches_per_epoch // 5),
)
trainer = Trainer(model=model, config=train_config, train_dataloader=train_dataloader)

print(f"\nüöÄ Training Configuration:")
print(f"   Epochs: {train_config.num_epochs}")
print(f"   Learning rate: {train_config.learning_rate}")
print(f"   Warmup steps: {train_config.warmup_steps}")
print(f"   Batch size: {train_config.batch_size}")
print(f"   Device: {DEVICE}")
print()
print(rule)
print("Starting training...")
print(rule)

results = trainer.train()

# Summarise the losses the trainer recorded.
print(f"\n‚úÖ Pre-training complete!")
print(f"   Final training loss: {trainer.train_losses[-1]:.4f}")
formatted_losses = [format(loss_value, '.4f') for loss_value in trainer.train_losses]
print(f"   Training losses: {formatted_losses}")

## 9. Test Text Generation

In [None]:
rule = "=" * 60
print(rule)
print("TEXT GENERATION SAMPLES")
print(rule)

# Story openers used to eyeball the pre-trained model's output.
prompts = [
    "Once upon a time",
    "The little dog",
    "There was a princess",
]

model.eval()

# Settings shared by both decoding strategies.
shared_generation_args = {"max_new_tokens": 50}

for prompt in prompts:
    print(f"\nüìù Prompt: '{prompt}'")
    print("-" * 60)

    # Greedy decoding first ...
    greedy_text = generate_text(
        model, tokenizer, prompt, method="greedy", **shared_generation_args
    )
    print(f"Greedy: {greedy_text}")

    # ... then top-k sampling (k=50) at temperature 0.8.
    topk_text = generate_text(
        model, tokenizer, prompt,
        method="top_k", k=50, temperature=0.8,
        **shared_generation_args,
    )
    print(f"Top-k:  {topk_text}")

print("\n‚úÖ Generation test complete!")

## 10. Fine-tune for Multiple-Choice QA (Part 4A)

In [None]:
import copy

print("=" * 60)
print("STEP 4: Fine-tuning for Multiple-Choice QA")
print("=" * 60)

# Create MCQA model with classification head.
# Wrap a *copy* of the pre-trained LM rather than `model` itself: with
# freeze_backbone=False the wrapped LM is fine-tuned together with the head,
# and `model` is reused afterwards as the base LM for prompting (Part 4B), so
# it has to keep its pre-trained weights. This costs one extra copy of the
# model in memory.
qa_model = TransformerForMultipleChoice(
    transformer_lm=copy.deepcopy(model),
    hidden_size=CONFIG["d_model"],
    num_choices=4,
    pooling="last",
    freeze_backbone=False,  # Allow fine-tuning of the (copied) base model
).to(DEVICE)

print(f"‚úÖ QA Model created with classification head")

# Load QA datasets
QA_TRAIN = Path('/content/cs288-sp26-a2/part4/fixtures/squad_train.json')
QA_DEV = Path('/content/cs288-sp26-a2/part4/fixtures/squad_dev.json')

# Training batches are shuffled; dev batches keep file order.
qa_train_loader = create_qa_dataloader(
    data=QA_TRAIN,
    tokenizer=tokenizer,
    batch_size=CONFIG["batch_size"],
    max_length=CONFIG["context_length"],
    num_choices=4,
    shuffle=True,
)

qa_dev_loader = create_qa_dataloader(
    data=QA_DEV,
    tokenizer=tokenizer,
    batch_size=CONFIG["batch_size"],
    max_length=CONFIG["context_length"],
    num_choices=4,
    shuffle=False,
)

print(f"\nüìö QA Data:")
print(f"   Training examples: {len(qa_train_loader.dataset)}")
print(f"   Dev examples: {len(qa_dev_loader.dataset)}")

# Custom loss function for QA
def qa_loss_fn(batch, model):
    """Return the cross-entropy loss of `model` on one multiple-choice batch.

    Reads "input_ids", "attention_mask" and "labels" from `batch`, moves each
    to DEVICE, runs `model` on the ids and mask, and scores the resulting
    logits against the labels with `cross_entropy`.
    """
    # Same transfer order as the keys are listed: ids, mask, labels.
    # Per the notebook's notes, input_ids is [batch, num_choices, seq_len]
    # and labels is [batch].
    on_device = {
        field: batch[field].to(DEVICE)
        for field in ("input_ids", "attention_mask", "labels")
    }

    # One score per answer choice: [batch, num_choices].
    choice_logits = model(on_device["input_ids"], on_device["attention_mask"])
    return cross_entropy(choice_logits, on_device["labels"])

# Fine-tuning configuration: half the pre-training learning rate, warmup
# capped at 50 steps or a fifth of an epoch, about five log lines per epoch.
qa_batches_per_epoch = len(qa_train_loader)
finetune_config = TrainingConfig(
    num_epochs=CONFIG["finetune_epochs"],
    learning_rate=CONFIG["lr"] / 2,  # Lower LR for fine-tuning
    weight_decay=0.01,
    warmup_steps=min(50, qa_batches_per_epoch // 5),
    max_grad_norm=1.0,
    batch_size=CONFIG["batch_size"],
    device=DEVICE,
    log_interval=max(1, qa_batches_per_epoch // 5),
)

# The trainer receives the dev loader for validation and the QA-specific loss.
qa_trainer = Trainer(
    model=qa_model,
    config=finetune_config,
    train_dataloader=qa_train_loader,
    val_dataloader=qa_dev_loader,
    compute_loss_fn=qa_loss_fn,
)

rule = "=" * 60
print(f"\nüöÄ Fine-tuning Configuration:")
print(f"   Epochs: {finetune_config.num_epochs}")
print(f"   Learning rate: {finetune_config.learning_rate}")
print(f"   Warmup steps: {finetune_config.warmup_steps}")
print()
print(rule)
print("Starting fine-tuning...")
print(rule)

finetune_results = qa_trainer.train()

print(f"\n‚úÖ Fine-tuning complete!")
# Validation losses may be empty; only report when something was recorded.
if qa_trainer.val_losses:
    print(f"   Final validation loss: {qa_trainer.val_losses[-1]:.4f}")

## 11. Generate Fine-tuned Predictions (Part 4A Output)

In [None]:
rule = "=" * 60
print(rule)
print("STEP 5: Evaluating Fine-tuned Model")
print(rule)

# Dev-set accuracy of the fine-tuned classifier.
accuracy = evaluate_qa_model(qa_model, qa_dev_loader, DEVICE)
print(f"\nüìä Fine-tuned Model Performance:")
print(f"   Dev Accuracy: {accuracy:.2%}")

# Test-set loader; shuffle stays off so predictions keep the file's order.
QA_TEST = Path('/content/cs288-sp26-a2/part4/fixtures/squad_test.json')
qa_test_loader = create_qa_dataloader(
    data=QA_TEST,
    tokenizer=tokenizer,
    batch_size=CONFIG["batch_size"],
    max_length=CONFIG["context_length"],
    num_choices=4,
    shuffle=False,
)

print(f"\nüîç Generating predictions on test set ({len(qa_test_loader.dataset)} examples)...")

predictions = []
qa_model.eval()

# Inference only: the predicted choice is the argmax over the last logit axis.
with torch.no_grad():
    for batch in qa_test_loader:
        batch_logits = qa_model(
            batch["input_ids"].to(DEVICE),
            batch["attention_mask"].to(DEVICE),
        )
        predictions += batch_logits.argmax(dim=-1).cpu().tolist()

# Save predictions
import json

os.makedirs('/content/outputs', exist_ok=True)
output_file = '/content/outputs/finetuned_predictions.json'

with open(output_file, 'w') as f:
    f.write(json.dumps(predictions, indent=2))

print(f"\n‚úÖ Predictions saved to: {output_file}")
print(f"   Total predictions: {len(predictions)}")
print(f"   Sample predictions: {predictions[:10]}")

## 12. Prompting-Based Evaluation (Part 4B)

In [None]:
rule = "=" * 60
print(rule)
print("STEP 6: Prompting-Based Evaluation")
print(rule)

# Default prompt template.
prompt_template = PromptTemplate()

# The pipeline is handed `model` (the TransformerLM), not the QA classifier.
pipeline_args = {
    "model": model,  # Use the base TransformerLM
    "tokenizer": tokenizer,
    "prompt_template": prompt_template,
    "max_length": CONFIG["context_length"],
}
prompting_pipeline = PromptingPipeline(**pipeline_args)

print(f"‚úÖ Prompting pipeline created")

# Score the prompting approach on the same dev loader as the fine-tuned model.
print(f"\nüîç Evaluating prompting on dev set...")
prompting_accuracy = evaluate_prompting(prompting_pipeline, qa_dev_loader, DEVICE)

# Side-by-side comparison; the difference is prompting minus fine-tuned.
accuracy_gap = prompting_accuracy - accuracy
print(f"\nüìä Prompting Model Performance:")
print(f"   Dev Accuracy: {prompting_accuracy:.2%}")
print(f"\nüìà Comparison:")
print(f"   Fine-tuned: {accuracy:.2%}")
print(f"   Prompting:  {prompting_accuracy:.2%}")
print(f"   Difference: {accuracy_gap:.2%}")

## 13. Generate Prompting Predictions (Part 4B Output)

In [None]:
print("=" * 60)
print("Generating prompting predictions on test set...")
print("=" * 60)

# Generate predictions using prompting
prompting_predictions = []

# Load test data
with open(QA_TEST, 'r') as f:
    test_data = json.load(f)

print(f"Processing {len(test_data)} test examples...")

model.eval()
# Pure inference: disable autograd so no computation graph is kept per
# example, matching how the fine-tuned predictions are generated.
with torch.no_grad():
    for i, example in enumerate(test_data):
        # Progress line every 50 examples.
        if (i + 1) % 50 == 0:
            print(f"  Processed {i+1}/{len(test_data)} examples...")

        pred = prompting_pipeline.predict(example)
        prompting_predictions.append(pred)

# Save predictions
prompting_output_file = '/content/outputs/prompting_predictions.json'

with open(prompting_output_file, 'w') as f:
    json.dump(prompting_predictions, f, indent=2)

print(f"\n‚úÖ Prompting predictions saved to: {prompting_output_file}")
print(f"   Total predictions: {len(prompting_predictions)}")
print(f"   Sample predictions: {prompting_predictions[:10]}")

## 14. Final Summary & Download Files

In [None]:
rule = "=" * 60
print(f"\n{rule}")
print("üéâ TRAINING COMPLETE!")
print(rule)

# Recap of the run: preset, model size, epochs and both dev accuracies.
print("\nüìä Final Results:")
print(f"   Configuration: {CONFIG_NAME}")
print(f"   Model parameters: {num_params:,}")
print(f"   Pre-training epochs: {CONFIG['pretrain_epochs']}")
print(f"   Fine-tuning epochs: {CONFIG['finetune_epochs']}")
print()
print(f"   Fine-tuned model accuracy: {accuracy:.2%}")
print(f"   Prompting model accuracy:  {prompting_accuracy:.2%}")
# Reported as prompting minus fine-tuned.
print(f"   Improvement: {prompting_accuracy - accuracy:.2%}")
print()
print("üìÅ Output Files:")
print(f"   ‚úÖ {output_file}")
print(f"   ‚úÖ {prompting_output_file}")
print()
print(rule)

# Both deliverables must exist on disk ...
for required_path, missing_message in (
    (output_file, "Fine-tuned predictions file not found!"),
    (prompting_output_file, "Prompting predictions file not found!"),
):
    assert os.path.exists(required_path), missing_message

# ... and each must hold exactly one prediction per test example.
with open(output_file, 'r') as f:
    ft_preds = json.load(f)
with open(prompting_output_file, 'r') as f:
    pr_preds = json.load(f)

expected_count = len(test_data)
assert len(ft_preds) == expected_count, f"Fine-tuned predictions mismatch: {len(ft_preds)} vs {len(test_data)}"
assert len(pr_preds) == expected_count, f"Prompting predictions mismatch: {len(pr_preds)} vs {len(test_data)}"

print("‚úÖ All files verified and ready for submission!")

## 15. Download Submission Files

In [None]:
from google.colab import files

print("Downloading submission files...")
print()

# Trigger a browser download for each deliverable, fine-tuned file first.
for local_path, display_name in (
    (output_file, "finetuned_predictions.json"),
    (prompting_output_file, "prompting_predictions.json"),
):
    files.download(local_path)
    print(f"‚úÖ Downloaded: {display_name}")
    # A blank line separates the two downloads, as in the original layout.
    if local_path is output_file:
        print()

rule = "=" * 60
print()
print(rule)
print("‚úÖ ALL DONE!")
print(rule)
print()
print("Next steps:")
print("1. Check your Downloads folder for the JSON files")
print("2. Submit both files according to assignment instructions")
print()
# These are the dev-set accuracies measured earlier in the notebook.
print("Expected grading:")
print(f"  - Fine-tuned accuracy: {accuracy:.2%}")
print(f"  - Prompting accuracy: {prompting_accuracy:.2%}")
print()
print("Good luck! üöÄ")