# Problem: benchmarking_script (10 points)

End-to-end benchmarking of forward and backward passes for `BasicsTransformerLM`.

In [None]:
import math
import os
import timeit

import torch
import pandas as pd
from a1_basics.model import BasicsTransformerLM
from student.basicprofiling import benchmark, MODEL_SIZES

In [None]:
# Pick the compute device once: GPU when PyTorch reports one, CPU otherwise.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

print(f"Using device: {DEVICE}")
if DEVICE == "cuda":
    # Also report which GPU was selected.
    print(torch.cuda.get_device_name())

## Part (b): Time forward and backward passes for all model sizes

Use 5 warm-up steps followed by 10 measurement steps. Report the average and standard deviation of the timings.

Set `USE_SLURM_RESULTS = True` below to load pre-computed results from sbatch jobs, or `False` to run live in this notebook.

In [None]:
USE_SLURM_RESULTS = True  # Toggle: True = load from sbatch CSVs, False = run live

if USE_SLURM_RESULTS:
    # --- Load pre-computed results from sbatch ---
    # The path is relative to the kernel's working directory; a notebook does not
    # define __file__, so there is no script directory to anchor to.
    csv_path = os.path.join("results", "bench_part_b.csv")
    # If running on HPC, adjust path:
    # csv_path = "/scratch/pg2973/nyu-llm-reasoners-a2/results/bench_part_b.csv"
    df_b = pd.read_csv(csv_path)
    # Normalize mode names to match notebook convention
    df_b["mode"] = df_b["mode"].replace({
        "forward-only": "forward",
        "forward-backward": "forward+backward",
    })
    print(f"Loaded {len(df_b)} rows from {csv_path}")
    display(df_b)

else:
    # --- Run live in notebook ---
    # Sweep every (model size, context length, mode) combination and collect one
    # summary row per combination.
    context_lengths = [128, 256, 512, 1024]
    results = []

    for size_name, params in MODEL_SIZES.items():
        for ctx_len in context_lengths:
            for mode in ["forward", "forward+backward"]:
                backward = mode == "forward+backward"
                print(f"--- {size_name} | ctx={ctx_len} | {mode} ---")
                try:
                    times = benchmark(
                        d_model=params["d_model"],
                        d_ff=params["d_ff"],
                        num_layers=params["num_layers"],
                        num_heads=params["num_heads"],
                        context_length=ctx_len,
                        warmup_steps=5,
                        num_steps=10,
                        backward=backward,
                        device=DEVICE,
                    )
                    # Presumably benchmark() returns per-step times in seconds
                    # (they are scaled by 1000 into the *_ms fields) - TODO confirm.
                    # Mean and population std are computed in the native unit and
                    # converted to milliseconds only once, when stored.
                    mean_s = sum(times) / len(times)
                    std_s = math.sqrt(sum((t - mean_s) ** 2 for t in times) / len(times))
                    results.append({
                        "model": size_name,
                        "ctx_len": ctx_len,
                        "mode": mode,
                        "avg_ms": round(mean_s * 1000, 2),
                        "std_ms": round(std_s * 1000, 2),
                    })
                except RuntimeError as e:
                    # Any RuntimeError (not only out-of-memory) is recorded with the
                    # "OOM" marker so the sweep can continue with the next config.
                    print(f"  OOM or error: {e}")
                    results.append({
                        "model": size_name,
                        "ctx_len": ctx_len,
                        "mode": mode,
                        "avg_ms": "OOM",
                        "std_ms": "OOM",
                    })
                if DEVICE == "cuda":
                    # Release cached GPU memory before the next configuration.
                    torch.cuda.empty_cache()
                print()

    df_b = pd.DataFrame(results)

In [None]:
# df_b is produced by the Part (b) cell above, either loaded from CSV or built
# from a live run. It must not be rebuilt from `results` here: that list only
# exists after a live run, so doing so raises NameError when results were loaded.

# Pivot into a readable table: rows = (model, ctx_len), columns = mode
pivot_avg = df_b.pivot_table(index=["model", "ctx_len"], columns="mode", values="avg_ms", aggfunc="first")
pivot_std = df_b.pivot_table(index=["model", "ctx_len"], columns="mode", values="std_ms", aggfunc="first")


def fmt_cell(avg, std):
    """Format one table cell as "avg ± std", or "OOM" for a failed configuration.

    Args:
        avg: Mean time in milliseconds, or the string "OOM".
        std: Standard deviation in milliseconds, or the string "OOM".

    Returns:
        "OOM" if either value is the "OOM" marker, otherwise the two values
        formatted with two decimals.

    Values are coerced with float() because a CSV column that mixes numbers
    with "OOM" is read back as text, which cannot be formatted with ".2f".
    """
    if avg == "OOM" or std == "OOM":
        return "OOM"
    return f"{float(avg):.2f} ± {float(std):.2f}"


# Combine avg ± std into a single string per cell
combined = pd.DataFrame(index=pivot_avg.index)
for col in pivot_avg.columns:
    combined[col] = [
        fmt_cell(a, s) for a, s in zip(pivot_avg[col], pivot_std[col])
    ]

combined.columns.name = None
combined = combined.rename(columns={"forward": "Forward (ms)", "forward+backward": "Fwd+Bwd (ms)"})

print("=== Markdown ===")
print(combined.to_markdown())
print()
print("=== LaTeX ===")
print(combined.to_latex())

combined

### Part (b) response

_Fill in after running:_ A 1-2 sentence response with your timings.

## Part (c): Effect of warm-up steps

Repeat the analysis with 0, 1, 2, and 5 warm-up steps to see the effect.

Uses the same `USE_SLURM_RESULTS` toggle from above.

In [None]:
if USE_SLURM_RESULTS:
    # --- Load pre-computed results from sbatch ---
    # The path is relative to the kernel's working directory; a notebook does not
    # define __file__, so there is no script directory to anchor to.
    csv_path = os.path.join("results", "bench_part_c.csv")
    # If running on HPC, adjust path:
    # csv_path = "/scratch/pg2973/nyu-llm-reasoners-a2/results/bench_part_c.csv"
    df_c = pd.read_csv(csv_path)
    # Normalize mode names to match notebook convention
    df_c["mode"] = df_c["mode"].replace({
        "forward-only": "forward",
        "forward-backward": "forward+backward",
    })
    print(f"Loaded {len(df_c)} rows from {csv_path}")
    display(df_c)

else:
    # --- Run live in notebook ---
    # Only the "small" model at context length 128 is timed here; the variable
    # under study is the number of warm-up steps.
    warmup_values = [0, 1, 2, 5]
    warmup_results = []

    params = MODEL_SIZES["small"]

    for w in warmup_values:
        for mode in ["forward", "forward+backward"]:
            backward = mode == "forward+backward"
            print(f"--- warmup={w} | {mode} ---")
            times = benchmark(
                d_model=params["d_model"],
                d_ff=params["d_ff"],
                num_layers=params["num_layers"],
                num_heads=params["num_heads"],
                context_length=128,
                warmup_steps=w,
                num_steps=10,
                backward=backward,
                device=DEVICE,
            )
            # Presumably benchmark() returns per-step times in seconds (they are
            # scaled by 1000 into the *_ms fields) - TODO confirm.
            # Mean and population std are computed in the native unit and
            # converted to milliseconds only once, when stored.
            mean_s = sum(times) / len(times)
            std_s = math.sqrt(sum((t - mean_s) ** 2 for t in times) / len(times))
            warmup_results.append({
                "warmup": w,
                "mode": mode,
                "avg_ms": round(mean_s * 1000, 2),
                "std_ms": round(std_s * 1000, 2),
                # First measured step, kept separately from the average.
                "first_step_ms": round(times[0] * 1000, 2),
            })
            if DEVICE == "cuda":
                # Release cached GPU memory before the next configuration.
                torch.cuda.empty_cache()
            print()

    df_c = pd.DataFrame(warmup_results)

In [None]:
# Accept either spelling of the warm-up column so the pivots below always find "warmup".
if "warmup_steps" in df_c.columns:
    df_c = df_c.rename(columns={"warmup_steps": "warmup"})

# Shared pivot layout: one row per warm-up count, one column per mode.
_pivot_kwargs = dict(index="warmup", columns="mode", aggfunc="first")
pivot_avg = df_c.pivot_table(values="avg_ms", **_pivot_kwargs)
pivot_std = df_c.pivot_table(values="std_ms", **_pivot_kwargs)

# One "avg ± std" text column per mode.
table_c = pd.DataFrame(
    {
        f"{mode_name} avg±std (ms)": [
            f"{mean_ms:.2f} ± {spread_ms:.2f}"
            for mean_ms, spread_ms in zip(pivot_avg[mode_name], pivot_std[mode_name])
        ]
        for mode_name in pivot_avg.columns
    },
    index=pivot_avg.index,
)

# Add first-step column if available (live runs only)
if "first_step_ms" in df_c.columns:
    pivot_first = df_c.pivot_table(values="first_step_ms", **_pivot_kwargs)
    for mode_name in pivot_first.columns:
        table_c[f"{mode_name} 1st step (ms)"] = list(
            map("{:.2f}".format, pivot_first[mode_name])
        )

table_c.columns.name = None
table_c.index.name = "warmup"

# Emit both export formats, then leave the frame as the cell's displayed value.
for banner, render in (("=== Markdown ===", table_c.to_markdown), ("=== LaTeX ===", table_c.to_latex)):
    if banner == "=== LaTeX ===":
        print()
    print(banner)
    print(render())

table_c

### Part (c) response

_Fill in after running:_ A 2-3 sentence response.