# Interview Transformer (Step-by-Step)

Goal: hand-write a minimal GPT-style Transformer in PyTorch, step by step.

## Step 0: Imports + seed

In [None]:
import math
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed both Python's `random` module and PyTorch with the same fixed value.
seed = 1337
random.seed(seed)
torch.manual_seed(seed)

# Use CUDA when PyTorch reports it as available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device:", device)

## Step 1: Tiny dataset + vocab
We use a short text for fast iteration. You can replace it later.

In [None]:
# Tiny training corpus; leading/trailing whitespace is stripped so the text
# starts at "To be" and ends without a trailing newline.
text = '''
To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
'''.strip()

# Character-level vocabulary: every distinct character in `text`, sorted so
# that the id assignment is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)
print("vocab_size:", vocab_size)

# stoi: character -> integer id; itos: integer id -> character.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError if `s` contains a character that is not in the vocab.
    """
    return [stoi[c] for c in s]


def decode(ids):
    """Return the string spelled by the iterable of ids `ids`.

    Each id is passed through int() before lookup, so 0-d tensor elements
    (e.g. from iterating a 1-D LongTensor) work as well as plain ints.
    Raises KeyError for an id outside the vocab.
    """
    # Tensors hash by identity, so itos[tensor] would always miss; int() fixes that.
    return "".join(itos[int(i)] for i in ids)


# The whole corpus as a 1-D tensor of character ids.
data = torch.tensor(encode(text), dtype=torch.long)
print("data length:", len(data))

## Step 2: Batch sampling (causal language modeling)

In [None]:
# Hold out the tail of the token stream for validation: the first 90%
# (rounded down) is used for training, the remainder for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# block_size: maximum tokens per sampled sequence (context window).
# batch_size: number of sequences sampled per batch.
block_size, batch_size = 64, 16

def get_batch(split):
    """Sample a random batch of (input, target) windows for next-token prediction.

    `split` selects the source: "train" reads from `train_data`; any other
    value reads from `val_data`. The window length is `block_size`, reduced
    to `len(source) - 1` when the source is too short to hold a full window
    plus its shifted target.

    Returns a pair `(x, y)` of tensors, each with `batch_size` rows of one
    window length, moved to `device`. `y` is `x` shifted one position to the
    right in the source, so `y[b, j]` is the token that follows `x[b, j]`.

    Raises ValueError if the selected source has fewer than 2 tokens.
    """
    if split == "train":
        source = train_data
    else:
        source = val_data

    n_tokens = len(source)
    # Targets need one token beyond the input window, hence the "- 1" cap.
    window = min(block_size, n_tokens - 1)
    if window < 1:
        raise ValueError("Text too short for batching; need at least 2 tokens.")

    # Start offsets lie in [0, n_tokens - window), so start + window + 1 <= n_tokens.
    starts = torch.randint(n_tokens - window, (batch_size,))
    inputs = torch.stack([source[s:s + window] for s in starts])
    targets = torch.stack([source[s + 1:s + window + 1] for s in starts])
    return inputs.to(device), targets.to(device)

## Step 3: Scaled Dot-Product Attention (single head)
We'll write this from scratch next.

In [None]:
# TODO: implement a single-head causal self-attention

## Step 4: Multi-Head Attention

In [None]:
# TODO: wrap multiple heads + output projection

## Step 5: FeedForward + Transformer Block (Pre-LN)

In [None]:
# TODO: implement FFN and residual block

## Step 6: GPT model (token + position embedding)

In [None]:
# TODO: implement GPT model with generate()

## Step 7: Train + Generate

In [None]:
# TODO: training loop and text generation