# LLMs From Scratch: Visual + Interactive Master Notebook (v3)

This version adds **step-by-step visualizations** and **interactive knobs** (change variables and re-run cells) so beginners can see exactly how each part works.


## Learning Path
1. Math intuition with plots
2. Tokenization and embedding visuals
3. Attention heatmaps
4. Causal masking visualization
5. Tiny GPT build
6. Interactive generation controls
7. Training curve visualization


In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Notebook-wide defaults: one figure size for every plot and a fixed RNG seed
# so the random tensors created in later cells are reproducible.
plt.rcParams.update({'figure.figsize': (8, 4)})
torch.manual_seed(42)
print('Ready. Change variables in cells to explore interactively.')


## Part A — Math Visuals


In [None]:
# Step A1: Dot-product as similarity
# v2 is v1 scaled by 0.5 (same direction); v3 is v1 negated (opposite direction).
v1 = torch.tensor([2.0, 4.0])
v2 = torch.tensor([1.0, 2.0])
v3 = torch.tensor([-2.0, -4.0])

# One (label, dot product) entry per comparison against v1.
pairs = [
    (name, torch.dot(v1, other).item())
    for name, other in (('v1·v2', v2), ('v1·v3', v3))
]
labels = [name for name, _ in pairs]
values = [score for _, score in pairs]

plt.bar(labels, values)
plt.axhline(0, color='black', lw=1)  # zero line separates aligned from opposed
plt.title('Dot Product Comparison')
plt.show()
print('Interpretation: positive=aligned, negative=opposite direction')


In [None]:
# Step A2: Softmax temperature visualization (interactive)
scores = torch.tensor([3.0, 1.0, 0.2, -1.0])
temperatures = [2.0, 1.0, 0.5, 0.2]  # try your own values (each must be non-zero; use T > 0)

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without
# it, a single temperature yields one bare Axes object and the zip() below
# fails, which breaks the "try your own values" knob for one-element lists.
fig, axes = plt.subplots(
    1, len(temperatures), figsize=(14, 3), sharey=True, squeeze=False
)
axes = axes.ravel()  # flatten to 1-D: one Axes per temperature

for ax, T in zip(axes, temperatures):
    # Dividing the scores by T before softmax: small T exaggerates the gaps
    # between scores, large T flattens them.
    probs = torch.softmax(scores / T, dim=0)
    ax.bar(range(len(scores)), probs.numpy())
    ax.set_title(f'T={T}')
    ax.set_xticks(range(len(scores)))
plt.suptitle('Lower temperature => sharper distribution')
plt.show()


## Part B — Tokenization & Embeddings Visuals


In [None]:
# Whitespace tokenization: each distinct word gets an integer id, assigned in
# alphabetical order of the words.
text = "llms can learn patterns from data very quickly"
tokens = text.split()
vocab = {w: i for i, w in enumerate(sorted(set(tokens)))}
ids = [vocab[t] for t in tokens]

print('Tokens:', tokens)
print('IDs   :', ids)

# One dot per sequence position, labelled with the token and its id.
plt.figure(figsize=(10,2))
plt.scatter(range(len(ids)), [1]*len(ids), s=120)
for i, (tok, tid) in enumerate(zip(tokens, ids)):
    # The label is two lines (token over id). The line break must be the
    # escape sequence \n: a literal newline inside a single-quoted f-string
    # is a SyntaxError.
    plt.text(i, 1.02, f'{tok}\n{tid}', ha='center', va='bottom', fontsize=9)
plt.yticks([])
plt.title('Token -> ID mapping along sequence')
plt.show()


In [None]:
# Embedding geometry projection (2D via first 2 dims for teaching)
d_model = 8
vocab_size = max(vocab.values()) + 1  # ids start at 0, so size = largest id + 1
emb = nn.Embedding(vocab_size, d_model)
E = emb.weight.detach()  # plain tensor copy-free view of the weights, no autograd

plt.figure(figsize=(6,6))
plt.scatter(E[:,0].numpy(), E[:,1].numpy())
for token, idx in vocab.items():
    # Label each point with its token at (dim0, dim1) of that token's row.
    plt.text(*E[idx, :2].tolist(), token)
plt.title('Embedding vectors (first 2 dimensions)')
plt.xlabel('dim0')
plt.ylabel('dim1')
plt.show()


## Part C — Attention: Step-by-Step + Heatmap


In [None]:
def scaled_dot_attention(Q, K, V, mask=None):
    """Compute scaled dot-product attention.

    Scores are ``Q @ K^T`` divided by ``sqrt(d_k)``, where ``d_k`` is the size
    of the last dimension of ``Q``. Positions where ``mask == 0`` are set to
    ``-inf`` before the softmax, so they receive zero attention weight.

    Args:
        Q: query tensor; its last dimension is the key/query width.
        K: key tensor; its last two dimensions are transposed for the product.
        V: value tensor, combined using the attention weights.
        mask: optional tensor broadcastable to the score matrix; entries equal
            to 0 mark key positions that must be ignored.

    Returns:
        A tuple ``(out, weights)``: the weighted sum of values and the
        softmax-normalised attention weights over the last dimension.
    """
    key_width = Q.size(-1)
    logits = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(key_width)
    if mask is not None:
        blocked = mask == 0
        logits = logits.masked_fill(blocked, -math.inf)
    weights = logits.softmax(dim=-1)
    return torch.matmul(weights, V), weights

# toy sequence with token labels
token_labels = ['I', 'love', 'learning', 'AI']
B, T, C = 1, len(token_labels), 8  # batch, sequence length, feature width

# Random "embeddings" and random projection matrices (drawn in the order
# X, Wq, Wk, Wv) stand in for learned parameters.
X = torch.randn(B, T, C)
Wq, Wk, Wv = (torch.randn(C, C) for _ in range(3))
Q, K, V = (X @ W for W in (Wq, Wk, Wv))
out, attn = scaled_dot_attention(Q, K, V)

# Row = query token, column = key token; each row of weights sums to 1.
plt.figure(figsize=(5,4))
plt.imshow(attn[0].detach().numpy(), cmap='viridis')
plt.colorbar(label='attention weight')
plt.title('Attention Heatmap (who looks at whom)')
plt.xticks(range(T), token_labels)
plt.yticks(range(T), token_labels)
plt.xlabel('Key token')
plt.ylabel('Query token')
plt.show()


## Part D — Causal Mask Visualization (GPT rule: no future peeking)


In [None]:
T = 8
# Lower-triangular ones: entry (q, k) is 1 when key position k <= query
# position q (visible) and 0 when k lies in the future (blocked).
causal_mask = torch.tril(torch.ones(T, T))

plt.figure(figsize=(5,4))
# 'gray' maps 1 -> white and 0 -> black, which is what the title promises.
# The reversed map 'gray_r' drew the visible entries black and the blocked
# ones white. vmin/vmax pin the colour scale to the 0..1 mask values.
plt.imshow(causal_mask.numpy(), cmap='gray', vmin=0, vmax=1)
plt.title('Causal Mask (white=visible, black=blocked)')
plt.xlabel('Key position'); plt.ylabel('Query position')
plt.show()


## Part E — Build Tiny Transformer + GPT


In [None]:
class TinyBlock(nn.Module):
    """Transformer block: multi-head self-attention followed by a GELU MLP.

    Each sub-layer's output is added to its input and the sum is then
    layer-normalised (normalisation after the residual add).
    """

    def __init__(self, d_model=32, n_heads=4):
        """Build the block for feature width `d_model` and `n_heads` attention heads."""
        super().__init__()
        hidden = 4 * d_model  # feed-forward layer is 4x wider than the model
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ln1 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.GELU(),
            nn.Linear(hidden, d_model),
        )
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x, attn_mask=None):
        """Apply the block to `x` of shape (batch, seq, d_model).

        `attn_mask` is forwarded unchanged to nn.MultiheadAttention.
        Returns a tensor with the same shape as `x`.
        """
        # Self-attention: queries, keys and values all come from x.
        attended = self.attn(x, x, x, attn_mask=attn_mask)[0]
        x = self.ln1(x + attended)
        return self.ln2(x + self.ffn(x))

class TinyGPT(nn.Module):
    """Minimal decoder-only language model.

    Token embeddings and learned position embeddings are summed, passed
    through a stack of TinyBlock layers under a causal mask, layer-normalised,
    and projected to vocabulary logits for every position.
    """

    def __init__(self, vocab_size=200, d_model=64, n_layers=2, n_heads=4, max_len=64):
        """Create embeddings, `n_layers` blocks, the final norm and the output head."""
        super().__init__()
        self.tok = nn.Embedding(vocab_size, d_model)  # token id -> vector
        self.pos = nn.Embedding(max_len, d_model)     # position index -> vector
        layers = []
        for _ in range(n_layers):
            layers.append(TinyBlock(d_model, n_heads))
        self.blocks = nn.ModuleList(layers)
        self.ln = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, idx):
        """Map token ids `idx` of shape (batch, seq_len) to logits (batch, seq_len, vocab_size).

        `seq_len` must not exceed the `max_len` the position table was built with.
        """
        _, seq_len = idx.shape
        device = idx.device
        positions = torch.arange(seq_len, device=device).unsqueeze(0)
        hidden = self.tok(idx) + self.pos(positions)
        # True strictly above the diagonal marks future key positions;
        # nn.MultiheadAttention does not attend where a boolean mask is True.
        future = torch.ones(seq_len, seq_len, device=device).triu(diagonal=1).bool()
        for layer in self.blocks:
            hidden = layer(hidden, attn_mask=future)
        return self.head(self.ln(hidden))

# Build the small model instance: 120-token vocabulary, width 32, two blocks
# of four heads each, and room for sequences of up to 64 positions.
model = TinyGPT(
    vocab_size=120,
    d_model=32,
    n_layers=2,
    n_heads=4,
    max_len=64,
)
print('Model ready')


## Part F — Visual Training Curve


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()
losses = []

# generate() switches the model to eval mode and never switches it back, so
# select training mode explicitly; otherwise re-running this cell after the
# generation cell would train a model that is still in eval mode.
model.train()

steps = 30  # increase to 100+ for smoother trend
for step in range(steps):
    # Random token ids as stand-in data; the id range is read from the model's
    # embedding table so it cannot drift from the model's vocabulary size.
    x = torch.randint(0, model.tok.num_embeddings, (16, 20))
    # Next-token targets: shift left by one. The last column wraps around to
    # the first token, so it is dropped from both logits and targets below.
    y = x.roll(-1, dims=1)
    logits = model(x)
    loss = loss_fn(
        logits[:, :-1, :].reshape(-1, logits.size(-1)),
        y[:, :-1].reshape(-1),
    )

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    losses.append(loss.item())

plt.plot(losses)
plt.title('Training Loss Curve (interactive: change steps/lr)')
plt.xlabel('Step')
plt.ylabel('Loss')
plt.show()
print('Final loss:', round(losses[-1], 4))


## Part G — Interactive Generation Controls


In [None]:
@torch.no_grad()
def generate(model, start_ids, max_new_tokens=20, temperature=1.0, top_k=None):
    """Sample a continuation of `start_ids` one token at a time.

    At every step the model is run on the whole sequence so far, the logits of
    the last position are divided by `temperature`, optionally restricted to
    the `top_k` highest-scoring tokens, and one token is sampled from the
    resulting softmax distribution and appended.

    Args:
        model: callable mapping ids of shape (batch, seq) to logits of shape
            (batch, seq, vocab). It is put into eval mode and left there.
        start_ids: integer tensor of shape (batch, seq) holding the prompt;
            it is cloned, not modified.
        max_new_tokens: number of tokens to append.
        temperature: divisor applied to the logits; must be non-zero.
        top_k: if given, only the `top_k` largest logits stay eligible.
            Values larger than the vocabulary size are clamped to it.

    Returns:
        Tensor of shape (batch, seq + max_new_tokens): the prompt followed by
        the sampled tokens.
    """
    model.eval()
    idx = start_ids.clone()
    for _ in range(max_new_tokens):
        logits = model(idx)[:, -1, :] / temperature
        if top_k is not None:
            # torch.topk raises when k exceeds the last dimension, so clamp a
            # too-large knob value to "keep everything".
            k = min(top_k, logits.size(-1))
            kth_best = torch.topk(logits, k=k).values[:, [-1]]
            # Everything strictly below the k-th best logit gets probability 0.
            logits = logits.masked_fill(logits < kth_best, float('-inf'))
        probs = F.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, 1)
        idx = torch.cat([idx, next_id], dim=1)
    return idx

# Random 6-token prompt for a single sequence (ids in [0, 120)).
start = torch.randint(0, 120, (1, 6))

# Interactive knobs: change and re-run
MAX_NEW, TEMP, TOP_K = 12, 0.8, 20

sample = generate(
    model,
    start,
    max_new_tokens=MAX_NEW,
    temperature=TEMP,
    top_k=TOP_K,
)
print('Generated token IDs:', sample.tolist())


## Part H — Quick Challenges (Visual)
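1. Change `temperatures` and explain how the shape of the probability distribution changes.
2. Change `TOP_K` and compare the diversity of the generated tokens.
3. Increase training `steps` and compare the loss curves (with random tokens there is nothing to learn, so expect the loss to plateau near ln(120) ≈ 4.79).
4. Replace the random training data with your own tokenized text.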


## Responsible AI Reminder
- Generated text can be fluent but wrong.
- Check for bias and unsafe outputs.
- Human review is important in real products.
