# Character-Level Language Model (Bigram)
Fast first win on *Alice in Wonderland*.


In [None]:
# Setup
!pip -q install torch --upgrade
import math, random, textwrap, requests
import torch, torch.nn as nn, torch.nn.functional as F
from tqdm.auto import tqdm
# Fix the RNG seed so runs are repeatable, then pick the GPU when one is
# available and fall back to the CPU otherwise.
torch.manual_seed(1337)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Data
# Fetch the corpus. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() aborts on an HTTP error instead
# of silently training on the server's error page.
response = requests.get("https://www.gutenberg.org/files/11/11-0.txt", timeout=30)
response.raise_for_status()
text = response.text[:300_000]  # keep only the first 300k characters

# Character vocabulary: every distinct character gets an integer id,
# assigned in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Map a string to a 1-D long tensor of character ids.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return torch.tensor([stoi[c] for c in s], dtype=torch.long)


def decode(t):
    """Map an iterable of character ids (ints or 0-d tensors) back to a string."""
    return "".join(itos[int(i)] for i in t)


data = encode(text)
n = int(0.9 * len(data))  # first 90% for training, the rest for validation
train_data, val_data = data[:n], data[n:]

# batch_size: sequences per batch; block_size: characters per input sequence.
batch_size, block_size = 64, 1
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: ``"train"`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors with shape ``(batch_size, block_size)``
        moved to ``device``. ``y`` is ``x`` shifted one position to the
        right, i.e. the next-character target for every input position.
    """
    src = train_data if split == "train" else val_data
    # A window starting at i reads src[i : i + block_size + 1], so the last
    # valid start is len(src) - block_size - 1. torch.randint's upper bound is
    # exclusive, hence len(src) - block_size (the previous "- 1" skipped the
    # final window and rejected a split of exactly block_size + 1 tokens).
    starts = torch.randint(len(src) - block_size, (batch_size,)).tolist()
    x = torch.stack([src[i:i + block_size] for i in starts])
    y = torch.stack([src[i + 1:i + block_size + 1] for i in starts])
    return x.to(device), y.to(device)

class BigramLM(nn.Module):
    """Bigram language model: the current token alone predicts the next one.

    The only parameter is an embedding table of shape
    ``(vocab_size, vocab_size)``; row ``i`` holds the unnormalised
    next-token scores (logits) that follow token ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: Integer tensor of token ids, shape ``(B, T)``.
            targets: Optional integer tensor of next-token ids with the same
                shape as ``idx``.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape
            ``(B, T, vocab_size)`` and ``loss`` is the mean cross-entropy
            over all positions, or ``None`` when ``targets`` is omitted.
        """
        logits = self.token_emb(idx)
        loss = None
        if targets is not None:
            # reshape (not view) so non-contiguous inputs are accepted too.
            loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)), targets.reshape(-1)
            )
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=200, temperature=1.0):
        """Autoregressively extend ``idx`` by ``max_new_tokens`` tokens.

        Args:
            idx: Integer tensor of token ids, shape ``(B, T)`` with ``T >= 1``.
            max_new_tokens: Number of tokens to append to every sequence.
            temperature: Softmax temperature. Values below 1 sharpen the
                distribution, values above 1 flatten it, and ``0`` selects
                the most likely token deterministically (greedy decoding).

        Returns:
            Tensor of shape ``(B, T + max_new_tokens)``: the input followed
            by the generated tokens.

        Raises:
            ValueError: If ``temperature`` is negative.
        """
        if temperature < 0:
            raise ValueError(f"temperature must be >= 0, got {temperature}")
        for _ in range(max_new_tokens):
            # A bigram model conditions on the last token only, so there is
            # no need to re-embed the whole (growing) context every step.
            logits, _ = self(idx[:, -1:])
            logits = logits[:, -1, :]
            if temperature == 0:
                # Dividing by zero would yield NaN probabilities and make
                # torch.multinomial fail; take the arg-max instead.
                nxt = logits.argmax(dim=-1, keepdim=True)
            else:
                probs = F.softmax(logits / temperature, dim=-1)
                nxt = torch.multinomial(probs, 1)
            idx = torch.cat([idx, nxt], dim=1)
        return idx

# Training schedule: total optimisation steps and how often to report loss.
max_steps = 2000
eval_interval = 200

# Build the model on the selected device and attach an AdamW optimiser to
# its parameters.
model = BigramLM(vocab_size).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-2)

@torch.no_grad()
def estimate_loss(eval_iters=20):
    """Estimate the mean loss on the training and validation splits.

    Args:
        eval_iters: Number of random batches averaged per split.

    Returns:
        Dict mapping ``"train"`` and ``"val"`` to the mean loss (a float)
        over ``eval_iters`` freshly sampled batches of that split.

    Raises:
        ValueError: If ``eval_iters`` is smaller than 1.
    """
    if eval_iters < 1:
        raise ValueError(f"eval_iters must be >= 1, got {eval_iters}")
    # Remember the current mode so evaluation does not force the model into
    # training mode for a caller that had it in eval mode.
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ("train", "val"):
            losses = [model(*get_batch(split))[1].item() for _ in range(eval_iters)]
            out[split] = sum(losses) / len(losses)
    finally:
        # Restore the mode even if a batch fails part-way through.
        model.train(was_training)
    return out

# Training loop: one optimiser step per iteration on a fresh random batch,
# with a loss report every `eval_interval` steps.
for step in range(max_steps):
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step:4d}: train {losses['train']:.3f} | val {losses['val']:.3f}")
    xb, yb = get_batch("train")
    _, loss = model(xb, yb)
    # set_to_none=True drops the old gradients instead of zero-filling them.
    opt.zero_grad(set_to_none=True)
    loss.backward()
    opt.step()

# The periodic report above fires before the update of its step, so without
# this the model produced by the last steps would never be evaluated.
losses = estimate_loss()
print(f"step {max_steps:4d}: train {losses['train']:.3f} | val {losses['val']:.3f}")

print("Training done.")
# Draw one 300-token sample per temperature, each seeded with a single
# randomly chosen character id as the starting context.
for temp in (0.7, 1.0, 1.3):
    start_id = torch.randint(vocab_size, (1,)).item()
    ctx = torch.tensor([[start_id]], device=device)
    generated = model.generate(ctx, 300, temperature=temp)
    out = generated[0].tolist()
    print(f"\n=== SAMPLE (temp={temp}) ===\n")
    print(textwrap.fill(decode(out), width=90))
