# **Building an LLM from scratch**

In [59]:
import math
import time
import tiktoken
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
# The getattr guard covers torch builds that have no `torch.backends.mps` attribute.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

device


'mps'

**Dataset, DataLoader**

In [60]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once. Each window of ``max_length`` token ids is an
    input, and the same window shifted one token to the right is its target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and slice it into training windows.

        Args:
            txt: Raw text to tokenize.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``
                that returns a sequence of integer token ids.
            max_length: Number of token ids in every input and target window.
            stride: Offset, in tokens, between the starts of consecutive windows.
        """
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the one-token-shifted target window still fits.
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[s:s + max_length], dtype=torch.long)
            for s in starts
        ]
        self.target_ids = [
            torch.tensor(ids[s + 1:s + max_length + 1], dtype=torch.long)
            for s in starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]



def create_dataloader_v1(txt, batch_size=8, max_length=256, stride=128):
    """Build a shuffled DataLoader over ``txt`` using the GPT-2 tiktoken encoding.

    Args:
        txt: Raw training text.
        batch_size: Number of windows per batch.
        max_length: Number of token ids per window.
        stride: Offset, in tokens, between consecutive windows.

    Returns:
        A ``(loader, tokenizer)`` tuple: the DataLoader over the windowed
        dataset and the tokenizer that was used to encode the text.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(
        txt,
        tokenizer=gpt2_tokenizer,
        max_length=max_length,
        stride=stride,
    )
    # drop_last=True discards a trailing batch that has fewer than batch_size items.
    return (
        DataLoader(windows, batch_size=batch_size, shuffle=True, drop_last=True),
        gpt2_tokenizer,
    )


# **Model architecture: multi-head attention, LayerNorm, feed-forward network**

**MultiHeadAttention block**

In [61]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head scaled dot-product self-attention.

    Queries, keys and values are linear projections of the same input. Each
    position may attend only to itself and to earlier positions.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        """Create the projections and the causal mask.

        Args:
            d_in: Size of the last dimension of the input.
            d_out: Size of the last dimension of the output; must be divisible
                by ``num_heads``.
            context_length: Side length of the square causal mask, i.e. the
                longest sequence the mask covers.
            dropout: Dropout probability applied to the attention weights.
            num_heads: Number of attention heads.
            qkv_bias: Whether the query/key/value projections have a bias term.
        """
        super().__init__()
        assert d_out % num_heads == 0
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_q = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions to hide.
        # Registered as a buffer so it moves with the module across devices.
        mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", mask)

    def _split_heads(self, t, B, T):
        """Reshape ``(B, T, d_out)`` to ``(B, num_heads, T, head_dim)``."""
        return t.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape ``(B, T, d_in)``.

        Returns:
            Tensor of shape ``(B, T, d_out)``.
        """
        B, T, _ = x.shape

        q = self._split_heads(self.W_q(x), B, T)
        k = self._split_heads(self.W_k(x), B, T)
        v = self._split_heads(self.W_v(x), B, T)

        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Mask with -inf rather than a large finite constant: a finite fill
        # value can exceed genuine scores that are even more negative (letting
        # future positions win the softmax) and overflows in half precision.
        # The diagonal is never masked, so no row ends up entirely -inf.
        future = self.mask[:T, :T].bool()
        att = att.masked_fill(future, float("-inf"))

        att = torch.softmax(att, dim=-1)
        att = self.dropout(att)

        out = att @ v
        # Merge the heads back into a single feature dimension.
        out = out.transpose(1, 2).contiguous().view(B, T, self.d_out)
        return self.out_proj(out)


class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift."""

    def __init__(self, emb_dim):
        """Create per-feature parameters for an input whose last dimension is ``emb_dim``."""
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        # Added to the variance so the division below never hits zero.
        self.eps = 1e-5

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then apply scale and shift."""
        mu = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance: divides by N, not N - 1.
        sigma2 = x.var(dim=-1, keepdim=True, unbiased=False)
        x_hat = (x - mu) / (sigma2 + self.eps).sqrt()
        return x_hat * self.scale + self.shift


class GELU(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``.
    """

    # sqrt(2/pi) computed once as a Python float. This avoids building a new
    # tensor on the device on every forward pass, and keeps full precision
    # when the input is float64.
    _SQRT_2_OVER_PI = math.sqrt(2.0 / math.pi)

    def forward(self, x):
        """Apply the activation element-wise; the output has the shape and dtype of ``x``."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * x.pow(3))
        return 0.5 * x * (1 + torch.tanh(inner))


class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4x the embedding size, apply GELU, project back."""

    def __init__(self, cfg):
        """Build the MLP from ``cfg["emb_dim"]``."""
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim

        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        project = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> project stack."""
        return self.layers(x)


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual."""

    def __init__(self, cfg):
        """Build the block from a config dict.

        Reads ``emb_dim``, ``context_length``, ``drop_rate``, ``n_heads`` and
        ``qkv_bias`` from ``cfg``.
        """
        super().__init__()
        emb_dim = cfg["emb_dim"]
        drop_rate = cfg["drop_rate"]

        attention_kwargs = dict(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            dropout=drop_rate,
            num_heads=cfg["n_heads"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.att = MultiHeadAttention(**attention_kwargs)
        self.ff = FeedForward(cfg)
        self.ln1 = LayerNorm(emb_dim)
        self.ln2 = LayerNorm(emb_dim)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x):
        """Apply both sub-layers; each normalizes its input and adds a residual."""
        attended = self.att(self.ln1(x))
        x = x + self.dropout(attended)

        transformed = self.ff(self.ln2(x))
        return x + self.dropout(transformed)


class GPTModel(nn.Module):
    """Decoder-only transformer mapping token ids to next-token logits."""

    def __init__(self, cfg):
        """Build the model from a config dict.

        Reads ``vocab_size``, ``emb_dim``, ``context_length``, ``drop_rate``
        and ``n_layers`` here; the transformer blocks read further keys.
        """
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        # Learned absolute positions: one embedding row per position index.
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop = nn.Dropout(cfg["drop_rate"])

        stacked = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.blocks = nn.Sequential(*stacked)

        self.ln_f = LayerNorm(emb_dim)
        self.head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, idx):
        """Compute logits for every position.

        Args:
            idx: Integer tensor of token ids with shape ``(B, T)``.

        Returns:
            Tensor of shape ``(B, T, vocab_size)``.
        """
        _, seq_len = idx.shape
        positions = torch.arange(seq_len, device=idx.device)

        # unsqueeze(0) adds a leading batch axis so positions broadcast over B.
        hidden = self.tok_emb(idx) + self.pos_emb(positions).unsqueeze(0)
        hidden = self.drop(hidden)
        hidden = self.blocks(hidden)
        hidden = self.ln_f(hidden)
        return self.head(hidden)


**Text generation helper**

In [62]:
@torch.no_grad()
def generate_text(model, idx, max_new_tokens, context_size, temperature=1.0, top_k=50):
    """Autoregressively extend ``idx`` by ``max_new_tokens`` sampled tokens.

    Note: this puts ``model`` into eval mode and does not switch it back.

    Args:
        model: Callable module mapping ``(B, T)`` token ids to ``(B, T, V)`` logits.
        idx: Integer tensor of shape ``(B, T0)`` holding the prompt token ids.
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model per
            step; it should not exceed what the model supports.
        temperature: Divisor applied to the logits before sampling. ``0``
            selects greedy (argmax) decoding instead of sampling.
        top_k: If not ``None``, sample only among the ``top_k`` highest logits.
            Values larger than the vocabulary size are clamped to it.

    Returns:
        Tensor of shape ``(B, T0 + max_new_tokens)``; the input ``idx`` is not
        modified in place.
    """
    model.eval()
    greedy = temperature == 0

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        logits = model(idx_cond)[:, -1, :]  # (B, V): logits at the last position

        if not greedy:
            # Dividing by zero would yield inf/nan logits, hence the greedy path.
            logits = logits / temperature

        if top_k is not None:
            # torch.topk raises when k exceeds the dimension size, so clamp it.
            k = min(top_k, logits.size(-1))
            kth_best = torch.topk(logits, k).values[:, -1:]
            # -inf gives filtered tokens exactly zero probability.
            logits = logits.masked_fill(logits < kth_best, float("-inf"))

        if greedy:
            next_id = torch.argmax(logits, dim=-1, keepdim=True)  # (B, 1)
        else:
            probs = torch.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)  # (B, 1)

        idx = torch.cat([idx, next_id], dim=1)

    return idx


**Training Loop**

In [72]:
def train_gpt(txt_path="harry.txt", epochs=3, batch_size=8, lr=3e-4,
              max_length=128, stride=128, max_chars=5000):
    """Train a small GPT model on the beginning of a text file.

    Args:
        txt_path: Path to a UTF-8 text file.
        epochs: Number of passes over the data.
        batch_size: Number of windows per batch.
        lr: AdamW learning rate.
        max_length: Tokens per training window; also used as the model's
            ``context_length``.
        stride: Offset, in tokens, between consecutive training windows.
        max_chars: Only the first ``max_chars`` characters of the file are
            used. ``None`` uses the whole file.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If the text is too short to produce one full batch.
    """
    with open(txt_path, "r", encoding="utf-8") as f:
        txt = f.read()
    txt = txt[:max_chars]  # slicing with None keeps the full text
    loader, tokenizer = create_dataloader_v1(txt, batch_size, max_length, stride)

    # The loader drops incomplete batches, so a short text can leave it empty.
    # Fail here with a clear message rather than dividing by zero when
    # averaging the epoch loss.
    num_batches = len(loader)
    if num_batches == 0:
        raise ValueError(
            f"Text from {txt_path!r} is too short to form one batch with "
            f"batch_size={batch_size}, max_length={max_length}, stride={stride}."
        )

    cfg = {
        "vocab_size": tokenizer.n_vocab,
        "context_length": max_length,
        "emb_dim": 192,
        "n_heads": 3,
        "n_layers": 3,
        "drop_rate": 0.1,
        "qkv_bias": False,
    }

    model = GPTModel(cfg).to(device)
    optim = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    for ep in range(1, epochs+1):
        # Re-enable dropout each epoch in case something switched to eval mode.
        model.train()
        total = 0
        for inp, tgt in loader:
            inp, tgt = inp.to(device), tgt.to(device)

            logits = model(inp)
            B, T, V = logits.shape

            # CrossEntropyLoss expects (N, V) logits and (N,) class indices,
            # so fold the batch and time dimensions together.
            loss = loss_fn(
                logits.view(B*T, V),
                tgt.view(B*T)
            )

            optim.zero_grad()
            loss.backward()
            optim.step()

            total += loss.item()

        print(f"Epoch {ep} Loss: {total/num_batches:.4f}")

    return model, tokenizer


**Train + Generate**

In [74]:
model, tokenizer = train_gpt("harry.txt", epochs=200)

start = "Mr. Dursley was the director"
encoded = tokenizer.encode(start)
idx = torch.tensor(encoded, device=device).unsqueeze(0)

# Read the context window from the model rather than hard-coding it.
# train_gpt builds the model with context_length == max_length (128 by default),
# so a larger window would index past the positional-embedding table once the
# sequence grows beyond that length.
context_size = model.pos_emb.num_embeddings

out = generate_text(model, idx, max_new_tokens=50, context_size=context_size,
                    temperature=0.8, top_k=40)
print(tokenizer.decode(out[0].tolist()))



Epoch 1 Loss: 11.0238
Epoch 2 Loss: 10.9039
Epoch 3 Loss: 10.8094
Epoch 4 Loss: 10.6904
Epoch 5 Loss: 10.6043
Epoch 6 Loss: 10.4941
Epoch 7 Loss: 10.3692
Epoch 8 Loss: 10.2816
Epoch 9 Loss: 10.1309
Epoch 10 Loss: 10.0050
Epoch 11 Loss: 9.8988
Epoch 12 Loss: 9.7404
Epoch 13 Loss: 9.6403
Epoch 14 Loss: 9.5157
Epoch 15 Loss: 9.3873
Epoch 16 Loss: 9.2458
Epoch 17 Loss: 9.1315
Epoch 18 Loss: 9.0073
Epoch 19 Loss: 8.9195
Epoch 20 Loss: 8.7924
Epoch 21 Loss: 8.6754
Epoch 22 Loss: 8.5711
Epoch 23 Loss: 8.4764
Epoch 24 Loss: 8.3673
Epoch 25 Loss: 8.2376
Epoch 26 Loss: 8.1134
Epoch 27 Loss: 8.0075
Epoch 28 Loss: 7.8918
Epoch 29 Loss: 7.7698
Epoch 30 Loss: 7.6662
Epoch 31 Loss: 7.5308
Epoch 32 Loss: 7.4112
Epoch 33 Loss: 7.2820
Epoch 34 Loss: 7.2139
Epoch 35 Loss: 7.1147
Epoch 36 Loss: 6.9527
Epoch 37 Loss: 6.8657
Epoch 38 Loss: 6.7594
Epoch 39 Loss: 6.6570
Epoch 40 Loss: 6.5424
Epoch 41 Loss: 6.4532
Epoch 42 Loss: 6.3649
Epoch 43 Loss: 6.2040
Epoch 44 Loss: 6.1025
Epoch 45 Loss: 6.0329
Epoch 46 