# Tiny LLM

[![Open In Colab - ml_basics](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/proloy79/attoLLM/blob/main/notebooks/llm_experiments.ipynb)

In [1]:
from __future__ import annotations
from typing import Iterable, Tuple
from pathlib import Path
import math
from time import time
import torch
import torch.nn as nn
from torch import Tensor
from torch.utils.data import DataLoader
import torch.nn.functional as F
from dataclasses import asdict
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os 

plt.style.use('seaborn-v0_8')
torch.manual_seed(7)

def in_colab():
    return 'google.colab' in sys.modules

if in_colab():
    # --- Colab‑only setup ---
    print("Running in Colab — setting up environment")
    if not os.path.exists('/content/attoLLM'):
      !git clone https://github.com/proloy79/attoLLM.git
    !pip install -e /content/attoLLM
    sys.path.append('/content/attoLLM/src')

else:
    print("Running locally — skipping Colab setup")

from attollm.attention import scaled_dot_product_attention
from attollm.data_loader import *
from attollm.gpt import *
from attollm.simple_tokenizer import SimpleTokenizer,Vocab
from attollm.sample import *

Running locally — skipping Colab setup


## Validate single head dot product attention

In [2]:
torch.manual_seed(0)

Q = torch.tensor([[1.,0.],
                  [0.,1.],
                  [1.,1.]])

K = torch.tensor([[1.,0.],
                  [1.,1.],
                  [0.,1.]])

V = torch.tensor([[1.,0.],
                  [0.,2.],
                  [3.,1.]])

print('Q shape: ', Q.shape)
q = Q.unsqueeze(0) # turn to batch of 1:  [1,3,2]
k = K.unsqueeze(0)
v = V.unsqueeze(0)
print('q shape: ', q.shape)

expected_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(2)
expected_weights = torch.softmax(expected_scores, dim=-1)
expected_output = torch.matmul(expected_weights, v)

def test_scaled_dot_product_attention():
    scores, weights, output = scaled_dot_product_attention(q, k, v)
    
    # Check shapes
    assert scores.shape == (1, 3, 3)
    assert weights.shape == (1, 3, 3)
    assert output.shape == (1, 3, 2)

    # Check numerical correctness
    assert torch.allclose(scores, expected_scores, atol=1e-12), "Scores mismatch"
    assert torch.allclose(weights, expected_weights, atol=1e-12), "Weights mismatch"
    assert torch.allclose(output, expected_output, atol=1e-12), "Output mismatch"

    print("Single head attention tests passed!")

def test_scaled_dot_product_attention_with_causal_mask(mask: Tensor):
    scores, weights, output = scaled_dot_product_attention(q, k, v, mask=mask)

    # Shape checks
    assert scores.shape == (1, 3, 3)
    assert weights.shape == (1, 3, 3)
    assert output.shape == (1, 3, 2)

    print('mask: ', mask)
    print('scores: ', scores)
    print('weights: ', weights)
    
    # Row 0: positions 1,2 must be -inf
    assert scores[0,0,1] == float("-inf")
    assert scores[0,0,2] == float("-inf")

    # Row 1: position 2 must be -inf
    assert scores[0,1,2] == float("-inf")

    # Row 2: no masking
    assert not torch.isinf(scores[0,2]).any()
    print('All missing points are filled with -inf confirming mask used properly')
    
    # Check that softmax respects the mask:
    # Row 0 must put all probability on position 0
    assert torch.allclose(weights[0,0], torch.tensor([1.,0.,0.]), atol=1e-12)

    # Row 1 must distribute only over positions 0 and 1
    assert weights[0,1,2] == 0.

    # Row 2 unchanged from unmasked case
    assert torch.allclose(weights[0,2], expected_weights[0,2], atol=1e-12)
    print('All missing points have 0 weights confirming the mask was applied properly')
    print("Causal mask test passed!")

test_scaled_dot_product_attention()

mask = torch.tril(torch.ones(3, 3)).unsqueeze(0)  # shape (1,3,3)
test_scaled_dot_product_attention_with_causal_mask(mask)


Q shape:  torch.Size([3, 2])
q shape:  torch.Size([1, 3, 2])
Single head attention tests passed!
mask:  tensor([[[1., 0., 0.],
         [1., 1., 0.],
         [1., 1., 1.]]])
scores:  tensor([[[0.7071,   -inf,   -inf],
         [0.0000, 0.7071,   -inf],
         [0.7071, 1.4142, 0.7071]]])
weights:  tensor([[[1.0000, 0.0000, 0.0000],
         [0.3302, 0.6698, 0.0000],
         [0.2483, 0.5035, 0.2483]]])
All missing points are filled with -inf confirming mask used properly
All missing points have 0 weights confirming the mask was applied properly
Causal mask test passed!


## Tiny GPT that predicts what will come next

In [3]:
#############################
level="char" #"word"
warmup_steps=8
epochs=2
steps=300
block_size=128
batch_size=4
lr=3e-4
d_model = 128
n_head = 4
n_layer = 2
d_ff = 512
dropout = 0.1
pos_type = "sinusoidal" #"learned"
out_file_path = '/content/attoLLM/data/processed/gpt_outputs.pt' if in_colab() else './../data/processed/gpt_outputs.pt'
#############################


text = load_texts(['/content/attoLLM/data/raw/sample.txt' if in_colab() else './../data/raw/sample.txt'])
ids_info = build_ids_with_tokenizer(text, level)
ds = LMSequenceDataset(ids_info.ids, block_size) # Slice stream into windows
dl = DataLoader(ds, batch_size, shuffle=True, drop_last=True)

vocab_size=ids_info.vocab_size
print('batch size: ', batch_size, 'vocab size: ', vocab_size, 'dl len', len(dl))

cfg = GPTConfig(vocab_size=vocab_size, block_size=block_size, d_model=d_model, n_head=n_head, n_layer=n_layer, d_ff=d_ff, dropout=dropout, pos_type=pos_type)
model =GPT(cfg)

opt = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = None
if warmup_steps > 0:
    # Linear warmup from 0 -> 1 over warmup_steps
    def lr_lambda(step: int) -> float:
        return min(1.0, (step + 1) / float(warmup_steps))
    scheduler = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)

print("Config:", asdict(cfg))
print("Dataset tokens:", ds.ids.numel())

t0 = time()
model.train()

for epoch in range(max(1, epochs)):
    step = 0
    print(f'running epoch {epoch}')
    
    for x, y in dl:
        #print(f'running step {step}')
        if steps and step >= steps:
            break
                
        opt.zero_grad(set_to_none=True)
        logits, loss = model(x, targets=y, pad_id=ids_info.pad_id)
        
        assert loss is not None
        
        loss.backward()
        opt.step()
        
        if scheduler is not None:
            scheduler.step()
        if step % 50 == 0:
            lr_now = opt.param_groups[0]["lr"]
            print(
                f"step {step:5d} lr {lr_now:.5f} "
                f"loss {loss.detach().item():.4f}"
            )
        step += 1
            
dt = time() - t0
print(f"Done. steps={step} time={dt:.1f}s")

# Save checkpoint
out = Path(out_file_path)
out.parent.mkdir(parents=True, exist_ok=True)

ckpt = {
    "config": asdict(cfg),
    "model_state": model.state_dict(),
}

#print(ids_info.token_to_id)

# Save tokenizer metadata if available for easier sampling later
if ids_info.id_to_token is not None:
    ckpt["tokenizer"] = {
        "level": ids_info.level,
        "id_to_token": ids_info.id_to_token,
        "token_to_id": ids_info.token_to_id,
        "pad_id": ids_info.pad_id,
        "unk_id": ids_info.unk_id,
    }
torch.save(ckpt, out)
print("Checkpoints saved to :", out)
    

batch size:  4 vocab size:  45 dl len 550
Config: {'vocab_size': 45, 'block_size': 128, 'd_model': 128, 'n_head': 4, 'n_layer': 2, 'd_ff': 512, 'dropout': 0.1, 'pos_type': 'sinusoidal', 'tie_weights': True}
Dataset tokens: 2330
running epoch 0
step     0 lr 0.00007 loss 3.7943
step    50 lr 0.00030 loss 3.1065
step   100 lr 0.00030 loss 3.0084
step   150 lr 0.00030 loss 3.0224
step   200 lr 0.00030 loss 3.0588
step   250 lr 0.00030 loss 3.0509
running epoch 1
step     0 lr 0.00030 loss 3.0072
step    50 lr 0.00030 loss 2.9962
step   100 lr 0.00030 loss 2.9896
step   150 lr 0.00030 loss 3.0418
step   200 lr 0.00030 loss 3.0809
step   250 lr 0.00030 loss 2.9815
Done. steps=300 time=23.6s
Checkpoints saved to : ../data/processed/gpt_outputs.pt


## Load the checkpoint and test a prompt

In [4]:
"""
load checkpoint
rebuild model from config
load weights
restore tokenizer
encode prompt
run sampling loop
decode output
"""

# rebuild model from saved config and set to eval state
ckpt = torch.load(out_file_path)
cfg = GPTConfig(**ckpt["config"])

model = GPT(cfg)
model.load_state_dict(ckpt["model_state"])
model.eval()

# restore tokenizer
tok_info = ckpt["tokenizer"]
reloaded_id_to_token = tok_info["id_to_token"]
reloaded_token_to_id = tok_info["token_to_id"]
reloaded_pad_id = tok_info["pad_id"]
reloaded_unk_id = tok_info["unk_id"]

#print(reloaded_token_to_id)

reloaded_tok=SimpleTokenizer(Vocab(reloaded_token_to_id,
    reloaded_id_to_token,
    reloaded_pad_id,
    reloaded_unk_id))

prompt = "Philosophy is"
test_token_ids = torch.tensor(reloaded_tok.encode(prompt), dtype=torch.long).unsqueeze(0)
#print(test_token_ids)
gen = sample(model, test_token_ids, max_new_tokens=120, temperature=0.9, top_p=0.95)
generated_text = reloaded_tok.decode(gen.tolist()[0])
print('generated text: ', generated_text)


generated text:  Philosophy isa polnahcsoeeot lec erseaotitstc lreo ppeplr til sfieaecci sioitroyas  anhctethnsnan neyou iieuyr  aneipfeeydia, i ihnct
