In [1]:
from pathlib import Path

import torch
from tqdm import tqdm

from models import BigramLanguageModel, GPTLanguageModel

In [2]:
# Seed PyTorch's global random number generator so the random draws made
# below (e.g. batch sampling with torch.randint) are repeatable.
torch.manual_seed(seed=2024);

In [ ]:
# Read the whole training corpus into memory as a single string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (plain ASCII qualifies) - confirm.
text: str = Path("tiny_shakespeare.txt").read_text(encoding="utf-8")

# The vocabulary is every distinct character that occurs in the corpus,
# sorted so that a character's position is the same on every run.
VOCABULARY: tuple[str, ...] = tuple(sorted(set(text)))
VOCABULARY_SIZE: int = len(VOCABULARY)

In [3]:
# Lookup tables between characters and integer token ids; a character's id is
# its position in VOCABULARY.
idx2char = dict(enumerate(VOCABULARY))
char2idx = {char: idx for idx, char in idx2char.items()}


def encode(string: str) -> tuple[int, ...]:
    """Translate ``string`` into token ids, one id per character.

    Each character is looked up in ``char2idx``; a character without an
    entry there raises ``KeyError``.
    """
    indices = []
    for character in string:
        indices.append(char2idx[character])
    return tuple(indices)


def decode(tup: tuple[int, ...]) -> str:
    """Turn a sequence of token ids back into text.

    Each id is looked up in ``idx2char`` and the resulting characters are
    concatenated in order; an unknown id raises ``KeyError``.
    """
    characters = (idx2char[index] for index in tup)
    return "".join(characters)

In [4]:
# Train / validation split: the last VALIDATION_PROPORTION of the encoded
# corpus is held out for validation, everything before it is used for training.
VALIDATION_PROPORTION: float = 0.1

# Encode the whole corpus once as a tensor of integer token ids.
data: torch.Tensor = torch.tensor(encode(text), dtype=torch.int64)
n_train_samples = int((1 - VALIDATION_PROPORTION) * len(data))
train_data, val_data = data[:n_train_samples], data[n_train_samples:]

In [5]:
def get_batch(split: str) -> tuple[torch.Tensor, torch.Tensor]:
    """Sample a random batch of input windows and their next-token targets.

    ``split == "train"`` draws from ``train_data``; any other value draws
    from ``val_data``. ``BATCH_SIZE`` start offsets are picked at random and a
    window of ``BLOCK_SIZE`` tokens is cut at each one.

    Returns ``(x, y)`` where both stack one window per row and every element
    of ``y`` is the token that directly follows the matching element of ``x``
    in the source sequence.
    """
    if split == "train":
        source = train_data
    else:
        source = val_data

    # The upper bound leaves room for the target window, which ends one token
    # later than the input window.
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start : start + BLOCK_SIZE])
        targets.append(source[start + 1 : start + BLOCK_SIZE + 1])
    return torch.stack(inputs), torch.stack(targets)

In [6]:
# Hyperparameters for the bigram baseline.
# NOTE(review): the following cell reassigns every one of these names (and
# `model`), so only the configuration that was run last takes effect.

# Batch geometry used by get_batch.
BATCH_SIZE = 32  # windows per batch
BLOCK_SIZE = 8  # tokens per window

# Optimisation schedule.
LEARNING_RATE = 0.01
MAX_ITERS = 3_000  # optimisation steps
EVAL_INTERVAL = 300  # steps between loss estimates
EVAL_ITERS = 200  # batches averaged per loss estimate

model = BigramLanguageModel(vocabulary_size=VOCABULARY_SIZE)

In [27]:
# Hyperparameters for the GPT model; these replace the bigram settings above.

# Batch geometry used by get_batch.
BATCH_SIZE = 64  # windows per batch
BLOCK_SIZE = 128  # tokens per window

# Optimisation schedule.
LEARNING_RATE = 0.0003
MAX_ITERS = 5_000  # optimisation steps
EVAL_INTERVAL = 500  # steps between loss estimates
EVAL_ITERS = 10  # batches averaged per loss estimate

# NOTE(review): n_embeddings=256 is not a multiple of n_heads=3 - confirm that
# GPTLanguageModel handles a non-divisible embedding size as intended.
model = GPTLanguageModel(
    vocabulary_size=VOCABULARY_SIZE,
    block_size=BLOCK_SIZE,
    n_embeddings=256,
    n_layers=3,
    n_heads=3,
    dropout=0.2,
)

In [25]:
# Report the model size: total element count over all parameters, in millions.
n_parameters = sum(param.numel() for param in model.parameters())
print(n_parameters / 1e6, "M parameters")

2.430529 M parameters


In [22]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global ``model`` on both data splits.

    For each of ``"train"`` and ``"val"``, ``EVAL_ITERS`` random batches are
    drawn with ``get_batch`` and run through the model in eval mode with
    gradient tracking disabled; the per-batch losses are averaged.

    Returns:
        A dict with keys ``"train"`` and ``"val"``, each mapping to a 0-dim
        tensor holding the mean loss for that split.

    The model is put back into training mode before the function exits, even
    when evaluation raises or is interrupted, so an aborted evaluation cannot
    leave the model silently stuck in eval mode.
    """
    out = {}
    model.eval()
    try:
        for split in ["train", "val"]:
            losses = torch.zeros(EVAL_ITERS)
            # position=1 / leave=False keep this bar nested below the outer
            # training progress bar and remove it once the split is done.
            for k in tqdm(range(EVAL_ITERS), position=1, leave=False, desc=f"Evaluating on {split} set"):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train()
    return out

In [14]:
# Load a previously saved model from disk; this replaces whatever `model` was
# built by the configuration cells above.
#
# The result is used directly as a model (``.eval()`` is called on it), so the
# file holds a whole pickled module rather than a state dict. weights_only is
# therefore set to False explicitly: PyTorch releases whose default is
# weights_only=True refuse to unpickle arbitrary classes.
# SECURITY: full unpickling can execute arbitrary code - only load checkpoint
# files you created yourself or otherwise trust.
model = torch.load("gpt.pt", weights_only=False)
# model.load("bigram.pt")
model.eval();

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Put the model in training mode explicitly. The checkpoint-loading cell
# leaves it in eval mode, and without this call the steps before the first
# loss estimate would run in eval mode (estimate_loss is the only other place
# that switches the model back to training mode).
model.train()

for it in tqdm(range(MAX_ITERS)):
    # Every EVAL_INTERVAL steps, report the estimated loss on both splits.
    if (it + 1) % EVAL_INTERVAL == 0:
        losses = estimate_loss()
        print(f"loss: train {losses['train']:.4f}, val {losses['val']:.4f}")

    # Sample a batch of training data.
    xb, yb = get_batch("train")

    # Forward pass, then one optimiser step on the resulting loss.
    _, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

 10%|▉         | 499/5000 [06:06<1:15:21,  1.00s/it]
Evaluating on train set:   0%|          | 0/10 [00:00<?, ?it/s][A
Evaluating on train set:  10%|█         | 1/10 [00:00<00:02,  4.41it/s][A
Evaluating on train set:  20%|██        | 2/10 [00:00<00:01,  4.32it/s][A
Evaluating on train set:  30%|███       | 3/10 [00:00<00:01,  4.28it/s][A
Evaluating on train set:  40%|████      | 4/10 [00:00<00:01,  4.26it/s][A
Evaluating on train set:  50%|█████     | 5/10 [00:01<00:01,  4.28it/s][A
Evaluating on train set:  60%|██████    | 6/10 [00:01<00:00,  4.28it/s][A
Evaluating on train set:  70%|███████   | 7/10 [00:01<00:00,  4.31it/s][A
Evaluating on train set:  80%|████████  | 8/10 [00:01<00:00,  4.29it/s][A
Evaluating on train set:  90%|█████████ | 9/10 [00:02<00:00,  4.29it/s][A
Evaluating on train set: 100%|██████████| 10/10 [00:02<00:00,  4.25it/s][A
                                                                        [A
Evaluating on val set:   0%|          | 0/10 [00:00<?

loss: train 1.7878, val 1.9337


 14%|█▎        | 677/5000 [08:24<52:22,  1.38it/s]  

In [None]:
# Persist the whole model object (not just a state dict) to disk.
torch.save(obj=model, f="gpt-demo-large.pt")

In [13]:
# Starting prompt for generation: a single sequence holding one token, id 0.
context = torch.zeros(1, 1, dtype=torch.int64)

In [16]:
# Sample 1500 new tokens starting from `context` and print them as text.
# The model is switched to eval mode first: after the training cell it is in
# training mode, where layers such as dropout would perturb the samples.
# Gradient tracking is not needed for sampling, so it is disabled as well.
model.eval()
with torch.no_grad():
    generated = model.generate(context, max_new_tokens=1500)
print(decode(tuple(generated[0].tolist())))



Nurse:
I am, where in to is besevery, and unto it the den,
To And what prossided when castes trick mother,
The gainst no earry!

METRLANUS:
No thy al pine.

Second Cury!
BRUTUS:
my dure! True! whereford't wenty hould find
Witht in should; do herefore inly, if
That thart won welps the folliht them all king the nobbe wing'd,
Ander he to drawns helence? Heavy that wars her shouldincis,
Buth they alonge hath yet swear Duke marriator:
And which !

DUCHARD II:
No mindes unothing ha now?
Take unram of mall our userit on the page.
Senator your, agood nevil, God alkay?

This then as of the tet their on of words: be Juldenety.

Fivost:
Thou he a criuries honour the his commpany mind;
And care gentlement shall it liss execes of appreemianted!
Do, on it gaters, crafe, strtial fend
Of it on hild country apsed;
Breas, and thyal renoble?

ESCAPULIET:
I force no thusbatstire a nothing wall
Indo a Rome, thart the shalt mose! God mock!
That shall me on ion; a of to my commber
And natulare to belierch.