In [1]:
import sys
import os
import torch 
from pathlib import Path

def get_project_info() -> tuple[Path, Path]:
  """Locate the project root and the directory the notebook was started in.

  Starting at the resolved current working directory and walking upwards
  through its parents, the first directory that contains an entry named
  ``toy_transformers`` is taken as the project root. If no such directory
  is found, the current working directory itself is used as the root.

  Returns:
    A ``(root, current)`` tuple: the detected project root and the resolved
    working directory at the time of the call.
  """
  current = Path.cwd().resolve()
  # Nearest match wins: `current` is checked first, then each ancestor.
  for candidate in (current, *current.parents):
    if (candidate / "toy_transformers").exists():
      return candidate, current
  # No marker found anywhere up the tree: fall back to the cwd.
  return current, current

# One-time path bootstrap for the notebook.
# The guard matters because the block changes the working directory below:
# get_project_info() reads Path.cwd(), so re-running this cell after the
# chdir would report the project root as the "current" directory and
# overwrite EXPERIMENT_DIR with it.
if 'ROOT_DIR' not in globals():
	ROOT_DIR, EXPERIMENT_DIR = get_project_info()
	# Put the project root on sys.path so the imports that follow this block
	# can find the `toy_transformers` package.
	if str(ROOT_DIR) not in sys.path:
		sys.path.append(str(ROOT_DIR))
	# Switch the working directory to the project root; the notebook's original
	# directory stays available as EXPERIMENT_DIR.
	if Path.cwd() != ROOT_DIR:
		os.chdir(ROOT_DIR)

from toy_transformers.models import gptv1
from toy_transformers import tokenization

In [2]:
# Run-wide settings, read by the cells below.
VOCAB_SIZE = 256  # passed to the BPE builder and used in the vocab cache filename
BATCH_SIZE = 16   # number of sequences sampled per batch
MODE = tokenization.TokenizationMode.STR  # forwarded to tokenization.create_bpe
DEVICE = "mps"    # device string used for tensors, the model and autocast

# Model configuration; `block_size` is later read back as the length of each
# sampled training sequence.
config = gptv1.GPTv1Config(
	vocab_size=VOCAB_SIZE,
	block_size=256,
	device=DEVICE,
	n_heads=6,  # presumably the number of attention heads — confirm in GPTv1Config
)

In [3]:
# The vocabulary is cached next to the notebook, keyed by vocabulary size, so
# BPE training only runs the first time a given size is requested.
vocab_path = EXPERIMENT_DIR / f"vocab_{VOCAB_SIZE}.json"
raw_data_path = ROOT_DIR / "data/gutenberg/freud-interpretation-of-dreams.txt"

if vocab_path.exists():
	vocab = tokenization.Vocabulary.load(vocab_path)
else:
	# Open the corpus in a context manager so the handle is closed once training
	# finishes, and pin the encoding instead of relying on the platform default.
	with open(raw_data_path, "r", encoding="utf-8") as raw_data:
		vocab = tokenization.create_bpe(raw_data, VOCAB_SIZE, MODE)
	vocab.save(vocab_path)

In [4]:
# Encode the whole corpus into one 1-D tensor of token ids on DEVICE.
# The file is read inside a context manager so the handle is closed right away,
# with an explicit encoding rather than the platform default.
with open(raw_data_path, "r", encoding="utf-8") as corpus_file:
	data = torch.tensor(
		vocab.encode(corpus_file.read()),
		dtype=torch.long
	).to(device=DEVICE)

# First 90% of the token stream is used for training, the remainder for validation.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# Module-level values read by get_batch().
block_size, batch_size = config.block_size, BATCH_SIZE
def get_batch(split):
  """Sample a random batch of input/target sequences from one data split.

  Args:
    split: ``'train'`` selects ``train_data``; any other value selects
      ``val_data``.

  Returns:
    A pair ``(x, y)`` of tensors of shape ``(batch_size, block_size)``.
    ``y`` is ``x`` shifted one position to the right in the source stream,
    i.e. ``y[b, t]`` is the token that follows ``x[b, t]``.
  """
  source = train_data if split == 'train' else val_data
  # randint's upper bound is exclusive, so the largest start still leaves room
  # for block_size inputs plus the one extra token needed by the targets.
  starts = torch.randint(len(source) - block_size, (batch_size,), device=DEVICE)
  # positions[b, t] = starts[b] + t. A single vectorised gather replaces the
  # per-sample Python slicing, which had to convert each device-side index to
  # a Python int (one host sync per sample).
  positions = starts.unsqueeze(1) + torch.arange(block_size, device=DEVICE)
  x = source[positions]
  y = source[positions + 1]
  return x, y

@torch.no_grad()
def estimate_val_loss(model):
  model.eval()
  X, Y = get_batch("val")
  _, loss = model(X, Y)
  model.train()
  return loss.item()

In [5]:
# Allow reduced-precision float32 matmuls (speed over precision; see the
# torch.set_float32_matmul_precision documentation for what "medium" permits).
torch.set_float32_matmul_precision("medium")
m = gptv1.LanguageModel(config).to(device=DEVICE)
# Compile the module in place.
m.compile()

optimizer = torch.optim.AdamW(m.parameters(), lr=3e-4)
# Cut the learning rate by 10x once the monitored value has stopped decreasing
# for `patience` consecutive scheduler.step() calls. The training loop calls
# scheduler.step() with the validation loss once every 50 steps, so
# patience=10 corresponds to roughly 500 training steps without improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
  optimizer,
  mode='min',
  factor=0.1,
  patience=10
)

import time
from torch.amp import autocast
from tqdm import tqdm

@torch.compile(fullgraph=False)
def opt_step():
	"""Apply one update of the module-level ``optimizer``.

	Wrapping the call in ``torch.compile`` lets the optimizer update itself be
	compiled; ``fullgraph=False`` permits graph breaks rather than requiring a
	single graph.
	"""
	optimizer.step()

In [8]:
# Training schedule.
MAX_STEPS = 2000      # total optimizer updates
EVAL_INTERVAL = 50    # evaluate on validation data and step the LR scheduler
LOG_INTERVAL = 25     # print progress

for steps in range(MAX_STEPS):
	xb, yb = get_batch('train')
	# NOTE(review): float16 autocast is used without gradient scaling. PyTorch's
	# AMP guidance pairs float16 with a GradScaler to avoid gradient underflow —
	# confirm whether that is needed/available for this device.
	with autocast(device_type=DEVICE, dtype=torch.float16):
		logits, loss = m(xb, yb)
	optimizer.zero_grad(set_to_none=True)
	loss.backward()
	opt_step()

	# Validation runs only on evaluation steps; on other logged steps the
	# validation column prints as None.
	val_loss = None
	if steps % EVAL_INTERVAL == 0:
		val_loss = estimate_val_loss(m)
		scheduler.step(val_loss)

	if steps % LOG_INTERVAL == 0:
		# .item() copies the loss to the host and blocks on the device, so it is
		# only called on steps that actually print it.
		train_loss = loss.item()
		print(steps, train_loss, val_loss)

0 2.8628978729248047 3.2616634368896484
25 3.0183637142181396 None
50 2.962312698364258 3.2225286960601807
75 2.888119697570801 None
100 2.920164108276367 3.3332157135009766
125 2.948301076889038 None
150 2.8485748767852783 3.133199453353882
175 2.8420910835266113 None
200 2.796180486679077 3.0181374549865723
225 2.91645884513855 None
250 2.892462730407715 3.216552734375
275 2.850684404373169 None
300 2.6719589233398438 3.2438554763793945
325 2.7837882041931152 None
350 2.8367955684661865 3.1557326316833496
375 2.6888136863708496 None
400 2.7636306285858154 3.1481988430023193
425 2.6931753158569336 None
450 2.747929573059082 3.203716278076172
475 2.579789161682129 None
500 2.72352933883667 3.271591901779175
525 2.6106324195861816 None
550 2.5243115425109863 3.266279697418213
575 2.6501262187957764 None
600 2.634244918823242 3.0115363597869873
625 2.742239475250244 None
650 2.664534091949463 3.086094856262207
675 2.7224597930908203 None
700 2.670586585998535 3.0264275074005127
725 2.594

In [9]:
# Sample a continuation of a fixed prompt and stream it to stdout token by token.
prompt = "The mind "
idx = torch.tensor([vocab.encode(prompt)], dtype=torch.long, device=DEVICE)
print(idx)
print(prompt, end="", flush=True)

# The training helpers leave the model in training mode. Sample in eval mode
# and without autograd tracking, then put the model back the way it was so a
# later training run is unaffected.
# NOTE(review): assumes m.generate does not already manage eval/no_grad itself;
# doing it here is harmless either way.
was_training = m.training
m.eval()
try:
	with torch.no_grad():
		for token in m.generate(idx, max_new_tokens=200):
			print(vocab.decode([token.item()])[0], end="", flush=True)
finally:
	m.train(was_training)
print()

tensor([[ 36, 126, 145, 129,  47]], device='mps:0')
The mind uring upon will concerned with severalsymaterial for thegeticper outsers mayhis own dreams From thisin their assume one is indeft the birth-fulfilment

The BsShe is susponsible to be having to attrave she to fall the kind parents motor for theunconsciousand of the night in great she children[CQ

MyserMsather(Cheddenous who has beencause so safeeling a came for for his Ih
