In [1]:
import sys
import os
import torch
from pathlib import Path

def get_project_info() -> "tuple[Path, Path]":
  """Locate the project root and the directory the notebook was started from.

  Starting at the resolved current working directory, walk upwards through
  its parents and pick the first directory that contains an entry named
  ``toy_transformers``.

  Returns:
    A ``(root, current)`` pair. ``current`` is the resolved working
    directory at call time. ``root`` is the nearest directory (``current``
    itself or one of its ancestors) containing ``toy_transformers``; if no
    such directory exists, ``root`` falls back to ``current``.
  """
  current = Path.cwd().resolve()
  for candidate in (current, *current.parents):
    if (candidate / "toy_transformers").exists():
      return candidate, current
  # No marker found anywhere up the tree: treat the working directory as root.
  return current, current

# Run the path setup only once per kernel. get_project_info() reads the current
# working directory, and this cell changes it, so a second run would report
# ROOT_DIR as the experiment directory.
if 'ROOT_DIR' not in globals():
	ROOT_DIR, EXPERIMENT_DIR = get_project_info()
	# sys.path entries must be plain strings; other types are ignored by the
	# import system, so append the string form that the membership test checks.
	if str(ROOT_DIR) not in sys.path:
		sys.path.append(str(ROOT_DIR))
	# Work from the project root from here on.
	if Path.cwd() != ROOT_DIR:
		os.chdir(ROOT_DIR)

In [None]:
from toy_transformers.data import tokenization


In [3]:
from toy_transformers.utilities.reproducibility import set_all_seeds

# Seed for this notebook; also passed as base_seed when building the per-epoch dataloaders.
SEED = 42
# NOTE(review): presumably seeds every RNG in use and, with deterministic=True,
# requests deterministic behaviour — confirm against
# toy_transformers.utilities.reproducibility.set_all_seeds.
set_all_seeds(SEED, deterministic=True)

In [None]:
from toy_transformers.data import bpe
from toy_transformers.utilities import io
from toy_transformers.data import tokenization

from toy_transformers.data.dataset import create_dataloader_for_epoch

# Saved artifacts live under the directory the notebook was launched from.
vocab_path = EXPERIMENT_DIR / "artifacts/vocab256"
data_path = EXPERIMENT_DIR / "artifacts/data"

# Rebuild the vocabulary first, then the tokenized dataset, from their saved state dicts.
vocab = bpe.Vocabulary.from_state_dict(io.load(vocab_path))
data = tokenization.TokenizedData.from_state_dict(io.load(data_path))

In [5]:
# Smoke test for the shuffling: for each of 10 epochs, build a loader, take its
# first 10 batches, and print both values of every batch as one hex digit
# (value modulo 16) so the per-epoch ordering can be compared by eye.
CHARS = "0123456789abcdef"
for epoch in range(10):
  train_loader = create_dataloader_for_epoch(
    dataset=data,
    epoch=epoch,
    base_seed=SEED,
    block_size=1,
    batch_size=1,
    shuffle=True,
    drop_last=True,
    pin_memory=False,
  )
  batches = iter(train_loader)
  sampled = []
  for _ in range(10):
    first, second = next(batches)
    sampled.extend((first.item(), second.item()))
  print("".join(CHARS[value % 16] for value in sampled))

3691ecde42114d39565b
ed333c17b328d7c14d34
205fbf29b5307e898b73
1fff2261365272f16253
e79de0e032f63337363b
f1c6f94ccd994dd517ab
ce4904eccff3145c2c16
7fc928263a204ee9cb6d
ec41a7091d6822e0e065
eca1afd7422eafa7d8e0


In [None]:
from toy_transformers.models import gptv1
import pickle
import hashlib

# Model configuration
# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
config = gptv1.GPTv1Config(
    batch_size=64,
    block_size=128,
    n_heads=8,
    n_embed=288,
    n_layers=6,
    dropout=0.2,
    device=device
)

# Create model (will use global random seed for weight initialization)
model = gptv1.LanguageModel(
    vocab_size=len(vocab.tokens),
    config=config
).to(device)

# Fingerprint the freshly initialised weights: move every state_dict tensor to
# the CPU as a NumPy array, pickle the resulting dict, and take its SHA-256.
state = model.state_dict()
state_bytes = pickle.dumps({k: v.cpu().numpy() for k, v in state.items()})
model_init_hash = hashlib.sha256(state_bytes).hexdigest()

# Print the fingerprint so a fresh run can be compared with the recorded output.
print(model_init_hash)

fbae6d0cb06c75501f89fb2f4d5b24f91c6ee953080226c833786f09a769c46c
