In [1]:
import sys
import os
import torch
from pathlib import Path

def get_project_info() -> tuple[Path, Path]:
	"""Locate the project root and the directory the notebook was started from.

	Starting at the resolved current working directory, checks that directory
	and then each of its ancestors (nearest first) for an entry named
	``toy_transformers``.

	Returns:
		A ``(root, current)`` pair of resolved paths. ``root`` is the nearest
		directory that contains ``toy_transformers``; if no directory on the
		way up contains it, ``root`` equals ``current``. ``current`` is the
		resolved working directory at call time.
	"""
	current = Path.cwd().resolve()
	candidates = (current, *current.parents)
	# The second argument to next() is the fallback used when the marker
	# directory is not found anywhere up the tree.
	root = next(
		(folder for folder in candidates if (folder / "toy_transformers").exists()),
		current,
	)
	return root, current

# One-time bootstrap per kernel. The guard matters because this block changes
# the working directory to ROOT_DIR: running get_project_info() a second time
# would then report ROOT_DIR as the "current" directory and overwrite
# EXPERIMENT_DIR with it.
if not ('ROOT_DIR' in globals()):
	ROOT_DIR, EXPERIMENT_DIR = get_project_info()
	# Make the project root importable (only if it is not already listed).
	if sys.path.count(str(ROOT_DIR)) == 0:
		sys.path.append(str(ROOT_DIR))
	# Run the rest of the notebook from the project root.
	if ROOT_DIR != Path.cwd():
		os.chdir(ROOT_DIR)

from toy_transformers.models import gptv3
from toy_transformers import tokenization
from toy_transformers import checkpoint

In [2]:
# --- Tokenization / batching ---
VOCAB_SIZE = 4096
BATCH_SIZE = 16
GRAD_ACCUM_STEPS = 2
MODE = tokenization.TokenizationMode.STR

# --- Compute device ---
# Apple MPS stays the first choice (it was previously hard-coded), but fall
# back to CUDA and then CPU so the notebook also runs on machines without MPS.
# NOTE(review): assumes GPTv3Config accepts any torch device string — confirm.
if torch.backends.mps.is_available():
	DEVICE = "mps"
elif torch.cuda.is_available():
	DEVICE = "cuda"
else:
	DEVICE = "cpu"

# --- Optimisation settings ---
# Not consumed in the cells visible here; presumably read by a later training cell.
MAX_LR = 3e-4
MIN_LR = 3e-5
WARMUP_STEPS = 500
NUM_EPOCHS = 10

# --- Evaluation / logging cadence ---
EVAL_INTERVAL = 100
EVAL_BATCHES = 20
LOG_INTERVAL = 10

# Model configuration; vocab_size and device are tied to the constants above.
config = gptv3.GPTv3Config(
	vocab_size=VOCAB_SIZE,
	block_size=512,
	device=DEVICE,
	n_heads=8,
	n_embed=512,
	n_layers=8
)

In [3]:
# The vocabulary file lives next to the notebook and is keyed by vocab size;
# the raw corpus lives under the project root.
vocab_path = EXPERIMENT_DIR / f"data/vocab_{VOCAB_SIZE}.json"
DATA_DIR = ROOT_DIR / "data/raw/simplebooks/simplebooks-92-raw"
TRAIN_PATH = DATA_DIR / "train.txt"
VALID_PATH = DATA_DIR / "valid.txt"
TEST_PATH = DATA_DIR / "test.txt"

if vocab_path.exists():
	# Reuse the previously saved vocabulary instead of building it again.
	vocab = tokenization.Vocabulary.load(vocab_path)
else:
	# Ensure the output directory exists before doing the expensive work, so a
	# fresh checkout does not fail at save time.
	vocab_path.parent.mkdir(parents=True, exist_ok=True)
	# The context manager closes the training file once the vocabulary is built
	# (the handle was previously left open); the encoding is pinned so the
	# result does not depend on the platform default.
	with open(TRAIN_PATH, "r", encoding="utf-8") as raw_data:
		vocab = tokenization.create_bpe(
			raw_data,
			VOCAB_SIZE, MODE
		)
	vocab.save(vocab_path)

In [None]:
def cache_and_tokenize(text_path: Path, cache_path: Path, vocab: tokenization.Vocabulary) -> torch.Tensor:
	"""Return the token ids for a text file, using an on-disk cache.

	If ``cache_path`` already exists it is loaded with ``torch.load`` and
	returned as-is. Otherwise the whole of ``text_path`` is read, encoded with
	``vocab.encode``, converted to a ``torch.long`` tensor, saved to
	``cache_path`` and returned.

	Note:
		The cache is looked up by ``cache_path`` alone. It is NOT invalidated
		when the text file or the vocabulary changes; delete the cache file to
		force re-tokenization.

	Args:
		text_path: Text file to tokenize (read as UTF-8).
		cache_path: Where the token tensor is stored / loaded from. Missing
			parent directories are created.
		vocab: Object whose ``encode(text)`` result is passed to
			``torch.tensor``.

	Returns:
		The token tensor, either loaded from the cache or freshly encoded.
	"""
	if cache_path.exists():
		print("loading from cache")
		return torch.load(cache_path, weights_only=True)

	print("tokenizing...")
	# Close the file deterministically and pin the encoding.
	with open(text_path, "r", encoding="utf-8") as text_file:
		text = text_file.read()
	tokens = torch.tensor(vocab.encode(text), dtype=torch.long)
	# torch.save does not create directories; make sure the target one exists.
	cache_path.parent.mkdir(parents=True, exist_ok=True)
	torch.save(tokens, cache_path)
	return tokens

# Tokenize each split (or load it from its cache file under EXPERIMENT_DIR),
# in the order train -> valid -> test.
train_tokens = cache_and_tokenize(
	text_path=TRAIN_PATH,
	cache_path=EXPERIMENT_DIR / "data/train.pt",
	vocab=vocab,
)
val_tokens = cache_and_tokenize(
	text_path=VALID_PATH,
	cache_path=EXPERIMENT_DIR / "data/valid.pt",
	vocab=vocab,
)
test_tokens = cache_and_tokenize(
	text_path=TEST_PATH,
	cache_path=EXPERIMENT_DIR / "data/test.pt",
	vocab=vocab,
)

tokenizing...
tokenizing...
tokenizing...


In [11]:
# Preview the first 200 characters of the raw training text. The context
# manager closes the file afterwards instead of leaving the handle open.
with open(TRAIN_PATH, "r", encoding="utf-8") as train_file:
	print(train_file.read(200))

Dave Darrin's Second Year At Annapolis

Or

Two Midshipmen As Naval Academy "Youngsters"

By

H. Irving Hancock



Chapter I

A Question Of Midshipman Honor

"How can a midshipman and gentleman act in


In [12]:
# Round-trip check: decode the first 100 training tokens and print the pieces
# back-to-back (no separator) so the result can be compared with the raw text.
print(*vocab.decode(train_tokens[:100]), sep="")

Dave Darrin's Second Year At Annapolis

Or

Two Midshipmen As Naval Academy "Youngsters"

By

H. Irving Hancock



Chapter I

A Question Of Midshipman Honor

"How can a midshipman and gentleman act in that way?"

The voice of Midshipman David Darrin, United
