# Run02: GPT-v1 with Full Artifact Chain

This experiment follows the workflow from `gpt-v1-w-tokenizer`, but builds each stage as a linked artifact (steps 1–4) and finishes with a text-generation demo (step 5):
1. RawDataset - Store raw text with compression
2. Vocabulary - BPE tokenizer (256 tokens)
3. TokenizedData - Tokenized tensor with artifact references
4. TrainingRun - Model training with checkpoints
5. Text Generation Demo

In [None]:
import sys
import os
import torch
from pathlib import Path

def get_project_info() -> tuple[Path, Path]:
    """Locate the project root and the directory this session started in.

    Walks upward from the current working directory (inclusive) and picks the
    nearest directory that contains a ``toy_transformers`` entry. If no
    directory on that path qualifies, the working directory itself is used as
    the root.

    Returns:
        ``(root, current)``: the detected project root and the resolved
        current working directory.
    """
    current = Path.cwd().resolve()
    # Nearest match wins; fall back to ``current`` when nothing matches.
    root = next(
        (
            candidate
            for candidate in (current, *current.parents)
            if (candidate / "toy_transformers").exists()
        ),
        current,
    )
    return root, current

# One-time session bootstrap. The ``globals()`` guard stops a re-run of this
# cell from recomputing the paths after the working directory has been moved
# to the project root below.
if "ROOT_DIR" not in globals():
    ROOT_DIR, EXPERIMENT_DIR = get_project_info()

    # Make the project importable without adding a duplicate sys.path entry.
    if sys.path.count(str(ROOT_DIR)) == 0:
        sys.path.append(str(ROOT_DIR))

    # Switch the working directory to the project root if not already there.
    if ROOT_DIR != Path.cwd():
        os.chdir(ROOT_DIR)

print(f"Root: {ROOT_DIR}", f"Experiment: {EXPERIMENT_DIR}", sep="\n")

In [None]:
from toy_transformers.data import RawDataset, register_raw_dataset
from toy_transformers.data import tokenization
from toy_transformers.data.bpe import Vocabulary
from toy_transformers.training.training_run import TrainingRun
from toy_transformers.configs import (
    DataConfig, GPTv1Config, OptimizerConfig,
    AdamWConfig, ReduceLROnPlateauConfig
)
from toy_transformers.utilities import io
from toy_transformers.utilities.reproducibility import set_all_seeds

# Only reached if none of the imports above raised.
print("Imports successful")

## Stage 1: Create RawDataset Artifact

In [None]:
# All artifacts of this run live under <experiment dir>/artifacts, one
# sub-path per stage.
ARTIFACTS_DIR = EXPERIMENT_DIR / "artifacts"
RAW_DATA_PATH, VOCAB_PATH, TOKENIZED_PATH, CHECKPOINT_PATH = (
    ARTIFACTS_DIR / name
    for name in ("raw_dataset", "vocab256", "tokenized_data", "checkpoint")
)

# Read the whole input corpus into memory as UTF-8 text.
raw_text = (ROOT_DIR / "data/input.txt").read_text(encoding="utf-8")

# Register the text as the first artifact in the chain; the path is passed
# as a plain string.
raw_dataset = register_raw_dataset(raw_text, str(RAW_DATA_PATH))
print(
    f"RawDataset created: {raw_dataset.artifact_id}",
    f"Text length: {len(raw_dataset.text):,} characters",
    sep="\n",
)

## Stage 2: Create Vocabulary Artifact (BPE, 256 tokens)

In [None]:
# Build the vocabulary artifact from the raw-dataset artifact. Both are
# addressed by string path: source first, destination second.
vocab = tokenization.create_vocabulary_from_raw(
    *(str(path) for path in (RAW_DATA_PATH, VOCAB_PATH)),
    vocab_size=256,
)
print(
    f"Vocabulary created: {vocab.artifact_id}",
    f"Vocab size: {len(vocab.tokens)}",
    sep="\n",
)

## Stage 3: Create TokenizedData Artifact

In [None]:
# Produce the tokenized-data artifact. Argument order is vocabulary path,
# raw-dataset path, then output path, all as strings.
tokenized_data = tokenization.tokenize_with_vocabulary(
    *(str(path) for path in (VOCAB_PATH, RAW_DATA_PATH, TOKENIZED_PATH))
)

# Report the new artifact together with the ids of the artifacts it links to.
print(
    f"TokenizedData created: {tokenized_data.artifact_id}",
    f"Tokens: {len(tokenized_data.data):,}",
    f"Linked vocab: {tokenized_data.vocab_id}",
    f"Linked raw: {tokenized_data.raw_dataset_id}",
    sep="\n",
)

## Stage 4: Set Up Model and Training Configuration

In [None]:
# Fix the seed (42) before any model or data objects are built.
set_all_seeds(42, deterministic=True)

# Use the MPS backend when PyTorch reports it available; otherwise the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Model hyperparameters (similar to gpt-v1-w-tokenizer).
model_config = dict(n_heads=6, n_embed=288, n_layers=6, dropout=0.2)

# Batching setup; vocab_size matches the 256-token vocabulary created in Stage 2.
data_config = DataConfig(vocab_size=256, block_size=256, batch_size=16, shuffle=True)

# AdamW plus a reduce-on-plateau scheduler (factor 0.1, patience 10).
optimizer_config = OptimizerConfig(
    optimizer_type="adamw",
    optimizer_params=AdamWConfig(lr=3e-4, weight_decay=0.01),
    scheduler=ReduceLROnPlateauConfig(mode="min", factor=0.1, patience=10),
    device=device,
    seed=42,
    max_epochs=1,
)

print(
    f"Device: {device}",
    f"Model: gpt-v1, n_embed={model_config['n_embed']}, n_layers={model_config['n_layers']}",
    f"Data: vocab_size={data_config.vocab_size}, block_size={data_config.block_size}, batch_size={data_config.batch_size}",
    sep="\n",
)

## Stage 5: Create TrainingRun

In [None]:
# Bundle the model, data and optimizer configs into one TrainingRun that is
# tied to the tokenized-data artifact by id.
training_run = TrainingRun(
    model_name="gpt-v1",
    model_config=model_config,
    data_config=data_config,
    optimizer_config=optimizer_config,
    processed_dataset_id=tokenized_data.artifact_id,
)

# Record the dataset hash on the run for later verification.
training_run.set_dataset_hash(tokenized_data)

# Let the run build the model, move it to the target device, then build the
# optimizer for it.
model = training_run.create_model()
model = model.to(device)
optimizer = training_run.create_optimizer(model)

# Total element count across every parameter tensor.
num_params = sum(weights.numel() for weights in model.parameters())
print(
    f"TrainingRun created",
    f"Model parameters: {num_params:,}",
    f"Dataset hash: {training_run.dataset_hash}",
    sep="\n",
)

## Stage 6: Training Loop

In [None]:
# Training loop: runs a fixed number of optimizer steps over `tokenized_data`,
# logging every step on `training_run` and checkpointing to CHECKPOINT_PATH.
from torch.amp import autocast
from tqdm import tqdm

# Allow reduced-precision float32 matmuls; see the
# torch.set_float32_matmul_precision docs for what "medium" permits.
torch.set_float32_matmul_precision("medium")
model.train()

# Loop schedule, all measured in loop iterations (one optimizer step each).
num_steps = 4000
save_every = 1000
log_every = 50

dataloader = training_run.create_dataloader(tokenized_data, epoch=0)

# NOTE(review): `max_epochs` from optimizer_config is not consulted here; the
# loop keeps starting new epochs until `num_steps` iterations have run.
for step in tqdm(range(num_steps), desc="Training"):
    # Fetch the next batch. When the iterator is exhausted, start a new epoch:
    # bump the epoch counter, reset the per-epoch batch count, and build a
    # fresh dataloader for that epoch.
    try:
        x, y = next(dataloader)
    except StopIteration:
        training_run.epoch += 1
        training_run.batches_completed = 0
        dataloader = training_run.create_dataloader(tokenized_data, training_run.epoch)
        x, y = next(dataloader)

    x, y = x.to(device), y.to(device)

    # Forward pass under autocast with float16 as the reduced-precision dtype.
    # NOTE(review): no gradient scaler is used with float16 here -- confirm
    # this is intentional for the mps/cpu targets.
    with autocast(device_type=device, dtype=torch.float16):
        logits, loss = model(x, y)

    # Clear old gradients, backpropagate, then update the weights.
    # `optimizer` is the project's wrapper; the wrapped optimizer is stepped
    # directly, presumably so the wrapper does not also step the scheduler,
    # which is driven manually below (TODO confirm).
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.optimizer.step()  # Step underlying optimizer only

    # Advance the run's global step counter and its per-epoch batch counter.
    training_run.step += 1
    training_run.batches_completed += 1

    # Record this batch's loss and the current learning rate on the run.
    train_loss = loss.item()
    training_run.log_step(train_loss=train_loss, lr=optimizer.get_lr())

    # Log and step scheduler periodically (matching original pattern).
    # The scheduler is fed only the most recent single-batch loss, once every
    # `log_every` iterations.
    # NOTE(review): with patience=10 configured in Stage 4, that is presumably
    # about 10 * log_every iterations without improvement before the LR is
    # reduced -- confirm against the scheduler's semantics.
    if step % log_every == 0:
        if optimizer.scheduler is not None:
            optimizer.scheduler.step(train_loss)
        print(f"Step {step}: loss={train_loss:.4f}, lr={optimizer.get_lr():.2e}")

    # Save checkpoint (skipped at step 0). `step` is the zero-based loop
    # index; training_run.step has already been incremented for this
    # iteration. Every save targets the same CHECKPOINT_PATH.
    if step > 0 and step % save_every == 0:
        training_run.save(str(CHECKPOINT_PATH), model, optimizer)
        print(f"Checkpoint saved at step {step}")

# Final save
# NOTE: `train_loss` is bound only inside the loop, so the print below needs
# num_steps > 0.
training_run.save(str(CHECKPOINT_PATH), model, optimizer)
print(f"Training complete! Final loss: {train_loss:.4f}")

## Stage 7: Text Generation Demo

In [None]:
seed_text = "The "

# Decode with a Vocabulary rebuilt from the artifact saved at VOCAB_PATH.
vocab_data = io.load(str(VOCAB_PATH))
vocab_obj = Vocabulary.from_state_dict(vocab_data)

# Encode the prompt; the outer list adds a leading batch dimension.
seed_tokens = vocab_obj.encode(seed_text)
idx = torch.tensor([seed_tokens], dtype=torch.long, device=device)

print(f"Seed: '{seed_text}'", f"Generating 200 tokens...\n", sep="\n")

# Switch to eval mode and stream the output as tokens arrive, with gradient
# tracking disabled.
model.eval()
print(seed_text, end="", flush=True)
with torch.no_grad():
    for token in model.generate(idx, max_new_tokens=200):
        # NOTE(review): `[0]` keeps only the first element of whatever decode
        # returns -- confirm decode yields a list of strings, not one string.
        print(vocab_obj.decode([token.item()])[0], end="", flush=True)
print()

## Summary: Artifact Chain

This experiment created the following artifacts:
- `artifacts/raw_dataset/` - RawDataset (compressed text)
- `artifacts/vocab256/` - Vocabulary (BPE with 256 tokens)
- `artifacts/tokenized_data/` - TokenizedData (with vocab and raw_dataset references)
- `artifacts/checkpoint/` - TrainingRun (model weights, optimizer state, logs)

In [None]:
# Single report of the artifact ids and the links between them, one line per
# entry.
print(
    "Artifact Chain Summary:",
    f"  RawDataset: {raw_dataset.artifact_id}",
    f"  Vocabulary: {vocab.artifact_id}",
    f"  TokenizedData: {tokenized_data.artifact_id}",
    f"    -> vocab_id: {tokenized_data.vocab_id}",
    f"    -> raw_dataset_id: {tokenized_data.raw_dataset_id}",
    f"  TrainingRun: step={training_run.step}, epoch={training_run.epoch}",
    f"    -> processed_dataset_id: {training_run.processed_dataset_id}",
    sep="\n",
)