# Step-by-step run of AlphaZero self-play & training.


In [1]:
import os
import time
from pathlib import Path
from collections import defaultdict, Counter
import asyncio
from typing import Callable

import numpy as np
import torch
import torch.nn.functional as F

# Game and players
from rgi.rgizero.games.connect4 import Connect4Game
from rgi.rgizero.players.alphazero import AlphazeroPlayer
from rgi.rgizero.players.alphazero import play_game

from notebook_utils import reload_local_modules

# Reaching this line means every import above resolved.
print("✅ Imports successful")

# Choose the torch backend in priority order: CUDA first, then Apple MPS,
# and CPU only when neither accelerator is reported as available.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
print(f'Using device: {device}')

# The notebook deliberately stops here on CPU-only machines.
assert device in ('cuda', 'mps'), f"No accelerator available, device={device}"

# Allow asyncio to work with jupyter notebook
import nest_asyncio
nest_asyncio.apply()

# Increase numpy print width
np.set_printoptions(linewidth=300)

%load_ext line_profiler

✅ Imports successful
Using device: mps


In [2]:
# --- Run switches -----------------------------------------------------------
DEBUG_MODE = True     # Set options to make debugger work properly. Single worker, etc.
LOAD_MODEL = False
TRAIN_MODEL = True
# NOTE(review): the original note "If False, we still load previous games from
# disk." sat directly below these two flags; presumably it describes
# RUN_GENERATIONS — confirm.
RUN_GENERATIONS = True
RUN_TOURNAMENT = False

# --- Values baked into CONFIG_ALIAS ------------------------------------------
# CONFIG_ALIAS is built from the values exactly as they stand at this point.
MODEL_SIZE = "tiny"  # "tiny" or "small" or "large" or "xl"
NUM_SIMULATIONS = 200
NUM_GAMES = 10_000
NUM_GENERATIONS = 20
TRAIN_BATCH_SIZE = 2048
MAX_TRAINING_EPOCHS = 10
MAX_TRAINING_ITERS = 1_000_000 // TRAIN_BATCH_SIZE

CONFIG_ALIAS = f'trajectory_sims-{NUM_SIMULATIONS}_games-{NUM_GAMES}_size-{MODEL_SIZE}_train-{MAX_TRAINING_ITERS}_x1'

# --- DEBUG overrides ---------------------------------------------------------
# Applied after CONFIG_ALIAS is computed, so the alias keeps the "tiny"/488
# values while the names below carry the larger model size and higher
# iteration/epoch caps from here on.
MODEL_SIZE, MAX_TRAINING_EPOCHS = "small", 10_000
MAX_TRAINING_ITERS = 100_000_000 // TRAIN_BATCH_SIZE

## Step 1: Set up history-wrapped game


In [3]:
from rgi.rgizero.games.history_wrapper import HistoryTrackingGame
from rgi.rgizero.data.trajectory_dataset import Vocab
from rgi.rgizero.common import TOKENS

# Base game plus an upper bound on the number of moves in one game
# (7*6 — presumably one move per board cell; confirm against Connect4Game).
base_game = Connect4Game(connect_length=4)
max_game_length = 7*6

# Wrap the base game in HistoryTrackingGame and take its initial state.
game = HistoryTrackingGame(base_game)
state_0 = game.initial_state()

# Both context-length names carry the same value: the longest game plus 2.
block_size = n_max_context = max_game_length + 2

# Action vocabulary: the START_OF_GAME token first, then every game action.
all_actions = game.all_actions()
action_vocab = Vocab(itos=[TOKENS.START_OF_GAME] + list(all_actions))
game_name = type(base_game).__name__

print("✅ Using HistoryTrackingGame from module")
print(f"Game: {game_name}, Players: {game.num_players(state_0)}, Actions: {list(game.all_actions())}")

# Data and model directories live next to the notebook's parent directory,
# keyed by game name and CONFIG_ALIAS. Missing parents are created as needed.
DATA_DIR = Path.cwd().parent / "data" / "rgizero-e2e" / game_name / CONFIG_ALIAS
print("Creating data dir: ", DATA_DIR)
DATA_DIR.mkdir(parents=True, exist_ok=True)

MODEL_DIR = Path.cwd().parent / "models" / "rgizero-e2e" / game_name / CONFIG_ALIAS
print("Creating model dir: ", MODEL_DIR)
MODEL_DIR.mkdir(parents=True, exist_ok=True)


✅ Using HistoryTrackingGame from module
Game: Connect4Game, Players: 2, Actions: [1, 2, 3, 4, 5, 6, 7]
Creating data dir:  /Users/rodo/src/rgi3/data/rgizero-e2e/Connect4Game/trajectory_sims-200_games-10000_size-tiny_train-488_x1
Creating model dir:  /Users/rodo/src/rgi3/models/rgizero-e2e/Connect4Game/trajectory_sims-200_games-10000_size-tiny_train-488_x1


## Step 2: Create random generation_0 model


# Tune Model


In [6]:
# Presumably re-imports the project's local modules so on-disk edits are picked
# up without restarting the kernel — confirm in notebook_utils.
reload_local_modules(verbose=False)

# Settings that are held constant for the whole tuning session; handed to the
# Tuner as `fixed_params` and never varied.
fixed_params = {
    'model_name': 'c4-tuning',
    'model_version': '0.1',
    'num_players': game.num_players(state_0),
    'vocab_size': action_vocab.vocab_size,
    # NOTE(review): the key is spelled 'num_genrations' (sic). Kept as-is
    # because the Tuner's expected key is not visible here — confirm whether
    # it should be 'num_generations'.
    'num_genrations': NUM_GENERATIONS,
    'data_dir': DATA_DIR,

    # Evaluation / logging cadence.
    'eval_iters': 200,
    'log_interval': 1000,
    'eval_interval': 10_000,

    'device': device,
}

INITIAL_LEARNING_RATE = 0.05

# Starting point for tuning; handed to the Tuner as `initial_params`.
# Key order is kept exactly as before.
initial_params = {
    # Model shape: a tiny model.
    'n_layer': 2,
    'n_head': 2,
    'n_embd': 8,

    # Context length and batching.
    'n_max_context': n_max_context,
    'batch_size': 32,
    'gradient_accumulation_steps': 1,

    # Stopping: max_epochs is set very high so that max_iters decides when to stop.
    'max_iters': 100,
    'max_epochs': 1_000_000,

    # Learning-rate schedule.
    'learning_rate': INITIAL_LEARNING_RATE,
    'decay_lr': True,                        # whether to decay the learning rate
    'lr_decay_iters': 100,                   # usually equal to max_iters
    'min_lr': INITIAL_LEARNING_RATE / 10,    # usually learning_rate / 10
    'warmup_iters': 0,                       # potentially not very important

    # Optimizer.
    'weight_decay': 1e-1,
    'beta1': 0.9,
    'beta2': 0.95,
    'grad_clip': 1.0,                        # clip gradients at this value, or disable if == 0.0

    'dtype': "float16",

    'dropout': 0.0,
    'bias': False,  # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
}

# Search space handed to the Tuner as `tune_options`: for each tunable
# hyperparameter, the list of candidate values it may try.
tune_options = dict(
    n_layer = [2, 3, 4, 5, 6, 8, 10, 12, 16, 32],
    # n_head is intentionally absent: its candidates are derived in
    # computed_tune_options so that n_embd % n_head == 0 always holds.
    n_embd = [8, 16, 32, 64, 128, 256, 512, 1024],

    # Context length is not tuned; the single candidate is the initial value.
    n_max_context = [initial_params['n_max_context']],
    batch_size = [16, 32, 64, 128, 256, 512, 1024],
    # TODO: We only support 1 for now. This fails if we don't have an exact
    # multiple of the batch size per epoch.
    gradient_accumulation_steps = [1],

    max_iters = [100, 300, 1_000, 3_000, 10_000, 30_000, 100_000, 300_000],
    max_epochs = [1_000_000],  # Make max_epochs high, rely on max_iters to stop.

    learning_rate = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0],
    decay_lr = [False, True],

    beta1 = [0.9],
    beta2 = [0.95, 0.99],

    weight_decay = [0.01, 0.1],
    # Fixed: was `[0,0, 1.0]` — a comma typo for 0.0 that listed the
    # "disabled" candidate twice.
    grad_clip = [0.0, 1.0],  # clip gradients at this value, or disable if == 0.0

    dtype = ["bfloat16", "float16"],
    dropout = [0.0, 0.01, 0.05, 0.1],
    bias = [True, False],
)

_n_head_options = [1, 2, 4, 8, 16, 32]


def _min_lr_candidates(opt):
    """Single candidate: one tenth of the currently selected learning rate."""
    return [opt['learning_rate'] / 10]


def _lr_decay_iters_candidates(opt):
    """Single candidate: decay over exactly max_iters iterations."""
    return [opt['max_iters']]


def _warmup_iters_candidates(opt):
    """Warmup lengths strictly shorter than lr_decay_iters; only 0 when LR decay is off."""
    # Check decay_lr first: lr_decay_iters is only looked up when decay is on.
    if not opt['decay_lr']:
        return [0]
    return [x for x in [0, 100, 1000] if x < opt['lr_decay_iters']]


def _n_head_candidates(opt):
    """Head counts from _n_head_options that divide n_embd evenly."""
    return [n for n in _n_head_options if opt['n_embd'] % n == 0]


# Options whose candidate lists depend on other, already chosen options.
# Each value is a callable that takes the current option dict and returns a list.
computed_tune_options = {
    'min_lr': _min_lr_candidates,
    'lr_decay_iters': _lr_decay_iters_candidates,
    'warmup_iters': _warmup_iters_candidates,
    'n_head': _n_head_candidates,
}

# Passed to the Tuner as cache_version.
TUNER_VERSION = "0.0.3-smart"

from rgi.rgizero.models.tuner import Tuner

# Each parameter dict is shallow-copied so the Tuner gets its own top-level
# dict and the notebook-level ones stay untouched.
tuner = Tuner(
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=1.00,
    fixed_params=dict(fixed_params),
    initial_params=dict(initial_params),
    tune_options=dict(tune_options),
    computed_tune_options=dict(computed_tune_options),
)
# Last expression of the cell, so its return value is shown as the cell output.
tuner.autotune_smart()


transform_config_fields: {'n_embd', 'bias', 'n_layer', 'n_max_context', 'dropout', 'n_head'}
train_config_fields: {'always_save_checkpoint', 'max_iters', 'weight_decay', 'decay_lr', 'batch_size', 'eval_only', 'learning_rate', 'max_epochs', 'lr_decay_iters', 'model_version', 'beta2', 'warmup_iters', 'wandb_log', 'dtype', 'gradient_accumulation_steps', 'min_lr', 'log_interval', 'compile', 'beta1', 'device', 'eval_interval', 'eval_iters', 'model_name', 'grad_clip'}
Training initial model as baseline.
## Initial Model, loss=2.4571030139923096 elapsed=6.595702886581421s
Attempting channge 'dropout=0.0'
## Model dropout=0.0, loss=2.4571030139923096 elapsed=6.595702886581421s
Attempting channge 'beta2=0.99'
## Model beta2=0.99, loss=2.4571030139923096 elapsed=6.595702886581421s
Attempting channge 'decay_lr=False'
## Model decay_lr=False, loss=2.4571030139923096 elapsed=6.595702886581421s
Attempting channge 'learning_rate=0.1'
## Model learning_rate=0.1, loss=2.4571030139923096 elapsed=6.59570

False

In [5]:
# Same tuner setup with target_improvement_per_minute lowered to 0.10.
# Relies on Tuner, TUNER_VERSION and the four parameter dicts already being
# defined in the kernel by the tuning-setup cell.
tuner = Tuner(
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.10,
    fixed_params=dict(fixed_params),
    initial_params=dict(initial_params),
    tune_options=dict(tune_options),
    computed_tune_options=dict(computed_tune_options),
)
# Last expression of the cell, so its return value is shown as the cell output.
tuner.autotune_smart()

Training initial model as baseline.
## Initial Model, loss=2.1770522594451904 elapsed=74.7136869430542s
Expected score ('n_embd', 32) -> 2.391762519116495
Expected score ('batch_size', 256) -> 2.412680541113729
Expected score ('batch_size', 64) -> 2.4138263496808836
Expected score ('lr_decay_iters', 1000) -> 2.4145058044348007
Expected score ('max_iters', 1000) -> 2.4145058044348007
Expected score ('n_head', 1) -> 2.421457539399465
Expected score ('n_embd', 64) -> 2.4241771759113333
Expected score ('learning_rate', 0.001) -> 2.4325078189373017
Expected score ('min_lr', 0.0001) -> 2.4325078189373017
Expected score ('n_embd', 16) -> 2.4335869294239414
Expected score ('bias', True) -> 2.439599448499225
Expected score ('batch_size', 128) -> 2.4508042148749034
Expected score ('beta2', 0.99) -> 2.453751447673197
Expected score ('lr_decay_iters', 300) -> 2.4581626067675795
Expected score ('max_iters', 300) -> 2.4581626067675795
Expected score ('n_head', 2) -> 2.4609193782188274
Expected score

False

In [8]:
# Same tuner setup with target_improvement_per_minute lowered to 0.001.
# Relies on Tuner, TUNER_VERSION and the four parameter dicts already being
# defined in the kernel by the tuning-setup cell.
tuner = Tuner(
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.001,
    fixed_params=dict(fixed_params),
    initial_params=dict(initial_params),
    tune_options=dict(tune_options),
    computed_tune_options=dict(computed_tune_options),
)
# Last expression of the cell, so its return value is shown as the cell output.
tuner.autotune_smart()

Training initial model as baseline.
## Initial Model, loss=2.119516372680664 elapsed=419.40264201164246s
Attempting channge 'learning_rate=0.005'
## Model learning_rate=0.005, loss=2.119516372680664 elapsed=419.40264201164246s
Attempting channge 'dtype=float16'
## Model dtype=float16, loss=2.119516372680664 elapsed=419.40264201164246s
Attempting channge 'bias=True'
## Model bias=True, loss=2.119516372680664 elapsed=419.40264201164246s
Attempting channge 'beta2=0.99'
## Model beta2=0.99, loss=2.119516372680664 elapsed=419.40264201164246s
Attempting channge 'n_layer=4'
## Model n_layer=4, loss=2.119516372680664 elapsed=419.40264201164246s
Attempting channge 'dropout=0.05'
## Model dropout=0.05, loss=2.119516372680664 elapsed=419.40264201164246s
Attempting channge 'n_head=8'
## Model n_head=8, loss=2.119516372680664 elapsed=419.40264201164246s
Attempting channge 'dropout=0.0'
model_config=TransformerConfig(n_max_context=44, n_layer=3, n_head=4, n_embd=128, dropout=0.0, bias=False)
train_c

KeyboardInterrupt: 