# Step-by-step run of alphazero self-play & training.


In [1]:
import os
import time
from pathlib import Path
import asyncio

import numpy as np
import torch

# Game and players
from rgi.rgizero.experiment import ExperimentRunner, ExperimentConfig
from rgi.rgizero.data.trajectory_dataset import Vocab, print_dataset_stats, TrajectoryDataset
from rgi.rgizero.evaluators import ActionHistoryTransformerEvaluator, AsyncNetworkEvaluator
from rgi.rgizero.models.tuner import create_random_model

import notebook_utils
from notebook_utils import reload_local_modules

device = notebook_utils.detect_device()

## Disable for debugger stability?
# # Allow asyncio to work with jupyter notebook
# import nest_asyncio
# nest_asyncio.apply()

# Increase numpy print width
np.set_printoptions(linewidth=300)

%load_ext line_profiler

transform_config_fields: {'n_head', 'n_embd', 'n_max_context', 'dropout', 'bias', 'n_layer'}
train_config_fields: {'model_version', 'log_interval', 'grad_clip', 'decay_lr', 'eval_only', 'lr_decay_iters', 'eval_iters', 'eval_interval', 'min_lr', 'always_save_checkpoint', 'batch_size', 'compile', 'device', 'warmup_iters', 'gradient_accumulation_steps', 'learning_rate', 'beta1', 'max_epochs', 'dtype', 'wandb_log', 'weight_decay', 'model_name', 'beta2', 'max_iters'}
Detected device: mps


In [2]:
RUN_GENERATIONS = True


# Create Experiment Config
experiment_config = ExperimentConfig(
    experiment_name='smoketest-e2e',
    game_name='connect4',
    num_generations=3,
    num_games_per_gen=50,
    num_simulations=10,
    model_size="tiny",
    train_batch_size=10,
    max_training_epochs=2,
    seed=42
)


## Step 1: Set up game and experiment runner


In [3]:
from rgi.rgizero.data.trajectory_dataset import Vocab
from rgi.rgizero.common import TOKENS

# Initialize Experiment Runner
experiment_base_dir = Path.cwd().parent / 'experiments'
experiment_runner = ExperimentRunner(experiment_config, experiment_base_dir)
game = experiment_runner.game
action_vocab = experiment_runner.action_vocab
n_max_context = experiment_runner.n_max_context

DATA_DIR = experiment_runner.data_dir
MODEL_DIR = experiment_runner.models_dir

print('✅ Runner initialized')
print(f'Game: {experiment_runner.config.game_name}, Players: {experiment_runner.num_players}, Actions: {list(game.base_game.all_actions())}')
print('Data dir: ', DATA_DIR)
print('Model dir: ', MODEL_DIR)


✅ Runner initialized
Game: connect4, Players: 2, Actions: [1, 2, 3, 4, 5, 6, 7]
Data dir:  /Users/rodo/src/rgi3-sync/experiments/smoketest-e2e/data
Model dir:  /Users/rodo/src/rgi3-sync/experiments/smoketest-e2e/models


## Step 2: Create random generation_0 model


In [4]:
# Initialize (creates Random Gen 0 if needed)
model_0 = experiment_runner.initialize()
current_model = model_0


Starting Experiment: smoketest-e2e
Initializing Random Gen 0 model.
Saved model to /Users/rodo/src/rgi3-sync/experiments/smoketest-e2e/models/gen-0.pt


In [5]:
! rm -rf ../experiments/smoketest-e2e


25.16s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [6]:
results_dict = {}
trajectory_paths_dict = {}
model_dict = {0: model_0}

current_model = model_dict[0]
if RUN_GENERATIONS:
    for generation_id in range(1, experiment_config.num_generations+1):
        current_model = await experiment_runner.run_generation_step_async(generation_id, current_model)
        dataset_path = experiment_runner.get_trajectory_path(generation_id)
        
        # print stats for visibility
        print_dataset_stats(dataset_path, f'gen-{generation_id}', n_max_context, action_vocab)
        
        model_dict[generation_id] = current_model



=== Generation 1 ===
Playing 50 games...


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
ERROR:tornado.general:SEND Error: Host unreachable
Self Play: 100%|██████████| 50/50 [00:06<00:00,  7.29it/s]


Writing 50 trajectories...
Training model for gen 1...
num decayed parameter tensors: 11, with 1,968 parameters
num non-decayed parameter tensors: 7, with 50 parameters
using fused AdamW: False
step 0: losses: train:2.7720, train_policy_loss:2.0774, train_value_loss:0.6946, val:2.7773, val_policy_loss:2.0808, val_value_loss:0.6965
iter 0/5/5000: loss 2.7796, policy_loss:2.0794, value_loss:0.7002, time 0.48s, iter_time: 0.00ms
iter 1/5/5000: loss 2.7671, policy_loss:2.0766, value_loss:0.6905, time 0.05s, iter_time: 51.92ms
iter 2/5/5000: loss 2.7766, policy_loss:2.0775, value_loss:0.6990, time 0.02s, iter_time: 17.54ms
iter 3/5/5000: loss 2.7661, policy_loss:2.0756, value_loss:0.6905, time 0.05s, iter_time: 48.67ms
iter 4/5/5000: loss 2.7698, policy_loss:2.0765, value_loss:0.6933, time 0.09s, iter_time: 89.52ms
iter 5/10/5000: loss 2.7732, policy_loss:2.0762, value_loss:0.6969, time 0.05s, iter_time: 0.00ms
iter 6/10/5000: loss 2.7674, policy_loss:2.0761, value_loss:0.6913, time 0.05s, 

RuntimeError: Parent directory /Users/rodo/src/rgi3-sync/experiments/smoketest-e2e/models does not exist.