# Step-by-step run of AlphaZero self-play & training.


In [1]:
import os
import time
from pathlib import Path
import asyncio

import numpy as np
import torch

# Game and players
from rgi.rgizero.experiment import ExperimentRunner, ExperimentConfig
from rgi.rgizero.data.trajectory_dataset import Vocab, print_dataset_stats, TrajectoryDataset
from rgi.rgizero.evaluators import ActionHistoryTransformerEvaluator, AsyncNetworkEvaluator
from rgi.rgizero.models.tuner import create_random_model

import notebook_utils
from notebook_utils import reload_local_modules

# Select the compute device using the project's notebook helper.
device = notebook_utils.detect_device()

# nest_asyncio is currently left disabled (possibly for debugger stability — TODO confirm).
# It would allow nested asyncio event loops inside the Jupyter notebook; re-enable if needed:
# import nest_asyncio
# nest_asyncio.apply()

# Widen numpy's print line width to 300 characters so long arrays wrap less.
np.set_printoptions(linewidth=300)

# IPython magic: load the line_profiler extension (provides %lprun for line-level profiling).
%load_ext line_profiler

transform_config_fields: {'n_max_context', 'dropout', 'n_head', 'n_layer', 'n_embd', 'bias'}
train_config_fields: {'wandb_log', 'beta1', 'min_lr', 'eval_iters', 'decay_lr', 'max_iters', 'max_epochs', 'warmup_iters', 'batch_size', 'eval_only', 'patience', 'device', 'model_version', 'learning_rate', 'eval_interval', 'weight_decay', 'lr_decay_iters', 'compile', 'always_save_checkpoint', 'beta2', 'gradient_accumulation_steps', 'model_name', 'dtype', 'grad_clip', 'log_interval'}
Detected device: mps


In [2]:
# Master switch for the generation loop further down: when False, the self-play /
# training loop is skipped and only the gen-0 model is available.
RUN_GENERATIONS = True


# Experiment configuration. The commented-out keyword arguments are alternative
# settings kept for reference; only the uncommented ones are passed.
experiment_config = ExperimentConfig(
    experiment_name='smoketest-e2e-v3',   # Use sliding window.
    # experiment_name='smoketest-e2e-v2',
    # parent_experiment_name='smoketest-e2e',
    game_name='connect4',
    num_generations=40,          # upper bound of the generation loop (generations 1..40)
    num_games_per_gen=10_000,    # the run log shows "Playing 10000 games..." per generation
    num_simulations=200,         # presumably search simulations per move — TODO confirm
    # model_size="tiny",
    # train_batch_size=10,
    # max_training_epochs=2,
    seed=42
)

# Hyperparameters tuned on connect4 using 23k training games.
# Passed to ExperimentRunner as `training_args`. The keys mix model-architecture
# fields (bias, dropout, n_embd, n_head, n_layer, n_max_context) with training
# fields, matching the field names printed when the project modules are imported.
tuned_params = dict(
    batch_size=512,
    beta1=0.9,
    beta2=0.99,
    bias=False,
    decay_lr=True,
    dropout=0.0,
    dtype='float16',
    grad_clip=1.0,
    gradient_accumulation_steps=1,
    learning_rate=0.001,
    lr_decay_iters=5000,
    max_epochs=1000000,
    max_iters=30_000,
    min_lr=0.0001,
    n_embd=64,
    n_head=2,
    n_layer=4,
    n_max_context=44,
    warmup_iters=1000,
    weight_decay=0.2,
    eval_iters=100,
    log_interval=200,
    eval_interval=1000,
)

## Step 1: Set up game and experiment runner


In [3]:
from rgi.rgizero.data.trajectory_dataset import Vocab
from rgi.rgizero.common import TOKENS

# Build the experiment runner; experiments live in an 'experiments' directory
# next to the current working directory's parent.
experiment_base_dir = Path.cwd().parent.joinpath('experiments')
experiment_runner = ExperimentRunner(
    experiment_config,
    experiment_base_dir,
    training_args=tuned_params,
)

# Convenience aliases for objects owned by the runner.
game, action_vocab, n_max_context = (
    experiment_runner.game,
    experiment_runner.action_vocab,
    experiment_runner.n_max_context,
)
DATA_DIR, MODEL_DIR = experiment_runner.data_dir, experiment_runner.models_dir

# Report what was set up.
print('✅ Runner initialized')
print(f'Game: {experiment_runner.config.game_name}, Players: {experiment_runner.num_players}, Actions: {list(game.base_game.all_actions())}')
print('Data dir: ', DATA_DIR)
print('Model dir: ', MODEL_DIR)


✅ Runner initialized
Game: connect4, Players: 2, Actions: [1, 2, 3, 4, 5, 6, 7]
Data dir:  /Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data
Model dir:  /Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/models


## Step 2: Create random generation_0 model


In [4]:
# Initialize the experiment. Per the run log, this creates and saves a random
# generation-0 model when one is needed.
model_0 = experiment_runner.initialize()
# Starting model for the generation loop (the loop cell re-derives it from model_dict[0]).
current_model = model_0


Starting Experiment: smoketest-e2e-v3
Initializing Random Gen 0 model.
Saved model to /Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/models/gen-0.pt


In [5]:
# Per-generation bookkeeping, keyed by generation id.
# NOTE(review): results_dict and trajectory_paths_dict are not populated in this
# cell (dataset_paths is only used for printing stats) — confirm whether they are
# filled elsewhere or whether dataset_paths should be stored here.
results_dict = {}
trajectory_paths_dict = {}
model_dict = {0: model_0}

# Always start from the generation-0 model so re-running this cell is repeatable.
current_model = model_dict[0]
if RUN_GENERATIONS:
    for generation_id in range(1, experiment_config.num_generations+1):
        # Top-level await relies on IPython's autoawait support in notebook cells.
        # Judging by the cell output, each step plays the self-play games, writes the
        # trajectories and trains the model for this generation; the returned model
        # is fed into the next generation.
        current_model = await experiment_runner.run_generation_step_async(generation_id, current_model)
        dataset_paths = experiment_runner.get_trajectory_paths(generation_id)
        
        # print stats for visibility
        print_dataset_stats(dataset_paths, n_max_context, action_vocab, model=current_model, game=game)
        
        model_dict[generation_id] = current_model

# --- Notes that appear to be from an earlier run (the pasted logs show /5000 iteration
# --- totals, whereas the current config uses max_iters=30_000) — TODO confirm / prune.
# 10m to play 2x10k generations... probabilities still very wrong.
# Evaluation time: 0.015 seconds, size=574, eval-per-second=37837.60, total-batches=6000, mean-eval-per-second=94963.99, mean-time-per-batch=0.010, mean-batch-size=990.34

# Reference loss level: >>> log(2) + log(7) -> 2.6390573296152584
# (presumably uniform guessing over 7 actions for the policy plus 2 outcomes for the value).
# In that earlier run the model did not seem to improve the loss at all:
# step.   0: losses: train:2.5971, train_policy_loss:1.9146, train_value_loss:0.6825, val:2.5972, val_policy_loss:1.9147, val_value_loss:0.6825
# step 1000: losses: train:2.6036, train_policy_loss:1.9122, train_value_loss:0.6914, val:2.6050, val_policy_loss:1.9132, val_value_loss:0.6917
# step 2000: losses: train:2.6056, train_policy_loss:1.9119, train_value_loss:0.6937, val:2.6056, val_policy_loss:1.9123, val_value_loss:0.6933
# iter    0/1170/5000: loss 2.5699, policy_loss:1.9129, value_loss:0.6570, time 5.18s, iter_time: 0.00ms
# iter 1000/1170/5000: loss 2.5996, policy_loss:1.9068, value_loss:0.6928, time 1.96s, iter_time: 1957.74ms
# iter 2339/2340/5000: loss 2.6014, policy_loss:1.9131, value_loss:0.6884, time 0.01s, iter_time: 14.61ms




=== Generation 1 ===
Playing 10000 games...


Self Play:   1%|▏         | 149/10000 [00:44<27:02,  6.07it/s] 

Evaluation time: 0.010 seconds, size=1000, eval-per-second=101165.07, total-batches=1000, mean-eval-per-second=101727.88, mean-time-per-batch=0.010, mean-batch-size=1000.00


Self Play:   7%|▋         | 657/10000 [01:36<10:34, 14.72it/s]

Evaluation time: 0.011 seconds, size=1000, eval-per-second=88967.93, total-batches=2000, mean-eval-per-second=93760.01, mean-time-per-batch=0.011, mean-batch-size=1000.00


Self Play:  12%|█▏        | 1209/10000 [02:30<15:45,  9.30it/s]

Evaluation time: 0.010 seconds, size=1000, eval-per-second=98617.57, total-batches=3000, mean-eval-per-second=90231.78, mean-time-per-batch=0.011, mean-batch-size=1000.00


Self Play:  18%|█▊        | 1756/10000 [03:25<20:59,  6.54it/s]

Evaluation time: 0.017 seconds, size=1000, eval-per-second=59182.23, total-batches=4000, mean-eval-per-second=85528.89, mean-time-per-batch=0.012, mean-batch-size=1000.00


Self Play:  23%|██▎       | 2274/10000 [04:24<15:44,  8.18it/s]

Evaluation time: 0.016 seconds, size=1000, eval-per-second=60683.24, total-batches=5000, mean-eval-per-second=78077.08, mean-time-per-batch=0.013, mean-batch-size=1000.00


Self Play:  28%|██▊       | 2832/10000 [05:19<07:55, 15.08it/s]

Evaluation time: 0.011 seconds, size=1000, eval-per-second=89032.14, total-batches=6000, mean-eval-per-second=78474.36, mean-time-per-batch=0.013, mean-batch-size=1000.00


Self Play:  34%|███▍      | 3375/10000 [06:17<08:11, 13.49it/s]

Evaluation time: 0.017 seconds, size=1000, eval-per-second=59290.15, total-batches=7000, mean-eval-per-second=76111.98, mean-time-per-batch=0.013, mean-batch-size=1000.00


Self Play:  39%|███▉      | 3879/10000 [07:14<09:44, 10.47it/s]

Evaluation time: 0.011 seconds, size=1000, eval-per-second=92184.53, total-batches=8000, mean-eval-per-second=75266.89, mean-time-per-batch=0.013, mean-batch-size=1000.00


Self Play:  44%|████▍     | 4419/10000 [08:10<21:57,  4.24it/s]

Evaluation time: 0.022 seconds, size=1000, eval-per-second=45126.73, total-batches=9000, mean-eval-per-second=75465.56, mean-time-per-batch=0.013, mean-batch-size=1000.00


Self Play:  50%|████▉     | 4975/10000 [09:12<08:44,  9.58it/s]

Evaluation time: 0.018 seconds, size=1000, eval-per-second=56156.92, total-batches=10000, mean-eval-per-second=72496.36, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  55%|█████▌    | 5517/10000 [10:12<07:40,  9.74it/s]

Evaluation time: 0.020 seconds, size=1000, eval-per-second=49667.30, total-batches=11000, mean-eval-per-second=71098.26, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  60%|██████    | 6050/10000 [11:09<08:08,  8.09it/s]

Evaluation time: 0.011 seconds, size=1000, eval-per-second=88323.45, total-batches=12000, mean-eval-per-second=71172.65, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  66%|██████▌   | 6597/10000 [12:05<05:51,  9.68it/s]

Evaluation time: 0.011 seconds, size=1000, eval-per-second=88386.73, total-batches=13000, mean-eval-per-second=70770.35, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  71%|███████▏  | 7143/10000 [13:01<03:01, 15.76it/s]

Evaluation time: 0.011 seconds, size=1000, eval-per-second=89174.10, total-batches=14000, mean-eval-per-second=70879.95, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  76%|███████▋  | 7648/10000 [14:00<04:49,  8.12it/s]

Evaluation time: 0.011 seconds, size=1000, eval-per-second=92860.08, total-batches=15000, mean-eval-per-second=70520.77, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  82%|████████▏ | 8180/10000 [14:58<06:32,  4.63it/s]

Evaluation time: 0.018 seconds, size=1000, eval-per-second=54590.59, total-batches=16000, mean-eval-per-second=70220.14, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  87%|████████▋ | 8729/10000 [15:56<02:24,  8.82it/s]

Evaluation time: 0.019 seconds, size=1000, eval-per-second=51671.81, total-batches=17000, mean-eval-per-second=69744.20, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  93%|█████████▎| 9282/10000 [16:54<01:25,  8.41it/s]

Evaluation time: 0.010 seconds, size=719, eval-per-second=69162.78, total-batches=18000, mean-eval-per-second=68400.25, mean-time-per-batch=0.015, mean-batch-size=995.76


Self Play:  97%|█████████▋| 9738/10000 [17:30<00:23, 11.01it/s]

Evaluation time: 0.005 seconds, size=262, eval-per-second=56644.72, total-batches=19000, mean-eval-per-second=65679.34, mean-time-per-batch=0.015, mean-batch-size=968.47


Self Play:  99%|█████████▉| 9940/10000 [17:46<00:04, 12.01it/s]

Evaluation time: 0.023 seconds, size=61, eval-per-second=2680.52, total-batches=20000, mean-eval-per-second=63712.86, mean-time-per-batch=0.015, mean-batch-size=927.09


Self Play: 100%|█████████▉| 9997/10000 [17:53<00:00,  9.77it/s]

Evaluation time: 0.021 seconds, size=6, eval-per-second=281.72, total-batches=21000, mean-eval-per-second=62611.78, mean-time-per-batch=0.014, mean-batch-size=884.35


Self Play: 100%|██████████| 10000/10000 [17:53<00:00,  9.31it/s]


Writing 10000 trajectories...
Training model for gen 1...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters




using fused AdamW: False
step 0: losses: train:2.7692, train_policy_loss:2.0635, train_value_loss:0.7057, val:2.7680, val_policy_loss:2.0628, val_value_loss:0.7052
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-1/best.pt
iter 0/18/30000: loss 2.7641, policy_loss:2.0632, value_loss:0.7009, time 1.88s, iter_time: 0.00ms
iter 200/216/30000: loss 2.5468, policy_loss:1.8915, value_loss:0.6553, time 0.12s, iter_time: 58.97ms
iter 400/414/30000: loss 2.5205, policy_loss:1.8687, value_loss:0.6518, time 0.42s, iter_time: 104.37ms
iter 600/612/30000: loss 2.4910, policy_loss:1.8666, value_loss:0.6244, time 0.50s, iter_time: 83.88ms
iter 800/810/30000: loss 2.4833, policy_loss:1.8592, value_loss:0.6241, time 0.63s, iter_time: 78.59ms
step 1000: losses: train:2.4615, train_policy_loss:1.8532, train_value_loss:0.6083, val:2.5309, val_policy_loss:1.8584, val_value_loss:0.6724
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-1/best.pt
sa

Self Play:  10%|█         | 1006/10000 [00:55<09:22, 15.98it/s]

Evaluation time: 0.011 seconds, size=1000, eval-per-second=88543.47, total-batches=1000, mean-eval-per-second=91289.03, mean-time-per-batch=0.011, mean-batch-size=1000.00


Self Play:  22%|██▏       | 2204/10000 [01:57<13:57,  9.31it/s]

Evaluation time: 0.012 seconds, size=1000, eval-per-second=83857.57, total-batches=2000, mean-eval-per-second=85318.68, mean-time-per-batch=0.012, mean-batch-size=1000.00


Self Play:  34%|███▎      | 3356/10000 [02:59<05:04, 21.83it/s]

Evaluation time: 0.012 seconds, size=1000, eval-per-second=85516.02, total-batches=3000, mean-eval-per-second=81578.95, mean-time-per-batch=0.012, mean-batch-size=1000.00


Self Play:  45%|████▌     | 4542/10000 [04:02<03:40, 24.74it/s]

Evaluation time: 0.015 seconds, size=1000, eval-per-second=66334.08, total-batches=4000, mean-eval-per-second=81374.73, mean-time-per-batch=0.012, mean-batch-size=1000.00


Self Play:  57%|█████▋    | 5728/10000 [05:06<02:48, 25.39it/s]

Evaluation time: 0.016 seconds, size=1000, eval-per-second=62386.46, total-batches=5000, mean-eval-per-second=80334.04, mean-time-per-batch=0.012, mean-batch-size=1000.00


Self Play:  69%|██████▉   | 6888/10000 [06:08<01:56, 26.81it/s]

Evaluation time: 0.013 seconds, size=1000, eval-per-second=79275.42, total-batches=6000, mean-eval-per-second=80407.81, mean-time-per-batch=0.012, mean-batch-size=1000.00


Self Play:  81%|████████  | 8088/10000 [07:14<02:27, 12.94it/s]

Evaluation time: 0.010 seconds, size=1000, eval-per-second=96003.66, total-batches=7000, mean-eval-per-second=79264.21, mean-time-per-batch=0.013, mean-batch-size=1000.00


Self Play:  92%|█████████▏| 9243/10000 [08:20<00:56, 13.33it/s]

Evaluation time: 0.033 seconds, size=758, eval-per-second=23107.77, total-batches=8000, mean-eval-per-second=75658.59, mean-time-per-batch=0.013, mean-batch-size=996.53


Self Play:  98%|█████████▊| 9846/10000 [08:59<00:09, 15.47it/s]

Evaluation time: 0.027 seconds, size=155, eval-per-second=5767.23, total-batches=9000, mean-eval-per-second=65332.01, mean-time-per-batch=0.014, mean-batch-size=928.53


Self Play: 100%|█████████▉| 9987/10000 [09:11<00:02,  5.92it/s]

Evaluation time: 0.002 seconds, size=14, eval-per-second=7454.65, total-batches=10000, mean-eval-per-second=61305.61, mean-time-per-batch=0.014, mean-batch-size=841.73


Self Play: 100%|██████████| 10000/10000 [09:13<00:00, 18.06it/s]


Writing 10000 trajectories...
Training model for gen 2...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:2.4074, train_policy_loss:1.8335, train_value_loss:0.5739, val:2.4091, val_policy_loss:1.8365, val_value_loss:0.5726
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-2/best.pt
iter 0/36/30000: loss 2.4120, policy_loss:1.8372, value_loss:0.5748, time 12.18s, iter_time: 0.00ms
iter 200/216/30000: loss 2.2299, policy_loss:1.6950, value_loss:0.5349, time 6.66s, iter_time: 333.12ms
iter 400/432/30000: loss 2.1864, policy_loss:1.6818, value_loss:0.5046, time 0.20s, iter_time: 49.29ms
iter 600/612/30000: loss 2.2156, policy_loss:1.6824, value_loss:0.5332, time 4.27s, iter_time: 177.87ms
iter 800/828/30000: loss 2.1400, policy_loss:1.6481, value_loss:0.4919, time 0.79s, iter_time: 98.28ms
step 1000: losses: train:2.1753, train_policy_loss:1.6690, train_value_loss:0.5063, val:2.2458, val_policy_loss:1.6856, val_value_loss:0.5602
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-2/best.pt
saving checkpoint to /Use

Self Play:  13%|█▎        | 1343/10000 [01:05<06:01, 23.96it/s]

Evaluation time: 0.016 seconds, size=1000, eval-per-second=63718.04, total-batches=1000, mean-eval-per-second=76436.45, mean-time-per-batch=0.013, mean-batch-size=1000.00


Self Play:  30%|███       | 3020/10000 [02:17<05:11, 22.38it/s]

Evaluation time: 0.011 seconds, size=1000, eval-per-second=87844.35, total-batches=2000, mean-eval-per-second=72644.10, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  47%|████▋     | 4705/10000 [03:29<02:53, 30.49it/s]

Evaluation time: 0.038 seconds, size=1000, eval-per-second=26446.47, total-batches=3000, mean-eval-per-second=72705.77, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  64%|██████▍   | 6409/10000 [04:40<04:29, 13.35it/s]

Evaluation time: 0.021 seconds, size=1000, eval-per-second=46757.16, total-batches=4000, mean-eval-per-second=73270.03, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  81%|████████  | 8085/10000 [05:52<01:27, 21.83it/s]

Evaluation time: 0.015 seconds, size=1000, eval-per-second=65329.80, total-batches=5000, mean-eval-per-second=73093.17, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  97%|█████████▋| 9703/10000 [07:07<00:10, 28.32it/s]

Evaluation time: 0.045 seconds, size=303, eval-per-second=6663.14, total-batches=6000, mean-eval-per-second=62099.11, mean-time-per-batch=0.016, mean-batch-size=973.44


Self Play: 100%|█████████▉| 9990/10000 [07:31<00:01,  8.54it/s]

Evaluation time: 0.002 seconds, size=11, eval-per-second=6245.75, total-batches=7000, mean-eval-per-second=52408.52, mean-time-per-batch=0.016, mean-batch-size=848.36


Self Play: 100%|██████████| 10000/10000 [07:33<00:00, 22.04it/s]


Writing 10000 trajectories...
Training model for gen 3...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:2.0173, train_policy_loss:1.5521, train_value_loss:0.4652, val:2.0110, val_policy_loss:1.5544, val_value_loss:0.4566
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-3/best.pt
iter 0/53/30000: loss 2.0464, policy_loss:1.5600, value_loss:0.4864, time 5.13s, iter_time: 0.00ms
iter 200/212/30000: loss 1.8488, policy_loss:1.4365, value_loss:0.4123, time 3.89s, iter_time: 94.91ms
iter 400/424/30000: loss 1.8588, policy_loss:1.4470, value_loss:0.4118, time 1.84s, iter_time: 63.56ms
iter 600/636/30000: loss 1.8889, policy_loss:1.4320, value_loss:0.4569, time 1.92s, iter_time: 112.88ms
iter 800/848/30000: loss 1.8908, policy_loss:1.4522, value_loss:0.4386, time 0.38s, iter_time: 76.83ms
step 1000: losses: train:1.8747, train_policy_loss:1.4412, train_value_loss:0.4336, val:1.9165, val_policy_loss:1.4535, val_value_loss:0.4630
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-3/best.pt
saving checkpoint to /Users

Self Play:  29%|██▊       | 2867/10000 [01:14<02:59, 39.82it/s]

Evaluation time: 0.021 seconds, size=1000, eval-per-second=47796.70, total-batches=1000, mean-eval-per-second=69487.44, mean-time-per-batch=0.014, mean-batch-size=1000.00


Self Play:  58%|█████▊    | 5791/10000 [02:33<02:47, 25.18it/s]

Evaluation time: 0.013 seconds, size=1000, eval-per-second=78914.47, total-batches=2000, mean-eval-per-second=63675.54, mean-time-per-batch=0.016, mean-batch-size=1000.00


Self Play:  87%|████████▋ | 8728/10000 [03:54<00:38, 32.92it/s]

Evaluation time: 0.030 seconds, size=1000, eval-per-second=33429.27, total-batches=3000, mean-eval-per-second=60010.67, mean-time-per-batch=0.017, mean-batch-size=1000.00


Self Play:  99%|█████████▉| 9929/10000 [04:48<00:03, 18.86it/s]

Evaluation time: 0.004 seconds, size=73, eval-per-second=20499.75, total-batches=4000, mean-eval-per-second=41725.93, mean-time-per-batch=0.020, mean-batch-size=850.60


Self Play: 100%|█████████▉| 9999/10000 [04:55<00:00,  6.97it/s]

Evaluation time: 0.001 seconds, size=1, eval-per-second=727.80, total-batches=5000, mean-eval-per-second=39229.26, mean-time-per-batch=0.017, mean-batch-size=684.43


Self Play: 100%|██████████| 10000/10000 [04:56<00:00, 33.75it/s]


Writing 10000 trajectories...
Training model for gen 4...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:1.7611, train_policy_loss:1.3333, train_value_loss:0.4278, val:1.7582, val_policy_loss:1.3262, val_value_loss:0.4320
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-4/best.pt
iter 0/71/30000: loss 1.7479, policy_loss:1.3377, value_loss:0.4102, time 28.13s, iter_time: 0.00ms
iter 200/213/30000: loss 1.7632, policy_loss:1.3263, value_loss:0.4369, time 12.81s, iter_time: 220.80ms
iter 400/426/30000: loss 1.7125, policy_loss:1.3002, value_loss:0.4124, time 8.21s, iter_time: 182.52ms
iter 600/639/30000: loss 1.6863, policy_loss:1.2962, value_loss:0.3901, time 2.13s, iter_time: 66.62ms
iter 800/852/30000: loss 1.7477, policy_loss:1.3333, value_loss:0.4144, time 3.03s, iter_time: 159.50ms
step 1000: losses: train:1.7116, train_policy_loss:1.3010, train_value_loss:0.4106, val:1.7355, val_policy_loss:1.2961, val_value_loss:0.4394
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-4/best.pt
saving checkpoint to /U

Self Play:  68%|██████▊   | 6830/10000 [01:36<00:44, 70.46it/s] 

Evaluation time: 0.019 seconds, size=1000, eval-per-second=52186.13, total-batches=1000, mean-eval-per-second=60206.65, mean-time-per-batch=0.017, mean-batch-size=1000.00


Self Play:  99%|█████████▉| 9935/10000 [02:37<00:04, 13.90it/s]

Evaluation time: 0.003 seconds, size=67, eval-per-second=20272.57, total-batches=2000, mean-eval-per-second=36997.13, mean-time-per-batch=0.020, mean-batch-size=748.01


Self Play: 100%|█████████▉| 9998/10000 [02:43<00:00,  7.07it/s]

Evaluation time: 0.002 seconds, size=4, eval-per-second=1954.02, total-batches=3000, mean-eval-per-second=33404.02, mean-time-per-batch=0.015, mean-batch-size=506.73


Self Play: 100%|██████████| 10000/10000 [02:44<00:00, 60.75it/s]


Writing 10000 trajectories...
Training model for gen 5...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:1.5704, train_policy_loss:1.1858, train_value_loss:0.3846, val:1.5758, val_policy_loss:1.1903, val_value_loss:0.3854
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-5/best.pt
iter 0/88/30000: loss 1.5517, policy_loss:1.1690, value_loss:0.3827, time 46.63s, iter_time: 0.00ms
iter 200/264/30000: loss 1.4586, policy_loss:1.0957, value_loss:0.3629, time 7.85s, iter_time: 327.01ms
iter 400/440/30000: loss 1.5356, policy_loss:1.1519, value_loss:0.3837, time 6.03s, iter_time: 125.60ms
iter 600/616/30000: loss 1.5469, policy_loss:1.1644, value_loss:0.3826, time 11.19s, iter_time: 155.46ms
iter 800/880/30000: loss 1.5313, policy_loss:1.1506, value_loss:0.3808, time 0.39s, iter_time: 48.69ms
step 1000: losses: train:1.5494, train_policy_loss:1.1736, train_value_loss:0.3758, val:1.5809, val_policy_loss:1.1825, val_value_loss:0.3984
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-5
iter 1000/1056/30000: loss 1.5397, p

Self Play:  69%|██████▉   | 6908/10000 [01:36<00:40, 75.93it/s] 

Evaluation time: 0.019 seconds, size=1000, eval-per-second=53242.75, total-batches=1000, mean-eval-per-second=59929.76, mean-time-per-batch=0.017, mean-batch-size=1000.00


Self Play:  99%|█████████▉| 9925/10000 [02:38<00:08,  8.78it/s] 

Evaluation time: 0.005 seconds, size=77, eval-per-second=14162.49, total-batches=2000, mean-eval-per-second=35721.93, mean-time-per-batch=0.021, mean-batch-size=743.28


Self Play: 100%|█████████▉| 9997/10000 [02:47<00:00,  6.32it/s]

Evaluation time: 0.002 seconds, size=2, eval-per-second=895.26, total-batches=3000, mean-eval-per-second=30366.65, mean-time-per-batch=0.017, mean-batch-size=503.60


Self Play: 100%|██████████| 10000/10000 [02:48<00:00, 59.46it/s]


Writing 10000 trajectories...
Training model for gen 6...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:1.4653, train_policy_loss:1.1018, train_value_loss:0.3635, val:1.4676, val_policy_loss:1.1061, val_value_loss:0.3615
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-6/best.pt
iter 0/106/30000: loss 1.4900, policy_loss:1.1195, value_loss:0.3705, time 34.11s, iter_time: 0.00ms
iter 200/212/30000: loss 1.4497, policy_loss:1.0892, value_loss:0.3605, time 17.15s, iter_time: 182.41ms
iter 400/424/30000: loss 1.4576, policy_loss:1.1014, value_loss:0.3562, time 14.28s, iter_time: 174.13ms
iter 600/636/30000: loss 1.4353, policy_loss:1.0860, value_loss:0.3493, time 11.43s, iter_time: 163.32ms
iter 800/848/30000: loss 1.4835, policy_loss:1.0991, value_loss:0.3843, time 5.65s, iter_time: 97.37ms
step 1000: losses: train:1.4362, train_policy_loss:1.0774, train_value_loss:0.3587, val:1.4531, val_policy_loss:1.0855, val_value_loss:0.3675
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-6/best.pt
saving checkpoint to

Self Play:  98%|█████████▊| 9829/10000 [01:43<00:09, 17.19it/s] 

Evaluation time: 0.091 seconds, size=170, eval-per-second=1870.83, total-batches=1000, mean-eval-per-second=39793.91, mean-time-per-batch=0.022, mean-batch-size=859.85


Self Play: 100%|█████████▉| 9993/10000 [01:55<00:00,  9.18it/s]

Evaluation time: 0.002 seconds, size=7, eval-per-second=3573.53, total-batches=2000, mean-eval-per-second=29620.12, mean-time-per-batch=0.016, mean-batch-size=460.92


Self Play: 100%|██████████| 10000/10000 [01:57<00:00, 85.40it/s]


Writing 10000 trajectories...
Training model for gen 7...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:1.3078, train_policy_loss:0.9894, train_value_loss:0.3184, val:1.3352, val_policy_loss:1.0067, val_value_loss:0.3286
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-7/best.pt
iter 0/124/30000: loss 1.3879, policy_loss:1.0467, value_loss:0.3413, time 30.49s, iter_time: 0.00ms
iter 200/248/30000: loss 1.2663, policy_loss:0.9559, value_loss:0.3103, time 15.85s, iter_time: 208.55ms
iter 400/496/30000: loss 1.2851, policy_loss:0.9772, value_loss:0.3080, time 3.75s, iter_time: 134.09ms
iter 600/620/30000: loss 1.2954, policy_loss:0.9788, value_loss:0.3166, time 8.11s, iter_time: 78.03ms
iter 800/868/30000: loss 1.3323, policy_loss:1.0153, value_loss:0.3170, time 2.36s, iter_time: 42.17ms
step 1000: losses: train:1.2993, train_policy_loss:0.9869, train_value_loss:0.3124, val:1.3577, val_policy_loss:1.0102, val_value_loss:0.3475
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-7
iter 1000/1116/30000: loss 1.2367, p

Self Play:  98%|█████████▊| 9830/10000 [01:44<00:09, 17.16it/s] 

Evaluation time: 0.051 seconds, size=172, eval-per-second=3341.44, total-batches=1000, mean-eval-per-second=39076.43, mean-time-per-batch=0.022, mean-batch-size=865.50


Self Play: 100%|█████████▉| 9989/10000 [01:54<00:01, 10.52it/s]

Evaluation time: 0.002 seconds, size=11, eval-per-second=4680.67, total-batches=2000, mean-eval-per-second=30972.35, mean-time-per-batch=0.015, mean-batch-size=462.69


Self Play: 100%|██████████| 10000/10000 [01:56<00:00, 85.67it/s]


Writing 10000 trajectories...
Training model for gen 8...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:1.2331, train_policy_loss:0.9299, train_value_loss:0.3031, val:1.1994, val_policy_loss:0.9128, val_value_loss:0.2866
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-8/best.pt
iter 0/141/30000: loss 1.2620, policy_loss:0.9502, value_loss:0.3119, time 13.84s, iter_time: 0.00ms
iter 200/282/30000: loss 1.2535, policy_loss:0.9433, value_loss:0.3101, time 4.35s, iter_time: 73.78ms
iter 400/423/30000: loss 1.2093, policy_loss:0.9381, value_loss:0.2713, time 11.83s, iter_time: 100.24ms
iter 600/705/30000: loss 1.2806, policy_loss:0.9599, value_loss:0.3208, time 4.33s, iter_time: 120.25ms
iter 800/846/30000: loss 1.1975, policy_loss:0.9272, value_loss:0.2703, time 15.78s, iter_time: 166.13ms
step 1000: losses: train:1.2221, train_policy_loss:0.9280, train_value_loss:0.2941, val:1.2150, val_policy_loss:0.9139, val_value_loss:0.3010
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-8
iter 1000/1128/30000: loss 1.2094,

Self Play:  98%|█████████▊| 9821/10000 [01:44<00:12, 14.72it/s] 

Evaluation time: 0.007 seconds, size=183, eval-per-second=26413.77, total-batches=1000, mean-eval-per-second=40129.21, mean-time-per-batch=0.022, mean-batch-size=869.37


Self Play: 100%|█████████▉| 9981/10000 [01:54<00:01, 10.44it/s]

Evaluation time: 0.002 seconds, size=18, eval-per-second=8963.25, total-batches=2000, mean-eval-per-second=31895.91, mean-time-per-batch=0.015, mean-batch-size=467.45


Self Play: 100%|██████████| 10000/10000 [01:56<00:00, 85.61it/s]


Writing 10000 trajectories...
Training model for gen 9...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:1.1575, train_policy_loss:0.8734, train_value_loss:0.2841, val:1.1542, val_policy_loss:0.8779, val_value_loss:0.2763
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-9/best.pt
iter 0/159/30000: loss 1.1571, policy_loss:0.8757, value_loss:0.2814, time 20.24s, iter_time: 0.00ms
iter 200/318/30000: loss 1.1733, policy_loss:0.8884, value_loss:0.2849, time 3.83s, iter_time: 93.32ms
iter 400/477/30000: loss 1.1601, policy_loss:0.8816, value_loss:0.2785, time 9.24s, iter_time: 112.68ms
iter 600/636/30000: loss 1.1891, policy_loss:0.8976, value_loss:0.2915, time 10.32s, iter_time: 83.88ms
iter 800/954/30000: loss 1.1114, policy_loss:0.8589, value_loss:0.2525, time 0.22s, iter_time: 44.29ms
step 1000: losses: train:1.1564, train_policy_loss:0.8747, train_value_loss:0.2818, val:1.1680, val_policy_loss:0

Self Play:  98%|█████████▊| 9823/10000 [01:44<00:08, 20.81it/s] 

Evaluation time: 0.017 seconds, size=182, eval-per-second=10667.61, total-batches=1000, mean-eval-per-second=41301.79, mean-time-per-batch=0.021, mean-batch-size=873.95


Self Play: 100%|█████████▉| 9988/10000 [01:53<00:00, 17.93it/s]

Evaluation time: 0.002 seconds, size=12, eval-per-second=6051.66, total-batches=2000, mean-eval-per-second=33358.43, mean-time-per-batch=0.014, mean-batch-size=468.68


Self Play: 100%|██████████| 10000/10000 [01:55<00:00, 86.81it/s]


Writing 10000 trajectories...
Training model for gen 10...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:1.1058, train_policy_loss:0.8362, train_value_loss:0.2696, val:1.1031, val_policy_loss:0.8375, val_value_loss:0.2657
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-10/best.pt
iter 0/176/30000: loss 1.0749, policy_loss:0.8171, value_loss:0.2578, time 16.18s, iter_time: 0.00ms
iter 200/352/30000: loss 1.0123, policy_loss:0.8076, value_loss:0.2048, time 2.15s, iter_time: 89.78ms
iter 400/528/30000: loss 1.1320, policy_loss:0.8387, value_loss:0.2933, time 3.00s, iter_time: 62.50ms
iter 600/704/30000: loss 1.1109, policy_loss:0.8415, value_loss:0.2695, time 5.43s, iter_time: 75.38ms
iter 800/880/30000: loss 1.0945, policy_loss:0.8332, value_loss:0.2612, time 6.11s, iter_time: 63.65ms
step 1000: losses: train:1.1006, train_policy_loss:0.8324, train_value_loss:0.2682, val:1.1059, val_policy_loss:0.

Self Play:  98%|█████████▊| 9840/10000 [01:44<00:05, 26.91it/s] 

Evaluation time: 0.010 seconds, size=162, eval-per-second=16666.52, total-batches=1000, mean-eval-per-second=41275.76, mean-time-per-batch=0.021, mean-batch-size=865.48


Self Play: 100%|█████████▉| 9994/10000 [01:52<00:00, 11.80it/s]

Evaluation time: 0.002 seconds, size=6, eval-per-second=3848.57, total-batches=2000, mean-eval-per-second=33291.82, mean-time-per-batch=0.014, mean-batch-size=458.56


Self Play: 100%|██████████| 10000/10000 [01:55<00:00, 86.46it/s]


Writing 10000 trajectories...
Training model for gen 11...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.8201, train_policy_loss:0.6137, train_value_loss:0.2064, val:0.8353, val_policy_loss:0.6213, val_value_loss:0.2140
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-11/best.pt
iter 0/176/30000: loss 0.8229, policy_loss:0.6036, value_loss:0.2193, time 67.63s, iter_time: 0.00ms
iter 200/352/30000: loss 0.7998, policy_loss:0.5953, value_loss:0.2045, time 7.35s, iter_time: 306.20ms
iter 400/528/30000: loss 0.7289, policy_loss:0.5630, value_loss:0.1659, time 4.99s, iter_time: 103.95ms
iter 600/704/30000: loss 0.7958, policy_loss:0.6036, value_loss:0.1922, time 14.00s, iter_time: 194.38ms
iter 800/880/30000: loss 0.7229, policy_loss:0.5587, value_loss:0.1641, time 11.61s, iter_time: 120.99ms
step 1000: losses: train:0.7814, train_policy_loss:0.5864, train_value_loss:0.1950, val:0.7988, val_policy_l

Self Play:  99%|█████████▉| 9942/10000 [01:32<00:02, 21.80it/s] 

Evaluation time: 0.005 seconds, size=61, eval-per-second=12767.09, total-batches=1000, mean-eval-per-second=33894.79, mean-time-per-batch=0.019, mean-batch-size=658.47


Self Play: 100%|█████████▉| 9997/10000 [01:36<00:00,  5.96it/s]

Evaluation time: 0.002 seconds, size=4, eval-per-second=2461.81, total-batches=2000, mean-eval-per-second=30042.38, mean-time-per-batch=0.011, mean-batch-size=337.98


Self Play: 100%|█████████▉| 9999/10000 [01:36<00:00,  6.67it/s]

Evaluation time: 0.001 seconds, size=1, eval-per-second=742.35, total-batches=3000, mean-eval-per-second=27683.59, mean-time-per-batch=0.008, mean-batch-size=225.73


Self Play: 100%|██████████| 10000/10000 [01:38<00:00, 101.18it/s]


Writing 10000 trajectories...
Training model for gen 12...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.5948, train_policy_loss:0.4410, train_value_loss:0.1538, val:0.6005, val_policy_loss:0.4410, val_value_loss:0.1595
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-12/best.pt
iter 0/176/30000: loss 0.5886, policy_loss:0.4182, value_loss:0.1704, time 12.41s, iter_time: 0.00ms
iter 200/352/30000: loss 0.5833, policy_loss:0.4416, value_loss:0.1417, time 1.11s, iter_time: 46.22ms
iter 400/528/30000: loss 0.6092, policy_loss:0.4443, value_loss:0.1649, time 4.86s, iter_time: 101.25ms
iter 600/704/30000: loss 0.5588, policy_loss:0.4277, value_loss:0.1311, time 5.81s, iter_time: 80.63ms
iter 800/880/30000: loss 0.5575, policy_loss:0.4294, value_loss:0.1280, time 6.04s, iter_time: 62.90ms
step 1000: losses: train:0.5952, train_policy_loss:0.4382, train_value_loss:0.1570, val:0.6084, val_policy_loss:0.4401, val_value_loss:0.1683
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-12
iter 1000/1056/30000: loss 0.6703, p

Self Play:  99%|█████████▉| 9949/10000 [01:32<00:02, 21.24it/s] 

Evaluation time: 0.002 seconds, size=56, eval-per-second=31783.63, total-batches=1000, mean-eval-per-second=31930.60, mean-time-per-batch=0.021, mean-batch-size=656.20


Self Play: 100%|█████████▉| 9996/10000 [01:37<00:00,  6.67it/s]

Evaluation time: 0.002 seconds, size=4, eval-per-second=2467.60, total-batches=2000, mean-eval-per-second=27328.69, mean-time-per-batch=0.012, mean-batch-size=337.39


Self Play: 100%|██████████| 10000/10000 [01:39<00:00, 100.78it/s]


Writing 10000 trajectories...
Training model for gen 13...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.4860, train_policy_loss:0.3536, train_value_loss:0.1324, val:0.4778, val_policy_loss:0.3492, val_value_loss:0.1286
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-13/best.pt
iter 0/176/30000: loss 0.4792, policy_loss:0.3533, value_loss:0.1259, time 32.93s, iter_time: 0.00ms
iter 200/352/30000: loss 0.5415, policy_loss:0.3773, value_loss:0.1642, time 4.23s, iter_time: 176.26ms
iter 400/528/30000: loss 0.4464, policy_loss:0.3291, value_loss:0.1173, time 2.84s, iter_time: 59.19ms
iter 600/704/30000: loss 0.4484, policy_loss:0.3394, value_loss:0.1090, time 3.36s, iter_time: 46.63ms
iter 800/880/30000: loss 0.5049, policy_loss:0.3328, value_loss:0.1721, time 4.34s, iter_time: 45.24ms
step 1000: losses: train:0.4729, train_policy_loss:0.3411, train_value_loss:0.1318, val:0.4741, val_policy_loss:0.3396, val_value_loss:0.1345
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-13/best.pt
saving checkpoint to /U

Self Play: 100%|█████████▉| 9979/10000 [01:30<00:02,  8.19it/s] 

Evaluation time: 0.003 seconds, size=22, eval-per-second=8517.93, total-batches=1000, mean-eval-per-second=31525.32, mean-time-per-batch=0.020, mean-batch-size=623.50


Self Play: 100%|█████████▉| 9999/10000 [01:33<00:00,  5.78it/s]

Evaluation time: 0.001 seconds, size=1, eval-per-second=687.70, total-batches=2000, mean-eval-per-second=28367.02, mean-time-per-batch=0.011, mean-batch-size=315.58


Self Play: 100%|██████████| 10000/10000 [01:34<00:00, 106.33it/s]


Writing 10000 trajectories...
Training model for gen 14...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.3812, train_policy_loss:0.2882, train_value_loss:0.0930, val:0.3772, val_policy_loss:0.2865, val_value_loss:0.0907
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-14/best.pt
iter 0/176/30000: loss 0.3653, policy_loss:0.2805, value_loss:0.0848, time 29.85s, iter_time: 0.00ms
iter 200/352/30000: loss 0.4282, policy_loss:0.3038, value_loss:0.1243, time 4.02s, iter_time: 167.35ms
iter 400/528/30000: loss 0.3681, policy_loss:0.2933, value_loss:0.0748, time 5.59s, iter_time: 116.56ms
iter 600/704/30000: loss 0.4007, policy_loss:0.3122, value_loss:0.0885, time 3.85s, iter_time: 53.52ms
iter 800/880/30000: loss 0.4228, policy_loss:0.3016, value_loss:0.1212, time 6.83s, iter_time: 71.17ms
step 1000: losses: train:0.3772, train_policy_loss:0.2851, train_value_loss:0.0922, val:0.3793, val_policy_loss:

Self Play: 100%|█████████▉| 9983/10000 [01:24<00:01, 14.96it/s] 

Evaluation time: 0.002 seconds, size=21, eval-per-second=11152.24, total-batches=1000, mean-eval-per-second=31734.33, mean-time-per-batch=0.017, mean-batch-size=541.29


Self Play: 100%|██████████| 10000/10000 [01:26<00:00, 116.18it/s]

Evaluation time: 0.001 seconds, size=1, eval-per-second=782.52, total-batches=2000, mean-eval-per-second=28828.29, mean-time-per-batch=0.009, mean-batch-size=272.87
Writing 10000 trajectories...





Training model for gen 15...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.3298, train_policy_loss:0.2581, train_value_loss:0.0717, val:0.3388, val_policy_loss:0.2622, val_value_loss:0.0766
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-15/best.pt
iter 0/176/30000: loss 0.3781, policy_loss:0.2572, value_loss:0.1210, time 16.12s, iter_time: 0.00ms
iter 200/352/30000: loss 0.3333, policy_loss:0.2536, value_loss:0.0797, time 2.68s, iter_time: 111.74ms
iter 400/528/30000: loss 0.3274, policy_loss:0.2744, value_loss:0.0530, time 3.82s, iter_time: 79.54ms
iter 600/704/30000: loss 0.3637, policy_loss:0.2583, value_loss:0.1053, time 5.68s, iter_time: 78.90ms
iter 800/880/30000: loss 0.3432, policy_loss:0.2672, value_loss:0.0760, time 3.88s, iter_time: 40.38ms
step 1000: losses: train:0.3385, train_policy_loss:0.2635, train_value_loss:0.0750, val:0.3521, val_policy_loss:0.2688, val_value_loss:0.0833
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-15
iter 1000/1056/30000: loss 0.3262, p

Self Play: 100%|█████████▉| 9985/10000 [01:25<00:00, 22.26it/s] 

Evaluation time: 0.002 seconds, size=17, eval-per-second=7780.79, total-batches=1000, mean-eval-per-second=31224.13, mean-time-per-batch=0.017, mean-batch-size=542.55


Self Play: 100%|█████████▉| 9997/10000 [01:26<00:00,  7.70it/s]

Evaluation time: 0.002 seconds, size=2, eval-per-second=1164.92, total-batches=2000, mean-eval-per-second=28475.94, mean-time-per-batch=0.010, mean-batch-size=274.50


Self Play: 100%|██████████| 10000/10000 [01:27<00:00, 113.92it/s]


Writing 10000 trajectories...
Training model for gen 16...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.3009, train_policy_loss:0.2415, train_value_loss:0.0593, val:0.3039, val_policy_loss:0.2421, val_value_loss:0.0618
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-16/best.pt
iter 0/176/30000: loss 0.2767, policy_loss:0.2361, value_loss:0.0405, time 11.92s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2653, policy_loss:0.2138, value_loss:0.0515, time 4.01s, iter_time: 166.93ms
iter 400/528/30000: loss 0.3005, policy_loss:0.2400, value_loss:0.0605, time 3.61s, iter_time: 75.30ms
iter 600/704/30000: loss 0.3210, policy_loss:0.2481, value_loss:0.0730, time 5.27s, iter_time: 73.13ms
iter 800/880/30000: loss 0.3231, policy_loss:0.2468, value_loss:0.0763, time 5.79s, iter_time: 60.33ms
step 1000: losses: train:0.3051, train_policy_loss:0.2429, train_value_loss:0.0622, val:0.3092, val_policy_loss:0

Self Play: 100%|█████████▉| 9965/10000 [01:27<00:03, 10.40it/s] 

Evaluation time: 0.003 seconds, size=36, eval-per-second=12507.86, total-batches=1000, mean-eval-per-second=28165.04, mean-time-per-batch=0.020, mean-batch-size=554.42


Self Play: 100%|██████████| 10000/10000 [01:30<00:00, 111.10it/s]


Writing 10000 trajectories...
Training model for gen 17...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2974, train_policy_loss:0.2368, train_value_loss:0.0606, val:0.3073, val_policy_loss:0.2456, val_value_loss:0.0618
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-17/best.pt
iter 0/176/30000: loss 0.2705, policy_loss:0.2398, value_loss:0.0307, time 4.25s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2905, policy_loss:0.2304, value_loss:0.0601, time 0.97s, iter_time: 40.60ms
iter 400/528/30000: loss 0.2624, policy_loss:0.2214, value_loss:0.0410, time 2.07s, iter_time: 43.03ms
iter 600/704/30000: loss 0.3071, policy_loss:0.2465, value_loss:0.0606, time 4.77s, iter_time: 66.27ms
iter 800/880/30000: loss 0.2710, policy_loss:0.2198, value_loss:0.0511, time 4.32s, iter_time: 45.03ms
step 1000: losses: train:0.2942, train_policy_loss:0.2354, train_value_loss:0.0588, val:0.3145, val_policy_loss:0.2

Self Play: 100%|█████████▉| 9974/10000 [01:24<00:01, 19.42it/s] 

Evaluation time: 0.003 seconds, size=31, eval-per-second=11318.19, total-batches=1000, mean-eval-per-second=32498.63, mean-time-per-batch=0.017, mean-batch-size=551.20


Self Play: 100%|██████████| 10000/10000 [01:26<00:00, 115.16it/s]


Writing 10000 trajectories...
Training model for gen 18...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2913, train_policy_loss:0.2329, train_value_loss:0.0584, val:0.2862, val_policy_loss:0.2319, val_value_loss:0.0544
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-18/best.pt
iter 0/176/30000: loss 0.2954, policy_loss:0.2411, value_loss:0.0543, time 7.70s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2932, policy_loss:0.2371, value_loss:0.0561, time 1.02s, iter_time: 42.59ms
iter 400/528/30000: loss 0.2746, policy_loss:0.2101, value_loss:0.0645, time 2.17s, iter_time: 45.13ms
iter 600/704/30000: loss 0.2320, policy_loss:0.2064, value_loss:0.0256, time 7.37s, iter_time: 102.36ms
iter 800/880/30000: loss 0.2770, policy_loss:0.2284, value_loss:0.0486, time 5.94s, iter_time: 61.82ms
step 1000: losses: train:0.2881, train_policy_loss:0.2285, train_value_loss:0.0596, val:0.2900, val_policy_loss:0.

Self Play: 100%|█████████▉| 9984/10000 [01:23<00:00, 22.01it/s] 

Evaluation time: 0.002 seconds, size=17, eval-per-second=8768.22, total-batches=1000, mean-eval-per-second=29572.44, mean-time-per-batch=0.018, mean-batch-size=523.05


Self Play: 100%|██████████| 10000/10000 [01:25<00:00, 116.50it/s]


Writing 10000 trajectories...
Training model for gen 19...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2661, train_policy_loss:0.2140, train_value_loss:0.0521, val:0.2619, val_policy_loss:0.2133, val_value_loss:0.0486
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-19/best.pt
iter 0/176/30000: loss 0.2811, policy_loss:0.2223, value_loss:0.0588, time 3.22s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2752, policy_loss:0.2244, value_loss:0.0509, time 0.99s, iter_time: 41.32ms
iter 400/528/30000: loss 0.2754, policy_loss:0.2089, value_loss:0.0665, time 2.17s, iter_time: 45.18ms
iter 600/704/30000: loss 0.2635, policy_loss:0.2131, value_loss:0.0504, time 2.86s, iter_time: 39.74ms
iter 800/880/30000: loss 0.2376, policy_loss:0.2114, value_loss:0.0262, time 3.94s, iter_time: 41.03ms
step 1000: losses: train:0.2729, train_policy_loss:0.2183, train_value_loss:0.0547, val:0.2738, val_policy_loss:0.2

Self Play: 100%|█████████▉| 9989/10000 [01:23<00:00, 28.79it/s] 

Evaluation time: 0.002 seconds, size=18, eval-per-second=10453.82, total-batches=1000, mean-eval-per-second=29048.53, mean-time-per-batch=0.018, mean-batch-size=517.29


Self Play: 100%|██████████| 10000/10000 [01:24<00:00, 117.77it/s]


Writing 10000 trajectories...
Training model for gen 20...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.2544, train_policy_loss:0.2052, train_value_loss:0.0492, val:0.2599, val_policy_loss:0.2086, val_value_loss:0.0513
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-20/best.pt
iter 0/176/30000: loss 0.2722, policy_loss:0.2013, value_loss:0.0709, time 6.62s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2376, policy_loss:0.1939, value_loss:0.0437, time 1.00s, iter_time: 41.72ms
iter 400/528/30000: loss 0.2706, policy_loss:0.2144, value_loss:0.0562, time 2.38s, iter_time: 49.67ms
iter 600/704/30000: loss 0.2904, policy_loss:0.2180, value_loss:0.0724, time 6.34s, iter_time: 88.05ms
iter 800/880/30000: loss 0.2496, policy_loss:0.2052, value_loss:0.0444, time 8.45s, iter_time: 88.04ms
step 1000: losses: train:0.2557, train_policy_loss:0.2073, train_value_loss:0.0484, val:0.2697, val_policy_loss:0.2143, val_value_loss:0.0554
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-20
iter 1000/1056/30000: loss 0.2542, pol

Self Play: 100%|█████████▉| 9972/10000 [01:22<00:01, 20.47it/s] 

Evaluation time: 0.004 seconds, size=28, eval-per-second=7442.84, total-batches=1000, mean-eval-per-second=33605.65, mean-time-per-batch=0.016, mean-batch-size=528.46


Self Play: 100%|██████████| 10000/10000 [01:24<00:00, 118.85it/s]


Writing 10000 trajectories...
Training model for gen 21...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2475, train_policy_loss:0.1994, train_value_loss:0.0482, val:0.2398, val_policy_loss:0.1979, val_value_loss:0.0419
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-21/best.pt
iter 0/176/30000: loss 0.2385, policy_loss:0.1948, value_loss:0.0436, time 7.40s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2207, policy_loss:0.1876, value_loss:0.0331, time 1.12s, iter_time: 46.66ms
iter 400/528/30000: loss 0.2176, policy_loss:0.1848, value_loss:0.0328, time 1.89s, iter_time: 39.34ms
iter 600/704/30000: loss 0.2308, policy_loss:0.1835, value_loss:0.0472, time 2.91s, iter_time: 40.38ms
iter 800/880/30000: loss 0.2676, policy_loss:0.1999, value_loss:0.0677, time 6.47s, iter_time: 67.42ms
step 1000: losses: train:0.2466, train_policy_loss:0.1994, train_value_loss:0.0472, val:0.2511, val_policy_loss:0.2

Self Play: 100%|█████████▉| 9987/10000 [01:20<00:01, 10.62it/s] 

Evaluation time: 0.002 seconds, size=12, eval-per-second=6868.40, total-batches=1000, mean-eval-per-second=34216.76, mean-time-per-batch=0.015, mean-batch-size=517.45


Self Play: 100%|██████████| 10000/10000 [01:22<00:00, 120.69it/s]


Writing 10000 trajectories...
Training model for gen 22...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2474, train_policy_loss:0.1991, train_value_loss:0.0483, val:0.2441, val_policy_loss:0.1934, val_value_loss:0.0508
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-22/best.pt
iter 0/176/30000: loss 0.2539, policy_loss:0.2059, value_loss:0.0480, time 7.53s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2375, policy_loss:0.1990, value_loss:0.0385, time 1.53s, iter_time: 63.59ms
iter 400/528/30000: loss 0.2325, policy_loss:0.1881, value_loss:0.0444, time 2.54s, iter_time: 52.83ms
iter 600/704/30000: loss 0.2350, policy_loss:0.1934, value_loss:0.0416, time 3.25s, iter_time: 45.20ms
iter 800/880/30000: loss 0.2331, policy_loss:0.1918, value_loss:0.0413, time 4.28s, iter_time: 44.59ms
step 1000: losses: train:0.2428, train_policy_loss:0.1965, train_value_loss:0.0463, val:0.2432, val_policy_loss:0.1

Self Play: 100%|█████████▉| 9993/10000 [01:20<00:00, 17.58it/s] 

Evaluation time: 0.002 seconds, size=8, eval-per-second=5214.36, total-batches=1000, mean-eval-per-second=33764.19, mean-time-per-batch=0.015, mean-batch-size=511.01


Self Play: 100%|██████████| 10000/10000 [01:22<00:00, 121.90it/s]


Writing 10000 trajectories...
Training model for gen 23...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.2267, train_policy_loss:0.1846, train_value_loss:0.0421, val:0.2233, val_policy_loss:0.1833, val_value_loss:0.0401
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-23/best.pt
iter 0/176/30000: loss 0.2435, policy_loss:0.1971, value_loss:0.0464, time 6.87s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2222, policy_loss:0.1909, value_loss:0.0313, time 1.70s, iter_time: 70.67ms
iter 400/528/30000: loss 0.2156, policy_loss:0.1777, value_loss:0.0379, time 2.40s, iter_time: 49.94ms
iter 600/704/30000: loss 0.2224, policy_loss:0.1853, value_loss:0.0371, time 7.94s, iter_time: 110.25ms
iter 800/880/30000: loss 0.2373, policy_loss:0.1942, value_loss:0.0431, time 3.98s, iter_time: 41.43ms
step 1000: losses: train:0.2328, train_policy_loss:0.1894, train_value_loss:0.0434, val:0.2339, val_policy_loss:0.1891, val_value_loss:0.0448
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-23
iter 1000/1056/30000: loss 0.2267, po

Self Play: 100%|█████████▉| 9995/10000 [01:20<00:00, 36.09it/s] 

Evaluation time: 0.002 seconds, size=4, eval-per-second=2576.75, total-batches=1000, mean-eval-per-second=35900.47, mean-time-per-batch=0.014, mean-batch-size=518.63


Self Play: 100%|██████████| 10000/10000 [01:21<00:00, 122.70it/s]


Writing 10000 trajectories...
Training model for gen 24...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.2274, train_policy_loss:0.1847, train_value_loss:0.0427, val:0.2148, val_policy_loss:0.1805, val_value_loss:0.0343
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-24/best.pt
iter 0/176/30000: loss 0.1957, policy_loss:0.1795, value_loss:0.0162, time 7.84s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2367, policy_loss:0.1850, value_loss:0.0517, time 1.28s, iter_time: 53.42ms
iter 400/528/30000: loss 0.2213, policy_loss:0.1793, value_loss:0.0420, time 2.01s, iter_time: 41.97ms
iter 600/704/30000: loss 0.2291, policy_loss:0.1918, value_loss:0.0373, time 3.19s, iter_time: 44.27ms
iter 800/880/30000: loss 0.2406, policy_loss:0.1778, value_loss:0.0628, time 4.64s, iter_time: 48.31ms
step 1000: losses: train:0.2312, train_policy_loss:0.1867, train_value_loss:0.0445, val:0.2240, val_policy_loss:0.1848, val_value_loss:0.0392
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-24
iter 1000/1056/30000: loss 0.2374, pol

Self Play: 100%|█████████▉| 9993/10000 [01:22<00:00, 20.79it/s] 

Evaluation time: 0.002 seconds, size=9, eval-per-second=4137.75, total-batches=1000, mean-eval-per-second=33021.11, mean-time-per-batch=0.016, mean-batch-size=522.31


Self Play: 100%|██████████| 10000/10000 [01:22<00:00, 121.20it/s]


Writing 10000 trajectories...
Training model for gen 25...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.2274, train_policy_loss:0.1856, train_value_loss:0.0418, val:0.2196, val_policy_loss:0.1829, val_value_loss:0.0366
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-25/best.pt
iter 0/176/30000: loss 0.2363, policy_loss:0.1916, value_loss:0.0448, time 7.08s, iter_time: 0.00ms
iter 200/352/30000: loss 0.1896, policy_loss:0.1724, value_loss:0.0172, time 1.01s, iter_time: 42.08ms
iter 400/528/30000: loss 0.2458, policy_loss:0.1829, value_loss:0.0629, time 2.14s, iter_time: 44.57ms
iter 600/704/30000: loss 0.2100, policy_loss:0.1747, value_loss:0.0353, time 2.95s, iter_time: 40.96ms
iter 800/880/30000: loss 0.2306, policy_loss:0.1924, value_loss:0.0382, time 3.78s, iter_time: 39.35ms
step 1000: losses: train:0.2294, train_policy_loss:0.1864, train_value_loss:0.0430, val:0.2248, val_policy_loss:0.1857, val_value_loss:0.0391
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-25
iter 1000/1056/30000: loss 0.2346, pol

Self Play: 100%|█████████▉| 9996/10000 [01:22<00:00, 15.31it/s] 

Evaluation time: 0.004 seconds, size=5, eval-per-second=1369.08, total-batches=1000, mean-eval-per-second=31454.23, mean-time-per-batch=0.016, mean-batch-size=514.22


Self Play: 100%|██████████| 10000/10000 [01:22<00:00, 120.91it/s]


Writing 10000 trajectories...
Training model for gen 26...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.2278, train_policy_loss:0.1874, train_value_loss:0.0404, val:0.2251, val_policy_loss:0.1854, val_value_loss:0.0397
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-26/best.pt
iter 0/176/30000: loss 0.2147, policy_loss:0.1822, value_loss:0.0325, time 6.55s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2205, policy_loss:0.1848, value_loss:0.0357, time 1.44s, iter_time: 59.98ms
iter 400/528/30000: loss 0.2149, policy_loss:0.1767, value_loss:0.0382, time 2.03s, iter_time: 42.38ms
iter 600/704/30000: loss 0.2150, policy_loss:0.1857, value_loss:0.0293, time 2.96s, iter_time: 41.17ms
iter 800/880/30000: loss 0.2113, policy_loss:0.1832, value_loss:0.0282, time 3.90s, iter_time: 40.67ms
step 1000: losses: train:0.2287, train_policy_loss:0.1874, train_value_loss:0.0412, val:0.2301, val_policy_loss:0.1879, val_value_loss:0.0422
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-26
iter 1000/1056/30000: loss 0.2395, pol

Self Play: 100%|█████████▉| 9991/10000 [01:21<00:00, 25.97it/s] 

Evaluation time: 0.002 seconds, size=15, eval-per-second=8103.37, total-batches=1000, mean-eval-per-second=34697.26, mean-time-per-batch=0.015, mean-batch-size=523.35


Self Play: 100%|██████████| 10000/10000 [01:22<00:00, 121.51it/s]


Writing 10000 trajectories...
Training model for gen 27...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.2267, train_policy_loss:0.1877, train_value_loss:0.0389, val:0.2283, val_policy_loss:0.1877, val_value_loss:0.0406
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-27/best.pt
iter 0/176/30000: loss 0.1974, policy_loss:0.1762, value_loss:0.0212, time 3.56s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2122, policy_loss:0.1771, value_loss:0.0352, time 1.09s, iter_time: 45.33ms
iter 400/528/30000: loss 0.2061, policy_loss:0.1809, value_loss:0.0252, time 1.94s, iter_time: 40.49ms
iter 600/704/30000: loss 0.1956, policy_loss:0.1748, value_loss:0.0208, time 2.92s, iter_time: 40.57ms
iter 800/880/30000: loss 0.2182, policy_loss:0.1830, value_loss:0.0352, time 3.69s, iter_time: 38.42ms
step 1000: losses: train:0.2215, train_policy_loss:0.1841, train_value_loss:0.0374, val:0.2255, val_policy_loss:0.1853, val_value_loss:0.0402
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-27/best.pt
saving checkpoint to /Use

Self Play: 100%|█████████▉| 9995/10000 [01:22<00:00, 16.11it/s] 

Evaluation time: 0.003 seconds, size=8, eval-per-second=2864.23, total-batches=1000, mean-eval-per-second=34618.07, mean-time-per-batch=0.015, mean-batch-size=527.29


Self Play: 100%|█████████▉| 9999/10000 [01:22<00:00, 12.95it/s]

Evaluation time: 0.002 seconds, size=1, eval-per-second=540.78, total-batches=2000, mean-eval-per-second=30877.02, mean-time-per-batch=0.009, mean-batch-size=264.50


Self Play: 100%|██████████| 10000/10000 [01:24<00:00, 118.20it/s]


Writing 10000 trajectories...
Training model for gen 28...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2154, train_policy_loss:0.1812, train_value_loss:0.0341, val:0.2154, val_policy_loss:0.1801, val_value_loss:0.0353
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-28/best.pt
iter 0/176/30000: loss 0.2062, policy_loss:0.1789, value_loss:0.0272, time 8.01s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2115, policy_loss:0.1744, value_loss:0.0371, time 1.13s, iter_time: 47.15ms
iter 400/528/30000: loss 0.2006, policy_loss:0.1735, value_loss:0.0271, time 1.95s, iter_time: 40.58ms
iter 600/704/30000: loss 0.2077, policy_loss:0.1775, value_loss:0.0303, time 2.85s, iter_time: 39.53ms
iter 800/880/30000: loss 0.2393, policy_loss:0.1810, value_loss:0.0583, time 3.71s, iter_time: 38.64ms
step 1000: losses: train:0.2184, train_policy_loss:0.1833, train_value_loss:0.0352, val:0.2227, val_policy_loss:0.1

Self Play: 100%|█████████▉| 9992/10000 [01:22<00:00, 17.90it/s] 

Evaluation time: 0.004 seconds, size=10, eval-per-second=2562.82, total-batches=1000, mean-eval-per-second=33013.44, mean-time-per-batch=0.016, mean-batch-size=531.14


Self Play: 100%|██████████| 10000/10000 [01:24<00:00, 118.61it/s]


Writing 10000 trajectories...
Training model for gen 29...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2172, train_policy_loss:0.1833, train_value_loss:0.0339, val:0.2186, val_policy_loss:0.1849, val_value_loss:0.0337
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-29/best.pt
iter 0/176/30000: loss 0.2154, policy_loss:0.1830, value_loss:0.0324, time 3.27s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2096, policy_loss:0.1820, value_loss:0.0275, time 0.98s, iter_time: 40.74ms
iter 400/528/30000: loss 0.2119, policy_loss:0.1811, value_loss:0.0308, time 2.01s, iter_time: 41.92ms
iter 600/704/30000: loss 0.2083, policy_loss:0.1752, value_loss:0.0331, time 3.44s, iter_time: 47.75ms
iter 800/880/30000: loss 0.2196, policy_loss:0.1896, value_loss:0.0301, time 4.41s, iter_time: 45.95ms
step 1000: losses: train:0.2195, train_policy_loss:0.184

Self Play: 100%|█████████▉| 9995/10000 [01:22<00:00, 26.82it/s] 

Evaluation time: 0.006 seconds, size=7, eval-per-second=1218.46, total-batches=1000, mean-eval-per-second=33739.46, mean-time-per-batch=0.016, mean-batch-size=525.78


Self Play: 100%|██████████| 10000/10000 [01:23<00:00, 119.80it/s]


Writing 10000 trajectories...
Training model for gen 30...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2182, train_policy_loss:0.1849, train_value_loss:0.0333, val:0.2156, val_policy_loss:0.1846, val_value_loss:0.0310
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-30/best.pt
iter 0/176/30000: loss 0.2190, policy_loss:0.1858, value_loss:0.0332, time 8.92s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2111, policy_loss:0.1844, value_loss:0.0266, time 1.04s, iter_time: 43.36ms
iter 400/528/30000: loss 0.2398, policy_loss:0.1964, value_loss:0.0433, time 2.07s, iter_time: 43.20ms
iter 600/704/30000: loss 0.2150, policy_loss:0.1901, value_loss:0.0249, time 3.24s, iter_time: 45.03ms
iter 800/880/30000: loss 0.1908, policy_loss:0.1767, value_loss:0.0140, time 4.24s, iter_time: 44.20ms
step 1000: losses: train:0.2171, train_policy_loss:0.184

Self Play: 100%|█████████▉| 9995/10000 [01:22<00:00, 27.97it/s] 

Evaluation time: 0.002 seconds, size=6, eval-per-second=3397.11, total-batches=1000, mean-eval-per-second=32705.68, mean-time-per-batch=0.016, mean-batch-size=533.20


Self Play: 100%|██████████| 10000/10000 [01:23<00:00, 119.57it/s]


Writing 10000 trajectories...
Training model for gen 31...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2239, train_policy_loss:0.1886, train_value_loss:0.0353, val:0.2255, val_policy_loss:0.1906, val_value_loss:0.0349
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-31/best.pt
iter 0/176/30000: loss 0.2446, policy_loss:0.1922, value_loss:0.0524, time 3.24s, iter_time: 0.00ms
iter 200/352/30000: loss 0.1974, policy_loss:0.1763, value_loss:0.0212, time 1.13s, iter_time: 46.98ms
iter 400/528/30000: loss 0.2264, policy_loss:0.1832, value_loss:0.0432, time 2.04s, iter_time: 42.60ms
iter 600/704/30000: loss 0.2295, policy_loss:0.1873, value_loss:0.0422, time 2.89s, iter_time: 40.17ms
iter 800/880/30000: loss 0.2206, policy_loss:0.1849, value_loss:0.0358, time 3.83s, iter_time: 39.88ms
step 1000: losses: train:0.2147, train_policy_loss:0.1834, train_value_loss:0.0314, val:0.2256, val_policy_loss:0.1

Self Play: 100%|█████████▉| 9988/10000 [01:25<00:01, 10.54it/s] 

Evaluation time: 0.006 seconds, size=13, eval-per-second=2326.89, total-batches=1000, mean-eval-per-second=30322.23, mean-time-per-batch=0.018, mean-batch-size=536.73


Self Play: 100%|██████████| 10000/10000 [01:26<00:00, 115.40it/s]


Writing 10000 trajectories...
Training model for gen 32...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2119, train_policy_loss:0.1823, train_value_loss:0.0296, val:0.2078, val_policy_loss:0.1807, val_value_loss:0.0271
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-32/best.pt
iter 0/176/30000: loss 0.2293, policy_loss:0.1906, value_loss:0.0387, time 8.92s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2053, policy_loss:0.1798, value_loss:0.0255, time 1.31s, iter_time: 54.60ms
iter 400/528/30000: loss 0.2249, policy_loss:0.1852, value_loss:0.0397, time 2.04s, iter_time: 42.53ms
iter 600/704/30000: loss 0.1929, policy_loss:0.1820, value_loss:0.0110, time 3.16s, iter_time: 43.86ms
iter 800/880/30000: loss 0.2033, policy_loss:0.1834, value_loss:0.0199, time 3.92s, iter_time: 40.88ms
step 1000: losses: train:0.2137, train_policy_loss:0.1846, train_value_loss:0.0292, val:0.2118, val_policy_loss:0.1

Self Play: 100%|█████████▉| 9992/10000 [01:24<00:00, 26.27it/s] 

Evaluation time: 0.002 seconds, size=14, eval-per-second=7000.51, total-batches=1000, mean-eval-per-second=30410.48, mean-time-per-batch=0.017, mean-batch-size=528.00


Self Play: 100%|██████████| 10000/10000 [01:24<00:00, 118.05it/s]


Writing 10000 trajectories...
Training model for gen 33...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.2159, train_policy_loss:0.1842, train_value_loss:0.0317, val:0.2158, val_policy_loss:0.1854, val_value_loss:0.0305
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-33/best.pt
iter 0/176/30000: loss 0.1985, policy_loss:0.1779, value_loss:0.0206, time 3.69s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2048, policy_loss:0.1848, value_loss:0.0200, time 1.12s, iter_time: 46.57ms
iter 400/528/30000: loss 0.2346, policy_loss:0.1805, value_loss:0.0541, time 2.06s, iter_time: 42.84ms
iter 600/704/30000: loss 0.2430, policy_loss:0.1993, value_loss:0.0436, time 3.04s, iter_time: 42.21ms
iter 800/880/30000: loss 0.2012, policy_loss:0.1816, value_loss:0.0196, time 4.03s, iter_time: 41.96ms
step 1000: losses: train:0.2139, train_policy_loss:0.1848, train_value_loss:0.0291, val:0.2221, val_policy_loss:0.1892, val_value_loss:0.0329
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-33
iter 1000/1056/30000: loss 0.2214, pol

Self Play: 100%|█████████▉| 9989/10000 [01:21<00:00, 14.50it/s] 

Evaluation time: 0.002 seconds, size=13, eval-per-second=7165.98, total-batches=1000, mean-eval-per-second=33779.93, mean-time-per-batch=0.016, mean-batch-size=526.09


Self Play: 100%|██████████| 10000/10000 [01:24<00:00, 118.22it/s]

Evaluation time: 0.002 seconds, size=1, eval-per-second=553.63, total-batches=2000, mean-eval-per-second=28739.77, mean-time-per-batch=0.009, mean-batch-size=264.87
Writing 10000 trajectories...
Training model for gen 34...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False





step 0: losses: train:0.2179, train_policy_loss:0.1860, train_value_loss:0.0319, val:0.2204, val_policy_loss:0.1862, val_value_loss:0.0342
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-34/best.pt
iter 0/176/30000: loss 0.2188, policy_loss:0.1853, value_loss:0.0335, time 8.52s, iter_time: 0.00ms
iter 200/352/30000: loss 0.1811, policy_loss:0.1753, value_loss:0.0058, time 1.05s, iter_time: 43.61ms
iter 400/528/30000: loss 0.2056, policy_loss:0.1858, value_loss:0.0198, time 2.07s, iter_time: 43.20ms
iter 600/704/30000: loss 0.2249, policy_loss:0.1833, value_loss:0.0415, time 3.10s, iter_time: 43.02ms
iter 800/880/30000: loss 0.2039, policy_loss:0.1816, value_loss:0.0223, time 4.15s, iter_time: 43.27ms
step 1000: losses: train:0.2162, train_policy_loss:0.1861, train_value_loss:0.0301, val:0.2258, val_policy_loss:0.1888, val_value_loss:0.0370
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-34
iter 1000/1056/30000: loss 0.2457, pol

Self Play: 100%|█████████▉| 9992/10000 [01:22<00:00, 25.29it/s] 

Evaluation time: 0.005 seconds, size=11, eval-per-second=2169.13, total-batches=1000, mean-eval-per-second=35728.86, mean-time-per-batch=0.015, mean-batch-size=537.39


Self Play: 100%|██████████| 10000/10000 [01:23<00:00, 120.28it/s]


Writing 10000 trajectories...
Training model for gen 35...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2189, train_policy_loss:0.1878, train_value_loss:0.0311, val:0.2231, val_policy_loss:0.1890, val_value_loss:0.0341
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-35/best.pt
iter 0/176/30000: loss 0.2023, policy_loss:0.1798, value_loss:0.0225, time 3.20s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2153, policy_loss:0.1926, value_loss:0.0227, time 0.95s, iter_time: 39.47ms
iter 400/528/30000: loss 0.2028, policy_loss:0.1801, value_loss:0.0227, time 2.03s, iter_time: 42.34ms
iter 600/704/30000: loss 0.2013, policy_loss:0.1778, value_loss:0.0236, time 3.10s, iter_time: 43.02ms
iter 800/880/30000: loss 0.2158, policy_loss:0.1883, value_loss:0.0275, time 3.89s, iter_time: 40.50ms
step 1000: losses: train:0.2186, train_policy_loss:0.1881, train_value_loss:0.0306, val:0.2239, val_policy_loss:0.1

Self Play: 100%|█████████▉| 9989/10000 [01:27<00:00, 35.19it/s] 

Evaluation time: 0.006 seconds, size=16, eval-per-second=2496.89, total-batches=1000, mean-eval-per-second=36359.74, mean-time-per-batch=0.017, mean-batch-size=605.66


Self Play: 100%|██████████| 10000/10000 [01:28<00:00, 113.06it/s]


Writing 10000 trajectories...
Training model for gen 36...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2204, train_policy_loss:0.1867, train_value_loss:0.0337, val:0.2192, val_policy_loss:0.1888, val_value_loss:0.0304
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-36/best.pt
iter 0/176/30000: loss 0.2289, policy_loss:0.1830, value_loss:0.0459, time 3.39s, iter_time: 0.00ms
iter 200/352/30000: loss 0.1993, policy_loss:0.1791, value_loss:0.0203, time 0.95s, iter_time: 39.64ms
iter 400/528/30000: loss 0.2357, policy_loss:0.1857, value_loss:0.0500, time 2.05s, iter_time: 42.62ms
iter 600/704/30000: loss 0.1978, policy_loss:0.1839, value_loss:0.0138, time 2.93s, iter_time: 40.65ms
iter 800/880/30000: loss 0.1966, policy_loss:0.1798, value_loss:0.0168, time 3.87s, iter_time: 40.35ms
step 1000: losses: train:0.2195, train_policy_loss:0.1876, train_value_loss:0.0318, val:0.2242, val_policy_loss:0.1

Self Play: 100%|█████████▉| 9983/10000 [01:29<00:00, 17.26it/s] 

Evaluation time: 0.002 seconds, size=17, eval-per-second=8896.22, total-batches=1000, mean-eval-per-second=33171.30, mean-time-per-batch=0.018, mean-batch-size=600.62


Self Play: 100%|██████████| 10000/10000 [01:31<00:00, 109.63it/s]


Writing 10000 trajectories...
Training model for gen 37...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2275, train_policy_loss:0.1925, train_value_loss:0.0350, val:0.2258, val_policy_loss:0.1916, val_value_loss:0.0342
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-37/best.pt
iter 0/176/30000: loss 0.2186, policy_loss:0.1901, value_loss:0.0285, time 4.12s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2316, policy_loss:0.1875, value_loss:0.0441, time 1.13s, iter_time: 46.99ms
iter 400/528/30000: loss 0.2166, policy_loss:0.1935, value_loss:0.0232, time 2.07s, iter_time: 43.13ms
iter 600/704/30000: loss 0.2104, policy_loss:0.1874, value_loss:0.0230, time 3.00s, iter_time: 41.64ms
iter 800/880/30000: loss 0.2423, policy_loss:0.1956, value_loss:0.0467, time 4.03s, iter_time: 41.97ms
step 1000: losses: train:0.2246, train_policy_loss:0.1912, train_value_loss:0.0334, val:0.2272, val_policy_loss:0.1

Self Play: 100%|█████████▉| 9987/10000 [01:25<00:00, 22.49it/s] 

Evaluation time: 0.002 seconds, size=13, eval-per-second=6602.00, total-batches=1000, mean-eval-per-second=36864.61, mean-time-per-batch=0.016, mean-batch-size=586.71


Self Play: 100%|██████████| 10000/10000 [01:28<00:00, 113.61it/s]


Writing 10000 trajectories...
Training model for gen 38...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.2332, train_policy_loss:0.1966, train_value_loss:0.0366, val:0.2342, val_policy_loss:0.1986, val_value_loss:0.0356
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-38/best.pt
iter 0/176/30000: loss 0.2199, policy_loss:0.1861, value_loss:0.0339, time 4.31s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2311, policy_loss:0.1898, value_loss:0.0412, time 0.96s, iter_time: 39.97ms
iter 400/528/30000: loss 0.2357, policy_loss:0.1891, value_loss:0.0466, time 2.00s, iter_time: 41.70ms
iter 600/704/30000: loss 0.2290, policy_loss:0.1973, value_loss:0.0316, time 3.67s, iter_time: 50.90ms
iter 800/880/30000: loss 0.2203, policy_loss:0.1938, value_loss:0.0265, time 4.07s, iter_time: 42.38ms
step 1000: losses: train:0.2290, train_policy_loss:0.1946, train_value_loss:0.0344, val:0.2312, val_policy_loss:0.1973, val_value_loss:0.0338
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-38/best.pt
saving checkpoint to /Use

Self Play: 100%|█████████▉| 9993/10000 [01:23<00:00, 35.70it/s] 

Evaluation time: 0.002 seconds, size=8, eval-per-second=5051.10, total-batches=1000, mean-eval-per-second=36337.44, mean-time-per-batch=0.015, mean-batch-size=556.34


Self Play: 100%|██████████| 10000/10000 [01:24<00:00, 118.95it/s]


Writing 10000 trajectories...
Training model for gen 39...




num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False
step 0: losses: train:0.2290, train_policy_loss:0.1959, train_value_loss:0.0331, val:0.2334, val_policy_loss:0.1973, val_value_loss:0.0361
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-39/best.pt
iter 0/176/30000: loss 0.2122, policy_loss:0.1846, value_loss:0.0277, time 4.28s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2209, policy_loss:0.1913, value_loss:0.0297, time 1.02s, iter_time: 42.68ms
iter 400/528/30000: loss 0.2239, policy_loss:0.1963, value_loss:0.0275, time 2.04s, iter_time: 42.41ms
iter 600/704/30000: loss 0.2177, policy_loss:0.1843, value_loss:0.0334, time 3.06s, iter_time: 42.55ms
iter 800/880/30000: loss 0.2109, policy_loss:0.1952, value_loss:0.0157, time 3.99s, iter_time: 41.57ms
step 1000: losses: train:0.2301, train_policy_loss:0.1962, train_value_loss:0.0339, val:0.2394, val_policy_loss:0.2

Self Play: 100%|█████████▉| 9988/10000 [01:25<00:00, 13.68it/s] 

Evaluation time: 0.002 seconds, size=13, eval-per-second=7869.24, total-batches=1000, mean-eval-per-second=32424.49, mean-time-per-batch=0.017, mean-batch-size=562.22


Self Play: 100%|██████████| 10000/10000 [01:26<00:00, 115.29it/s]


Writing 10000 trajectories...
Training model for gen 40...
num decayed parameter tensors: 19, with 200,064 parameters
num non-decayed parameter tensors: 11, with 586 parameters
using fused AdamW: False




step 0: losses: train:0.2313, train_policy_loss:0.1982, train_value_loss:0.0331, val:0.2272, val_policy_loss:0.1967, val_value_loss:0.0305
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-40/best.pt
iter 0/176/30000: loss 0.2346, policy_loss:0.1977, value_loss:0.0370, time 3.90s, iter_time: 0.00ms
iter 200/352/30000: loss 0.2351, policy_loss:0.1971, value_loss:0.0381, time 1.13s, iter_time: 47.11ms
iter 400/528/30000: loss 0.2233, policy_loss:0.1894, value_loss:0.0339, time 2.34s, iter_time: 48.83ms
iter 600/704/30000: loss 0.2026, policy_loss:0.1894, value_loss:0.0132, time 3.18s, iter_time: 44.10ms
iter 800/880/30000: loss 0.2142, policy_loss:0.1939, value_loss:0.0203, time 3.96s, iter_time: 41.20ms
step 1000: losses: train:0.2301, train_policy_loss:0.1970, train_value_loss:0.0331, val:0.2273, val_policy_loss:0.1976, val_value_loss:0.0297
saving checkpoint to /Users/rodo/src/rgi3-sync/models/smoketest-e2e-v3/gen-40
iter 1000/1056/30000: loss 0.2339, pol

In [None]:
# current_model = await experiment_runner.run_generation_step_async(generation_id, current_model)

experiment_config.experiment_name='smoketest-e2e-v3-hack'   # Use sliding window.
experiment_config.parent_experiment_name='smoketest-e2e-v3'
experiment_config.num_generations=41
experiment_config.num_games_per_gen=1

experiment_runner = ExperimentRunner(experiment_config, experiment_base_dir, training_args=tuned_params)
await experiment_runner.play_generation_async(current_model, gen_id=experiment_config.num_generations)

  frame.f_lineno = line


# Tune Model (initial)


In [6]:
# Pick up any edits made to the local project modules before tuning.
reload_local_modules(verbose=False)

state_0 = game.initial_state()
NUM_GENERATIONS = 5
LEARNING_RATE = 0.1

# Settings held constant for every tuning run; none of these is ever searched over.
fixed_params = {
    'model_name': 'c4-smoketest',
    'model_version': '0.1',
    'num_players': game.num_players(state_0),
    'vocab_size': action_vocab.vocab_size,
    # Frozen into a tuple; computed_tune_options reads the last entry as `last_file`.
    'dataset_paths': tuple(
        experiment_runner.get_trajectory_paths(experiment_config.num_generations)
    ),

    # Evaluation / logging cadence.
    'eval_iters': 200,
    'log_interval': 1000,
    'eval_interval': 10_000,

    'device': device,
}

# Baseline hyperparameters handed to the Tuner as its starting configuration.
initial_params = {
    # Architecture.
    'n_layer': 2,
    'n_head': 2,
    'n_embd': 8,  # tiny model

    # Batching.
    'n_max_context': n_max_context,
    'batch_size': 32,
    'gradient_accumulation_steps': 1,

    # Run length.
    'max_iters': 100,
    'max_epochs': 1_000_000,  # Make max_epoch high, rely on max_iters to stop.

    # Learning-rate schedule.
    'learning_rate': LEARNING_RATE,
    'decay_lr': True,  # whether to decay the learning rate
    'lr_decay_iters': 100,  # make equal to max_iters usually
    'min_lr': LEARNING_RATE / 10,  # learning_rate / 10 usually
    'warmup_iters': 0,  # not super necessary potentially

    # Optimizer.
    'weight_decay': 1e-1,
    'beta1': 0.9,
    'beta2': 0.95,
    'grad_clip': 1.0,  # clip gradients at this value, or disable if == 0.0

    'dtype': "float16",

    # Regularisation / layer options.
    'dropout': 0.0,
    'bias': False,  # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    'last_file': None,  # Used in tuning key only.
}

# Candidate values for each directly tunable hyperparameter. This dict is passed to
# the Tuner as `tune_options`; parameters that depend on other parameters
# (min_lr, lr_decay_iters, warmup_iters, n_head, last_file) are derived separately
# in `computed_tune_options`.
tune_options = dict(
    n_layer = [1, 2, 3, 4, 5, 6, 8, 10, 12, 16, 32],
    # n_head = [1, 2, 4, 8, 16, 32],   # Needs to be calculated to ensure n_embd % n_head == 0
    n_embd = [8, 16, 32, 64, 128, 256, 512, 1024, 2048],

    # Context length is not searched: the single option is the initial value.
    n_max_context = [initial_params['n_max_context']],
    batch_size = [16, 32, 64, 128, 256, 512, 1024],
    gradient_accumulation_steps = [1],  # TODO: We only support 1 for now. This fails if we don't have an exact multiple of the batch size per epoch.

    max_iters = [100, 300, 1_000, 3_000, 5_000, 10_000, 30_000, 100_000, 300_000],
    max_epochs = [1_000_000], # Make max_epoch high, rely on max_iters to stop.

    learning_rate = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0],
    decay_lr = [False, True],

    # TODO: What is a sensible range here?
    beta1 = [0.90, 0.95, 0.99],
    beta2 = [0.95, 0.98, 0.99],

    weight_decay = [0.01, 0.05, 0.1, 0.2],
    # Fixed typo: was `[0,0, 1.0]`, i.e. the three-element list [0, 0, 1.0] with a
    # duplicated "disabled" candidate instead of the intended [0.0, 1.0].
    grad_clip = [0.0, 1.0],  # clip gradients at this value, or disable if == 0.0

    dtype = ["bfloat16", "float16"],
    dropout = [0.0, 0.01, 0.02, 0.05, 0.1],
    bias = [True, False],
)

# Head counts to consider; only those dividing n_embd evenly are offered.
_n_head_options = [1, 2, 4, 8, 16, 32]

# Options derived from the other parameters of a candidate configuration.
# Each entry maps a parameter name to a callable taking the current parameter
# dict (`opt`) and returning the list of allowed values for that parameter.
computed_tune_options = {
    # Floor of the learning-rate schedule: one tenth of the learning rate.
    'min_lr': lambda opt: [opt['learning_rate'] / 10],
    # Decay horizon always matches the run length.
    'lr_decay_iters': lambda opt: [opt['max_iters']],
    # Without decay only 0 is offered; otherwise warmup must be shorter than the decay horizon.
    'warmup_iters': lambda opt: (
        [0]
        if not opt['decay_lr']
        else [steps for steps in (0, 100, 500, 1000) if steps < opt['lr_decay_iters']]
    ),
    # Keep only head counts that divide the embedding width.
    'n_head': lambda opt: [heads for heads in _n_head_options if opt['n_embd'] % heads == 0],
    # String form of the last dataset path.
    'last_file': lambda opt: [str(opt['dataset_paths'][-1])],
}

from rgi.rgizero.models.tuner import Tuner

# Version tag passed to the Tuner as `cache_version`.
TUNER_VERSION = "0.0.6-smoketest"

# Each dict is shallow-copied so the tuner does not share the top-level objects
# defined in this cell.
tuner = Tuner(
    fixed_params=dict(fixed_params),
    initial_params=dict(initial_params),
    tune_options=dict(tune_options),
    computed_tune_options=dict(computed_tune_options),
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=1.0,
)
tuner.autotune_smart()


transform_config_fields: {'n_max_context', 'dropout', 'n_head', 'n_layer', 'n_embd', 'bias'}
train_config_fields: {'wandb_log', 'beta1', 'min_lr', 'eval_iters', 'decay_lr', 'max_iters', 'max_epochs', 'warmup_iters', 'batch_size', 'eval_only', 'patience', 'device', 'model_version', 'learning_rate', 'eval_interval', 'weight_decay', 'lr_decay_iters', 'compile', 'always_save_checkpoint', 'beta2', 'gradient_accumulation_steps', 'model_name', 'dtype', 'grad_clip', 'log_interval'}
Using initial model as baseline.
Training initial
model_config=TransformerConfig(n_max_context=44, n_layer=2, n_head=2, n_embd=8, dropout=0.0, bias=False)
train_config=TrainConfig(model_name='c4-smoketest', model_version='0.1', eval_interval=10000, log_interval=1000, eval_iters=200, eval_only=False, always_save_checkpoint=True, wandb_log=False, gradient_accumulation_steps=1, batch_size=32, learning_rate=0.1, max_epochs=1000000, max_iters=100, weight_decay=0.1, beta1=0.9, beta2=0.95, grad_clip=1.0, decay_lr=True, war



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 41.51s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 55.675215005874634s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.1, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.01, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 2.63s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 7.4931111335754395s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 2.49s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 7.180812120437622s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 2.72s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 7.686541795730591s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 60.95s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 70.05358600616455s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7776, train_policy_loss:2.1003, train_value_loss:0.6772, val:2.7776, val_policy_loss:2.1004, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7789, policy_loss:2.0993, value_loss:0.6795, time 115.24s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7776)
## train_loss: 2.7774, val_loss: 2.7776, Time taken: 138.09453701972961s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 64, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 2.98s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 7.559458017349243s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.95, 'beta2': 0.95, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 2.74s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 6.749730110168457s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 3.12s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 7.102126836776733s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 2.86s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 7.028873920440674s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 2.94s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 6.997674942016602s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.95, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7792, train_policy_loss:2.0801, train_value_loss:0.6991, val:2.7792, val_policy_loss:2.0801, val_value_loss:0.6991
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7809, policy_loss:2.0819, value_loss:0.6990, time 5.79s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7792)
## train_loss: 2.7791, val_loss: 2.7792, Time taken: 11.624053001403809s, val_policy_loss: 2.0801, val_value_loss: 0.6991, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': True, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 3.07s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 6.983636140823364s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': False, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7790, policy_loss:2.1017, value_loss:0.6773, time 4.24s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 9.56999397277832s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.01, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/300/300: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 2.80s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 10.227452754974365s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 300, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 300, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7311, train_policy_loss:2.0920, train_value_loss:0.6391, val:2.7310, val_policy_loss:2.0919, val_value_loss:0.6391
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7307, policy_loss:2.0925, value_loss:0.6382, time 11.98s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7310)
## train_loss: 2.7309, val_loss: 2.7309, Time taken: 17.631279230117798s, val_policy_loss: 2.0920, val_value_loss: 0.6389, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 6.12s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 11.17986798286438s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 1, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 4.48s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 9.6476731300354s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 4, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_name



step 0: losses: train:2.7915, train_policy_loss:2.1002, train_value_loss:0.6912, val:2.7915, val_policy_loss:2.1003, val_value_loss:0.6911
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7914, policy_loss:2.1004, value_loss:0.6910, time 3.33s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7915)
## train_loss: 2.7914, val_loss: 2.7915, Time taken: 7.781635046005249s, val_policy_loss: 2.1003, val_value_loss: 0.6912, overrides={'n_layer': 1, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.8041, train_policy_loss:2.0920, train_value_loss:0.7121, val:2.8041, val_policy_loss:2.0920, val_value_loss:0.7121
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.8063, policy_loss:2.0933, value_loss:0.7130, time 3.72s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.8041)
## train_loss: 2.8040, val_loss: 2.8041, Time taken: 8.892699956893921s, val_policy_loss: 2.0919, val_value_loss: 0.7122, overrides={'n_layer': 3, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 3.25s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 7.18946099281311s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.05, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 3.06s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 7.116979122161865s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.2, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6771, val:2.7777, val_policy_loss:2.1005, val_value_loss:0.6772
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7791, policy_loss:2.1015, value_loss:0.6775, time 3.12s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7777)
## train_loss: 2.7775, val_loss: 2.7776, Time taken: 7.197895050048828s, val_policy_loss: 2.1004, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.99, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 2.80s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 6.736835956573486s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 2.27s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 6.402606010437012s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 2.35s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 6.448047161102295s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.1, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.01, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_name



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/300/300: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 2.32s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 19.243173122406006s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 300, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 300, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 2.52s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 7.245619058609009s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.2, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 2.25s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 7.102996110916138s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.05, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 2.55s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 6.748692989349365s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.95, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 2.75s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 6.564391136169434s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': False, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7916, train_policy_loss:2.1004, train_value_loss:0.6912, val:2.7916, val_policy_loss:2.1005, val_value_loss:0.6911
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7936, policy_loss:2.1027, value_loss:0.6909, time 2.47s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7916)
## train_loss: 2.7914, val_loss: 2.7913, Time taken: 5.920759677886963s, val_policy_loss: 2.1001, val_value_loss: 0.6912, overrides={'n_layer': 1, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 4.94s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 9.053121089935303s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 4, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7758, policy_loss:2.1006, value_loss:0.6752, time 6.41s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 10.412021160125732s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.01, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.8043, train_policy_loss:2.0922, train_value_loss:0.7121, val:2.8042, val_policy_loss:2.0921, val_value_loss:0.7121
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.8060, policy_loss:2.0929, value_loss:0.7130, time 3.00s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.8042)
## train_loss: 2.8042, val_loss: 2.8040, Time taken: 7.244549989700317s, val_policy_loss: 2.0918, val_value_loss: 0.7122, overrides={'n_layer': 3, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 5.57s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 9.712861061096191s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 1, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7792, train_policy_loss:2.0801, train_value_loss:0.6991, val:2.7793, val_policy_loss:2.0801, val_value_loss:0.6992
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7777, policy_loss:2.0778, value_loss:0.6999, time 2.40s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7793)
## train_loss: 2.7792, val_loss: 2.7792, Time taken: 7.08407187461853s, val_policy_loss: 2.0801, val_value_loss: 0.6991, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': True, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_name



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 3.59s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 8.62743616104126s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.05, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.23s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 6.6698620319366455s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.02, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.002, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.27s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 6.061525106430054s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.66s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 6.59116005897522s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.005, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.0005, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.78s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 6.526685953140259s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': False, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.70s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 6.550430059432983s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.05, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.64s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 6.8383378982543945s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.2, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.42s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 7.131251096725464s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.95, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7489, train_policy_loss:2.0806, train_value_loss:0.6683, val:2.7487, val_policy_loss:2.0805, val_value_loss:0.6681
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7456, policy_loss:2.0786, value_loss:0.6670, time 2.26s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7487)
## train_loss: 2.7486, val_loss: 2.7491, Time taken: 6.2312469482421875s, val_policy_loss: 2.0809, val_value_loss: 0.6682, overrides={'n_layer': 1, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7309, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 3.71s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 8.863770246505737s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 4, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7766, train_policy_loss:2.0946, train_value_loss:0.6820, val:2.7768, val_policy_loss:2.0948, val_value_loss:0.6820
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7789, policy_loss:2.0966, value_loss:0.6823, time 2.43s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7768)
## train_loss: 2.7761, val_loss: 2.7762, Time taken: 7.046210050582886s, val_policy_loss: 2.0944, val_value_loss: 0.6818, overrides={'n_layer': 3, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7841, train_policy_loss:2.0674, train_value_loss:0.7167, val:2.7846, val_policy_loss:2.0678, val_value_loss:0.7168
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7853, policy_loss:2.0672, value_loss:0.7181, time 3.77s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7846)
## train_loss: 2.7846, val_loss: 2.7849, Time taken: 8.98731017112732s, val_policy_loss: 2.0682, val_value_loss: 0.7167, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': True, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_nam



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7275, policy_loss:2.0919, value_loss:0.6357, time 3.54s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 8.800753831863403s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.01, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0921, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 4.77s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 9.915497064590454s, val_policy_loss: 2.0922, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 1, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.71s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 6.4183738231658936s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.99, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7311, train_policy_loss:2.0920, train_value_loss:0.6391, val:2.7310, val_policy_loss:2.0919, val_value_loss:0.6391
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7307, policy_loss:2.0925, value_loss:0.6382, time 2.98s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7310)
## train_loss: 2.7309, val_loss: 2.7309, Time taken: 7.022016763687134s, val_policy_loss: 2.0920, val_value_loss: 0.6389, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 32, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/300/300: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.80s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 9.918485641479492s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 300, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 300, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n



step 0: losses: train:2.6982, train_policy_loss:2.0772, train_value_loss:0.6210, val:2.6986, val_policy_loss:2.0775, val_value_loss:0.6211
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.6917, policy_loss:2.0720, value_loss:0.6196, time 6.32s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.6986)
## train_loss: 2.6974, val_loss: 2.6978, Time taken: 10.851203918457031s, val_policy_loss: 2.0774, val_value_loss: 0.6204, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 32, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_



step 0: losses: train:2.7775, train_policy_loss:2.1004, train_value_loss:0.6770, val:2.7774, val_policy_loss:2.1004, val_value_loss:0.6770
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7761, policy_loss:2.1005, value_loss:0.6757, time 2.37s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7774)
## train_loss: 2.7776, val_loss: 2.7776, Time taken: 6.582316160202026s, val_policy_loss: 2.1003, val_value_loss: 0.6772, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 8, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.98, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_na



step 0: losses: train:2.7310, train_policy_loss:2.0919, train_value_loss:0.6391, val:2.7309, val_policy_loss:2.0920, val_value_loss:0.6389
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.7263, policy_loss:2.0908, value_loss:0.6355, time 2.48s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.7309)
## train_loss: 2.7309, val_loss: 2.7311, Time taken: 6.629193305969238s, val_policy_loss: 2.0921, val_value_loss: 0.6390, overrides={'n_layer': 2, 'n_head': 2, 'n_embd': 16, 'batch_size': 16, 'gradient_accumulation_steps': 1, 'max_iters': 100, 'max_epochs': 1000000, 'learning_rate': 0.01, 'decay_lr': True, 'lr_decay_iters': 100, 'min_lr': 0.001, 'warmup_iters': 0, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'grad_clip': 1.0, 'dtype': 'float16', 'dropout': 0.0, 'bias': False, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'model_n

(True,
 2.7311145186424257,
 6.061525106430054,
 {'n_layer': 2,
  'n_head': 2,
  'n_embd': 16,
  'n_max_context': 44,
  'batch_size': 16,
  'gradient_accumulation_steps': 1,
  'max_iters': 100,
  'max_epochs': 1000000,
  'learning_rate': 0.01,
  'decay_lr': True,
  'lr_decay_iters': 100,
  'min_lr': 0.001,
  'warmup_iters': 0,
  'weight_decay': 0.1,
  'beta1': 0.9,
  'beta2': 0.98,
  'grad_clip': 1.0,
  'dtype': 'float16',
  'dropout': 0.0,
  'bias': False,
  'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40',
  'model_name': 'c4-smoketest',
  'model_version': '0.1',
  'num_players': 2,
  'vocab_size': 8,
  'dataset_paths': (PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-31'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-32'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-33'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-34'),
   

In [7]:
# Build a fresh Tuner and run its smart autotune with
# target_improvement_per_minute=0.1.
# Each argument is passed as a .copy() of the notebook-level object, presumably
# so the Tuner cannot mutate the originals that later cells reuse - confirm in Tuner.
# NOTE(review): cache_version=TUNER_VERSION looks like it keys the tuner's result
# cache - confirm in Tuner.
tuner = Tuner(
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(), 
    computed_tune_options=computed_tune_options.copy(),
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.1)
# Bare last expression: the notebook displays autotune_smart()'s return value.
tuner.autotune_smart()


Using initial model as baseline.
## Initial Model, loss=2.697791143655777 elapsed=10.851203918457031s, val_policy=2.0774, val_value=0.6204
## Searching generation 0 with 19 candidates, including ['learning_rate: 0.01 -> 0.005', 'learning_rate: 0.01 -> 0.02', 'weight_decay: 0.1 -> 0.05', 'decay_lr: True -> False', 'weight_decay: 0.1 -> 0.2']
Training learning_rate: 0.01 -> 0.005
model_config=TransformerConfig(n_max_context=44, n_layer=2, n_head=2, n_embd=32, dropout=0.0, bias=False)
train_config=TrainConfig(model_name='c4-smoketest', model_version='0.1', eval_interval=10000, log_interval=1000, eval_iters=200, eval_only=False, always_save_checkpoint=True, wandb_log=False, gradient_accumulation_steps=1, batch_size=16, learning_rate=0.005, max_epochs=1000000, max_iters=100, weight_decay=0.1, beta1=0.9, beta2=0.98, grad_clip=1.0, decay_lr=True, warmup_iters=0, lr_decay_iters=100, min_lr=0.0005, device='mps', dtype='float16', compile=False, patience=5)
num decayed parameter tensors: 11, with



step 0: losses: train:2.6982, train_policy_loss:2.0772, train_value_loss:0.6210, val:2.6986, val_policy_loss:2.0775, val_value_loss:0.6211
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.6917, policy_loss:2.0720, value_loss:0.6196, time 2.29s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.6986)
## train_loss: 2.6974, val_loss: 2.6978, Time taken: 6.646500110626221s, val_policy_loss: 2.0774, val_value_loss: 0.6204, overrides={'batch_size': 16, 'beta1': 0.9, 'beta2': 0.98, 'bias': False, 'decay_lr': True, 'dropout': 0.0, 'dtype': 'float16', 'grad_clip': 1.0, 'gradient_accumulation_steps': 1, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'learning_rate': 0.005, 'lr_decay_iters': 100, 'max_epochs': 1000000, 'max_iters': 100, 'min_lr': 0.0005, 'n_embd': 32, 'n_head': 2, 'n_layer': 2, 'warmup_iters': 0, 'weight_decay': 0.1, 'model



step 0: losses: train:2.6982, train_policy_loss:2.0772, train_value_loss:0.6210, val:2.6986, val_policy_loss:2.0775, val_value_loss:0.6211
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.6917, policy_loss:2.0720, value_loss:0.6196, time 2.65s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.6986)
## train_loss: 2.6974, val_loss: 2.6978, Time taken: 7.1439478397369385s, val_policy_loss: 2.0774, val_value_loss: 0.6204, overrides={'batch_size': 16, 'beta1': 0.9, 'beta2': 0.98, 'bias': False, 'decay_lr': True, 'dropout': 0.0, 'dtype': 'float16', 'grad_clip': 1.0, 'gradient_accumulation_steps': 1, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'learning_rate': 0.002, 'lr_decay_iters': 100, 'max_epochs': 1000000, 'max_iters': 100, 'min_lr': 0.0002, 'n_embd': 32, 'n_head': 2, 'n_layer': 2, 'warmup_iters': 0, 'weight_decay': 0.1, 'mode



step 0: losses: train:2.6982, train_policy_loss:2.0772, train_value_loss:0.6210, val:2.6986, val_policy_loss:2.0775, val_value_loss:0.6211
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.6917, policy_loss:2.0720, value_loss:0.6196, time 2.62s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.6986)
## train_loss: 2.6974, val_loss: 2.6978, Time taken: 6.355707168579102s, val_policy_loss: 2.0774, val_value_loss: 0.6204, overrides={'batch_size': 16, 'beta1': 0.9, 'beta2': 0.98, 'bias': False, 'decay_lr': True, 'dropout': 0.0, 'dtype': 'float16', 'grad_clip': 1.0, 'gradient_accumulation_steps': 1, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'learning_rate': 0.005, 'lr_decay_iters': 100, 'max_epochs': 1000000, 'max_iters': 100, 'min_lr': 0.0005, 'n_embd': 32, 'n_head': 2, 'n_layer': 2, 'warmup_iters': 0, 'weight_decay': 0.05, 'mode



step 0: losses: train:2.6982, train_policy_loss:2.0772, train_value_loss:0.6210, val:2.6986, val_policy_loss:2.0775, val_value_loss:0.6211
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.6917, policy_loss:2.0720, value_loss:0.6196, time 2.90s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.6986)
## train_loss: 2.6974, val_loss: 2.6978, Time taken: 6.664028167724609s, val_policy_loss: 2.0774, val_value_loss: 0.6204, overrides={'batch_size': 16, 'beta1': 0.9, 'beta2': 0.98, 'bias': False, 'decay_lr': True, 'dropout': 0.0, 'dtype': 'float16', 'grad_clip': 1.0, 'gradient_accumulation_steps': 1, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'learning_rate': 0.002, 'lr_decay_iters': 100, 'max_epochs': 1000000, 'max_iters': 100, 'min_lr': 0.0002, 'n_embd': 32, 'n_head': 2, 'n_layer': 2, 'warmup_iters': 0, 'weight_decay': 0.05, 'mode



step 0: losses: train:2.6982, train_policy_loss:2.0772, train_value_loss:0.6210, val:2.6986, val_policy_loss:2.0775, val_value_loss:0.6211
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.6917, policy_loss:2.0720, value_loss:0.6196, time 2.74s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.6986)
## train_loss: 2.6974, val_loss: 2.6978, Time taken: 6.554748058319092s, val_policy_loss: 2.0774, val_value_loss: 0.6204, overrides={'batch_size': 16, 'beta1': 0.9, 'beta2': 0.98, 'bias': False, 'decay_lr': True, 'dropout': 0.0, 'dtype': 'float16', 'grad_clip': 1.0, 'gradient_accumulation_steps': 1, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'learning_rate': 0.01, 'lr_decay_iters': 100, 'max_epochs': 1000000, 'max_iters': 100, 'min_lr': 0.001, 'n_embd': 32, 'n_head': 2, 'n_layer': 2, 'warmup_iters': 0, 'weight_decay': 0.05, 'model_



step 0: losses: train:2.6982, train_policy_loss:2.0772, train_value_loss:0.6210, val:2.6986, val_policy_loss:2.0775, val_value_loss:0.6211
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.6917, policy_loss:2.0720, value_loss:0.6196, time 2.61s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.6986)
## train_loss: 2.6974, val_loss: 2.6978, Time taken: 6.576114892959595s, val_policy_loss: 2.0774, val_value_loss: 0.6204, overrides={'batch_size': 16, 'beta1': 0.9, 'beta2': 0.98, 'bias': False, 'decay_lr': False, 'dropout': 0.0, 'dtype': 'float16', 'grad_clip': 1.0, 'gradient_accumulation_steps': 1, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'learning_rate': 0.005, 'lr_decay_iters': 100, 'max_epochs': 1000000, 'max_iters': 100, 'min_lr': 0.0005, 'n_embd': 32, 'n_head': 2, 'n_layer': 2, 'warmup_iters': 0, 'weight_decay': 0.05, 'mod

(True,
 2.697791143655777,
 6.22353196144104,
 {'batch_size': 16,
  'beta1': 0.9,
  'beta2': 0.98,
  'bias': False,
  'decay_lr': True,
  'dropout': 0.0,
  'dtype': 'bfloat16',
  'grad_clip': 1.0,
  'gradient_accumulation_steps': 1,
  'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40',
  'learning_rate': 0.005,
  'lr_decay_iters': 100,
  'max_epochs': 1000000,
  'max_iters': 100,
  'min_lr': 0.0005,
  'n_embd': 32,
  'n_head': 2,
  'n_layer': 2,
  'n_max_context': 44,
  'warmup_iters': 0,
  'weight_decay': 0.05,
  'model_name': 'c4-smoketest',
  'model_version': '0.1',
  'num_players': 2,
  'vocab_size': 8,
  'dataset_paths': (PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-31'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-32'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-33'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-34'),
 

In [8]:
# Same Tuner construction as the previous cell, but with
# target_improvement_per_minute lowered to 0.01.
# Rebinds the notebook-level name `tuner` to the new instance.
tuner = Tuner(
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(), 
    computed_tune_options=computed_tune_options.copy(),
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.01)
# Bare last expression: the notebook displays autotune_smart()'s return value.
tuner.autotune_smart()

Using initial model as baseline.
## Initial Model, loss=2.697791143655777 elapsed=6.22353196144104s, val_policy=2.0774, val_value=0.6204
## Searching generation 0 with 19 candidates, including ['learning_rate: 0.005 -> 0.002', 'learning_rate: 0.005 -> 0.01', 'weight_decay: 0.05 -> 0.01', 'decay_lr: True -> False', 'beta2: 0.98 -> 0.99']
## improved: False, loss=2.6978 elapsed=6.40s, mutation learning_rate: 0.005 -> 0.002
## improved: False, loss=2.6978 elapsed=6.53s, mutation learning_rate: 0.005 -> 0.01
## improved: False, loss=2.6978 elapsed=6.35s, mutation weight_decay: 0.05 -> 0.01
## improved: False, loss=2.6978 elapsed=6.55s, mutation decay_lr: True -> False
## improved: False, loss=2.6978 elapsed=6.87s, mutation beta2: 0.98 -> 0.99
## improved: False, loss=2.6978 elapsed=9.90s, mutation max_iters: 100 -> 300
## improved: False, loss=2.6978 elapsed=9.13s, mutation n_head: 2 -> 4
## improved: False, loss=2.6977 elapsed=9.47s, mutation n_head: 2 -> 1
## improved: False, loss=2.6978

(False,
 2.697791143655777,
 6.22353196144104,
 {'batch_size': 16,
  'beta1': 0.9,
  'beta2': 0.98,
  'bias': False,
  'decay_lr': True,
  'dropout': 0.0,
  'dtype': 'bfloat16',
  'grad_clip': 1.0,
  'gradient_accumulation_steps': 1,
  'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40',
  'learning_rate': 0.005,
  'lr_decay_iters': 100,
  'max_epochs': 1000000,
  'max_iters': 100,
  'min_lr': 0.0005,
  'n_embd': 32,
  'n_head': 2,
  'n_layer': 2,
  'n_max_context': 44,
  'warmup_iters': 0,
  'weight_decay': 0.05,
  'model_name': 'c4-smoketest',
  'model_version': '0.1',
  'num_players': 2,
  'vocab_size': 8,
  'dataset_paths': (PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-31'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-32'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-33'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-34'),


In [9]:
# Same Tuner construction again, with target_improvement_per_minute=0.001.
# Rebinds the notebook-level name `tuner` to the new instance.
tuner = Tuner(
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(), 
    computed_tune_options=computed_tune_options.copy(),
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.001)
# Bare last expression: the notebook displays autotune_smart()'s return value.
tuner.autotune_smart()

Using initial model as baseline.
## Initial Model, loss=2.697672313451767 elapsed=9.468239068984985s, val_policy=2.0773, val_value=0.6204
## Searching generation 0 with 18 candidates, including ['learning_rate: 0.005 -> 0.002', 'learning_rate: 0.005 -> 0.01', 'weight_decay: 0.05 -> 0.01', 'decay_lr: True -> False', 'beta2: 0.98 -> 0.99']
Training learning_rate: 0.005 -> 0.002
model_config=TransformerConfig(n_max_context=44, n_layer=2, n_head=1, n_embd=32, dropout=0.0, bias=False)
train_config=TrainConfig(model_name='c4-smoketest', model_version='0.1', eval_interval=10000, log_interval=1000, eval_iters=200, eval_only=False, always_save_checkpoint=True, wandb_log=False, gradient_accumulation_steps=1, batch_size=16, learning_rate=0.002, max_epochs=1000000, max_iters=100, weight_decay=0.05, beta1=0.9, beta2=0.98, grad_clip=1.0, decay_lr=True, warmup_iters=0, lr_decay_iters=100, min_lr=0.0002, device='mps', dtype='bfloat16', compile=False, patience=5)
num decayed parameter tensors: 11, with



step 0: losses: train:2.6981, train_policy_loss:2.0771, train_value_loss:0.6209, val:2.6985, val_policy_loss:2.0774, val_value_loss:0.6211
saving best checkpoint to /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt
iter 0/100/100: loss 2.6915, policy_loss:2.0719, value_loss:0.6196, time 2.26s, iter_time: 0.00ms
Reloading best model from /Users/rodo/src/rgi3-sync/models/c4-smoketest/0.1/best.pt (val_loss=2.6985)
## train_loss: 2.6973, val_loss: 2.6977, Time taken: 6.518117189407349s, val_policy_loss: 2.0773, val_value_loss: 0.6204, overrides={'batch_size': 16, 'beta1': 0.9, 'beta2': 0.98, 'bias': False, 'decay_lr': True, 'dropout': 0.0, 'dtype': 'float16', 'grad_clip': 1.0, 'gradient_accumulation_steps': 1, 'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40', 'learning_rate': 0.002, 'lr_decay_iters': 100, 'max_epochs': 1000000, 'max_iters': 100, 'min_lr': 0.0002, 'n_embd': 32, 'n_head': 1, 'n_layer': 2, 'warmup_iters': 0, 'weight_decay': 0.01, 'mode

(True,
 2.697672313451767,
 6.384593725204468,
 {'batch_size': 16,
  'beta1': 0.9,
  'beta2': 0.98,
  'bias': False,
  'decay_lr': True,
  'dropout': 0.0,
  'dtype': 'bfloat16',
  'grad_clip': 1.0,
  'gradient_accumulation_steps': 1,
  'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40',
  'learning_rate': 0.002,
  'lr_decay_iters': 100,
  'max_epochs': 1000000,
  'max_iters': 100,
  'min_lr': 0.0002,
  'n_embd': 32,
  'n_head': 1,
  'n_layer': 2,
  'n_max_context': 44,
  'warmup_iters': 0,
  'weight_decay': 0.01,
  'model_name': 'c4-smoketest',
  'model_version': '0.1',
  'num_players': 2,
  'vocab_size': 8,
  'dataset_paths': (PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-31'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-32'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-33'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-34'),


In [10]:
# Same Tuner construction again, with target_improvement_per_minute=0.0001
# (the smallest value tried in this series of cells).
# Rebinds the notebook-level name `tuner` to the new instance.
tuner = Tuner(
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(), 
    computed_tune_options=computed_tune_options.copy(),
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.0001)
# Bare last expression: the notebook displays autotune_smart()'s return value.
tuner.autotune_smart()

Using initial model as baseline.
## Initial Model, loss=2.697672313451767 elapsed=6.384593725204468s, val_policy=2.0773, val_value=0.6204
## Searching generation 0 with 17 candidates, including ['learning_rate: 0.002 -> 0.001', 'learning_rate: 0.002 -> 0.005', 'max_iters: 100 -> 300', 'beta2: 0.98 -> 0.99', 'decay_lr: True -> False']
## improved: False, loss=2.6977 elapsed=6.60s, mutation learning_rate: 0.002 -> 0.001
## improved: False, loss=2.6977 elapsed=6.56s, mutation learning_rate: 0.002 -> 0.005
## improved: False, loss=2.6977 elapsed=9.27s, mutation max_iters: 100 -> 300
## improved: False, loss=2.6977 elapsed=6.48s, mutation beta2: 0.98 -> 0.99
## improved: False, loss=2.6977 elapsed=6.49s, mutation decay_lr: True -> False
## improved: False, loss=2.6977 elapsed=12.14s, mutation dropout: 0.0 -> 0.01
## improved: False, loss=2.6977 elapsed=7.50s, mutation beta1: 0.9 -> 0.95
## improved: False, loss=2.6977 elapsed=6.93s, mutation weight_decay: 0.01 -> 0.05
## improved: False, lo

(False,
 2.697672313451767,
 6.384593725204468,
 {'batch_size': 16,
  'beta1': 0.9,
  'beta2': 0.98,
  'bias': False,
  'decay_lr': True,
  'dropout': 0.0,
  'dtype': 'bfloat16',
  'grad_clip': 1.0,
  'gradient_accumulation_steps': 1,
  'last_file': '/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-40',
  'learning_rate': 0.002,
  'lr_decay_iters': 100,
  'max_epochs': 1000000,
  'max_iters': 100,
  'min_lr': 0.0002,
  'n_embd': 32,
  'n_head': 1,
  'n_layer': 2,
  'n_max_context': 44,
  'warmup_iters': 0,
  'weight_decay': 0.01,
  'model_name': 'c4-smoketest',
  'model_version': '0.1',
  'num_players': 2,
  'vocab_size': 8,
  'dataset_paths': (PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-31'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-32'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-33'),
   PosixPath('/Users/rodo/src/rgi3-sync/experiments/smoketest-e2e-v3/data/gen-34'),

# Sanity check models


In [11]:
# NOTE(review): this looks like a deliberate stop - raising here halts a
# top-to-bottom "Run All", so the cells below run only when executed by hand.
raise NotImplementedError("Skip this...")

NotImplementedError: Skip this...

In [None]:
reload_local_modules(verbose=False)

# Fresh autotune pass; the Tuner receives copies of the notebook-level
# parameter sets.
tuner = Tuner(
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(),
    computed_tune_options=computed_tune_options.copy(),
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.001)

tuner_result = tuner.autotune_smart()

best_params = tuner.best_params.copy()

# ---------------------------------------------------------------------------
# Manual overrides applied on top of the tuned parameters.
#
# The numbers below were recorded from an earlier run of this cell. That run
# used batch_size=512, n_embd=64, n_head=8, n_layer=4, beta2=0.99,
# weight_decay=0.2, warmup_iters=1000, lr_decay_iters=5000, min_lr=0.001,
# dtype='float16', trained on smoketest-e2e gen-1..3 plus smoketest-e2e-v2
# gen-4..20. (The raw pasted log used to live here; one of its lines had lost
# its leading '#', which made the whole cell a SyntaxError.)
#
#   max_iters | train  | val    | val_policy | val_value | elapsed  | param_hash
#   ----------+--------+--------+------------+-----------+----------+-----------
#   30_000    | 2.2821 | 2.3249 | 1.7480     | 0.5769    | 1651.8s  | 03189f0f
#   10_000    | 2.2973 | 2.3171 | 1.7487     | 0.5684    |  529.6s  | c81f68b9
#    5_000    | 2.3060 | 2.3194 | 1.7505     | 0.5688    |  267.9s  | a1194e1f
#    3_000    | 2.3192 | 2.3264 | 1.7533     | 0.5730    |  172.2s  | 9e0204ed
# ---------------------------------------------------------------------------
best_params['max_iters'] = 30_000

#   learning_rate | train  | val    | val_policy | val_value | elapsed  | param_hash
#   --------------+--------+--------+------------+-----------+----------+-----------
#   0.01          | 2.2821 | 2.3249 | 1.7480     | 0.5769    | 1651.8s  | 03189f0f
#   0.0005        | 2.2303 | 2.4826 | 1.7556     | 0.7271    | 1617.2s  | 4364fb3c
#
# NOTE(review): in the recorded results 0.0005 has a lower train loss but a
# clearly worse val loss (value head) than 0.01; it is left active here only
# because that is what the cell was last set to - confirm this is intended.
best_params['learning_rate'] = 0.0005

print(f'\n## Recalculating with best_params = {best_params}')
best_params = tuner._recalculate_tunable_params(best_params)
best_model = tuner.get_model_for_params(best_params)
print(tuner.train_and_compute_loss(best_params, reload_model=True)[2])



In [None]:
# Print dataset statistics for the trajectory paths returned by
# experiment_runner.get_trajectory_paths(NUM_GENERATIONS), passing the
# best_model built in the retraining cell above and the game.
# Assigns the notebook-level name `dataset_paths`.
dataset_paths = experiment_runner.get_trajectory_paths(NUM_GENERATIONS)
print_dataset_stats(dataset_paths, n_max_context, action_vocab, model=best_model, game=game)

In [None]:
# Bare expression: the notebook displays the repr of best_model.
best_model

In [None]:
# Inspect training data: build one TrajectoryDataset per generation, reading
# the splits "gen-1" .. f"gen-{NUM_GENERATIONS}" from DATA_DIR with
# block_size=n_max_context. td_array[i] therefore holds generation i + 1.
td_array = [TrajectoryDataset(DATA_DIR, f"gen-{generation_id}", block_size=n_max_context) for generation_id in range(1, NUM_GENERATIONS+1)]

In [None]:
# Aggregate outcomes by action-sequence prefix.
#
# dd[prefix][key] accumulates d.value[0] over every trajectory whose action
# sequence starts with `prefix` (a tuple of the first 1..MAX_PREFIX_LEN
# actions). `key` is either the 1-based generation number or '*', which
# accumulates across all generations. The empty prefix is intentionally not
# recorded.
MAX_PREFIX_LEN = 10

# Flatten td_array into (generation, trajectory) pairs; td_array[i] is generation i + 1.
unrolled = [(generation+1, d) for generation, td in enumerate(td_array) for d in td]

# Every (prefix, key) slot starts as its own fresh two-element zero tensor.
dd = defaultdict(lambda: defaultdict(lambda: torch.tensor([0., 0.])))

for gen, d in unrolled:
    # Cap at the trajectory length: slicing past the end would yield the same
    # full-length prefix again and add this trajectory to that key repeatedly.
    longest_prefix = min(MAX_PREFIX_LEN, len(d.action))
    for prefix_len in range(1, longest_prefix + 1):
        prefix_key = tuple(d.action[:prefix_len].tolist())
        for g in ('*', gen):
            dd[prefix_key][g] += d.value[0]

print(f"len(dd) = {len(dd)}")


In [None]:
def eval_prefix(model, game, prefix):
    """Evaluate `model` on the position reached by replaying `prefix`.

    A fresh ActionHistoryTransformerEvaluator is built around `model` (using
    the notebook globals `device`, `n_max_context` and `action_vocab`), the
    actions in `prefix` are applied one by one from the game's initial state,
    and the evaluator's result for the resulting state is returned.
    """
    evaluator = ActionHistoryTransformerEvaluator(
        model,
        device=device,
        block_size=n_max_context,
        vocab=action_vocab,
    )
    position = game.initial_state()
    for move in prefix:
        position = game.next_state(position, move)
    return evaluator.evaluate(game, position, game.legal_actions(position))


In [None]:
## Something is borked? Player1 win percent should be much higher??
def compare_model_vs_data(model, game, dd, top_k=20):
    """Print empirical outcome totals next to the model's estimate for common prefixes.

    Args:
        model: network passed through to `eval_prefix`.
        game: game object passed through to `eval_prefix`.
        dd: mapping prefix -> {bucket: tensor of accumulated values}; only the
            '*' (all generations) bucket is read here.
        top_k: number of prefixes to report, ranked by the total accumulated
            value in their '*' bucket (default 20).
    """
    ranked = sorted(dd.items(), key=lambda kv: kv[1]['*'].sum(), reverse=True)[:top_k]
    # Report in lexicographic prefix order so related openings appear together.
    prefix_list = sorted(prefix for prefix, _ in ranked)

    for prefix in prefix_list:
        print(f"\nprefix={prefix}")
        counts = dd[prefix]['*']
        total = sum(counts)
        print(f"gen=*: {counts}, win_pct={100*counts[0]/total:.2f}%, sum={total}")
        # NOTE(review): the first prefix entry is skipped before replaying —
        # presumably a start-of-sequence token rather than a game action; confirm.
        actions = prefix[1:]
        eval_result = eval_prefix(model, game, actions)
        # Map values from [-1, 1] to [0, 1] so they are comparable with win_pct.
        # NOTE(review): assumes player_values lies in [-1, 1] — confirm.
        print(f'player_probs={(eval_result.player_values+1)/2}')

compare_model_vs_data(current_model, game, dd)


In [None]:
# Reference models for comparison: a seeded model from create_random_model
# and, when generations were run, the model returned by load_model(1).
model_0 = create_random_model(
    model_config,
    action_vocab_size=action_vocab.vocab_size,
    num_players=game.num_players(state_0),
    seed=42,
    device=device,
)
if RUN_GENERATIONS:
    model_1 = load_model(1)


In [None]:
# Dump model_0's action-embedding weights, then compare its value estimates
# with the aggregated training data in dd.
print("\n\n### Model 0")
print(model_0.action_embedding.weight)
compare_model_vs_data(model_0, game, dd)

In [None]:
# model_1 is only assigned when RUN_GENERATIONS is true, so the report is
# guarded by the same flag.
if RUN_GENERATIONS:
    print("\n\n### Model 1")
    print(model_1.action_embedding.weight)
    compare_model_vs_data(model_1, game, dd)

## Run tournament to calculate ELO


In [None]:
import asyncio
import numpy as np
from contextlib import asynccontextmanager
from rgi.rgizero.tournament import Tournament
from rgi.rgizero.players.alphazero import AlphazeroPlayer
from rgi.rgizero.models.action_history_transformer import ActionHistoryTransformerEvaluator, AsyncNetworkEvaluator

@asynccontextmanager
async def create_player_factory(model, simulations, game, device, block_size, action_vocab, max_batch_size):
    """
    Creates a shared evaluator and returns a factory function that produces
    new AlphazeroPlayer instances using that shared evaluator.

    Args:
        model: network wrapped by the ActionHistoryTransformerEvaluator.
        simulations: `simulations` value given to every AlphazeroPlayer.
        game: game object given to every AlphazeroPlayer.
        device: device passed to the evaluator.
        block_size: block size passed to the evaluator.
        action_vocab: vocabulary passed to the evaluator.
        max_batch_size: batch-size limit of the AsyncNetworkEvaluator.

    Yields:
        A zero-argument callable returning a new AlphazeroPlayer (with noise
        enabled and its own RNG) that shares the async evaluator.
    """
    # 1. Setup the shared evaluator.
    # Use the caller-supplied block_size; previously the parameter was ignored
    # in favour of the notebook global n_max_context.
    serial_evaluator = ActionHistoryTransformerEvaluator(
        model,
        device=device,
        block_size=block_size,
        vocab=action_vocab
    )
    async_evaluator = AsyncNetworkEvaluator(
        base_evaluator=serial_evaluator,
        max_batch_size=max_batch_size,
        verbose=False
    )

    # 2. Start the evaluator background task
    await async_evaluator.start()

    try:
        # 3. Define the factory. This is called by Tournament for every game.
        # It creates a NEW player instance but uses the SHARED async_evaluator.
        def player_factory():
            # Create a fresh RNG for each game/player instance, seeded from
            # NumPy's global random state.
            rng = np.random.default_rng(np.random.randint(0, 2**31))
            return AlphazeroPlayer(
                game,
                async_evaluator,
                rng=rng,
                add_noise=True,
                simulations=simulations
            )

        yield player_factory

    finally:
        # 4. Cleanup: stop the evaluator even if the body of the `async with` raised.
        await async_evaluator.stop()

async def run_tournament_async():
    """Run an Elo tournament between AlphaZero players backed by selected models.

    Opens one `create_player_factory` context per enabled entry (currently
    model_dict[0], model_dict[5] and model_dict[20], each with 200 simulations
    and max_batch_size 10), registers the resulting factories with a
    `Tournament` starting at Elo 1000, runs 100 games and prints the standings.
    The evaluator contexts are closed when the `async with` block exits.

    NOTE(review): relies on the notebook globals `model_dict`, `game`,
    `device`, `block_size` and `action_vocab` being defined by earlier cells
    — confirm, in particular `block_size`.
    """
    # Use async with to manage the lifecycle of the evaluators
    async with (
        # create_player_factory(model_dict[0], 100, game, device, block_size, action_vocab, 10) as factory_gen0_100,
        # create_player_factory(model_dict[1], 100, game, device, block_size, action_vocab, 10) as factory_gen1_100,
        # create_player_factory(model_dict[2], 100, game, device, block_size, action_vocab, 10) as factory_gen2_100,
        # create_player_factory(model_dict[3], 100, game, device, block_size, action_vocab, 10) as factory_gen3_100,
        # create_player_factory(model_dict[4], 100, game, device, block_size, action_vocab, 10) as factory_gen4_100,
        # create_player_factory(model_dict[5], 100, game, device, block_size, action_vocab, 10) as factory_gen5_100,
        # create_player_factory(model_dict[10], 100, game, device, block_size, action_vocab, 10) as factory_gen6_100,
        # create_player_factory(model_dict[15], 100, game, device, block_size, action_vocab, 10) as factory_gen7_100,
        # create_player_factory(model_dict[20], 100, game, device, block_size, action_vocab, 10) as factory_gen8_100,

        create_player_factory(model_dict[0], 200, game, device, block_size, action_vocab, 10) as factory_gen0_200,
        #create_player_factory(model_dict[1], 200, game, device, block_size, action_vocab, 10) as factory_gen1_200,
        #create_player_factory(model_dict[2], 200, game, device, block_size, action_vocab, 10) as factory_gen2_200,
        #create_player_factory(model_dict[3], 200, game, device, block_size, action_vocab, 10) as factory_gen3_200,
        #create_player_factory(model_dict[4], 200, game, device, block_size, action_vocab, 10) as factory_gen4_200,
        create_player_factory(model_dict[5], 200, game, device, block_size, action_vocab, 10) as factory_gen5_200,
        #create_player_factory(model_dict[10], 200, game, device, block_size, action_vocab, 10) as factory_gen10_200,
        #create_player_factory(model_dict[15], 200, game, device, block_size, action_vocab, 10) as factory_gen15_200,
        create_player_factory(model_dict[20], 200, game, device, block_size, action_vocab, 10) as factory_gen20_200,
        ):

        # The dictionary now maps names to FACTORIES (Callables), not Player instances
        player_factories = {
            # "factory_gen0_100": factory_gen0_100,
            # "factory_gen1_100": factory_gen1_100,
            # "factory_gen2_100": factory_gen2_100,
            # "factory_gen3_100": factory_gen3_100,
            # "factory_gen4_100": factory_gen4_100,
            # "factory_gen5_100": factory_gen5_100,
            # "factory_gen6_100": factory_gen6_100,
            # "factory_gen7_100": factory_gen7_100,

            "factory_gen0_200": factory_gen0_200,
            #"factory_gen1_200": factory_gen1_200,
            #"factory_gen2_200": factory_gen2_200,
            #"factory_gen3_200": factory_gen3_200,
            #"factory_gen4_200": factory_gen4_200,
            "factory_gen5_200": factory_gen5_200,
            #"factory_gen10_200": factory_gen10_200,
            #"factory_gen15_200": factory_gen15_200,
            "factory_gen20_200": factory_gen20_200,
        }

        tournament = Tournament(game, player_factories, initial_elo=1000)

        print("Running tournament...")
        # await tournament.run(num_games=1_000, concurrent_games=2000)
        await tournament.run(num_games=100, concurrent_games=2000)
        tournament.print_standings()

# RUN_TOURNAMENT = True
# NOTE(review): RUN_TOURNAMENT is not assigned in this cell (the line above is
# commented out); it must already exist in the kernel or this raises NameError
# — confirm where it is defined.
# Top-level `await` relies on the notebook's already-running event loop.
if RUN_TOURNAMENT:
    await run_tournament_async()

# Running tournament...
# Tournament Progress: 100%|██████████| 10000/10000 [1:25:59<00:00,  1.94it/s]

# Tournament Standings:
# Rank  Player               ELO        Games    W-L-D          
# -----------------------------------------------------------------
# 1     factory_gen6_200     1140.5     1247     827-419-1      
# 2     factory_gen2_200     1100.1     1251     693-554-4      
# 3     factory_gen5_100     1074.4     1251     598-652-1      
# 4     factory_gen3_200     1029.1     1252     674-573-5      
# 5     factory_gen4_200     1027.0     1248     711-536-1      
# 6     factory_gen0_200     1020.0     1254     444-810-0      
# 7     factory_gen5_200     990.2      1248     742-502-4      
# 8     factory_gen7_100     987.5      1250     650-597-3      
# 9     factory_gen7_200     979.2      1248     768-476-4      
# 10    factory_gen2_100     974.0      1249     522-723-4      
# 11    factory_gen6_100     966.6      1248     684-564-0      
# 12    factory_gen4_100     964.2      1251     557-693-1      
# 13    factory_gen1_100     962.5      1252     547-705-0      
# 14    factory_gen3_100     947.0      1251     528-723-0      
# 15    factory_gen1_200     941.1      1252     630-620-2      
# 16    factory_gen0_100     896.5      1248     410-838-0     


## 20 generations.
# Running tournament...
# Tournament Progress: 100%|██████████| 1000/1000 [08:35<00:00,  1.94it/s]

# Tournament Standings:
# Rank  Player               ELO        Games    W-L-D          
# -----------------------------------------------------------------
# 1     factory_gen10_200    1114.2     333      212-120-1      
# 2     factory_gen2_200     1032.6     333      190-141-2      
# 3     factory_gen1_200     1003.9     334      159-175-0      
# 4     factory_gen20_200    1000.9     335      171-164-0      
# 5     factory_gen5_200     974.6      331      183-146-2      
# 6     factory_gen0_200     873.8      334      82-251-1  

# Tune Model (continued)


In [None]:
# Tuning pass with target_improvement_per_minute=0.01. Every dict argument is
# copied so the shared notebook-level option dicts are not handed to the tuner.
tuner = Tuner(
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.01,
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(),
    computed_tune_options=computed_tune_options.copy(),
)
tuner.autotune_smart()

# Using initial model as baseline.
# ## Initial Model, loss=2.1298508644104004 elapsed=171.78943705558777s
# ## Searching generation 0 with 22 candidates, including ['bias: False -> True', 'learning_rate: 0.005 -> 0.002', 'learning_rate: 0.005 -> 0.002', 'dtype: bfloat16 -> float16', 'weight_decay: 0.1 -> 0.2']
# ## improved: False, loss=2.1332 elapsed=178.64s, mutation bias: False -> True
# ## improved: False, loss=2.1395 elapsed=172.03s, mutation learning_rate: 0.005 -> 0.002
# ## improved: False, loss=2.1395 elapsed=172.03s, mutation learning_rate: 0.005 -> 0.002


In [None]:
# Pick up local source edits before re-creating the tuner.
reload_local_modules(verbose=False)

# Tuning pass with target_improvement_per_minute=0.001. Every dict argument is
# copied so the shared notebook-level option dicts are not handed to the tuner.
tuner = Tuner(
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.001,
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(),
    computed_tune_options=computed_tune_options.copy(),
)
tuner.autotune_smart()
from rgi.rgizero.models.action_history_transformer import ActionHistoryTransformer, ActionHistoryTransformerEvaluator
from rgi.rgizero.models.transformer import TransformerConfig

# Minimal transformer (2 layers, 2 heads, embedding size 8) and an evaluator
# wrapped around it.
# NOTE(review): the config allows n_max_context=100 but the evaluator is built
# with block_size=5 — confirm the mismatch is intentional. Neither tiny_model
# nor tiny_evaluator is referenced again in this part of the notebook.
tiny_config: TransformerConfig = TransformerConfig(n_max_context=100, n_layer=2, n_head=2, n_embd=8)
tiny_model = ActionHistoryTransformer(config=tiny_config, action_vocab_size=action_vocab.vocab_size, num_players=game.num_players(state_0))
tiny_model.to(device)
tiny_evaluator = ActionHistoryTransformerEvaluator(tiny_model, device=device, block_size=5, vocab=action_vocab)


In [None]:
# Tuning pass with target_improvement_per_minute=0.0001. Every dict argument is
# copied so the shared notebook-level option dicts are not handed to the tuner.
tuner = Tuner(
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.0001,
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(),
    computed_tune_options=computed_tune_options.copy(),
)
tuner.autotune_smart()


In [None]:
# Pick up local source edits before re-creating the tuner.
reload_local_modules(verbose=False)

# Tuning pass with target_improvement_per_minute=0.0. Every dict argument is
# copied so the shared notebook-level option dicts are not handed to the tuner.
tuner = Tuner(
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.0,
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(),
    computed_tune_options=computed_tune_options.copy(),
)
tuner.autotune_smart()

## Debug best model

In [None]:
# Reload local modules, then load the tuner's best model and compare its value
# estimates with the aggregated training data in dd.
reload_local_modules(verbose=False)

best_model = tuner.load_best_model()
compare_model_vs_data(best_model, game, dd)


In [None]:
from pprint import pprint
print(f'tuner.best_loss={tuner.best_loss}')
# Round to whole seconds once, then split into minutes and seconds. Truncating
# the minutes but rounding the seconds separately could print e.g. "1m60s".
best_elapsed_min, best_elapsed_sec = divmod(int(round(tuner.best_loss_elapsed)), 60)
print(f'tuner.best_loss_elapsed={best_elapsed_min}m{best_elapsed_sec}s')
pprint(tuner.best_params)
# best_params = tuner.initial_params


## Print tuner stats


In [None]:
# Pick up local source edits, then rebuild the tuner without running a search:
# print_hparam_stats() reports stats based on cached results.
reload_local_modules(verbose=False)
tuner = Tuner(
    cache_version=TUNER_VERSION,
    target_improvement_per_minute=0.001,
    fixed_params=fixed_params.copy(),
    initial_params=initial_params.copy(),
    tune_options=tune_options.copy(),
    computed_tune_options=computed_tune_options.copy(),
)
tuner_stats = tuner.print_hparam_stats()

In [None]:
# Show the raw stats mapping returned by print_hparam_stats() as the cell output.
tuner_stats

In [None]:
# Rank tuner_stats entries by mean_val_delta, largest first, skipping entries
# whose delta is NaN. Each printed row is
# (mean_val_delta, key, mean_val_1, mean_val_2).
for x in sorted(
    (
        (v['mean_val_delta'], k, v['mean_val_1'], v['mean_val_2'])
        for (k, v) in tuner_stats.items()
        if not np.isnan(v['mean_val_delta'])
    ),
    reverse=True,
):
    print(x)


In [None]:
# Rank tuner_stats entries by std_val_delta, largest first, skipping entries
# whose std is NaN. Each printed row is
# (std_val_delta, key, mean_val_1, mean_val_2).
for x in sorted(
    (
        (v['std_val_delta'], k, v['mean_val_1'], v['mean_val_2'])
        for (k, v) in tuner_stats.items()
        if not np.isnan(v['std_val_delta'])
    ),
    reverse=True,
):
    print(x)


## Debug Convergence

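Synthetic sanity-check: train on a toy 2-step game whose outcome is a simple function of the actions played (the fake reward is derived from the mean of the chosen actions), so the moves strongly determine the winner. This verifies that the value head and training loop can learn simple patterns.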


In [None]:
# Deliberate stop: raising here halts a "Run All" at this point.
# NOTE(review): presumably the cells below are meant to be run by hand only — confirm.
raise NotImplementedError("xxx STOP HERE xxx")

In [None]:
# Capture the game's initial state and its full action list, then print the actions.
# NOTE(review): state_0 is already used by cells above this one, so it must
# also be defined earlier in the notebook — confirm.
state_0 = game.initial_state()
all_actions_0 = game.all_actions()

print(all_actions_0)


In [None]:
import random

def play_random_game_with_fake_reward(game, max_actions, rng=None) -> dict:
    """Play up to `max_actions` uniformly random moves and attach a synthetic outcome.

    The reward is NOT the real game result: it is the mean of the chosen
    actions divided by the number of legal actions at the last visited state,
    so it is a deterministic function of the moves played. Actions must
    therefore be numeric.

    Args:
        game: object providing initial_state, all_actions, is_terminal,
            legal_actions and next_state.
        max_actions: maximum number of moves to play.
        rng: optional object with a `randrange` method (e.g.
            `random.Random(seed)`) for reproducible games; defaults to the
            global `random` module.

    Returns:
        dict with keys "winner" (1 if the reward is >= 0.5, else 2), "rewards"
        (np.array([r, 1 - r])), "action_history", "legal_policies" (a uniform
        distribution per step), "final_state" and "legal_action_idx" (indices
        of each step's legal actions within game.all_actions()).

    Raises:
        ValueError: if no move was played (max_actions < 1 or the initial
            state is terminal), because the synthetic reward is undefined.
    """
    picker = random if rng is None else rng

    state = game.initial_state()
    action_history = []
    legal_policies = []
    legal_action_idx_list = []

    all_action_idx_map = {action: idx for idx, action in enumerate(game.all_actions())}

    legal_actions = []
    while not game.is_terminal(state) and len(action_history) < max_actions:
        legal_actions = game.legal_actions(state)
        action = legal_actions[picker.randrange(len(legal_actions))]

        action_history.append(action)
        # The "policy" recorded for training is uniform over the legal moves.
        legal_policies.append(np.ones(len(legal_actions)) / len(legal_actions))
        legal_action_idx_list.append(np.array([all_action_idx_map[a] for a in legal_actions]))

        state = game.next_state(state, action)

    if not action_history:
        raise ValueError("no actions were played; the fake reward is undefined")

    # Determine outcome. The denominator is the legal-move count of the last
    # state a move was chosen from.
    fake_reward = np.mean(action_history) / len(legal_actions)
    rewards = np.array([fake_reward, 1.0 - fake_reward])
    winner = 1 if fake_reward >= 0.5 else 2

    return {
        "winner": winner,
        "rewards": rewards,
        "action_history": action_history,
        "legal_policies": legal_policies,
        "final_state": state,
        "legal_action_idx": legal_action_idx_list,
    }

In [None]:
# Smoke-test: play a single 2-move game and show the returned dict.
play_random_game_with_fake_reward(game, max_actions=2)

In [None]:
# Generate 100,000 synthetic 2-move games and print summary statistics.
# NOTE(review): moves are drawn from the unseeded global `random` module, so
# the dataset differs between runs — seed it if reproducibility matters.
results = [play_random_game_with_fake_reward(game, max_actions=2) for _ in range(100_000)]
print_game_stats(results)


In [None]:
# Write the synthetic games as a trajectory dataset under the name "fake-0".
fake_gen_name = "fake-0"
trajectory_path = write_trajectory_dataset(results, action_vocab, fake_gen_name)


In [None]:
# Train a fresh "large" model on the synthetic dataset only, then save it.
# fake_model_config = model_config_dict[MODEL_SIZE]
fake_model_config = model_config_dict["large"]
fake_model = create_random_model(fake_model_config, action_vocab_size=action_vocab.vocab_size, num_players=game.num_players(state_0), seed=42, device=device)

# NOTE(review): the split is named "gen-fake-0"; confirm this matches the name
# write_trajectory_dataset used when it was given "fake-0".
training_splits = [f'gen-{fake_gen_name}']
fake_model, fake_trainer = train_model(fake_model, training_splits, train_config)
save_model(fake_model, fake_trainer, fake_gen_name)

## model_size=tiny
# num decayed parameter tensors: 11, with 1,968 parameters
# num non-decayed parameter tensors: 7, with 50 parameters
# using fused AdamW: False
# step 0: train loss 2.7817, val loss 2.7816
# iter 0/49/488: loss 2.7821, time 2537.56ms
# iter 100/147/488: loss 2.6890, time 53.61ms
# iter 200/245/488: loss 2.6342, time 63.05ms
# iter 300/343/488: loss 2.6187, time 55.31ms
# iter 400/441/488: loss 2.6147, time 61.11ms

## model_size=large
# num decayed parameter tensors: 35, with 1,579,776 parameters
# num non-decayed parameter tensors: 19, with 2,186 parameters
# using fused AdamW: False
# step 0: train loss 2.8087, val loss 2.8088
# iter 0/49/488: loss 2.8099, time 11225.20ms
# iter 100/147/488: loss 2.6065, time 596.91ms
# iter 200/245/488: loss 2.6075, time 618.00ms
# iter 300/343/488: loss 2.6080, time 613.63ms
# iter 400/441/488: loss 2.6051, time 616.39ms

In [None]:
# for rerun in range(10):
#     print(f"Re-running training for {fake_gen_name} {rerun+1} of 10")
#     fake_model, fake_trainer = train_model(fake_model, training_splits, train_config)
#     save_model(fake_model, fake_trainer, fake_gen_name)

In [None]:
# One TrajectoryDataset per synthetic split, flattened into
# (generation_number, sample) pairs.
fake_td_array = [TrajectoryDataset(DATA_DIR, split, block_size=n_max_context) for split in training_splits]
fake_unrolled = [(generation+1, d) for generation, td in enumerate(fake_td_array) for d in td]

# Longest action prefix to aggregate; the empty prefix (length 0) is included.
FAKE_MAX_PREFIX_LEN = 2

# Inspect training data: fake_dd[prefix][bucket] accumulates d.value[0] over
# every sample whose action sequence starts with `prefix`. `bucket` is either
# the generation number or '*' (all generations pooled).
fake_dd = defaultdict(lambda: defaultdict(lambda: torch.tensor([0., 0.])))

for gen, d in fake_unrolled:
    prefixes = [tuple(d.action[:k].tolist()) for k in range(FAKE_MAX_PREFIX_LEN + 1)]
    for g in ['*', gen]:
        for prefix in prefixes:
            fake_dd[prefix][g] += d.value[0]

print(f"len(fake_dd) = {len(fake_dd)}")


In [None]:
# Reload the model trained on the synthetic games and compare it with the
# statistics of that same synthetic data (fake_dd). Passing `dd`, which holds
# the real self-play data, would compare it against the wrong targets.
fake_model = load_model(fake_gen_name)
compare_model_vs_data(fake_model, game, fake_dd)


In [None]:
# Run another training pass on the synthetic splits, continuing from the
# current fake_model.
fake_model, fake_trainer = train_model(fake_model, training_splits, train_config)
