# slim_pretrain Smoke Test

This notebook performs a small end-to-end pretraining smoke test for the self-contained `slim_pretrain` package.

In [1]:
from pathlib import Path
import sys

# Locate the repository root: the closest directory, starting from the current
# working directory and moving upward, that contains a `slim_pretrain` entry.
search_start = Path.cwd().resolve()
for candidate in (search_start, *search_start.parents):
    if (candidate / "slim_pretrain").exists():
        break
else:
    # Every ancestor up to the filesystem root was checked without a match.
    raise RuntimeError("Could not find repo root containing slim_pretrain/.")

repo_root = candidate

# Put the root at the front of the import path, but only once, so that
# re-running this cell does not pile up duplicate entries.
repo_root_entry = str(repo_root)
if repo_root_entry not in sys.path:
    sys.path.insert(0, repo_root_entry)

from slim_pretrain.pretrain.data import VariableBatchSpec
from slim_pretrain.pretrain.train import (
    DataCurriculumConfig,
    ModelConfig,
    OptimConfig,
    PretrainConfig,
    default_base_prior_config,
    pretrain_nano_tabpfn_pu,
)

# Show which directory was selected as the repository root.
print("Repo root: " + str(repo_root))

Repo root: /Users/qltian/Library/CloudStorage/GoogleDrive-qltian2021@gmail.com/Other computers/My Laptop/Documents/Research/ai/prior_gen


In [2]:
# Baseline prior configuration provided by the package.
base_cfg = default_base_prior_config()

# Build the smoke-test configuration piece by piece (small sizes, CPU only),
# then combine the pieces into the top-level PretrainConfig.
model_cfg = ModelConfig(
    embedding_size=32,
    num_attention_heads=4,
    mlp_hidden_size=64,
    num_layers=2,
    num_outputs=2,
)

optim_cfg = OptimConfig(
    base_lr=1e-3,
    min_lr=1e-4,
    weight_decay=0.0,
    warmup_steps=5,
    beta1=0.9,
    beta2=0.95,
    grad_clip_norm=1.0,
)

batch_spec = VariableBatchSpec(
    batch_size=4,
    seq_len_range=(48, 64),
    num_features_range=(4, 10),
    train_ratio_range=(0.6, 0.8),
    pu_row_policy="drop",
)

data_cfg = DataCurriculumConfig(
    total_stages=2,
    steps_per_stage=3,
    batch_spec=batch_spec,
)

# NOTE(review): max_steps=6 equals total_stages * steps_per_stage (2 * 3);
# presumably these are meant to stay in sync -- confirm before changing one.
cfg = PretrainConfig(
    model=model_cfg,
    optim=optim_cfg,
    data=data_cfg,
    device="cpu",
    seed=42,
    log_every=1,
    max_steps=6,
)

# Echo the values the config object reports so the run is easy to sanity-check.
print("Configured steps:", cfg.total_steps)
print("Configured stages:", cfg.data.total_stages)

Configured steps: 6
Configured stages: 2


In [3]:
import time

# Run the smoke pretraining once and measure how long it takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() follows the wall clock and can jump (NTP sync, manual
# clock change), which would corrupt the reported duration.
t0 = time.perf_counter()
result = pretrain_nano_tabpfn_pu(base_cfg=base_cfg, config=cfg)
elapsed = time.perf_counter() - t0

# The returned mapping exposes the per-step records under "history".
history = result["history"]
print(f"Smoke pretraining done in {elapsed:.2f}s with {len(history)} steps.")
print("Last step record:")
# Bare last expression so the notebook renders the final record as cell output.
history[-1]

step=1/6 stage=1 lr=0.000200 loss=0.6882 loss_ema=0.6882 pu_rate=1.00
step=2/6 stage=1 lr=0.000400 loss=0.6882 loss_ema=0.6882 pu_rate=1.00
step=3/6 stage=1 lr=0.000600 loss=0.7013 loss_ema=0.6888 pu_rate=1.00
step=4/6 stage=2 lr=0.000800 loss=0.6587 loss_ema=0.6873 pu_rate=1.00
step=5/6 stage=2 lr=0.001000 loss=0.6723 loss_ema=0.6866 pu_rate=1.00
step=6/6 stage=2 lr=0.001000 loss=0.7134 loss_ema=0.6879 pu_rate=1.00
Smoke pretraining done in 0.79s with 6 steps.
Last step record:


{'step': 5,
 'stage': 2,
 'lr': 0.001,
 'loss': 0.7133632898330688,
 'loss_ema': 0.6879215244057775,
 'eval_loss': nan,
 'is_causal': 0.0,
 'num_layers': 5.5,
 'hidden_dim': 12.0,
 'pu_keep_probability': 0.0,
 'batch_pu_rate': 1.0,
 'batch_removed_rows_mean': 17.75}

In [4]:
import math

# Sanity checks on the per-step records collected in `history`.
# One record is expected per configured training step.
assert len(history) == cfg.total_steps, "History length mismatch."
assert all(float(r["pu_keep_probability"]) == 0.0 for r in history), "PU should be always-on."
# Stage indices in the records are 1-based and bounded by the configured stage count.
assert all(1 <= int(r["stage"]) <= cfg.data.total_stages for r in history), "Invalid stage index in history."
# A plain `>= 0.0` comparison accepts +inf, so a diverged run would still pass;
# require finite losses explicitly before checking the sign.
assert all(math.isfinite(float(r["loss"])) for r in history), "Loss must be finite."
assert all(float(r["loss"]) >= 0.0 for r in history), "Loss must be non-negative."

print("SMOKE TEST PASSED")
print("First step:", history[0])
print("Last step:", history[-1])

SMOKE TEST PASSED
First step: {'step': 0, 'stage': 1, 'lr': 0.0002, 'loss': 0.6881925463676453, 'loss_ema': 0.6881925463676453, 'eval_loss': nan, 'is_causal': 0.0, 'num_layers': 2.0, 'hidden_dim': 8.0, 'pu_keep_probability': 0.0, 'batch_pu_rate': 1.0, 'batch_removed_rows_mean': 20.5}
Last step: {'step': 5, 'stage': 2, 'lr': 0.001, 'loss': 0.7133632898330688, 'loss_ema': 0.6879215244057775, 'eval_loss': nan, 'is_causal': 0.0, 'num_layers': 5.5, 'hidden_dim': 12.0, 'pu_keep_probability': 0.0, 'batch_pu_rate': 1.0, 'batch_removed_rows_mean': 17.75}
