# ART Training for Text-2048 Environment

This notebook trains an agent on the text-2048 environment using:
- ART (Agent Reinforcement Training) framework
- HUD SDK for MCP agent implementation
- Dockerized text-2048 environment
- Tasks from a Hugging Face dataset (`hud-evals/2048-taskset`)

**Requirements**: A GPU runtime. In Google Colab, enable one via Runtime → Change runtime type → T4 GPU.


## 1. Install Dependencies


In [None]:
# Install the HUD SDK, ART, and the other packages this notebook relies on.
# `-q` keeps pip's output quiet.
# NOTE(review): a later cell runs `import dotenv`, but python-dotenv is not
# listed here — presumably it is pulled in transitively; confirm, or add it.
!pip install -q hud-python openpipe-art datasets weave openai mcp


## 2. Configuration

Set your API keys and training parameters here:


In [None]:
import os

# Set API keys (optional)
# For Colab secrets: Settings → Secrets → Add a new secret
# os.environ["WANDB_API_KEY"] = ""  # Optional for experiment tracking
# os.environ["HUD_API_KEY"] = ""    # Optional for HUD telemetry

# ---- Model identity ----
# Hugging Face id of the base checkpoint (3B model works well on T4).
BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"
# Name the trainable model is created under, and the project it belongs to.
MODEL_NAME = "mcprl-3b-2048"
PROJECT_NAME = "2048-mcp-rl"

# ---- Training hyper-parameters ----
TRAINING_CONFIG = dict(
    num_training_inputs=16,  # how many tasks are taken from the dataset
    groups_per_step=2,       # trajectory groups per training step
    num_epochs=1,            # passes over the selected tasks
    rollouts_per_group=4,    # rollouts generated for each group
    learning_rate=1e-5,      # learning rate handed to the trainer
)

# Upper bound on the number of agent steps in a single episode.
MAX_STEPS = 10


## 3. Import Libraries and Define Agent


In [None]:
import asyncio
import json
import os
import random
from typing import Awaitable

import dotenv
dotenv.load_dotenv()

import art
from art.local import LocalBackend
from art.utils import iterate_dataset
import weave


In [None]:
# Import HUD SDK components
from hud.agents import ArtHUDAgent
from hud.client import MCPClient
from hud.datasets import TaskConfig


In [None]:

@weave.op()
async def rollout(model: art.Model, task_dict: dict) -> art.Trajectory:
    """Run one episode with ``ArtHUDAgent`` and package it as an ART trajectory.

    Args:
        model: The ART model handed to ``ArtHUDAgent`` to drive the episode.
        task_dict: Keyword arguments for ``TaskConfig``. It is also stored
            verbatim in the trajectory metadata under ``"task"``. An ``"id"``
            entry, when present, labels the HUD trace.

    Returns:
        The finished ``art.Trajectory`` whose reward is ``trace.reward`` as
        returned by the agent run.

    Raises:
        Any exception raised while building the task config, initializing or
        running the agent, or building the trajectory is propagated. The MCP
        client is closed before it propagates.
    """
    import hud

    # Look the id up once, tolerantly, so the trace label and the trace's
    # task_id always agree and a missing id does not raise KeyError here.
    task_id = task_dict.get("id")

    with hud.trace(f"Art Rollout {task_id}", root=True, task_id=task_id):
        task_config = TaskConfig(**task_dict)

        mcp_client = MCPClient(task_config.mcp_config)
        try:
            # The agent is restricted to the single "move" tool.
            agent = ArtHUDAgent(model, mcp_client, allowed_tools=["move"])
            await agent.initialize()

            trace = await agent.run(task_config, max_steps=MAX_STEPS)

            traj = art.Trajectory(
                messages_and_choices=agent.messages_and_choices,
                tools=agent.get_tool_schemas(),
                reward=trace.reward,
                metadata={"task": task_dict},
                metrics={
                    "task_completed": trace.done,
                    "success": trace.reward > 0,
                    "reward": trace.reward,
                    "ran_out_of_steps": not trace.done,
                },
            )
        finally:
            # Always release the MCP client, even when the episode fails,
            # so a failed rollout does not leak its connection.
            await mcp_client.close()

        return traj.finish()



In [None]:
random.seed(42)

# ---- Optional W&B ----
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")
if WANDB_API_KEY:
    weave.init(PROJECT_NAME)
    os.environ["WANDB_API_KEY"] = WANDB_API_KEY

# ---- Build / register model ----
model = art.TrainableModel(
    name=MODEL_NAME,
    project=PROJECT_NAME,
    base_model=BASE_MODEL,
)

backend = LocalBackend(in_process=True, path="./.art")
await model.register(backend)

print("Model created and registered")

# ---- Generate training scenarios  --------------------------------------------------
from datasets import load_dataset

dataset = load_dataset("hud-evals/2048-taskset", split="train")
dataset = list(dataset.shuffle(seed=42))[:TRAINING_CONFIG["num_training_inputs"]]

train_iterator = iterate_dataset(
    dataset,
    groups_per_step=TRAINING_CONFIG["groups_per_step"],
    num_epochs=TRAINING_CONFIG["num_epochs"],
    initial_step=await model.get_step(),
)

# ---- Training loop ----
for batch in train_iterator:
    print(f"\n=== Training step {batch.step} ===")

    groups: list[Awaitable[art.TrajectoryGroup]] = []
    for task_dict in batch.items:
        group = art.TrajectoryGroup(
            rollout(model, task_dict)
            for _ in range(TRAINING_CONFIG["rollouts_per_group"])
        )
        groups.append(group)

    print("Gathering trajectory groups…")
    gathered = await art.gather_trajectory_groups(groups)

    # We already have rewards from environment evaluation
    await model.train(
        gathered,
        config=art.TrainConfig(learning_rate=TRAINING_CONFIG["learning_rate"]),
    )
    print("✅ step complete, model checkpoint saved\n")

print("Training finished, checkpoints stored in ./.art/")