# RL on 2048

This notebook trains an agent on the HUD 2048 environment using:
- ART (Agent Reinforcement Training) framework with GRPO
- HUD SDK for MCP agent implementation
- HUD remote text-2048 environment
- Tasks from HuggingFace dataset

**Requirements**: Enable GPU (Runtime → Change runtime type → T4 GPU)

Running this notebook end to end, including the full training run, takes around 2 hours.

## 1. Install Dependencies


In [None]:
# Install the pinned ART release together with its supporting libraries (quiet output, no package cache).
!uv pip install -q openpipe-art==0.3.11.post5 langchain-core tenacity "mcp>=1.11.0" "gql<4" aiohttp --no-cache-dir

In [None]:
# Install the HUD SDK from the `l/text-2048` git ref; `--no-deps` means its dependencies are not pulled in by this command.
! uv pip install -q --upgrade git+https://github.com/hud-evals/hud-python.git@l/text-2048 --no-deps

In [None]:
# Install fastmcp, mcp_use and anthropic (presumably the dependencies skipped by the --no-deps HUD install — confirm).
! uv pip install -q fastmcp mcp_use anthropic --no-cache-dir

In [None]:
# Install the hud-evals build of the MCP Python SDK from git.
# NOTE(review): presumably this supersedes the `mcp` package installed earlier — confirm.
! uv pip install -q --upgrade git+https://github.com/hud-evals/mcp-python-sdk.git --no-cache-dir

In [None]:
# Pin the OpenAI client library to an exact version.
! uv pip install openai==1.99.9

## 2. Configuration

Set your API keys and training parameters here:


In [None]:
import os

# Set API keys
# For Colab secrets: Secrets → Add a new secret
try:
    from google.colab import userdata
except ImportError:
    # Not running inside Colab: the keys must already be in the process
    # environment (or be provided through a .env file).
    userdata = None


def _export_colab_secret(name: str, *, required: bool = False) -> None:
    """Copy the Colab secret ``name`` into ``os.environ``.

    Does nothing when the notebook is not running in Colab. A secret that is
    missing, or that this notebook has not been granted access to, is skipped
    unless ``required`` is true, in which case the Colab error propagates.
    """
    if userdata is None:
        return
    try:
        value = userdata.get(name)
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        if required:
            raise
        return
    # os.environ only accepts strings; an empty value is treated as "not set".
    if value:
        os.environ[name] = value


_export_colab_secret("HF_TOKEN")  # Optional for private datasets
_export_colab_secret("WANDB_API_KEY")  # Optional for experiment tracking
# Used as the bearer token in MCP_CONFIG, so a missing secret is an error in Colab.
_export_colab_secret("HUD_API_KEY", required=True)

# Model configuration
BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct"  # 3B model works on T4
MODEL_NAME = "mcprl-3b-2048"
PROJECT_NAME = "2048-mcp-rl-3"

# Training configuration
MAX_STEPS = 50  # Max steps per episode
TRAINING_CONFIG = {
    "num_training_inputs": 16,  # Number of training tasks
    "groups_per_step": 3,  # Trajectory groups per training step
    "num_epochs": 32,  # Training epochs
    "rollouts_per_group": 8,  # Rollouts per group
    "learning_rate": 1e-5,  # Learning rate
}

## 3. Import Libraries and Define Agent


In [None]:
import asyncio
import json
import os
import random
from typing import Awaitable

import dotenv

dotenv.load_dotenv()

import hud
import art
import weave
from art.local import LocalBackend
from art.utils import iterate_dataset

from hud.clients.fastmcp import FastMCPHUDClient as MCPClient
from hud.datasets import Task
from hud.agents import ArtHUDAgent

# ---- Optional W&B ----
import wandb

# Weave tracing is only initialised when a W&B API key is available.
WANDB_API_KEY = os.getenv("WANDB_API_KEY", "")
if WANDB_API_KEY:
    os.environ["WANDB_API_KEY"] = WANDB_API_KEY
    weave.init(PROJECT_NAME)

import logging
import warnings

# Hide every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Root logger at INFO, rendered as "LEVEL:message".
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")

## 4. Load the Dataset

Load in ANY dataset that matches the HUD environment spec. Note that ART only works with text environments.

In [None]:
# ---- Generate training scenarios  --------------------------------------------------
from datasets import load_dataset

# Fixed seed for Python's `random` module.
random.seed(42)

dataset_hf = load_dataset("hud-evals/2048-taskset", split="train")
# Only the first task of the train split is used for training.
# NOTE(review): TRAINING_CONFIG["num_training_inputs"] is not read anywhere in the
# visible notebook — confirm whether more than one task was intended here.
dataset = [dataset_hf[0]]

# Connection settings for the remote HUD MCP server, attached to every task in `rollout`.
# The "${HUD_API_KEY}" and "${RUN_ID}" strings are literal placeholders at this point;
# presumably the HUD SDK substitutes them when the task is built — TODO confirm.
MCP_CONFIG = {
    "hud": {
        "url": "https://mcp.hud.so/v3/mcp",
        "headers": {
            "Authorization": "Bearer ${HUD_API_KEY}",
            "Run-Id": "${RUN_ID}",
            "Mcp-Image": "hudpython/hud-text-2048:latest",
        },
    }
}

## 5. Run the Training

All of the code below must run within a single cell for proper model registration and tracing.

In [None]:
def _debug_dump_messages(messages_and_choices: list) -> None:
    """Print a one-line summary of each recorded message dict or model choice."""
    print(f"\nDEBUG: messages_and_choices has {len(messages_and_choices)} items")
    for i, item in enumerate(messages_and_choices):
        if isinstance(item, dict):
            # "content" can be present but None, so fall back to "" before
            # slicing instead of relying on the .get() default.
            content = item.get("content") or ""
            print(f"  [{i}] dict - role: {item.get('role')}, content: {repr(content[:50])}")
        elif hasattr(item, "message"):
            # It's a Choice object
            msg = item.message
            tool_call_count = len(msg.tool_calls) if msg.tool_calls else 0
            print(
                f"  [{i}] Choice - content: {repr(msg.content)}, tool_calls: {tool_call_count}"
            )
        else:
            print(f"  [{i}] Unknown type: {type(item)}")


@weave.op
async def rollout(model: art.Model, task_dict: dict, job_id: str | None = None) -> art.Trajectory:
    """Generate one trajectory via ArtHUDAgent.

    Args:
        model: ART model handed to the agent to choose actions.
        task_dict: Task fields used to build a ``Task``. It is not modified;
            ``MCP_CONFIG`` is merged into a copy because the same dict is
            shared by every rollout of a group.
        job_id: Optional HUD job id attached to the trace.

    Returns:
        The finished ``art.Trajectory``, carrying the agent's messages and
        choices, the tool schemas, and the reward reported by the agent run.
    """
    task_id = task_dict.get("id")

    with hud.trace(f"Art Rollout {task_id}", root=True, task_id=task_id, job_id=job_id):
        task_config = Task(**{**task_dict, "mcp_config": MCP_CONFIG})

        mcp_client = MCPClient(task_config.mcp_config)
        await mcp_client.initialize()
        try:
            agent = ArtHUDAgent(model, mcp_client, allowed_tools=["move"])

            trace = await agent.run(task_config, max_steps=MAX_STEPS)

            # Debug: Check what's in messages_and_choices
            _debug_dump_messages(agent.messages_and_choices)

            traj = art.Trajectory(
                messages_and_choices=agent.messages_and_choices,
                tools=agent.get_tool_schemas(),
                reward=trace.reward,
                metadata={"task": task_config.prompt},  # Just the prompt string
                metrics={
                    "task_completed": trace.done,
                    "success": trace.reward > 0,
                    "reward": trace.reward,
                    "ran_out_of_steps": not trace.done,
                },
            )
        finally:
            # Close the client even when the agent run or trajectory
            # construction raises, so failed rollouts do not leak connections.
            await mcp_client.close()

        return traj.finish()


# ---- Build / register model ----
# Trainable model named MODEL_NAME in project PROJECT_NAME, built on BASE_MODEL.
model = art.TrainableModel(
    name=MODEL_NAME,
    project=PROJECT_NAME,
    base_model=BASE_MODEL,
)
# Local in-process backend, given ./.art as its path.
backend = LocalBackend(in_process=True, path="./.art")
await model.register(backend)

# Iterator over batches of dataset items consumed by the training loop.
# The starting step is read back from the model (presumably so that re-running
# the cell continues from the last saved step — confirm against the ART docs).
train_iterator = iterate_dataset(
    dataset,
    groups_per_step=TRAINING_CONFIG["groups_per_step"],
    num_epochs=TRAINING_CONFIG["num_epochs"],
    initial_step=await model.get_step(),
)

# ---- Training loop ----
with hud.job("Art Training Run") as job_obj:
    for batch in train_iterator:
        print(f"\n=== Training step {batch.step} ===")

        groups: list[Awaitable[art.TrajectoryGroup]] = []
        for task_dict in batch.items:
            group = art.TrajectoryGroup(
                rollout(model, task_dict, job_obj.id)
                for _ in range(TRAINING_CONFIG["rollouts_per_group"])
            )
            groups.append(group)

        print("Gathering trajectory groups…")
        gathered = await art.gather_trajectory_groups(groups)

        # We already have rewards from environment evaluation
        await model.train(
            gathered,
            config=art.TrainConfig(learning_rate=TRAINING_CONFIG["learning_rate"]),
        )
        print("✅ step complete, model checkpoint saved\n")

print("Training finished, checkpoints stored in ./.art/")