In [1]:
!pip install "numpy<2.0.0"

[0m

In [2]:
# Fail fast unless the NumPy loaded by this kernel is a 1.x release
# (the first cell installs "numpy<2.0.0").
import numpy as np

# Only the major version is checked, via the prefix of the version string.
if (np.__version__).startswith("1."):
    print("Numpy version is 1.*.*, you're good to go!")
else:
    # NOTE(review): the message points at restart instructions that are not
    # visible in this part of the notebook -- confirm they exist above.
    raise ValueError("Please restart your runtime using the above instructions!")

Numpy version is 1.*.*, you're good to go!


In [3]:
import os


# Both keys are optional: leave a value empty to skip the corresponding service.
WANDB_API_KEY = ""
OPENPIPE_API_KEY = ""

# Export only the keys that were actually filled in above.
os.environ.update(
    {
        variable: secret
        for variable, secret in {
            "WANDB_API_KEY": WANDB_API_KEY,
            "OPENPIPE_API_KEY": OPENPIPE_API_KEY,
        }.items()
        if secret
    }
)

In [4]:
%%capture
# Install the ART and OpenPipe client packages with uv (pre-releases allowed,
# no download cache); %%capture hides the installer output.
!uv pip install openpipe-art openpipe --prerelease allow --no-cache-dir

In [5]:
from dotenv import load_dotenv
import random
from typing import TypedDict
from typing import Literal
import string
import xml.etree.ElementTree as ET

# python-dotenv: read key/value pairs from a .env file (if one is found) into
# the process environment.
load_dotenv()

# Tile value that counts as a win. check_game_finished() ends the game as soon
# as any cell reaches it, and rollout() uses it to scale rewards. Note that it
# is 256 here, not 2048.
WINNING_VALUE = 256


# Class that keeps track of state for a single game of 2048
class TwentyFortyEightGame(TypedDict):
    """State of a single game of 2048."""

    # Identifier of the game; generate_game() assigns a random 6-character string.
    id: str
    # Grid stored as a list of rows; None marks an empty cell.
    board: list[list[int | None]]


# Randomly populates a cell on the board with a 2 or 4
def populate_random_cell(game: TwentyFortyEightGame) -> None:
    """Place a new tile on a randomly chosen empty cell, mutating the board.

    The new tile is a 2 with 90% probability and a 4 otherwise.

    Args:
        game: Game whose board receives the new tile.

    Raises:
        IndexError: If the board has no empty cell (raised by `random.choice`).
    """
    grid = game["board"]

    # Collect the coordinates of every empty cell in row-major order.
    empty_cells = []
    for row_index, row in enumerate(grid):
        for col_index, value in enumerate(row):
            if value is None:
                empty_cells.append((row_index, col_index))

    # Pick the cell first, then the tile value, so the random stream is
    # consumed in a fixed order.
    target_row, target_col = random.choice(empty_cells)
    if random.random() < 0.9:
        new_tile = 2
    else:
        new_tile = 4
    grid[target_row][target_col] = new_tile


# Generates a new game of 2048
def generate_game(board_length: int = 4) -> TwentyFortyEightGame:
    """Create a new game with an otherwise empty square board and two tiles.

    Args:
        board_length: Number of rows and of columns of the board.

    Returns:
        A game dict with a random 6-character alphanumeric id and a board on
        which two random cells have been populated.
    """
    alphabet = string.ascii_letters + string.digits
    game_id = "".join(random.choices(alphabet, k=6))

    empty_board = []
    for _ in range(board_length):
        empty_board.append([None] * board_length)

    new_game = {"id": game_id, "board": empty_board}

    # Every game starts with two tiles on the board.
    for _ in range(2):
        populate_random_cell(new_game)

    return new_game


# Renders the board in a human-readable format
def render_board(game: TwentyFortyEightGame) -> str:
    """Render the board as text, one line per row.

    Every cell is right-aligned to the width of the widest number on the
    board, cells are joined with "|", and an empty cell is shown as "_".
    For example:

         _|  2|  _|  4
         4|  8|  2| 16
        16| 32| 64|128
         _|  2|  2|  4

    Args:
        game: Game whose board is rendered.

    Returns:
        The rendered board; every row, including the last, ends with a newline.
    """
    board = game["board"]

    # default=1 keeps a board without any tile renderable: max() over an empty
    # sequence would otherwise raise ValueError.
    cell_width = max(
        (len(str(cell)) for row in board for cell in row if cell is not None),
        default=1,
    )

    rendered_rows = [
        "|".join(
            ("_" if cell is None else str(cell)).rjust(cell_width) for cell in row
        )
        for row in board
    ]
    return "".join(line + "\n" for line in rendered_rows)


# condense, privileging matches at the start of the sequence
# sequences should be passed starting with cells that are the furthest in the direction in which the board is being condensed
def condense_sequence(sequence: list[int | None]) -> list[int | None]:
    condensed_sequence = []

    gapless_sequence = [cell for cell in sequence if cell is not None]

    i = 0
    while i < len(gapless_sequence):
        if (
            i + 1 < len(gapless_sequence)
            and gapless_sequence[i] == gapless_sequence[i + 1]
        ):
            condensed_sequence.append(gapless_sequence[i] * 2)
            i += 2
        else:
            condensed_sequence.append(gapless_sequence[i])
            i += 1

    # pad the sequence with None at the end
    return condensed_sequence + [None] * (4 - len(condensed_sequence))


# Condenses the board in a given direction
def condense_board(
    game: TwentyFortyEightGame, direction: Literal["left", "right", "up", "down"]
) -> None:
    """Slide and merge every row or column of the board in `direction`, in place.

    Each line is handed to `condense_sequence` starting with the cell furthest
    in the direction of travel, so "right" and "down" reverse the line before
    condensing and reverse the result afterwards. Any other direction leaves
    the board untouched.

    Args:
        game: Game whose board is modified.
        direction: Direction in which the tiles move.
    """
    grid = game["board"]

    if direction in ("left", "right"):
        flip = direction == "right"
        for row in grid:
            merged = condense_sequence(row[::-1] if flip else row)
            if flip:
                merged = merged[::-1]
            # Write back cell by cell so the existing row list is kept.
            for idx in range(len(row)):
                row[idx] = merged[idx]
    elif direction in ("up", "down"):
        flip = direction == "down"
        for col_idx in range(len(grid[0])):
            line = [row[col_idx] for row in grid]
            merged = condense_sequence(line[::-1] if flip else line)
            if flip:
                merged = merged[::-1]
            for row_idx in range(len(line)):
                grid[row_idx][col_idx] = merged[row_idx]


# Applies an agent move to the game board
def apply_agent_move(game: TwentyFortyEightGame, move_xml: str) -> None:
    """Parse the agent's XML move, apply it to the board and add a new tile.

    The direction is the text of the root element, e.g. "<move>left</move>".
    It must match one of the four directions exactly (no surrounding
    whitespace). A new tile is added after every accepted move.

    Args:
        game: Game to update in place.
        move_xml: Raw agent output expected to be a single XML element.

    Raises:
        ValueError: "Invalid xml" if `move_xml` cannot be parsed, or
            "Invalid direction" if the parsed text is not a known direction.
    """
    try:
        direction = ET.fromstring(move_xml).text
    except Exception:
        raise ValueError("Invalid xml")

    if direction not in ("left", "right", "up", "down"):
        raise ValueError("Invalid direction")

    condense_board(game, direction)
    populate_random_cell(game)


# Returns the maximum cell value on the board
def max_cell_value(game: TwentyFortyEightGame) -> int:
    """Return the largest tile value on the board.

    Args:
        game: Game whose board is inspected.

    Returns:
        The maximum non-empty cell value, or 0 when the board has no tiles
        (instead of letting max() raise ValueError on an empty sequence).
    """
    return max(
        (cell for row in game["board"] for cell in row if cell is not None),
        default=0,
    )


# Returns True if the game is finished
def check_game_finished(game: TwentyFortyEightGame) -> bool:
    """Tell whether the game is over.

    The game is finished when a tile has reached WINNING_VALUE or when the
    board has no empty cell left. Possible merges on a full board are not
    taken into account.

    Args:
        game: Game to inspect.

    Returns:
        True if the game is finished, False otherwise.
    """
    reached_target = max_cell_value(game) >= WINNING_VALUE
    if reached_target:
        return True

    board_is_full = all(cell is not None for row in game["board"] for cell in row)
    return board_is_full


# Returns the sum of all the cell values on the board
def total_board_value(game: TwentyFortyEightGame) -> int:
    """Return the sum of all tile values on the board (empty cells count as 0).

    Args:
        game: Game whose board is summed.

    Returns:
        The total of every non-empty cell; 0 for a board without tiles.
    """
    total = 0
    for row in game["board"]:
        for cell in row:
            if cell is not None:
                total += cell
    return total


In [6]:
import art
from art.local import LocalAPI
from dotenv import load_dotenv
from openpipe.client import AsyncOpenPipe
import random
from pydantic import BaseModel


# python-dotenv: read key/value pairs from a .env file (if one is found) into
# the process environment.
load_dotenv()

# Seed Python's `random` module, which the game helpers use to pick cells,
# tile values and game ids.
random.seed(42)

# Initialize the server
api = LocalAPI(
    # Normally we don't want to run the server in-process, but for the output
    # to show up properly on Google Colab we'll enable this.
    in_process=True
)


class CustomConfig(BaseModel):
    """Config that marks a model as served through LiteLLM under a given name.

    rollout() checks `isinstance(model.config, CustomConfig)` and, when that is
    true, requests completions from `litellm_model_name` instead of the
    "hosted_vllm/<model name>" endpoint.
    """

    # LiteLLM model identifier, e.g. "openai/gpt-4o".
    litellm_model_name: str | None = None


# Declare the model
# `name` and `project` identify this run; the checkpoint paths printed by the
# training cell contain both ("./.art/2048-dev/models/012/...").
model = art.TrainableModel(
    name="012",
    project="2048-dev",
    base_model="Qwen/Qwen2.5-3B-Instruct",
    # To run on a T4, we need to override some config defaults.
    # NOTE(review): the saved output of this cell reports an H100 and a
    # cudagraph capture phase, so these overrides may not reflect (or may not
    # have been applied in) the run that produced it -- confirm.
    _internal_config=art.dev.InternalModelConfig(
        init_args=art.dev.InitArgs(
            max_seq_length=8192,
        ),
        engine_args=art.dev.EngineArgs(
            enforce_eager=True,
            gpu_memory_utilization=0.8,
            num_scheduler_steps=1,
        ),
    ),
)
# Register the model with the API created above before using it.
await model.register(api)

# Comparison models. They are plain `art.Model`s (not trainable); their
# CustomConfig makes rollout() request completions from the named LiteLLM model.
gpt_4o_mini = art.Model(
    name="gpt-4o-mini",
    project="2048-dev",
    config=CustomConfig(
        litellm_model_name="openai/gpt-4o-mini",
    ),
)
await gpt_4o_mini.register(api)

gpt_4o = art.Model(
    name="gpt-4o",
    project="2048-dev",
    config=CustomConfig(
        litellm_model_name="openai/gpt-4o",
    ),
)
await gpt_4o.register(api)


# Optional logging client
# rollout() only reports to OpenPipe when `op_client.api_key` is set.
op_client = AsyncOpenPipe()


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth  # type: ignore


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-18 19:16:50 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.1. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 78.47%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.1 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32768. Num Seq

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-18 19:17:03 model_runner.py:1115] Loading model weights took 2.2265 GB
INFO 04-18 19:17:03 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-18 19:17:06 worker.py:267] Memory profiling takes 2.57 seconds
INFO 04-18 19:17:06 worker.py:267] the current vLLM instance can use total_gpu_memory (79.10GiB) x gpu_memory_utilization (0.78) = 62.06GiB
INFO 04-18 19:17:06 worker.py:267] model weights take 2.23GiB; non_torch_memory takes 0.14GiB; PyTorch activation peak memory takes 2.71GiB; the rest of the memory reserved for KV Cache is 56.98GiB.
INFO 04-18 19:17:07 executor_base.py:111] # cuda blocks: 103733, # CPU blocks: 10922
INFO 04-18 19:17:07 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 50.65x
INFO 04-18 19:17:15 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory err

Capturing CUDA graph shapes: 100%|██████████| 49/49 [00:44<00:00,  1.10it/s]

INFO 04-18 19:17:59 model_runner.py:1562] Graph capturing finished in 45 secs, took 1.30 GiB
INFO 04-18 19:17:59 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 55.94 seconds



Unsloth 2025.3.19 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.
Unsloth: Already have LoRA adapters! We shall skip this step.


In [7]:
import art

import openai
import time
import math
import requests
from litellm import acompletion

from art.utils.litellm import convert_litellm_choice_to_openai


# NOTE(review): presumably art.retry re-runs the whole rollout when one of the
# listed exceptions is raised -- confirm against the ART documentation.
@art.retry(exceptions=(openai.LengthFinishReasonError, requests.ReadTimeout))
async def rollout(
    model: art.Model, step: int = 0, is_validation: bool = False
) -> art.Trajectory:
    """Play one game of 2048 with `model` and return the scored trajectory.

    Each turn appends the rendered board as a user message, requests a
    completion through LiteLLM, appends the returned choice to the trajectory
    and applies the move to the board. The loop ends on the first invalid move
    or when `check_game_finished` reports that the game is over.

    Reward:
        * Invalid move: -1 + log(valid_moves + 1) / log(1000). The
          "max_value" / "board_value" metrics are not recorded in this case.
        * Game finished below WINNING_VALUE: max-value reward plus 0.2 times
          the board-value reward (both log2-scaled, see inline comments).
        * Game finished at or above WINNING_VALUE: 2 plus a random amount
          below 0.001.

    Args:
        model: Model that plays the game. When its config is a CustomConfig
            the completion is requested from `litellm_model_name`, otherwise
            from "hosted_vllm/<model.name>".
        step: Training step; only used as OpenPipe log metadata.
        is_validation: Validation flag; only used as OpenPipe log metadata.

    Returns:
        The trajectory with all messages, choices, the reward and, for
        finished games, the "max_value" and "board_value" metrics.

    Raises:
        Exception: Any error raised while requesting the completion is
            re-raised (after being printed, except for
            LengthFinishReasonError).
    """
    game = generate_game()

    # Number of moves applied successfully so far.
    move_number = 0

    trajectory = art.Trajectory(
        messages_and_choices=[
            {
                "role": "system",
                "content": "You are an excellent 2048 player. Always choose the move most likely to lead to combine cells to eventually reach the number 2048. Optional moves are 'left', 'right', 'up', 'down'. Return your move as an XML object with a single property 'move', like so: <move>left</move>",
            }
        ],
        reward=0,
    )

    while True:
        # Show the current board to the model.
        trajectory.messages_and_choices.append(
            {"role": "user", "content": render_board(game)}
        )

        # Millisecond timestamp, reported to OpenPipe below.
        requested_at = int(time.time() * 1000)
        messages = trajectory.messages

        try:
            # Comparison models carry a CustomConfig naming the LiteLLM model;
            # any other model is addressed as "hosted_vllm/<name>".
            model_id = (
                model.config.litellm_model_name
                if isinstance(model.config, CustomConfig)
                else f"hosted_vllm/{model.name}"
            )
            chat_completion = await acompletion(
                base_url=model.inference_base_url,
                api_key=model.inference_api_key,
                model=model_id,
                messages=messages,
                max_completion_tokens=128,
            )
            # Kept so the final reward can be attached to this completion's
            # OpenPipe log entry after the loop.
            last_completion = chat_completion
        except openai.LengthFinishReasonError as e:
            # Re-raised without logging; this type is listed in the retry decorator.
            raise e
        except Exception as e:
            print("caught exception generating chat completion", e)
            raise e

        # Best-effort logging: a reporting failure is printed and must not
        # abort the game.
        try:
            if op_client.api_key:
                await op_client.report(
                    requested_at=requested_at,
                    received_at=int(time.time() * 1000),
                    req_payload={
                        "model": model.name,
                        "messages": messages,
                        "metadata": {
                            "game_id": game["id"],
                            "notebook-id": "2048",
                            "step": str(step),
                            "validation": str(is_validation),
                            "move_number": str(move_number),
                        },
                    },
                    resp_payload=chat_completion,
                    status_code=200,
                )
        except Exception as e:
            print(f"Error reporting to OpenPipe: {e}")

        choice = convert_litellm_choice_to_openai(chat_completion.choices[0])
        content = choice.message.content
        assert isinstance(content, str)
        trajectory.messages_and_choices.append(choice)

        try:
            apply_agent_move(game, content)
            move_number += 1
        except ValueError:
            # apply_agent_move raises ValueError for unparsable XML or an unknown direction.
            # logarithmically scale negative reward between -1 for 0 valid preceding moves and 0 for 999 valid preceding moves
            trajectory.reward = -1 + (math.log(move_number + 1) / math.log(1000))
            break

        if check_game_finished(game):
            max_value = max_cell_value(game)
            board_value = total_board_value(game)
            trajectory.metrics["max_value"] = max_value
            trajectory.metrics["board_value"] = board_value

            if max_value < WINNING_VALUE:
                # scale max value logarithmically between 0 for 2 and 1 for WINNING_VALUE
                max_value_reward = (math.log(max_value, 2) - 1) / (
                    math.log(WINNING_VALUE, 2) - 1
                )
                # scale board value logarithmically between 0 for a board value of 2 and 1 for WINNING_VALUE * 16
                board_value_reward = (math.log(board_value, 2) - 1) / (
                    math.log(WINNING_VALUE * 16, 2) - 1
                )
                # combine the two rewards, with max value having a higher weight
                trajectory.reward = max_value_reward + (board_value_reward * 0.2)
            else:
                # double reward if the agent wins
                trajectory.reward = 2
                # add random number between 0 and .001
                trajectory.reward += random.random() * 0.001
            break

    # Attach the final reward to the log entry of the last completion
    # (best effort, same as the per-move report above).
    try:
        if op_client.api_key:
            await op_client.update_log_metadata(
                filters=[
                    {
                        "field": "completionId",
                        "equals": last_completion.id,
                    }
                ],
                metadata={
                    "reward": str(trajectory.reward),
                    "reward_assigned": "true",
                },
            )
    except Exception as e:
        print(f"Error updating log metadata: {e}")

    return trajectory

In [None]:
# NOTE(review): openai_client is not used anywhere in the visible notebook --
# confirm whether it is still needed.
openai_client = model.openai_client()
# Continue from the model's current step up to (but not including) step 50.
for i in range(await model.get_step(), 50):
    # Each training step gathers a single group of 4 rollouts.
    train_groups = await art.gather_trajectory_groups(
        (
            art.TrajectoryGroup(
                rollout(model, i, is_validation=False) for _ in range(4)
            )
            for _ in range(1)
        ),
        pbar_desc="gather",
        max_exceptions=1,
    )
    # Checkpoints are deleted before each training call; the saved output
    # shows one "Deleted checkpoint" line per step.
    await model.delete_checkpoints()
    await model.train(
        train_groups,
        config=art.TrainConfig(learning_rate=3e-5),
        # Lowering the logprob_calculation_chunk_size is a memory saving measure
        # to allow longer sequences (up to 4096 tokens) to be processed on a T4.
        _config={"logprob_calculation_chunk_size": 8},
    )

gather:   0%|          | 0/4 [00:00<?, ?it/s]


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.

caught exception generating chat completion litellm.APIConnectionError: audio
Traceback (most recent call last):
  File "/root/sky_workdir/.venv/lib/python3.10/site-packages/litellm/main.py", line 977, in completion
    model_response = ModelResponse()
  File "/root/sky_workdir/.venv/lib/python3.10/site-packages/litellm/types/utils.py", line 1104, in __init__
    choices = [Choices()]
  File "/root/sky_workdir/.venv/lib/python3.10/site-packages/litellm/types/utils.py", line 735, in __init__
    self.message = Message()
  File "/root/sky_workdir/.venv/lib/python3.10/site-packages/litellm/types/utils.py", line 599, in __init__
    del self.audio
  File "/root/sky_workdir/.venv/lib/python3.10/site-packages/pydantic/main.py", line 1082, in __delattr__
    object.__delattr__(self, item)
AttributeError: audio



[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


No "val/reward" metric found in history


[34m[1mwandb[0m: Currently logged in as: [33mopenpipe[0m ([33mopenpipe-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Packed 3 trajectories into 2 sequences of length 6144


train:   0%|          | 0/2 [00:00<?, ?it/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000,000 | Num Epochs = 3 | Total steps = 30,000,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 14,966,784/3,000,000,000 (0.50% trained)


Unsloth: Will smartly offload gradients to save VRAM!


gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0016
Packed 4 trajectories into 3 sequences of length 10240


train:   0%|          | 0/3 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0017
Packed 4 trajectories into 3 sequences of length 8192


train:   0%|          | 0/3 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0018
Packed 4 trajectories into 4 sequences of length 10240


train:   0%|          | 0/4 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0019
Packed 4 trajectories into 3 sequences of length 10240


train:   0%|          | 0/3 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0020
Packed 4 trajectories into 3 sequences of length 10240


train:   0%|          | 0/3 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0021
Packed 4 trajectories into 4 sequences of length 6144


train:   0%|          | 0/4 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0022
Packed 4 trajectories into 3 sequences of length 10240


train:   0%|          | 0/3 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0023
Packed 4 trajectories into 4 sequences of length 12288


train:   0%|          | 0/4 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0024
Packed 4 trajectories into 4 sequences of length 10240


train:   0%|          | 0/4 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0025
Packed 4 trajectories into 4 sequences of length 10240


train:   0%|          | 0/4 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0026
Packed 4 trajectories into 3 sequences of length 10240


train:   0%|          | 0/3 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0027
Packed 4 trajectories into 4 sequences of length 10240


train:   0%|          | 0/4 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0028
Packed 4 trajectories into 4 sequences of length 10240


train:   0%|          | 0/4 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0029
Packed 4 trajectories into 3 sequences of length 10240


train:   0%|          | 0/3 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

No "val/reward" metric found in history
Deleted checkpoint ./.art/2048-dev/models/012/0030
Packed 4 trajectories into 4 sequences of length 10240


train:   0%|          | 0/4 [00:00<?, ?it/s]

gather:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
import asyncio


async def log_comparison_model(comparison_model: art.Model):
    """Play validation games with `comparison_model` and log them as "val".

    A single trajectory group of 12 rollouts is gathered (step 0,
    is_validation=True) and handed to the model's `log` method.

    Args:
        comparison_model: Model to benchmark.
    """
    # Lazily built, exactly one group of 12 rollouts.
    validation_groups = (
        art.TrajectoryGroup(
            rollout(comparison_model, 0, is_validation=True) for _ in range(12)
        )
        for _ in range(1)
    )

    gathered = await art.gather_trajectory_groups(
        validation_groups,
        pbar_desc=f"gather {comparison_model.name}",
        max_exceptions=1,
    )

    await comparison_model.log(gathered, split="val")


# Calling the async function only creates the coroutine objects; nothing runs
# until they are awaited below.
promises = []

for comparison_model in [gpt_4o_mini, gpt_4o]:
    promises.append(log_comparison_model(comparison_model))

# Run the validation rollouts of both comparison models concurrently.
await asyncio.gather(*promises)


In [11]:
from art.utils.benchmarking.generate_comparison_table import generate_comparison_table
from art.utils.benchmarking.types import BenchmarkedModelKey

# Build a table of the logged "reward", "max_value" and "board_value" metrics
# for one trained run and the two comparison models.
# NOTE(review): this benchmarks the run named "009", while the model declared
# and trained in this notebook is named "012" -- on a fresh kernel there may be
# no "009" data; confirm which run is intended.
table = generate_comparison_table(
    project="2048-dev",
    benchmark_keys=[
        # The saved output lists steps 0000, 0002 and 0003 for these indices,
        # so [0, -2, -1] appears to select the first and the last two logged
        # steps -- confirm against BenchmarkedModelKey.
        BenchmarkedModelKey("009", "train", [0, -2, -1]),
        BenchmarkedModelKey("gpt-4o-mini", "val"),
        BenchmarkedModelKey("gpt-4o", "val"),
    ],
    metrics=["reward", "max_value", "board_value"],
)

print(table.to_markdown())

|    | Model       | Split   |   Step |     reward |   max_value |   board_value |
|---:|:------------|:--------|-------:|-----------:|------------:|--------------:|
|  0 | 009         | train   |   0000 |  0.773351  |     88.6154 |       185.538 |
|  1 | 009         | train   |   0002 | -0.0199348 |     64      |       162     |
|  2 | 009         | train   |   0003 |  0.500415  |    128      |       202     |
|  3 | gpt-4o-mini | val     |   0000 |  0.973925  |     50.9091 |       128.909 |
|  4 | gpt-4o      | val     |   0000 |  1.29582   |     82.6667 |       207.333 |


In [12]:
from art.utils.benchmarking.generate_line_graphs import generate_line_graphs
from art.utils.benchmarking.display_image_grid import display_image_grid
from art.utils.benchmarking.types import BenchmarkedModelKey

# Plot the training metrics of earlier runs, with the comparison models'
# validation results passed as `comparison_keys`.
# NOTE(review): the plotted runs are "009", "010" and "011"; the run trained in
# this notebook ("012") is not included -- confirm this is intended.
graph_image_paths = generate_line_graphs(
    project="2048-dev",
    line_graph_keys=[
        BenchmarkedModelKey("009", "train"),
        BenchmarkedModelKey("010", "train"),
        BenchmarkedModelKey("011", "train"),
    ],
    comparison_keys=[
        BenchmarkedModelKey("gpt-4o-mini", "val"),
        BenchmarkedModelKey("gpt-4o", "val"),
    ],
    metrics=["reward", "max_value", "board_value"],
)

# Show the generated graph images in a grid.
display_image_grid(graph_image_paths)