In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all imported modules
# before executing each cell) so edits to local helper modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Remove the opaque background behind ipywidget cell output. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Make Jupyter widget text follow the VS Code editor's foreground colour and font size. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
import art
from dotenv import load_dotenv
import random
from openpipe.client import OpenPipe

load_dotenv()

op_client = OpenPipe()
print("OpenPipe client initialized")

random.seed(42)


api = art.LocalAPI()
model = await api.get_or_create_model(
    name="001", project="2048-single-turn", base_model="Qwen/Qwen2.5-7B-Instruct"
)

OpenPipe client initialized


In [None]:
import art
from art.utils.get_trajectory_messages import get_trajectory_messages
import openai
import time
import math
import requests
from .utils import (
    generate_game,
    render_board,
    apply_agent_move,
    max_cell_value,
    check_game_finished,
)

# Tile value treated as a win: rollout() gives the top reward once the largest
# cell reaches this value, and scales the reward of unfinished runs against it.
WINNING_VALUE = 512


@art.retry(exceptions=(openai.LengthFinishReasonError, requests.ReadTimeout))
async def rollout(
    client: openai.AsyncOpenAI, iteration: int
) -> list[art.Trajectory]:
    """Play one game of 2048 with the model, recording one trajectory per move.

    Each move is a separate single-turn trajectory: the system prompt, the
    rendered board as the user message, and the model's reply. When the game
    ends, every trajectory that is kept receives the same final reward:

    * ``-1`` if ``apply_agent_move`` raises ``ValueError`` for the model's
      reply. Only the offending trajectory is returned, so earlier moves are
      not penalised.
    * ``2`` if the largest cell has reached ``WINNING_VALUE``.
    * Otherwise a log-scaled value: 0 when the largest cell is 2, approaching
      1 as the largest cell approaches ``WINNING_VALUE``.

    Every completion is reported to OpenPipe on a best-effort basis, and the
    final reward is attached to the log entry of the last completion. Failures
    of either call are printed and otherwise ignored.

    Args:
        client: Async OpenAI-compatible client used to sample moves.
        iteration: Training step index; only used as OpenPipe log metadata.

    Returns:
        The per-move trajectories, each with ``reward`` set to the final reward.

    Raises:
        Exception: Any error raised by the chat-completion request is
            re-raised (presumably retried by ``art.retry`` when it is one of
            the exception types listed in the decorator).
    """
    # Debugging aid: the trajectory whose completion request failed is stored
    # in a notebook-level global so it can be inspected afterwards.
    global failing_trajectory

    system_prompt = "You are an excellent 2048 player. Always choose the move most likely to lead to combine cells to eventually reach the number 2048. Optional moves are 'left', 'right', 'up', 'down'. Return your move as an XML object with a single property 'move', like so: <move>direction</move>"

    game = generate_game()

    reward = 0
    move_number = 0
    trajectories: list[art.Trajectory] = []

    while True:
        # A fresh trajectory per move: the model only ever sees the current board.
        trajectory = art.Trajectory(
            messages_and_choices=[{"role": "system", "content": system_prompt}],
            reward=0,
        )
        trajectories.append(trajectory)
        trajectory.messages_and_choices.append(
            {"role": "user", "content": render_board(game)}
        )

        requested_at = int(time.time() * 1000)
        messages = get_trajectory_messages(trajectory)

        try:
            chat_completion = await client.chat.completions.create(
                max_completion_tokens=2048,
                messages=messages,
                model=model.name,
                temperature=1.5,
            )
        except openai.LengthFinishReasonError:
            # Propagate without the diagnostic printing below.
            raise
        except Exception as e:
            print("caught exception generating chat completion")
            print(e)
            failing_trajectory = trajectory
            raise
        last_completion = chat_completion

        # Logging is best-effort: a reporting failure must not abort the game.
        try:
            op_client.report(
                requested_at=requested_at,
                received_at=int(time.time() * 1000),
                req_payload={
                    "model": model.name,
                    "messages": messages,
                    "metadata": {
                        "game_id": game["id"],
                        "notebook-id": "2048",
                        "iteration": str(iteration),
                        "move_number": str(move_number),
                    },
                },
                resp_payload=chat_completion,
                status_code=200,
            )
        except Exception as e:
            print(f"Error reporting to OpenPipe: {e}")

        choice = chat_completion.choices[0]
        content = choice.message.content
        assert isinstance(content, str)
        trajectory.messages_and_choices.append(choice)

        try:
            apply_agent_move(game, content)
            move_number += 1
        except ValueError:
            reward = -1
            # remove all other trajectories to avoid giving negative reward for valid moves
            trajectories = [trajectory]
            break

        if check_game_finished(game):
            max_value = max_cell_value(game)

            if max_value < WINNING_VALUE:
                # scale reward logarithmically: 0 for a max cell of 2, approaching 1
                # as the max cell approaches WINNING_VALUE
                reward = (math.log2(max_value) - 1) / (math.log2(WINNING_VALUE) - 1)
            else:
                # double reward if it wins
                reward = 2
            break

    for trajectory in trajectories:
        trajectory.reward = reward

    # Attach the final reward to the log entry of the last completion (best-effort).
    try:
        op_client.update_log_metadata(
            filters=[
                {
                    "field": "completionId",
                    "equals": last_completion.id,
                }
            ],
            metadata={
                "reward": str(reward),
                "reward_assigned": "true",
            },
        )
    except Exception as e:
        print(f"Error updating log metadata: {e}")

    return trajectories


openai_client = await model.openai_client()

for i in range(await model.get_step(), 500):
    train_groups = await art.gather_trajectory_groups(
        (
            art.TrajectoryGroup(rollout(openai_client, i) for _ in range(18))
            for _ in range(1)
        ),
        pbar_desc="train",
    )
    await model.delete_checkpoints()
    await model.train(train_groups, config=art.TrainConfig(learning_rate=3e-5))


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth  # type: ignore


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-09 20:00:01 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.1. vLLM: 0.7.3.
   \\   /|    NVIDIA H100 PCIe. Num GPUs = 1. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit with actual GPU utilization = 46.22%
Unsloth: Your GPU has CUDA compute capability 9.0 with VRAM = 79.1 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 32768. Num Seq

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.28s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.19it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.11it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:01<00:01,  1.20s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.39it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.26it/s]



INFO 04-09 20:00:24 model_runner.py:1115] Loading model weights took 6.6961 GB
INFO 04-09 20:00:24 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-09 20:00:27 worker.py:267] Memory profiling takes 3.50 seconds
INFO 04-09 20:00:27 worker.py:267] the current vLLM instance can use total_gpu_memory (79.10GiB) x gpu_memory_utilization (0.46) = 36.56GiB
INFO 04-09 20:00:27 worker.py:267] model weights take 6.70GiB; non_torch_memory takes 0.14GiB; PyTorch activation peak memory takes 4.72GiB; the rest of the memory reserved for KV Cache is 25.00GiB.
INFO 04-09 20:00:28 executor_base.py:111] # cuda blocks: 29260, # CPU blocks: 7021
INFO 04-09 20:00:28 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 14.29x
INFO 04-09 20:00:31 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:31<00:00,  1.37it/s]


INFO 04-09 20:01:02 model_runner.py:1562] Graph capturing finished in 31 secs, took 1.14 GiB
INFO 04-09 20:01:02 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 38.71 seconds


Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


train:   0%|          | 0/18 [00:00<?, ?it/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marctic_fly[0m ([33mbased-op[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Packed 207 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 3 | Total steps = 300,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 20,185,088/7,000,000,000 (0.29% trained)


Unsloth: Will smartly offload gradients to save VRAM!


train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0000
Packed 267 trajectories into 5 sequences of length 8192


tune:   0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0001
Packed 286 trajectories into 5 sequences of length 8192


tune:   0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0002
Packed 243 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0003
Packed 257 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0004
Packed 121 trajectories into 2 sequences of length 8192


tune:   0%|          | 0/2 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0005
Packed 248 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0006
Packed 401 trajectories into 6 sequences of length 8192


tune:   0%|          | 0/6 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0007
Packed 241 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0008
Packed 421 trajectories into 5 sequences of length 8192


tune:   0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0009
Packed 439 trajectories into 6 sequences of length 8192


tune:   0%|          | 0/6 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0010
Packed 229 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0011
Packed 425 trajectories into 5 sequences of length 8192


tune:   0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0012
Packed 240 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0013
Packed 314 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0014
Packed 173 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-004/0015
Packed 203 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/18 [00:00<?, ?it/s]

CancelledError: 

: 