In [19]:
from hud import gym, load_taskset

In [None]:
# Load the task set by name and report how many tasks it contains.
taskset = await load_taskset("OSWorld-Ubuntu-Links")
print(f"Total tasks in OSWorld: {len(taskset)}")

# Pick a single task (index 2) to use for the single-run walkthrough below.
test = taskset[2]
print(f"Task prompt: {test.prompt}")

In [21]:
# Create the environment for the selected task.
# The Ubuntu environment will take around 2.5 minutes to start, but can be parallelized
env = await gym.make(test)

In [None]:
from hud.agent import ClaudeAgent

# Define a new agent each time to reset the message history
# Make sure to define the environment variable ANTHROPIC_API_KEY
agent = ClaudeAgent()

# Initial observation
obs, _ = await env.reset()
print(f"Initial observation complete")

# Agent loop
for i in range(8):
    print(f"========= Step {i + 1} =========")
    action, done = await agent.predict(obs)
    print(f"Agent's action: {action}")

    if done:
        print("done")
        break

    obs, reward, terminated, info = await env.step(action)

    if terminated:
        print("terminated")
        break

In [31]:
# Evaluate environment state after the agent loop and keep the outcome in `result`
result = await env.evaluate()

In [32]:
# Make sure to close environment to avoid being charged for idle time.
# Run this cell even if an earlier cell failed, otherwise the environment stays open.
await env.close()

Parallel runs for the whole dataset

In [25]:
import asyncio
import json
import traceback

from hud import register_job, load_taskset

In [26]:
# Configuration for the parallel agent loop.
# Upper bound on predict/step rounds per task.
MAX_STEPS = 30

# Limit concurrent environment creation (gym.make + reset) to 30 for testing purposes
semaphore = asyncio.Semaphore(30)

# Limiting concurrent Anthropic API calls to 3 to avoid rate limiting
anthropic_semaphore = asyncio.Semaphore(3)


def _log_run_error(task, phase, exc):
    """Append one JSON line describing a failure to ``run_errors.ndjson``.

    Must be called from inside an ``except`` block so that
    ``traceback.format_exc()`` can see the exception being handled.

    Args:
        task: The task being run; only ``task.id`` is read.
        phase: Which stage failed ("initialization", "agent_loop",
            "evaluation" or "cleanup").
        exc: The exception that was caught.
    """
    error_log = {
        "task_id": task.id,
        "phase": phase,
        "error": str(exc),
        "traceback": traceback.format_exc(),
    }
    with open("run_errors.ndjson", "a") as f:
        f.write(json.dumps(error_log) + "\n")


async def agent_loop(task):
    """Run one task end to end: create the environment, act, evaluate, close.

    Failures in every phase are appended to ``run_errors.ndjson``. Errors
    during initialization, evaluation and cleanup are re-raised; errors
    inside an individual agent step are logged and the loop moves on to the
    next iteration (best effort).

    The environment is closed in a ``finally`` block, so it is released even
    when initialization fails half-way or evaluation raises.

    Args:
        task: A task from the loaded task set (must expose ``id``).

    Returns:
        Whatever ``env.evaluate()`` returns.

    Raises:
        Exception: Re-raised from initialization, evaluation or cleanup.
    """
    agent = ClaudeAgent()
    env = None  # stays None until gym.make succeeds, so cleanup knows whether to close

    try:
        # Section 1: Environment initialization (bounded by `semaphore`)
        async with semaphore:
            try:
                print("Creating Environment")
                env = await gym.make(task)
                print("Resetting Environment")
                obs, _ = await env.reset()
                print(f"Task description: {obs.text}")
            except Exception as e:
                _log_run_error(task, "initialization", e)
                raise

        # Section 2: Agent loop
        for i in range(MAX_STEPS):
            try:
                # agent's next action (bounded by `anthropic_semaphore`)
                async with anthropic_semaphore:
                    actions, done = await agent.predict(obs)

                if done:
                    break

                # step the environment forward
                obs, _reward, terminated, _info = await env.step(actions)

                # drop out if terminated
                if terminated:
                    break
                print(f"Step {i + 1} completed")
            except Exception as e:
                # Best effort: record the failure and try the next iteration.
                _log_run_error(task, "agent_loop", e)

        # Section 3: Evaluation
        try:
            result = await env.evaluate()
            print(f"Evaluation result: {result}")
        except Exception as e:
            _log_run_error(task, "evaluation", e)
            raise

        return result
    finally:
        # Section 4: Cleanup -- always runs once an environment exists.
        # If an earlier phase already raised, a cleanup failure raised here
        # is chained onto it (via __context__) rather than losing it.
        if env is not None:
            try:
                await env.close()
            except Exception as e:
                _log_run_error(task, "cleanup", e)
                raise


@register_job("claude_osworld_parallel")
async def agent_run(taskset_name: str):
    """Run ``agent_loop`` concurrently for every task in the named task set.

    Exceptions raised by individual runs are collected by ``asyncio.gather``
    (``return_exceptions=True``) instead of propagating, so one failing task
    does not abort the others. Nothing is returned.

    Args:
        taskset_name: Name passed to ``load_taskset``.
    """
    tasks = await load_taskset(taskset_name)
    pending_runs = [agent_loop(task) for task in tasks]
    await asyncio.gather(*pending_runs, return_exceptions=True)

In [None]:
# Launch the parallel job: one agent_loop per task in the task set.
# Per-task failures are appended to run_errors.ndjson by agent_loop.
await agent_run("OSWorld-Ubuntu-Links")