In [1]:
# uv pip install -e ".[dev]"
from hud import gym, load_taskset
from pprint import pprint

In [2]:
taskset = await load_taskset("OSWorld-Ubuntu")
print(f"Total tasks in OSWorld: {len(taskset)}")

test = taskset[144]
print(f"Task prompt: {test.prompt}")

Total tasks in OSWorld: 369
Task prompt: Can you make my computer bring back the last tab I shut down?


In [3]:
# Create the environment for the selected task (`test`); the resulting `env`
# is what the later cells reset, step, evaluate and close.
# The Ubuntu environment will take around 2.5 minutes to start, but can be parallelized
env = await gym.make(test)

2025-05-27 10:04:56,691 - hud.gym - INFO - Creating private environment


In [4]:
from hud.agent import ClaudeAgent

# Define a new agent each time to reset the message history
# Make sure to define the environment variable ANTHROPIC_API_KEY
agent = ClaudeAgent()

# Initial observation
obs, _ = await env.reset()
print(f"Initial observation complete")

# Agent loop
for i in range(8):
    print(f"========= Step {i + 1} =========")
    action, done = await agent.predict(obs)
    print(f"Agent's action: {action}")

    obs, reward, terminated, info = await env.step(action)

    if done or terminated:
        break

Initial observation complete
Agent's action: [PressAction(type='press', keys=['ctrl', 'shift', 't'])]
Agent's action: [ResponseAction(type='response', text="Great! I've successfully reopened your last closed tab. As you can see, the TripAdvisor tab has been restored. Now you have three tabs open:\n\n1. Lonely Planet | Travel Guide\n2. Airbnb | Vacation rentals\n3. TripAdvisor: Over a billion reviews & contributions for Hotels\n\nThe keyboard shortcut Ctrl+Shift+T is very useful for recovering recently closed tabs in Chrome. You can actually press it multiple times to continue reopening previously closed tabs in the order they were closed.")]


In [5]:
# Evaluate environment state
# Asks the environment to evaluate its current state and pretty-prints whatever
# comes back; in the recorded output this is a dict with `error` and `logs`
# entries, the logs ending in the final evaluation result.
result = await env.evaluate()
pprint(result)

{'error': None,
 'logs': 'INFO: Starting evaluation...\n'
         'INFO: Evaluating task 08d9a8b1-7b7a-4ba7-a226-4e266e13f6df...\n'
         'INFO: Evaluator configuration:\n'
         'INFO:   Metric function(s): is_expected_tabs\n'
         'INFO:   Metric conjunction: and\n'
         'INFO:   Result getter: get_open_tabs_info\n'
         'INFO:   Expected getter: get_rule\n'
         'INFO:   Metric options: {}\n'
         'INFO: Setting up post-config for evaluation...\n'
         'INFO: Evaluating single metric: is_expected_tabs\n'
         "INFO: Getting result state using config: {'type': 'open_tabs_info'}\n"
         "INFO: Getting expected state using config: {'type': 'rule', 'rules': "
         "{'type': 'url', 'urls': ['https://www.lonelyplanet.com', "
         "'https://www.airbnb.com', 'https://www.tripadvisor.com']}}\n"
         'INFO: Comparing result state with expected state\n'
         'INFO: Final evaluation result: 1\n'
         'INFO: Completed evaluation.\n'
    

In [6]:
# Make sure to close environment to avoid being charged for idle time
# This is the last use of `env` in the walkthrough. NOTE(review): presumably the
# environment stays up until closed, so run this cell even if an earlier one
# failed — confirm against the hud documentation.
await env.close()

Parallel runs for the whole dataset

In [26]:
from hud import run_job

taskset = await load_taskset("OSWorld-Ubuntu")
job = await run_job(
    ClaudeAgent,
    taskset,
    "osworld-test",
    max_steps_per_task=20,
    max_concurrent_tasks=20,
    auto_reply_question=True,
)

In [None]:
# Fetch the analytics for the job created by run_job; as the cell's last
# expression, the awaited value is shown as the cell output.
await job.get_analytics()