In [None]:
%load_ext autoreload
%autoreload 2

import os
import logging

from dotenv import load_dotenv
load_dotenv()

from hud.adapters.claude.adapter import ClaudeAdapter
from hud.agent.claude import ClaudeAgent
from hud.taskset import TaskSet
from hud import gym
from hud.types import Gym
from hud.job import Job


from anthropic import Anthropic

In [2]:
# Keep notebook output quiet: the root logger reports warnings and errors only.
# The level is given by name; logging resolves "WARNING" to logging.WARNING.
logging.basicConfig(level="WARNING")

In [None]:
taskset = await TaskSet.load(taskset_id="OSWorld-Ubuntu-Links")

print(f"Total tasks in OSWorld: {len(taskset.tasks)}")

In [4]:
# Point every task at the local OSWorld gym and convert it to the SDK01 format.
# TODO: should be done server side.
# NOTE(review): re-running this cell calls convert_sdk01() again on tasks that
# were already converted -- confirm that is safe, or reload the taskset first.
local_gym_name = "Local-OSWorld-Ubuntu"
for task in taskset.tasks:
    # A fresh Gym object per task, exactly as before (no instance is shared).
    task.gym = Gym(name_or_id=local_gym_name)
    task.convert_sdk01()

In [None]:
# (optional) create a job that will host all evaluations
job = await Job.create(name="Claude-test-OSWorld", gym_id="OSWorld-Ubuntu")

# Load Task 1 of OSWorld
# It may take around ~4 minutes to initialize the OSWorld environment and reset to a task
env = await gym.make(taskset.tasks[1], job_id=job.id)

In [None]:
# reset to a task with an observation (screenshot and text)
obs, info = await env.reset()
print(f"Task description: {obs.text}")

In [None]:
# agent
anthropic = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
agent = ClaudeAgent(anthropic)

# agent loop
for i in range(8):
    print(f"========= Step {i+1} =========")
    action, done = await agent.predict(obs)
    print(f"Agent's action: {action}")

    if done:
        print("done")
        break

    obs, reward, terminated, info = await env.step(action)

    if terminated:
        print("terminated")
        break


In [None]:
# evaluate environment state
result = await env.evaluate()
print(f"Evaluation result: {result[0]['logs']}")

In [None]:
# Close the environment to avoid being charged for idle time.
# Among the cells shown here this is the only place the environment is closed,
# so run this cell even when an earlier cell failed or was interrupted.
await env.close()