In [1]:
import asyncio
from hud import gym
from hud.agent import OperatorAgent  # Replace with an agent that can achieve the task
from hud import create_job, load_taskset

from hud.types import CustomGym
from pathlib import Path
from hud.task import Task
from hud.taskset import TaskSet

from dotenv import load_dotenv
import os
import json
import pprint
from openai import AsyncOpenAI

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Fail fast, naming the missing credential, instead of raising a bare
# AssertionError that gives no hint about which key is absent or empty.
for required_key in ("HUD_API_KEY", "OPENAI_API_KEY"):
    assert os.environ.get(required_key), (
        f"{required_key} is not set; export it or add it to your .env file"
    )

In [2]:
taskset = await load_taskset("hud-appflowy-taskset-368-final")
print(f"Number of tasks: {len(taskset)}")

first_task = taskset[0]
pprint.pprint(first_task.__dict__)

Number of tasks: 368
{'config': None,
 'description': None,
 'evaluate': FunctionConfig(function='grade', args=['rockbnb-add-richmond-property'], id=None, metadata=None),
 'gym': CustomGym(type='public', location='remote', image_or_build_context='156041433621.dkr.ecr.us-east-1.amazonaws.com/docker-gym:6f368c40-4812-44b0-8478-730a72d31fdd', host_config=None),
 'id': 'f560b955-a804-4b4e-a31a-43dcae30fa28',
 'metadata': None,
 'prompt': 'We need to expand our Richmond district portfolio. Please add the '
           'Richmond Garden House (identifier PROP-007) to our bookkeeping '
           'system. This 3-bedroom property in the Richmond neighborhood '
           'should be marked as Active with Master-lease host type, occupancy '
           'rate of 0.78, guest rating of 4.72, and scheduled for next '
           'turnover on 2025-05-21 at 14:00.',
 'setup': FunctionConfig(function='setup', args=['rockbnb-add-richmond-property'], id=None, metadata=None)}


In [3]:
# This will take 2-3 minutes to set up the environment
env = await gym.make(env_src=first_task)

print("Environment resetting (runs setup)...")
obs, _ = await env.reset()
print(f"Initial observation: {obs}")

client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
agent = OperatorAgent(client=client)

setup produced stderr:
[06/05/25 00:32:53] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     Starting AppFlowy server            appflowy.py:128
                    INFO     Starting PostgreSQL server for problem  setup.py:29
                             template rockbnb                                   
                    INFO     Starting PostgreSQL server...           utils.py:11
                    DEBUG    Command to run: /usr/bin/sudo -u        utils.py:25
                             postgres                                           
                             /usr/lib/postgresql/16/bin/postgres -D             
                             /var/lib/postgresql/16/main -c                     
                             config_file=/etc/postgresql/16/main/pos            
                             tgresql.conf                                       
                    INFO     Launched PostgreSQL process with PID 13 utils.py:28
     

Environment resetting (runs setup)...
Initial observation: Observation(screenshot=None, text=We need to expand our Richmond district portfolio. Please add the Richmond Garden House (identifier ...)


In [4]:
import base64
from io import BytesIO
from PIL import Image
from IPython.display import display

for i in range(8):
    print(f"========= Step {i + 1} =========")
    action, done = await agent.predict(obs)
    print(f"Agent's action: {action}")

    obs, reward, terminated, info = await env.step(action)

    if done or terminated:
        break

Agent's action: [ScreenshotFetch(type='screenshot', logs={'id': 'resp_6840e5d904b4819ead95b357b9e87c520a4ee8f45226ed74', 'created_at': 1749083609.0, 'error': None, 'incomplete_details': None, 'instructions': None, 'metadata': {}, 'model': 'computer-use-preview-2025-03-11', 'object': 'response', 'output': [{'id': 'cu_6840e5d9b404819eb99274241245c35e0a4ee8f45226ed74', 'action': {'type': 'screenshot'}, 'call_id': 'call_lW0uPwAjUHwgnuCf9QOZSigy', 'pending_safety_checks': [], 'status': 'in_progress', 'type': 'computer_call'}], 'parallel_tool_calls': True, 'temperature': 1.0, 'tool_choice': 'auto', 'tools': [{'display_height': 768, 'display_width': 1024, 'environment': 'linux', 'type': 'computer_use_preview'}], 'top_p': 1.0, 'background': False, 'max_output_tokens': None, 'previous_response_id': None, 'reasoning': {'effort': 'medium', 'generate_summary': None, 'summary': None}, 'service_tier': 'auto', 'status': 'completed', 'text': {'format': {'type': 'text'}}, 'truncation': 'auto', 'usage':

Step produced stderr: [06/05/25 00:33:32] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     #### ACTION ##### screenshot        computer.py:154
                    INFO     Taking screenshot....               computer.py:303
[06/05/25 00:33:33] INFO     #### SHELL ##### DISPLAY=:1 scrot   computer.py:352
                             -p                                                 
                             /home/ubuntu/screenshots/screenshot                
                             _56d78dcfb01a4b4d9b92c83914a025db.p                
                             ng                                                 
                    INFO     #### SHELL ##### convert            computer.py:352
                             /home/ubuntu/screenshots/screenshot                
                             _56d78dcfb01a4b4d9b92c83914a025db.p                
                             ng -resize 1280x800! -define                       
      

Agent's action: [ClickAction(type='click', logs={'id': 'resp_6840e5ddf0b8819e885ed5db2840e0eb0a4ee8f45226ed74', 'created_at': 1749083613.0, 'error': None, 'incomplete_details': None, 'instructions': None, 'metadata': {}, 'model': 'computer-use-preview-2025-03-11', 'object': 'response', 'output': [{'id': 'rs_6840e5dfabf4819e8ca2b866893955ca0a4ee8f45226ed74', 'summary': [], 'type': 'reasoning', 'encrypted_content': None, 'status': None}, {'id': 'cu_6840e5e11100819eb8a4930266315d450a4ee8f45226ed74', 'action': {'button': 'left', 'type': 'click', 'x': 80, 'y': 108}, 'call_id': 'call_D2fsnWM5oGOW8r9YLMKHF2oG', 'pending_safety_checks': [], 'status': 'completed', 'type': 'computer_call'}], 'parallel_tool_calls': True, 'temperature': 1.0, 'tool_choice': 'auto', 'tools': [{'display_height': 768, 'display_width': 1024, 'environment': 'linux', 'type': 'computer_use_preview'}], 'top_p': 1.0, 'background': False, 'max_output_tokens': None, 'previous_response_id': 'resp_6840e5d904b4819ead95b357b9e87c

Step produced stderr: [06/05/25 00:33:39] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     #### ACTION ##### left_click        computer.py:154
                    INFO     #### CLICK ##### left_click         computer.py:230
[06/05/25 00:33:40] INFO     #### SHELL ##### DISPLAY=:1 xdotool computer.py:352
                             mousemove 100 112 click 1                          
[06/05/25 00:33:41] INFO     #### SHELL ##### DISPLAY=:1 scrot   computer.py:352
                             -p                                                 
                             /home/ubuntu/screenshots/screenshot                
                             _efdae510d158415699faf32801afbf37.p                
                             ng                                                 
                    INFO     #### SHELL ##### convert            computer.py:352
                             /home/ubuntu/screenshots/screenshot                
      

Agent's action: [ClickAction(type='click', logs={'id': 'resp_6840e5e5c460819e836a889cc0276a970a4ee8f45226ed74', 'created_at': 1749083621.0, 'error': None, 'incomplete_details': None, 'instructions': None, 'metadata': {}, 'model': 'computer-use-preview-2025-03-11', 'object': 'response', 'output': [{'id': 'rs_6840e5e7ba78819e9c4c2a882f039e0b0a4ee8f45226ed74', 'summary': [], 'type': 'reasoning', 'encrypted_content': None, 'status': None}, {'id': 'cu_6840e5e89e88819e9e3e8f2e82866ac40a4ee8f45226ed74', 'action': {'button': 'left', 'type': 'click', 'x': 264, 'y': 76}, 'call_id': 'call_gp4l9ruMlenpQf9N0SiRy1cS', 'pending_safety_checks': [], 'status': 'completed', 'type': 'computer_call'}], 'parallel_tool_calls': True, 'temperature': 1.0, 'tool_choice': 'auto', 'tools': [{'display_height': 768, 'display_width': 1024, 'environment': 'linux', 'type': 'computer_use_preview'}], 'top_p': 1.0, 'background': False, 'max_output_tokens': None, 'previous_response_id': 'resp_6840e5ddf0b8819e885ed5db2840e0

Step produced stderr: [06/05/25 00:33:47] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     #### ACTION ##### left_click        computer.py:154
                    INFO     #### CLICK ##### left_click         computer.py:230
                    INFO     #### SHELL ##### DISPLAY=:1 xdotool computer.py:352
                             mousemove 330 79 click 1                           
[06/05/25 00:33:48] INFO     #### SHELL ##### DISPLAY=:1 scrot   computer.py:352
                             -p                                                 
                             /home/ubuntu/screenshots/screenshot                
                             _11063936d19e46ddb39cb543aa36c159.p                
                             ng                                                 
                    INFO     #### SHELL ##### convert            computer.py:352
                             /home/ubuntu/screenshots/screenshot                
      

Agent's action: [ClickAction(type='click', logs={'id': 'resp_6840e5ed8250819e8107bdb266af7a360a4ee8f45226ed74', 'created_at': 1749083629.0, 'error': None, 'incomplete_details': None, 'instructions': None, 'metadata': {}, 'model': 'computer-use-preview-2025-03-11', 'object': 'response', 'output': [{'id': 'cu_6840e5efa7a0819e94b99bfc4ca6c7ed0a4ee8f45226ed74', 'action': {'button': 'left', 'type': 'click', 'x': 257, 'y': 75}, 'call_id': 'call_56M0OWjTjcyeSaZ8wUxPFsZy', 'pending_safety_checks': [], 'status': 'completed', 'type': 'computer_call'}], 'parallel_tool_calls': True, 'temperature': 1.0, 'tool_choice': 'auto', 'tools': [{'display_height': 768, 'display_width': 1024, 'environment': 'linux', 'type': 'computer_use_preview'}], 'top_p': 1.0, 'background': False, 'max_output_tokens': None, 'previous_response_id': 'resp_6840e5e5c460819e836a889cc0276a970a4ee8f45226ed74', 'reasoning': {'effort': 'medium', 'generate_summary': None, 'summary': None}, 'service_tier': 'default', 'status': 'compl

Step produced stderr: [06/05/25 00:33:54] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     #### ACTION ##### left_click        computer.py:154
                    INFO     #### CLICK ##### left_click         computer.py:230
                    INFO     #### SHELL ##### DISPLAY=:1 xdotool computer.py:352
                             mousemove 321 78 click 1                           
[06/05/25 00:33:55] INFO     #### SHELL ##### DISPLAY=:1 scrot   computer.py:352
                             -p                                                 
                             /home/ubuntu/screenshots/screenshot                
                             _dc5bf5b2eadd44b68c19bb7befd6ceeb.p                
                             ng                                                 
                    INFO     #### SHELL ##### convert            computer.py:352
                             /home/ubuntu/screenshots/screenshot                
      

Agent's action: [WaitAction(type='wait', logs={'id': 'resp_6840e5f493c8819e942b00232646325e0a4ee8f45226ed74', 'created_at': 1749083636.0, 'error': None, 'incomplete_details': None, 'instructions': None, 'metadata': {}, 'model': 'computer-use-preview-2025-03-11', 'object': 'response', 'output': [{'id': 'cu_6840e5f79844819eafd803949c22d5fc0a4ee8f45226ed74', 'action': {'type': 'wait'}, 'call_id': 'call_eegrdtCuZ5XQdIuukDaOV82h', 'pending_safety_checks': [], 'status': 'completed', 'type': 'computer_call'}], 'parallel_tool_calls': True, 'temperature': 1.0, 'tool_choice': 'auto', 'tools': [{'display_height': 768, 'display_width': 1024, 'environment': 'linux', 'type': 'computer_use_preview'}], 'top_p': 1.0, 'background': False, 'max_output_tokens': None, 'previous_response_id': 'resp_6840e5ed8250819e8107bdb266af7a360a4ee8f45226ed74', 'reasoning': {'effort': 'medium', 'generate_summary': None, 'summary': None}, 'service_tier': 'default', 'status': 'completed', 'text': {'format': {'type': 'text

Step produced stderr: [06/05/25 00:34:03] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     #### ACTION ##### wait              computer.py:154
[06/05/25 00:34:04] INFO     #### SHELL ##### DISPLAY=:1 scrot   computer.py:352
                             -p                                                 
                             /home/ubuntu/screenshots/screenshot                
                             _f9be6279bb4e4feebf56995ecd971987.p                
                             ng                                                 
                    INFO     #### SHELL ##### convert            computer.py:352
                             /home/ubuntu/screenshots/screenshot                
                             _f9be6279bb4e4feebf56995ecd971987.p                
                             ng -resize 1280x800! -define                       
                             png:compression-level=9                            
      

Agent's action: [ClickAction(type='click', logs={'id': 'resp_6840e5fce0bc819e99d22f326dcd65730a4ee8f45226ed74', 'created_at': 1749083644.0, 'error': None, 'incomplete_details': None, 'instructions': None, 'metadata': {}, 'model': 'computer-use-preview-2025-03-11', 'object': 'response', 'output': [{'id': 'cu_6840e5ffb1f8819e801d9dd889730a6a0a4ee8f45226ed74', 'action': {'button': 'left', 'type': 'click', 'x': 268, 'y': 75}, 'call_id': 'call_0ElpjhSDNOiw9yV9JDt0x0Ii', 'pending_safety_checks': [], 'status': 'completed', 'type': 'computer_call'}], 'parallel_tool_calls': True, 'temperature': 1.0, 'tool_choice': 'auto', 'tools': [{'display_height': 768, 'display_width': 1024, 'environment': 'linux', 'type': 'computer_use_preview'}], 'top_p': 1.0, 'background': False, 'max_output_tokens': None, 'previous_response_id': 'resp_6840e5f493c8819e942b00232646325e0a4ee8f45226ed74', 'reasoning': {'effort': 'medium', 'generate_summary': None, 'summary': None}, 'service_tier': 'default', 'status': 'compl

Step produced stderr: [06/05/25 00:34:11] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     #### ACTION ##### left_click        computer.py:154
                    INFO     #### CLICK ##### left_click         computer.py:230
                    INFO     #### SHELL ##### DISPLAY=:1 xdotool computer.py:352
                             mousemove 335 78 click 1                           
[06/05/25 00:34:12] INFO     #### SHELL ##### DISPLAY=:1 scrot   computer.py:352
                             -p                                                 
                             /home/ubuntu/screenshots/screenshot                
                             _d35b4fc41d7746009bd4e1d7ff38313f.p                
                             ng                                                 
                    INFO     #### SHELL ##### convert            computer.py:352
                             /home/ubuntu/screenshots/screenshot                
      

Agent's action: [ClickAction(type='click', logs={'id': 'resp_6840e60536b8819e865c51bfce7f87610a4ee8f45226ed74', 'created_at': 1749083653.0, 'error': None, 'incomplete_details': None, 'instructions': None, 'metadata': {}, 'model': 'computer-use-preview-2025-03-11', 'object': 'response', 'output': [{'id': 'rs_6840e607f2bc819eb158f608197c4a360a4ee8f45226ed74', 'summary': [], 'type': 'reasoning', 'encrypted_content': None, 'status': None}, {'id': 'cu_6840e608b624819e9d68122721f0522c0a4ee8f45226ed74', 'action': {'button': 'left', 'type': 'click', 'x': 378, 'y': 76}, 'call_id': 'call_vDtbeyKzfodZVshI91EmUe2F', 'pending_safety_checks': [], 'status': 'completed', 'type': 'computer_call'}], 'parallel_tool_calls': True, 'temperature': 1.0, 'tool_choice': 'auto', 'tools': [{'display_height': 768, 'display_width': 1024, 'environment': 'linux', 'type': 'computer_use_preview'}], 'top_p': 1.0, 'background': False, 'max_output_tokens': None, 'previous_response_id': 'resp_6840e5fce0bc819e99d22f326dcd65

Step produced stderr: [06/05/25 00:34:20] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     #### ACTION ##### left_click        computer.py:154
                    INFO     #### CLICK ##### left_click         computer.py:230
                    INFO     #### SHELL ##### DISPLAY=:1 xdotool computer.py:352
                             mousemove 472 79 click 1                           
[06/05/25 00:34:21] INFO     #### SHELL ##### DISPLAY=:1 scrot   computer.py:352
                             -p                                                 
                             /home/ubuntu/screenshots/screenshot                
                             _96cd6d0b60f04a9c8573bd952db94387.p                
                             ng                                                 
                    INFO     #### SHELL ##### convert            computer.py:352
                             /home/ubuntu/screenshots/screenshot                
      

Agent's action: [ClickAction(type='click', logs={'id': 'resp_6840e60e09d0819ead08f99e46e4b0bf0a4ee8f45226ed74', 'created_at': 1749083662.0, 'error': None, 'incomplete_details': None, 'instructions': None, 'metadata': {}, 'model': 'computer-use-preview-2025-03-11', 'object': 'response', 'output': [{'id': 'rs_6840e610e258819eaf28792389b4c5890a4ee8f45226ed74', 'summary': [], 'type': 'reasoning', 'encrypted_content': None, 'status': None}, {'id': 'cu_6840e6120390819e95511f48f64e46a80a4ee8f45226ed74', 'action': {'button': 'left', 'type': 'click', 'x': 721, 'y': 390}, 'call_id': 'call_NlXgyLBsASTZRNbewkpR8gSA', 'pending_safety_checks': [], 'status': 'completed', 'type': 'computer_call'}], 'parallel_tool_calls': True, 'temperature': 1.0, 'tool_choice': 'auto', 'tools': [{'display_height': 768, 'display_width': 1024, 'environment': 'linux', 'type': 'computer_use_preview'}], 'top_p': 1.0, 'background': False, 'max_output_tokens': None, 'previous_response_id': 'resp_6840e60536b8819e865c51bfce7f8

Step produced stderr: [06/05/25 00:34:29] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     #### ACTION ##### left_click        computer.py:154
                    INFO     #### CLICK ##### left_click         computer.py:230
                    INFO     #### SHELL ##### DISPLAY=:1 xdotool computer.py:352
                             mousemove 901 406 click 1                          
[06/05/25 00:34:30] INFO     #### SHELL ##### DISPLAY=:1 scrot   computer.py:352
                             -p                                                 
                             /home/ubuntu/screenshots/screenshot                
                             _e5faf03f62c042d8a016473194011b6c.p                
                             ng                                                 
                    INFO     #### SHELL ##### convert            computer.py:352
                             /home/ubuntu/screenshots/screenshot                
      

In [5]:
# Ask the environment to evaluate the episode and print whatever it returns
# (a single float in the recorded run).
result = await env.evaluate()
print(result)

grade produced stderr:
[06/05/25 00:36:23] DEBUG    Using selector: EpollSelector selector_events.py:64
                    INFO     #### ACTION ##### left_click        computer.py:154
                    INFO     #### CLICK ##### left_click         computer.py:230
                    INFO     #### SHELL ##### DISPLAY=:1 xdotool computer.py:352
                             mousemove 1159 79 click 1                          
                    INFO     #### SHELL FINISHED, CONTENT        computer.py:364
                             BLOCKING ##### DISPLAY=:1 xdotool                  
                             mousemove 1159 79 click 1                          
                    INFO     #### ACTION ##### left_click        computer.py:154
                    INFO     #### CLICK ##### left_click         computer.py:230
                    INFO     #### SHELL ##### DISPLAY=:1 xdotool computer.py:352
                             mousemove 904 127 click 1                          
     

0.01577777777777778


In [6]:
# Close the environment now that the manual run and evaluation are finished.
# NOTE(review): presumably this releases the remote environment — confirm.
await env.close()

In [7]:
from hud import run_job

job = await run_job(
    OperatorAgent,
    first_task,
    "appflowy-first-task",
    max_steps_per_task=20,
    max_concurrent_tasks=20,
    auto_reply_question=True,
)

100%|████████████████████████████████████████| 20/20 [4:13<0:00, 4.7 steps/min]
