Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -354,14 +354,16 @@ Browser agents are only useful if they remain practical to run. OpenBrowser ther
### Build Commands

```bash
# Extension development build with watch
# Extension development build with watch + auto-reload
cd extension
npm run dev

# TypeScript type checking
npm run typecheck
```

`npm run dev` watches for file changes, rebuilds, and **automatically reloads the extension in Chrome** — no manual reload on `chrome://extensions` needed after the first install. Production builds (`npm run build`) strip all dev-reload code.

### Project Structure

```
Expand Down
2 changes: 1 addition & 1 deletion eval/dataset/gmail_exec_followup.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: "Gmail Finance Follow-up"
difficulty: hard
description: "Use Gmail-style search operators to find the exact finance board-packet thread, label it correctly, reply with exact text, attach the correct PDF, and send."
start_url: "http://localhost:16605/gmail/?reset=1"
instruction: 'In the Gmail mock, use search operators to find the correct finance board-packet thread from Mira Lin at aldercap.com. Open that exact thread, add it to the Finance/Board-Prep label, reply with this exact sentence: "Approved for Friday board prep. Please route the signed PDF back before 4 PM." Attach the mock PDF named Board-Pack-Revision-C.pdf and send the reply.'
instruction: 'In the Gmail mock, use search operators to find the correct finance board packet thread from Mira Lin at aldercap.com. Open that exact thread, add it to the Finance/Board-Prep label, reply with this exact sentence: "Approved for Friday board prep. Please route the signed PDF back before 4 PM." Attach the mock PDF named Board-Pack-Revision-C.pdf and send the reply.'
time_limit: 660.0
cost_limit: 1.4

Expand Down
185 changes: 172 additions & 13 deletions eval/evaluate_browser_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@

import requests
import yaml
from requests.exceptions import (
ChunkedEncodingError,
ConnectionError as RequestsConnectionError,
ReadTimeout,
)
from urllib3.exceptions import ProtocolError

logger = logging.getLogger(__name__)

Expand All @@ -39,6 +45,21 @@
EVAL_SERVER_PORT = 16605
OPENBROWSER_PORT = 8765

# SSE streaming timeouts for the agent channel at :8765.
# (connect_timeout, read_timeout) in seconds.
# The read timeout applies per-chunk: we need it longer than the slowest LLM
# turn so a slow turn doesn't abort mid-stream. The test-level wall-clock is
# still enforced by the outer thread-join in send_message.
# Override with OPENBROWSER_SSE_READ_TIMEOUT (seconds).
SSE_CONNECT_TIMEOUT = 30
SSE_READ_TIMEOUT = int(os.environ.get("OPENBROWSER_SSE_READ_TIMEOUT", "600"))

# API-stall detection & retry. After a test times out, we scan its SSE
# events for observation→action gaps exceeding this threshold. If found,
# the test is re-queued (up to API_STALL_MAX_RETRIES times).
API_STALL_THRESHOLD = float(os.environ.get("OPENBROWSER_API_STALL_THRESHOLD", "60"))
API_STALL_MAX_RETRIES = int(os.environ.get("OPENBROWSER_API_STALL_MAX_RETRIES", "3"))

# Paths
EVAL_DIR = Path(__file__).parent
DATASET_DIR = EVAL_DIR / "dataset"
Expand All @@ -52,6 +73,57 @@
DEFAULT_SESSION_DB_PATH = Path.home() / ".openbrowser" / "sessions.db"


def detect_api_stalls(
    sse_events: List[Dict[str, Any]],
    threshold: float = API_STALL_THRESHOLD,
) -> List[float]:
    """Scan SSE events for observation→action gaps that exceed *threshold*.

    Returns the durations (seconds) of every gap where the model API took
    unreasonably long to answer an observation. Empty list = no stalls.

    Two cases are covered:
      1. A completed ObservationEvent→ActionEvent pair whose time delta
         meets the threshold.
      2. A "trailing stall": the stream ends on an ObservationEvent and
         the final timestamp anywhere in the stream is at least
         *threshold* seconds later (the test was killed while waiting
         for a model response, so no Action ever arrived).
    """
    # Collect (timestamp, type) entries for action/observation events,
    # while tracking the latest timestamp seen across *all* events.
    timeline: list[tuple[float, str]] = []
    stream_end: float = 0.0
    for event in sse_events:
        stamp = event.get("timestamp")
        if stamp:
            stream_end = max(stream_end, float(stamp))
        payload = event.get("data")
        if not isinstance(payload, dict):
            continue
        kind = payload.get("type", "")
        if stamp and kind in ("ActionEvent", "ObservationEvent"):
            timeline.append((float(stamp), kind))

    # Stable sort by timestamp so consecutive entries reflect stream order.
    timeline.sort(key=lambda entry: entry[0])

    # Case 1: completed observation→action pairs with a long delta.
    stalls: list[float] = [
        later_ts - earlier_ts
        for (earlier_ts, earlier_kind), (later_ts, later_kind) in zip(
            timeline, timeline[1:]
        )
        if earlier_kind == "ObservationEvent"
        and later_kind == "ActionEvent"
        and later_ts - earlier_ts >= threshold
    ]

    # Case 2: trailing stall — stream ended while still awaiting an action.
    if timeline and timeline[-1][1] == "ObservationEvent" and stream_end:
        tail_gap = stream_end - timeline[-1][0]
        if tail_gap >= threshold:
            stalls.append(tail_gap)

    return stalls


@dataclass
class TestCase:
"""A test case definition"""
Expand Down Expand Up @@ -314,23 +386,44 @@ def send_message(
timed_out = False
response_holder: Dict[str, Any] = {"response": None, "aborted": False}

def _open_stream(sess: requests.Session) -> requests.Response:
    # One streaming POST to the agent's message endpoint. Retry and
    # abort policy live in the caller; this only opens the stream.
    endpoint = (
        f"{self.base_url}/agent/conversations/{conversation_id}/messages"
    )
    payload = {
        "text": message,
        "cwd": cwd,
        "browser_id": self.chrome_uuid,
    }
    # timeout is (connect, read): the read timeout gates each chunk and
    # must exceed the slowest single LLM turn. The overall wall-clock
    # limit is enforced via the thread-join outside this helper.
    return sess.post(
        endpoint,
        json=payload,
        stream=True,
        headers={"Accept": "text/event-stream"},
        timeout=(SSE_CONNECT_TIMEOUT, SSE_READ_TIMEOUT),
    )

def _collect_events() -> None:
nonlocal error
response = None
local_session = requests.Session()
local_session.trust_env = False
try:
response = local_session.post(
f"{self.base_url}/agent/conversations/{conversation_id}/messages",
json={
"text": message,
"cwd": cwd,
"browser_id": self.chrome_uuid,
},
stream=True,
headers={"Accept": "text/event-stream"},
timeout=90, # Per-read timeout; test-level timeout is handled outside.
)
try:
response = _open_stream(local_session)
except (RequestsConnectionError, ReadTimeout) as connect_err:
# Pre-stream failure (agent server not accepting or slow to
# start): one backoff retry before we declare the run dead.
logger.warning(
"SSE open failed (%s); retrying once after 2s backoff",
connect_err,
)
if response_holder["aborted"]:
return
time.sleep(2.0)
# Outer wall-clock may have fired while we were sleeping.
if response_holder["aborted"]:
return
response = _open_stream(local_session)
response_holder["response"] = response
response.raise_for_status()

Expand Down Expand Up @@ -412,6 +505,22 @@ def _collect_events() -> None:
"Conversation completed but no usage_metrics event received"
)

except (
RequestsConnectionError,
ReadTimeout,
ChunkedEncodingError,
ProtocolError,
) as e:
if response_holder["aborted"]:
logger.info(
"Stopped SSE collection after hitting the test time limit"
)
else:
error = (
f"SSE transport error on :{OPENBROWSER_PORT} after "
f"read_timeout={SSE_READ_TIMEOUT}s: {e}"
)
logger.error(error)
except Exception as e:
if response_holder["aborted"]:
logger.info(
Expand Down Expand Up @@ -2073,6 +2182,8 @@ def _run_scheduled_jobs(
busy_sites: set[str] = set()
in_flight: Dict[Any, ScheduledJob] = {}
target_order = list(range(len(targets)))
# Track retries per (target_index, test_index) for API-stall detection.
retry_counts: Dict[Tuple[int, int], int] = {}

with ThreadPoolExecutor(max_workers=max_parallel) as executor:
while True:
Expand Down Expand Up @@ -2116,6 +2227,10 @@ def _run_scheduled_jobs(
scheduled_job.site_bucket,
)

# Re-check pending after processing completions (retries
# may have re-queued jobs).
pending_jobs = any(jobs for jobs in jobs_by_target.values())

if not in_flight and not pending_jobs:
break

Expand All @@ -2132,16 +2247,60 @@ def _run_scheduled_jobs(
busy_sites.discard(job.site_bucket)

result = future.result()
results_by_target[job.target_index][job.test_index] = result

status = "PASSED" if result.passed else "FAILED"

# --- API-stall retry logic ---
# If the test timed out, check whether Dashscope API
# stalls (observation→action gaps > threshold) caused it.
# If so, re-queue for another attempt.
job_key = (job.target_index, job.test_index)
retries_so_far = retry_counts.get(job_key, 0)
should_retry = False

timed_out = (
result.duration is not None
and result.duration >= job.test_case.time_limit
)
if timed_out and retries_so_far < API_STALL_MAX_RETRIES:
stalls = detect_api_stalls(
result.sse_events,
threshold=API_STALL_THRESHOLD,
)
if stalls:
should_retry = True
retry_counts[job_key] = retries_so_far + 1
stall_summary = ", ".join(
f"{s:.0f}s" for s in sorted(stalls, reverse=True)[:3]
)
logger.info(
"API stall detected in '%s' for '%s' "
"(stalls: %s). Retry %d/%d.",
job.test_case.name,
job.model_key,
stall_summary,
retries_so_far + 1,
API_STALL_MAX_RETRIES,
)
# Re-queue at the front of the target's pending
# list so it runs again soon.
jobs_by_target[job.target_index].insert(0, job)

if not should_retry:
results_by_target[job.target_index][job.test_index] = result

logger.info(
"Completed test '%s' for model '%s': %s %.1f/%.1f",
"Completed test '%s' for model '%s': %s %.1f/%.1f%s",
job.test_case.name,
job.model_key,
status,
result.score,
result.max_score,
(
f" [retrying due to API stall, attempt {retries_so_far + 1}]"
if should_retry
else ""
),
)

return {
Expand Down
Loading
Loading