Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -354,14 +354,16 @@ Browser agents are only useful if they remain practical to run. OpenBrowser ther
### Build Commands

```bash
# Extension development build with watch
# Extension development build with watch + auto-reload
cd extension
npm run dev

# TypeScript type checking
npm run typecheck
```

`npm run dev` watches for file changes, rebuilds, and **automatically reloads the extension in Chrome** — no manual reload on `chrome://extensions` needed after the first install. Production builds (`npm run build`) strip all dev-reload code.

### Project Structure

```
Expand Down
2 changes: 1 addition & 1 deletion eval/dataset/gmail_exec_followup.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: "Gmail Finance Follow-up"
difficulty: hard
description: "Use Gmail-style search operators to find the exact finance board-packet thread, label it correctly, reply with exact text, attach the correct PDF, and send."
start_url: "http://localhost:16605/gmail/?reset=1"
instruction: 'In the Gmail mock, use search operators to find the correct finance board-packet thread from Mira Lin at aldercap.com. Open that exact thread, add it to the Finance/Board-Prep label, reply with this exact sentence: "Approved for Friday board prep. Please route the signed PDF back before 4 PM." Attach the mock PDF named Board-Pack-Revision-C.pdf and send the reply.'
instruction: 'In the Gmail mock, use search operators to find the correct finance board packet thread from Mira Lin at aldercap.com. Open that exact thread, add it to the Finance/Board-Prep label, reply with this exact sentence: "Approved for Friday board prep. Please route the signed PDF back before 4 PM." Attach the mock PDF named Board-Pack-Revision-C.pdf and send the reply.'
time_limit: 660.0
cost_limit: 1.4

Expand Down
185 changes: 172 additions & 13 deletions eval/evaluate_browser_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@

import requests
import yaml
from requests.exceptions import (
ChunkedEncodingError,
ConnectionError as RequestsConnectionError,
ReadTimeout,
)
from urllib3.exceptions import ProtocolError

logger = logging.getLogger(__name__)

Expand All @@ -39,6 +45,21 @@
EVAL_SERVER_PORT = 16605
OPENBROWSER_PORT = 8765

# SSE streaming timeouts for the agent channel at :8765.
# (connect_timeout, read_timeout) in seconds.
# The read timeout applies per-chunk: we need it longer than the slowest LLM
# turn so a slow turn doesn't abort mid-stream. The test-level wall-clock is
# still enforced by the outer thread-join in send_message.
# Override with OPENBROWSER_SSE_READ_TIMEOUT (seconds).
SSE_CONNECT_TIMEOUT = 30
SSE_READ_TIMEOUT = int(os.environ.get("OPENBROWSER_SSE_READ_TIMEOUT", "600"))

# API-stall detection & retry. After a test times out, we scan its SSE
# events for observation→action gaps exceeding this threshold. If found,
# the test is re-queued (up to API_STALL_MAX_RETRIES times).
API_STALL_THRESHOLD = float(os.environ.get("OPENBROWSER_API_STALL_THRESHOLD", "60"))
API_STALL_MAX_RETRIES = int(os.environ.get("OPENBROWSER_API_STALL_MAX_RETRIES", "3"))

# Paths
EVAL_DIR = Path(__file__).parent
DATASET_DIR = EVAL_DIR / "dataset"
Expand All @@ -52,6 +73,57 @@
DEFAULT_SESSION_DB_PATH = Path.home() / ".openbrowser" / "sessions.db"


def detect_api_stalls(
    sse_events: List[Dict[str, Any]],
    threshold: float = API_STALL_THRESHOLD,
) -> List[float]:
    """Scan SSE events for observation→action gaps that exceed *threshold*.

    Returns the durations (seconds) of every gap where the model API took
    unreasonably long to answer an observation. Empty list = no stalls.

    Two cases are covered:
      1. A completed ObservationEvent→ActionEvent pair whose time delta
         meets the threshold.
      2. A "trailing stall": the stream ends on an ObservationEvent and
         the final timestamp anywhere in the stream is at least
         *threshold* seconds later (the test was killed while waiting
         for a model response, so no Action ever arrived).
    """
    # Collect (timestamp, type) entries for action/observation events,
    # while tracking the latest timestamp seen across *all* events.
    timeline: list[tuple[float, str]] = []
    stream_end: float = 0.0
    for event in sse_events:
        stamp = event.get("timestamp")
        if stamp:
            stream_end = max(stream_end, float(stamp))
        payload = event.get("data")
        if not isinstance(payload, dict):
            continue
        kind = payload.get("type", "")
        if stamp and kind in ("ActionEvent", "ObservationEvent"):
            timeline.append((float(stamp), kind))

    # Stable sort by timestamp so consecutive entries reflect stream order.
    timeline.sort(key=lambda entry: entry[0])

    # Case 1: completed observation→action pairs with a long delta.
    stalls: list[float] = [
        later_ts - earlier_ts
        for (earlier_ts, earlier_kind), (later_ts, later_kind) in zip(
            timeline, timeline[1:]
        )
        if earlier_kind == "ObservationEvent"
        and later_kind == "ActionEvent"
        and later_ts - earlier_ts >= threshold
    ]

    # Case 2: trailing stall — stream ended while still awaiting an action.
    if timeline and timeline[-1][1] == "ObservationEvent" and stream_end:
        tail_gap = stream_end - timeline[-1][0]
        if tail_gap >= threshold:
            stalls.append(tail_gap)

    return stalls


@dataclass
class TestCase:
"""A test case definition"""
Expand Down Expand Up @@ -314,23 +386,44 @@ def send_message(
timed_out = False
response_holder: Dict[str, Any] = {"response": None, "aborted": False}

def _open_stream(sess: requests.Session) -> requests.Response:
    # One streaming POST to the agent's message endpoint. Retry and
    # abort policy live in the caller; this only opens the stream.
    endpoint = (
        f"{self.base_url}/agent/conversations/{conversation_id}/messages"
    )
    payload = {
        "text": message,
        "cwd": cwd,
        "browser_id": self.chrome_uuid,
    }
    # timeout is (connect, read): the read timeout gates each chunk and
    # must exceed the slowest single LLM turn. The overall wall-clock
    # limit is enforced via the thread-join outside this helper.
    return sess.post(
        endpoint,
        json=payload,
        stream=True,
        headers={"Accept": "text/event-stream"},
        timeout=(SSE_CONNECT_TIMEOUT, SSE_READ_TIMEOUT),
    )

def _collect_events() -> None:
nonlocal error
response = None
local_session = requests.Session()
local_session.trust_env = False
try:
response = local_session.post(
f"{self.base_url}/agent/conversations/{conversation_id}/messages",
json={
"text": message,
"cwd": cwd,
"browser_id": self.chrome_uuid,
},
stream=True,
headers={"Accept": "text/event-stream"},
timeout=90, # Per-read timeout; test-level timeout is handled outside.
)
try:
response = _open_stream(local_session)
except (RequestsConnectionError, ReadTimeout) as connect_err:
# Pre-stream failure (agent server not accepting or slow to
# start): one backoff retry before we declare the run dead.
logger.warning(
"SSE open failed (%s); retrying once after 2s backoff",
connect_err,
)
if response_holder["aborted"]:
return
time.sleep(2.0)
# Outer wall-clock may have fired while we were sleeping.
if response_holder["aborted"]:
return
response = _open_stream(local_session)
response_holder["response"] = response
response.raise_for_status()

Expand Down Expand Up @@ -412,6 +505,22 @@ def _collect_events() -> None:
"Conversation completed but no usage_metrics event received"
)

except (
RequestsConnectionError,
ReadTimeout,
ChunkedEncodingError,
ProtocolError,
) as e:
if response_holder["aborted"]:
logger.info(
"Stopped SSE collection after hitting the test time limit"
)
else:
error = (
f"SSE transport error on :{OPENBROWSER_PORT} after "
f"read_timeout={SSE_READ_TIMEOUT}s: {e}"
)
logger.error(error)
except Exception as e:
if response_holder["aborted"]:
logger.info(
Expand Down Expand Up @@ -2073,6 +2182,8 @@ def _run_scheduled_jobs(
busy_sites: set[str] = set()
in_flight: Dict[Any, ScheduledJob] = {}
target_order = list(range(len(targets)))
# Track retries per (target_index, test_index) for API-stall detection.
retry_counts: Dict[Tuple[int, int], int] = {}

with ThreadPoolExecutor(max_workers=max_parallel) as executor:
while True:
Expand Down Expand Up @@ -2116,6 +2227,10 @@ def _run_scheduled_jobs(
scheduled_job.site_bucket,
)

# Re-check pending after processing completions (retries
# may have re-queued jobs).
pending_jobs = any(jobs for jobs in jobs_by_target.values())

if not in_flight and not pending_jobs:
break

Expand All @@ -2132,16 +2247,60 @@ def _run_scheduled_jobs(
busy_sites.discard(job.site_bucket)

result = future.result()
results_by_target[job.target_index][job.test_index] = result

status = "PASSED" if result.passed else "FAILED"

# --- API-stall retry logic ---
# If the test timed out, check whether Dashscope API
# stalls (observation→action gaps > threshold) caused it.
# If so, re-queue for another attempt.
job_key = (job.target_index, job.test_index)
retries_so_far = retry_counts.get(job_key, 0)
should_retry = False

timed_out = (
result.duration is not None
and result.duration >= job.test_case.time_limit
)
if timed_out and retries_so_far < API_STALL_MAX_RETRIES:
stalls = detect_api_stalls(
result.sse_events,
threshold=API_STALL_THRESHOLD,
)
if stalls:
should_retry = True
retry_counts[job_key] = retries_so_far + 1
stall_summary = ", ".join(
f"{s:.0f}s" for s in sorted(stalls, reverse=True)[:3]
)
logger.info(
"API stall detected in '%s' for '%s' "
"(stalls: %s). Retry %d/%d.",
job.test_case.name,
job.model_key,
stall_summary,
retries_so_far + 1,
API_STALL_MAX_RETRIES,
)
# Re-queue at the front of the target's pending
# list so it runs again soon.
jobs_by_target[job.target_index].insert(0, job)

if not should_retry:
results_by_target[job.target_index][job.test_index] = result

logger.info(
"Completed test '%s' for model '%s': %s %.1f/%.1f",
"Completed test '%s' for model '%s': %s %.1f/%.1f%s",
job.test_case.name,
job.model_key,
status,
result.score,
result.max_score,
(
f" [retrying due to API stall, attempt {retries_so_far + 1}]"
if should_retry
else ""
),
)

return {
Expand Down
Loading
Loading