scaleapi · shehabyasser-scale · Jul 3, 2026
diff --git a/vero/src/vero/harbor/config.py b/vero/src/vero/harbor/config.py
@@ -23,9 +23,12 @@ class HarborConfig:
     reward_key: str | None = None  # primary reward; default pass -> reward -> mean
     # How to score a task when n_attempts > 1 produced several trials:
     #   "best": the existing behavior (clean trials preferred, then latest).
-    #   "mean": average the reward across all clean scored attempts. This is the
-    #           de-noising mode: noise shrinks ~1/sqrt(k), and the score estimates
-    #           pass probability instead of pass@k (which "best" inflates toward).
+    #   "mean": average the reward across all scored attempts, dirty or clean
+    #           (a timed-out attempt the verifier scored 0.0 still counts; only
+    #           attempts with no rewards at all are excluded). This is the
+    #           de-noising mode: noise shrinks ~1/sqrt(k), and the score
+    #           estimates pass probability instead of pass@k (which "best"
+    #           inflates toward).
     aggregate_attempts: str = "best"
     extra_args: list[str] = field(default_factory=list)  # passthrough harbor run flags
 

diff --git a/vero/src/vero/harbor/runner.py b/vero/src/vero/harbor/runner.py
@@ -261,23 +261,34 @@ def _sample_result(
             return SampleResult(
                 error=f"No Harbor trial result for task '{task_name}'.", **common
             )
-        # Mean aggregation across attempts: average the reward over every clean
-        # scored attempt (a verified 0.0 is a valid measurement; an exception is
-        # not). Falls through to the single best trial when nothing scored clean.
+        # Mean aggregation across attempts: average the reward over every SCORED
+        # attempt, dirty or clean. Harbor can record an exception (agent timeout,
+        # non-zero agent exit) and still run the verifier, so such an attempt
+        # carries a real measured 0.0; dropping it would estimate
+        # P(pass | attempt finished cleanly), which is non-monotone (one pass plus
+        # two timeouts would score 1.0) and systematically forgives candidates
+        # that make the agent slower. Only attempts with no rewards at all
+        # (failed before the verifier scored) are excluded. Falls through to the
+        # single best trial when nothing scored.
         if attempts:
-            scored = [
-                self._extract_reward((t.get("verifier_result") or {}).get("rewards"))
-                for t in attempts
-                if (t.get("verifier_result") or {}).get("rewards")
-                and not t.get("exception_info")
+            scored_trials = [
+                t for t in attempts if (t.get("verifier_result") or {}).get("rewards")
             ]
-            if scored:
+            if scored_trials:
+                scored = [
+                    self._extract_reward((t.get("verifier_result") or {}).get("rewards"))
+                    for t in scored_trials
+                ]
+                n_clean = sum(
+                    1 for t in scored_trials if not t.get("exception_info")
+                )
                 return SampleResult(
                     score=sum(scored) / len(scored),
                     metrics={
                         "reward_mean": sum(scored) / len(scored),
                         "n_attempts": float(len(attempts)),
                         "n_scored": float(len(scored)),
+                        "n_clean": float(n_clean),
                     },
                     output={
                         "task_name": task_name,

diff --git a/vero/tests/test_harbor_runner.py b/vero/tests/test_harbor_runner.py
@@ -271,9 +271,12 @@ def test_resume_with_nothing_ran_skips_guard(self, tmp_path, monkeypatch):
 
 
 class TestMeanAttemptAggregation:
-    """aggregate_attempts='mean': average the reward across clean scored
-    attempts (de-noising; estimates pass probability). Default 'best' keeps
-    the existing latest-clean behavior, which inflates toward pass@k.
+    """aggregate_attempts='mean': average the reward across every SCORED
+    attempt, dirty or clean (de-noising; estimates per-attempt pass
+    probability). Harbor can score a timed-out attempt (typically 0.0) while
+    also recording the exception; such attempts must count, or the mean
+    forgives slow candidates. Default 'best' keeps the existing latest-clean
+    behavior, which inflates toward pass@k.
     """
 
     def _write(self, run, trial, task, rewards=None, exc=False):
@@ -300,7 +303,9 @@ def test_mean_averages_clean_attempts(self, tmp_path):
         assert r.score == 0.5
         assert r.metrics["n_scored"] == 2.0
 
-    def test_mean_excludes_exception_attempts(self, tmp_path):
+    def test_mean_excludes_attempts_without_rewards(self, tmp_path):
+        # An attempt that died before the verifier scored it carries no
+        # measurement; it is excluded (but still counted in n_attempts).
         runner = HarborRunner(HarborConfig(
             task_source="org/ds", agent_import_path="p:m",
             n_attempts=2, aggregate_attempts="mean",
@@ -312,6 +317,41 @@ def test_mean_excludes_exception_attempts(self, tmp_path):
         r = runner._sample_result(groups["t0"][0], 0, "t0", _params(), attempts=groups["t0"])
         assert r.score == 1.0
         assert r.metrics["n_scored"] == 1.0
+        assert r.metrics["n_attempts"] == 2.0
+
+    def test_mean_counts_scored_exception_attempts(self, tmp_path):
+        # The live-GAIA shape: Harbor records AgentTimeoutError but still runs
+        # the verifier, so an attempt carries BOTH exception_info and a scored
+        # 0.0. Rewards [1.0 clean, 0.0 exc, 0.0 exc] must average 1/3, not 1.0.
+        runner = HarborRunner(HarborConfig(
+            task_source="org/ds", agent_import_path="p:m",
+            n_attempts=3, aggregate_attempts="mean",
+        ))
+        jobs = tmp_path / "jobs"; run = jobs / "2026-01-01__00-00-00"
+        self._write(run, "t0a", "t0", rewards={"reward": 1.0})
+        self._write(run, "t0b", "t0", rewards={"reward": 0.0}, exc=True)
+        self._write(run, "t0c", "t0", rewards={"reward": 0.0}, exc=True)
+        groups = runner._trial_groups(jobs)
+        r = runner._sample_result(groups["t0"][0], 0, "t0", _params(), attempts=groups["t0"])
+        assert r.score == pytest.approx(1 / 3)
+        assert r.metrics["n_scored"] == 3.0
+        assert r.metrics["n_clean"] == 1.0
+
+    def test_mean_over_all_dirty_attempts(self, tmp_path):
+        # Every attempt is dirty (exc=True) yet scored -- the all-timeouts live
+        # shape: the mean path must still apply, not the single-best fallback.
+        runner = HarborRunner(HarborConfig(
+            task_source="org/ds", agent_import_path="p:m",
+            n_attempts=2, aggregate_attempts="mean",
+        ))
+        jobs = tmp_path / "jobs"; run = jobs / "2026-01-01__00-00-00"
+        self._write(run, "t0a", "t0", rewards={"reward": 1.0}, exc=True)
+        self._write(run, "t0b", "t0", rewards={"reward": 0.0}, exc=True)
+        groups = runner._trial_groups(jobs)
+        r = runner._sample_result(groups["t0"][0], 0, "t0", _params(), attempts=groups["t0"])
+        assert r.score == 0.5
+        assert r.metrics["n_clean"] == 0.0
+        assert r.output["aggregate"] == "mean"
 
     def test_default_best_unchanged(self, tmp_path):
         # No attempts passed (default 'best' config): single-trial path intact.