Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions vero/src/vero/harbor/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,12 @@ class HarborConfig:
reward_key: str | None = None # primary reward; default pass -> reward -> mean
# How to score a task when n_attempts > 1 produced several trials:
# "best": the existing behavior (clean trials preferred, then latest).
# "mean": average the reward across all clean scored attempts. This is the
# de-noising mode: noise shrinks ~1/sqrt(k), and the score estimates
# pass probability instead of pass@k (which "best" inflates toward).
# "mean": average the reward across all scored attempts, dirty or clean
# (a timed-out attempt the verifier scored 0.0 still counts; only
# attempts with no rewards at all are excluded). This is the
# de-noising mode: noise shrinks ~1/sqrt(k), and the score
# estimates pass probability instead of pass@k (which "best"
# inflates toward).
aggregate_attempts: str = "best"
extra_args: list[str] = field(default_factory=list) # passthrough harbor run flags

Expand Down
29 changes: 20 additions & 9 deletions vero/src/vero/harbor/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,23 +261,34 @@ def _sample_result(
return SampleResult(
error=f"No Harbor trial result for task '{task_name}'.", **common
)
# Mean aggregation across attempts: average the reward over every clean
# scored attempt (a verified 0.0 is a valid measurement; an exception is
# not). Falls through to the single best trial when nothing scored clean.
# Mean aggregation across attempts: average the reward over every SCORED
# attempt, dirty or clean. Harbor can record an exception (agent timeout,
# non-zero agent exit) and still run the verifier, so such an attempt
# carries a real measured 0.0; dropping it would estimate
# P(pass | attempt finished cleanly), which is non-monotone (one pass plus
# two timeouts would score 1.0) and systematically forgives candidates
# that make the agent slower. Only attempts with no rewards at all
# (failed before the verifier scored) are excluded. Falls through to the
# single best trial when nothing scored.
if attempts:
scored = [
self._extract_reward((t.get("verifier_result") or {}).get("rewards"))
for t in attempts
if (t.get("verifier_result") or {}).get("rewards")
and not t.get("exception_info")
scored_trials = [
t for t in attempts if (t.get("verifier_result") or {}).get("rewards")
]
if scored:
if scored_trials:
scored = [
self._extract_reward((t.get("verifier_result") or {}).get("rewards"))
for t in scored_trials
]
n_clean = sum(
1 for t in scored_trials if not t.get("exception_info")
)
return SampleResult(
score=sum(scored) / len(scored),
metrics={
"reward_mean": sum(scored) / len(scored),
"n_attempts": float(len(attempts)),
"n_scored": float(len(scored)),
"n_clean": float(n_clean),
},
output={
"task_name": task_name,
Expand Down
48 changes: 44 additions & 4 deletions vero/tests/test_harbor_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,9 +271,12 @@ def test_resume_with_nothing_ran_skips_guard(self, tmp_path, monkeypatch):


class TestMeanAttemptAggregation:
"""aggregate_attempts='mean': average the reward across clean scored
attempts (de-noising; estimates pass probability). Default 'best' keeps
the existing latest-clean behavior, which inflates toward pass@k.
"""aggregate_attempts='mean': average the reward across every SCORED
attempt, dirty or clean (de-noising; estimates per-attempt pass
probability). Harbor scores timed-out attempts 0.0 while also recording
the exception; those must count, or the mean forgives slow candidates.
Default 'best' keeps the existing latest-clean behavior, which inflates
toward pass@k.
"""

def _write(self, run, trial, task, rewards=None, exc=False):
Expand All @@ -300,7 +303,9 @@ def test_mean_averages_clean_attempts(self, tmp_path):
assert r.score == 0.5
assert r.metrics["n_scored"] == 2.0

def test_mean_excludes_exception_attempts(self, tmp_path):
def test_mean_excludes_attempts_without_rewards(self, tmp_path):
# An attempt that died before the verifier scored it carries no
# measurement; it is excluded (but still counted in n_attempts).
runner = HarborRunner(HarborConfig(
task_source="org/ds", agent_import_path="p:m",
n_attempts=2, aggregate_attempts="mean",
Expand All @@ -312,6 +317,41 @@ def test_mean_excludes_exception_attempts(self, tmp_path):
r = runner._sample_result(groups["t0"][0], 0, "t0", _params(), attempts=groups["t0"])
assert r.score == 1.0
assert r.metrics["n_scored"] == 1.0
assert r.metrics["n_attempts"] == 2.0

def test_mean_counts_scored_exception_attempts(self, tmp_path):
    # Live-GAIA shape: harbor records AgentTimeoutError yet still runs the
    # verifier, so an attempt can carry BOTH exception_info and a scored
    # 0.0. With [1.0 clean, 0.0 timeout, 0.0 timeout] the mean must come
    # out as 1/3 rather than 1.0.
    config = HarborConfig(
        task_source="org/ds",
        agent_import_path="p:m",
        n_attempts=3,
        aggregate_attempts="mean",
    )
    runner = HarborRunner(config)

    jobs_dir = tmp_path / "jobs"
    run_dir = jobs_dir / "2026-01-01__00-00-00"
    # One clean pass followed by two scored-but-failed (exc=True) attempts.
    self._write(run_dir, "t0a", "t0", rewards={"reward": 1.0})
    for trial_name in ("t0b", "t0c"):
        self._write(run_dir, trial_name, "t0", rewards={"reward": 0.0}, exc=True)

    task_trials = runner._trial_groups(jobs_dir)["t0"]
    result = runner._sample_result(
        task_trials[0], 0, "t0", _params(), attempts=task_trials
    )

    assert result.score == pytest.approx(1 / 3)
    # All three attempts were scored; only the first finished cleanly.
    assert result.metrics["n_scored"] == 3.0
    assert result.metrics["n_clean"] == 1.0

def test_mean_over_all_dirty_attempts(self, tmp_path):
    # All-timeouts live shape: every attempt raised but was still scored.
    # The mean path has to handle this itself instead of dropping through
    # to the single-best-trial fallback.
    config = HarborConfig(
        task_source="org/ds",
        agent_import_path="p:m",
        n_attempts=2,
        aggregate_attempts="mean",
    )
    runner = HarborRunner(config)

    jobs_dir = tmp_path / "jobs"
    run_dir = jobs_dir / "2026-01-01__00-00-00"
    # Both attempts are written with exc=True; one scored 1.0, one 0.0.
    self._write(run_dir, "t0a", "t0", rewards={"reward": 1.0}, exc=True)
    self._write(run_dir, "t0b", "t0", rewards={"reward": 0.0}, exc=True)

    task_trials = runner._trial_groups(jobs_dir)["t0"]
    result = runner._sample_result(
        task_trials[0], 0, "t0", _params(), attempts=task_trials
    )

    assert result.score == 0.5
    # No attempt was clean, yet the result is still a mean aggregate.
    assert result.metrics["n_clean"] == 0.0
    assert result.output["aggregate"] == "mean"

def test_default_best_unchanged(self, tmp_path):
# No attempts passed (default 'best' config): single-trial path intact.
Expand Down