Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions vero/src/vero/harbor/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ class ServeConfig(BaseModel):
targets: list[_TargetCfg] = Field(default_factory=list)
base_commit: str | None = None
submit_enabled: bool = False
# Also admin-score the unmodified baseline on every target at finalize and
# write it to <admin_volume>/baseline.json: makes regressions visible
# (an optimized candidate can score WORSE than the untouched baseline).
score_baseline: bool = False

# volumes / token
agent_volume: str
Expand Down Expand Up @@ -208,6 +212,7 @@ async def build_components(config: ServeConfig) -> tuple[EvaluationSidecar, Veri
base_commit=config.base_commit,
selection_task=config.task,
selection_dataset_id=config.dataset_id,
score_baseline=config.score_baseline,
)

token = generate_token()
Expand Down
51 changes: 51 additions & 0 deletions vero/src/vero/harbor/verifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def __init__(
selection_task: str | None = None,
selection_dataset_id: str | None = None,
rescore_top_k: int = 3,
score_baseline: bool = False,
):
self.engine = engine
self.admin_volume = Path(admin_volume)
Expand All @@ -65,6 +66,7 @@ def __init__(
self.selection_task = selection_task
self.selection_dataset_id = selection_dataset_id
self.rescore_top_k = rescore_top_k
self.score_baseline = score_baseline

async def finalize(self) -> dict[str, float]:
"""Select the commit and score it on every target -> {reward_key: score}.
Expand Down Expand Up @@ -102,8 +104,57 @@ async def finalize(self) -> dict[str, float]:
rewards[target.reward_key] = (
float(score) if score is not None else default_minimum_score
)
await self._maybe_score_baseline(rewards)
return rewards

async def _maybe_score_baseline(self, rewards: dict[str, float]) -> None:
    """Admin-score the unmodified baseline on every target and persist it.

    An optimized candidate can score WORSE than the untouched baseline
    (observed live: a weak inner model went 0.3 -> 0.2 after optimization);
    without this, the regression is invisible because auto_best excludes the
    baseline from selection and nothing else ever scores it. The scores are
    written to <admin_volume>/baseline.json (NOT into reward.json, whose keys
    the outer harness consumes) and logged next to the candidate's rewards.

    This is strictly best-effort: any exception raised while scoring,
    persisting or logging is caught and logged, so it never fails the trial.

    Args:
        rewards: Candidate scores keyed by reward key. Only read here, to log
            each one next to its baseline; never modified.

    Returns:
        None. The outcome is the baseline.json file and the log records.
    """
    if not self.score_baseline:
        return
    if not self.base_commit:
        # Misconfiguration must not be a silent no-op: the operator asked
        # for baseline scoring and would otherwise never learn it is off.
        logger.warning(
            "score_baseline=True but base_commit is not set; skipping "
            "baseline scoring."
        )
        return
    try:
        baselines: dict[str, float] = {}
        for target in self.targets:
            # Same admin evaluation as for the candidate, but pinned to the
            # untouched base commit.
            exp = await self.engine.evaluate_admin(
                task=target.task,
                dataset_id=target.dataset_id,
                split=target.split,
                commit=self.base_commit,
                sample_ids=target.sample_ids,
            )
            score = exp.result.score()
            # A missing score falls back to the module-level minimum.
            baselines[target.reward_key] = (
                float(score) if score is not None else default_minimum_score
            )
        self.admin_volume.mkdir(parents=True, exist_ok=True)
        # Explicit encoding keeps the artifact independent of the host locale.
        (self.admin_volume / "baseline.json").write_text(
            json.dumps(baselines, indent=2), encoding="utf-8"
        )
        for key, value in rewards.items():
            base = baselines.get(key)
            # Only flag a regression when a baseline exists for this key.
            tag = (
                " (REGRESSION vs baseline)"
                if base is not None and value < base
                else ""
            )
            logger.info(
                "finalize: %s=%s baseline=%s%s", key, value, base, tag
            )
    except Exception:
        # Deliberately broad: baseline scoring is diagnostic only.
        logger.exception("baseline scoring failed; reward.json is unaffected")

async def _select_commit(self) -> str:
if self.reward_mode == "submit":
return self._submitted_commit()
Expand Down
81 changes: 81 additions & 0 deletions vero/tests/test_harbor_verifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,3 +239,84 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
rewards = await v.finalize()
assert rewards == {"accuracy": 0.5}
assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"


class TestBaselineAtFinalize:
    """Finalize-time baseline scoring, enabled with score_baseline=True.

    With the flag on, finalize also admin-scores the untouched baseline and
    persists it to admin_volume/baseline.json, so regressions are visible
    (observed live: optimization took a weak model from 0.3 to 0.2 and nothing
    surfaced it). reward.json keys are unaffected.
    """

    @staticmethod
    def _submit_verifier(tmp_path, engine, **verifier_kwargs):
        """Write a submission for commit 'cand' and build a submit-mode Verifier.

        The verifier has a single validation target whose reward key is
        "accuracy"; extra keyword arguments are forwarded to Verifier.
        """
        submission_file = tmp_path / "submission.json"
        submission_file.write_text(json.dumps({"commit": "cand"}))
        accuracy_target = VerificationTarget(
            task=None,
            dataset_id="ds",
            split="validation",
            reward_key="accuracy",
        )
        return Verifier(
            engine=engine,
            admin_volume=tmp_path,
            reward_mode="submit",
            targets=[accuracy_target],
            **verifier_kwargs,
        )

    @pytest.mark.asyncio
    async def test_baseline_scored_and_persisted(self, tmp_path):
        # First score: candidate target eval; second score: baseline eval.
        engine = _engine([0.2, 0.3])
        verifier = self._submit_verifier(
            tmp_path, engine, base_commit="base", score_baseline=True
        )

        rewards = await verifier.finalize()

        # The reward.json content is unchanged by baseline scoring.
        assert rewards == {"accuracy": 0.2}
        persisted = json.loads((tmp_path / "baseline.json").read_text())
        assert persisted == {"accuracy": 0.3}
        # The last admin eval targeted the baseline commit.
        final_call = engine.evaluate_admin.await_args_list[-1]
        assert final_call.kwargs["commit"] == "base"

    @pytest.mark.asyncio
    async def test_default_off_no_extra_evals(self, tmp_path):
        engine = _engine([0.9])
        verifier = self._submit_verifier(tmp_path, engine, base_commit="base")

        rewards = await verifier.finalize()

        assert rewards == {"accuracy": 0.9}
        assert engine.evaluate_admin.await_count == 1
        baseline_file = tmp_path / "baseline.json"
        assert not baseline_file.exists()

    @pytest.mark.asyncio
    async def test_baseline_failure_never_fails_trial(self, tmp_path):
        # The candidate eval succeeds; the following baseline eval blows up.
        candidate_exp = MagicMock(
            result=MagicMock(score=MagicMock(return_value=0.7))
        )
        engine = MagicMock()
        engine.evaluate_admin = AsyncMock(
            side_effect=[candidate_exp, RuntimeError("modal down")]
        )
        verifier = self._submit_verifier(
            tmp_path, engine, base_commit="base", score_baseline=True
        )

        rewards = await verifier.finalize()

        # The trial reward survives the baseline failure.
        assert rewards == {"accuracy": 0.7}

    @pytest.mark.asyncio
    async def test_missing_base_commit_warns(self, tmp_path, caplog):
        # score_baseline=True with no base_commit must not be a silent no-op.
        engine = _engine([0.9])
        verifier = self._submit_verifier(
            tmp_path, engine, base_commit=None, score_baseline=True
        )

        with caplog.at_level("WARNING", logger="vero.harbor.verifier"):
            rewards = await verifier.finalize()

        assert rewards == {"accuracy": 0.9}
        baseline_file = tmp_path / "baseline.json"
        assert not baseline_file.exists()
        assert any("base_commit is not set" in message for message in caplog.messages)