scaleapi · shehabyasser-scale · Jul 3, 2026 · Jul 3, 2026
diff --git a/vero/src/vero/harbor/serve.py b/vero/src/vero/harbor/serve.py
@@ -67,6 +67,10 @@ class ServeConfig(BaseModel):
     targets: list[_TargetCfg] = Field(default_factory=list)
     base_commit: str | None = None
     submit_enabled: bool = False
+    # Also admin-score the unmodified baseline on every target at finalize and
+    # write it to <admin_volume>/baseline.json so regressions are visible (an
+    # optimized candidate can score WORSE). Needs base_commit, else only warns.
+    score_baseline: bool = False
 
     # volumes / token
     agent_volume: str
@@ -208,6 +212,7 @@ async def build_components(config: ServeConfig) -> tuple[EvaluationSidecar, Veri
         base_commit=config.base_commit,
         selection_task=config.task,
         selection_dataset_id=config.dataset_id,
+        score_baseline=config.score_baseline,
     )
 
     token = generate_token()

diff --git a/vero/src/vero/harbor/verifier.py b/vero/src/vero/harbor/verifier.py
@@ -52,6 +52,7 @@ def __init__(
         selection_task: str | None = None,
         selection_dataset_id: str | None = None,
         rescore_top_k: int = 3,
+        score_baseline: bool = False,
     ):
         self.engine = engine
         self.admin_volume = Path(admin_volume)
@@ -65,6 +66,7 @@ def __init__(
         self.selection_task = selection_task
         self.selection_dataset_id = selection_dataset_id
         self.rescore_top_k = rescore_top_k
+        self.score_baseline = score_baseline
 
     async def finalize(self) -> dict[str, float]:
         """Select the commit and score it on every target -> {reward_key: score}.
@@ -102,8 +104,57 @@ async def finalize(self) -> dict[str, float]:
             rewards[target.reward_key] = (
                 float(score) if score is not None else default_minimum_score
             )
+        await self._maybe_score_baseline(rewards)
         return rewards
 
+    async def _maybe_score_baseline(self, rewards: dict[str, float]) -> None:
+        """Admin-score the unmodified baseline on every target and persist it.
+
+        An optimized candidate can score WORSE than the untouched baseline
+        (observed live: a weak inner model went 0.3 -> 0.2 after optimization);
+        auto_best excludes the baseline from selection, so nothing else scores
+        it. Scores go to <admin_volume>/baseline.json (NOT reward.json, whose
+        keys the outer harness consumes) and are logged beside the candidate's
+        ``rewards``. Never fails the trial: eval/write errors are only logged.
+        """
+        if not self.score_baseline:
+            return
+        if not self.base_commit:
+            # Warn instead of silently skipping: the operator asked for this.
+            logger.warning(
+                "score_baseline=True but base_commit is not set; skipping "
+                "baseline scoring."
+            )
+            return
+        baselines: dict[str, float] = {}
+        try:
+            for target in self.targets:
+                exp = await self.engine.evaluate_admin(
+                    task=target.task,
+                    dataset_id=target.dataset_id,
+                    split=target.split,
+                    commit=self.base_commit,
+                    sample_ids=target.sample_ids,
+                )
+                score = exp.result.score()
+                baselines[target.reward_key] = (
+                    float(score) if score is not None else default_minimum_score
+                )
+        except Exception:
+            logger.exception("baseline scoring failed; reward.json is unaffected")
+            return
+        # Log before persisting so a failed write cannot hide a regression.
+        for key, value in rewards.items():
+            base = baselines.get(key)
+            regressed = base is not None and value < base
+            tag = " (REGRESSION vs baseline)" if regressed else ""
+            logger.info("finalize: %s=%s baseline=%s%s", key, value, base, tag)
+        try:
+            self.admin_volume.mkdir(parents=True, exist_ok=True)
+            payload = json.dumps(baselines, indent=2)
+            (self.admin_volume / "baseline.json").write_text(payload)
+        except Exception:
+            logger.exception("baseline.json write failed; reward.json is unaffected")
+
     async def _select_commit(self) -> str:
         if self.reward_mode == "submit":
             return self._submitted_commit()

diff --git a/vero/tests/test_harbor_verifier.py b/vero/tests/test_harbor_verifier.py
@@ -239,3 +239,84 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
         rewards = await v.finalize()
         assert rewards == {"accuracy": 0.5}
         assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"
+
+
+class TestBaselineAtFinalize:
+    """score_baseline=True: finalize also admin-scores the untouched baseline
+    and persists it to admin_volume/baseline.json, so regressions are visible
+    (observed live: optimization took a weak model from 0.3 to 0.2 and nothing
+    surfaced it). reward.json keys are unaffected.
+    """
+
+    @pytest.mark.asyncio
+    async def test_baseline_scored_and_persisted(self, tmp_path):
+        (tmp_path / "submission.json").write_text(json.dumps({"commit": "cand"}))
+        engine = _engine([0.2, 0.3])  # candidate target eval, then baseline eval
+        v = Verifier(
+            engine=engine,
+            admin_volume=tmp_path,
+            reward_mode="submit",
+            base_commit="base",
+            score_baseline=True,
+            targets=[VerificationTarget(task=None, dataset_id="ds", split="validation", reward_key="accuracy")],
+        )
+        rewards = await v.finalize()
+        assert rewards == {"accuracy": 0.2}  # reward.json content unchanged
+        data = json.loads((tmp_path / "baseline.json").read_text())
+        assert data == {"accuracy": 0.3}
+        assert engine.evaluate_admin.await_count == 2  # candidate, then baseline
+        assert engine.evaluate_admin.await_args_list[-1].kwargs["commit"] == "base"
+
+    @pytest.mark.asyncio
+    async def test_default_off_no_extra_evals(self, tmp_path):
+        (tmp_path / "submission.json").write_text(json.dumps({"commit": "cand"}))
+        engine = _engine([0.9])
+        v = Verifier(
+            engine=engine,
+            admin_volume=tmp_path,
+            reward_mode="submit",
+            base_commit="base",
+            targets=[VerificationTarget(task=None, dataset_id="ds", split="validation", reward_key="accuracy")],
+        )
+        rewards = await v.finalize()
+        assert rewards == {"accuracy": 0.9}
+        assert engine.evaluate_admin.await_count == 1
+        assert not (tmp_path / "baseline.json").exists()
+
+    @pytest.mark.asyncio
+    async def test_baseline_failure_never_fails_trial(self, tmp_path):
+        (tmp_path / "submission.json").write_text(json.dumps({"commit": "cand"}))
+        engine = MagicMock()
+        candidate = MagicMock(result=MagicMock(score=MagicMock(return_value=0.7)))
+        engine.evaluate_admin = AsyncMock(side_effect=[candidate, RuntimeError("modal down")])
+        v = Verifier(
+            engine=engine,
+            admin_volume=tmp_path,
+            reward_mode="submit",
+            base_commit="base",
+            score_baseline=True,
+            targets=[VerificationTarget(task=None, dataset_id="ds", split="validation", reward_key="accuracy")],
+        )
+        rewards = await v.finalize()
+        assert rewards == {"accuracy": 0.7}  # trial reward survives baseline failure
+        assert engine.evaluate_admin.await_count == 2  # baseline eval was attempted
+        assert not (tmp_path / "baseline.json").exists()  # nothing partial persisted
+
+    @pytest.mark.asyncio
+    async def test_missing_base_commit_warns(self, tmp_path, caplog):
+        # score_baseline=True with no base_commit must not be a silent no-op.
+        (tmp_path / "submission.json").write_text(json.dumps({"commit": "cand"}))
+        engine = _engine([0.9])
+        v = Verifier(
+            engine=engine,
+            admin_volume=tmp_path,
+            reward_mode="submit",
+            base_commit=None,
+            score_baseline=True,
+            targets=[VerificationTarget(task=None, dataset_id="ds", split="validation", reward_key="accuracy")],
+        )
+        with caplog.at_level("WARNING", logger="vero.harbor.verifier"):
+            rewards = await v.finalize()
+        assert rewards == {"accuracy": 0.9}
+        assert not (tmp_path / "baseline.json").exists()
+        assert any("base_commit is not set" in m for m in caplog.messages)