scaleapi · shehabyasser-scale · Jul 3, 2026 · Jul 3, 2026
diff --git a/vero/src/vero/harbor/build/compiler.py b/vero/src/vero/harbor/build/compiler.py
@@ -178,6 +178,7 @@ def _serve_config(config: BuildConfig, dataset_id: str | None, base_commit: str)
         "targets": targets,
         "base_commit": base_commit,
         "submit_enabled": config.submit_enabled,
+        "score_baseline": config.score_baseline,
         "agent_volume": AGENT_VOLUME,
         "admin_volume": ADMIN_VOLUME,
         "admin_token_path": TOKEN_PATH,

diff --git a/vero/src/vero/harbor/build/config.py b/vero/src/vero/harbor/build/config.py
@@ -66,6 +66,10 @@ class BuildConfig(BaseModel):
     selection_split: str = "validation"
     targets: list[TargetSpec] = Field(default_factory=list)
     submit_enabled: bool = False
+    # Also admin-score the unmodified baseline on every target at finalize and
+    # write it to <admin_volume>/baseline.json, so a candidate that generalizes
+    # WORSE than the untouched repo is visible as a regression.
+    score_baseline: bool = False
 
     # write-access: paths in the target repo the optimizer may NOT edit
     # (the scorer, by default). Applied as unix perms in main before the agent runs.

diff --git a/vero/tests/test_harbor_build.py b/vero/tests/test_harbor_build.py
@@ -104,6 +104,48 @@ def test_serve_config_validates(built):
     assert cfg.budgets[0]["dataset_id"] == cfg.dataset_id
 
 
+def test_score_baseline_reaches_serve_json(built):
+    # Default is off. Parsed as plain JSON deliberately: the compiler <-> serve
+    # contract needs the key even where the local ServeConfig predates the field.
+    serve_json = built / "environment" / "sidecar" / "serve.json"
+    assert json.loads(serve_json.read_text())["score_baseline"] is False
+
+
+def test_score_baseline_true_emitted():
+    # Start from build.yaml text rather than BuildConfig kwargs: the claim under
+    # test is that the flag is reachable from YAML and lands in the serve config.
+    from vero.harbor.build.compiler import _serve_config
+
+    build_yaml = (
+        "name: o/n\n"
+        "agent_repo: .\n"
+        "splits:\n"
+        "  - {split: validation, access: non_viewable}\n"
+        "score_baseline: true\n"
+    )
+    parsed = BuildConfig.model_validate(yaml.safe_load(build_yaml))
+    assert parsed.score_baseline is True
+    assert _serve_config(parsed, "ds", "sha")["score_baseline"] is True
+
+
+def test_score_baseline_true_through_compile_task(tmp_path, monkeypatch):
+    # Full pipeline, not just _serve_config: True must reach the written serve.json.
+    monkeypatch.setenv("VERO_SKIP_SECRET_CHECK", "1")
+    fields = {
+        "name": "vero/gsm8k-opt",
+        "agent_repo": str(_agent_repo(tmp_path)),
+        "mode": "A",
+        "task": "gsm8k",
+        "dataset": str(_dataset(tmp_path)),
+        "splits": [{"split": "validation", "access": "non_viewable"}],
+        "score_baseline": True,
+    }
+    config = BuildConfig(**fields)
+    task_dir = compile_task(config, tmp_path / "task", vero_root=_stub_vero(tmp_path))
+    serve_json = task_dir / "environment" / "sidecar" / "serve.json"
+    assert json.loads(serve_json.read_text())["score_baseline"] is True
+
+
 def test_rendered_files_parse(built):
     tomllib.loads((built / "task.toml").read_text())  # valid TOML
     compose = yaml.safe_load((built / "environment/docker-compose.yaml").read_text())