Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions vero/src/vero/harbor/build/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def _serve_config(config: BuildConfig, dataset_id: str | None, base_commit: str)
"targets": targets,
"base_commit": base_commit,
"submit_enabled": config.submit_enabled,
"score_baseline": config.score_baseline,
"agent_volume": AGENT_VOLUME,
"admin_volume": ADMIN_VOLUME,
"admin_token_path": TOKEN_PATH,
Expand Down
4 changes: 4 additions & 0 deletions vero/src/vero/harbor/build/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ class BuildConfig(BaseModel):
selection_split: str = "validation"
targets: list[TargetSpec] = Field(default_factory=list)
submit_enabled: bool = False
# Also admin-score the unmodified baseline on every target at finalize and
# write it to <admin_volume>/baseline.json, so a candidate that generalizes
# WORSE than the untouched repo is visible as a regression.
score_baseline: bool = False

# write-access: paths in the target repo the optimizer may NOT edit
# (the scorer, by default). Applied as unix perms in main before the agent runs.
Expand Down
42 changes: 42 additions & 0 deletions vero/tests/test_harbor_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,48 @@ def test_serve_config_validates(built):
assert cfg.budgets[0]["dataset_id"] == cfg.dataset_id


def test_score_baseline_reaches_serve_json(built):
    """Check that the built serve.json carries ``score_baseline`` as ``False``.

    The file is parsed as plain JSON on purpose: the key has to exist in the
    compiler <-> serve contract even when the local ServeConfig model does
    not know the field yet.
    """
    serve_json = built / "environment" / "sidecar" / "serve.json"
    payload = json.loads(serve_json.read_text())
    # The flag is off unless the build config turns it on.
    assert payload["score_baseline"] is False


def test_score_baseline_true_emitted():
    """Check that ``score_baseline: true`` in build YAML reaches the serve config.

    The config is built from parsed YAML text rather than from the
    BuildConfig constructor, so the "reachable from build.yaml" claim is the
    thing actually under test.
    """
    from vero.harbor.build.compiler import _serve_config

    build_yaml = (
        "name: o/n\n"
        "agent_repo: .\n"
        "splits:\n"
        " - {split: validation, access: non_viewable}\n"
        "score_baseline: true\n"
    )
    parsed = yaml.safe_load(build_yaml)
    config = BuildConfig.model_validate(parsed)
    assert config.score_baseline is True

    # The helper must forward the flag unchanged into the serve payload.
    serve_payload = _serve_config(config, "ds", "sha")
    assert serve_payload["score_baseline"] is True
Comment thread
shehabyasser-scale marked this conversation as resolved.


def test_score_baseline_true_through_compile_task(tmp_path, monkeypatch):
    """Check that ``score_baseline=True`` survives the full ``compile_task`` run.

    This goes beyond the ``_serve_config`` helper: the value is read back
    from the serve.json found under the directory ``compile_task`` returns.
    """
    # NOTE(review): presumably disables a secrets check during compilation;
    # confirm against compile_task.
    monkeypatch.setenv("VERO_SKIP_SECRET_CHECK", "1")

    agent_repo = str(_agent_repo(tmp_path))
    dataset = str(_dataset(tmp_path))
    config = BuildConfig(
        name="vero/gsm8k-opt",
        agent_repo=agent_repo,
        mode="A",
        task="gsm8k",
        dataset=dataset,
        splits=[{"split": "validation", "access": "non_viewable"}],
        score_baseline=True,
    )

    task_dir = tmp_path / "task"
    vero_root = _stub_vero(tmp_path)
    out = compile_task(config, task_dir, vero_root=vero_root)

    serve_json = out / "environment" / "sidecar" / "serve.json"
    payload = json.loads(serve_json.read_text())
    assert payload["score_baseline"] is True


def test_rendered_files_parse(built):
tomllib.loads((built / "task.toml").read_text()) # valid TOML
compose = yaml.safe_load((built / "environment/docker-compose.yaml").read_text())
Expand Down