Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions docs/playground.css
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,87 @@ button:disabled {
width: 100%;
}

/* --- "Beat the robot" challenge panel --------------------------------------
   Styles for the <section class="challenge-panel"> that hosts the
   baseline-vs-you scoreboard. */

/* Outer card; overflow is hidden so the header background is clipped to the
   rounded corners. */
.challenge-panel {
border: 1px solid #d7dcd6;
border-radius: 10px;
margin-top: 12px;
overflow: hidden;
}

/* Header strip: caption on one side, score button on the other; wraps on
   narrow widths. */
.challenge-head {
align-items: center;
background: #eef1ec;
display: flex;
flex-wrap: wrap;
gap: 8px;
justify-content: space-between;
padding: 8px 12px;
}

/* Caption text inside the header. */
.challenge-head span {
color: var(--ink-soft, #5a6b67);
font-size: 0.82rem;
}

/* Container that the script fills with either a hint message or the
   verdict banner plus results table. */
.challenge-board {
padding: 12px;
}

/* Hint / progress paragraph shown before a scoreboard is rendered. */
.challenge-empty {
color: var(--ink-soft, #5a6b67);
font-size: 0.85rem;
}

/* Verdict banner; combined with one of the .win / .lose / .tie modifiers
   below. */
.challenge-verdict {
border-radius: 8px;
font-weight: 800;
margin-bottom: 10px;
padding: 8px 12px;
}

/* Edited agent beat the shipped one. */
.challenge-verdict.win {
background: #e3f3dc;
color: #2f6b1f;
}

/* Shipped agent still ahead. */
.challenge-verdict.lose {
background: #f7e6df;
color: #9a3d22;
}

/* Identical headline metrics. */
.challenge-verdict.tie {
background: #eef1ec;
color: #4a5b57;
}

/* Results table: one row per metric, columns metric / baseline / you. */
.challenge-table {
border-collapse: collapse;
font-size: 0.85rem;
width: 100%;
}

/* Numeric columns are right-aligned ... */
.challenge-table th,
.challenge-table td {
border-bottom: 1px solid #e4e8e2;
padding: 5px 8px;
text-align: right;
}

/* ... while the first column (the metric label) is left-aligned. */
.challenge-table th:first-child,
.challenge-table td:first-child {
text-align: left;
}

.challenge-table thead th {
color: var(--ink-soft, #5a6b67);
font-weight: 700;
}

/* Highlights whichever cell in a row holds the better value. */
.challenge-table td.better {
background: #e3f3dc;
font-weight: 800;
}

.status-strip {
display: grid;
gap: 10px;
Expand Down
8 changes: 8 additions & 0 deletions docs/playground.html
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,14 @@ <h1 id="playground-title">Playground</h1>
</div>
<textarea id="codeCell" class="code-cell" spellcheck="false" rows="20" aria-label="Agent source code"></textarea>
</section>

<!-- "Beat the robot" challenge panel. Starts hidden; the playground script
     un-hides it only for the "pickretry" scenario. #scoreButton triggers the
     multi-seed scoring run and #challengeBoard receives the hint message or
     the rendered scoreboard (announced politely to assistive tech). -->
<section id="challengePanel" class="challenge-panel" aria-label="Challenge" hidden>
<div class="challenge-head">
<span>Challenge — your edited agent vs. the shipped one, across 10 seeds</span>
<button id="scoreButton" type="button">Score 10 seeds</button>
</div>
<div id="challengeBoard" class="challenge-board" aria-live="polite"></div>
</section>
</div>

<aside class="trace-panel" aria-labelledby="trace-title">
Expand Down
155 changes: 155 additions & 0 deletions docs/playground.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
codeCell: document.getElementById("codeCell"),
runCode: document.getElementById("runCodeButton"),
resetCode: document.getElementById("resetCodeButton"),
challengePanel: document.getElementById("challengePanel"),
challengeBoard: document.getElementById("challengeBoard"),
scoreButton: document.getElementById("scoreButton"),
reset: document.getElementById("resetButton"),
step: document.getElementById("stepButton"),
run: document.getElementById("runButton"),
Expand Down Expand Up @@ -105,6 +108,9 @@
elements.resetCode.addEventListener("click", () => {
resetEditedAgent();
});
elements.scoreButton.addEventListener("click", () => {
runChallenge();
});
elements.replay.addEventListener("input", () => {
state.replayIndex = clampReplayIndex(elements.replay.value);
render();
Expand Down Expand Up @@ -197,6 +203,11 @@
setRealStatus("running real Python: " + sourcePath(scenario));
if (scenario === "pickretry") {
populateDefaultAgentSource();
if (!elements.challengeBoard.textContent.trim()) {
renderChallengeMessage(
'Edit the agent above, then "Score ' + CHALLENGE_SEEDS + ' seeds" to see if you beat it.'
);
}
}
} catch (error) {
if (scenario === "pickretry") {
Expand Down Expand Up @@ -460,6 +471,149 @@
}
}

// --- "Beat the robot" challenge ---------------------------------------------
// Scores the edited agent vs. the shipped one across many seeds. Single-seed
// scoring would reward overfitting; robustness across seeds is the real test.
// Number of seeds each agent is scored on; the driver uses range(CHALLENGE_SEEDS).
const CHALLENGE_SEEDS = 10;

// Python program executed inside Pyodide by runChallenge().
//
// Inputs : the global USER_SRC (set from JS) holding the edited agent source.
// Output : the final expression, a JSON string
//          {"seeds": [...], "baseline": {...}, "you": {...}},
//          which the caller parses with JSON.parse.
//
// The driver loads the shipped example module from disk, scores its
// PickAndRetryAgent as the baseline, then exec()s the user's source and scores
// the PickAndRetryAgent class it defines. An empty editor scores as the
// baseline itself.
const CHALLENGE_DRIVER = [
  "import json, os, sys, importlib.util",
  "cwd = os.getcwd()",
  "if cwd not in sys.path:",
  "    sys.path.insert(0, cwd)",
  // Import hook that makes any matplotlib import fail fast with ImportError.
  "class _NoMatplotlib:",
  "    def find_spec(self, name, path=None, target=None):",
  "        if name == 'matplotlib' or name.startswith('matplotlib.'):",
  "            raise ImportError('matplotlib is intentionally unavailable on the headless browser path')",
  "        return None",
  // The interpreter outlives a single click, so install the hook only once.
  // The check is by class name because the class object is re-created on every
  // run, which would defeat an isinstance() test.
  "if not any(type(f).__name__ == '_NoMatplotlib' for f in sys.meta_path):",
  "    sys.meta_path.insert(0, _NoMatplotlib())",
  "path = os.path.join(cwd, 'examples', 'manipulation', '01_pick_and_retry.py')",
  "spec = importlib.util.spec_from_file_location('pick_and_retry', path)",
  "mod = importlib.util.module_from_spec(spec)",
  "spec.loader.exec_module(mod)",
  "from pir.viz.playground_trace import score_pick_and_retry",
  "seeds = list(range(" + CHALLENGE_SEEDS + "))",
  "baseline = score_pick_and_retry(mod.run_agent, mod.PickAndRetryAgent, seeds=seeds)",
  "src = USER_SRC",
  "if src and src.strip():",
  "    ns = {}",
  // NOTE(review): exec() runs whatever is in the editor. That is the user's own
  // code in their own browser tab; do not reuse this driver for shared input.
  "    exec(src, ns)",
  "    Agent = ns.get('PickAndRetryAgent')",
  "    if Agent is None:",
  "        raise ValueError('Your code must define a class named PickAndRetryAgent')",
  "    you = score_pick_and_retry(mod.run_agent, Agent, seeds=seeds)",
  "else:",
  "    you = baseline",
  "json.dumps({'seeds': seeds, 'baseline': baseline, 'you': you})",
].join("\n");

/**
 * Score the edited agent against the shipped one and show the result.
 *
 * Disables the score button for the duration of the run, hands the editor
 * contents to Python as the global USER_SRC, runs CHALLENGE_DRIVER and renders
 * the parsed JSON result. On any failure the board is cleared and the error is
 * reported through the status line. The button is always re-enabled.
 */
async function runChallenge() {
  const button = elements.scoreButton;

  stopRun();
  button.disabled = true;
  renderChallengeMessage(`scoring ${CHALLENGE_SEEDS} seeds for both agents…`);
  setRealStatus("running the challenge…");

  try {
    const py = await ensurePyodide();
    py.globals.set("USER_SRC", elements.codeCell.value || "");
    // The driver's last expression is a JSON string.
    const payload = await py.runPythonAsync(CHALLENGE_DRIVER);
    renderScoreboard(JSON.parse(payload));
    setRealStatus(`challenge scored across ${CHALLENGE_SEEDS} seeds`);
  } catch (error) {
    renderChallengeMessage("");
    setRealStatus("challenge failed: " + error, true);
  } finally {
    button.disabled = false;
  }
}

/**
 * Replace the challenge board's contents with a single hint paragraph.
 *
 * @param {string} message Text to show; a falsy value just clears the board.
 */
function renderChallengeMessage(message) {
  const board = elements.challengeBoard;
  board.textContent = "";
  if (message) {
    const paragraph = document.createElement("p");
    paragraph.className = "challenge-empty";
    paragraph.textContent = message;
    board.appendChild(paragraph);
  }
}

/**
 * Decide the banner shown above the scoreboard.
 *
 * Ranking is lexicographic: higher success_rate wins, then higher
 * mean_reward, then lower mean_steps. When all three match, the result is
 * reported as a tie.
 *
 * @param {object} you  Aggregate stats for the edited agent.
 * @param {object} base Aggregate stats for the shipped agent.
 * @returns {{cls: string, text: string}} CSS modifier ("win" | "lose" | "tie")
 *   and banner text.
 */
function challengeVerdict(you, base) {
  const metricsDiffer =
    you.success_rate !== base.success_rate ||
    you.mean_reward !== base.mean_reward ||
    you.mean_steps !== base.mean_steps;
  if (!metricsDiffer) {
    return { cls: "tie", text: "You're running the shipped agent — edit it and score again to try to beat it." };
  }

  // First metric that differs decides the outcome.
  const youWin =
    you.success_rate !== base.success_rate
      ? you.success_rate > base.success_rate
      : you.mean_reward !== base.mean_reward
        ? you.mean_reward > base.mean_reward
        : you.mean_steps < base.mean_steps;

  if (youWin) {
    return { cls: "win", text: "🏆 You beat the shipped agent across " + CHALLENGE_SEEDS + " seeds!" };
  }
  return { cls: "lose", text: "Shipped agent still wins — make your policy more robust across seeds." };
}

/**
 * Render the verdict banner and the baseline-vs-you table into the board.
 *
 * @param {{you: object, baseline: object}} result Parsed output of the
 *   challenge driver; both objects carry the metric keys listed below.
 */
function renderScoreboard(result) {
  const { you, baseline: base } = result;
  const board = elements.challengeBoard;
  board.textContent = "";

  const verdict = challengeVerdict(you, base);
  const banner = document.createElement("div");
  banner.className = "challenge-verdict " + verdict.cls;
  banner.textContent = verdict.text;
  board.appendChild(banner);

  // One entry per table row: display label, result key, comparison
  // direction, and how to format the value.
  const metrics = [
    { label: "success rate", key: "success_rate", higherBetter: true, format: (v) => Math.round(v * 100) + "%" },
    { label: "mean reward", key: "mean_reward", higherBetter: true, format: (v) => v.toFixed(2) },
    { label: "mean steps", key: "mean_steps", higherBetter: false, format: (v) => v.toFixed(1) },
    { label: "mean retries", key: "mean_retries", higherBetter: false, format: (v) => v.toFixed(1) },
    { label: "mean grasp_miss", key: "mean_grasp_miss", higherBetter: false, format: (v) => v.toFixed(1) },
  ];

  // Small factory for a text-only table cell.
  const makeCell = (tag, content) => {
    const cell = document.createElement(tag);
    cell.textContent = content;
    return cell;
  };

  const table = document.createElement("table");
  table.className = "challenge-table";

  const thead = document.createElement("thead");
  const headRow = document.createElement("tr");
  for (const heading of ["metric", "baseline", "you"]) {
    headRow.appendChild(makeCell("th", heading));
  }
  thead.appendChild(headRow);
  table.appendChild(thead);

  const tbody = document.createElement("tbody");
  for (const { label, key, higherBetter, format } of metrics) {
    const row = document.createElement("tr");
    row.appendChild(makeCell("td", label));

    const baseValue = base[key];
    const youValue = you[key];
    const baseCell = makeCell("td", format(baseValue));
    const youCell = makeCell("td", format(youValue));

    // Highlight the better side; equal values leave both cells plain.
    if (baseValue !== youValue) {
      const youBetter = higherBetter ? youValue > baseValue : youValue < baseValue;
      const winner = youBetter ? youCell : baseCell;
      winner.classList.add("better");
    }

    row.appendChild(baseCell);
    row.appendChild(youCell);
    tbody.appendChild(row);
  }
  table.appendChild(tbody);
  board.appendChild(table);
}

function stepOnce() {
if (state.index >= state.config.steps.length) {
render();
Expand Down Expand Up @@ -955,6 +1109,7 @@
}
elements.answer.disabled = state.scenario === "pickretry";
elements.codePanel.hidden = state.scenario !== "pickretry";
elements.challengePanel.hidden = state.scenario !== "pickretry";

renderReplay(replayIndex);
renderCompare();
Expand Down
Binary file modified docs/pyodide/pir_bundle.zip
Binary file not shown.
11 changes: 11 additions & 0 deletions docs/pyodide_playground_strategy.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,17 @@ All three phases are built and Python-verified; the remaining work is a single
browser pass over Phases 0–3 and (optionally) deleting the JS `clarifying`
preview once the real path is the trusted default.

**Bonus — "beat the robot" challenge. ✅ built (Python path verified).** A
**Score 10 seeds** button scores the edited agent against the shipped one across
10 seeds and shows a baseline-vs-you scoreboard (success rate, reward, steps,
retries, grasp misses) with a win/lose verdict. Scoring across *many* seeds is
deliberate: a single-seed score rewards overfitting (drop the retry schedule and
seed 3 solves in 2 steps), whereas a multi-seed score does not — an agent that
ignores its belief drops to a 0% success rate. `score_pick_and_retry` (pinned by
`tests/test_playground_trace.py`) does the aggregation; verified locally the
shipped agent scores 100% / reward 0.65 while a belief-ignoring agent scores 0% /
reward -1.2.

## Risks / watch-list

- **First-load latency.** Pyodide core is a few MB. Lazy-load it only when the
Expand Down
43 changes: 43 additions & 0 deletions pir/viz/playground_trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,3 +286,46 @@ def pick_and_retry_trace_to_playground(
"initial": initial,
"steps": steps,
}


# --- "Beat the robot" challenge scoring -------------------------------------
#
# An agent is scored across many seeds, not one, so a policy that overfits a
# single lucky seed (e.g. dropping the retry schedule to grab the belief mean
# immediately) is exposed by a low success rate. This is the whole lesson: the
# retry/belief logic exists for robustness, not for one episode.


def score_pick_and_retry(run_agent: Any, agent_factory: Any, *, seeds: Any) -> dict[str, Any]:
    """Run ``agent_factory()`` over ``seeds`` and aggregate the trace summaries.

    Args:
        run_agent: The example's own episode loop, passed in to keep this
            module decoupled from ``examples/``. Called as
            ``run_agent(agent, seed=seed, render=False)``; the return value
            must expose ``.summary()``.
        agent_factory: Zero-argument callable (typically the agent class),
            invoked once per seed so every episode gets a fresh agent.
        seeds: Iterable of seeds; materialised once, so a generator is fine.

    Returns:
        A plain-JSON dict with ``episodes``, ``successes``, ``success_rate``
        and the per-episode means ``mean_steps``, ``mean_retries``,
        ``mean_reward`` and ``mean_grasp_miss``. With no seeds every rate and
        mean is ``0.0``.
    """

    seeds = list(seeds)
    successes = 0
    steps_total = 0.0
    retries_total = 0.0
    reward_total = 0.0
    miss_total = 0.0
    for seed in seeds:
        summary = run_agent(agent_factory(), seed=seed, render=False).summary()
        if summary.success:
            successes += 1
        steps_total += summary.steps
        # A missing or ``None`` counter counts as zero.
        retries_total += int(summary.counters.get("retry_count", 0) or 0)
        reward_total += summary.total_reward
        # Same ``None`` guard as retry_count, so a null count cannot raise
        # TypeError part-way through the aggregation.
        miss_total += summary.failure_counts.get("grasp_miss", 0) or 0

    # Avoid dividing by zero when no seeds were supplied.
    n = len(seeds) or 1
    return {
        "episodes": len(seeds),
        "successes": successes,
        "success_rate": round(successes / n, 4),
        "mean_steps": round(steps_total / n, 3),
        "mean_retries": round(retries_total / n, 3),
        "mean_reward": round(reward_total / n, 3),
        "mean_grasp_miss": round(miss_total / n, 3),
    }
Loading
Loading