rsasaki0109 · rsasaki0109 · Jun 4, 2026 · Jun 4, 2026
diff --git a/docs/playground.css b/docs/playground.css
@@ -198,6 +198,87 @@ button:disabled {
   width: 100%;
 }
 
+/* "Beat the robot" challenge panel (section#challengePanel in playground.html). */
+.challenge-panel {
+  border: 1px solid #d7dcd6;
+  border-radius: 10px;
+  margin-top: 12px;
+  overflow: hidden;
+}
+
+/* Header row: caption on one side, score button on the other; wraps when narrow. */
+.challenge-head {
+  align-items: center;
+  background: #eef1ec;
+  display: flex;
+  flex-wrap: wrap;
+  gap: 8px;
+  justify-content: space-between;
+  padding: 8px 12px;
+}
+
+.challenge-head span {
+  color: var(--ink-soft, #5a6b67);
+  font-size: 0.82rem;
+}
+
+/* Container that playground.js fills with either a note or the scoreboard. */
+.challenge-board {
+  padding: 12px;
+}
+
+/* Note paragraph created by renderChallengeMessage(). */
+.challenge-empty {
+  color: var(--ink-soft, #5a6b67);
+  font-size: 0.85rem;
+}
+
+/* Verdict banner; the modifier class (win / lose / tie) comes from challengeVerdict(). */
+.challenge-verdict {
+  border-radius: 8px;
+  font-weight: 800;
+  margin-bottom: 10px;
+  padding: 8px 12px;
+}
+
+.challenge-verdict.win {
+  background: #e3f3dc;
+  color: #2f6b1f;
+}
+
+.challenge-verdict.lose {
+  background: #f7e6df;
+  color: #9a3d22;
+}
+
+.challenge-verdict.tie {
+  background: #eef1ec;
+  color: #4a5b57;
+}
+
+/* Baseline-vs-you metric table built by renderScoreboard(). */
+.challenge-table {
+  border-collapse: collapse;
+  font-size: 0.85rem;
+  width: 100%;
+}
+
+/* Value columns are right-aligned ... */
+.challenge-table th,
+.challenge-table td {
+  border-bottom: 1px solid #e4e8e2;
+  padding: 5px 8px;
+  text-align: right;
+}
+
+/* ... while the first column (the metric name) is left-aligned. */
+.challenge-table th:first-child,
+.challenge-table td:first-child {
+  text-align: left;
+}
+
+.challenge-table thead th {
+  color: var(--ink-soft, #5a6b67);
+  font-weight: 700;
+}
+
+/* Highlights whichever cell (baseline or you) is better for that metric. */
+.challenge-table td.better {
+  background: #e3f3dc;
+  font-weight: 800;
+}
+
 .status-strip {
   display: grid;
   gap: 10px;

diff --git a/docs/playground.html b/docs/playground.html
@@ -124,6 +124,14 @@ <h1 id="playground-title">Playground</h1>
             </div>
             <textarea id="codeCell" class="code-cell" spellcheck="false" rows="20" aria-label="Agent source code"></textarea>
           </section>
+
+          <!-- "Beat the robot" challenge. Starts hidden; playground.js toggles
+               `hidden` so the panel only shows for the pickretry scenario, and
+               renders either a hint or the scoreboard into #challengeBoard. -->
+          <section id="challengePanel" class="challenge-panel" aria-label="Challenge" hidden>
+            <div class="challenge-head">
+              <span>Challenge — your edited agent vs. the shipped one, across 10 seeds</span>
+              <button id="scoreButton" type="button">Score 10 seeds</button>
+            </div>
+            <div id="challengeBoard" class="challenge-board" aria-live="polite"></div>
+          </section>
         </div>
 
         <aside class="trace-panel" aria-labelledby="trace-title">

diff --git a/docs/playground.js b/docs/playground.js
@@ -53,6 +53,9 @@
     codeCell: document.getElementById("codeCell"),
     runCode: document.getElementById("runCodeButton"),
     resetCode: document.getElementById("resetCodeButton"),
+    challengePanel: document.getElementById("challengePanel"),
+    challengeBoard: document.getElementById("challengeBoard"),
+    scoreButton: document.getElementById("scoreButton"),
     reset: document.getElementById("resetButton"),
     step: document.getElementById("stepButton"),
     run: document.getElementById("runButton"),
@@ -105,6 +108,9 @@
   elements.resetCode.addEventListener("click", () => {
     resetEditedAgent();
   });
+  elements.scoreButton.addEventListener("click", () => {
+    runChallenge();
+  });
   elements.replay.addEventListener("input", () => {
     state.replayIndex = clampReplayIndex(elements.replay.value);
     render();
@@ -197,6 +203,11 @@
         setRealStatus("running real Python: " + sourcePath(scenario));
         if (scenario === "pickretry") {
           populateDefaultAgentSource();
+          if (!elements.challengeBoard.textContent.trim()) {
+            renderChallengeMessage(
+              'Edit the agent above, then "Score ' + CHALLENGE_SEEDS + ' seeds" to see if you beat it.'
+            );
+          }
         }
       } catch (error) {
         if (scenario === "pickretry") {
@@ -460,6 +471,149 @@
     }
   }
 
+  // --- "Beat the robot" challenge ---------------------------------------------
+  // Scores the edited agent vs. the shipped one across many seeds. Single-seed
+  // scoring would reward overfitting; robustness across seeds is the real test.
+  const CHALLENGE_SEEDS = 10;
+  const CHALLENGE_DRIVER = [
+    "import json, os, sys, importlib.util",
+    "cwd = os.getcwd()",
+    "if cwd not in sys.path:",
+    "    sys.path.insert(0, cwd)",
+    "class _NoMatplotlib:",
+    "    def find_spec(self, name, path=None, target=None):",
+    "        if name == 'matplotlib' or name.startswith('matplotlib.'):",
+    "            raise ImportError('matplotlib is intentionally unavailable on the headless browser path')",
+    "        return None",
+    "sys.meta_path.insert(0, _NoMatplotlib())",
+    "path = os.path.join(cwd, 'examples', 'manipulation', '01_pick_and_retry.py')",
+    "spec = importlib.util.spec_from_file_location('pick_and_retry', path)",
+    "mod = importlib.util.module_from_spec(spec)",
+    "spec.loader.exec_module(mod)",
+    "from pir.viz.playground_trace import score_pick_and_retry",
+    "seeds = list(range(" + CHALLENGE_SEEDS + "))",
+    "baseline = score_pick_and_retry(mod.run_agent, mod.PickAndRetryAgent, seeds=seeds)",
+    "src = USER_SRC",
+    "if src and src.strip():",
+    "    ns = {}",
+    "    exec(src, ns)",
+    "    Agent = ns.get('PickAndRetryAgent')",
+    "    if Agent is None:",
+    "        raise ValueError('Your code must define a class named PickAndRetryAgent')",
+    "    you = score_pick_and_retry(mod.run_agent, Agent, seeds=seeds)",
+    "else:",
+    "    you = baseline",
+    "json.dumps({'seeds': seeds, 'baseline': baseline, 'you': you})",
+  ].join("\n");
+
+  async function runChallenge() {
+    stopRun();
+    elements.scoreButton.disabled = true;
+    renderChallengeMessage("scoring " + CHALLENGE_SEEDS + " seeds for both agents…");
+    setRealStatus("running the challenge…");
+    try {
+      const pyodide = await ensurePyodide();
+      pyodide.globals.set("USER_SRC", elements.codeCell.value || "");
+      const result = JSON.parse(await pyodide.runPythonAsync(CHALLENGE_DRIVER));
+      renderScoreboard(result);
+      setRealStatus("challenge scored across " + CHALLENGE_SEEDS + " seeds");
+    } catch (error) {
+      renderChallengeMessage("");
+      setRealStatus("challenge failed: " + error, true);
+    } finally {
+      elements.scoreButton.disabled = false;
+    }
+  }
+
+  function renderChallengeMessage(message) {
+    elements.challengeBoard.textContent = "";
+    if (!message) {
+      return;
+    }
+    const note = document.createElement("p");
+    note.className = "challenge-empty";
+    note.textContent = message;
+    elements.challengeBoard.appendChild(note);
+  }
+
+  function challengeVerdict(you, base) {
+    const same =
+      you.success_rate === base.success_rate &&
+      you.mean_reward === base.mean_reward &&
+      you.mean_steps === base.mean_steps;
+    if (same) {
+      return { cls: "tie", text: "You're running the shipped agent — edit it and score again to try to beat it." };
+    }
+    // Higher success rate wins; then higher reward; then fewer steps.
+    let youWin;
+    if (you.success_rate !== base.success_rate) {
+      youWin = you.success_rate > base.success_rate;
+    } else if (you.mean_reward !== base.mean_reward) {
+      youWin = you.mean_reward > base.mean_reward;
+    } else {
+      youWin = you.mean_steps < base.mean_steps;
+    }
+    return youWin
+      ? { cls: "win", text: "🏆 You beat the shipped agent across " + CHALLENGE_SEEDS + " seeds!" }
+      : { cls: "lose", text: "Shipped agent still wins — make your policy more robust across seeds." };
+  }
+
+  function renderScoreboard(result) {
+    const you = result.you;
+    const base = result.baseline;
+    elements.challengeBoard.textContent = "";
+
+    const verdict = challengeVerdict(you, base);
+    const banner = document.createElement("div");
+    banner.className = "challenge-verdict " + verdict.cls;
+    banner.textContent = verdict.text;
+    elements.challengeBoard.appendChild(banner);
+
+    // metric label, key, and whether higher is better
+    const rows = [
+      ["success rate", "success_rate", true, (v) => Math.round(v * 100) + "%"],
+      ["mean reward", "mean_reward", true, (v) => v.toFixed(2)],
+      ["mean steps", "mean_steps", false, (v) => v.toFixed(1)],
+      ["mean retries", "mean_retries", false, (v) => v.toFixed(1)],
+      ["mean grasp_miss", "mean_grasp_miss", false, (v) => v.toFixed(1)],
+    ];
+
+    const table = document.createElement("table");
+    table.className = "challenge-table";
+    const thead = document.createElement("thead");
+    const headRow = document.createElement("tr");
+    ["metric", "baseline", "you"].forEach((label) => {
+      const th = document.createElement("th");
+      th.textContent = label;
+      headRow.appendChild(th);
+    });
+    thead.appendChild(headRow);
+    table.appendChild(thead);
+
+    const tbody = document.createElement("tbody");
+    rows.forEach(([label, key, higherBetter, fmt]) => {
+      const tr = document.createElement("tr");
+      const name = document.createElement("td");
+      name.textContent = label;
+      tr.appendChild(name);
+
+      const baseCell = document.createElement("td");
+      baseCell.textContent = fmt(base[key]);
+      const youCell = document.createElement("td");
+      youCell.textContent = fmt(you[key]);
+
+      if (base[key] !== you[key]) {
+        const youBetter = higherBetter ? you[key] > base[key] : you[key] < base[key];
+        (youBetter ? youCell : baseCell).classList.add("better");
+      }
+      tr.appendChild(baseCell);
+      tr.appendChild(youCell);
+      tbody.appendChild(tr);
+    });
+    table.appendChild(tbody);
+    elements.challengeBoard.appendChild(table);
+  }
+
   function stepOnce() {
     if (state.index >= state.config.steps.length) {
       render();
@@ -955,6 +1109,7 @@
     }
     elements.answer.disabled = state.scenario === "pickretry";
     elements.codePanel.hidden = state.scenario !== "pickretry";
+    elements.challengePanel.hidden = state.scenario !== "pickretry";
 
     renderReplay(replayIndex);
     renderCompare();

diff --git a/docs/pyodide/pir_bundle.zip b/docs/pyodide/pir_bundle.zip
diff --git a/docs/pyodide_playground_strategy.md b/docs/pyodide_playground_strategy.md
@@ -183,6 +183,17 @@ All three phases are built and Python-verified; the remaining work is a single
 browser pass over Phases 0–3 and (optionally) deleting the JS `clarifying`
 preview once the real path is the trusted default.
 
+**Bonus — "beat the robot" challenge. ✅ built (Python path verified).** A
+**Score 10 seeds** button scores the edited agent against the shipped one across
+10 seeds and shows a baseline-vs-you scoreboard (success rate, reward, steps,
+retries, grasp misses) with a win/lose/tie verdict. Scoring across *many* seeds
+is deliberate: a single-seed score rewards overfitting (drop the retry schedule
+and seed 3 solves in 2 steps), but a multi-seed score does not — an agent that
+ignores its belief drops to a 0% success rate. `score_pick_and_retry` (pinned by
+`tests/test_playground_trace.py`) does the aggregation. Verified locally: the
+shipped agent scores 100% / reward 0.65, while a belief-ignoring agent scores 0% /
+reward -1.2.
+
 ## Risks / watch-list
 
 - **First-load latency.** Pyodide core is a few MB. Lazy-load it only when the

diff --git a/pir/viz/playground_trace.py b/pir/viz/playground_trace.py
@@ -286,3 +286,46 @@ def pick_and_retry_trace_to_playground(
         "initial": initial,
         "steps": steps,
     }
+
+
+# --- "Beat the robot" challenge scoring -------------------------------------
+#
+# An agent is scored across many seeds, not one, so a policy that overfits a
+# single lucky seed (e.g. dropping the retry schedule to grab the belief mean
+# immediately) is exposed by a low success rate. This is the whole lesson: the
+# retry/belief logic exists for robustness, not for one episode.
+
+
+def score_pick_and_retry(run_agent: Any, agent_factory: Any, *, seeds: Any) -> dict[str, Any]:
+    """Run ``agent_factory()`` over ``seeds`` and aggregate the trace summaries.
+
+    ``run_agent`` is the example's own loop (passed in to keep this module
+    decoupled from examples/); ``agent_factory`` is called once per seed for a
+    fresh agent. Returns plain-JSON aggregate stats.
+    """
+
+    seeds = list(seeds)
+    successes = 0
+    steps_total = 0.0
+    retries_total = 0.0
+    reward_total = 0.0
+    miss_total = 0.0
+    for seed in seeds:
+        summary = run_agent(agent_factory(), seed=seed, render=False).summary()
+        if summary.success:
+            successes += 1
+        steps_total += summary.steps
+        retries_total += int(summary.counters.get("retry_count", 0) or 0)
+        reward_total += summary.total_reward
+        miss_total += summary.failure_counts.get("grasp_miss", 0)
+
+    n = len(seeds) or 1
+    return {
+        "episodes": len(seeds),
+        "successes": successes,
+        "success_rate": round(successes / n, 4),
+        "mean_steps": round(steps_total / n, 3),
+        "mean_retries": round(retries_total / n, 3),
+        "mean_reward": round(reward_total / n, 3),
+        "mean_grasp_miss": round(miss_total / n, 3),
+    }