From ce72e4124860d2f163023a69e91614e8b93a9b55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Sat, 30 May 2026 23:33:15 +0200 Subject: [PATCH 1/7] feat(skills/review-loop): add review-loop skill Opt-in skill that drives a PR to review-clean by looping: request the automated (Copilot) review, wait for it in the background so the session is never held hostage, triage every open comment with the rigor of receiving-code-review, address the correct ones, and re-request until a fresh review is silent and threads are resolved. A 3-round cap stops non-converging loops. The verified mechanism (gh pr edit --add-reviewer "@copilot"; the REST requested_reviewers endpoint rejects Copilot) and a gh-and-builtins-only background-wait script keep the skill within the plugin's superpowers + gh dependency rule. Co-Authored-By: Claude Opus 4.8 (1M context) --- skills/review-loop/SKILL.md | 115 ++++++++++++++++++ .../templates/await-copilot-review.sh | 59 +++++++++ 2 files changed, 174 insertions(+) create mode 100644 skills/review-loop/SKILL.md create mode 100755 skills/review-loop/templates/await-copilot-review.sh diff --git a/skills/review-loop/SKILL.md b/skills/review-loop/SKILL.md new file mode 100644 index 0000000..6e3dd6e --- /dev/null +++ b/skills/review-loop/SKILL.md @@ -0,0 +1,115 @@ +--- +name: review-loop +description: + Use when asked to run a review loop on a PR, drive a PR's automated + (Copilot) review to clean, or repeatedly request-and-address review + until nothing new comes back. +--- + +# review-loop + +## When to invoke + +When the user asks to loop a PR through automated review until it is clean — "run a review loop on #N", "get Copilot to review this and address everything", "keep requesting review until it's resolved". Also invoked from the two opt-in gates in `feature-dev-workflow:developing-a-feature` (sub-PRs, and the PR to main). + +Not for a single one-off review pass — that is the `review` skill (local diff review) or a single `gh pr edit --add-reviewer`. This skill is the *loop*: request, wait, triage, address, re-request, until clean. + +## The mechanism (verified — do not improvise the commands) + +Copilot review is requested **only** through `gh pr edit`. The REST `requested_reviewers` endpoint rejects Copilot (it accepts human logins and team slugs only), so `gh api .../requested_reviewers -f "reviewers[]=copilot-..."` returns 422. Use: + +``` +gh pr edit --add-reviewer "@copilot" +``` + +Requires `gh` ≥ 2.88.0 (`gh --version`). Not available on GitHub Enterprise Server. + +| Need | Command / identity | +| --- | --- | +| Request the review | `gh pr edit --add-reviewer "@copilot"` | +| Confirm it attached (gh can exit 0 without attaching) | `gh pr view --json reviewRequests` → expect a reviewer with login `Copilot` | +| Copilot's author login in `/pulls//reviews` | `copilot-pull-request-reviewer[bot]` (this is the watermark filter) | +| List open review threads | `gh api graphql` on `PullRequest.reviewThreads` → `id`, `isResolved`, `viewerCanResolve` | +| Resolve a thread | `gh api graphql` mutation `resolveReviewThread(input:{threadId:""})` (GraphQL only; no REST) | +| Reply in a comment thread | `gh api repos/{owner}/{repo}/pulls/{pr}/comments/{id}/replies -f body=...` | + +## The loop + +```dot +digraph review_loop { + "Capture watermark, request Copilot" [shape=box]; + "Attached?" [shape=diamond]; + "Copilot unavailable:\ntriage existing human comments, exit" [shape=box]; + "Wait in BACKGROUND for new review" [shape=box]; + "Triage every open comment\n(receiving-code-review)" [shape=box]; + "Address: apply correct, push back on wrong" [shape=box]; + "New actionable comments\nOR unresolved threads?" [shape=diamond]; + "Cap reached?" [shape=diamond]; + "Done: clean" [shape=doublecircle]; + "Stop, report what remains" [shape=box]; + + "Capture watermark, request Copilot" -> "Attached?"; + "Attached?" -> "Copilot unavailable:\ntriage existing human comments, exit" [label="no"]; + "Attached?" -> "Wait in BACKGROUND for new review" [label="yes"]; + "Wait in BACKGROUND for new review" -> "Triage every open comment\n(receiving-code-review)"; + "Triage every open comment\n(receiving-code-review)" -> "Address: apply correct, push back on wrong"; + "Address: apply correct, push back on wrong" -> "New actionable comments\nOR unresolved threads?"; + "New actionable comments\nOR unresolved threads?" -> "Cap reached?" [label="yes"]; + "Cap reached?" -> "Capture watermark, request Copilot" [label="no"]; + "Cap reached?" -> "Stop, report what remains" [label="yes"]; + "New actionable comments\nOR unresolved threads?" -> "Done: clean" [label="no"]; +} +``` + +One cycle, in order: + +1. **Capture the watermark, then request.** `SINCE=$(date -u +%Y-%m-%dT%H:%M:%SZ)` *before* `gh pr edit --add-reviewer "@copilot"`. The watermark is what distinguishes the new review from a stale one left by an earlier round. +2. **Confirm Copilot attached.** `gh pr view --json reviewRequests`. If Copilot is **not** in the list, it is unavailable for this repo/plan (or `gh` is too old / this is GHES). Do **not** start waiting — there is no reviewer to produce a review. Instead do one triage pass over any human comments already on the PR (Step 3), state that Copilot review is unavailable, and exit. There is no loop without an automated reviewer to re-request. +3. **Wait in the background.** Launch the wait script with `run_in_background: true` so the session is not held hostage: + + ``` + ${CLAUDE_PLUGIN_ROOT}/skills/review-loop/templates/await-copilot-review.sh "$SINCE" + ``` + + It polls `gh api .../pulls//reviews` for a Copilot review newer than `$SINCE` and exits 0 (printing the review) when one lands, or 124 on timeout. The harness re-invokes the session when it exits. Do **not** write a foreground `sleep`/`until` loop — foreground sleep is blocked and it freezes the session. +4. **Triage every open review comment** — Copilot's new ones plus any human comments already on the PR. **REQUIRED SUB-SKILL:** `superpowers:receiving-code-review`. Verify each against the codebase. No performative agreement, no blind implementation. Copilot is confidently wrong often enough that "apply all suggestions" is the wrong default. +5. **Address.** Apply the comments that are correct, one at a time, testing each; reply in the comment thread stating what changed; resolve the thread (`resolveReviewThread`). For a comment that is wrong or a judgment call, **do not silently apply or silently ignore** — handle the pushback per the calling context (below). +6. **Re-request and loop.** Commit and push the fixes (commit convention from the project's CLAUDE.md), then return to Step 1 with a fresh watermark. + +## Termination + +Stop and declare clean when **both**: + +- a fresh Copilot review (newer than the latest watermark) returns **no new actionable comments**, AND +- every review thread is **resolved or replied** (a verified-and-declined comment with a stated reason counts as resolved — do not thrash re-litigating it). + +**Safety cap: 3 request→address rounds.** If the loop has not converged by the cap, **stop** and report what remains unresolved. Do not raise the cap to keep going — non-convergence (Copilot surfacing churny or contradictory nits round after round) is a signal to hand back to the user, not to loop harder. Only count a review as "clean" if it arrived *after* your last push; a review from before the push is stale and must not end the loop. + +## Pushback handling depends on the calling context + +The skill is told its context when invoked. This controls Step 5 pushback only. + +- **Standalone, or the PR-to-main gate (interactive):** pause and surface the pushback to the user with technical reasoning, then act on their direction. +- **Autonomous multi-PR fan-out (sub-PR gate):** do **not** pause interactively — that breaks "autonomous fan-out has no per-sub-PR round-trips". Log the pushback as a bubble-up concern in the state file's `## Bubble-up log`, apply the rest, and let `feature-dev-workflow:reviewing-feature-progress` surface it at the wave checkpoint. + +## GitHub-mutation discipline + +Requesting the reviewer, replying in threads, resolving threads, and pushing are GitHub mutations. Follow the project rule: no mutation without a fresh confirmation against the specific thing about to land, in the interactive contexts. In autonomous fan-out, follow the autonomous-mode discipline already established for sub-PRs (the user opted into the mechanical bundle up front). + +## What this skill does not do + +- **Does not merge the PR.** It drives the PR to review-clean and hands back. +- **Does not invent comments or run its own static analysis.** Copilot and humans are the reviewers; this skill is the loop and the judgment layer over their output. +- **Does not block on human reviewers.** It sweeps in human comments already present, but only re-requests Copilot. + +## Red flags + +| Thought | Reality | +| --- | --- | +| "I'll `gh api .../requested_reviewers` to add Copilot" | That endpoint rejects Copilot (422). Only `gh pr edit --add-reviewer "@copilot"` works. | +| "I'll `sleep` in a loop until the review lands" | Foreground sleep is blocked and freezes the session. Use the background wait script; the harness wakes you. | +| "The command exited 0, so Copilot is reviewing" | `gh pr edit` can exit 0 without attaching. Confirm via `reviewRequests` before waiting. | +| "Copilot suggested it, so apply it" | Copilot is confidently wrong often. Triage via `receiving-code-review`; verify before applying. | +| "Still finding nits — one more round" | Past the 3-round cap, non-convergence is a signal to hand back, not to loop harder. | +| "A Copilot review exists, so we're clean" | Only a review *after your last push* counts. An earlier-round review is stale. | +| "I'll auto-dismiss the comment I disagree with" | Wrong/judgment-call comments get pushback (interactive) or a bubble-up concern (fan-out) — never a silent drop. | diff --git a/skills/review-loop/templates/await-copilot-review.sh b/skills/review-loop/templates/await-copilot-review.sh new file mode 100755 index 0000000..ca508a1 --- /dev/null +++ b/skills/review-loop/templates/await-copilot-review.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +# Poll an open PR until GitHub Copilot posts a review newer than a watermark, +# then print that review as JSON and exit. Designed to be launched in the +# background so the agent session is never blocked waiting. +# +# Dependencies: gh (with `gh api --jq`, which uses gh's built-in jq engine) and +# shell builtins only. No jq binary, no gh extension. +# +# Usage: +# await-copilot-review.sh [interval-seconds] [timeout-seconds] +# +# is the watermark: capture it BEFORE requesting the review, e.g. +# SINCE=$(date -u +%Y-%m-%dT%H:%M:%SZ) +# so a review left over from a previous round can't be mistaken for the new one. +# +# Exit status: +# 0 a Copilot review newer than the watermark was found (printed to stdout) +# 124 timed out before one appeared +# 2 bad arguments + +set -euo pipefail + +PR="${1:-}" +SINCE="${2:-}" +INTERVAL="${3:-30}" +TIMEOUT="${4:-900}" + +if [ -z "$PR" ] || [ -z "$SINCE" ]; then + echo "usage: await-copilot-review.sh [interval] [timeout]" >&2 + exit 2 +fi + +REPO=$(gh repo view --json nameWithOwner --jq .nameWithOwner) +BOT="copilot-pull-request-reviewer[bot]" +DEADLINE=$(( $(date +%s) + TIMEOUT )) + +# jq filter (run by gh's built-in engine): Copilot-authored reviews submitted +# after the watermark, newest first. The bot's author login in /reviews is +# copilot-pull-request-reviewer[bot]; the broad test() guards against the +# identity surfacing under a variant login. The watermark is interpolated as a +# jq string literal — safe because an ISO-8601 timestamp has no jq metacharacters. +filter="[.[] + | select((.user.login==\"$BOT\") or (.user.login|test(\"[Cc]opilot\"))) + | select(.submitted_at > \"$SINCE\") + | {id, state, submitted_at, body}] + | sort_by(.submitted_at) | reverse" + +while :; do + found=$(gh api "repos/$REPO/pulls/$PR/reviews" --paginate --jq "$filter") + if [ -n "$found" ] && [ "$found" != "[]" ]; then + printf '%s\n' "$found" + exit 0 + fi + if [ "$(date +%s)" -ge "$DEADLINE" ]; then + echo "timed out after ${TIMEOUT}s waiting for a Copilot review on #$PR" >&2 + exit 124 + fi + sleep "$INTERVAL" +done From e7c29e872a0def29104a21f48cd98e8cece68b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Sat, 30 May 2026 23:33:22 +0200 Subject: [PATCH 2/7] feat(skills): add opt-in review-loop gates to the feature flow Wires feature-dev-workflow:review-loop into developing-a-feature at two opt-in, default-off points: a per-sub-PR gate (recorded as sub_pr_review_loop in the state file, run by fanning-out-with-worktrees at each sub-PR ripening before the orchestrator's own review) and a PR-to-main gate before external review. Pushback is interactive at the PR-to-main gate and bubbles up to the wave checkpoint in autonomous fan-out. Adds the README and project-CLAUDE skill-table rows. Co-Authored-By: Claude Opus 4.8 (1M context) --- skills/developing-a-feature/SKILL.md | 9 +++++++++ skills/fanning-out-with-worktrees/SKILL.md | 15 ++++++++------- templates/project-CLAUDE.md | 1 + 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/skills/developing-a-feature/SKILL.md b/skills/developing-a-feature/SKILL.md index ab52cab..13c078d 100644 --- a/skills/developing-a-feature/SKILL.md +++ b/skills/developing-a-feature/SKILL.md @@ -46,6 +46,13 @@ For sequential single-PR work, skip to Step 4. For multi-PR work, dispatch paral Record the choice in the state file's frontmatter as `sub_pr_approval: autonomous` or `sub_pr_approval: manual`. The fan-out skill reads this field at every sub-PR ripening to decide whether to gate on user approval. Default if the field is missing in an older state file: `autonomous` (preserves the original behaviour). +**Sub-PR review-loop (multi-PR only).** Immediately after the approval-mode choice, ask a second `AskUserQuestion`: should each sub-PR run an automated review-loop and come back clean before it is self-merged? This is opt-in and independent of the approval mode. + +- **On** — at each sub-PR's ripening, the orchestrator runs `feature-dev-workflow:review-loop` against the open sub-PR before the existing `review`-skill pass and self-merge. A comment the loop wants to push back on does not pause the fan-out; it is logged as a bubble-up concern and surfaced at the wave checkpoint. +- **Off** (default) — ripening is unchanged. + +Record the choice as `sub_pr_review_loop: on` or `sub_pr_review_loop: off`. The fan-out skill reads it at every ripening. Default if the field is missing in an older state file: `off` (preserves the original behaviour). + ### 3. Set up the implementation environment - **Multi-PR (feature-branch model)** — `feature/` already exists, created and pushed by `feature-dev-workflow:planning-a-feature` and carrying the committed spec/plan/state. **Reuse it; never re-create it** off `origin/main` — that errors (`fatal: a branch named 'feature/' already exists`) and would orphan the planning artifacts. If planning already made the integration worktree at `.claude/worktrees/`, just `cd` into it. Otherwise attach one to the existing branch: @@ -94,6 +101,8 @@ Record the choice in the state file's frontmatter as `sub_pr_approval: autonomou Sub-PRs into the feature branch are owned by `feature-dev-workflow:fanning-out-with-worktrees`, not this step. +Once the PR to main is open — the single-PR PR, or the multi-PR integration PR — ask the user (via `AskUserQuestion`) whether to run an automated review-loop on it before handing off for external human review. If yes, **OPTIONAL SUB-SKILL:** `feature-dev-workflow:review-loop` against this PR; it drives the PR's automated (Copilot) review to clean. This is the interactive context, so a comment the loop wants to push back on pauses for the user. If no, hand off as-is. + ### 7. Tear down the planning artifacts Delete the plan + state file once the work is genuinely done. The spec stays — it's the durable ADR. The plan and state file are scratch; leaving them committed past readiness pollutes the repo with stale operational state that future `grep`s have to wade through. diff --git a/skills/fanning-out-with-worktrees/SKILL.md b/skills/fanning-out-with-worktrees/SKILL.md index f8b3549..009289a 100644 --- a/skills/fanning-out-with-worktrees/SKILL.md +++ b/skills/fanning-out-with-worktrees/SKILL.md @@ -93,13 +93,14 @@ When a sub-PR is ready (subagent reports `ready` and the relevant verification c Per sub-PR, in order: -1. **Review.** **REQUIRED SUB-SKILL:** the `review` skill — invoke it against the sub-PR number to walk the diff with full PR context. This review is weaker than external review (which lands at the integration PR) but stronger than nothing; it catches issues that would otherwise pile onto the integration PR reviewer. Beyond bugs, check the diff against the `## Conventions` block: does its layout, naming, and vocabulary match the block and the sibling PRs already merged? A convention violation caught here is one the integration reviewer never sees. -2. **Approval gate, per the state file's `sub_pr_approval` mode.** Every gate covers the **bundle**: merge + sub-issue close + state-file update. The close is bodyless (no `--comment` flag) — GitHub automatically cross-references the sub-issue from the merge commit via the sub-PR's body keyword, so no custom comment is needed and there's no "specific body about to land" for the close mutation. - - **`autonomous`** (default) — proceed straight through the bundle in steps 3-5. The user opted into the mechanical bundle (review → merge → bodyless close → state update) in `feature-dev-workflow:developing-a-feature` Step 2. +1. **Automated review-loop, if the state file's `sub_pr_review_loop` is `on`.** **OPTIONAL SUB-SKILL:** `feature-dev-workflow:review-loop` against the sub-PR number, in fan-out mode — it requests the automated (Copilot) review, waits for it in the background, and drives the PR to review-clean. In fan-out mode a comment the loop wants to push back on does **not** pause the fan-out: log it as a bubble-up concern in the `## Bubble-up log` (surfaced at the wave checkpoint by `feature-dev-workflow:reviewing-feature-progress`) and continue. Skip this step when `sub_pr_review_loop` is `off` or missing. It runs before the orchestrator's own review so the diff reviewed below is the post-automated-review diff that will actually merge. +2. **Review.** **REQUIRED SUB-SKILL:** the `review` skill — invoke it against the sub-PR number to walk the diff with full PR context. This review is weaker than external review (which lands at the integration PR) but stronger than nothing; it catches issues that would otherwise pile onto the integration PR reviewer. Beyond bugs, check the diff against the `## Conventions` block: does its layout, naming, and vocabulary match the block and the sibling PRs already merged? A convention violation caught here is one the integration reviewer never sees. +3. **Approval gate, per the state file's `sub_pr_approval` mode.** Every gate covers the **bundle**: merge + sub-issue close + state-file update. The close is bodyless (no `--comment` flag) — GitHub automatically cross-references the sub-issue from the merge commit via the sub-PR's body keyword, so no custom comment is needed and there's no "specific body about to land" for the close mutation. + - **`autonomous`** (default) — proceed straight through the bundle in steps 4-6. The user opted into the mechanical bundle (review → merge → bodyless close → state update) in `feature-dev-workflow:developing-a-feature` Step 2. - **`manual`** — pause and ask the user for explicit approval before the bundle. The prompt MUST surface: a one-line summary of the review findings ("review clean" / " findings, none blocking" / specific concerns), the PR's title and diff size, and a note that closing sub-issue `#` follows the merge. Wait for an explicit yes. On push-back, route the concern back to the worktree subagent via `SendMessage` instead of merging. -3. **Merge into the feature branch.** First **push any local state-file commits to `origin/feature/`** — the merge happens on GitHub's side and lands on origin's current tip, so unpushed local commits make the post-merge `git -C pull --ff-only` fail with "Not possible to fast-forward" (local and origin have diverged; recover with `git rebase origin/feature/`). Keep local == origin at every merge boundary. Then `gh pr merge --merge` (or `--squash` / `--rebase` per project preference), and `git -C fetch origin && git merge --ff-only origin/feature/` to bring the merge back. -4. **Close the sub-issue.** `gh issue close `. Sub-PRs into a non-default branch don't trigger `Fixes`/`Closes` — manual close is the workaround. The body's `Towards #` keyword left the issue open precisely so the orchestrator can close it here; the cross-reference from the merge commit (which references `#`, which references `#`) is preserved automatically without a custom comment. -5. **Update the state file.** Flip the row's status to `self-merged`. If the sub-PR was the realization of a contract (e.g. a pre-merge stub PR), flip the contract row's status to `locked` and fill in the `Realized in` pointer. +4. **Merge into the feature branch.** First **push any local state-file commits to `origin/feature/`** — the merge happens on GitHub's side and lands on origin's current tip, so unpushed local commits make the post-merge `git -C pull --ff-only` fail with "Not possible to fast-forward" (local and origin have diverged; recover with `git rebase origin/feature/`). Keep local == origin at every merge boundary. Then `gh pr merge --merge` (or `--squash` / `--rebase` per project preference), and `git -C fetch origin && git merge --ff-only origin/feature/` to bring the merge back. +5. **Close the sub-issue.** `gh issue close `. Sub-PRs into a non-default branch don't trigger `Fixes`/`Closes` — manual close is the workaround. The body's `Towards #` keyword left the issue open precisely so the orchestrator can close it here; the cross-reference from the merge commit (which references `#`, which references `#`) is preserved automatically without a custom comment. +6. **Update the state file.** Flip the row's status to `self-merged`. If the sub-PR was the realization of a contract (e.g. a pre-merge stub PR), flip the contract row's status to `locked` and fill in the `Realized in` pointer. ### 6. Checkpoint review, then dispatch wave N+1 @@ -123,7 +124,7 @@ When every wave is complete (every sub-issue closed, every row `self-merged`, ev - **Letting the worktree subagent review its own PR.** A subagent reviewing the code it just wrote has the same blind spots in review that it had in implementation. The orchestrator owns sub-PR review precisely because it didn't write the code. - **Letting a bubble-up concern die in one subagent.** Propagation is the whole point of the watch loop. - **Forgetting to manually close the sub-issue.** The keyword doesn't fire on non-default-branch merges; the issue page silently shows "open" even though the work shipped. -- **Letting the state file go stale.** A resumed session reads it as ground truth. Update every row as reality moves; commit **and push** the state-file diff per phase transition — an unpushed local commit diverges the feature branch the moment the next sub-PR squash-merges on GitHub (see Step 5.3). +- **Letting the state file go stale.** A resumed session reads it as ground truth. Update every row as reality moves; commit **and push** the state-file diff per phase transition — an unpushed local commit diverges the feature branch the moment the next sub-PR squash-merges on GitHub (see Step 5.4). ## Red flags diff --git a/templates/project-CLAUDE.md b/templates/project-CLAUDE.md index 1094bdd..0817de2 100644 --- a/templates/project-CLAUDE.md +++ b/templates/project-CLAUDE.md @@ -49,6 +49,7 @@ Invoke `feature-dev-workflow:planning-a-feature` at conception. It and the `**RE | Public-facing docs for what the feature changed (once structurally complete) | `feature-dev-workflow:writing-docs` | | Verify-before-done | `superpowers:verification-before-completion` | | Open / flip pull requests | `feature-dev-workflow:opening-a-pull-request` | +| Loop a PR through automated (Copilot) review until clean (opt-in) | `feature-dev-workflow:review-loop` | `superpowers:*` skills come from the [superpowers](https://github.com/obra/superpowers) plugin (a prerequisite, see below). From 43301fd351a580f10103ad13885b74d24601203c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Sun, 31 May 2026 00:55:16 +0200 Subject: [PATCH 3/7] fix(skills/review-loop): handle unreliable Copilot re-review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The loop's round-2+ step assumed re-requesting Copilot works like the first request. It doesn't: once Copilot has reviewed, re-adding it as a reviewer is unreliable — sometimes a fresh review fires (even on the same commit), sometimes nothing happens, and Copilot does not dependably auto-review on push. Verified against a live PR (two reviews landed on one commit) and GitHub's own docs/community reports. Re-trigger with remove-then-re-add, confirm a genuinely new review by watermark rather than the gh exit code, and STOP-and-report if none arrives before the wait times out instead of spinning — pointing the user at the durable path (a repo ruleset running Copilot review on every push). Also note that a fired re-review may repeat already-addressed comments, so triage re-verifies against the current diff. Co-Authored-By: Claude Opus 4.8 (1M context) --- skills/review-loop/SKILL.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/skills/review-loop/SKILL.md b/skills/review-loop/SKILL.md index 6e3dd6e..a6f63fd 100644 --- a/skills/review-loop/SKILL.md +++ b/skills/review-loop/SKILL.md @@ -26,7 +26,9 @@ Requires `gh` ≥ 2.88.0 (`gh --version`). Not available on GitHub Enterprise Se | Need | Command / identity | | --- | --- | -| Request the review | `gh pr edit --add-reviewer "@copilot"` | +| Request the review (first round) | `gh pr edit --add-reviewer "@copilot"` | +| Re-request after Copilot already reviewed | `gh pr edit --remove-reviewer "@copilot"` then `--add-reviewer "@copilot"`. Re-requesting is **unreliable** — it sometimes produces a fresh review (even on the same commit) and sometimes silently does nothing. Never assume it fired; confirm by watermark (next row). | +| Confirm a *new* review actually arrived | a Copilot review in `/pulls//reviews` with `submitted_at` newer than the watermark you captured before re-requesting. This is the only reliable signal — not the exit code of `gh pr edit`. | | Confirm it attached (gh can exit 0 without attaching) | `gh pr view --json reviewRequests` → expect a reviewer with login `Copilot` | | Copilot's author login in `/pulls//reviews` | `copilot-pull-request-reviewer[bot]` (this is the watermark filter) | | List open review threads | `gh api graphql` on `PullRequest.reviewThreads` → `id`, `isResolved`, `viewerCanResolve` | @@ -74,7 +76,9 @@ One cycle, in order: It polls `gh api .../pulls//reviews` for a Copilot review newer than `$SINCE` and exits 0 (printing the review) when one lands, or 124 on timeout. The harness re-invokes the session when it exits. Do **not** write a foreground `sleep`/`until` loop — foreground sleep is blocked and it freezes the session. 4. **Triage every open review comment** — Copilot's new ones plus any human comments already on the PR. **REQUIRED SUB-SKILL:** `superpowers:receiving-code-review`. Verify each against the codebase. No performative agreement, no blind implementation. Copilot is confidently wrong often enough that "apply all suggestions" is the wrong default. 5. **Address.** Apply the comments that are correct, one at a time, testing each; reply in the comment thread stating what changed; resolve the thread (`resolveReviewThread`). For a comment that is wrong or a judgment call, **do not silently apply or silently ignore** — handle the pushback per the calling context (below). -6. **Re-request and loop.** Commit and push the fixes (commit convention from the project's CLAUDE.md), then return to Step 1 with a fresh watermark. +6. **Re-request and loop.** Commit and push the fixes (commit convention from the project's CLAUDE.md). Capture a fresh watermark, then re-trigger Copilot with **remove-then-re-add** (`gh pr edit --remove-reviewer "@copilot"` then `--add-reviewer "@copilot"`) — a plain re-add often no-ops once Copilot has already reviewed. Launch the background wait (Step 3) against the new watermark and return to Step 4 when a new review lands. + + **If the wait times out with no new review, the re-request did not fire — STOP, do not spin.** Re-requesting is inherently unreliable (GitHub exposes no dependable programmatic re-review; Copilot does not reliably auto-review on push either). Report that automated re-review could not be triggered and point the user at the robust path: enable a **repo ruleset that runs Copilot code review on every push**, which makes the loop reliable, or click the re-request (🔄) icon in the PR's Reviewers menu. Do not loop further against a reviewer that isn't responding. ## Termination @@ -85,6 +89,8 @@ Stop and declare clean when **both**: **Safety cap: 3 request→address rounds.** If the loop has not converged by the cap, **stop** and report what remains unresolved. Do not raise the cap to keep going — non-convergence (Copilot surfacing churny or contradictory nits round after round) is a signal to hand back to the user, not to loop harder. Only count a review as "clean" if it arrived *after* your last push; a review from before the push is stale and must not end the loop. +**Stop early if a re-request can't be triggered.** Separate from the convergence cap: if a round's re-request (Step 6) produces no new review before the wait times out, stop and report rather than retrying blindly. Re-review is not reliably automatable; the durable fix is the on-push Copilot ruleset. Also expect that when a re-review *does* fire, Copilot may repeat comments you already addressed or dismissed — re-verify against the current diff rather than assuming a repeated comment is a new problem. + ## Pushback handling depends on the calling context The skill is told its context when invoked. This controls Step 5 pushback only. @@ -112,4 +118,6 @@ Requesting the reviewer, replying in threads, resolving threads, and pushing are | "Copilot suggested it, so apply it" | Copilot is confidently wrong often. Triage via `receiving-code-review`; verify before applying. | | "Still finding nits — one more round" | Past the 3-round cap, non-convergence is a signal to hand back, not to loop harder. | | "A Copilot review exists, so we're clean" | Only a review *after your last push* counts. An earlier-round review is stale. | +| "I re-added Copilot, so a fresh review is coming" | Re-request is unreliable and a plain re-add often no-ops after Copilot already reviewed. Use remove-then-re-add, and confirm a new review by watermark — if none arrives before the wait times out, stop and recommend the on-push ruleset. | +| "Copilot raised this again, so it's a new problem" | A re-review may repeat comments you already addressed or dismissed. Re-verify against the current diff before acting. | | "I'll auto-dismiss the comment I disagree with" | Wrong/judgment-call comments get pushback (interactive) or a bubble-up concern (fan-out) — never a silent drop. | From 479ea635feab9cc4dc61aeb78da94b4ad2896f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Sun, 31 May 2026 00:57:20 +0200 Subject: [PATCH 4/7] fix(skills/review-loop): wait for push-triggered review before re-requesting When a repo auto-reviews Copilot on every push, the push that lands the round's fixes already triggers a review. The previous step 6 then also did an explicit remove-then-re-add, producing a second redundant review. Capture the watermark before pushing, wait for a push-triggered review first, and fall back to the explicit remove-then-re-add only if that wait times out. Same command sequence either way; the repo's behavior decides which branch resolves it, with no double review when auto-review is on. Co-Authored-By: Claude Opus 4.8 (1M context) --- skills/review-loop/SKILL.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/skills/review-loop/SKILL.md b/skills/review-loop/SKILL.md index a6f63fd..93cc137 100644 --- a/skills/review-loop/SKILL.md +++ b/skills/review-loop/SKILL.md @@ -76,9 +76,14 @@ One cycle, in order: It polls `gh api .../pulls//reviews` for a Copilot review newer than `$SINCE` and exits 0 (printing the review) when one lands, or 124 on timeout. The harness re-invokes the session when it exits. Do **not** write a foreground `sleep`/`until` loop — foreground sleep is blocked and it freezes the session. 4. **Triage every open review comment** — Copilot's new ones plus any human comments already on the PR. **REQUIRED SUB-SKILL:** `superpowers:receiving-code-review`. Verify each against the codebase. No performative agreement, no blind implementation. Copilot is confidently wrong often enough that "apply all suggestions" is the wrong default. 5. **Address.** Apply the comments that are correct, one at a time, testing each; reply in the comment thread stating what changed; resolve the thread (`resolveReviewThread`). For a comment that is wrong or a judgment call, **do not silently apply or silently ignore** — handle the pushback per the calling context (below). -6. **Re-request and loop.** Commit and push the fixes (commit convention from the project's CLAUDE.md). Capture a fresh watermark, then re-trigger Copilot with **remove-then-re-add** (`gh pr edit --remove-reviewer "@copilot"` then `--add-reviewer "@copilot"`) — a plain re-add often no-ops once Copilot has already reviewed. Launch the background wait (Step 3) against the new watermark and return to Step 4 when a new review lands. +6. **Re-request and loop.** Capture a fresh watermark *first*, then commit and push the fixes (commit convention from the project's CLAUDE.md). Now **wait before re-requesting** — launch the background wait (Step 3) against that watermark: - **If the wait times out with no new review, the re-request did not fire — STOP, do not spin.** Re-requesting is inherently unreliable (GitHub exposes no dependable programmatic re-review; Copilot does not reliably auto-review on push either). Report that automated re-review could not be triggered and point the user at the robust path: enable a **repo ruleset that runs Copilot code review on every push**, which makes the loop reliable, or click the re-request (🔄) icon in the PR's Reviewers menu. Do not loop further against a reviewer that isn't responding. + - **A repo that runs Copilot review on every push will produce the new review from the push alone.** Waiting first lets that auto-review land and counts it (it is newer than the pre-push watermark). Re-requesting on top of it would double the review — so do not re-request until the wait has had a chance to catch a push-triggered review. + - **Only if that wait times out with no new review**, re-trigger explicitly with **remove-then-re-add** (`gh pr edit --remove-reviewer "@copilot"` then `--add-reviewer "@copilot"`) — a plain re-add often no-ops once Copilot has already reviewed — and launch the wait once more. + + When a new review lands (from either path), return to Step 4. + + **If the second wait also times out, STOP — do not spin.** Re-requesting is inherently unreliable (GitHub exposes no dependable programmatic re-review; Copilot does not reliably auto-review on push either). Report that automated re-review could not be triggered and point the user at the robust path: enable a **repo ruleset that runs Copilot code review on every push**, which makes the loop reliable, or click the re-request (🔄) icon in the PR's Reviewers menu. Do not loop further against a reviewer that isn't responding. ## Termination @@ -119,5 +124,6 @@ Requesting the reviewer, replying in threads, resolving threads, and pushing are | "Still finding nits — one more round" | Past the 3-round cap, non-convergence is a signal to hand back, not to loop harder. | | "A Copilot review exists, so we're clean" | Only a review *after your last push* counts. An earlier-round review is stale. | | "I re-added Copilot, so a fresh review is coming" | Re-request is unreliable and a plain re-add often no-ops after Copilot already reviewed. Use remove-then-re-add, and confirm a new review by watermark — if none arrives before the wait times out, stop and recommend the on-push ruleset. | +| "I pushed the fixes, now re-request immediately" | If the repo auto-reviews on push, the push already triggers a review; an immediate re-request doubles it. Capture the watermark before pushing, wait for the push-triggered review first, and re-request only if it doesn't arrive. | | "Copilot raised this again, so it's a new problem" | A re-review may repeat comments you already addressed or dismissed. Re-verify against the current diff before acting. | | "I'll auto-dismiss the comment I disagree with" | Wrong/judgment-call comments get pushback (interactive) or a bubble-up concern (fan-out) — never a silent drop. | From b07b11841ca1dac1a8c56d36b87352dd534060a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Sun, 31 May 2026 01:08:26 +0200 Subject: [PATCH 5/7] fix(skills/review-loop): tolerate transient gh failures in the wait poller Address Copilot review on #14. Under set -euo pipefail a transient gh/network failure in the poll loop killed the script with gh's exit code, which the caller could not distinguish from the 124 timeout. Guard the poll with || true so a blip or rate-limit just retries next interval; only a real timeout exits 124. Also document why the submitted_at watermark comparison is a plain string compare (GitHub always returns UTC ISO-8601 Z timestamps, whose lexical order is chronological) rather than converting to epoch. Co-Authored-By: Claude Opus 4.8 (1M context) --- skills/review-loop/templates/await-copilot-review.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/skills/review-loop/templates/await-copilot-review.sh b/skills/review-loop/templates/await-copilot-review.sh index ca508a1..65e2fcf 100755 --- a/skills/review-loop/templates/await-copilot-review.sh +++ b/skills/review-loop/templates/await-copilot-review.sh @@ -39,6 +39,12 @@ DEADLINE=$(( $(date +%s) + TIMEOUT )) # copilot-pull-request-reviewer[bot]; the broad test() guards against the # identity surfacing under a variant login. The watermark is interpolated as a # jq string literal — safe because an ISO-8601 timestamp has no jq metacharacters. +# +# The `submitted_at > $SINCE` comparison is a plain string comparison. That is +# correct here because the GitHub REST API always returns timestamps as UTC +# ISO-8601 with a `Z` suffix (e.g. 2026-05-30T22:47:51Z), a fixed-width format +# whose lexical order matches chronological order. Capture SINCE in the same +# format (`date -u +%Y-%m-%dT%H:%M:%SZ`) and the comparison holds. filter="[.[] | select((.user.login==\"$BOT\") or (.user.login|test(\"[Cc]opilot\"))) | select(.submitted_at > \"$SINCE\") @@ -46,7 +52,11 @@ filter="[.[] | sort_by(.submitted_at) | reverse" while :; do - found=$(gh api "repos/$REPO/pulls/$PR/reviews" --paginate --jq "$filter") + # Tolerate transient gh/network failures: `|| true` keeps a blip or a rate-limit + # from killing the poller under `set -e`. A failed poll yields empty output, so + # the loop simply tries again next interval — only a genuine timeout exits 124, + # keeping the exit code unambiguous for the caller (0 found / 124 timed out). + found=$(gh api "repos/$REPO/pulls/$PR/reviews" --paginate --jq "$filter" 2>/dev/null || true) if [ -n "$found" ] && [ "$found" != "[]" ]; then printf '%s\n' "$found" exit 0 From 8b98754dace7ce5b5bf8037d15e0669f50fc585d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Sun, 31 May 2026 01:16:41 +0200 Subject: [PATCH 6/7] fix(skills/review-loop): address Copilot review on #14 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two issues Copilot flagged, both verified against real PRs before fixing: - Wait poller false positive across pages. `gh api --paginate --jq` runs the filter once per page and concatenates, so an array-wrapped filter emitted `[]` per page — a multi-page no-match printed `[]\n[]…`, which is non-empty and != `[]`, read as "review found". (`--slurp` is not an option: gh rejects it together with `--jq`.) The filter now emits each matching review as a bare object, so a no-match prints nothing on every page and the emptiness test is correct. Documented why the watermark stays a string comparison (GitHub returns fixed-width UTC ISO-8601 Z). - Stale review commit at Gate B. The loop ran at PR-open, but Step 7's teardown is the last commit on the branch, so a clean review before teardown is stale against what external reviewers see. Gate B now runs the loop on the final PR diff — deferred until after the teardown commit (or re-run), and when CI gates teardown, once that commit is green. Co-Authored-By: Claude Opus 4.8 (1M context) --- skills/review-loop/templates/await-copilot-review.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/review-loop/templates/await-copilot-review.sh b/skills/review-loop/templates/await-copilot-review.sh index 65e2fcf..8a6209f 100755 --- a/skills/review-loop/templates/await-copilot-review.sh +++ b/skills/review-loop/templates/await-copilot-review.sh @@ -57,7 +57,7 @@ while :; do # the loop simply tries again next interval — only a genuine timeout exits 124, # keeping the exit code unambiguous for the caller (0 found / 124 timed out). found=$(gh api "repos/$REPO/pulls/$PR/reviews" --paginate --jq "$filter" 2>/dev/null || true) - if [ -n "$found" ] && [ "$found" != "[]" ]; then + if [ -n "$found" ]; then printf '%s\n' "$found" exit 0 fi From dc7449df69ba57c675c78006afe477fc128009ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86gir=20M=C3=A1ni=20Hauksson?= <54936225+sourcehawk@users.noreply.github.com> Date: Sun, 31 May 2026 01:37:05 +0200 Subject: [PATCH 7/7] fix(skills/review-loop): repair wait poller and apply Copilot review fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My earlier commit (8b98754) left the wait poller broken — it kept the array-wrapped jq filter but dropped the != "[]" guard, so the first poll always false-positived. This rebuilds the poller and properly lands the two issues Copilot flagged on #14, both verified against a real PR: - Paginated false positive. gh runs the --jq filter once per page under --paginate and concatenates (and --slurp is rejected with --jq). The filter now emits each matching review as a bare object instead of an array, so a multi-page no-match prints nothing rather than repeated empty arrays. - Error bodies no longer mistaken for reviews. On an HTTP error gh prints the error JSON to stdout and exits non-zero; output is now captured only when gh succeeds, so a 404/rate-limit/blip leaves the result empty and the loop retries instead of false-positiving. (Found while testing.) - Stale review commit at Gate B (developing-a-feature). The loop ran at PR-open, but Step 7's teardown is the last commit on the branch, so a clean review before teardown is stale against what external reviewers see. Gate B now runs the loop on the final PR diff. Co-Authored-By: Claude Opus 4.8 (1M context) --- skills/developing-a-feature/SKILL.md | 2 + .../templates/await-copilot-review.sh | 40 ++++++++++++------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/skills/developing-a-feature/SKILL.md b/skills/developing-a-feature/SKILL.md index 13c078d..67b5ddf 100644 --- a/skills/developing-a-feature/SKILL.md +++ b/skills/developing-a-feature/SKILL.md @@ -103,6 +103,8 @@ Sub-PRs into the feature branch are owned by `feature-dev-workflow:fanning-out-w Once the PR to main is open — the single-PR PR, or the multi-PR integration PR — ask the user (via `AskUserQuestion`) whether to run an automated review-loop on it before handing off for external human review. If yes, **OPTIONAL SUB-SKILL:** `feature-dev-workflow:review-loop` against this PR; it drives the PR's automated (Copilot) review to clean. This is the interactive context, so a comment the loop wants to push back on pauses for the user. If no, hand off as-is. +Run the loop on the **final** PR diff. The Step 7 teardown is the last commit on the branch, so a review run before it goes stale against what external reviewers actually see. If the teardown will land after a clean review, defer the loop until after the teardown commit (or re-run it afterward) — and when CI gates the teardown, that means running the loop once the teardown commit is pushed and green, not at PR-open. + ### 7. Tear down the planning artifacts Delete the plan + state file once the work is genuinely done. The spec stays — it's the durable ADR. The plan and state file are scratch; leaving them committed past readiness pollutes the repo with stale operational state that future `grep`s have to wade through. diff --git a/skills/review-loop/templates/await-copilot-review.sh b/skills/review-loop/templates/await-copilot-review.sh index 8a6209f..da3cd92 100755 --- a/skills/review-loop/templates/await-copilot-review.sh +++ b/skills/review-loop/templates/await-copilot-review.sh @@ -35,28 +35,40 @@ BOT="copilot-pull-request-reviewer[bot]" DEADLINE=$(( $(date +%s) + TIMEOUT )) # jq filter (run by gh's built-in engine): Copilot-authored reviews submitted -# after the watermark, newest first. The bot's author login in /reviews is +# after the watermark. The bot's author login in /reviews is # copilot-pull-request-reviewer[bot]; the broad test() guards against the # identity surfacing under a variant login. The watermark is interpolated as a # jq string literal — safe because an ISO-8601 timestamp has no jq metacharacters. # -# The `submitted_at > $SINCE` comparison is a plain string comparison. That is -# correct here because the GitHub REST API always returns timestamps as UTC -# ISO-8601 with a `Z` suffix (e.g. 2026-05-30T22:47:51Z), a fixed-width format -# whose lexical order matches chronological order. Capture SINCE in the same -# format (`date -u +%Y-%m-%dT%H:%M:%SZ`) and the comparison holds. -filter="[.[] +# The filter emits each matching review as a bare object, NOT wrapped in an outer +# array. This is deliberate: gh runs the --jq filter once per page under +# --paginate and concatenates the outputs (and --slurp is rejected together with +# --jq). An array-wrapped filter would emit an empty array for every page with no +# match, so a multi-page no-match would print several empty arrays in a row — +# non-empty output, which the loop would misread as "review found". Emitting bare +# objects means a no-match prints nothing on every page, so the output is +# genuinely empty and the -n test below is correct. +# +# submitted_at > SINCE is a plain string comparison, which is correct here: the +# GitHub REST API always returns UTC ISO-8601 with a Z suffix (fixed-width, so +# lexical order == chronological order). Capture SINCE the same way +# (date -u +%Y-%m-%dT%H:%M:%SZ) and the comparison holds. +filter=".[] | select((.user.login==\"$BOT\") or (.user.login|test(\"[Cc]opilot\"))) | select(.submitted_at > \"$SINCE\") - | {id, state, submitted_at, body}] - | sort_by(.submitted_at) | reverse" + | {id, state, submitted_at, body}" while :; do - # Tolerate transient gh/network failures: `|| true` keeps a blip or a rate-limit - # from killing the poller under `set -e`. A failed poll yields empty output, so - # the loop simply tries again next interval — only a genuine timeout exits 124, - # keeping the exit code unambiguous for the caller (0 found / 124 timed out). - found=$(gh api "repos/$REPO/pulls/$PR/reviews" --paginate --jq "$filter" 2>/dev/null || true) + # Capture output ONLY when gh succeeds. On an HTTP error (404, rate-limit, a + # network blip) gh prints the error body to stdout AND exits non-zero; the + # if-guard discards that body and leaves found empty, so an error is never + # mistaken for a review. This also tolerates transient failures under set -e: + # a failed poll just retries next interval, and only a genuine timeout exits + # 124 — keeping the exit code unambiguous for the caller (0 found / 124 timed). + found="" + if out=$(gh api "repos/$REPO/pulls/$PR/reviews" --paginate --jq "$filter" 2>/dev/null); then + found="$out" + fi if [ -n "$found" ]; then printf '%s\n' "$found" exit 0