diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4a9dad3..1b0f97b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -82,3 +82,31 @@ jobs: # Trivy audits what physically shipped in the image. vuln-type: os,library exit-code: '1' + + deploy: + # Runs only on push to main. PRs (including from forks) never satisfy + # this condition, so `FLY_API_TOKEN` is structurally unreachable from + # any job that runs untrusted PR code. + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + needs: [test, security, image-scan] + runs-on: ubuntu-latest + # Belt-and-suspenders: the workflow-level permissions block above + # already grants only `contents: read`, but a future top-level + # escalation would silently widen this job's scope. Pin the minimum + # here so the deploy job stays read-only against the repo even if + # the header drifts. `flyctl` itself authenticates to Fly via + # `FLY_API_TOKEN`, not via GitHub permissions. + permissions: + contents: read + steps: + - uses: actions/checkout@v6 + # Tracks: superfly/flyctl-actions/setup-flyctl@1.6 + # Pinned by commit SHA so a tag-swap upstream cannot change what + # holds `FLY_API_TOKEN` during the next deploy. Refresh the comment + # in lockstep with the SHA. Same convention as the Trivy pin in + # `image-scan` (#68) and the govulncheck pin in `security` (#41). + - uses: superfly/flyctl-actions/setup-flyctl@ed8efb33836e8b2096c7fd3ba1c8afe303ebbff1 + - name: flyctl deploy + run: flyctl deploy --remote-only + env: + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} diff --git a/docs/architecture.md b/docs/architecture.md index 6089017..4bee448 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -62,6 +62,14 @@ Intended uses: **NOT recommended for production.** Setting it permanently silences a check whose entire purpose is to catch the silent-routing-failure mode described above. +## Hosting + +Production deploys to a single **Fly.io** machine in one region. 
TLS terminates in the relay binary (autocert, #9) — Fly runs the substrate in raw-TCP passthrough mode on `:80` and `:443`, with a dedicated IPv4 so Let's Encrypt's HTTP-01 challenge resolves deterministically. The autocert cache lives on a Fly volume at `/var/lib/relay/autocert`. + +The single-machine cap is platform-enforced via `min_machines_running = 1`, `auto_start_machines = false`, and `auto_stop_machines = "off"` in `fly.toml`, and binary-enforced via the `PYRYCODE_RELAY_SINGLE_INSTANCE` self-check (#65). Multi-instance scaling is out of scope for v1 — see § *Single-instance constraint* above. + +Bootstrap and rollback procedures: [`docs/deploy.md`](deploy.md). The manifest itself: [`fly.toml`](../fly.toml). CI deploy job: [`.github/workflows/ci.yml`](../.github/workflows/ci.yml). + ## Threat model Wire-protocol threats live in the protocol spec's [Security model](https://github.com/pyrycode/pyrycode/blob/main/docs/protocol-mobile.md#security-model). Operational threats specific to the relay binary as a deployed process — deploy, supply chain, DoS, log hygiene, cert handling, TLS config, error-leakage — live in [`docs/threat-model.md`](./threat-model.md). diff --git a/docs/deploy.md b/docs/deploy.md new file mode 100644 index 0000000..3082b71 --- /dev/null +++ b/docs/deploy.md @@ -0,0 +1,75 @@ +# Deploy + +The relay deploys to a single [Fly.io](https://fly.io) machine. CI deploys on every merge to +`main`; manual deploys are needed only for the one-time bootstrap and for +rollbacks. + +See [`docs/architecture.md` § *Hosting*](architecture.md#hosting) for the +hosting + TLS-termination decision record, and +[`docs/threat-model.md` § *Deploy security*](threat-model.md#deploy-security--vps-compromise) +for the operational threat surface the substrate sits on. + +## One-time bootstrap (per environment) + +Done once per Fly app — typically only the production app. + +1. Edit `fly.toml`: replace `__REGION__` with a Fly region code (e.g. 
+ `ams`, `arn`, `fra` — see `flyctl platform regions`) and `__DOMAIN__` + with the public domain (e.g. `relay.pyrycode.dev`). These are + placeholders by design — the first deploy fails loudly if either is + left unset, which is preferable to a silently-misconfigured production + relay. +2. `flyctl apps create pyrycode-relay` (must match `app =` in `fly.toml`). +3. `flyctl ips allocate-v4 --app pyrycode-relay` — a dedicated IPv4 is + **required**, not optional, for autocert's HTTP-01 challenge to + resolve deterministically to the running machine on port 80. Shared + IPv4 + TCP passthrough is not a supported combination on Fly. This is + billable; call it out at provisioning review. +4. `flyctl volumes create relay_autocert --region <region> --size 1 + --app pyrycode-relay` — the volume name must match `source =` in the + `[[mounts]]` block of `fly.toml`. The autocert cache lives here and + survives machine recycles, avoiding repeated Let's Encrypt + re-issuance. +5. DNS: point the production domain (A record) at the IPv4 from step 3. + Let's Encrypt resolves the domain via HTTP-01 on first deploy; + without DNS in place, the first WSS request hangs ~minutes while + autocert retries. +6. GitHub repo secret: `FLY_API_TOKEN` = output of `flyctl auth token`. + Settings → Secrets and variables → Actions → New repository secret. + The token grants deploy access to the entire Fly org — scope it to a + `pyrycode-relay`-only deploy token if Fly's tokens UI offers that at + bootstrap time. + +## Steady-state flow + +1. Open a PR. CI runs `test`, `security`, and `image-scan` on the PR HEAD. +2. Merge to `main`. CI re-runs the three jobs against `main`, then runs + `deploy` (gated on all three passing). +3. `deploy` invokes `flyctl deploy --remote-only`. Fly's remote builder + rebuilds the image from `Dockerfile` and replaces the single machine + in place via the `immediate` deploy strategy. + +Observability: + +- `flyctl status` — machine health. 
+- `flyctl logs --app pyrycode-relay` — relay stderr in real time. +- The deploy job's GitHub Actions log records the build + roll output. + +## Rollback + +Two paths, in increasing order of disruption. + +1. **By image digest (preferred).** `flyctl releases list` shows recent + release digests. `flyctl deploy --image <digest> --remote-only` + pins to the prior image without rebuilding. The autocert cache + persists across rollbacks (it's on the volume), so no Let's Encrypt + re-issuance is triggered. +2. **By release number.** `flyctl releases rollback` rolls back the + *most recent* release. Use when the prior release's digest isn't to + hand and a rollback is needed immediately. + +A rollback does **not** revert the `main` commit. To prevent CI's next +deploy from immediately re-rolling the broken release forward, either +revert the offending PR before the next merge to `main`, or disable the +`deploy` job temporarily by editing `.github/workflows/ci.yml` on a +revert PR. diff --git a/docs/knowledge/INDEX.md b/docs/knowledge/INDEX.md index 009ed87..b5daf33 100644 --- a/docs/knowledge/INDEX.md +++ b/docs/knowledge/INDEX.md @@ -4,6 +4,7 @@ One-line pointers into the evergreen knowledge base. Newest entries at the top o ## Features +- [Fly.io deploy](features/fly-deploy.md) — production host wiring: `fly.toml` declares TCP-passthrough on `:80`/`:443` (no Fly HTTP proxy, no Fly-managed certs) so TLS keeps terminating in the relay via autocert (#9), persistent Fly volume `relay_autocert` mounted at `/var/lib/relay/autocert`, and a single-machine hard cap encoded via `min_machines_running=1` + `auto_start_machines=false` + `auto_stop_machines="off"` + `[deploy] strategy="immediate"` (Fly Apps v2 has no `max_machines` key; the in-binary `PYRYCODE_RELAY_SINGLE_INSTANCE` self-check from #65 is the backstop). 
CI `deploy` job in `.github/workflows/ci.yml` runs `flyctl deploy --remote-only` on push to `main`, gated by branch-condition + `needs: [test, security, image-scan]` + `permissions: contents: read` so `FLY_API_TOKEN` is structurally unreachable from PR code; `superfly/flyctl-actions/setup-flyctl` pinned by commit SHA with `# Tracks:` comment (same convention as #68 / #41). Dedicated IPv4 is required (not optional) for autocert's HTTP-01 challenge; TCP passthrough preserves the real socket peer IP that #34's rate limiter reads. `__REGION__` / `__DOMAIN__` ship as placeholders that fail loud on first deploy (#38). - [Connection-count gauges](features/connection-count-gauges.md) — `pyrycode_relay_connected_binaries` and `pyrycode_relay_connected_phones` exposed via a pull-based `prometheus.Collector` reading `Registry.Counts()` on each scrape; zero edits to `registry.go`; scalar (no labels) by design — `{server="..."}` would carry the attacker-influenced `x-pyrycode-server` header onto the metrics surface, which threat-model § Log hygiene forbids; stale grace-expiry fires can't move the gauge because the pointer-identity guard (ADR-0006) keeps the maps unchanged and the gauge IS the map size; race-tested against 16 mutator goroutines + a tight-loop scraper under `-race`. First collector wired into the #59 seam (#61). - [Metrics registry (scaffolding)](features/metrics-registry.md) — private `*prometheus.Registry` + `NewMetricsHandler` factory wrapping `promhttp.HandlerFor` (text format only; OpenMetrics off; `HandlerOpts.Registry: reg` keeps `promhttp_metric_handler_*` off `DefaultRegisterer`). Seam shape for siblings: per-concern collector struct in its own file, constructed by a helper taking `prometheus.Registerer` (no mega-struct, no package-level vars) — first instantiated by #61's `connectionsCollector`. Listener still pending (#60). Structural defence against default-registry leaks via `TestMetricsRegistry_NoGlobalRegistrarLeak` (#59). 
- [Docker image](features/docker-image.md) — portable OCI artifact: multi-stage `Dockerfile` builds a fully-static binary (`CGO_ENABLED=0`, `-trimpath -s -w`) into `distroless/static-debian12:nonroot`; both base images digest-pinned with `# Tracks:` comments; exposes `:80`/`:443` and declares `/var/lib/relay/autocert` volume; host-specific wiring (TLS policy, ports, volumes, healthcheck) is #38's problem (#32). PR-time Trivy CVE scan against the just-built image lives in CI as the `image-scan` job, fails on **fixable** CRITICAL/HIGH only (`ignore-unfixed: true`), action pinned by commit SHA with `# Tracks: ` comment mirroring the Dockerfile pin convention; intentional overlap with `govulncheck` (source-reachability vs. shipped-artifact) (#68). Both scanners are also re-run daily against `main` via `.github/workflows/security-scan.yml` (cron + `workflow_dispatch`) so disclosed CVEs against unchanged deps surface within ≤24h rather than staying invisible until the next bump (#72); a red cron run also opens a `security-sensitive`-labelled GitHub issue via the workflow's `file-issue` job (artifact-handoff privilege split keeps `issues: write` off the scanners and out of workflow scope; deterministic-title dedup via `gh issue list --search 'in:title …'`) so regressions land as tracked work-items rather than passive Actions rows (#73). @@ -31,7 +32,8 @@ One-line pointers into the evergreen knowledge base. Newest entries at the top o ## Architecture -- [System overview](../architecture.md) — top-level: stateless WS router between phones and pyry binaries. Names the v1 single-instance constraint (in-memory registry → two replicas hold disjoint registries → silent `4404`), the two multi-instance paths documented as future work (shared registry; sticky-on-`x-pyrycode-server`), and the `PYRYCODE_RELAY_SINGLE_INSTANCE=1` bypass for the #65 startup self-check (#64). (Lives at `docs/architecture.md`; not yet split into `architecture/`.) 
+- [System overview](../architecture.md) — top-level: stateless WS router between phones and pyry binaries. Names the v1 single-instance constraint (in-memory registry → two replicas hold disjoint registries → silent `4404`), the two multi-instance paths documented as future work (shared registry; sticky-on-`x-pyrycode-server`), and the `PYRYCODE_RELAY_SINGLE_INSTANCE=1` bypass for the #65 startup self-check (#64). § *Hosting* records the Fly.io + relay-terminated-TLS + single-machine-via-fly.toml decision (#38). (Lives at `docs/architecture.md`; not yet split into `architecture/`.) +- [Deploy procedures](../deploy.md) — operator-facing bootstrap (one-time: `flyctl apps create` / `flyctl ips allocate-v4` / `flyctl volumes create relay_autocert` / DNS / `FLY_API_TOKEN` secret), steady-state (PR → merge to `main` → CI deploys), and rollback (`flyctl deploy --image <digest>` preferred; `flyctl releases rollback` as fallback). Dedicated IPv4 is required for autocert's HTTP-01 challenge — not optional, billable (#38). ## Cross-cutting diff --git a/docs/knowledge/codebase/38.md b/docs/knowledge/codebase/38.md new file mode 100644 index 0000000..5d048dd --- /dev/null +++ b/docs/knowledge/codebase/38.md @@ -0,0 +1,69 @@ +# Ticket #38 — Fly.io deploy manifest + CI auto-deploy + +Wires the portable image from #32 to a concrete production host. Lands `fly.toml` (TCP-passthrough on `:80`/`:443`, persistent volume at `/var/lib/relay/autocert`, single-machine hard cap), a `deploy` job in `.github/workflows/ci.yml` that runs `flyctl deploy --remote-only` on push to `main`, the bootstrap/steady-state/rollback procedures in `docs/deploy.md`, and a § *Hosting* heading in `docs/architecture.md` recording the decision so a cold reader sees the call without scrolling the issue. No Go code touched. + +## Implementation + +- **`fly.toml`** (new, repo root) — Fly Apps v2 manifest. 
+ - `[[services]]` blocks for `:80` and `:443` use `protocol = "tcp"` with `[[services.ports]]` and **no `handlers` value**. An unset handler list is pass-through; `handlers = ["http"]` would steal `:80` from autocert's HTTP-01 listener and ADR-0002's 404 fallback, `handlers = ["tls"]` on `:443` would terminate at Fly's edge and bypass autocert. The comments in the file flag both as load-bearing. + - `[[mounts]]` maps Fly volume `relay_autocert` → `/var/lib/relay/autocert` (matches the Dockerfile's `VOLUME` declaration). + - `[processes] app = "--domain __DOMAIN__ --cert-cache /var/lib/relay/autocert"` — flags as literal argv. The distroless runtime image has no shell, so `sh -c`-style env-var expansion into argv is not available. `--cert-cache` is set explicitly because `defaultCertCache()` returns `$HOME/.pyrycode-relay/certs` (i.e. `/home/nonroot/.pyrycode-relay/certs`), which is **not** the mount path. + - `primary_region = "__REGION__"` and the `__DOMAIN__` placeholder are intentionally left as such — the first deploy fails loud if either is unset, which is preferable to a silently-misconfigured production relay (the architect's `<region>`/`<domain>` spec recommendation; the developer chose the loud-fail placeholder over baking in real values). + - **Single-machine cap is enforced declaratively** via the combination `min_machines_running = 1`, `auto_start_machines = false`, `auto_stop_machines = "off"`, and `[deploy] strategy = "immediate"`. Fly Apps v2 has **no `max_machines` key** despite the AC's wording — the four knobs together encode the AC's intent, plus a comment block flagging that `flyctl scale count > 1` violates the in-process-registry invariant. The runtime self-check (`PYRYCODE_RELAY_SINGLE_INSTANCE`, #65) is the in-binary backstop. + - `[[vm]] size = "shared-cpu-1x"`, `memory = "256mb"` — minimum viable; relay is light-weight. + +- **`.github/workflows/ci.yml`** — appended a fourth top-level job, `deploy`, after `image-scan`. 
+ - Gated by `if: github.event_name == 'push' && github.ref == 'refs/heads/main'`. Fork PRs cannot satisfy this condition, so `FLY_API_TOKEN` is structurally unreachable from untrusted code (AC #4). + - `needs: [test, security, image-scan]` — three existing gates must pass before deploy runs. + - Job-level `permissions: contents: read` (belt-and-suspenders against future workflow-header widening, matching the convention from #68). + - `superfly/flyctl-actions/setup-flyctl@ed8efb33836e8b2096c7fd3ba1c8afe303ebbff1` — pinned by commit SHA with `# Tracks: superfly/flyctl-actions/setup-flyctl@1.6` comment alongside. Same convention as the Trivy pin (#68) and govulncheck pin (#41). + - `FLY_API_TOKEN` flows as `env:` on the deploy step only; never logged, never echoed, never written to disk. + +- **`docs/deploy.md`** (new) — three sections: bootstrap (`apps create` → `ips allocate-v4` → `volumes create relay_autocert` → DNS → `FLY_API_TOKEN` secret), steady-state (PR → merge → CI deploys), rollback (`flyctl deploy --image <digest>` preferred; `flyctl releases rollback` as fallback). Cross-links to `architecture.md § Hosting` and `threat-model.md § Deploy security`. + +- **`docs/architecture.md` § Hosting** — three short paragraphs between § *Single-instance constraint (v1)* and § *Threat model*. Names the Fly.io + relay-terminated-TLS + single-machine-via-fly.toml call, points at `docs/deploy.md`, `fly.toml`, and `.github/workflows/ci.yml`. AC #5 carved an explicit exception permitting the architect/developer to land this single edit in `architecture.md` (the "Never update PROJECT-MEMORY" rule otherwise applies). + +## Patterns established + +- **Action pin + `# Tracks: <action>@<tag>` comment is the convention for ALL third-party GitHub Actions in CI.** This ticket extends the pattern (originated #32 for Dockerfile base digests, repeated #68 for Trivy, #41 for govulncheck) to `superfly/flyctl-actions/setup-flyctl`. Renovate keeps the SHA fresh; the comment makes tag-swap attacks reviewable. 
+- **CI privilege minimisation by structural gating, not by environment-protection rules.** Branch-condition `if:` + `needs:` chain + workflow-level `contents: read` + job-level `permissions: contents: read` together ensure `FLY_API_TOKEN` cannot leak to PR code. No `environment:` block with required reviewers — that lever is available later (one-line addition) but deferred. Same shape as #73's `issues: write` privilege split for the file-issue job: privilege is held by the smallest job that genuinely needs it. +- **`--remote-only` over scan-image-reuse for deploy.** The image-scan job (#68) builds the image locally; reusing it at deploy time would require either GHCR push (regresses `image-scan`'s `contents: read` posture) or an artifact tarball handoff. Both deferred — the Fly remote builder's layer cache makes the rebuild cheap enough that the privilege-minimisation win is worth the seconds. Filed as an open question; not blocking. +- **TCP passthrough preserves real peer IP for #34's rate limiter.** The TLS-in-relay decision (autocert, #9) is now structurally load-bearing for the IP rate limiter: any future move to platform-terminated TLS would require teaching the rate limiter to trust `Fly-Client-IP`, which is the security-sensitive follow-up that was explicitly deferred. Out-of-scope for this ticket; flagged for the next operator considering the swap. +- **Loud placeholders over silently-misconfigured real values.** `__REGION__` and `__DOMAIN__` ship in the committed `fly.toml`; the developer fills them at bootstrap. A wrong-but-plausible real value escaping review is the failure mode this guards against (cf. #9-style silently-misconfigured cert dir). Bootstrap procedure in `docs/deploy.md` names this explicitly. + +## Verification + +No automated tests — the artifact is YAML + TOML + prose. + +- `flyctl validate fly.toml` (developer-side, pre-commit) — catches typos, unknown keys, syntactically-invalid TOML. 
+- CI workflow lint — `actionlint .github/workflows/ci.yml` available if uncertain; not wired into the repo. +- **The first real deploy IS the end-to-end test.** After the bootstrap procedure runs, the developer checks `flyctl status` (one machine, running, healthy), `curl -i https://<domain>/healthz` (`200 ok`), `curl -i http://<domain>/anything-not-acme` (`404`, per ADR-0002), and `flyctl volumes list` (mount present). `flyctl ssh console` will fail — that's expected, distroless has no shell, and it's a structural defence not a regression. + +## Out of scope (delegated) + +- **Scan-image reuse at deploy.** Would require GHCR wiring; deferred. Fly's remote builder rebuilds from `Dockerfile` on every deploy. +- **Per-app Fly deploy token.** `flyctl auth token` issues an org-wide token today; the developer should check at bootstrap whether Fly's tokens UI offers a `pyrycode-relay`-scoped token and use the narrower one if available. +- **Deploy approval gate.** Adding `environment: production` with required reviewers is a one-line addition for later; not gating this ticket. +- **Threat-model edit.** `docs/threat-model.md § Deploy security — VPS compromise` should be updated to reflect "operator-owned Fly account" alongside (or instead of) "operator-owned VPS." The architect's security review flagged this as a follow-up; not gating. +- **Platform-terminated TLS / `Fly-Client-IP` trust path for the rate limiter.** Security-sensitive follow-up against #34; explicitly out of scope per the issue body. +- **Multi-instance scaling.** Separate prerequisite ticket; the in-process registry is the constraint. + +## Lessons learned + +- **Read the platform docs at implementation time, not from training data.** The AC named `max_machines = 1` as a fly.toml key. Fly Apps v2 has no such key — the cap is enforced via four other declarative knobs plus operator discipline. 
The architect's spec flagged this and pre-empted the dead-end; the developer would otherwise have shipped a `fly.toml` that `flyctl validate` rejects. Apply: when an AC names a specific platform key, verify it exists in current docs before committing to the wording. +- **Distroless has no shell — argv substitution is not available.** A reflex `--domain ${DOMAIN}` in `[processes]` would have silently produced a relay invoked with the literal string `${DOMAIN}` as its domain. The spec called this out; the manifest passes literal `--domain __DOMAIN__` argv with operator-side substitution at bootstrap. Apply: any image that uses a no-shell runtime (distroless, scratch, `nonroot` variants) must treat `[processes]` / `command:` / `entrypoint:` values as final argv, not as a shell line. +- **Three-knob cap (`min_machines_running` + `auto_start_machines` + `auto_stop_machines`) only constrains the platform's autonomous behaviour; not the operator's.** `flyctl scale count 2` still violates the in-process-registry invariant. The in-binary `PYRYCODE_RELAY_SINGLE_INSTANCE` self-check from #65 is what catches an actual operator mistake; the manifest knobs prevent autoscaler accidents. Apply: when a correctness invariant depends on cardinality, name the in-binary backstop in the manifest's comment block so the next operator sees both layers. +- **Loud placeholders are a tested failure mode, not a TODO.** `__REGION__` / `__DOMAIN__` were a deliberate spec choice; the developer kept them rather than baking in real values. The first deploy fails fast if the operator forgets either. Apply: any host-specific value that *must* be operator-supplied (region, domain, secret) should ship as a placeholder that fails on first contact, not as a plausible default that escapes review. 
+- **The architect-spec security-review section pays off at the documentation phase.** The "TCP passthrough preserves real peer IP for #34" finding in the spec's security review is what this ticket-file's *Patterns established* section above turns into evergreen knowledge. Apply: when a spec's security review surfaces an alignment fact (this design depends on this other invariant), promote it to the per-ticket file's patterns section so the next ticket touching either side sees the dependency. + +## Cross-links + +- [Architect spec](../../specs/architecture/38-fly-deploy-manifest.md) — full design rationale, manifest stanza-by-stanza commentary, security review. +- [Feature: Fly.io deploy](../features/fly-deploy.md) — evergreen reference for the manifest + CI deploy job. +- [Feature: Docker image](../features/docker-image.md) — the portable artifact this ticket wires to a host; the `#38` deferrals named there close with this ticket. +- [Ticket #32 codebase notes](32.md) — Dockerfile base hardening that this ticket builds on. +- [Ticket #68 codebase notes](68.md) — the `image-scan` job whose `needs:` chain this ticket joins. +- [`docs/deploy.md`](../../deploy.md) — operator-facing bootstrap / steady-state / rollback procedures. +- [`docs/architecture.md` § Hosting](../../architecture.md#hosting) — the decision record. +- [ADR-0002](../decisions/0002-autocert-explicit-failure-on-port-80.md) — the `:80` 404 fallback that the TCP-passthrough `[[services]]` block preserves. +- [Threat model](../../threat-model.md) — § *Deploy security*, § *Cert & key handling* are the surfaces this deploy substrate sits on. diff --git a/docs/knowledge/features/docker-image.md b/docs/knowledge/features/docker-image.md index 016fe0b..a5d500b 100644 --- a/docs/knowledge/features/docker-image.md +++ b/docs/knowledge/features/docker-image.md @@ -2,7 +2,7 @@ Host-agnostic OCI image for the relay, produced by a multi-stage `Dockerfile` at the repo root. 
Image-layer hardening (small base, no shell, no package manager, non-root, digest-pinned bases, stripped static binary) is defence-in-depth on top of the runtime hardening already in place (autocert cache permission check, slow-loris timeouts, header-gate before WS upgrade, 256 KiB frame cap, opaque payload routing). -The image is intentionally **portable, not deployable on its own**: it exposes both `:80` and `:443` and declares a volume mount at `/var/lib/relay/autocert`, but TLS termination policy, port publishing, volume backing, single-instance enforcement, and healthcheck wiring are decisions the host manifest owns (#38 / #39 / #42). +The image is intentionally **portable, not deployable on its own**: it exposes both `:80` and `:443` and declares a volume mount at `/var/lib/relay/autocert`, but TLS termination policy, port publishing, volume backing, single-instance enforcement, and healthcheck wiring are decisions the host manifest owns. The host wiring landed as `fly.toml` + CI auto-deploy in #38 ([Feature: Fly.io deploy](fly-deploy.md)); single-instance binary-level self-check in #65 (#39 / #42 deferrals). ## Build and verification @@ -28,7 +28,7 @@ go build -trimpath -ldflags="-s -w -X main.Version=${VERSION}" \ - `CGO_ENABLED=0` → fully-static binary that runs on distroless/static (no glibc on the runtime image). - `-trimpath` → strips host build paths from the binary; defends against accidental disclosure of build-host directory structure via panic stack traces. - `-s -w` → strips symbol table and DWARF; reduces post-exploitation reverse-engineering convenience and signals build hygiene. -- `-X main.Version=${VERSION}` → mirrors the `Makefile`'s `LDFLAGS`. Image builds default to `VERSION=dev` (matches the bare-binary default); release tooling lands in #38 and overrides via `--build-arg VERSION=…`. +- `-X main.Version=${VERSION}` → mirrors the `Makefile`'s `LDFLAGS`. 
Image builds default to `VERSION=dev` (matches the bare-binary default); a release-tooling override via `--build-arg VERSION=…` remains available for future use (not wired into #38's CI deploy, which builds from `Dockerfile` defaults via `flyctl deploy --remote-only`). `go mod download` runs in a separate layer before `COPY . .` so source-only edits don't bust the dependency-cache layer. @@ -37,7 +37,7 @@ go build -trimpath -ldflags="-s -w -X main.Version=${VERSION}" \ `gcr.io/distroless/static-debian12:nonroot@sha256:…` (digest-pinned). The distroless `:nonroot` variant runs as uid `65532` upstream; the Dockerfile re-asserts `USER nonroot:nonroot` as belt-and-suspenders so a future base swap can't silently regress the invariant. The binary is the only thing in the runtime image — no shell, no package manager, no `apt`/`apk`, no `/etc/passwd` games. - `EXPOSE 80 443` — `:80` for autocert ACME http-01 challenges, `:443` for WSS. The portable artifact exposes both; the host manifest chooses publish-both (autocert mode) or publish-neither (`--insecure-listen` behind a reverse proxy). -- `VOLUME ["/var/lib/relay/autocert"]` — documented mount point for the autocert cache. Without a mount, `--cert-cache` defaults to `/home/nonroot/.pyrycode-relay/certs` (degraded posture: cache vanishes on container restart, forces re-issuance). The host manifest (#38) wires a real backing store. +- `VOLUME ["/var/lib/relay/autocert"]` — documented mount point for the autocert cache. Without a mount, `--cert-cache` defaults to `/home/nonroot/.pyrycode-relay/certs` (degraded posture: cache vanishes on container restart, forces re-issuance). `fly.toml`'s `[[mounts]]` block (#38) wires a Fly volume `relay_autocert` to this path; `[processes] app = "... --cert-cache /var/lib/relay/autocert"` is set explicitly so autocert writes into the persistent volume rather than the unmounted default. 
- `ENTRYPOINT ["/pyrycode-relay"]` — args at `docker run` go straight to the binary (`--domain …`, `--insecure-listen …`, etc.). ## Digest pinning convention @@ -66,8 +66,8 @@ The image layer contributes hardening to three operational surfaces (see `docs/t ## What this artifact deliberately does NOT do -- **No `HEALTHCHECK` directive.** `/healthz` (#10) is already exposed; platform health checks belong in the host manifest (#38), not in the portable artifact. -- **No host-specific config.** No `fly.toml`, `compose.yaml`, k8s manifest, or systemd unit — those live in #38. +- **No `HEALTHCHECK` directive.** `/healthz` (#10) is already exposed; platform health checks belong in the host manifest (`fly.toml`, #38), not in the portable artifact. +- **No host-specific config inside the image.** `fly.toml` (#38) sits at the repo root, not in the build context (its contents land at deploy time via `flyctl`, not as image bytes). - **No single-instance enforcement.** The relay's binary-slot single-instance constraint is #39's problem; the image can be run N times, but only one will hold the slot. - **No startup security-posture self-check.** #42 covers runtime self-validation. - **No `--cert-cache` baked in.** The default (`/home/nonroot/.pyrycode-relay/certs`) is only relevant for `--version` smoke tests; real deployments pass `--cert-cache /var/lib/relay/autocert` via the host manifest. @@ -87,4 +87,5 @@ On a red cron run, a third `file-issue` job (added in #73, scoped to `issues: wr - [Spec: 32-dockerfile-base-hardening](../../specs/architecture/32-dockerfile-base-hardening.md) — architect's design and security review. - [Threat model](../../threat-model.md) — § *Deploy security*, § *Supply chain*, § *Cert & key handling* are the surfaces this image layer hardens. - [Autocert TLS](autocert-tls.md) — what the `:80` / `:443` exposure and `/var/lib/relay/autocert` mount feed. 
-- [`/healthz` endpoint](healthz.md) — what platform health checks will probe (wired in #38, not here). +- [`/healthz` endpoint](healthz.md) — what platform health checks would probe (no `HEALTHCHECK` wired by #38; available for future use). +- [Feature: Fly.io deploy](fly-deploy.md) — the host manifest + CI auto-deploy that wires this image to production (#38). diff --git a/docs/knowledge/features/fly-deploy.md b/docs/knowledge/features/fly-deploy.md new file mode 100644 index 0000000..f45c3b9 --- /dev/null +++ b/docs/knowledge/features/fly-deploy.md @@ -0,0 +1,98 @@ +# Fly.io deploy — production host wiring + +The relay's production host is a single [Fly.io](https://fly.io) machine in one region. `fly.toml` at the repo root tells Fly how to run the portable image from #32; `.github/workflows/ci.yml`'s `deploy` job re-applies the manifest on every push to `main`. Operator-facing procedures live in [`docs/deploy.md`](../../deploy.md); the decision record lives in [`docs/architecture.md` § Hosting](../../architecture.md#hosting). + +## What it does + +- **`fly.toml`** declares a Fly Apps v2 app with TCP passthrough on `:80` + `:443`, a persistent volume at `/var/lib/relay/autocert`, and a single-machine hard cap. +- **CI `deploy` job** runs `flyctl deploy --remote-only` on push to `main`, gated on `test` + `security` + `image-scan` passing. Fly's remote builder rebuilds the image from `Dockerfile` against the merged commit and rolls the single machine in place. +- **Bootstrap is one-time:** `flyctl apps create` → `flyctl ips allocate-v4` → `flyctl volumes create relay_autocert` → DNS → set the `FLY_API_TOKEN` GitHub repo secret → fill `__REGION__` / `__DOMAIN__` placeholders in `fly.toml`. + +## Why this shape + +### TCP passthrough, not Fly's HTTP proxy + +TLS terminates **in the relay** via autocert (#9, shipped). Fly is a raw-TCP substrate; the binary holds the cert. 
Three load-bearing consequences: + +- `[[services]]` blocks use `protocol = "tcp"` with no `handlers = ["http"]` / `handlers = ["tls"]`. An explicit handler list would either steal `:80` from autocert's HTTP-01 listener (breaks cert issuance) or terminate TLS at Fly's edge (bypasses autocert entirely and would force a Fly-managed cert). +- A **dedicated IPv4** is required, not optional. Shared IPv4 + TCP passthrough is not a supported combination on Fly; Let's Encrypt's HTTP-01 challenge needs a deterministic resolution from `:80` to the running machine. `flyctl ips allocate-v4` is part of the bootstrap. +- The real socket peer IP reaches the relay verbatim. #34's IP rate limiter reads the socket peer; the TCP-passthrough decision is what keeps that working without needing to trust an `X-Forwarded-For` / `Fly-Client-IP` header. Any future move to platform-terminated TLS would require a security-sensitive follow-up against #34. + +ADR-0002's `:80` 404 fallback (explicit failure on non-ACME `:80` requests, rather than a 302 → `:443`) means `:80` carries non-trivial application logic. The TCP-passthrough manifest preserves that — Fly inserts no MITM-shaped middleware. + +### Single-machine hard cap + +The relay's connection registry is **in-process** (`internal/relay/registry.go`); two machines hold two disjoint registries and silently drop phones routed to the "wrong" replica. This is a correctness constraint, not a cost optimisation — `docs/architecture.md § Single-instance constraint (v1)` is canonical. + +Fly Apps v2 has **no `max_machines` key**. 
The cap is encoded via four declarative knobs: + +| Knob | Setting | Why | +| --- | --- | --- | +| `[[services]] min_machines_running` | `1` | Don't drop below one | +| `[[services]] auto_start_machines` | `false` | Don't let Fly create new machines on demand | +| `[[services]] auto_stop_machines` | `"off"` | Don't stop the one machine | +| `[deploy] strategy` | `"immediate"` | Don't create a second machine during deploy for blue/green | + +Operator discipline at `flyctl scale count` is the platform-level ceiling — Fly itself does not enforce one. The in-binary `PYRYCODE_RELAY_SINGLE_INSTANCE` self-check (#65) is the load-bearing backstop for an operator mistake. + +### Argv, not shell + +The distroless runtime image has no shell, so `[processes] app = "..."` is treated as **final argv** by Fly, not as a shell line. Env-var expansion (`${DOMAIN}`) is not available; the manifest passes literal values: + +```toml +[processes] + app = "--domain __DOMAIN__ --cert-cache /var/lib/relay/autocert" +``` + +`--cert-cache /var/lib/relay/autocert` is explicit because the binary's default (`defaultCertCache()` returns `$HOME/.pyrycode-relay/certs`, i.e. `/home/nonroot/.pyrycode-relay/certs`) is **not** the volume mount. Without the flag, autocert would write into an unmounted scratch path and lose the cache on every machine recycle. + +### Loud placeholders + +`primary_region = "__REGION__"` and `--domain __DOMAIN__` ship as placeholders. The first deploy fails loudly if the operator forgets to fill them — preferable to a plausible-but-wrong real value escaping review (cf. the silently-misconfigured-cert-dir failure mode the autocert work guarded against). + +## CI deploy job — privilege model + +`deploy` runs in `.github/workflows/ci.yml` after `image-scan`. Three structural defences keep `FLY_API_TOKEN` away from untrusted code: + +1. **Branch gate.** `if: github.event_name == 'push' && github.ref == 'refs/heads/main'`. PRs (including from forks) never satisfy this condition. 
+2. **`needs:` chain.** `test` + `security` + `image-scan` must pass. A bad PR that slips through review still has to pass the existing gates. +3. **Job-level `permissions: contents: read`.** Belt-and-suspenders on the workflow-level header. `FLY_API_TOKEN` flows only as `env:` on the deploy step. + +`superfly/flyctl-actions/setup-flyctl` is pinned by commit SHA with a `# Tracks: superfly/flyctl-actions/setup-flyctl@<tag>` comment alongside — same convention as the Trivy pin (#68) and govulncheck pin (#41). A tag-swap upstream cannot change what code holds the token between Renovate bumps. + +### Why `--remote-only`, not scan-image reuse + +The `image-scan` job (#68) builds the image locally as part of its scan. Reusing it at deploy time would require either: + +- Pushing to GHCR — regresses `image-scan`'s `contents: read` posture to `contents: read, packages: write`. +- An `actions/upload-artifact` tarball handoff + `docker load` in `deploy` — adds a new artifact channel and CI complexity. + +Both are deferred. Fly's remote builder rebuilds from `Dockerfile` on every deploy; layer-cache dedup makes the rebuild cost small in steady state. The privilege-minimisation win is permanent. + +## Rollback + +Two paths in `docs/deploy.md`, both operator-driven: + +1. **By image digest (preferred).** `flyctl releases list` → `flyctl deploy --image <digest> --remote-only`. Autocert cache persists across rollbacks (it's on the volume) — no Let's Encrypt re-issuance triggered. +2. **By release number.** `flyctl releases rollback` rolls back the most recent release. Use when the prior digest isn't to hand. + +A rollback does **not** revert the `main` commit. To prevent CI's next deploy from immediately re-rolling the broken release forward, revert the offending PR before the next merge — or temporarily disable the `deploy` job via a revert PR. 
+ +## What this feature deliberately does NOT do + +- **No Fly-managed TLS, no Fly HTTP proxy.** Both would break autocert and silently change the rate-limiter's view of the peer IP. +- **No platform-level enforcement of single-machine cap.** Fly has no ceiling knob; operator discipline + the `PYRYCODE_RELAY_SINGLE_INSTANCE` self-check (#65) is the load-bearing combination. +- **No deploy-approval gate.** `environment: production` with required reviewers is a one-line addition for later; not wired in v1. +- **No multi-region / multi-instance scaling.** Blocked on the in-process-registry constraint; separate prerequisite ticket. +- **No image-reuse from the scan job.** Always `--remote-only` rebuild; privilege-minimisation over rebuild-time. + +## Cross-links + +- [Ticket #38 codebase notes](../codebase/38.md) — what landed in this ticket. +- [Architect spec](../../specs/architecture/38-fly-deploy-manifest.md) — full design rationale + security review. +- [`docs/deploy.md`](../../deploy.md) — operator-facing bootstrap / steady-state / rollback. +- [`docs/architecture.md` § Hosting](../../architecture.md#hosting) — the decision record. +- [Feature: Docker image](docker-image.md) — the portable artifact this manifest wires. +- [Feature: Autocert TLS](autocert-tls.md) — the in-binary TLS termination this substrate is shaped around. +- [ADR-0002](../decisions/0002-autocert-explicit-failure-on-port-80.md) — the `:80` 404 fallback the TCP-passthrough services preserve. +- [Threat model](../../threat-model.md) — § *Deploy security* (substrate shifts from VPS to Fly account; threat-model update flagged as follow-up). 
diff --git a/docs/specs/architecture/38-fly-deploy-manifest.md b/docs/specs/architecture/38-fly-deploy-manifest.md new file mode 100644 index 0000000..b23a81e --- /dev/null +++ b/docs/specs/architecture/38-fly-deploy-manifest.md @@ -0,0 +1,387 @@ +# Spec: host-specific deploy manifest — Fly.io + CI auto-deploy (#38) + +## Files to read first + +- `Dockerfile:46` — `VOLUME ["/var/lib/relay/autocert"]`; the destination path the Fly volume mount must match. +- `Dockerfile:53` — `EXPOSE 80 443`; the two ports `[[services]]` blocks in `fly.toml` must publish. +- `Dockerfile:55` — `ENTRYPOINT ["/pyrycode-relay"]`; the relay is invoked directly, with no shell wrapper. There is no `sh` or `bash` in the runtime image — env-var substitution into argv is not available; flag values are passed as literal strings via Fly's `[processes]` block. +- `cmd/pyrycode-relay/main.go:23-43` — the required-flag check (`--domain` xor `--insecure-listen`). The manifest invokes the `--domain` arm; `--insecure-listen` is irrelevant on Fly (TLS terminates here). +- `cmd/pyrycode-relay/main.go:76-117` — autocert wiring: HTTP-01 challenge on `:80` (`httpSrv`), WSS on `:443` (`httpsSrv`). Both listeners are mandatory in `--domain` mode; the fly.toml must publish both with TCP passthrough. +- `cmd/pyrycode-relay/main.go:120-125` — `defaultCertCache()` returns `$HOME/.pyrycode-relay/certs`. Inside the container that's `/home/nonroot/.pyrycode-relay/certs` (a writeable scratch path under the nonroot user's home) — **not** the volume mount. The manifest must pass `--cert-cache /var/lib/relay/autocert` explicitly so autocert writes into the persistent volume. +- `.github/workflows/ci.yml` (entire file, 85 lines) — the workflow the deploy job is appended to. 
Note the existing `permissions: contents: read` at workflow level, the `actions/checkout@v6` / `actions/setup-go@v6` major-version pins (a Renovate-managed convention this repo uses for first-party actions), and the explicit-SHA pin on the Trivy action (the convention for third-party actions). +- `docs/specs/architecture/68-trivy-image-scan.md` § *Pinning the action* — the SHA-pin + `# Tracks:` comment convention `setup-flyctl` must follow. +- `docs/specs/architecture/32-dockerfile-base-hardening.md` § *On `.dockerignore`*, § *Stage 2: runtime* — confirms the image is distroless (no shell), so the manifest cannot rely on `sh -c` env substitution. +- `docs/architecture.md` § *Single-instance constraint (v1)* (lines 33-63) — the single-machine cap is **load-bearing for correctness**, not a cost optimisation. The `PYRYCODE_RELAY_SINGLE_INSTANCE` bypass is the documented escape hatch; the manifest is not allowed to set it. +- `docs/threat-model.md` § *Deploy security — VPS compromise* and § *Cert & key handling* — the operational threat-model entries this ticket lands defence-in-depth on (managed-host substrate replacing bare-VPS; persistent volume backing the autocert cache the `0700` check enforces on). +- `docs/knowledge/decisions/0002-autocert-explicit-failure-on-port-80.md` — the `:80` 404 fallback (rather than a 302 → `:443` redirect) means port 80 carries non-trivial application logic the manifest must not strip. Confirms the fly.toml services block must not insert Fly's HTTP handler in front of `:80`. +- `README.md` § *Build* and § *Run* — the docs to cross-link from the new `docs/deploy.md`. + +(Codegraph context was queried (`task = "Fly.io deploy manifest…"`) and confirmed no Go code is touched. Falling back to direct file reads above for the docs/CI surfaces codegraph does not parse.) + +## Context + +The relay has had a portable Docker image since #32, but no host wiring. 
Without a manifest, every release is a manual `flyctl deploy` from a developer's laptop — non-reviewable, non-repeatable, and gated on whoever holds the Fly API token locally. This ticket lands the thin wrapper that tells Fly how to run the existing image, plus the CI job that re-applies the manifest on every merge to `main`. + +Two prior decisions ([#38 operator comment](https://github.com/pyrycode/pyrycode-relay/issues/38#issuecomment-4433395643)) constrain the design: + +1. **Host: Fly.io.** Picked over Hetzner Cloud / Railway. The remaining design space is the fly.toml shape. +2. **TLS terminates in the relay.** Autocert (#9, shipped) stays the cert holder; Fly is a TCP-passthrough substrate. This rules out Fly's default HTTP proxy and Fly-managed certificates; it requires a dedicated IPv4 so Let's Encrypt's HTTP-01 challenge resolves deterministically to the running machine. + +The single-machine cap is not a cost optimisation — it is a correctness constraint. `internal/relay/registry.go` is in-process; two machines would hold two disjoint registries and silently drop traffic across the split. `docs/architecture.md` already documents this. The manifest must surface the constraint at the platform level (declarative `min_machines_running` / scale guardrails) so that a future operator running `flyctl scale count 2` is at least a deliberate act, not an autoscaler accident. + +## Design + +### Scope — files touched + +1. **`fly.toml`** (new, repo root) — the Fly Apps v2 manifest. +2. **`.github/workflows/ci.yml`** (edit) — one new `deploy` job appended after `image-scan`. +3. **`docs/deploy.md`** (new) — bootstrap, steady-state, and rollback procedures. +4. **`docs/architecture.md`** (edit) — single-line hosting + TLS-termination decision under a new § *Hosting* heading, between § *Single-instance constraint (v1)* and § *Threat model*. 
(AC #5 offered `docs/PROJECT-MEMORY.md` or `docs/architecture.md`; the architect "Never Update" rule excludes PROJECT-MEMORY, so architecture.md is the right home. AC #5 carved an explicit exception for this single edit; documenting the choice of file here avoids the developer having to re-litigate it.) + +No Go production code. No test code. No new types or interfaces. + +### `fly.toml` shape + +```toml +# Fly.io manifest for pyrycode-relay. +# +# Decisions encoded here (see docs/architecture.md § Hosting): +# - Fly is a TCP-passthrough substrate; TLS terminates in the relay +# binary via autocert (#9). NO Fly HTTP proxy or Fly-managed certs. +# - Port 80 carries ACME HTTP-01 challenge traffic and the explicit +# 404 fallback for non-challenge requests (ADR-0002). Both arrive at +# the relay verbatim — Fly must not insert any HTTP handler. +# - Single-machine hard cap. The relay's connection registry is +# in-process; multi-instance silently routes phones to the wrong +# replica. See docs/architecture.md § Single-instance constraint. + +app = "pyrycode-relay" +primary_region = "<region>" # operator-chosen at bootstrap; see docs/deploy.md. + +[build] + dockerfile = "Dockerfile" + +# --domain and --cert-cache passed as literal argv (distroless has no +# shell; env-var expansion into argv is not available). A domain change +# is a one-line edit to this file — config, not code — and rides the +# next deploy. +[processes] + app = "--domain <domain> --cert-cache /var/lib/relay/autocert" + +[[mounts]] + source = "relay_autocert" + destination = "/var/lib/relay/autocert" + # initial_size omitted; default is sufficient for autocert's + # account-key + per-domain cert (≤ a few KiB). Resize is operator-side. + +# Raw TCP services. NO `handlers = ["http"]` / `handlers = ["tls"]` — +# either would terminate at Fly's edge and break autocert. 
+[[services]] + protocol = "tcp" + internal_port = 80 + auto_stop_machines = "off" + auto_start_machines = false + min_machines_running = 1 + + [[services.ports]] + port = 80 + # handlers = [] — explicit no-op. Fly's default for an unset handlers + # list IS pass-through, but the explicit empty list is reviewable + # against a future Fly default change that might add an implicit "http". + +[[services]] + protocol = "tcp" + internal_port = 443 + auto_stop_machines = "off" + auto_start_machines = false + min_machines_running = 1 + + [[services.ports]] + port = 443 + # handlers = [] — see :80 block above. Crucially NOT ["tls"], which + # would steal cert handling from autocert and require a Fly-managed + # cert. + +[[vm]] + size = "shared-cpu-1x" + memory = "256mb" + +[deploy] + # Single machine — rolling/canary are inapplicable. immediate replaces + # in place; brief drop in availability during deploy is acceptable for + # this binary (clients reconnect; no state to drain). + strategy = "immediate" + # max_unavailable = 1 documents intent even though it's redundant with + # a 1-machine fleet. + max_unavailable = 1 +``` + +**Placeholders the developer fills at bootstrap time (NOT in this ticket's PR):** + +- `<region>` — e.g. `ams`, `arn`, `fra`. Operator picks at `flyctl apps create`. +- `<domain>` — e.g. `relay.pyrycode.dev`. The actual production domain. + +For the initial commit landing this ticket: ship `fly.toml` with `<region>` and `<domain>` set to operator-supplied real values, OR keep them as recognisable placeholders (`__REGION__` / `__DOMAIN__`) and have the developer set them via a follow-up commit before CI's first deploy run. Spec recommendation: ship with real values — a placeholder that escapes review is the failure mode (#9-style: a silently-misconfigured cert dir is worse than a loudly missing one). The first CI deploy then succeeds end-to-end. + +#### `max_machines = 1` — what AC #1 actually asks for + +The AC names `max_machines = 1` as a fly.toml key. 
As of the current Fly Apps v2 generation, **there is no `max_machines` key in fly.toml** — the hard cap is enforced via the combination of: + +- `[[services]]` `min_machines_running = 1` (don't drop below one), +- `auto_start_machines = false` (don't let Fly create new machines on demand), +- `auto_stop_machines = "off"` (don't stop the one machine), +- `[deploy] strategy = "immediate"` (don't create a second machine during deploy for blue/green), +- operator discipline at `flyctl scale count` (no platform-level "ceiling" knob exists). + +Spec choice: encode all four declarative knobs above (they are what the AC's *intent* requires), and add a comment block at the top of `fly.toml` calling out that `flyctl scale count > 1` would violate the single-instance constraint. The platform itself does not enforce the ceiling. The runtime self-check from #65 (`PYRYCODE_RELAY_SINGLE_INSTANCE` bypass) is the in-binary backstop; the manifest does **not** set the bypass. + +The developer should verify the exact stanza names against current Fly docs (`https://fly.io/docs/reference/configuration/`) at implementation time and `flyctl validate` the resulting file before commit. If Fly has introduced a `max_machines` key by then, set it. Don't invent a key that errors out at deploy. + +### CI deploy job + +Appended to `.github/workflows/ci.yml`, after `image-scan`. Sketch: + +```yaml + deploy: + # Runs only on push to main. PRs (including forks) never trigger this + # job, structurally preventing FLY_API_TOKEN exposure to untrusted code. + if: github.event_name == 'push' && github.ref == 'refs/heads/main' + needs: [test, security, image-scan] + runs-on: ubuntu-latest + permissions: + contents: read + # No environment: protection — gating is via branch (main is + # protected) and the fact that PRs from forks cannot reach this job. + # If a deploy-approval step is wanted later, an `environment:` block + # with required reviewers is the lever; deferred (out of scope). 
+ steps: + - uses: actions/checkout@v6 + # Tracks: superfly/flyctl-actions/setup-flyctl@v1.5 + # Pinned by commit SHA so a tag-swap upstream cannot change what + # holds FLY_API_TOKEN during the next deploy. Refresh the comment + # in lockstep with the SHA. Same convention as the Trivy pin in + # image-scan (#68) and govulncheck pin (#41). + - uses: superfly/flyctl-actions/setup-flyctl@ + - name: flyctl deploy + run: flyctl deploy --remote-only + env: + FLY_API_TOKEN: ${{ secrets.FLY_API_TOKEN }} +``` + +Three structural defences against the AC's "untrusted PR code" concern: + +1. **Branch gate (`if:`).** `pull_request` events never satisfy the condition. Fork PRs cannot mutate `main`, so they cannot reach this job by any path other than the maintainer merging — at which point the code is no longer untrusted. +2. **`needs:` chain.** `test` + `security` + `image-scan` must pass. A malicious PR that slips through review still has to pass the existing gates before deploy runs. +3. **`permissions: contents: read`.** Locks the job down to the workflow-level baseline. `FLY_API_TOKEN` is the only privileged input; it's accessed via `${{ secrets.… }}` (the standard pattern) and is not exposed to any earlier job. + +#### Why `--remote-only` + +The AC pins `flyctl deploy --remote-only`. Fly's remote builder rebuilds the image from `Dockerfile` against the pushed commit. The image-scan job's locally-built image is **not** reused — exporting it cross-job would require an artifact upload + `packages:write` on the scanner job (regressing #68's `contents: read` posture) or a push to GHCR (same regression). The rebuild cost is paid once per deploy (~minutes on Fly's builders, cached aggressively across deploys via layer dedup); the privilege-minimisation win is permanent. The issue body's "Deploy should reuse that same artifact" is aspirational; the AC's pinned command is the load-bearing constraint and is what the spec implements. 
+ +Reusing the scan-time image to deploy the bit-identical bytes (instead of rebuilding) is filed as a follow-up under § *Open questions* — it requires GHCR wiring that is well out of scope here. + +#### `setup-flyctl` pin + +`superfly/flyctl-actions/setup-flyctl` ships a managed `flyctl` binary; the SHA pin is on the **action**, not on `flyctl` itself. The action will install whichever `flyctl` version it bundles. If a specific `flyctl` version is required (it isn't today — `flyctl deploy --remote-only` is stable across recent releases), it's pinnable via a `with: version: …` input on the action. + +Developer step at implementation time: visit `https://github.com/superfly/flyctl-actions`, pick the latest tagged release (currently `v1.5` at time of writing), record its commit SHA via `git ls-remote https://github.com/superfly/flyctl-actions refs/tags/v1.5`, and paste into the pin with a matching `# Tracks:` comment. Renovate keeps this fresh thereafter, same as the Trivy pin. + +### `docs/deploy.md` shape + +~50 lines of prose split into three sections: + +```markdown +# Deploy + +The relay deploys to a single Fly.io machine. CI deploys on every merge to +`main`; manual deploys are needed only for the one-time bootstrap and for +rollbacks. + +## One-time bootstrap (per environment) + +Done once per Fly app — typically only the production app. + +1. `flyctl apps create pyrycode-relay` (matches `app =` in `fly.toml`). +2. `flyctl ips allocate-v4 --app pyrycode-relay` — a dedicated IPv4 is + **required**, not optional, for the autocert HTTP-01 challenge to + resolve deterministically to the running machine on port 80. Shared + IPv4 + TCP passthrough is not a supported combination on Fly. Billable; + call it out at provisioning review. +3. `flyctl volumes create relay_autocert --region <region> --size 1 + --app pyrycode-relay` (matches `source =` in `fly.toml [[mounts]]`). +4. DNS: point `<domain>` (A record) at the IPv4 from step 2. 
Let's + Encrypt resolves the domain via HTTP-01 on first deploy; without DNS, + the relay's first WSS request hangs ~minutes while autocert retries. +5. GitHub repo secret: `FLY_API_TOKEN` = `flyctl auth token` output. + Settings → Secrets and variables → Actions → New repository secret. + The token holds deploy access to the entire Fly org — scope it to a + `pyrycode-relay`-only deploy token if Fly's tokens UI offers that + today. + +## Steady-state flow + +1. Open a PR. CI runs `test`, `security`, `image-scan` on the PR HEAD. +2. Merge to `main`. CI re-runs the three jobs against `main`, then runs + `deploy` (which is gated on all three passing). +3. `deploy` invokes `flyctl deploy --remote-only`. Fly's remote builder + rebuilds the image from `Dockerfile` and rolls the single machine to + the new image in place. + +Logs: `flyctl logs --app pyrycode-relay`. Status: `flyctl status`. + +## Rollback + +Two paths, in increasing order of disruption. + +1. **By image digest (preferred).** `flyctl releases list` shows recent + release digests. `flyctl deploy --image <digest> --remote-only` + pins to the prior image without rebuilding. The autocert cache + persists across rollbacks (it's on the volume); no LE re-issuance is + triggered. +2. **By release number.** `flyctl releases rollback` rolls back the + *most recent* release. Available when a rollback is needed + immediately and the digest of the prior release isn't to hand. + +A rollback does NOT revert the `main` commit. To prevent CI's next +deploy from immediately re-rolling the broken release forward, +either revert the offending PR before the next merge to `main` or +disable the `deploy` job temporarily via a workflow_dispatch toggle. +``` + +The doc cross-links to `docs/architecture.md` § *Hosting* (the decision record) and to `docs/threat-model.md` § *Deploy security* (the operational threat surface the deploy substrate sits on). 
+ +### `docs/architecture.md` edit + +Insert a new heading `## Hosting` between the existing § *Single-instance constraint (v1)* and § *Threat model*. Two-paragraph body: + +```markdown +## Hosting + +Production deploys to a single **Fly.io** machine in one region. TLS +terminates in the relay binary (autocert, #9) — Fly runs the substrate +in raw-TCP passthrough mode on `:80` and `:443`, with a dedicated IPv4 +so Let's Encrypt's HTTP-01 challenge resolves deterministically. The +autocert cache lives on a Fly volume at `/var/lib/relay/autocert`. + +The single-machine cap is platform-enforced via `min_machines_running += 1` and `auto_start_machines = false` in `fly.toml`, and binary-enforced +via the `PYRYCODE_RELAY_SINGLE_INSTANCE` self-check (#65). Multi-instance +scaling is out of scope for v1 — see § *Single-instance constraint* +above. + +Bootstrap and rollback procedures: [`docs/deploy.md`](deploy.md). The +manifest itself: [`fly.toml`](../fly.toml). CI deploy job: +[`.github/workflows/ci.yml`](../.github/workflows/ci.yml). +``` + +This satisfies AC #5 ("a single line lands in PROJECT-MEMORY or equivalent recording the hosting + TLS-termination decision") — the AC permits architecture.md as the equivalent location. The "single line" target is the spirit; the prose above is three short paragraphs because the decision has three load-bearing parts (Fly + relay-TLS + single-machine-enforcement) that all want naming. A literal one-line note ("Host: Fly.io; TLS terminates in relay") would be technically AC-compliant but would force the next cold reader to chase three other docs to reconstruct the call. The AC's intent — *"the next agent reading the repo cold sees the call without scrolling this issue"* — is what this satisfies. + +## Concurrency model + +Not applicable at the manifest level. The relay's existing concurrency model (per-conn goroutines, `errgroup` fan-out in handlers) is unchanged; the manifest does not introduce new processes or coordination points. 
+ +The CI deploy job is a single job whose steps run serially; no parallel deploy paths exist. + +## Error handling + +**Manifest-level failure modes the design accepts:** + +- **Deploy fails (Fly builder error, network glitch, FLY_API_TOKEN expired).** The job's step fails loud; the workflow run is red; the operator sees it in the Actions tab and on the GitHub commit. No partial state on the running machine — `flyctl deploy` only swaps the image atomically once the build succeeds. +- **`flyctl validate` would reject the manifest.** Caught at the latest by the `deploy` job's first run after merge — but the AC requires the developer to run `flyctl validate fly.toml` locally before the PR lands, so it should never get that far. Add to the developer's verification checklist (§ Testing strategy below). +- **Autocert cannot issue cert on first deploy (DNS not pointed, IPv4 not allocated, port 80 unreachable).** The relay logs an autocert error and the first WSS request hangs ~minutes. Caught at bootstrap-time, not at every deploy; the deploy.md procedure orders the steps so DNS + IPv4 are in place before the first deploy runs. +- **Volume not yet created (first deploy bootstrap order skipped).** Fly refuses to start the machine without the named volume. Loud failure, operator intervenes. + +**Failure modes the design does NOT introduce:** + +- The relay binary's own startup failure modes (`ErrCacheDirInsecure`, autocert mismatch) carry through unchanged. The Fly machine surfaces them as a non-zero exit; Fly retries the machine; eventually the deploy is marked failed and the prior release stays live. + +## Testing strategy + +No automated tests — the artifact is a TOML manifest + a YAML workflow file + prose docs. Verification is via: + +1. **`flyctl validate fly.toml`** (run locally by the developer before commit). Catches typos, unknown keys, syntactically-invalid TOML. Fly's CLI ships this; no extra dependency. +2. 
**CI workflow lint.** `actionlint` (not currently wired into this repo, but trivially runnable: `actionlint .github/workflows/ci.yml`) catches YAML schema errors and pinning convention violations. Optional; the AC does not require it. The developer can run it locally if uncertain about the YAML edit. +3. **First real deploy.** The bootstrap procedure in `docs/deploy.md` IS the end-to-end test. After the PR merges and CI runs `deploy` against `main` for the first time, the developer checks: + - `flyctl status` shows one machine, running, healthy. + - `curl -i https://<domain>/healthz` returns `200 ok`. + - `curl -i http://<domain>/anything-not-an-acme-challenge` returns `404` (ADR-0002). + - `flyctl ssh console` (if available; distroless has no shell, so this will fail — that's expected and a structural defence-in-depth, not a regression). + - `flyctl volumes list` shows `relay_autocert` mounted on the machine. + +These are AC verification steps for the developer's PR — not test code to land in the repo. There is no test harness for fly.toml or workflow files; the manifest's first deploy is its smoke test. + +## Open questions + +- **Reuse the image-scan-built image at deploy time.** Currently `--remote-only` rebuilds. Reusing the scanned image requires (a) `image-scan` pushing to GHCR (regresses its `contents: read` → `contents: read, packages: write` permission posture), or (b) `actions/upload-artifact` of a `docker save` tarball + `docker load` + `flyctl deploy --local-only --image …` in `deploy`. Both are deferred. The Fly remote builder's layer cache makes the rebuild cost small in steady state. +- **Region choice (`primary_region`).** Operator decision at bootstrap time. The relay has no opinion — single-instance, no latency-sensitive routing. The developer picks one when filling the `<region>` placeholder. +- **Domain in fly.toml vs Fly secret.** Spec ships the domain as a literal in `fly.toml`'s `[processes]` block. 
Treating the domain as a secret was considered and rejected — the domain is not secret (it's published in DNS), and a literal config value is reviewable in the diff. Changing the domain is a one-line PR; the AC's intent ("a domain change does not require a code edit") is satisfied because TOML is config, not Go source. +- **Deploy approval gate (`environment:` with required reviewers).** Would add a human-in-the-loop click between merge and deploy. Out of scope; can be added later as a one-line `environment: production` addition with reviewer config in GitHub Settings. Not gating this ticket on it. +- **Token scope minimisation.** `flyctl auth token` issues an org-wide deploy token. Fly's recent (2026-Q1) deploy-token UI may now support per-app tokens; the developer should check at bootstrap time and use the narrower token if available. Not a manifest-level concern. + +## Security review + +**Verdict:** PASS + +### Trust boundaries + +- **Substrate ↔ binary boundary is unchanged.** Fly is a TCP passthrough on `:80` and `:443`; bytes arrive at the relay binary verbatim, gated by the existing chokepoints (`internal/relay/tls.go` for TLS handshake, `relay.EnforceHost` for SNI/Host mismatch, the WS adapter for frame size). The manifest does not add a new untrusted-data ingress. +- **CI ↔ deploy substrate boundary** is new and narrow: `FLY_API_TOKEN` flows from GitHub Actions secrets to a single step's `env:`, never to `with:` inputs that get logged, never to a file written on the runner. The step is in a job whose `permissions:` is `contents: read` — no `issues: write`, no `packages: write`, no `id-token: write`. A compromised step cannot escalate within GitHub; it can only deploy a bad image to Fly (an action that requires merging code to `main`, which has its own protections). + +### Tokens, secrets, credentials + +- **`FLY_API_TOKEN`** — held as a GitHub repo secret. Never logged. Never echoed (the step's `run:` block does not `echo` it). 
The `flyctl deploy` step receives it via `env:`, the standard secure pattern. Rotation: operator-driven (`flyctl auth token` → update repo secret); the relay does not need to know the token rotated. +- **Autocert account key + per-domain cert** — sit on the Fly volume at `/var/lib/relay/autocert`. The relay's existing `0700` permission check (`internal/relay/tls.go:16-55`) gates startup on the directory permissions; this remains the load-bearing control. Fly volumes are per-machine; a compromised Fly org would expose the volume. Threat-model match: same as `docs/threat-model.md` § *Cert & key handling*, residual-risk paragraph (same-UID compromise reads the cache). +- **No new secrets introduced by this ticket.** The domain is config, not secret. The region is config, not secret. + +### File operations + +- The manifest declares a volume mount; it does not write files. The relay's autocert writes inside the volume; that's covered by the existing tls.go permission check (separate code path, not introduced here). +- No path traversal surface — the volume destination is a constant string in `fly.toml`, not user-controlled. + +### Subprocess / external command execution + +- The CI deploy step runs `flyctl deploy --remote-only` — a single command with a single flag, no user-controlled interpolation. No `sh -c`, no shell-form `run:` block ambiguity. +- The `setup-flyctl` action is pinned by commit SHA (the convention from #41 and #68) — a tag-swap upstream cannot change what code holds the token. The `# Tracks:` comment makes a malicious SHA swap reviewable. + +### Cryptographic primitives + +- Not applicable — manifest contains no crypto. The relay's TLS posture is unchanged (`MinVersion: tls.VersionTLS12`, Go defaults for cipher suites). The substrate change (VPS → Fly) does not move TLS termination. + +### Network & I/O + +- **Ports 80 and 443 published.** Both required (autocert HTTP-01 on `:80`, WSS on `:443`).
The manifest's `[[services]]` blocks use `protocol = "tcp"` with no Fly-managed handlers, so Fly inserts no MITM-shaped middleware. ADR-0002's explicit 404 on non-challenge `:80` traffic reaches the public unchanged. +- **Slow-loris / timeout discipline** is unchanged — handled in `cmd/pyrycode-relay/main.go` (`ReadHeaderTimeout`, `ReadTimeout`, `WriteTimeout`, `IdleTimeout` on both HTTP and HTTPS servers, `:53-95` and `:82-102`). +- **Rate limiting** (#34, shipped) reads the socket peer IP directly. TCP passthrough preserves the real peer IP — the rate limiter sees the actual client address, not a Fly proxy. Validated by the AC's "TLS terminates in the relay" decision; the alternative path (Fly-terminated TLS) would have required the rate limiter to trust `Fly-Client-IP`, which is the security-sensitive follow-up that was explicitly deferred ([§ Out of scope](https://github.com/pyrycode/pyrycode-relay/issues/38)). +- **No new ingress surface.** The manifest publishes only the two ports the binary already binds. + +### Error messages, logs, telemetry + +- The deploy step's output (`flyctl deploy` log lines) appears in the GitHub Actions run log. `flyctl` does not echo `FLY_API_TOKEN` in its logs. GitHub Actions automatically masks values that match registered secrets even if a step did echo them — defence in depth. +- The relay binary's logging is unchanged. The Fly substrate does not introduce new log producers; `flyctl logs` is just a passthrough to the binary's stderr. + +### Concurrency + +- Single CI job, sequential steps. No new goroutines in the relay; the manifest doesn't touch Go code. +- The deploy strategy `immediate` replaces the single machine in place — there is no overlap window where two relay instances run simultaneously. The single-instance constraint is preserved across deploys (not just in steady state). 
+ +### Threat model alignment + +- **`docs/threat-model.md` § *Deploy security — VPS compromise*.** The "operator-owned VPS" assumption is now "operator-owned Fly account." Threat surface shifts from "Linux VPS hardening" (SSH keys, fail2ban, auto-updates) to "Fly account hardening" (org-wide MFA, deploy-token scoping, no shared accounts). Worth a threat-model update; that's a follow-up doc PR, not blocking on this ticket. Flag added to § *Open questions* of `docs/threat-model.md`-equivalent if the developer prefers — but the AC does not require a threat-model edit, so this spec does not mandate one. +- **`docs/threat-model.md` § *Supply chain — Go dependencies*.** Unchanged: `go.mod` is identical, the same `go build` runs on Fly's remote builder. +- **`docs/threat-model.md` § *Cert & key handling*.** Improved: the autocert cache now lives on a persistent Fly volume, surviving machine recycles and reducing the LE-re-issuance churn that the `0700`-check guarantees we don't silently degrade through. +- **Protocol-spec security model** is unchanged — no wire-protocol surface is touched. + +### Findings + +- [Trust boundaries] No findings — the manifest preserves the single explicit boundary at the binary's TLS terminator and the existing header-gate. +- [Tokens] No findings — `FLY_API_TOKEN` is held only in GitHub secrets, scoped to a job with `contents: read`, never echoed. +- [File operations] No findings — manifest declares, does not write. +- [Subprocess] No findings — `flyctl deploy --remote-only` is a fixed command; action pinned by SHA. +- [Crypto] N/A — no crypto introduced. +- [Network & I/O] No findings — TCP passthrough preserves the real peer IP for #34 rate limiting; no Fly-inserted handler on either port. +- [Errors / logs] No findings — `FLY_API_TOKEN` is masked by GitHub Actions; relay logging unchanged. +- [Concurrency] No findings — single CI job, `immediate` deploy strategy preserves single-instance invariant. 
+- [Threat model alignment] SHOULD FIX (out of scope for this ticket) — `docs/threat-model.md` § *Deploy security* should be updated to reflect "operator-owned Fly account" alongside (or instead of) "operator-owned VPS." Flag as a follow-up; not gating. + +**Reviewer:** architect (self-review per `architect/security-review.md`) +**Date:** 2026-05-12 diff --git a/fly.toml b/fly.toml new file mode 100644 index 0000000..80d400f --- /dev/null +++ b/fly.toml @@ -0,0 +1,73 @@ +# Fly.io manifest for pyrycode-relay. +# +# Decisions encoded here (see docs/architecture.md § Hosting): +# - Fly is a TCP-passthrough substrate; TLS terminates in the relay +# binary via autocert (#9). NO Fly HTTP proxy or Fly-managed certs. +# - Port 80 carries ACME HTTP-01 challenge traffic and the explicit +# 404 fallback for non-challenge requests (ADR-0002). Both arrive at +# the relay verbatim — Fly must not insert any HTTP handler. +# - Single-machine hard cap. The relay's connection registry is +# in-process; multi-instance silently routes phones to the wrong +# replica. See docs/architecture.md § Single-instance constraint. +# `flyctl scale count > 1` violates this invariant — the platform +# itself does not enforce a ceiling, so operator discipline plus +# the in-binary self-check (#65) is the load-bearing control. + +app = "pyrycode-relay" +primary_region = "__REGION__" # operator fills at bootstrap; see docs/deploy.md. + +[build] + dockerfile = "Dockerfile" + +# --domain and --cert-cache passed as literal argv (distroless has no +# shell; env-var expansion into argv is not available). A domain change +# is a one-line edit to this file — config, not code — and rides the +# next deploy. +[processes] + app = "--domain __DOMAIN__ --cert-cache /var/lib/relay/autocert" + +[[mounts]] + source = "relay_autocert" + destination = "/var/lib/relay/autocert" + # initial_size omitted; default is sufficient for autocert's + # account-key + per-domain cert (≤ a few KiB). Resize is operator-side. 
+ +# Raw TCP services. NO `handlers = ["http"]` / `handlers = ["tls"]` — +# either would terminate at Fly's edge and break autocert. +[[services]] + protocol = "tcp" + internal_port = 80 + auto_stop_machines = "off" + auto_start_machines = false + min_machines_running = 1 + + [[services.ports]] + port = 80 + # handlers omitted on purpose: an unset list is pass-through. An + # explicit `handlers = ["http"]` here would steal port 80 from the + # autocert HTTP-01 listener and ADR-0002's 404 fallback. + +[[services]] + protocol = "tcp" + internal_port = 443 + auto_stop_machines = "off" + auto_start_machines = false + min_machines_running = 1 + + [[services.ports]] + port = 443 + # handlers omitted on purpose: any value here (e.g. ["tls"]) would + # terminate TLS at Fly's edge and require a Fly-managed cert, + # bypassing autocert. + +[[vm]] + size = "shared-cpu-1x" + memory = "256mb" + +[deploy] + # Single machine — rolling/canary are inapplicable. `immediate` + # replaces in place; a brief drop in availability during deploy is + # acceptable (clients reconnect; no state to drain). + strategy = "immediate" + # Redundant with a 1-machine fleet, but documents intent. + max_unavailable = 1