diff --git a/cmd/pyrycode-relay/main.go b/cmd/pyrycode-relay/main.go index e6164b9..ce25d35 100644 --- a/cmd/pyrycode-relay/main.go +++ b/cmd/pyrycode-relay/main.go @@ -48,6 +48,7 @@ func main() { mux := http.NewServeMux() mux.Handle("/healthz", relay.NewHealthzHandler(reg, Version, startedAt)) mux.Handle("/v1/server", relay.ServerHandler(reg, logger, 30*time.Second)) + mux.Handle("/v1/client", relay.ClientHandler(reg, logger)) if *insecureListen != "" { logger.Info("starting", "version", Version, "mode", "insecure", "listen", *insecureListen) diff --git a/docs/PROJECT-MEMORY.md b/docs/PROJECT-MEMORY.md index 19a9823..2285f0a 100644 --- a/docs/PROJECT-MEMORY.md +++ b/docs/PROJECT-MEMORY.md @@ -15,8 +15,8 @@ Stateless WebSocket router between mobile clients and pyry binaries. Internet-ex | `WSConn` adapter (`nhooyr.io/websocket.Conn` → registry `Conn`; per-conn write mutex; `Close`-cancelled context; 10s `Send` deadline) | Done (#15) | `internal/relay/ws_conn.go` | | `/healthz` JSON endpoint (`status`, `version`, `connected_binaries`, `connected_phones`, `uptime_seconds`; `Cache-Control: no-store`; unauthenticated) | Done (#10) | `internal/relay/healthz.go`, `cmd/pyrycode-relay/main.go` | | WS upgrade on `/v1/server` (header gate pre-upgrade; `ClaimServer`; `4409` on conflict; `CloseRead`-held until #6; disconnect defers a 30s grace release via `ScheduleReleaseServer`) | Done (#16, #21) | `internal/relay/server_endpoint.go`, `cmd/pyrycode-relay/main.go` | -| WS upgrade on `/v1/client` | Not started | — | -| Header validation (`x-pyrycode-server`, `x-pyrycode-version`, `user-agent` on `/v1/server`; `x-pyrycode-token` on `/v1/client`) | `/v1/server` done (#16); `/v1/client` not started | `internal/relay/server_endpoint.go` | +| WS upgrade on `/v1/client` (header gate pre-upgrade; `RegisterPhone`; `4404` if no binary; `CloseRead`-held until #6; disconnect calls `UnregisterPhone`; token never parsed or logged) | Done (#5) | `internal/relay/client_endpoint.go`, `cmd/pyrycode-relay/main.go` | +| Header validation (`x-pyrycode-server`, `x-pyrycode-version`, `user-agent` on `/v1/server`; `x-pyrycode-server`, `x-pyrycode-token`, `user-agent` on `/v1/client`; optional `x-pyrycode-device-name` on `/v1/client`) | Done (#16, #5) | `internal/relay/server_endpoint.go`, `internal/relay/client_endpoint.go` | | Frame forwarding using the routing envelope | Not started | — | | `conn_id` generation scheme | Not started | — | | Threat model doc — operational surface (deploy, supply chain, DoS, log hygiene, cert handling, TLS, error leakage) | Done (#11) | `docs/threat-model.md` | @@ -33,12 +33,13 @@ Stateless WebSocket router between mobile clients and pyry binaries. Internet-ex - **Interface methods called under the lock are documented as non-blocking getters.** The registry's `Conn.ConnID()` is invoked under the write lock during `UnregisterPhone`; `Send` and `Close` are never called while the lock is held. Pattern: state the contract on the interface, never call something that could block on I/O while a mutex is held. - **Adapters bridge interface↔library API mismatches by owning policy locally.** When a library method needs a `context.Context` but the registry's `Conn` interface doesn't take one (and shouldn't — most callers don't have a context to thread), the adapter owns its own context: derived in the constructor, cancelled by `Close`, narrowed per-call with `WithTimeout` for deadline policy. Adopted in `WSConn` (#15); avoids forcing context-plumbing changes into upstream interfaces. - **Handler factories return `http.Handler`, not `http.HandlerFunc`.** `NewHealthzHandler(reg, version, startedAt) http.Handler` keeps construction factory-shaped so a future ticket adding per-handler state (logger, injectable clock) can do so without touching the call site in `main`. Adopted in `/healthz` (#10). -- **Validate adversarial input *before* allocating WS state.** Header-gate checks (presence + non-empty) run before `websocket.Accept`; a malformed request never receives an upgrade response and the registry is untouched on every error path. Adopted in `/v1/server` (#16); the same shape applies to `/v1/client` token validation (#5). -- **Application WS close codes are emitted on the underlying `*websocket.Conn`, not via `WSConn`.** `WSConn.Close()` always emits `StatusNormalClosure` — no policy parameters. Handlers that need `4409` / `4401` / `4404` call `c.Close(websocket.StatusCode(code), reason)` directly in the stillborn-WSConn window (after construction, before any `Send`). See [ADR-0005](knowledge/decisions/0005-application-close-codes-via-underlying-conn.md). Adopted in `/v1/server` (#16); applies to `/v1/client` (#5). -- **`defer { ScheduleRelease; Close; log }` is registered AFTER the successful claim**, never before. A conflict path (or any pre-claim error) returns before the defer is in place, so we never schedule a release on a slot we don't hold. Originally `ReleaseServer` (#16); swapped to `ScheduleReleaseServer(serverID, grace)` in #21 so a quick reconnect lands back in the same slot. The structural rule sharpened in #21 — pre-grace this was a benign no-op when violated; now violating it would arm a stray timer for an id the handler never owned. Same ordering applies to `/v1/client`'s phone registration. +- **Validate adversarial input *before* allocating WS state.** Header-gate checks (presence + non-empty) run before `websocket.Accept`; a malformed request never receives an upgrade response and the registry is untouched on every error path. Adopted in `/v1/server` (#16) and `/v1/client` (#5). +- **Application WS close codes are emitted on the underlying `*websocket.Conn`, not via `WSConn`.** `WSConn.Close()` always emits `StatusNormalClosure` — no policy parameters. Handlers that need `4409` / `4401` / `4404` call `c.Close(websocket.StatusCode(code), reason)` directly in the stillborn-WSConn window (after construction, before any `Send`). See [ADR-0005](knowledge/decisions/0005-application-close-codes-via-underlying-conn.md). Adopted in `/v1/server` `4409` (#16) and `/v1/client` `4404` (#5). +- **`defer { ScheduleRelease; Close; log }` is registered AFTER the successful claim**, never before. A conflict path (or any pre-claim error) returns before the defer is in place, so we never schedule a release on a slot we don't hold. Originally `ReleaseServer` (#16); swapped to `ScheduleReleaseServer(serverID, grace)` in #21 so a quick reconnect lands back in the same slot. The structural rule sharpened in #21 — pre-grace this was a benign no-op when violated; now violating it would arm a stray timer for an id the handler never owned. Adopted on `/v1/client`'s phone register/unregister in #5: defer is registered after `RegisterPhone` returns nil, so the no-server (`4404`) path never tries to unregister a slot we never claimed. - **Policy values live at the wiring site, not as package-level constants.** `30*time.Second` for the grace window is a literal in `cmd/pyrycode-relay/main.go`, threaded into `ServerHandler` as a constructor parameter — it appears exactly once, the value is policy (matches protocol spec), and inlining keeps the protocol-spec linkage visible where the relay is composed. Tests pass ms-scale durations through the same parameter. Promote to a package-level constant only when a second wiring entry point needs the same value. Adopted in `/v1/server` grace duration (#21). - **Pointer-identity for stale `time.AfterFunc` fires.** `time.Timer.Stop()` returns false if the timer's func has already started executing. Wrap each pending timer in a small struct and store the wrapper pointer in a map; the `AfterFunc` closure captures the wrapper pointer and asserts `map[key] == self` under the lock before acting. If a faster goroutine replaced the entry, the pointer no longer matches and the closure no-ops. Capturing the `*time.Timer` directly forces a self-referential local var (assigned after `AfterFunc` returns) which trips the race detector under stress; the wrapper avoids that. Adopted in `Registry.ScheduleReleaseServer` (#20). Same shape applies to any "one cancellable timer per key" pattern. -- **Hold long-lived WS handlers open with `c.CloseRead(r.Context())` plus `<-readCtx.Done()` until the real read loop lands.** `CloseRead` drains-and-discards frames (including control frames — pings must be processed for the connection to observe a peer-side close). The frame loop ticket replaces `<-readCtx.Done()` with the actual read body in the same call site. Adopted in `/v1/server` (#16) pending #6. +- **Credentials the relay does not validate are presence-checked then discarded — never logged, never put in error strings.** `/v1/client`'s `X-Pyrycode-Token` is opaque to the relay (the binary owns verification). The handler reads the token into a local string, branches on `!= ""`, and lets the local go out of scope unread. The log-event field set is enumerated explicitly in the doc; the token name does not appear in any `slog` call, `fmt.Errorf`, or response body. Defence is layered: spec, code review, and the structural absence of any code path that uses the value after the gate. Same posture extends to any future "courier credential" the relay carries but does not own (e.g. session resume tokens). Adopted in `/v1/client` (#5). +- **Hold long-lived WS handlers open with `c.CloseRead(r.Context())` plus `<-readCtx.Done()` until the real read loop lands.** `CloseRead` drains-and-discards frames (including control frames — pings must be processed for the connection to observe a peer-side close). The frame loop ticket replaces `<-readCtx.Done()` with the actual read body in the same call site. Adopted in `/v1/server` (#16) and `/v1/client` (#5) pending #6. - **Capture process-state timestamps in `main` after `flag.Parse()`, not as package-level vars.** `startedAt := time.Now()` lives inside `main` and is passed into the handler factory. A package-level `var startedAt = time.Now()` would fire at import time — before flag parsing, before `--version` early-returns — and be wrong for short-lived test binaries and any future deferred-serve setup. Adopted in #10. ## Conventions diff --git a/docs/knowledge/INDEX.md b/docs/knowledge/INDEX.md index 60ca3e5..c563512 100644 --- a/docs/knowledge/INDEX.md +++ b/docs/knowledge/INDEX.md @@ -4,6 +4,7 @@ One-line pointers into the evergreen knowledge base. Newest entries at the top o ## Features +- [`/v1/client` WS upgrade](features/client-endpoint.md) — phone-side ingress: validates `X-Pyrycode-Server` / `X-Pyrycode-Token` / `User-Agent` pre-upgrade (token presence-only, never parsed/logged); registers phone on the binary's slot, emits `4404` if no binary holds the id, holds the conn via `CloseRead` until #6's frame loop replaces it (#5). - [`/v1/server` WS upgrade](features/server-endpoint.md) — binary-side ingress: validates `X-Pyrycode-Server` / `X-Pyrycode-Version` / `User-Agent` pre-upgrade, claims the slot, emits `4409` on conflict, holds the conn via `CloseRead` until #6's frame loop replaces it; on disconnect schedules a 30s grace release so a quick reconnect inherits the slot (#21). - [`/healthz` JSON endpoint](features/healthz.md) — unauthenticated `GET /healthz` returning `{status, version, connected_binaries, connected_phones, uptime_seconds}`; `Cache-Control: no-store`, body bounded ≈135 bytes. - [WSConn adapter](features/ws-conn-adapter.md) — wraps `nhooyr.io/websocket.Conn` to satisfy the registry's `Conn`; owns the per-conn write mutex and a `Close`-cancelled context with a 10s per-`Send` deadline. diff --git a/docs/knowledge/features/client-endpoint.md b/docs/knowledge/features/client-endpoint.md new file mode 100644 index 0000000..a0666f1 --- /dev/null +++ b/docs/knowledge/features/client-endpoint.md @@ -0,0 +1,187 @@ +# `/v1/client` — phone-side WebSocket upgrade + +`/v1/client` is the relay's ingress for mobile phones. A phone opens an outbound WSS to the relay, sends three required headers (one of which is the device token, opaque to the relay), and — if a binary currently holds the requested `serverID` — gets registered on that binary's phone slice in the connection registry. If no binary holds the slot, the WS is closed with application code `4404`. + +This is the public, internet-exposed endpoint. The peer is *less* trusted than `/v1/server`'s peer (which at least runs operator-issued software); anyone on the internet who learns the relay hostname can connect. Header validation runs **before** `websocket.Accept`; the token is presence-checked only and never logged. + +This is the phone side only. Frame forwarding is #6; heartbeat is #7. The handler currently holds the connection open by draining-and-discarding frames — #6 swaps in the real read loop in the same call site. + +## Wire shape + +```http +GET /v1/client HTTP/1.1 +Host: relay.example.com +Upgrade: websocket +Connection: Upgrade +Sec-WebSocket-Version: 13 +Sec-WebSocket-Key: +X-Pyrycode-Server: +X-Pyrycode-Token: +User-Agent: +X-Pyrycode-Device-Name: +``` + +| Header (canonical) | Required | Source | Enforcement | +|---|---|---|---| +| `X-Pyrycode-Server` | Yes | phone's target server-id | non-empty | +| `X-Pyrycode-Token` | Yes | phone-side device token | **non-empty (presence only — never parsed by the relay)** | +| `User-Agent` | Yes | HTTP standard | non-empty | +| `X-Pyrycode-Device-Name` | No | user-supplied label | logged verbatim if present; absent → empty string | + +`r.Header.Get` canonicalises case. Missing or empty among the required set → `400 Bad Request`, empty body, no upgrade, registry untouched, no log line. Wrong method or missing `Upgrade` headers fall through to `websocket.Accept`, which writes the library's standard 4xx. + +The relay does not validate the token — the binary does. The token is read into a local string for the presence-check, then goes out of scope unread. It is **never** logged, never compared, never propagated by this handler. Token verification is the binary's responsibility (per protocol spec § Phone → relay → binary). + +## Close codes + +| Code | Meaning | +|---|---| +| `1000` (`StatusNormalClosure`) | clean close on shutdown / unregister / binary-grace expiry. | +| `4404` | no binary currently holds the requested `serverID`. Reason: `"no server with that id"`. | + +`4404` is application-defined per RFC 6455 (4000–4999 range); mirrors the `4409` pattern on `/v1/server`. The close-reason is the protocol-spec literal — it deliberately does not echo the requested server-id back (that would invite probing). + +## API + +Package `internal/relay` (`client_endpoint.go`): + +```go +func ClientHandler(reg *Registry, logger *slog.Logger) http.Handler +``` + +One exported symbol. No new types, no new sentinel errors, no new package-level constants. No `grace` parameter — phone disconnect is immediate (`UnregisterPhone`); the only grace concept on this endpoint lives entirely inside the registry's `handleGraceExpiry`, which closes orphan phones when a binary's grace window expires (see § Concurrency below). + +Wired in `cmd/pyrycode-relay/main.go` next to `/v1/server`: + +```go +mux.Handle("/v1/client", relay.ClientHandler(reg, logger)) +``` + +## Algorithm + +1. Validate the three required headers. Any missing/empty → `400`, return. +2. Read `X-Pyrycode-Device-Name` (optional; `""` if absent; informational only). +3. `websocket.Accept` with `OriginPatterns: []string{"*"}`. On error, the library has already written a 4xx; return silently. +4. Construct `connID = "client-" + serverID + "-" + randHex8()` (8 hex chars from `crypto/rand`). +5. Wrap with `NewWSConn(c, connID)`. +6. `reg.RegisterPhone(serverID, wsconn)`: + - On `ErrNoServer` → close the underlying `*websocket.Conn` with code `4404` and reason `"no server with that id"`, log `phone_register_no_server`, return. + - On any other error → `wsconn.Close()`, return (defensive; not currently reachable). +7. Log `phone_registered`. +8. `defer { reg.UnregisterPhone(serverID, connID); wsconn.Close(); log phone_unregistered }`. Registered **after** the successful `RegisterPhone` so a no-server path never tries to unregister a slot we never owned. +9. `readCtx := c.CloseRead(r.Context()); <-readCtx.Done()` — block until the peer closes (or until the registry tears the conn down on binary-grace expiry). + +`randHex8` and `remoteHost` are reused verbatim from `server_endpoint.go` — same package, no duplication. + +## Logging + +Three structured event types. Field set is fixed; nothing else (token, full headers, payloads) is logged. + +| event | fields | +|---|---| +| `phone_registered` | `server_id`, `conn_id`, `device_name`, `remote` | +| `phone_register_no_server` | `server_id`, `remote` | +| `phone_unregistered` | `server_id`, `conn_id` | + +Explicitly **not logged on any path:** + +- `X-Pyrycode-Token` — the relay never parses, compares, or propagates it; logging it would be a credential leak. The token is read into a local once, presence-checked, and discarded. No `fmt.Errorf`, no debug `Printf` references it — code review enforces this in addition to the handler structurally never reaching for it after the gate. +- `User-Agent` — validated for presence, then discarded (no operational use). +- Any full request-header dump. + +`device_name` is the user-supplied label (e.g. `"Juhana's iPhone"`); the threat model's MAY-be-logged list names it explicitly. Empty string on absence; logged literally — slog's text/JSON handlers escape control characters, so log-line forging via crafted device names is structurally blocked. `remote` is the IP host portion of `r.RemoteAddr`, no port. + +`conn_id` is logged on register and unregister so operators can correlate a session across both events. `phone_register_no_server` deliberately omits `conn_id` because no `WSConn` was constructed before the close on that path. + +Header gate failures (`400`) are not logged — same hygiene rationale as `/v1/server`: avoid amplifying header-floods into log volume. + +## Concurrency + +| Step | Goroutine | Lock | Lifecycle | +|---|---|---|---| +| Header validation | request goroutine | none | pre-upgrade; no resources held | +| `websocket.Accept` | request goroutine | none | conn allocated on success | +| `RegisterPhone` | request goroutine | registry write lock (held internally) | one-shot | +| `CloseRead` | spawns one read-discard goroutine | none | terminates on conn close / `r.Context()` cancel | +| `<-readCtx.Done()` | request goroutine, blocking | none | unblocks on (a) peer close, (b) registry-driven `Close` from binary-grace expiry, (c) server shutdown | +| `defer` (unregister/close/log) | request goroutine | `wsconn.closeOnce` | runs only on the success path; idempotent | + +The handler takes no lock of its own. `WSConn.Close`'s `closeOnce` is a `sync.Once`, not a held lock. `UnregisterPhone` is a no-op on unknown `(serverID, connID)`. **No lock-order risk; no goroutine-leak path.** + +### Phone close on binary-grace expiry + +When a binary disconnects, `/v1/server`'s defer arms `ScheduleReleaseServer(serverID, 30s)`. If no reconnect arrives, the registry's `handleGraceExpiry`: + +1. removes the binary entry, +2. snapshots the phones slice, +3. deletes the phones entry from the map, +4. calls `Close()` on every snapshotted phone. + +The phone handler's `<-readCtx.Done()` then unblocks (the underlying conn's context was cancelled by `WSConn.Close`), the defer runs, and: + +- `reg.UnregisterPhone(serverID, connID)` no-ops (entry already deleted in step 3). +- `wsconn.Close()` no-ops (`sync.Once` already fired in step 4). +- `phone_unregistered` log line fires. + +The phone observes the close on its socket as `StatusNormalClosure` — by deliberate design, no `4404` / `4409` is sent on grace expiry. Every step is idempotent; the interaction is structurally safe. + +## Design notes + +- **`OriginPatterns: ["*"]`.** Same rationale as `/v1/server`: the custom-header gate one layer above structurally excludes browser-driven CSWSH. Browser raw WebSocket cannot set custom request headers, and `fetch` with custom headers triggers a CORS pre-flight this endpoint does not satisfy. +- **Direct `c.Close` on the no-server path, not `wsconn.Close()`.** `WSConn.Close` always emits `StatusNormalClosure`; `4404` requires the underlying `*websocket.Conn`. The no-server case is a stillborn WSConn — no `Send` was attempted, no goroutine holds `writeMu` — so the WSConn invariant is preserved in spirit. See [ADR-0005](../decisions/0005-application-close-codes-via-underlying-conn.md). +- **`defer` after `RegisterPhone` succeeds.** If `RegisterPhone` returns `ErrNoServer`, the handler must NOT call `UnregisterPhone` on a slot it never claimed. The registry tolerates it as a no-op today, but the structural rule sharpens once any cleanup grows side effects beyond a no-op (e.g. metrics increment in a future ticket). +- **No `RegisterPhone` retry on `ErrNoServer` even during a binary's grace window.** During grace, `BinaryFor` still returns the (closed) binary, so `RegisterPhone` succeeds and `handleGraceExpiry` later closes the phone cleanly on expiry. If the slot is empty when `RegisterPhone` runs, that's a true 4404 — no other binary will pick up this phone's claim before the phone re-dials. The handler does not retry, does not poll, does not double-check. +- **`CloseRead` instead of `for { c.Read() }`.** Same as `/v1/server`: keep the goroutine alive, drain control frames so the conn observes peer-close, defer to #6 for the real read loop. Swap site is `<-readCtx.Done()` → real loop body in the same goroutine. +- **`crypto/rand`-backed `connID` suffix.** 32 bits is sufficient — scoped per server-id, used only as an opaque map key in `UnregisterPhone`. `RegisterPhone` does not dedupe by `ConnID` (registry contract); collision odds at v1 scale are negligible. Using `crypto/rand` over `math/rand` is forward-compat hardening: if a future ticket exposes the conn-id, unguessable bytes avoid creating an oracle. +- **No length cap or charset check on `device_name`.** Informational only; slog escapes control characters; no observed failure motivates a check. If oversized headers ever become an issue, mitigation lives upstream (per-IP rate limit, per-header byte cap) — both deferred to the DoS ticket. + +## What this handler deliberately does NOT do + +- **No token validation.** The relay is not the trust boundary for the phone token; the binary owns it. The relay's authorization model on `/v1/client` is purely "is there a binary holding this server-id?" +- **No connection caps (per-IP or global).** Documented residual in `docs/threat-model.md` § DoS resistance. Same gap as `/v1/server`, named there, not widened. +- **No phone-count-per-server-id cap.** `phones[serverID]` grows under attack; the broadcast cost (when #6 lands) is the DoS shape that owns it. +- **No frame loop.** `CloseRead` discards frames until #6 replaces it. +- **No heartbeat / ping-pong.** #7. +- **No phone-side reconnect grace.** The binary-side grace from #20 already closes orphan phones cleanly on expiry; phone-side grace is not in the protocol spec and is out of scope. +- **No `400` log line.** Avoids amplifying header-floods into log volume. +- **No log on `websocket.Accept` errors.** Library writes a 4xx; the failure is visible in the http access log. + +## Adversarial framing + +- **Server-id enumeration via 4404 vs success.** `4404` confirms "no binary holds that id"; success confirms "a binary holds that id, and the relay accepted my token's *presence*." Both are protocol-spec-defined responses; the relay cannot withhold the signal without breaking phones. The token's *value* is validated by the binary on the first frame post-upgrade — an invalid token surfaces as `4401` from the binary forwarded by the relay (#6's territory). At the relay layer, server-id existence is observable by design. +- **Token leak via crafted log line.** The token is never logged, never put into an error string, never compared with `fmt.Errorf`. Log-field set is enumerated; structurally the handler does not reach for the token after the presence-check. The defence is layered: spec, code review, and the absence of any code path that uses the token after the gate. +- **Crafted `X-Pyrycode-Device-Name` to forge log lines.** slog's text and JSON handlers quote string values and escape control characters. Log-line forging is structurally blocked. +- **Slow-loris on upgrade.** Same as `/v1/server`: peer that completes handshake but never sends frames consumes one goroutine; `CloseRead` waits. Connection caps deferred. +- **Race: binary disconnects between phone's `RegisterPhone` and phone observing its conn live.** `RegisterPhone` succeeds; the binary's grace timer arms in parallel; if grace expires, `handleGraceExpiry` closes this phone; the handler's defer runs cleanly. Phone observes `StatusNormalClosure` and reconnects. +- **Phone re-dials thousands of times to grow the phones slice forever.** Each successful register appends; each disconnect removes via `UnregisterPhone`. The slice does not leak across connections. Steady state is "concurrent live phones" — bounded by file descriptors / connection cap (deferred). +- **Token-presence oracle via 4404.** Token-absent → 400; token-present + no binary → 4404; token-present + binary → success. The 400 vs 4404 difference is gated on token presence, but the *validity* of the token is not testable through the relay (the binary owns that). 4404 is not a presence oracle — it confirms the gate passed, which an attacker already knew because they sent a non-empty value. +- **`crypto/rand` panic.** Same posture as `/v1/server`: panic terminates the connection, http server recovers, defer is not yet registered (panic fires inside `randHex8`, before `RegisterPhone`), registry stays clean. + +Verdict from the security review: **PASS**. Reuses `/v1/server`'s audited shape (validate-pre-upgrade, defer-after-success, application-close-codes-on-underlying-conn), narrows the token's lifetime to a single presence-check, excludes the token from every log path by explicit field-set enumeration. The phone/binary lifecycle interactions with #20's grace machinery are walked end-to-end and idempotent on every path. + +## Testing + +`internal/relay/client_endpoint_test.go`, `package relay`. Same harness as `server_endpoint_test.go`: `httptest.NewServer(ClientHandler(reg, logger))` with `websocket.Dial` clients, `slog.NewTextHandler(io.Discard, nil)` for logs. + +`fakeConn` is reused from `registry_test.go` to seed a binary on tests that exercise the success path; tests for the 4404 path skip the seed. + +Tests (1:1 with AC bullets): + +- `TestClientEndpoint_ValidUpgrade_RegistersPhone` — seed binary; dial with valid headers; poll `reg.PhonesFor` until populated; assert `ConnID()` matches `"client--<8 hex>"`. +- `TestClientEndpoint_HeaderGate_400` — table-driven over the three required headers; expect `400`; registry empty. +- `TestClientEndpoint_NoBinary_4404` — no seed; dial; one `Read` returns `*websocket.CloseError` with `Code == 4404` and `Reason == "no server with that id"`; registry empty. +- `TestClientEndpoint_PeerClose_UnregistersPhone` — seed; dial; close client; poll `reg.PhonesFor` until empty; binary remains claimed. +- `TestClientEndpoint_MultiplePhones_IndependentLifecycle` — seed; dial three phones; close them in non-FIFO order; assert removal order does not corrupt other entries (covers `UnregisterPhone`'s swap-with-last-then-truncate). +- `TestClientEndpoint_DeviceNameOptional_HandlerAccepts` — dial without and with `X-Pyrycode-Device-Name`; both succeed. + +Not tested: registry mocks (use the real one — race-tested in #3); library mocks; exact log output; token-not-logged invariant (would require diverting slog to a buffer; defended by code review + spec). + +## Related + +- [`/v1/server`](server-endpoint.md) — sibling ingress; this handler reuses its `randHex8`, `remoteHost`, validate-pre-upgrade shape, and stillborn-WSConn close-code pattern. +- [Connection registry](connection-registry.md) — `RegisterPhone` / `UnregisterPhone` / `ErrNoServer` are the primitives this handler routes through. Its `handleGraceExpiry` closes phones registered here when a binary's grace window expires. +- [WSConn adapter](ws-conn-adapter.md) — what the handler hands to `RegisterPhone`. The direct-`c.Close` on no-server is the documented exception to the WSConn-only invariant. +- [ADR-0005](../decisions/0005-application-close-codes-via-underlying-conn.md) — close-code emission on the underlying `*websocket.Conn`. +- [ADR-0006](../decisions/0006-grace-period-as-reclaim-path.md) — phones registered here are torn down by the registry on binary-grace expiry; this handler's defer is idempotent against that path. +- [Threat model](../../threat-model.md) — log hygiene (token never logged), DoS resistance (connection-cap residual), error-response leakage (generic 4404 reason). +- [Protocol spec § Phone → relay → binary](https://github.com/pyrycode/pyrycode/blob/main/docs/protocol-mobile.md#phone--relay--binary) — authoritative wire shape and close-code set. diff --git a/docs/specs/architecture/5-client-endpoint.md b/docs/specs/architecture/5-client-endpoint.md new file mode 100644 index 0000000..1bd17a1 --- /dev/null +++ b/docs/specs/architecture/5-client-endpoint.md @@ -0,0 +1,432 @@ +# Spec — `/v1/client` WS upgrade, header gate, phone register/unregister (#5) + +## Files to read first + +- `internal/relay/server_endpoint.go` — full file (122 lines). The phone handler is a structural twin: same shape (validate → upgrade → wrap → register → defer-unregister → CloseRead/wait). Reuse `randHex8` and `remoteHost` verbatim — they live in this same package and need no duplication. +- `internal/relay/registry.go:18-21` — `ErrNoServer` sentinel mapped to WS close code 4404; the handler branches on this with `errors.Is`. +- `internal/relay/registry.go:186-228` — `RegisterPhone` / `UnregisterPhone` semantics. Note: `RegisterPhone` returns `ErrNoServer` (not `ErrServerIDConflict`); does NOT deduplicate by `ConnID` (caller's responsibility — random 32-bit suffix makes accidental collision negligible). `UnregisterPhone` is a no-op on unknown `(serverID, connID)`, so a defer that fires after the registry has already torn down the entry (e.g. via `handleGraceExpiry` orphan-close at line 181) is safe. +- `internal/relay/registry.go:148-184` — `ScheduleReleaseServer` + `handleGraceExpiry`. Critical for the phone handler: when the binary's grace window expires, the registry calls `Close()` on every registered phone (line 181-183) and removes them from the map. The phone handler's `CloseRead`-blocked goroutine then unblocks (because `WSConn.Close` cancels the underlying conn's context), runs its defer, and calls `UnregisterPhone` — which no-ops because the entry was already removed. This interaction is structurally safe; the spec re-confirms it under § Concurrency. +- `internal/relay/ws_conn.go` — full file (83 lines). Phone wraps `*websocket.Conn` with `NewWSConn(c, connID)` exactly like #16. The 4404 close path uses the same "stillborn WSConn — close directly on `*websocket.Conn` so the application close code reaches the wire" pattern documented in PROJECT-MEMORY § "Application WS close codes are emitted on the underlying `*websocket.Conn`, not via `WSConn`." +- `internal/relay/server_endpoint_test.go` — full file (357 lines). The phone test file mirrors `startServer`, `dialWith`, `validHeaders` pattern verbatim with phone-side header set; the close-code assertion shape (`errors.As(_, &websocket.CloseError)`) is reused. +- `cmd/pyrycode-relay/main.go:48-50` — current mux registration site. One new line slots in next to `/v1/server`. +- `docs/threat-model.md:42-55` § "Log hygiene" — enumerates what MUST NOT be logged. **Tokens are top of that list.** The phone gate sees a token; the handler must never put it in a log line, error message, or response body. `device_name` is in the MAY-be-logged list (advertised by the binary, not user-secret). +- `docs/threat-model.md:77-85` § "Error response leakage" — public-facing handlers return generic messages. +- `docs/PROJECT-MEMORY.md:36-38` — the four patterns this handler inherits: handler-factory shape, validate-before-upgrade, app-close-codes-on-underlying-conn, defer-after-successful-claim ordering. +- [`pyrycode/pyrycode/docs/protocol-mobile.md` § Phone → relay → binary](https://github.com/pyrycode/pyrycode/blob/main/docs/protocol-mobile.md#phone--relay--binary) — authoritative spec for the phone-side headers and the `4404` close code. The relay does NOT validate the token; the binary does. +- `nhooyr.io/websocket` package docs — `Accept`, `AcceptOptions.OriginPatterns`, `StatusCode`, `Conn.CloseRead`, `Conn.Close`. Same calls as #16. + +## Context + +Implements the phone-side handshake. The endpoint accepts an inbound WebSocket from a mobile client on `/v1/client`, validates three required headers BEFORE upgrading, upgrades on success, looks up the binary holding `x-pyrycode-server`, and registers the phone connection in the registry. If no binary holds the slot, the WS is closed with application code `4404`. + +The relay is **not** the trust boundary for the phone token: per the protocol spec, `x-pyrycode-token` is opaque to the relay — presence-checked only, never parsed, never compared, never logged. The binary owns token verification. The relay's authorization model on `/v1/client` is purely: "is there a binary holding this server-id?" + +This is the phone-ingress only. Frame forwarding is #6, heartbeat is #7, the 30-second grace window on disconnect is #8 (registry-side already shipped via #20; phone-side close code on grace expiry is registry-handled — see § "Phone close on binary grace-expiry"). Out of scope here: any reading from the phone WS beyond `CloseRead`'s drain-and-discard goroutine. + +## Design + +### Package & files + +- New file: `internal/relay/client_endpoint.go` (`package relay`). +- New test file: `internal/relay/client_endpoint_test.go` (`package relay`). +- Edit: `cmd/pyrycode-relay/main.go` adds one line registering the handler on the mux. + +No changes to `registry.go`, `ws_conn.go`, `envelope.go`, `tls.go`, `healthz.go`, `server_endpoint.go`, `doc.go`, `go.mod`, or `Makefile`. `randHex8` and `remoteHost` are package-private helpers in `server_endpoint.go` and are reused as-is. + +### Public API + +```go +package relay + +// ClientHandler returns the http.Handler for /v1/client: the phone-side +// WebSocket upgrade endpoint. It validates required headers, upgrades the +// connection, registers the phone in reg under the requested server-id, and +// holds the connection open until the phone closes it (or the registry +// tears it down on binary-grace expiry). +// +// Close codes used by this endpoint: +// - 1000 (StatusNormalClosure) clean close on shutdown / unregister. +// - 4404 (no server with that id) no binary currently holds the slot. +// +// The relay treats x-pyrycode-token as opaque — its presence is required, +// its value is never parsed, compared, or logged. Token verification is the +// binary's responsibility (per protocol spec § Phone → relay → binary). +func ClientHandler(reg *Registry, logger *slog.Logger) http.Handler +``` + +One exported symbol. No new types, no new sentinel errors. Note: no `grace` parameter — phone disconnect is immediate (`UnregisterPhone`); no policy decision is wired through this handler. The grace concept on `/v1/client` lives entirely in the registry's `handleGraceExpiry` (it closes orphan phones on binary-grace expiry); see § Concurrency. + +### Required headers (validated pre-upgrade) + +| Header (canonical) | Required | Source | Enforcement | +|---------------------------|----------|-----------------------|-----------------------------------------------------------| +| `X-Pyrycode-Server` | Yes | Phone's target id | non-empty | +| `X-Pyrycode-Token` | Yes | Phone-side device tok | non-empty (presence only — never parsed by the relay) | +| `User-Agent` | Yes | HTTP standard | non-empty | +| `X-Pyrycode-Device-Name` | No | User-supplied label | logged verbatim if present; absent → log empty string | + +`r.Header.Get` performs canonicalisation (case-insensitive). Missing or empty among the required set → `http.Error(w, "", http.StatusBadRequest)` with no body. Registry untouched, `Accept` not called, no log line emitted (gate is silent — same hygiene rationale as #16's gate). + +### Algorithm + +``` +1. Validate the three required headers. Any missing/empty → 400, return. + Read x-pyrycode-device-name; "" if absent (no validation; informational). +2. websocket.Accept(w, r, &websocket.AcceptOptions{ + OriginPatterns: []string{"*"}, + }) + - On error: the library has already written a 4xx; just return. + Registry untouched. +3. connID := "client-" + serverID + "-" + randHex8() +4. wsconn := NewWSConn(c, connID) +5. err := reg.RegisterPhone(serverID, wsconn) + - errors.Is(err, ErrNoServer): + _ = c.Close(websocket.StatusCode(4404), "no server with that id") + logger.Info("phone_register_no_server", + "server_id", serverID, + "remote", remoteHost(r)) + return + - other err (not produced by current registry, but for completeness): + wsconn.Close() + return +6. logger.Info("phone_registered", + "server_id", serverID, + "conn_id", connID, + "device_name", deviceName, + "remote", remoteHost(r)) +7. defer: + reg.UnregisterPhone(serverID, connID) + wsconn.Close() + logger.Info("phone_unregistered", + "server_id", serverID, + "conn_id", connID) +8. // Hold the connection open until the peer closes it. Frame loop (#6) + // replaces this block with a real read loop later. + readCtx := c.CloseRead(r.Context()) + <-readCtx.Done() +``` + +Three design notes inside this algorithm: + +**Why `c.Close` directly on the no-server path instead of `wsconn.Close()`.** Same reasoning as #16's `4409` path: `WSConn.Close()` always sends `StatusNormalClosure`; the no-server path needs application-defined `4404`. The WSConn here is stillborn — no `Send` was attempted, no concurrent goroutine holds `writeMu` — so the WSConn doc invariant is preserved in spirit. Pattern is documented in PROJECT-MEMORY § "Application WS close codes are emitted on the underlying `*websocket.Conn`, not via `WSConn`." + +**Why the defer is registered AFTER `RegisterPhone` succeeds, not before.** Mirrors #16 exactly. If `RegisterPhone` returns `ErrNoServer`, the handler must NOT call `UnregisterPhone` on a slot it never registered to (registry tolerates it as a no-op, but the structural rule from PROJECT-MEMORY § "`defer { … }` is registered AFTER the successful claim" applies: never schedule cleanup on resources we don't own — the rule sharpens once any cleanup grows side effects beyond a no-op, e.g. metrics increment in a future ticket). + +**Why `CloseRead` instead of `for { c.Read() }` or a bare block.** Same reasoning as #16: keep the goroutine alive, drain control frames so the conn observes peer-close, defer to #6 for the real read loop. Swap site is `<-readCtx.Done()` → real loop body in the same goroutine. + +### Phone close on binary grace-expiry — interaction with #20 + +When a binary disconnects, #21's handler arms `ScheduleReleaseServer(serverID, 30s)`. If no reconnect arrives within 30s, the registry's `handleGraceExpiry` (registry.go:169-184): + +1. removes the binary entry, +2. snapshots the phones slice, +3. deletes the phones entry from the map, +4. calls `Close()` on every snapshotted phone (each phone is a `*WSConn`; its `Close` cancels its context and closes the underlying `*websocket.Conn`). + +The phone handler's `CloseRead`-blocked goroutine sees its context cancelled (the read goroutine errors out, `readCtx.Done()` fires), runs the defer: + +- `reg.UnregisterPhone(serverID, connID)` — no-op (entry already deleted in step 3). +- `wsconn.Close()` — no-op (sync.Once already fired in step 4). +- log line `phone_unregistered` — fires. + +This interaction is structurally safe: every step is idempotent. The phone observes the close on its socket as `StatusNormalClosure` (registry's `Close()` path through `WSConn.Close`). No 4404 / 4409 / other code is sent on grace-expiry — that's a registry-side concern that registry-#20 explicitly chose to leave at `StatusNormalClosure`. Out of scope to change in this ticket. + +### Helper functions + +None new. `randHex8` and `remoteHost` already exist in `server_endpoint.go` (same package); reuse verbatim. The 32-bit suffix gives collision odds of ~negligible per server-id; `RegisterPhone` does not dedupe by `ConnID` (the registry's contract notes this), so colliding ids would manifest as two phones with the same key — `UnregisterPhone` removes the first match by linear scan, leaving the second registered until its own defer runs and finds nothing to unregister. At v1 scale (≤ tens of phones per server-id), collision is statistically irrelevant. + +### `cmd/pyrycode-relay/main.go` edit + +One line, inserted after the `/v1/server` registration (`main.go:50`): + +```go +mux.Handle("/v1/client", relay.ClientHandler(reg, logger)) +``` + +`logger` and `reg` are already in scope. No other edits. + +### Concurrency model + +The handler runs in the http.Server's per-request goroutine. The `WSConn` it constructs is one of N entries in `r.phones[serverID]`; broadcasts (future tickets) reach it through `reg.PhonesFor`. There is one `CloseRead`-spawned read-discard goroutine per accepted connection. + +| Method / step | Goroutine | Lock | Lifecycle | +|---|---|---|---| +| Header validation | request goroutine | none | pre-upgrade; no resources held | +| `websocket.Accept` | request goroutine | none | conn allocated on success | +| `RegisterPhone` | request goroutine | registry write lock (held internally) | one-shot | +| `CloseRead` | spawns one read-discard goroutine | none | terminates on conn close / ctx cancel | +| `<-readCtx.Done()` | request goroutine, blocking | none | unblocks on (a) peer close, (b) registry-driven `wsconn.Close` from grace expiry, (c) `r.Context()` cancel from server shutdown | +| `defer { Unregister; wsconn.Close; log }` | request goroutine | `wsconn`'s `closeOnce` | runs on every successful register path; idempotent | + +Three unblock paths feed `<-readCtx.Done()`: + +- **Peer close.** Phone closes the socket; `CloseRead`'s read errors; ctx cancels. +- **Binary grace-expiry close.** Registry's `handleGraceExpiry` calls `wsconn.Close()` on this phone; `WSConn.Close` cancels the per-conn ctx and closes the `*websocket.Conn`; `CloseRead`'s read errors; ctx cancels. +- **Server shutdown.** http server cancels `r.Context()`; `CloseRead`'s read returns; ctx cancels. Defer runs, unregister no-ops if other phones beat it (or if grace-expiry already cleared the slice). + +All three paths converge on the same defer; all defer steps are idempotent. + +### Lock-order / deadlock + +The handler takes no lock of its own. Registry methods take the registry's `mu` internally. `WSConn.Close`'s `closeOnce` is a `sync.Once`, not a held lock. The `CloseRead` goroutine never touches the registry. **No lock-order risk.** + +The one subtlety the spec calls out for clarity: `UnregisterPhone` calls `c.ConnID()` under the registry write lock (registry.go:215). For a `*WSConn`, `ConnID()` is a pure getter on the `connID` field — non-blocking, fits the registry's interface contract. No new constraint. + +### Error handling + +- **Missing/empty required header** → `400 Bad Request`, empty body. No log line. Registry untouched. +- **`websocket.Accept` error** — library writes its own 4xx; handler returns silently. Registry untouched. +- **`RegisterPhone` returns `ErrNoServer`** — close 4404, log `event=phone_register_no_server server_id= remote=`. Header values are NOT logged on this path; in particular **the token never appears in a log line** (it does not appear on the success path either). This event includes `server_id` because the no-server signal is operationally useful (a phone targeting a binary that disconnected); it's the same field the threat model already names as MAY-be-logged. +- **`RegisterPhone` returns any other error** — not produced by the current registry. Defensively: `wsconn.Close()` and return without logging. If a future registry change introduces such an error, that diff adds the log. +- **`CloseRead` / `<-readCtx.Done()`** does not fail — returns when the conn ends. +- **No panics from this handler.** `randHex8`'s panic-on-RNG-failure is fatal-by-design; same as #16. + +### Logging — exact field set + +Per `docs/threat-model.md` § "Log hygiene", this handler emits three event types: + +| event | fields | +|-----------------------------|-------------------------------------------------------| +| `phone_registered` | `server_id`, `conn_id`, `device_name`, `remote` | +| `phone_register_no_server` | `server_id`, `remote` | +| `phone_unregistered` | `server_id`, `conn_id` | + +Explicitly **NOT** logged on any path: + +- `x-pyrycode-token` — never appears as a field, never appears inside a formatted error string. The token is read into a local variable for the presence-check, then goes out of scope unread. +- `User-Agent` — validated for presence, discarded (same as #16; matches the protocol-spec posture: phone identifies itself via `device_name`, not via `User-Agent`). +- Any full-request-header dump. + +`device_name` is the user-supplied label (e.g. `"Juhana's iPhone"`); the threat model's MAY-be-logged list names it explicitly. Empty string on absence; logged literally — slog's text/JSON handlers escape control characters, so log-line forging is structurally blocked. `remote` is the IP host portion of `r.RemoteAddr`, no port (`remoteHost` strips it). + +`conn_id` is logged on register and unregister so operators can correlate a phone session across the two events. `phone_register_no_server` deliberately omits `conn_id` because no `WSConn` was constructed before the close on that path (handler returns inside the error branch before line 4 of the algorithm). + +## Testing strategy + +`internal/relay/client_endpoint_test.go`, `package relay`. Same harness shape as `server_endpoint_test.go`: `httptest.NewServer(ClientHandler(reg, logger))` with `websocket.Dial` clients. + +### Test harness + +```go +// startClient spins up an httptest.NewServer running ClientHandler against +// a fresh registry. Returns the registry (for assertions), the WS URL, and +// a cleanup. Tests that need a binary registered call seedBinary on the +// returned registry before dialling. +func startClient(t *testing.T) (*Registry, string, func()) { + t.Helper() + reg := NewRegistry() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ClientHandler(reg, logger)) + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + return reg, wsURL, srv.Close +} + +func dialWithClient(t *testing.T, wsURL string, hdr http.Header) (*websocket.Conn, *http.Response, error) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + return websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: hdr}) +} + +func validClientHeaders(serverID string) http.Header { + h := http.Header{} + h.Set("X-Pyrycode-Server", serverID) + h.Set("X-Pyrycode-Token", "phone-token-opaque") + h.Set("User-Agent", "pyry-phone-test/0.1.0") + return h +} + +// seedBinary places a no-op fakeConn (defined in registry_test.go) as the +// binary for serverID, so RegisterPhone succeeds. Tests that exercise the +// 4404 path skip this. +func seedBinary(t *testing.T, reg *Registry, serverID string) { + t.Helper() + if err := reg.ClaimServer(serverID, &fakeConn{id: "bin-" + serverID}); err != nil { + t.Fatalf("seed ClaimServer: %v", err) + } +} +``` + +`fakeConn` already exists in `registry_test.go` (same package, same build) — reuse it. + +### Tests (1:1 with AC bullets) + +1. **`TestClientEndpoint_ValidUpgrade_RegistersPhone`** — seed binary on `"s1"`; dial with `validClientHeaders("s1")`; assert `Dial` returns no error; poll `reg.PhonesFor("s1")` (bounded retry, ~1 s, 10 ms sleep) until a phone appears; assert exactly one phone present, its `ConnID()` starts with `"client-s1-"` and ends with 8 hex chars (regex / `len`+`hex.DecodeString`). + +2. **`TestClientEndpoint_HeaderGate_400`** — table-driven over the three required headers; each row builds `validClientHeaders` and either deletes the header (via `req.Header[name] = nil`, the same trick #16's test uses for `User-Agent`'s default) or sets it to `""`. Use a plain `http.Client.Do` with the standard upgrade preamble. Expect `StatusCode == 400`. After: `reg.Counts() == (0, 0)` (registry was empty to start; no binary was seeded). + +3. **`TestClientEndpoint_NoBinary_4404`** — do NOT seed binary. Dial with `validClientHeaders("s1")`. Read once on the connection; expect `*websocket.CloseError` with `Code == websocket.StatusCode(4404)` and `Reason == "no server with that id"`. Assert with `errors.As`. After: `reg.Counts() == (0, 0)`. + +4. **`TestClientEndpoint_PeerClose_UnregistersPhone`** — seed binary; dial; wait until registered; call `c.Close(websocket.StatusNormalClosure, "")` on the client; poll `reg.PhonesFor("s1")` (bounded retry ~2 s) until empty. The binary remains claimed (the test does not touch it). + +5. **`TestClientEndpoint_MultiplePhones_IndependentLifecycle`** — seed binary; dial three phones in sequence (`c1`, `c2`, `c3`); wait until `len(reg.PhonesFor("s1")) == 3`. Close `c2`; poll until `len(reg.PhonesFor("s1")) == 2` AND the remaining two phones' `ConnID()`s match the expected `c1` and `c3` ids (capture on register). Close `c1`; poll until `len == 1`. Close `c3`; poll until `len == 0`. Verifies (a) `RegisterPhone` is order-preserving / additive, (b) `UnregisterPhone` removes only the closed phone, (c) the registry's slice mechanics in `UnregisterPhone` (swap-with-last-then-truncate, registry.go:218-225) does not corrupt other entries. + +6. **`TestClientEndpoint_DeviceNameOptional_HandlerAccepts`** — seed binary; dial with `validClientHeaders("s1")` plus NO `X-Pyrycode-Device-Name`; assert success (phone registers). Then dial again with `X-Pyrycode-Device-Name: "Juhana's iPhone"`; assert success. Single test asserts the device-name header is treated as optional — required by the AC ("Read optional header... not required"). + +### Edit fan-out check + +Files: 2 new + 1 line edit in `main.go`. Helpers (`randHex8`, `remoteHost`) reused by name — no rename, no signature change. `fakeConn` reused by name. **Zero consumer call sites need updating.** Well below the 10-call-site red line. + +### What this test file deliberately does not do + +- **Mock the registry.** Use the real one — `Registry` is race-tested in #3 and grace-extended in #20. +- **Mock `nhooyr.io/websocket`.** Same reasoning as #15. +- **Test the binary-grace-expiry close path.** Registry-internal; covered by #20's tests. The phone handler's behaviour under that path (defer no-ops cleanly) is structural; asserting it would require dragging `ScheduleReleaseServer` timing into a `/v1/client` test, which would re-test #20. +- **Assert exact logger output.** `io.Discard` in tests, same as #16. +- **Test `x-pyrycode-token` value handling.** The relay does not parse, compare, or otherwise act on the token's value — only on its presence. The `HeaderGate_400` table covers presence (via the empty / missing rows for `X-Pyrycode-Token`). Asserting "token is not logged" would require diverting slog output to a buffer; the spec relies on code review + the security-review section to enforce the no-log invariant. + +### Lint expectations + +`make vet`, `make test` (under `-race`), `make build` all clean. `gosec ./...` and `govulncheck ./...` clean. No new dependencies. + +## Open questions + +1. **Should the 4404 close-reason mention the requested server-id?** No. The protocol-spec close-reason is a literal generic string. Echoing the server-id back invites probing: an attacker enumerating server-ids gets confirmation in the close frame. The current message — `"no server with that id"` — leaks nothing. The AC mandates this exact string; preserve it. + +2. **Should `RegisterPhone` be retried after `ErrNoServer` if the binary just disconnected (registry in grace window)?** No. During the grace window `BinaryFor` returns the (closed) binary, so `RegisterPhone` succeeds and the phone is registered to a binary that may never reclaim — the registry's `handleGraceExpiry` then closes it on expiry. This is the registry's deliberate behaviour (#20); the phone handler does not retry, does not poll, does not double-check. If the binary's slot is empty when `RegisterPhone` runs, that's a true 4404 — no other binary will ever pick up this phone's claim before the phone re-dials. + +3. **Should the handler reject `device_name` with control characters / very long values?** No. slog escapes control chars; `device_name` is informational only; v1 has no length cap on any header in either endpoint. If the relay grows a per-IP rate limit later, oversized headers are caught upstream of this handler. Per evidence-based defence, no observed failure motivates a check here. + +4. **Should `phone_registered` log the device-name even when empty?** Yes — keep the field present so structured-log consumers see consistent shape. `slog`'s text handler emits `device_name=""` for an empty string, which is unambiguous. + +## Out of scope (re-stated, for the developer) + +- Frame forwarding loop (`#6`) — `CloseRead` discards frames; replace with the real loop in #6. +- Heartbeat / ping-pong (`#7`). +- Per-phone reconnect grace (the binary-side grace from #20 ALREADY closes phones cleanly on expiry; phone-side grace is not in the protocol spec and is not in scope). +- `/v1/server` endpoint — already shipped in #16, grace-extended in #21. +- Per-IP / global connection caps — deferred to the DoS-hardening ticket (`docs/threat-model.md` § "DoS resistance"). +- Token validation — by protocol design, the relay does not see token semantics; the binary owns it. +- Updating `docs/PROJECT-MEMORY.md` and `docs/knowledge/` — owned by the documentation step in the pipeline. + +## Done means + +- `internal/relay/client_endpoint.go` exists with `ClientHandler` and a top-of-file comment block listing close codes. +- `internal/relay/client_endpoint_test.go` covers the six tests above. +- `cmd/pyrycode-relay/main.go` registers the handler on `/v1/client`. +- `make vet`, `make test` (under `-race`), `make build`, `gosec ./...`, `govulncheck ./...` all clean. +- One commit on `feature/5`: `feat(relay): /v1/client WS upgrade with header gate and phone register (#5)`. + +--- + +## Security review (security-sensitive label) + +### Threat surface for THIS ticket + +This is the phone-side ingress — the public, internet-exposed endpoint that mobile clients connect to. The peer is *less* trusted than the binary (`/v1/server` peer is at least running operator-issued software; `/v1/client` peer is anyone on the internet who learned the relay hostname). Every byte of every frame the relay later forwards FROM the phone arrives through this WS upgrade. + +The endpoint sees `x-pyrycode-token` for the first and only time inside the relay process. The relay does not validate the token — the binary does — but the relay handles it: reads it from the header into a local string, presence-checks it, lets it go out of scope. A misstep here (logging it, putting it in an error response, propagating it into a frame) would be a credential leak. + +### Categories walked + +#### 1. Trust boundaries + +The handler's input is `*http.Request`. (a) `serverID` from `X-Pyrycode-Server` is treated as an opaque map key by the registry — no parsing, no filesystem, no command construction. (b) `token` from `X-Pyrycode-Token` is read for presence-check only; never logged, never compared, never forwarded by this handler (frame-forwarding ticket #6 may forward it to the binary inside an envelope — that's #6's contract, not this one's). (c) `userAgent` is presence-checked and discarded. (d) `deviceName` is logged literally; slog escaping is the boundary. (e) `r.RemoteAddr` is parsed by `net.SplitHostPort` for log-only use. + +The trust boundary is explicit: a single function, the `r.Header.Get` block at the top of the handler. Nothing past the gate calls `r.Header` again — the validated locals are the only surface that leaves the boundary. + +**Finding:** No trust boundary widened beyond what the registry already accepts. The token's narrow lifetime (read-once, discarded) is the design's primary defence against credential leakage. + +#### 2. Tokens, secrets, credentials + +- `x-pyrycode-token`: read into a local string, presence-checked, never used after that. **MUST NOT** appear in any log line — the spec calls this out explicitly under § "Logging — exact field set" with `User-Agent` deliberately omitted alongside it. The 4404 close path takes the token's presence as already-validated; an attacker who omits the token gets 400, not 4404, so 4404 cannot be used as a token-presence oracle either way (token-absent → 400; token-present + no-binary → 4404; token-present + binary-present → success). +- The relay does not store, hash, or compare tokens. There is no on-disk artefact for this ticket. +- Token rotation/revocation is owned by the binary (per protocol spec). The relay has no role. +- `conn_id` (32-bit `crypto/rand` suffix) is NOT a security token — same as #16's analysis. It's an opaque routing key; an attacker who knew it could not impersonate the connection (the connection IS the trust). Using `crypto/rand` is principled defence-in-depth: if a future ticket exposes it, an unguessable id avoids creating an oracle. + +**Finding:** Token handling is minimal-scope by design — the relay is not the trust boundary, and the spec keeps the token's lifetime inside the handler as short as possible. The only failure mode is "developer adds a debug log statement that includes the token"; the spec, the threat model, and the security-review section all name this. Defence is layered: doc, code review, and the structural absence of any code path that uses the token after presence-check. + +#### 3. File operations + +N/A — handler does no file I/O. + +#### 4. Subprocess / external command execution + +N/A — handler runs no subprocess. + +#### 5. Cryptographic primitives + +- `randHex8` uses `crypto/rand` (reused from `server_endpoint.go`). Correct primitive. +- TLS termination is upstream (autocert, #9). No TLS handling here. +- No comparison of attacker-controlled values to secrets — the relay does not have the secret to compare against. `crypto/subtle.ConstantTimeCompare` not applicable. + +**Finding:** Inherits #16's RNG-failure-is-fatal posture. No new crypto code. + +#### 6. Network & I/O + +- **Input size limits.** No explicit length cap on header values. `http.Server.ReadHeaderTimeout: 10 s` (`cmd/pyrycode-relay/main.go:53-95`) bounds total header-read duration. Per-header byte cap is not enforced; the threat model already accepts this in v1 (`docs/threat-model.md` § "DoS resistance"). **Finding:** Inherited residual; same posture as #16. +- **Header validation pre-upgrade.** All three required headers checked before `websocket.Accept`. A request missing any header exits with 400 and zero allocation beyond the response. Token is presence-checked, not value-checked (per protocol spec). **Finding:** Matches AC. +- **Timeout discipline.** Inherited from `http.Server` config in `main.go`. No new timeouts in this handler. WSConn's 10s `Send` deadline (#15) bounds the write side once forwarding ticket lands. +- **Slow-loris on upgrade.** Same as #16: peer that completes handshake but never sends frames consumes one goroutine; `CloseRead` waits. Connection caps deferred. +- **Resource exhaustion.** No phone-count-per-server-id cap. The registry's `phones[serverID]` slice grows unbounded under attack. **Finding:** Inherited DoS gap, named in the threat model. Mitigation deferred. +- **TLS.** Inherited from #9. `MinVersion: tls.VersionTLS12`. No change. + +#### 7. Error messages, logs, telemetry + +- **Response bodies.** 400 body is empty. 4404 close-reason is the literal `"no server with that id"` — protocol-defined string, no internal-state leakage. `websocket.Accept` errors let the library write its own short generic 4xx. +- **Logged fields:** `server_id`, `conn_id`, `device_name`, `remote` on success; `server_id`, `remote` on no-server. **Never logged:** token, full headers, user-agent, payloads. The spec § "Logging" enumerates this in a table. +- **No `err.Error()` written to response.** All 4xx bodies are empty or library-defaulted. +- **Conflict-style version-disclosure oracle (#16's concern).** Not applicable: this endpoint does not log binary version anywhere; only the bare minimum identifiers go to logs. + +**Finding:** Matches `docs/threat-model.md` § "Log hygiene" exactly. Token never reaches a log statement on any code path the spec defines. + +#### 8. Concurrency + +- **Lock-order.** Handler takes no lock of its own. Registry methods take `mu` internally. WSConn's `closeOnce` is a sync.Once, not a held lock. **Finding:** No lock-order risk. +- **TOCTOU on shared state.** `RegisterPhone` checks-and-writes under the write lock atomically (registry.go:191-198). The handler does not pre-check via `BinaryFor` then call `RegisterPhone` — it calls `RegisterPhone` directly and branches on the returned error. **Finding:** No TOCTOU window. +- **Defer-before-success race.** Defer is registered AFTER `RegisterPhone` returns nil — same ordering rule as #16 (PROJECT-MEMORY pattern). **Finding:** Safe by construction. +- **Goroutine lifecycle.** One request goroutine + one `CloseRead` read-discard goroutine per accepted connection. The read-discard goroutine terminates on conn close OR `r.Context()` cancel; the request goroutine terminates on `<-readCtx.Done()`; defer runs; goroutines exit. **No leak path** — including under (a) peer close, (b) registry-driven `Close` from binary grace-expiry, (c) server shutdown. +- **Concurrent close races.** `WSConn.Close` is `sync.Once`-guarded; `UnregisterPhone` is no-op-on-miss. The registry's `handleGraceExpiry` calling `Close()` (registry.go:181) racing with the handler's deferred `Close()` is benign — first one wins, second one no-ops. + +**Finding:** No deadlock, no goroutine leak, no TOCTOU. + +#### 9. Threat model alignment + +- **Phone is hostile by default.** The spec assumes this and validates pre-upgrade. +- **Token never logged.** The protocol spec implicitly requires this (the relay is a courier; couriers don't read the contents). The threat model § "Log hygiene" makes it explicit. The spec § "Logging" makes it explicit one level deeper. +- **No amplification.** The relay forwards 1:1; this handler cannot create amplification. +- **First-claim-wins under contention is registry-internal.** Phone-side has no equivalent of `4409` because phones do not "claim" anything — they register additively. The registry's slice is unbounded under attack; that's the residual DoS named in the threat model. +- **Connection caps.** Deferred to a future DoS-hardening ticket. + +### Adversarial framings considered + +- *"Attacker sends `X-Pyrycode-Token: ` to crash slog or fill up logs."* Token is never logged. Header read into a string of size ~RAM (bounded by `ReadHeaderTimeout` and Go's net/http internal limits — `http.DefaultMaxHeaderBytes` is 1 MB). Not used after the presence-check; goes out of scope. **Finding:** No path to log volume or memory exhaustion via the token. + +- *"Attacker sends `X-Pyrycode-Device-Name: \n` to spoof a log entry."* slog's text handler quotes string values; control characters are escaped. JSON handler escapes them too. **Finding:** Log-line forging is structurally blocked. + +- *"Attacker enumerates server-ids by dialling /v1/client with each candidate and observing 4404 vs success."* The 4404 path tells the attacker "no binary has that id." The success path tells them "a binary has that id, AND my opaque token was accepted by the relay (presence-only)." Both are protocol-spec-defined responses. The relay cannot withhold this signal without breaking phones. **Mitigation lives at the protocol layer:** the binary, on receiving the phone's first frame post-upgrade, validates the token; an invalid token leads to a 4401 from the binary (forwarded by the relay in #6's frame-forwarding logic). At the relay layer, server-id existence is observable; this is by design. **Finding:** Acceptable; named here so a reader doesn't think it's an oversight. + +- *"Attacker dials /v1/client with valid headers, holds the connection open without sending frames."* `CloseRead` blocks reading; per-conn cost is one goroutine + one slot in `phones[serverID]`. Without a connection cap, a botnet can fill the registry. Same residual as binary-side; deferred to DoS ticket. + +- *"Attacker exhausts the per-server phones slice, then a real phone tries to register."* `RegisterPhone` always succeeds on a valid serverID (it appends; no cap). The attacker's phones outnumber the legitimate one but each legitimate phone still registers and still gets frames forwarded by #6. The DoS shape is "fan-out cost on broadcast" — owned by future #6 / DoS ticket, not this one. + +- *"Attacker tries CSWSH from a browser."* Same as #16: browser raw-WebSocket cannot set custom request headers, fetch with custom headers triggers CORS pre-flight that the relay does not satisfy. The custom-header gate above the upgrade structurally excludes browser attackers. `OriginPatterns: ["*"]` is safe for this reason. + +- *"Attacker uses POST /v1/client to confuse the upgrade path."* `websocket.Accept` rejects non-GET upgrades with a library 4xx. Registry untouched. Test coverage explicitly exercises this on `/v1/server`; the same library behaviour applies here, but covering it for this endpoint is left implicit (the underlying library is the same). + +- *"Attacker re-dials thousands of times to grow the phones slice forever."* Each successful register appends; each disconnect removes via `UnregisterPhone`. The slice does not leak across connections. Steady state is "concurrent live phones" — bounded by file descriptors / OS / connection cap (deferred). No memory leak on the phones map specifically. + +- *"Attacker sends valid headers, gets 4404, retries with the same headers in a tight loop."* No registry state per failed attempt — `RegisterPhone` failed before any side effect. Attack costs one accept + one close per iteration. Same shape as any unauthenticated 4xx-response endpoint; bounded by DoS ticket. + +- *"Race: binary disconnects between the phone's `RegisterPhone` and the phone observing its conn live."* `RegisterPhone` succeeds (returns nil). The handler logs `phone_registered`. The defer is registered. The grace timer is armed by the binary handler's defer (in parallel). If grace expires, `handleGraceExpiry` closes this phone; the handler's defer runs cleanly. Phone observes a `StatusNormalClosure` and reconnects. **Finding:** Safe by construction; documented under § "Phone close on binary grace-expiry". + +- *"`crypto/rand` panic."* Same posture as #16: panic terminates the connection, http server recovers, defer runs, registry untouched (panic fires inside `randHex8`, which is called BEFORE `RegisterPhone`, so no entry to clean up). Loud failure is correct. + +### Findings + +- **[Trust boundaries]** No findings — single explicit gate at the top of the handler; downstream code holds presence-checked locals, never re-reads `r.Header`. +- **[Tokens, secrets, credentials]** No MUST FIX. **SHOULD FIX (advisory):** the token's no-log posture relies on developer discipline. The spec itself enforces it via the explicit Logging table; code-review on the implementation diff must verify no log statement, no `fmt.Errorf`, and no debug `Printf` includes the token-bearing local. +- **[File operations]** N/A — no file I/O. +- **[Subprocess]** N/A — no exec. +- **[Crypto]** No findings — `crypto/rand` only, reused from #16. +- **[Network & I/O]** No findings on this ticket. Connection caps and per-header byte caps are residual risks already named in `docs/threat-model.md` § "DoS resistance" — out of scope, deferred to a future DoS ticket. +- **[Error messages, logs, telemetry]** No findings — log-field set explicitly enumerated; token excluded by name. The 4404 close-reason matches the AC's literal string with no internal-state leakage. +- **[Concurrency]** No findings — the binary-grace / phone-close interaction is documented and structurally safe; all defer steps are idempotent; no lock-order risk. +- **[Threat model alignment]** No findings — server-id-existence-disclosure on 4404 is by design (named under "Adversarial framings"). DoS gaps inherited. + +### Verdict: PASS + +The handler reuses #16's audited shape (validate-pre-upgrade, defer-after-success, application-close-codes-on-underlying-conn), narrows the token's lifetime to a single presence-check inside the handler, and excludes the token from every log path by explicit field-set enumeration. The phone/binary lifecycle interactions with #20's grace machinery are walked end-to-end and shown to be idempotent on every path. Connection caps and per-header byte caps are documented residual risks, named in the threat model, and explicitly out of scope. The `security-sensitive` label is justified — this is the public, internet-facing, token-bearing ingress. + +**Reviewer:** architect (self-review per `architect/security-review.md`) +**Date:** 2026-05-09 diff --git a/internal/relay/client_endpoint.go b/internal/relay/client_endpoint.go new file mode 100644 index 0000000..4793e22 --- /dev/null +++ b/internal/relay/client_endpoint.go @@ -0,0 +1,88 @@ +package relay + +// WebSocket close codes used by /v1/client. RFC 6455 reserves 4000–4999 for +// application use. +// +// - 1000 (StatusNormalClosure) clean close on shutdown / unregister. +// - 4404 (no server with that id) no binary currently holds the slot. +// +// The relay treats x-pyrycode-token as opaque — its presence is required, +// its value is never parsed, compared, or logged. Token verification is the +// binary's responsibility (per protocol-mobile.md § Phone → relay → binary). + +import ( + "errors" + "log/slog" + "net/http" + + "nhooyr.io/websocket" +) + +// ClientHandler returns the http.Handler for /v1/client: the phone-side +// WebSocket upgrade endpoint. It validates required headers, upgrades the +// connection, registers the phone in reg under the requested server-id, and +// holds the connection open until the phone closes it (or the registry tears +// it down on binary-grace expiry). +func ClientHandler(reg *Registry, logger *slog.Logger) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + serverID := r.Header.Get("X-Pyrycode-Server") + token := r.Header.Get("X-Pyrycode-Token") + userAgent := r.Header.Get("User-Agent") + if serverID == "" || token == "" || userAgent == "" { + http.Error(w, "", http.StatusBadRequest) + return + } + // token goes out of scope here unread: never parsed, compared, or + // logged. The binary owns token verification (protocol-mobile.md). + deviceName := r.Header.Get("X-Pyrycode-Device-Name") + + // OriginPatterns: ["*"] — same rationale as /v1/server: the + // custom-header gate above structurally excludes browser-driven + // CSWSH attempts. + c, err := websocket.Accept(w, r, &websocket.AcceptOptions{ + OriginPatterns: []string{"*"}, + }) + if err != nil { + return + } + + connID := "client-" + serverID + "-" + randHex8() + wsconn := NewWSConn(c, connID) + + if err := reg.RegisterPhone(serverID, wsconn); err != nil { + if errors.Is(err, ErrNoServer) { + // Stillborn WSConn — close directly on the underlying + // *websocket.Conn so the 4404 application code reaches the + // wire (WSConn.Close always emits StatusNormalClosure). + _ = c.Close(websocket.StatusCode(4404), "no server with that id") + logger.Info("phone_register_no_server", + "server_id", serverID, + "remote", remoteHost(r)) + return + } + wsconn.Close() + return + } + + logger.Info("phone_registered", + "server_id", serverID, + "conn_id", connID, + "device_name", deviceName, + "remote", remoteHost(r)) + + defer func() { + reg.UnregisterPhone(serverID, connID) + wsconn.Close() + logger.Info("phone_unregistered", + "server_id", serverID, + "conn_id", connID) + }() + + // Hold the connection open until the peer closes it (or the + // registry tears it down on binary-grace expiry). CloseRead drains + // control frames so the conn observes peer-close. The frame loop + // (#6) replaces this block with a real read loop later. + readCtx := c.CloseRead(r.Context()) + <-readCtx.Done() + }) +} diff --git a/internal/relay/client_endpoint_test.go b/internal/relay/client_endpoint_test.go new file mode 100644 index 0000000..60a28c1 --- /dev/null +++ b/internal/relay/client_endpoint_test.go @@ -0,0 +1,292 @@ +package relay + +import ( + "context" + "encoding/hex" + "errors" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "nhooyr.io/websocket" +) + +// startClient spins up an httptest.NewServer running ClientHandler against +// a fresh registry. Tests that need a binary registered call seedBinary on +// the returned registry before dialling. +func startClient(t *testing.T) (*Registry, string, func()) { + t.Helper() + reg := NewRegistry() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ClientHandler(reg, logger)) + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + return reg, wsURL, srv.Close +} + +func dialWithClient(t *testing.T, wsURL string, hdr http.Header) (*websocket.Conn, *http.Response, error) { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + return websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: hdr}) +} + +func validClientHeaders(serverID string) http.Header { + h := http.Header{} + h.Set("X-Pyrycode-Server", serverID) + h.Set("X-Pyrycode-Token", "phone-token-opaque") + h.Set("User-Agent", "pyry-phone-test/0.1.0") + return h +} + +func seedBinary(t *testing.T, reg *Registry, serverID string) { + t.Helper() + if err := reg.ClaimServer(serverID, &fakeConn{id: "bin-" + serverID}); err != nil { + t.Fatalf("seed ClaimServer(%q): %v", serverID, err) + } +} + +func waitForPhones(t *testing.T, reg *Registry, serverID string, want int, timeout time.Duration) []Conn { + t.Helper() + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + got := reg.PhonesFor(serverID) + if len(got) == want { + return got + } + time.Sleep(10 * time.Millisecond) + } + got := reg.PhonesFor(serverID) + t.Fatalf("PhonesFor(%q) len = %d, want %d after %v", serverID, len(got), want, timeout) + return nil +} + +func TestClientEndpoint_ValidUpgrade_RegistersPhone(t *testing.T) { + reg, wsURL, cleanup := startClient(t) + defer cleanup() + seedBinary(t, reg, "s1") + + c, _, err := dialWithClient(t, wsURL, validClientHeaders("s1")) + if err != nil { + t.Fatalf("dial: %v", err) + } + defer c.Close(websocket.StatusNormalClosure, "") + + phones := waitForPhones(t, reg, "s1", 1, time.Second) + id := phones[0].ConnID() + const prefix = "client-s1-" + if !strings.HasPrefix(id, prefix) { + t.Fatalf("ConnID() = %q, want prefix %q", id, prefix) + } + suffix := strings.TrimPrefix(id, prefix) + if len(suffix) != 8 { + t.Fatalf("ConnID suffix = %q (len %d), want 8 hex chars", suffix, len(suffix)) + } + if _, err := hex.DecodeString(suffix); err != nil { + t.Fatalf("ConnID suffix %q is not hex: %v", suffix, err) + } +} + +func TestClientEndpoint_HeaderGate_400(t *testing.T) { + type row struct { + name string + header string + value string + delete bool + } + rows := []row{ + {name: "missing_X-Pyrycode-Server", header: "X-Pyrycode-Server", delete: true}, + {name: "empty_X-Pyrycode-Server", header: "X-Pyrycode-Server", value: ""}, + {name: "missing_X-Pyrycode-Token", header: "X-Pyrycode-Token", delete: true}, + {name: "empty_X-Pyrycode-Token", header: "X-Pyrycode-Token", value: ""}, + {name: "missing_User-Agent", header: "User-Agent", delete: true}, + {name: "empty_User-Agent", header: "User-Agent", value: ""}, + } + + for _, tc := range rows { + tc := tc + t.Run(tc.name, func(t *testing.T) { + reg := NewRegistry() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ClientHandler(reg, logger)) + defer srv.Close() + + req, err := http.NewRequest(http.MethodGet, srv.URL+"/", nil) + if err != nil { + t.Fatalf("new request: %v", err) + } + req.Header.Set("Upgrade", "websocket") + req.Header.Set("Connection", "Upgrade") + req.Header.Set("Sec-WebSocket-Version", "13") + req.Header.Set("Sec-WebSocket-Key", "dGhlIHNhbXBsZSBub25jZQ==") + req.Header.Set("X-Pyrycode-Server", "s1") + req.Header.Set("X-Pyrycode-Token", "phone-token-opaque") + req.Header.Set("User-Agent", "pyry-phone-test/0.1.0") + if tc.delete { + // For User-Agent, http.Client adds a default value when the + // header is Del'd. Setting the key to a nil slice keeps + // Header.has true so the default is not injected, while + // sending no header on the wire. (Same trick as + // server_endpoint_test.go.) + req.Header[tc.header] = nil + } else { + req.Header.Set(tc.header, tc.value) + } + + client := &http.Client{Timeout: 5 * time.Second} + resp, err := client.Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("status = %d, want 400", resp.StatusCode) + } + if b, p := reg.Counts(); b != 0 || p != 0 { + t.Fatalf("registry counts = (%d, %d), want (0, 0)", b, p) + } + }) + } +} + +func TestClientEndpoint_NoBinary_4404(t *testing.T) { + reg, wsURL, cleanup := startClient(t) + defer cleanup() + + c, _, err := dialWithClient(t, wsURL, validClientHeaders("s1")) + if err != nil { + t.Fatalf("dial: %v", err) + } + defer c.Close(websocket.StatusNormalClosure, "") + + readCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + _, _, readErr := c.Read(readCtx) + if readErr == nil { + t.Fatalf("Read returned nil error, want close") + } + var ce websocket.CloseError + if !errors.As(readErr, &ce) { + t.Fatalf("Read err = %v (%T), want *websocket.CloseError", readErr, readErr) + } + if ce.Code != websocket.StatusCode(4404) { + t.Fatalf("close code = %d, want 4404", ce.Code) + } + if ce.Reason != "no server with that id" { + t.Fatalf("close reason = %q, want %q", ce.Reason, "no server with that id") + } + if b, p := reg.Counts(); b != 0 || p != 0 { + t.Fatalf("registry counts = (%d, %d), want (0, 0)", b, p) + } +} + +func TestClientEndpoint_PeerClose_UnregistersPhone(t *testing.T) { + reg, wsURL, cleanup := startClient(t) + defer cleanup() + seedBinary(t, reg, "s1") + + c, _, err := dialWithClient(t, wsURL, validClientHeaders("s1")) + if err != nil { + t.Fatalf("dial: %v", err) + } + waitForPhones(t, reg, "s1", 1, time.Second) + + if err := c.Close(websocket.StatusNormalClosure, ""); err != nil { + t.Fatalf("client Close: %v", err) + } + waitForPhones(t, reg, "s1", 0, 2*time.Second) + + // Binary stays claimed — the test did not touch /v1/server. + if _, ok := reg.BinaryFor("s1"); !ok { + t.Fatalf("BinaryFor(s1) was released by phone close; binary entry should be untouched") + } +} + +func TestClientEndpoint_MultiplePhones_IndependentLifecycle(t *testing.T) { + reg, wsURL, cleanup := startClient(t) + defer cleanup() + seedBinary(t, reg, "s1") + + dialPhone := func() *websocket.Conn { + c, _, err := dialWithClient(t, wsURL, validClientHeaders("s1")) + if err != nil { + t.Fatalf("dial: %v", err) + } + return c + } + + c1 := dialPhone() + waitForPhones(t, reg, "s1", 1, time.Second) + c2 := dialPhone() + waitForPhones(t, reg, "s1", 2, time.Second) + c3 := dialPhone() + phones3 := waitForPhones(t, reg, "s1", 3, time.Second) + + id1 := phones3[0].ConnID() + id2 := phones3[1].ConnID() + id3 := phones3[2].ConnID() + if id1 == id2 || id2 == id3 || id1 == id3 { + t.Fatalf("ConnIDs not unique: %q %q %q", id1, id2, id3) + } + + // Close the middle phone; the other two remain. + if err := c2.Close(websocket.StatusNormalClosure, ""); err != nil { + t.Fatalf("c2 Close: %v", err) + } + phones2 := waitForPhones(t, reg, "s1", 2, 2*time.Second) + got := map[string]bool{ + phones2[0].ConnID(): true, + phones2[1].ConnID(): true, + } + if !got[id1] || !got[id3] { + t.Fatalf("after closing c2, remaining phones = %v, want %q and %q present", got, id1, id3) + } + if got[id2] { + t.Fatalf("after closing c2, c2's id %q still registered", id2) + } + + if err := c1.Close(websocket.StatusNormalClosure, ""); err != nil { + t.Fatalf("c1 Close: %v", err) + } + phones1 := waitForPhones(t, reg, "s1", 1, 2*time.Second) + if phones1[0].ConnID() != id3 { + t.Fatalf("after closing c1, remaining phone id = %q, want %q", phones1[0].ConnID(), id3) + } + + if err := c3.Close(websocket.StatusNormalClosure, ""); err != nil { + t.Fatalf("c3 Close: %v", err) + } + waitForPhones(t, reg, "s1", 0, 2*time.Second) +} + +func TestClientEndpoint_DeviceNameOptional_HandlerAccepts(t *testing.T) { + reg, wsURL, cleanup := startClient(t) + defer cleanup() + seedBinary(t, reg, "s1") + + // No device-name header. + hdr := validClientHeaders("s1") + c1, _, err := dialWithClient(t, wsURL, hdr) + if err != nil { + t.Fatalf("dial without device-name: %v", err) + } + waitForPhones(t, reg, "s1", 1, time.Second) + + // With device-name header. + hdr2 := validClientHeaders("s1") + hdr2.Set("X-Pyrycode-Device-Name", "Juhana's iPhone") + c2, _, err := dialWithClient(t, wsURL, hdr2) + if err != nil { + t.Fatalf("dial with device-name: %v", err) + } + waitForPhones(t, reg, "s1", 2, time.Second) + + // Clean up. + _ = c1.Close(websocket.StatusNormalClosure, "") + _ = c2.Close(websocket.StatusNormalClosure, "") + waitForPhones(t, reg, "s1", 0, 2*time.Second) +} diff --git a/pyrycode-relay b/pyrycode-relay index 9fb20d4..29cacdf 100755 Binary files a/pyrycode-relay and b/pyrycode-relay differ