diff --git a/cmd/pyrycode-relay/main.go b/cmd/pyrycode-relay/main.go index db90eea..a541435 100644 --- a/cmd/pyrycode-relay/main.go +++ b/cmd/pyrycode-relay/main.go @@ -106,6 +106,7 @@ func main() { relay.NewConnectionsMetrics(metricsReg, reg) relay.NewForwardMetrics(metricsReg, reg) relay.NewGraceMetrics(metricsReg, reg) + upgradeMetrics := relay.NewUpgradeMetrics(metricsReg) metricsMux := http.NewServeMux() metricsMux.Handle("/metrics", relay.NewMetricsHandler(metricsReg)) @@ -148,10 +149,14 @@ func main() { mux := http.NewServeMux() mux.Handle("/healthz", relay.NewHealthzHandler(reg, Version, startedAt)) - mux.Handle("/v1/server", rateLimit(relay.ServerHandler(reg, logger, 30*time.Second, maxFrameBytes))) + // upgradeMetrics.WrapXxxRateLimitDeny sits OUTSIDE the rate-limit + // middleware so it observes the middleware's HTTP 429 (the only + // HTTP-429 source in either pipe — header-gate writes 400, success + // is 101, WS application close codes travel inside the upgrade). + mux.Handle("/v1/server", upgradeMetrics.WrapServerRateLimitDeny(rateLimit(relay.ServerHandler(reg, logger, 30*time.Second, maxFrameBytes, upgradeMetrics)))) // maxPhones=16 caps phones per server-id; over-cap registrations are // rejected with WS close 4429. Per #30 architect spec. - mux.Handle("/v1/client", rateLimit(relay.ClientHandler(reg, logger, maxFrameBytes, 16))) + mux.Handle("/v1/client", upgradeMetrics.WrapClientRateLimitDeny(rateLimit(relay.ClientHandler(reg, logger, maxFrameBytes, 16, upgradeMetrics)))) if *insecureListen != "" { logger.Info("starting", "version", Version, "mode", "insecure", "listen", *insecureListen) diff --git a/docs/knowledge/INDEX.md b/docs/knowledge/INDEX.md index e915e9b..9ffdce9 100644 --- a/docs/knowledge/INDEX.md +++ b/docs/knowledge/INDEX.md @@ -4,6 +4,7 @@ One-line pointers into the evergreen knowledge base. 
Newest entries at the top of each section. ## Features +- [Upgrade-attempt and register-failure counters](features/upgrade-and-register-failure-counters.md) — two CounterVecs at the `/v1/server` + `/v1/client` upgrade call sites so operators can distinguish "flood of malformed headers" from "flood of 4409 conflicts" from "flood of 4429 phone-cap rejections" from "flood of rate-limited clients" without log-grepping. `pyrycode_relay_ws_upgrade_attempts_total{endpoint, outcome}` (`endpoint` ∈ `{server, client}`, `outcome` ∈ `{accept, reject_headers, reject_409, reject_404, reject_429, reject_rate_limit}`) is incremented exactly once per terminal decision; `pyrycode_relay_register_failures_total{kind}` (`kind` ∈ `{no_server, server_in_use, phones_at_cap, ratelimit}`) co-increments in lockstep with the matching `outcome=reject_*` cell. All 16 series are pre-bound via `WithLabelValues` in `NewUpgradeMetrics(reg prometheus.Registerer)` — including the three structurally unreachable cells (`server×reject_404`, `server×reject_429`, `client×reject_409`) which stay at 0 by construction — so ops dashboards see a stable scrape shape rather than series flickering into existence on first occurrence. Nine event-named, nil-receiver-safe methods (`ServerAccept`, `ServerHeaderReject`, `ServerIDConflict`, `ServerRateLimitDeny`, `ClientAccept`, `ClientHeaderReject`, `ClientNoServer`, `ClientPhonesAtCap`, `ClientRateLimitDeny`); the five composite methods are the lockstep contract — a single call bumps both the endpoint×outcome and kind cells, so tests cannot accidentally advance one without the other. 
**Two load-bearing seam choices**: (a) handler-parameter threading (`*UpgradeMetrics` as the final arg to `ServerHandler` / `ClientHandler`) instead of #58's hooks-on-`Registry`, because three of the five terminal sites (header-reject, success, rate-limit-deny) have NO Registry call to hook through — fan-out is 7 mechanical `, nil` test appends, under the 10-call-site red line; (b) `WrapServerRateLimitDeny` / `WrapClientRateLimitDeny` as a status-code observer wrapper stacked OUTSIDE the rate-limit middleware (#47), because adding a callback parameter to `NewRateLimitMiddleware` would cascade across 11 call sites — HTTP 429 in either pipe is unambiguously the rate-limit deny signal (header-gate writes 400, success writes 101, WS close codes 4404/4409/4429 travel inside the upgraded socket). The wrapper's `statusObserverResponseWriter` **must implement `http.Hijacker`** because the wrapped pipe terminates in `websocket.Accept` which type-asserts the writer for hijacking; without forwarding, the upgrade fails with HTTP 500 — a constraint that applies to any future middleware wrapping a WS upgrade. Increments fire structurally tight to the deciding branch (between `http.Error`/`logger.Info` and `return`) so "did this code path execute?" is one statement from "did the counter increment?". All three label values (`endpoint`, `outcome`, `kind`) are hard-coded compile-time constants set per event-named method — none derived from request headers, request paths, or any other attacker-influenced input; cardinality is exactly 16 regardless of traffic. Registered against the private `metricsReg` per ADR-0008 § Scope of use; package-level `TestMetricsRegistry_NoGlobalRegistrarLeak` plus a local sibling `TestUpgradeMetrics_NoGlobalRegistrarLeak` enforce no `DefaultRegisterer` leak structurally. The defensive `wsconn.Close()` non-`Is`-matched branches in both handlers stay MUST-NOT-increment — a future error variant would be a new outcome and belongs in a follow-up ticket. 
Out of scope: latency histograms for the upgrade handshake, a second HTTP-429 source on the upgrade pipes (would invalidate the observer's filter), source-IP-labelled attempt counters (would carry attacker-influenced label values) (#57). - [Per-IP rate-limit middleware (`/v1/server` + `/v1/client`)](features/rate-limit-middleware.md) — HTTP middleware that throttles WS upgrade attempts per source IP **before** `websocket.Accept` runs, before the registry is touched, before the mobile/binary token header is read. One new exported symbol: `relay.NewRateLimitMiddleware(limiter *IPRateLimiter, logger *slog.Logger, trustForwardedFor bool) func(http.Handler) http.Handler` — returns the conventional composition shape, matching `EnforceHost`. Per request: extract `ip := ClientIP(r, trustForwardedFor)`, deny on `ip == ""` (empty-IP guard runs **before** `Allow` because the limiter's `Allow("")` is a normal map key, not a deny — loud-failure rule: refuse un-attributable traffic rather than admit it un-throttled), deny on `!limiter.Allow(ip)`, otherwise `next.ServeHTTP`. Both deny paths emit one `logger.Warn("rate_limited", "remote", strconv.Quote(ip))` line — single allowlisted key, `strconv.Quote` defends against control-byte log injection, the empty-IP case renders `remote="\"\""` so operators can distinguish the two deny causes by the value. 429 body is empty (`http.Error(w, "", http.StatusTooManyRequests)` — matches the existing `BadRequest` shape in `server_endpoint.go` / `client_endpoint.go`, no internal state leaked). New `--trust-x-forwarded-for` CLI flag (default `false`, Usage carries the explicit spoofing warning) is threaded once into the factory. Policy lives at the wiring site in `cmd/pyrycode-relay/main.go`: `rateLimitRefillEvery = 6s`, `rateLimitBurst = 20`, `rateLimitEvictionInterval = 5min` → ~10 sustained attempts/IP/min with 20-attempt burst headroom; `5min` sweep is well above `burst*refillEvery = 120s`. 
One shared `*IPRateLimiter` is applied to both `mux.Handle` registrations — a misbehaving IP retrying against either endpoint shares one bucket; `/healthz` is intentionally NOT wrapped (must remain pollable from monitoring). `defer limiter.Close()` is best-effort (current `os.Exit` listener path skips defers; graceful shutdown out of scope). Concurrency safety fully delegated to the limiter; middleware is stateless, no new locks, no new goroutines. Mobile/binary `X-Pyrycode-Token` structurally not read on this code path. Why middleware not constructor injection: keeps frame-routing handlers ignorant of admission control, makes `/healthz`'s exemption fall out for free, mirrors `EnforceHost`. Closes the third leg of `docs/threat-model.md` § *DoS resistance* alongside #29 (per-frame size cap) and #30 (per-server-id phone cap); the "token-bucket rate limit on /v1/server and /v1/client" line moves from *Future hardening* to v1 mitigation. Out of scope: connection-count cap (different threat surface — *attempt rate* vs *resident count*), CIDR-aware trusted-proxy chain (today's flag is flat all-or-nothing), multi-instance shared-state (would need Redis), rate-limit metrics counter (future ticket parallel to #58), adaptive policy, graceful-shutdown signal handler (#47). - [Metrics listener (localhost-only)](features/metrics-listener.md) — separate `*http.Server` for `/metrics`, bound to a loopback IP literal (default `127.0.0.1:9090`); kept off the internet-exposed public listener that serves `/healthz` + `/v1/{server,client}` because metric values leak operational state. 
Two exports + one sentinel in `internal/relay/metrics_listen.go`: `ErrNonLoopbackBind` (branchable via `errors.Is`), `CheckLoopbackBind(addr)` (pure validator — `net.SplitHostPort` → `ListenerPort` for the port-0 / range / format rejects inherited from #81 → `net.ParseIP(host).IsLoopback()`; hostnames including `localhost:9090` are rejected even when they currently resolve to loopback — the DNS-time TOCTOU window is closed structurally, not by re-resolving), and `NewMetricsServer(addr, h)` (opt-out-aware: `addr == ""` → `(nil, nil)`, validator failure → `(nil, err)`, otherwise an `*http.Server` with the public listener's four timeouts duplicated literally so either listener can drift independently in a future ticket). Wired into both listener branches in `cmd/pyrycode-relay/main.go`: a goroutine launch mirroring the autocert mode's http-01 listener pattern (`os.Exit(1)` on `ListenAndServe` failure — loud-fail-over-silent because a relay booted with metrics enabled but silently not serving them would mislead operator scrapes), the metrics port joins **both** the `expected` and `actual` sets of #81's allowlist (declared secondary listener — must land on both sides of the asymmetric check). Empty-flag opt-out is structural via `metricsSrv != nil` guards at every reference site — no repeated `if *metricsListen == "" {}` branches. TLS / authn deliberately out of scope (loopback IS the defence); same-host adversary in scope but not a defence target; graceful shutdown deferred to #31. Three tests in `metrics_listen_test.go` — 12-row validator matrix, 3-row constructor matrix with timeouts pinned to literal values (not a shared constant), and an end-to-end happy-path that drives validator + constructor + `net.Listen("tcp", "127.0.0.1:0")` + actual `http.Get` round-trip by exploiting `http.Server.Serve(l)` ignoring `Addr` once a listener is supplied (#60). 
- [Listener port allowlist (boot-time refusal)](features/listener-port-allowlist.md) — relay refuses to start (exit 2) if the set of TCP ports it is *about to bind* (`http.Server.Addr` values) contains any port outside an explicit expected set derived from parsed flags: `{443, 80}` in autocert mode, `{}` in `--insecure-listen :` mode. Catches stray `net/http/pprof :6060` listeners, env-flipped debug ports, accidentally-enabled metrics exporters. Three exports from `internal/relay/listeners.go`: `ErrUnexpectedListener` sentinel (branchable via `errors.Is`; wrapped message names surplus + expected ports both ascending so the failure log is deterministic across `map`-iteration runs and grep-friendly), `ListenerPort(addr string) (uint16, error)` parsing `":443"` / `"127.0.0.1:8080"` / `"[::1]:443"` with an explicit reject of port 0 (the ephemeral-placeholder trap — would smuggle an unknown bound port past the actual-set construction), and `CheckListenerPorts(expected, actual map[uint16]struct{}) error` (pure set-difference). **Asymmetric by design**: surplus = error, missing = nil (a failure to bind an expected port surfaces as `ListenAndServe`'s own bind error at exit 1; duplicating the signal at boot would clutter logs). **Port-only**: interface binding (`127.0.0.1` vs `0.0.0.0`) intentionally out of scope on a single-instance internet-exposed deploy. **Reports all surplus in one error** so a manifest enabling several debug surfaces fails in one boot rather than N restart cycles. Wired into each listener branch separately in `cmd/pyrycode-relay/main.go` (different `http.Server` shapes; lifting a helper would invent surface area without a second consumer); structured log fields `unexpected_ports` + `expected_ports` recomputed in `main` via unexported `listenerPortLists` so `Check…` stays a single-error return. 
Paired with `TestBinaryDoesNotImportPprof` in `cmd/pyrycode-relay/deps_test.go` — shells out to `go list -deps -json`, catches the `import _ "net/http/pprof"` handler-registration variant that attaches `/debug/pprof/*` to `http.DefaultServeMux` *without* opening a new port (the runtime check would miss it). Belt-and-suspenders means different fabric: stochastic-ish runtime guard + deterministic compile-time test, neither alone is complete. Exit 2 = config-rejected-at-boot, harmonised across the sentinel family (architect overrode the AC's literal `os.Exit(1)`) (#81). diff --git a/docs/knowledge/codebase/57.md b/docs/knowledge/codebase/57.md new file mode 100644 index 0000000..6bf5fc2 --- /dev/null +++ b/docs/knowledge/codebase/57.md @@ -0,0 +1,87 @@ +# Ticket #57 — `ws_upgrade_attempts_total` + `register_failures_total` counters + +Fifth slice of the #56 metrics rollout (after #59 scaffolding, #61 gauges, #60 listener, #58 frame/grace counters). Adds **outcome-labelled counters at the WebSocket-upgrade call sites** so an operator can distinguish "flood of malformed headers" from "flood of `4409` conflicts" from "flood of `4429` phone-cap rejections" from "flood of rate-limited clients" without log-grepping. + +## Implementation + +- **`internal/relay/metrics_upgrade.go` (new, ~230 lines)** — one exported `UpgradeMetrics` struct, one constructor, nine event-named methods, and two endpoint-specific `Wrap*RateLimitDeny` middleware factories: + - `UpgradeMetrics` holds 13 unexported pre-bound `prometheus.Counter` fields (9 reachable endpoint×outcome cells + 4 register-failure kinds). The 3 structurally unreachable cells (`server×reject_404`, `server×reject_429`, `client×reject_409`) are pre-bound via `WithLabelValues` in the constructor but have no stored field and no method to advance them — they exist only so the scrape exposes them at 0. Extending either vector requires a new method here, an updated test, and a spec amendment. 
+ - `NewUpgradeMetrics(reg prometheus.Registerer) *UpgradeMetrics` constructs and `MustRegister`s two `*prometheus.CounterVec`s (`pyrycode_relay_ws_upgrade_attempts_total` with labels `[endpoint, outcome]`, `pyrycode_relay_register_failures_total` with label `[kind]`), pre-binds all 16 cells, and returns the struct. ADR-0008 § Scope of use forbids `prometheus.DefaultRegisterer`; `TestMetricsRegistry_NoGlobalRegistrarLeak` and a local sibling test enforce that structurally. + - **Event-named methods**, all nil-receiver safe: `ServerAccept`, `ServerHeaderReject`, `ServerIDConflict`, `ServerRateLimitDeny`, `ClientAccept`, `ClientHeaderReject`, `ClientNoServer`, `ClientPhonesAtCap`, `ClientRateLimitDeny`. The five composite methods (`ServerIDConflict`, `ClientNoServer`, `ClientPhonesAtCap`, `ServerRateLimitDeny`, `ClientRateLimitDeny`) bump both the endpoint×outcome cell AND the matching register-failure kind in lockstep, so a single method call is the increment contract — tests cannot accidentally advance one without the other. Method names encode the event, not the labels; mirrors `forwardMetrics.phoneToBinary` from #58. + - **`WrapServerRateLimitDeny(next) http.Handler`** + **`WrapClientRateLimitDeny(next) http.Handler`** — status-code observer middleware that wraps the response writer in a tiny `statusObserverResponseWriter`, runs `next`, and calls `Server`/`ClientRateLimitDeny()` if `next` wrote HTTP 429. The wrapper sits OUTSIDE the rate-limit middleware so it observes the middleware's 429; the rate-limit middleware is the only HTTP-429 source in either pipe (header-gate writes 400, success is 101, WS application close codes 4404/4409/4429 travel inside the upgraded WebSocket). The two methods are named-per-endpoint rather than a single `WrapRateLimitDeny(endpoint string, ...)` because the label value MUST be a compile-time constant — same constraint as `metrics_forward.go`'s hard-coded `direction` values. 
+ - `statusObserverResponseWriter` is a 30-line type that records the first `WriteHeader` code and forwards every other method to the embedded `http.ResponseWriter`. **Implements `http.Hijacker`** by type-asserting and delegating, because the wrapped pipe includes the WebSocket upgrade handler — `websocket.Accept` type-asserts the writer for `http.Hijacker` to hand off the TCP connection. +- **`internal/relay/server_endpoint.go` (modified, +1 signature, +3 increments)** — adds `metrics *UpgradeMetrics` as the fifth parameter to `ServerHandler`. Increments placed structurally tight to the deciding branch: + - Header-gate reject branch (line ~45): `metrics.ServerHeaderReject()` between `http.Error` and `return`. + - `ErrServerIDConflict` branch (line ~73): `metrics.ServerIDConflict()` between the close-code write and the `logger.Info`. The composite method bumps both upgrade-outcome and register-failure kind. + - Success path (line ~89): `metrics.ServerAccept()` before the success log fires. + - The defensive `wsconn.Close()` branch at the original line 78-83 stays MUST-NOT-increment — a future error variant returned by `ClaimServer` would be a new outcome and belongs in a follow-up ticket. +- **`internal/relay/client_endpoint.go` (modified, +1 signature, +4 increments)** — adds `metrics *UpgradeMetrics` as the fifth parameter to `ClientHandler`. Increments: + - Header-gate reject (line ~46): `metrics.ClientHeaderReject()`. + - `ErrNoServer` branch (line ~70): `metrics.ClientNoServer()` (composite). + - `ErrPhonesAtCap` branch (line ~77): `metrics.ClientPhonesAtCap()` (composite). + - Success path (line ~89): `metrics.ClientAccept()` before the success log. + - Defensive `wsconn.Close()` branch at the original line 81-83 stays MUST-NOT-increment. 
+- **`cmd/pyrycode-relay/main.go` (+1 constructor line, +2 wiring lines)** — one new call in the metrics-wiring block: + + ```go + upgradeMetrics := relay.NewUpgradeMetrics(metricsReg) + ``` + + and the two mux entries are rewritten to stack the observer OUTSIDE the rate-limit middleware: + + ```go + mux.Handle("/v1/server", upgradeMetrics.WrapServerRateLimitDeny(rateLimit(relay.ServerHandler(reg, logger, 30*time.Second, maxFrameBytes, upgradeMetrics)))) + mux.Handle("/v1/client", upgradeMetrics.WrapClientRateLimitDeny(rateLimit(relay.ClientHandler(reg, logger, maxFrameBytes, 16, upgradeMetrics)))) + ``` + + Boot-time ordering is correct: the construction call runs before any listener starts serving. +- **`internal/relay/metrics_upgrade_test.go` (new, ~370 lines)** — five tests plus three slice-of-string label inventories and two assertion helpers: + - `assertServerOutcomes` / `assertClientOutcomes` / `assertFailureKinds` walk the closed-label inventories and assert every cell against a `map[string]int`; missing keys are asserted at 0. This is the "and 0 elsewhere" half of the no-double-increment guarantee — without it, a stray composite-method bump on an unrelated terminal path would slip through. + - `TestUpgradeMetrics_ServerEndpoint_TerminalPaths` table-driven over `{accept, reject_headers, reject_409, reject_rate_limit}` — each row stands up a fresh registry + metrics registry + `httptest.NewServer` (or, for the rate-limit row, a directly-served middleware stack), exercises the terminal, and asserts the single-cell-at-1 + all-other-cells-at-0 invariant. The `reject_rate_limit` row uses `newMiddlewareTestLimiter(t, 1)` so the second attempt is denied without time dependence; that row also asserts `reject_headers=1` because the first attempt with no headers reaches the inner `ServerHandler` and gets header-gated. + - `TestUpgradeMetrics_ClientEndpoint_TerminalPaths` analogous over `{accept, reject_headers, reject_404, reject_429, reject_rate_limit}`. 
The `reject_429` row seeds a binary, sets `cap=1`, dials one phone to fill the cap, then dials a second and reads the WS close. + - `TestUpgradeMetrics_RegisterFailures_CoIncrement` drives each composite method once directly (no handler machinery — the methods are the contract under test) and asserts the four `kind=*` cells match their paired `outcome=reject_*` cells. The `ratelimit` kind is at 2 because both `Server`- and `ClientRateLimitDeny` bump the same kind cell — that's the "sum on `kind` equals the sum on the matching `outcome` value" invariant from the spec's Help string made executable. + - `TestUpgradeMetrics_AllSixteenSeries_Exposed` drives every reachable method once, then asserts: 9 reachable endpoint×outcome cells at 1, 3 structurally unreachable cells at 0, 4 failure-kind cells at the totals implied by composite methods (`ratelimit=2`, the rest at 1). 12 + 4 = 16 series. + - `TestUpgradeMetrics_NoGlobalRegistrarLeak` snapshot-deltas `prometheus.DefaultGatherer.Gather()` before/after constructing the upgrade vectors against a fresh private registry. Same shape as the package-level `TestMetricsRegistry_NoGlobalRegistrarLeak`; locally pinned so a future refactor of `metrics_test.go` cannot accidentally drop the structural defence. +- **Test fan-out (mechanical edits)** — seven `, nil` appends, no behaviour change: + - `server_endpoint_test.go` lines 25, 106, 339 (three sites). + - `client_endpoint_test.go` lines 25, 36, 125 (three sites). + - `ratelimit_middleware_test.go` line 263 (one site). + +## Acceptance criteria — verification map + +- **AC-1** (`ws_upgrade_attempts_total` defined with `endpoint`, `outcome` labels covering the six outcomes): `metrics_upgrade.go` constructor registers the vec and pre-binds all 12 endpoint×outcome combinations. Test: `TestUpgradeMetrics_AllSixteenSeries_Exposed`. +- **AC-2** (`register_failures_total` defined with `kind` covering four values): same file, second vec; tests assert all four cells. 
+- **AC-3** (both counters register against `NewMetricsRegistry()`, never `prometheus.DefaultRegisterer`; `TestMetricsRegistry_NoGlobalRegistrarLeak` still passes): constructor takes `prometheus.Registerer`; the package-level invariant runs unmodified; `TestUpgradeMetrics_NoGlobalRegistrarLeak` is the local sibling. +- **AC-4** (every `/v1/server` terminal path increments exactly once; `reject_409` co-increments `kind=server_in_use`; `reject_rate_limit` co-increments `kind=ratelimit`): `server_endpoint.go` increments are structurally tight to each branch; `TestUpgradeMetrics_ServerEndpoint_TerminalPaths` exercises each. Composite methods enforce co-increment. +- **AC-5** (every `/v1/client` terminal path increments exactly once; `reject_404` ↔ `no_server`, `reject_429` ↔ `phones_at_cap`, `reject_rate_limit` ↔ `ratelimit`): `client_endpoint.go` + `TestUpgradeMetrics_ClientEndpoint_TerminalPaths`. +- **AC-6** (no double-increment per request, verified by table-driven assertions of exact values): `assertServerOutcomes` / `assertClientOutcomes` walk the full 6-cell inventory and assert 0 on every unstated cell. A stray increment surfaces as the inventory assertion finding non-zero where 0 was expected. +- **AC-7** (all 16 series exposed after a per-path exercise via `promhttp.HandlerFor`): `TestUpgradeMetrics_AllSixteenSeries_Exposed` is exactly this — drives every reachable method, scrapes via `NewMetricsHandler` (which is `promhttp.HandlerFor` underneath), asserts all 16 cells. +- **AC-8** (`make vet`, `make test -race`, `make build` clean): verified in the developer run. +- **AC-9** (codebase summary + INDEX update): this file lands now; the INDEX entry is the documentation phase's responsibility per the dev-agent contract. 
+ +## Patterns established + +- **Status-code observer middleware for cross-middleware metric observation.** When a metric increment site lives in a middleware that runs before the handler (here: the rate-limit middleware), don't thread a callback parameter through the middleware constructor — the call-site fan-out crosses the architect-playbook red line. Instead, stack a thin `http.ResponseWriter` wrapper OUTSIDE the middleware that records `WriteHeader` and inspects the status code after the inner pipe returns. The wrapper must implement `http.Hijacker` if any downstream handler does a WS upgrade. The single deterministic HTTP-status-429 source in the pipe is what makes this readable as "rate-limit deny"; if a future ticket introduces a second 429 source on the same pipe (e.g. capacity-based admission), the observer's filter must grow to disambiguate (or move closer to the source). +- **Composite event methods as the lockstep contract.** When two metric cells must always advance together (here: an `outcome=reject_*` and its paired `kind=*`), bake the lockstep into a single event-named method. The "their sum on `kind` equals the sum on the matching `outcome`" invariant becomes a property of the method, not of every caller — tests cannot accidentally bump one without the other. Mirrors the pre-bind pattern from #58 (constructor binds the labels once; callers reach them via event names). +- **Pre-bind unreachable label cells to expose zero-valued series.** A closed-label contract gains stability if the constructor pre-binds every cell — including structurally unreachable ones — so the scrape shape is fixed regardless of which paths have been exercised. Operator dashboards see consistent series rather than series flickering into existence on first occurrence. Cost is one `WithLabelValues` call per cell at boot; benefit is a discoverable surface. 
+- **Closed-label inventories in test code as the no-double-increment proof.** Instead of asserting "the cell I exercised is 1" (which misses a stray increment on a sibling cell), define a `var allXxxOutcomes = []string{…}` slice and an `assertXxxOutcomes(t, h, map)` helper that walks the slice and asserts the wanted value (or 0 for missing keys). A stray bump anywhere in the inventory surfaces as a non-zero-where-zero-expected failure. Same shape can be lifted to any future label-set test. +- **Nil-safe `*UpgradeMetrics` parameter on existing handlers as a low-fan-out seam.** When hook-on-Registry (#58 pattern) doesn't work because the increment site has no Registry call to hook into (here: header-reject, success, rate-limit-deny), the next-best seam is a nil-safe pointer parameter on the handler constructor. Every existing test gets a mechanical `, nil` append (7 call sites here, under the 10-call-site red line). The nil-receiver-safe methods mean test code that doesn't care about metrics writes nothing. + +## Lessons learned + +- **The `http.Hijacker` re-export is load-bearing on any wrapper that surrounds a WebSocket upgrade.** The initial spec sketch for `statusObserverResponseWriter` did not call out the `Hijack` forwarding; without it, `websocket.Accept` would fail because the type assertion `w.(http.Hijacker)` returns false on a bare embed. The forwarding method is six lines (type-assert + delegate-or-`http.ErrNotSupported`) and is the difference between a successful upgrade and a 500. Any future middleware wrapper added between the outer mux and a WS handler must also forward `Hijack`. (`http.ResponseController` in newer stdlib provides an alternative, but `nhooyr.io/websocket` at v1.8 type-asserts directly.) 
+- **Wrapping the rate-limit middleware was non-obvious — the natural reach was to add a hook parameter to `NewRateLimitMiddleware`.** Adding a callback to the existing middleware constructor would have cascaded across 11 call sites (1 prod + 10 tests), at the edit-fan-out red line. The wrapper-outside choice keeps the rate-limit middleware unchanged and the metric concern out of `ratelimit_middleware.go` entirely. Carry forward: when a metric increment site is "the same place an existing middleware fires", prefer a thin status-code observer OUTSIDE the middleware over a parameter-threading change INSIDE it. +- **The pre-bind-unreachable-cells choice paid for itself in test ergonomics.** With every cell pre-bound, `assertServerOutcomes`/`assertClientOutcomes` walking the closed inventory just works — every cell scrapes to a line, even at 0, so the substring match in `assertCounter` finds what it expects. Without pre-binding, the unreached cells would be absent from the scrape body and `assertCounter`'s "want 0" line would fail to substring-match `"{…} 0"`. The constructor's three "throwaway" `_ = upgradeVec.WithLabelValues(…)` lines are not throwaway — they are the contract. +- **Driving composite methods directly in `TestUpgradeMetrics_RegisterFailures_CoIncrement` is cleaner than dialing handlers.** The co-increment invariant is a property of the method, not of the handler. Bringing handlers into the test wiring would couple the invariant to dial/upgrade machinery that is irrelevant to the contract. Same posture: test the smallest unit that carries the invariant. + +## Cross-links + +- [Architect spec](../../specs/architecture/57-upgrade-and-register-failure-counters.md) — full design rationale + security review (PASS, no findings beyond the closed-label constraint already structurally enforced). +- [Codebase note #58](58.md) — frame/grace counters; established the pre-bind pattern this ticket extends. 
+- [Codebase note #59](59.md) — registry + handler scaffolding; the seam this ticket registers against. +- [Codebase note #60](60.md) — listener wiring conventions. +- [Codebase note #61](61.md) — connection-count gauges; established the same-package fakes posture. +- [Codebase note #47](47.md) — per-IP rate-limit middleware; the deny path observed by `WrapXxxRateLimitDeny`. +- [Codebase note #30](30.md) — phones-per-server-id cap (WS close 4429); the active increment site for `client,reject_429` and `kind=phones_at_cap`. +- [ADR-0008](../decisions/0008-prometheus-client-adoption.md) — Prometheus scope-of-use rules; closed-label cardinality budget; `DefaultRegisterer` ban. +- [#56 — parent ticket](https://github.com/pyrycode/pyrycode-relay/issues/56) — split into #59 / #60 / #61 / #58 / #57. diff --git a/docs/knowledge/features/upgrade-and-register-failure-counters.md b/docs/knowledge/features/upgrade-and-register-failure-counters.md new file mode 100644 index 0000000..b4f6a77 --- /dev/null +++ b/docs/knowledge/features/upgrade-and-register-failure-counters.md @@ -0,0 +1,129 @@ +# Upgrade-attempt and register-failure counters + +Two Prometheus counter vectors at the `/v1/server` and `/v1/client` WebSocket-upgrade call sites, so an operator can distinguish "flood of malformed headers" from "flood of `4409` conflicts" from "flood of `4429` phone-cap rejections" from "flood of rate-limited clients" without log-grepping: + +- `pyrycode_relay_ws_upgrade_attempts_total{endpoint, outcome}` — one increment per terminal decision in `ServerHandler` / `ClientHandler`. A single request never increments more than once. + - `endpoint` ∈ `{server, client}`. + - `outcome` ∈ `{accept, reject_headers, reject_409, reject_404, reject_429, reject_rate_limit}`. +- `pyrycode_relay_register_failures_total{kind}` — co-incremented in lockstep with the matching `outcome=reject_*` cell. + - `kind` ∈ `{no_server, server_in_use, phones_at_cap, ratelimit}`. 
+ +All 16 series (12 endpoint×outcome cells + 4 kind cells) are pre-bound via `WithLabelValues` in the constructor and exposed at the scrape — including the three structurally unreachable cells (`server×reject_404`, `server×reject_429`, `client×reject_409`), which stay at 0 by construction. Ops dashboards see a stable shape rather than series flickering into existence on first occurrence. + +## API + +Package `internal/relay`: + +```go +// metrics_upgrade.go +func NewUpgradeMetrics(reg prometheus.Registerer) *UpgradeMetrics + +func (m *UpgradeMetrics) ServerAccept() +func (m *UpgradeMetrics) ServerHeaderReject() +func (m *UpgradeMetrics) ServerIDConflict() // bumps server×reject_409 AND kind=server_in_use +func (m *UpgradeMetrics) ServerRateLimitDeny() // bumps server×reject_rate_limit AND kind=ratelimit +func (m *UpgradeMetrics) ClientAccept() +func (m *UpgradeMetrics) ClientHeaderReject() +func (m *UpgradeMetrics) ClientNoServer() // bumps client×reject_404 AND kind=no_server +func (m *UpgradeMetrics) ClientPhonesAtCap() // bumps client×reject_429 AND kind=phones_at_cap +func (m *UpgradeMetrics) ClientRateLimitDeny() // bumps client×reject_rate_limit AND kind=ratelimit + +func (m *UpgradeMetrics) WrapServerRateLimitDeny(next http.Handler) http.Handler +func (m *UpgradeMetrics) WrapClientRateLimitDeny(next http.Handler) http.Handler +``` + +Every method is nil-receiver safe. `ServerHandler` / `ClientHandler` accept `*UpgradeMetrics` as their final parameter; existing tests pass `nil` and write nothing to either vector. + +## Design — two load-bearing choices + +### Why a per-handler nil-safe parameter, not hooks-on-Registry + +Three of the five per-endpoint terminal sites (header-reject, success, rate-limit-deny) have **no Registry call to hook through** — the #58 hooks-on-`Registry` pattern doesn't fit. The next-best seam is an exported `*UpgradeMetrics` parameter on the two handler constructors, threaded once from `cmd/pyrycode-relay/main.go`. 
The parameter is nil-safe: every existing test gets a mechanical `, nil` append (7 sites total, under the 10-call-site fan-out red line). + +### Why a status-code observer wrapper around the rate-limit middleware + +The rate-limit middleware (#47) short-circuits before the upgrade handler runs, so the handler cannot observe its deny. Adding a callback parameter to `NewRateLimitMiddleware` would cascade across **11 call sites** (1 prod + 10 tests) — at the edit-fan-out red line. Instead, `WrapServerRateLimitDeny` / `WrapClientRateLimitDeny` stack a thin `statusObserverResponseWriter` OUTSIDE the rate-limit middleware and inspect the status code after the inner pipe returns. HTTP 429 in either pipe **is** the rate-limit deny signal: + +- Header-gate rejects write 400. +- Successful upgrades write 101 (regardless of subsequent WS application close codes). +- WS close codes 4404 / 4409 / 4429 travel **inside** the upgraded WebSocket, not as HTTP status. +- The rate-limit middleware is structurally the only HTTP-429 source on either pipe. + +If a future ticket introduces a second HTTP-429 source on the same pipe (e.g. capacity-based admission), the observer's filter must grow to disambiguate, or the increment must move closer to the source. + +The two wrap methods are named-per-endpoint (`WrapServerRateLimitDeny` vs. `WrapClientRateLimitDeny`) rather than a single `WrapRateLimitDeny(endpoint string, next)` because the `endpoint` label value MUST be a compile-time constant — same constraint as `metrics_forward.go`'s hard-coded `direction` values. + +### Composite methods as the lockstep contract + +The five composite methods (`ServerIDConflict`, `ClientNoServer`, `ClientPhonesAtCap`, `ServerRateLimitDeny`, `ClientRateLimitDeny`) bump both the endpoint×outcome cell AND the matching `kind` cell in a single call. 
The "their sum on `kind` equals the sum on the matching `outcome` value" invariant from the help string is a property of the method, not of every caller — tests cannot accidentally advance one without the other. + +## Increment placement — structurally tight to the deciding branch + +Each handler increment fires **immediately after** the deciding branch is taken and **before** any error return / WS close, so "did this code path execute?" is one statement away from "did the counter increment?". + +`server_endpoint.go`: + +- Header-gate reject → `metrics.ServerHeaderReject()` between `http.Error` and `return`. +- `ClaimServer` `ErrServerIDConflict` (WS 4409) → `metrics.ServerIDConflict()` between the close-code write and the `logger.Info`. +- Success → `metrics.ServerAccept()` before the success log fires. +- The defensive `wsconn.Close()` branch on a non-conflict `ClaimServer` error is structurally unreachable from the current registry and **does not** increment — a future variant would be a new outcome and belongs in a follow-up ticket. + +`client_endpoint.go`: + +- Header-gate reject → `metrics.ClientHeaderReject()`. +- `ErrNoServer` (WS 4404) → `metrics.ClientNoServer()` (composite: also kind=`no_server`). +- `ErrPhonesAtCap` (WS 4429) → `metrics.ClientPhonesAtCap()` (composite: also kind=`phones_at_cap`). +- Success → `metrics.ClientAccept()` before the success log. +- The defensive `wsconn.Close()` non-`Is`-matched branch stays MUST-NOT-increment. 
+ +`cmd/pyrycode-relay/main.go` wires the observer OUTSIDE the rate-limit middleware: + +```go +upgradeMetrics := relay.NewUpgradeMetrics(metricsReg) + +mux.Handle("/v1/server", + upgradeMetrics.WrapServerRateLimitDeny( + rateLimit(relay.ServerHandler(reg, logger, 30*time.Second, maxFrameBytes, upgradeMetrics)))) +mux.Handle("/v1/client", + upgradeMetrics.WrapClientRateLimitDeny( + rateLimit(relay.ClientHandler(reg, logger, maxFrameBytes, 16, upgradeMetrics)))) +``` + +## `statusObserverResponseWriter` — Hijack forwarding is load-bearing + +The 30-line observer records the first `WriteHeader` code and forwards every other call to the embedded `http.ResponseWriter`. **It MUST implement `http.Hijacker`** because the wrapped pipe terminates in `websocket.Accept`, which type-asserts the writer for `http.Hijacker` to hand off the TCP connection. Without the forwarding method, the upgrade fails with HTTP 500. Any future middleware wrapper that surrounds a WS upgrade handler must also forward `Hijack`. (`http.ResponseController` in newer stdlib provides an alternative, but `nhooyr.io/websocket` at v1.8 type-asserts directly.) + +The wrapper does not buffer the response body — it records one `int` per request. No new memory pressure or DoS vector. + +## No attacker-influenced labels + +All three label spaces (`endpoint`, `outcome`, `kind`) are hard-coded string constants set at the call site by the event-named methods. None are derived from request headers, request paths, or any other attacker-influenced input. Cardinality is exactly 16 series regardless of traffic — same load-bearing constraint as `metrics_forward.go`'s `direction` label, with the same rationale (a label whose value comes from a user-controlled header explodes cardinality and exposes the header value to anyone scraping `/metrics`). Extending either vector requires updating `NewUpgradeMetrics`, the `TestUpgradeMetrics_AllSixteenSeries_Exposed` expected-series list, and the doc comment on the type. 
+ +## Concurrency + +- **No new goroutines.** Counter increments run on whichever goroutine the caller is on — the per-request `ServeHTTP` goroutine for handler increments, the same goroutine for the wrapper's post-pipe inspection. +- **No new locks.** `prometheus.Counter.Inc` is internally atomic. +- **Per-request observer state.** `statusObserverResponseWriter` is allocated per `ServeHTTP` call — no shared state across goroutines. +- **CounterVec children are pre-bound** in the constructor; each increment is a single atomic Inc on a pre-resolved child. + +## Registration discipline + +`NewUpgradeMetrics` accepts a `prometheus.Registerer` and `MustRegister`s both vectors against it. Per ADR-0008 § *Scope of use*, registration against `prometheus.DefaultRegisterer` is forbidden; the package-level `TestMetricsRegistry_NoGlobalRegistrarLeak` plus a local sibling `TestUpgradeMetrics_NoGlobalRegistrarLeak` enforce the rule structurally (before/after snapshot of `prometheus.DefaultGatherer.Gather()` is unchanged). + +## Out of scope + +- **Latency histograms** for upgrade handshake duration — not covered by this slice. A future ticket would weigh the hot-path cost. +- **A second HTTP-429 source on the upgrade pipes** — would invalidate the observer's "429 ⇒ rate-limit deny" filter. None exist today; a future contributor adding one must extend the observer or relocate the increment. +- **Connection-attempt rate metrics by source IP** — would carry attacker-influenced label values, ruled out by threat model § Log hygiene. + +## Related + +- [Metrics registry (scaffolding)](metrics-registry.md) — the seam these counters plug into (#59). +- [Metrics listener (localhost-only)](metrics-listener.md) — the `/metrics` surface that serves them (#60). +- [Connection-count gauges](connection-count-gauges.md) — pull-based sibling collector (#61). 
+- [Frame-forwarded and grace-expiry counters](frame-and-grace-counters.md) — push-shaped sibling using hooks-on-`Registry` instead of a handler parameter (#58); same pre-bind pattern, different seam. +- [Per-IP rate-limit middleware](rate-limit-middleware.md) — the deny path observed by `WrapXxxRateLimitDeny` (#47). +- [`/v1/server` WS upgrade](server-endpoint.md) — host of the server-side increments (#26, #57). +- [`/v1/client` WS upgrade](client-endpoint.md) — host of the client-side increments, including the `4429` phones-at-cap branch from #30. +- [ADR-0008: Prometheus client adoption](../decisions/0008-prometheus-client-adoption.md) — scope-of-use rules; closed-label cardinality budget; `DefaultRegisterer` ban. +- [Threat model](../../threat-model.md) — log-hygiene rule extended to metric labels. diff --git a/docs/specs/architecture/57-upgrade-and-register-counters.md b/docs/specs/architecture/57-upgrade-and-register-counters.md new file mode 100644 index 0000000..46bf9a6 --- /dev/null +++ b/docs/specs/architecture/57-upgrade-and-register-counters.md @@ -0,0 +1,385 @@ +# Spec — `ws_upgrade_attempts_total` + `register_failures_total` counters (#57) + +## Files to read first + +- `internal/relay/server_endpoint.go:38-115` — the four terminal branches the + spec hooks: header-gate reject (line 43-46), `ErrServerIDConflict` (line + 65-77), success path (line 85-94), and an unreachable defensive branch + (line 78-83) the spec must NOT increment. +- `internal/relay/client_endpoint.go:37-117` — the five terminal branches: + header-gate reject (line 42-45), `ErrNoServer` (line 64-73), + `ErrPhonesAtCap` (line 74-80), defensive unreachable branch (line 81-83), + success path (line 85-97). +- `internal/relay/ratelimit_middleware.go` (whole file, ~40 lines) — the + deny path this spec observes via response status. Empty-IP guard (line + 27-31) and bucket-deny (line 32-36) both write `http.StatusTooManyRequests` + and are the ONLY HTTP-status-429 sources in either pipe. 
+- `internal/relay/metrics.go` (whole file, ~50 lines) — `NewMetricsRegistry` + and the ADR-0008 rule: "register against the registry returned here, + NEVER against `prometheus.DefaultRegisterer`". The + `TestMetricsRegistry_NoGlobalRegistrarLeak` invariant in `metrics_test.go` + is the structural enforcement. +- `internal/relay/metrics_forward.go` (whole file, ~45 lines) — pattern + reference for a counter-vector concern: pre-bound `prometheus.Counter` + fields on a small struct, single constructor that registers the vector + and binds the labels. This ticket follows that shape with two vectors. +- `internal/relay/metrics_connections.go:32-51` — the `direction`-label + pattern note: label values are hard-coded constants, never + attacker-influenced. Same constraint applies to `endpoint`, `outcome`, + and `kind` here. +- `internal/relay/metrics_counters_test.go:14-35` — `assertCounter` helper. + Same package, no duplication. New tests scrape via `NewMetricsHandler` and + substring-match `{} `. +- `internal/relay/server_endpoint_test.go:18-43` — `startServer` / `dialWith` + / `validHeaders` helpers. The new counter test reuses them. +- `internal/relay/client_endpoint_test.go:18-60` — `startClient` / + `startClientWithCap` / `dialWithClient` / `validClientHeaders` / + `seedBinary` helpers. The new counter test reuses them. +- `internal/relay/ratelimit_middleware_test.go:240-280` — shape reference + for testing rate-limit deny against a real handler. Adapt to assert + counter increments instead of registry pristineness. +- `cmd/pyrycode-relay/main.go:103-154` — current metrics-wiring + handler- + mounting block. New `NewUpgradeMetrics` call lands alongside + `NewConnectionsMetrics`/`NewForwardMetrics`/`NewGraceMetrics`; the two + endpoint mounts gain the new metrics parameter and a per-endpoint + rate-limit deny observer. 
+- `docs/specs/architecture/58-frame-and-grace-counters.md` § *Why + hooks-on-Registry, not parameter threading* — negatively relevant: that + spec rejected parameter threading because the increment sites lived in + code that doesn't (and per ADR-0008 § Scope of use must not) import + `prometheus`. This spec accepts a bounded parameter threading because + three of the five terminal sites (header-reject, success, rate-limit + deny) have NO Registry call to hook into. +- `docs/specs/architecture/59-metrics-scaffolding.md` § *Scope of use* — + the ADR-0008 rule reproduced. Load-bearing for the test invariant. +- `docs/PROJECT-MEMORY.md` line 24 — *"Validate at the envelope boundary, + not deeper."* Negatively relevant: the `endpoint`, `outcome`, and + `kind` label values are hard-coded constants, set at the call site, not + derived from request state. + +## Context + +Slice of the metrics rollout (split from #37). The metrics scaffolding +(#59) shipped `NewMetricsRegistry()` returning a private +`*prometheus.Registry`; #58 and #61 added the forward/grace/connected +metrics. This ticket adds the **upgrade-attempt counter** (labels +`endpoint`, `outcome`) and the **register-failure counter** (label `kind`) +at the `/v1/server` and `/v1/client` call sites. + +Per the ticket body, the label cardinality is fixed and small: 2 endpoints +× 6 outcomes + 4 register-failure kinds = 16 series total. **All 12 +endpoint×outcome combinations are pre-bound**, including the three +structurally unreachable ones (`server,reject_404`; `server,reject_429`; +`client,reject_409`), so the metric surface is stable and discoverable — +ops dashboards see zero-valued series rather than the series silently +missing. The AC pins this: "exposes **all 16 series** in Prometheus text +format" after the per-path exercise. + +### Two load-bearing design choices + +1. 
**Where does the rate-limit-deny increment live?** The rate-limit
+   middleware short-circuits before the handler runs; the handler cannot
+   observe the deny. Adding a hook parameter to `NewRateLimitMiddleware`
+   would cascade across 11 call sites (1 prod + 10 tests) — at the
+   edit-fan-out red line. This spec picks a **status-code observer
+   wrapper** stacked OUTSIDE the rate-limit middleware: HTTP 429 from the
+   wrapped pipe is the rate-limit-deny signal, full stop. The rate-limit
+   middleware is structurally the only HTTP-429 source in either pipe
+   (the handler writes 400 for header-gate; WS-close codes 4409/4404/4429
+   travel inside the 101-upgraded WebSocket, not as HTTP status). One
+   wrapper per endpoint carries the `endpoint` label as a hard-coded
+   constant.
+
+2. **How are the counters reached from the handlers?** Three of the five
+   per-endpoint terminal sites (header-reject, success, rate-limit-deny)
+   have no Registry call to hook through. Hooks-on-Registry (the #58
+   pattern) does not work here. This spec adds an exported
+   `*UpgradeMetrics` parameter to `ServerHandler` and `ClientHandler`.
+   The parameter is **nil-safe**: callers that don't care about metrics
+   (every existing test) pass `nil`. The signature change touches 9 call
+   sites in total — 7 test sites (4 for `ServerHandler`, 3 for
+   `ClientHandler`), each a mechanical `, nil` append, plus the 2
+   production mounts in `main.go` — strictly under the 10-call-site red
+   line for edit fan-out.
+
+## Design
+
+### Package layout
+
+Two new files, one modified middleware test, one modified main, two
+modified endpoint files, two modified endpoint test files.
+ +``` +internal/relay/ + metrics_upgrade.go (new) — UpgradeMetrics struct + constructor + observer middleware + metrics_upgrade_test.go (new) — table-driven per-terminal-path tests + 16-series scrape test + server_endpoint.go (modify) — add metrics parameter, 4 increment points + client_endpoint.go (modify) — add metrics parameter, 5 increment points + server_endpoint_test.go (modify) — 3 call sites: append `, nil` + client_endpoint_test.go (modify) — 3 call sites: append `, nil` + ratelimit_middleware_test.go (modify) — 1 call site: append `, nil` +cmd/pyrycode-relay/main.go (modify) — construct UpgradeMetrics, mount per-endpoint deny observer +``` + +### `UpgradeMetrics` — type and constructor (`metrics_upgrade.go`) + +Shape mirrors `forwardMetrics` (`metrics_forward.go:17-44`): + +- Exported type `UpgradeMetrics` with **unexported** pre-bound + `prometheus.Counter` fields — one per (endpoint, outcome) combination + in the full 12-cell cross product, one per `kind` in the 4-cell + register-failure space. The fields themselves are unexported because + callers reach them only through the event-named methods below; this + matches `forwardMetrics`. +- Constructor `NewUpgradeMetrics(reg prometheus.Registerer) *UpgradeMetrics` + registers two `*prometheus.CounterVec`s against `reg` (NOT + `prometheus.DefaultRegisterer` — the existing + `TestMetricsRegistry_NoGlobalRegistrarLeak` enforces this structurally) + and binds all 16 label combinations via `WithLabelValues`. +- Metric names and help strings: + - `pyrycode_relay_ws_upgrade_attempts_total` — "WebSocket upgrade + attempts by endpoint and outcome. One increment per terminal + decision; a single request never increments more than once." + - `pyrycode_relay_register_failures_total` — "Registry-side + registration failures by kind. Co-incremented with the matching + `ws_upgrade_attempts_total{outcome=reject_*}` series; their sum on + `kind` equals the sum on the matching `outcome` value." 
+ +- Event-named methods (nil-receiver safe — `if m == nil { return }` + at the top of each): + - `(*UpgradeMetrics).ServerAccept()` + - `(*UpgradeMetrics).ServerHeaderReject()` + - `(*UpgradeMetrics).ServerIDConflict()` — bumps `upgrade{server,reject_409}` AND `failures{kind=server_in_use}` + - `(*UpgradeMetrics).ClientAccept()` + - `(*UpgradeMetrics).ClientHeaderReject()` + - `(*UpgradeMetrics).ClientNoServer()` — bumps `upgrade{client,reject_404}` AND `failures{kind=no_server}` + - `(*UpgradeMetrics).ClientPhonesAtCap()` — bumps `upgrade{client,reject_429}` AND `failures{kind=phones_at_cap}` + - `(*UpgradeMetrics).ServerRateLimitDeny()` — bumps `upgrade{server,reject_rate_limit}` AND `failures{kind=ratelimit}` + - `(*UpgradeMetrics).ClientRateLimitDeny()` — bumps `upgrade{client,reject_rate_limit}` AND `failures{kind=ratelimit}` + +Method names encode the **event**, not the labels — same convention as +`forwardMetrics.phoneToBinary` / `binaryToPhone`. Composite methods are +the increment contract: register-failure and upgrade-outcome co-increment +in lockstep, so a single method bumps both. This makes the "their sum on +`kind` equals the sum on the matching `outcome`" invariant a property of +the method, not of the caller — tests can't accidentally bump one without +the other. + +### Rate-limit deny observer — `WrapRateLimitDeny` (`metrics_upgrade.go`) + +Small middleware factory; one call site per endpoint in `main.go`. Shape: + +``` +func (m *UpgradeMetrics) WrapServerRateLimitDeny(next http.Handler) http.Handler +func (m *UpgradeMetrics) WrapClientRateLimitDeny(next http.Handler) http.Handler +``` + +Both return an `http.Handler` that wraps `next` with a status-code +observer (an inner `http.ResponseWriter` that records `WriteHeader` +calls). If the wrapped pipe wrote `http.StatusTooManyRequests`, the +wrapper calls `m.ServerRateLimitDeny()` / `m.ClientRateLimitDeny()` +respectively. Nil-receiver safe. 
+ +Wiring order in `main.go`: + +``` +mux.Handle("/v1/server", + upgradeMetrics.WrapServerRateLimitDeny( + rateLimit( + relay.ServerHandler(reg, logger, 30*time.Second, maxFrameBytes, upgradeMetrics)))) +mux.Handle("/v1/client", + upgradeMetrics.WrapClientRateLimitDeny( + rateLimit( + relay.ClientHandler(reg, logger, maxFrameBytes, 16, upgradeMetrics)))) +``` + +The observer sits OUTSIDE the rate-limit middleware so it sees the +middleware's 429. It sits OUTSIDE the upgrade handler too, but the +handler never writes HTTP 429 — it writes HTTP 400 (header gate), +HTTP 101 (success path, regardless of subsequent WS close code), or no +HTTP status (4xx written by `websocket.Accept` itself on a malformed +upgrade). So the observer's "429 → rate-limit deny" interpretation is +unambiguous. + +The two `WrapXxxRateLimitDeny` methods are named-per-endpoint rather than +a single `WrapRateLimitDeny(endpoint string, next)` because the label +value MUST be a compile-time constant, not a string parameter — same +constraint as `metrics_forward.go`'s hard-coded `direction` values. + +### Handler increment placement + +Each increment fires **immediately after** the deciding branch is taken +and **before** any error return / WS close. This keeps "did this code +path execute?" structurally tied to "did the counter increment?" — the +linkage is one statement, no intermediate control flow. + +`server_endpoint.go`: + +- After the header-gate check (line 43-46): on the reject branch, call + `metrics.ServerHeaderReject()` between `http.Error` and `return`. +- After the `ClaimServer` `errors.Is(err, ErrServerIDConflict)` check + (line 65-77): call `metrics.ServerIDConflict()` between the close-code + write and the `logger.Info` line. (Compound method bumps both upgrade + outcome and register-failure kind.) +- The defensive `wsconn.Close()` branch at line 78-83 is unreachable + from the current registry — the spec MUST NOT increment there. 
A
+  future error variant added to `ClaimServer` would be a new outcome and
+  belongs in a follow-up ticket.
+- After `reg.ClaimServer` returns nil (line 85-88): call
+  `metrics.ServerAccept()` before the `logger.Info("server_claimed",
+  ...)` success log fires and before the `defer func() { ... }()` block.
+
+`client_endpoint.go`:
+
+- After the header-gate check (line 42-45): `metrics.ClientHeaderReject()`
+  before the `return`.
+- After `ErrNoServer` branch (line 64-73): `metrics.ClientNoServer()`
+  before the `return`.
+- After `ErrPhonesAtCap` branch (line 74-80): `metrics.ClientPhonesAtCap()`
+  before the `return`.
+- The defensive `wsconn.Close()` branch at line 81-83 is unreachable —
+  MUST NOT increment.
+- After `reg.RegisterPhoneCapped` returns nil (line 85-89):
+  `metrics.ClientAccept()` before the success log fires and before the
+  `defer func() { ... }()`.
+
+### Concurrency model
+
+`prometheus.Counter.Inc()` is documented thread-safe. No additional
+synchronization needed. The observer middleware's `WriteHeader`
+recording uses a per-request `*statusObserverResponseWriter` — request-
+scoped, no shared state across goroutines.
+
+### Error handling
+
+The counter increments cannot fail (panic-only on a malformed metric
+registration, which the constructor catches at boot via `MustRegister`).
+Per-request paths have no error surface to add — they are post-condition
+markers, not actions that can themselves fail.
+
+### Testing strategy — `metrics_upgrade_test.go`
+
+Five tests, all `t.Parallel()`-safe:
+
+1. **`TestUpgradeMetrics_ServerEndpoint_TerminalPaths`** — table-driven
+   over the four server terminal cases (`reject_headers`, `reject_409`,
+   `accept`, **and one synthetic 429 case driven via the deny observer**).
+ For each row: construct `NewRegistry()` + `NewMetricsRegistry()` + + `NewUpgradeMetrics(mreg)`; stand up an `httptest.NewServer` running + `ServerHandler(reg, logger, grace, maxFrameBytes, upgradeMetrics)` + wrapped in `WrapServerRateLimitDeny`; exercise the terminal path; + `assertCounter` on the exact label set with value 1, and on every + OTHER `(server, *)` label set with value 0. The "and 0 elsewhere" + half is what proves no double-increment on a single request. +2. **`TestUpgradeMetrics_ClientEndpoint_TerminalPaths`** — same shape + over the five client terminal cases. The `reject_404` row dials + without seeding a binary; the `reject_429` row seeds a binary, + registers `maxPhones` phones first via `startClientWithCap`, then + dials once more. +3. **`TestUpgradeMetrics_RegisterFailures_CoIncrement`** — pin the + AC invariant that + `register_failures_total{kind=server_in_use}` increments **exactly + when** `ws_upgrade_attempts_total{endpoint=server,outcome=reject_409}` + does, and the same for `no_server`, `phones_at_cap`, `ratelimit`. + Drive one terminal of each kind, scrape, assert the matched pairs + step in lockstep. +4. **`TestUpgradeMetrics_AllSixteenSeries_Exposed`** — after a synthetic + exercise of all 9 reachable terminal paths (the unreachable 3 are + pre-bound to zero by the constructor), scrape via + `NewMetricsHandler(mreg)` and assert the response body contains a + line for **each of the 16 expected series**. Use the existing + `assertCounter` helper; the unreachable ones are asserted at value 0. +5. **`TestUpgradeMetrics_NoGlobalRegistrarLeak`** — invoke + `NewUpgradeMetrics(NewMetricsRegistry())`; before/after snapshot of + `prometheus.DefaultGatherer.Gather()` length is unchanged. Same shape + as `TestMetricsRegistry_NoGlobalRegistrarLeak`. (This is also covered + transitively by that existing test if it picks up our new + constructor, but pinning it locally is cheap and survives future + refactors of `metrics_test.go`.) 
+ +### Test fan-out (existing tests) + +Mechanical `, nil` appends, no behavior change: + +- `server_endpoint_test.go` lines 25, 106, 339 — three sites. +- `client_endpoint_test.go` lines 25, 36, 125 — three sites. +- `ratelimit_middleware_test.go` line 263 — one site. + +Total 7 mechanical edits. Per the architect playbook's "no mechanical +edits escape" rule, raw count of edited call sites for the +`ServerHandler` + `ClientHandler` signature change is 9 (the 7 above + +2 in `cmd/pyrycode-relay/main.go`), strictly under the 10-call-site red +line. The `NewRateLimitMiddleware` signature is intentionally NOT changed +— that's the load-bearing reason the deny observer is a wrapper rather +than a middleware-internal hook (avoids 11-call-site fan-out). + +## Open questions + +- **Should the observer also fire on header-gate-induced 400s?** No — + the AC defines `reject_headers` as the header-gate outcome, and the + handler itself increments that branch. The observer's 429-only filter + is intentional. +- **Should `WrapServerRateLimitDeny` exist on `*UpgradeMetrics` or as a + free function?** Method on `*UpgradeMetrics` — keeps the per-endpoint + label-binding co-located with the constructor that bound it, and means + the wiring site reads `upgradeMetrics.WrapServerRateLimitDeny(...)` + (clear: "wrap with the deny counter from these metrics") rather than + `relay.WrapServerRateLimitDeny(upgradeMetrics, ...)` (one indirection + more, and ambiguous about whose counters). +- **The `register_failures_total` `kind` label space** is defined as + closed (4 values). If a future ticket introduces a fifth registry-side + failure, the closed-label contract requires (a) updating the + constructor's `WithLabelValues` calls, (b) updating + `TestUpgradeMetrics_AllSixteenSeries_Exposed`'s expected-series list, + and (c) updating the AC. The `ws_upgrade_attempts_total` `outcome` + label is similarly closed. Both lists belong in the godoc on + `NewUpgradeMetrics`. 
+ +## Security review + +**Verdict:** PASS + +**Findings:** + +- **[1. Trust boundaries]** No findings. The handlers' trust boundary is + the header-gate check at the top of `ServerHandler` / `ClientHandler` + (unchanged by this spec). All counter increments fire AFTER the + deciding branch is taken, so no attacker-influenced data reaches the + metric system as a value (counters increment; they don't carry + payloads). +- **[2. Tokens, secrets, credentials]** No findings — this spec touches + no tokens. `X-Pyrycode-Token` remains unread/unlogged/uncompared per + `client_endpoint.go:46-47`'s existing contract. +- **[3. File operations]** N/A — no filesystem code paths. +- **[4. Subprocess / external command execution]** N/A. +- **[5. Cryptographic primitives]** N/A. +- **[6. Network & I/O]** No findings. Input-size caps (`maxFrameBytes`), + header validation, and timeout discipline are unchanged. The deny + observer's `statusObserverResponseWriter` does not buffer the response + body — it only records the status code passed to `WriteHeader` (one + `int` per request). No new memory pressure or DoS vector. +- **[7. Error messages, logs, telemetry]** **MUST FIX prevented at the + design level**: all three new labels (`endpoint`, `outcome`, `kind`) + are **hard-coded constants** at the call site, set by the + event-named methods. None are derived from request headers, request + paths, or any other attacker-influenced input. Same load-bearing + constraint as `metrics_forward.go`'s `direction` label, with the same + rationale (`docs/specs/architecture/58-frame-and-grace-counters.md` + § Security review § Tokens): a label whose value comes from a + user-controlled header explodes cardinality (cost) and exposes the + header value to anyone scraping `/metrics` (information leak). The + fixed 16-series cardinality is recorded in this spec's Context as + "label cardinality budget", auditable by reading the constructor. +- **[8. Concurrency]** No findings. 
`prometheus.Counter.Inc()` is + thread-safe by contract. The observer middleware's + `statusObserverResponseWriter` is request-scoped — one per + `ServeHTTP` call, never shared. +- **[9. Threat model alignment]** Covers `docs/threat-model.md`'s + observability requirement: the operator can distinguish the four + "flood" patterns called out in the user story (malformed headers, 4409 + conflicts, 4404/4429 phone failures, rate-limited clients) without + log-grepping. No new exposure to the threat-model adversary — + `/metrics` continues to be served on a separate loopback listener + (`metricsListen` in `main.go`), so the new counters are not reachable + from the internet-exposed listener. + +**Reviewer:** architect (self-review per `architect/security-review.md`) +**Date:** 2026-05-13 diff --git a/internal/relay/client_endpoint.go b/internal/relay/client_endpoint.go index 7659e4c..9aac65e 100644 --- a/internal/relay/client_endpoint.go +++ b/internal/relay/client_endpoint.go @@ -34,13 +34,17 @@ import ( // server-id at one time; an over-cap registration is rejected with WS close // code 4429. maxPhones <= 0 disables the cap (used by tests that do not // exercise the boundary). -func ClientHandler(reg *Registry, logger *slog.Logger, maxFrameBytes int64, maxPhones int) http.Handler { +// +// metrics is nil-safe: callers that don't observe upgrade outcomes pass +// nil and every counter call no-ops. 
+func ClientHandler(reg *Registry, logger *slog.Logger, maxFrameBytes int64, maxPhones int, metrics *UpgradeMetrics) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { serverID := r.Header.Get("X-Pyrycode-Server") token := r.Header.Get("X-Pyrycode-Token") userAgent := r.Header.Get("User-Agent") if serverID == "" || token == "" || userAgent == "" { http.Error(w, "", http.StatusBadRequest) + metrics.ClientHeaderReject() return } // token goes out of scope here unread: never parsed, compared, or @@ -66,6 +70,7 @@ func ClientHandler(reg *Registry, logger *slog.Logger, maxFrameBytes int64, maxP // *websocket.Conn so the 4404 application code reaches the // wire (WSConn.Close always emits StatusNormalClosure). _ = c.Close(websocket.StatusCode(4404), "no server with that id") + metrics.ClientNoServer() logger.Info("phone_register_no_server", "server_id", serverID, "remote", remoteHost(r)) @@ -73,6 +78,7 @@ func ClientHandler(reg *Registry, logger *slog.Logger, maxFrameBytes int64, maxP } if errors.Is(err, ErrPhonesAtCap) { _ = c.Close(websocket.StatusCode(4429), "too many phones for server-id") + metrics.ClientPhonesAtCap() logger.Info("phone_register_at_cap", "server_id", serverID, "remote", remoteHost(r)) @@ -82,6 +88,7 @@ func ClientHandler(reg *Registry, logger *slog.Logger, maxFrameBytes int64, maxP return } + metrics.ClientAccept() logger.Info("phone_registered", "server_id", serverID, "conn_id", connID, diff --git a/internal/relay/client_endpoint_test.go b/internal/relay/client_endpoint_test.go index e94a15f..d0c95db 100644 --- a/internal/relay/client_endpoint_test.go +++ b/internal/relay/client_endpoint_test.go @@ -22,7 +22,7 @@ func startClient(t *testing.T) (*Registry, string, func()) { t.Helper() reg := NewRegistry() logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, 0)) + srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, 0, nil)) wsURL := "ws" 
+ strings.TrimPrefix(srv.URL, "http") return reg, wsURL, srv.Close } @@ -33,7 +33,7 @@ func startClientWithCap(t *testing.T, maxPhones int) (*Registry, string, func()) t.Helper() reg := NewRegistry() logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, maxPhones)) + srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, maxPhones, nil)) wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") return reg, wsURL, srv.Close } @@ -122,7 +122,7 @@ func TestClientEndpoint_HeaderGate_400(t *testing.T) { t.Run(tc.name, func(t *testing.T) { reg := NewRegistry() logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, 0)) + srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, 0, nil)) defer srv.Close() req, err := http.NewRequest(http.MethodGet, srv.URL+"/", nil) diff --git a/internal/relay/metrics_upgrade.go b/internal/relay/metrics_upgrade.go new file mode 100644 index 0000000..cb71f79 --- /dev/null +++ b/internal/relay/metrics_upgrade.go @@ -0,0 +1,258 @@ +package relay + +import ( + "bufio" + "net" + "net/http" + + "github.com/prometheus/client_golang/prometheus" +) + +// UpgradeMetrics emits two Prometheus CounterVecs at the WebSocket upgrade +// call sites: +// +// - pyrycode_relay_ws_upgrade_attempts_total{endpoint,outcome} — one +// increment per terminal decision in ServerHandler / ClientHandler. +// endpoint ∈ {server, client}; outcome ∈ {accept, reject_headers, +// reject_409, reject_404, reject_429, reject_rate_limit}. All 12 +// combinations are pre-bound by NewUpgradeMetrics; three are +// structurally unreachable (server×{reject_404, reject_429}, +// client×reject_409) but are exposed as zero-valued series so ops +// dashboards see a stable shape. +// +// - pyrycode_relay_register_failures_total{kind} — co-incremented with +// the matching upgrade-outcome series. 
kind ∈ {no_server, +// server_in_use, phones_at_cap, ratelimit}. +// +// Label values are HARD-CODED CONSTANTS set per event-named method. +// None are derived from request headers, request paths, or any other +// attacker-influenced input — same load-bearing constraint as +// metrics_forward.go's direction label +// (docs/specs/architecture/58-frame-and-grace-counters.md § Security +// review § Tokens). The closed-label contract is: extending either +// vector requires updating this constructor, the +// TestUpgradeMetrics_AllSixteenSeries_Exposed expected-series list, +// and the doc comment above. +// +// Methods are nil-receiver safe: callers that don't care about metrics +// pass nil to ServerHandler / ClientHandler / the Wrap* helpers, and +// every counter call is a one-line no-op. +type UpgradeMetrics struct { + serverAccept prometheus.Counter + serverHeaderReject prometheus.Counter + serverIDConflict prometheus.Counter + serverRateLimitDeny prometheus.Counter + + clientAccept prometheus.Counter + clientHeaderReject prometheus.Counter + clientNoServer prometheus.Counter + clientPhonesAtCap prometheus.Counter + clientRateLimitDeny prometheus.Counter + + failServerInUse prometheus.Counter + failNoServer prometheus.Counter + failPhonesAtCap prometheus.Counter + failRateLimit prometheus.Counter +} + +// NewUpgradeMetrics registers the two counter vectors against reg and +// pre-binds all 16 label combinations via WithLabelValues so the entire +// metric surface is visible at scrape time even before the first +// upgrade attempt. The structurally unreachable cells +// (server×{reject_404, reject_429}, client×reject_409) are pre-bound to +// zero and have no method to advance them — a future error variant +// would need both an event-named method and a spec update. +// +// Wiring site: cmd/pyrycode-relay/main.go, alongside NewConnectionsMetrics +// / NewForwardMetrics / NewGraceMetrics. Boot-time only. 
+func NewUpgradeMetrics(reg prometheus.Registerer) *UpgradeMetrics { + upgradeVec := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "pyrycode_relay_ws_upgrade_attempts_total", + Help: "WebSocket upgrade attempts by endpoint and outcome. One increment per terminal decision; a single request never increments more than once.", + }, + []string{"endpoint", "outcome"}, + ) + reg.MustRegister(upgradeVec) + + failureVec := prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "pyrycode_relay_register_failures_total", + Help: "Registry-side registration failures by kind. Co-incremented with the matching pyrycode_relay_ws_upgrade_attempts_total{outcome=reject_*} series; their sum on kind equals the sum on the matching outcome value.", + }, + []string{"kind"}, + ) + reg.MustRegister(failureVec) + + // Pre-bind the unreachable cells so the scrape exposes them at 0. + // They have no stored field because no event-named method points at + // them — extending either vector requires both a method here and a + // spec/AC update. 
+ _ = upgradeVec.WithLabelValues("server", "reject_404") + _ = upgradeVec.WithLabelValues("server", "reject_429") + _ = upgradeVec.WithLabelValues("client", "reject_409") + + return &UpgradeMetrics{ + serverAccept: upgradeVec.WithLabelValues("server", "accept"), + serverHeaderReject: upgradeVec.WithLabelValues("server", "reject_headers"), + serverIDConflict: upgradeVec.WithLabelValues("server", "reject_409"), + serverRateLimitDeny: upgradeVec.WithLabelValues("server", "reject_rate_limit"), + + clientAccept: upgradeVec.WithLabelValues("client", "accept"), + clientHeaderReject: upgradeVec.WithLabelValues("client", "reject_headers"), + clientNoServer: upgradeVec.WithLabelValues("client", "reject_404"), + clientPhonesAtCap: upgradeVec.WithLabelValues("client", "reject_429"), + clientRateLimitDeny: upgradeVec.WithLabelValues("client", "reject_rate_limit"), + + failServerInUse: failureVec.WithLabelValues("server_in_use"), + failNoServer: failureVec.WithLabelValues("no_server"), + failPhonesAtCap: failureVec.WithLabelValues("phones_at_cap"), + failRateLimit: failureVec.WithLabelValues("ratelimit"), + } +} + +// ServerAccept records a successful /v1/server upgrade. +func (m *UpgradeMetrics) ServerAccept() { + if m == nil { + return + } + m.serverAccept.Inc() +} + +// ServerHeaderReject records a /v1/server header-gate rejection (HTTP 400). +func (m *UpgradeMetrics) ServerHeaderReject() { + if m == nil { + return + } + m.serverHeaderReject.Inc() +} + +// ServerIDConflict records a /v1/server upgrade rejected with WS close +// 4409 (server-id already claimed). Composite: increments both the +// endpoint×outcome and the register-failure kind in lockstep. +func (m *UpgradeMetrics) ServerIDConflict() { + if m == nil { + return + } + m.serverIDConflict.Inc() + m.failServerInUse.Inc() +} + +// ServerRateLimitDeny records a /v1/server attempt denied by the per-IP +// rate-limit middleware (HTTP 429). 
Composite: bumps both the +// endpoint×outcome and the register-failure kind=ratelimit. +func (m *UpgradeMetrics) ServerRateLimitDeny() { + if m == nil { + return + } + m.serverRateLimitDeny.Inc() + m.failRateLimit.Inc() +} + +// ClientAccept records a successful /v1/client upgrade. +func (m *UpgradeMetrics) ClientAccept() { + if m == nil { + return + } + m.clientAccept.Inc() +} + +// ClientHeaderReject records a /v1/client header-gate rejection (HTTP 400). +func (m *UpgradeMetrics) ClientHeaderReject() { + if m == nil { + return + } + m.clientHeaderReject.Inc() +} + +// ClientNoServer records a /v1/client upgrade rejected with WS close +// 4404 (no binary for server-id). Composite. +func (m *UpgradeMetrics) ClientNoServer() { + if m == nil { + return + } + m.clientNoServer.Inc() + m.failNoServer.Inc() +} + +// ClientPhonesAtCap records a /v1/client upgrade rejected with WS close +// 4429 (phones-per-server-id cap exceeded, per #30). Composite. +func (m *UpgradeMetrics) ClientPhonesAtCap() { + if m == nil { + return + } + m.clientPhonesAtCap.Inc() + m.failPhonesAtCap.Inc() +} + +// ClientRateLimitDeny records a /v1/client attempt denied by the per-IP +// rate-limit middleware (HTTP 429). Composite. +func (m *UpgradeMetrics) ClientRateLimitDeny() { + if m == nil { + return + } + m.clientRateLimitDeny.Inc() + m.failRateLimit.Inc() +} + +// WrapServerRateLimitDeny returns an http.Handler that wraps next with a +// status-code observer. If the wrapped pipe writes +// http.StatusTooManyRequests, ServerRateLimitDeny is invoked once per +// request. The rate-limit middleware is the only HTTP-429 source in the +// /v1/server pipe — ServerHandler writes HTTP 400 on header-gate, HTTP +// 101 on success, and WS application close codes (4409) travel inside +// the upgraded WebSocket, not as HTTP status. Nil-receiver safe; a nil +// receiver returns next unchanged. 
+func (m *UpgradeMetrics) WrapServerRateLimitDeny(next http.Handler) http.Handler { + if m == nil { + return next + } + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + obs := &statusObserverResponseWriter{ResponseWriter: w} + next.ServeHTTP(obs, r) + if obs.status == http.StatusTooManyRequests { + m.ServerRateLimitDeny() + } + }) +} + +// WrapClientRateLimitDeny is the /v1/client counterpart of +// WrapServerRateLimitDeny. +func (m *UpgradeMetrics) WrapClientRateLimitDeny(next http.Handler) http.Handler { + if m == nil { + return next + } + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + obs := &statusObserverResponseWriter{ResponseWriter: w} + next.ServeHTTP(obs, r) + if obs.status == http.StatusTooManyRequests { + m.ClientRateLimitDeny() + } + }) +} + +// statusObserverResponseWriter records the HTTP status code passed to +// WriteHeader and forwards every other ResponseWriter method to the +// underlying writer. Request-scoped; one instance per ServeHTTP call. +// Implements http.Hijacker so the downstream WS-upgrade path can still +// hijack the connection on the success branch — websocket.Accept type- +// asserts the ResponseWriter for http.Hijacker. 
+type statusObserverResponseWriter struct { + http.ResponseWriter + status int +} + +func (s *statusObserverResponseWriter) WriteHeader(code int) { + if s.status == 0 { + s.status = code + } + s.ResponseWriter.WriteHeader(code) +} + +func (s *statusObserverResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) { + h, ok := s.ResponseWriter.(http.Hijacker) + if !ok { + return nil, nil, http.ErrNotSupported + } + return h.Hijack() +} diff --git a/internal/relay/metrics_upgrade_test.go b/internal/relay/metrics_upgrade_test.go new file mode 100644 index 0000000..99223d1 --- /dev/null +++ b/internal/relay/metrics_upgrade_test.go @@ -0,0 +1,455 @@ +package relay + +import ( + "context" + "errors" + "io" + "log/slog" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" + + "nhooyr.io/websocket" +) + +// pollUntil polls fn every 5ms until it returns true or deadline passes. +// Wraps the same shape used in server_endpoint_test.go to keep this file +// self-contained (no new test-helper file just for a 5-line loop). +func pollUntil(deadline time.Time, fn func() bool) bool { + for time.Now().Before(deadline) { + if fn() { + return true + } + time.Sleep(5 * time.Millisecond) + } + return fn() +} + +// newUpgradeBundle returns the full per-test wiring. The metrics +// registry is private to this bundle, so counter assertions are +// hermetic across t.Parallel runs. +func newUpgradeBundle() (*Registry, *UpgradeMetrics, http.Handler) { + reg := NewRegistry() + mreg := NewMetricsRegistry() + m := NewUpgradeMetrics(mreg) + return reg, m, NewMetricsHandler(mreg) +} + +// allServerOutcomes / allClientOutcomes pin the 6-outcome label +// surface. Tests assert "exactly one cell at N, every other cell at 0" +// against these slices so that a future outcome added without a +// corresponding test exposure surfaces as a missing-cell zero +// assertion. 
+var allServerOutcomes = []string{ + "accept", "reject_headers", "reject_409", "reject_404", "reject_429", "reject_rate_limit", +} +var allClientOutcomes = []string{ + "accept", "reject_headers", "reject_409", "reject_404", "reject_429", "reject_rate_limit", +} +var allFailureKinds = []string{ + "no_server", "server_in_use", "phones_at_cap", "ratelimit", +} + +// assertServerOutcomes asserts every (server, outcome) cell against +// the want map; missing keys are asserted as 0. Same shape for client. +func assertServerOutcomes(t *testing.T, h http.Handler, want map[string]int) { + t.Helper() + for _, outcome := range allServerOutcomes { + v := want[outcome] + assertCounter(t, h, "pyrycode_relay_ws_upgrade_attempts_total", + `endpoint="server",outcome="`+outcome+`"`, v) + } +} + +func assertClientOutcomes(t *testing.T, h http.Handler, want map[string]int) { + t.Helper() + for _, outcome := range allClientOutcomes { + v := want[outcome] + assertCounter(t, h, "pyrycode_relay_ws_upgrade_attempts_total", + `endpoint="client",outcome="`+outcome+`"`, v) + } +} + +func assertFailureKinds(t *testing.T, h http.Handler, want map[string]int) { + t.Helper() + for _, kind := range allFailureKinds { + v := want[kind] + assertCounter(t, h, "pyrycode_relay_register_failures_total", + `kind="`+kind+`"`, v) + } +} + +func TestUpgradeMetrics_ServerEndpoint_TerminalPaths(t *testing.T) { + t.Parallel() + + t.Run("accept", func(t *testing.T) { + t.Parallel() + reg, m, scrape := newUpgradeBundle() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ServerHandler(reg, logger, 100*time.Millisecond, 256*1024, m)) + defer srv.Close() + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: validHeaders("s1")}) + if err != nil { + t.Fatalf("dial: %v", err) + } + defer 
c.Close(websocket.StatusNormalClosure, "") + + if !pollUntil(time.Now().Add(time.Second), func() bool { + _, ok := reg.BinaryFor("s1") + return ok + }) { + t.Fatalf("BinaryFor(s1) never registered") + } + + assertServerOutcomes(t, scrape, map[string]int{"accept": 1}) + assertFailureKinds(t, scrape, nil) + }) + + t.Run("reject_headers", func(t *testing.T) { + t.Parallel() + reg, m, scrape := newUpgradeBundle() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ServerHandler(reg, logger, 100*time.Millisecond, 256*1024, m)) + defer srv.Close() + + // Missing X-Pyrycode-Server → header-gate 400. + req, _ := http.NewRequest(http.MethodGet, srv.URL+"/", nil) + req.Header.Set("X-Pyrycode-Version", "0.1.0-test") + req.Header.Set("User-Agent", "pyry-test/0.1.0") + resp, err := (&http.Client{Timeout: 5 * time.Second}).Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("status = %d, want 400", resp.StatusCode) + } + + assertServerOutcomes(t, scrape, map[string]int{"reject_headers": 1}) + assertFailureKinds(t, scrape, nil) + _ = reg + }) + + t.Run("reject_409", func(t *testing.T) { + t.Parallel() + reg, m, scrape := newUpgradeBundle() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ServerHandler(reg, logger, 100*time.Millisecond, 256*1024, m)) + defer srv.Close() + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c1, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: validHeaders("s1")}) + if err != nil { + t.Fatalf("dial #1: %v", err) + } + defer c1.Close(websocket.StatusNormalClosure, "") + if !pollUntil(time.Now().Add(time.Second), func() bool { + _, ok := reg.BinaryFor("s1") + return ok + }) { + t.Fatalf("first claim did not register") + } + + c2, _, err := websocket.Dial(ctx, wsURL, 
&websocket.DialOptions{HTTPHeader: validHeaders("s1")}) + if err != nil { + t.Fatalf("dial #2: %v", err) + } + defer c2.Close(websocket.StatusNormalClosure, "") + readCtx, cancelRead := context.WithTimeout(context.Background(), 5*time.Second) + defer cancelRead() + _, _, readErr := c2.Read(readCtx) + var ce websocket.CloseError + if !errors.As(readErr, &ce) || ce.Code != websocket.StatusCode(4409) { + t.Fatalf("close err = %v, want 4409", readErr) + } + + assertServerOutcomes(t, scrape, map[string]int{"accept": 1, "reject_409": 1}) + assertFailureKinds(t, scrape, map[string]int{"server_in_use": 1}) + }) + + t.Run("reject_rate_limit", func(t *testing.T) { + t.Parallel() + reg, m, scrape := newUpgradeBundle() + _ = reg + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + // Burst 1 → the second request from the same IP is denied with + // HTTP 429 BEFORE the upgrade handler is reached. The deny + // observer sits OUTSIDE the rate-limit middleware so the 429 + // from the middleware is what the observer sees. + limiter := newMiddlewareTestLimiter(t, 1) + rateLimit := NewRateLimitMiddleware(limiter, logger, false) + // Real ServerHandler in the inner position so the wiring shape + // matches main.go; the bucket-deny ensures it is never reached. + inner := rateLimit(ServerHandler(NewRegistry(), logger, time.Second, 256*1024, m)) + wrapped := m.WrapServerRateLimitDeny(inner) + + // One allowed (the inner ServerHandler will 400 on missing + // headers — the observer does NOT increment on that), one + // denied. 
+ rr1 := httptest.NewRecorder() + wrapped.ServeHTTP(rr1, newMiddlewareRequest("1.1.1.1:1", "")) + rr2 := httptest.NewRecorder() + wrapped.ServeHTTP(rr2, newMiddlewareRequest("1.1.1.1:1", "")) + if rr2.Code != http.StatusTooManyRequests { + t.Fatalf("denied attempt: status = %d, want 429", rr2.Code) + } + + assertServerOutcomes(t, scrape, map[string]int{ + "reject_headers": 1, // the first attempt header-gated + "reject_rate_limit": 1, // the second attempt rate-limited + }) + assertFailureKinds(t, scrape, map[string]int{"ratelimit": 1}) + }) +} + +func TestUpgradeMetrics_ClientEndpoint_TerminalPaths(t *testing.T) { + t.Parallel() + + t.Run("accept", func(t *testing.T) { + t.Parallel() + reg, m, scrape := newUpgradeBundle() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, 0, m)) + defer srv.Close() + seedBinary(t, reg, "s1") + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: validClientHeaders("s1")}) + if err != nil { + t.Fatalf("dial: %v", err) + } + defer c.Close(websocket.StatusNormalClosure, "") + waitForPhones(t, reg, "s1", 1, time.Second) + + assertClientOutcomes(t, scrape, map[string]int{"accept": 1}) + assertFailureKinds(t, scrape, nil) + }) + + t.Run("reject_headers", func(t *testing.T) { + t.Parallel() + reg, m, scrape := newUpgradeBundle() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, 0, m)) + defer srv.Close() + + req, _ := http.NewRequest(http.MethodGet, srv.URL+"/", nil) + req.Header.Set("X-Pyrycode-Token", "phone-token-opaque") + req.Header.Set("User-Agent", "pyry-test/0.1.0") + resp, err := (&http.Client{Timeout: 5 * time.Second}).Do(req) + if err != nil { + t.Fatalf("do: %v", err) + } + defer resp.Body.Close() + if 
resp.StatusCode != http.StatusBadRequest { + t.Fatalf("status = %d, want 400", resp.StatusCode) + } + + assertClientOutcomes(t, scrape, map[string]int{"reject_headers": 1}) + assertFailureKinds(t, scrape, nil) + _ = reg + }) + + t.Run("reject_404", func(t *testing.T) { + t.Parallel() + reg, m, scrape := newUpgradeBundle() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, 0, m)) + defer srv.Close() + // No seedBinary — RegisterPhoneCapped returns ErrNoServer. + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + c, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: validClientHeaders("s1")}) + if err != nil { + t.Fatalf("dial: %v", err) + } + defer c.Close(websocket.StatusNormalClosure, "") + readCtx, cancelRead := context.WithTimeout(context.Background(), 5*time.Second) + defer cancelRead() + _, _, readErr := c.Read(readCtx) + var ce websocket.CloseError + if !errors.As(readErr, &ce) || ce.Code != websocket.StatusCode(4404) { + t.Fatalf("close err = %v, want 4404", readErr) + } + + assertClientOutcomes(t, scrape, map[string]int{"reject_404": 1}) + assertFailureKinds(t, scrape, map[string]int{"no_server": 1}) + _ = reg + }) + + t.Run("reject_429", func(t *testing.T) { + t.Parallel() + const cap = 1 + reg, m, scrape := newUpgradeBundle() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + srv := httptest.NewServer(ClientHandler(reg, logger, 256*1024, cap, m)) + defer srv.Close() + seedBinary(t, reg, "s1") + wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + // First phone fills the cap. 
+ c1, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: validClientHeaders("s1")}) + if err != nil { + t.Fatalf("dial first: %v", err) + } + defer c1.Close(websocket.StatusNormalClosure, "") + waitForPhones(t, reg, "s1", cap, time.Second) + + // Second phone trips ErrPhonesAtCap. + c2, _, err := websocket.Dial(ctx, wsURL, &websocket.DialOptions{HTTPHeader: validClientHeaders("s1")}) + if err != nil { + t.Fatalf("dial over-cap: %v", err) + } + defer c2.Close(websocket.StatusNormalClosure, "") + readCtx, cancelRead := context.WithTimeout(context.Background(), 5*time.Second) + defer cancelRead() + _, _, readErr := c2.Read(readCtx) + var ce websocket.CloseError + if !errors.As(readErr, &ce) || ce.Code != websocket.StatusCode(4429) { + t.Fatalf("close err = %v, want 4429", readErr) + } + + assertClientOutcomes(t, scrape, map[string]int{ + "accept": 1, // first phone + "reject_429": 1, // second phone + }) + assertFailureKinds(t, scrape, map[string]int{"phones_at_cap": 1}) + }) + + t.Run("reject_rate_limit", func(t *testing.T) { + t.Parallel() + _, m, scrape := newUpgradeBundle() + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + limiter := newMiddlewareTestLimiter(t, 1) + rateLimit := NewRateLimitMiddleware(limiter, logger, false) + inner := rateLimit(ClientHandler(NewRegistry(), logger, 256*1024, 0, m)) + wrapped := m.WrapClientRateLimitDeny(inner) + + rr1 := httptest.NewRecorder() + wrapped.ServeHTTP(rr1, newMiddlewareRequest("1.1.1.1:1", "")) + rr2 := httptest.NewRecorder() + wrapped.ServeHTTP(rr2, newMiddlewareRequest("1.1.1.1:1", "")) + if rr2.Code != http.StatusTooManyRequests { + t.Fatalf("denied attempt: status = %d, want 429", rr2.Code) + } + + assertClientOutcomes(t, scrape, map[string]int{ + "reject_headers": 1, + "reject_rate_limit": 1, + }) + assertFailureKinds(t, scrape, map[string]int{"ratelimit": 1}) + }) +} + +// TestUpgradeMetrics_RegisterFailures_CoIncrement pins the AC invariant +// that 
pyrycode_relay_register_failures_total{kind} co-increments in +// lockstep with the matching pyrycode_relay_ws_upgrade_attempts_total +// {endpoint,outcome=reject_*} cell — the composite event-named methods +// are the increment contract, so a single method bumps both. +func TestUpgradeMetrics_RegisterFailures_CoIncrement(t *testing.T) { + t.Parallel() + _, m, scrape := newUpgradeBundle() + + // Drive each failure kind once via the event methods (no handler + // machinery needed — the methods are the contract under test). + m.ServerIDConflict() + m.ClientNoServer() + m.ClientPhonesAtCap() + m.ServerRateLimitDeny() + m.ClientRateLimitDeny() + + // server_in_use ↔ server,reject_409 + assertCounter(t, scrape, "pyrycode_relay_register_failures_total", `kind="server_in_use"`, 1) + assertCounter(t, scrape, "pyrycode_relay_ws_upgrade_attempts_total", `endpoint="server",outcome="reject_409"`, 1) + // no_server ↔ client,reject_404 + assertCounter(t, scrape, "pyrycode_relay_register_failures_total", `kind="no_server"`, 1) + assertCounter(t, scrape, "pyrycode_relay_ws_upgrade_attempts_total", `endpoint="client",outcome="reject_404"`, 1) + // phones_at_cap ↔ client,reject_429 + assertCounter(t, scrape, "pyrycode_relay_register_failures_total", `kind="phones_at_cap"`, 1) + assertCounter(t, scrape, "pyrycode_relay_ws_upgrade_attempts_total", `endpoint="client",outcome="reject_429"`, 1) + // ratelimit ↔ {server,client},reject_rate_limit (sum on kind = sum + // on the matching outcome value). 
+ assertCounter(t, scrape, "pyrycode_relay_register_failures_total", `kind="ratelimit"`, 2) + assertCounter(t, scrape, "pyrycode_relay_ws_upgrade_attempts_total", `endpoint="server",outcome="reject_rate_limit"`, 1) + assertCounter(t, scrape, "pyrycode_relay_ws_upgrade_attempts_total", `endpoint="client",outcome="reject_rate_limit"`, 1) +} + +// TestUpgradeMetrics_AllSixteenSeries_Exposed asserts the full label +// surface is visible after a per-method exercise: 12 endpoint×outcome +// cells (9 reachable advanced to 1, 3 structurally unreachable at 0) +// plus 4 failure-kind cells (all advanced to 1) = 16 series. +func TestUpgradeMetrics_AllSixteenSeries_Exposed(t *testing.T) { + t.Parallel() + _, m, scrape := newUpgradeBundle() + + // Advance every reachable method exactly once. The composite + // methods take care of the failure-kind cells. + m.ServerAccept() + m.ServerHeaderReject() + m.ServerIDConflict() + m.ServerRateLimitDeny() + m.ClientAccept() + m.ClientHeaderReject() + m.ClientNoServer() + m.ClientPhonesAtCap() + m.ClientRateLimitDeny() + + // 9 reachable endpoint×outcome cells at 1. + assertServerOutcomes(t, scrape, map[string]int{ + "accept": 1, + "reject_headers": 1, + "reject_409": 1, + "reject_rate_limit": 1, + }) + assertClientOutcomes(t, scrape, map[string]int{ + "accept": 1, + "reject_headers": 1, + "reject_404": 1, + "reject_429": 1, + "reject_rate_limit": 1, + }) + + // 3 structurally unreachable cells exposed at 0. + assertCounter(t, scrape, "pyrycode_relay_ws_upgrade_attempts_total", `endpoint="server",outcome="reject_404"`, 0) + assertCounter(t, scrape, "pyrycode_relay_ws_upgrade_attempts_total", `endpoint="server",outcome="reject_429"`, 0) + assertCounter(t, scrape, "pyrycode_relay_ws_upgrade_attempts_total", `endpoint="client",outcome="reject_409"`, 0) + + // 4 failure-kind cells at the totals implied by the composite + // methods: server_in_use=1, no_server=1, phones_at_cap=1, ratelimit=2. 
+ assertFailureKinds(t, scrape, map[string]int{ + "server_in_use": 1, + "no_server": 1, + "phones_at_cap": 1, + "ratelimit": 2, + }) +} + +// TestUpgradeMetrics_NoGlobalRegistrarLeak pins ADR-0008 § Scope of use +// locally — same shape as TestMetricsRegistry_NoGlobalRegistrarLeak. +// Constructing the upgrade vectors against a private registry MUST NOT +// touch prometheus.DefaultRegisterer. +func TestUpgradeMetrics_NoGlobalRegistrarLeak(t *testing.T) { + t.Parallel() + before := defaultGathererSize(t) + _ = NewUpgradeMetrics(NewMetricsRegistry()) + after := defaultGathererSize(t) + if before != after { + t.Fatalf("default registerer changed: before=%d, after=%d "+ + "(NewUpgradeMetrics must never register on prometheus.DefaultRegisterer; "+ + "see ADR-0008 § Scope of use)", before, after) + } +} + diff --git a/internal/relay/ratelimit_middleware_test.go b/internal/relay/ratelimit_middleware_test.go index 96cc247..71e2796 100644 --- a/internal/relay/ratelimit_middleware_test.go +++ b/internal/relay/ratelimit_middleware_test.go @@ -260,7 +260,7 @@ func TestRateLimitMiddleware_RegistryNotTouchedOnDeny(t *testing.T) { l := newMiddlewareTestLimiter(t, 1) reg := NewRegistry() logger := discardLogger() - wrapped := NewRateLimitMiddleware(l, logger, false)(ServerHandler(reg, logger, time.Second, 256*1024)) + wrapped := NewRateLimitMiddleware(l, logger, false)(ServerHandler(reg, logger, time.Second, 256*1024, nil)) // Exhaust the bucket without going through the WS handler (no headers, // so even if it did run it would 400 — but the deny must short-circuit diff --git a/internal/relay/server_endpoint.go b/internal/relay/server_endpoint.go index 585d57f..eeb6dd7 100644 --- a/internal/relay/server_endpoint.go +++ b/internal/relay/server_endpoint.go @@ -35,13 +35,17 @@ import ( // // maxFrameBytes is the per-frame read cap threaded into NewWSConn; see // docs/specs/architecture/29-wsconn-read-limit.md for the derivation. 
-func ServerHandler(reg *Registry, logger *slog.Logger, grace time.Duration, maxFrameBytes int64) http.Handler { +// +// metrics is nil-safe: callers that don't observe upgrade outcomes pass +// nil and every counter call no-ops. +func ServerHandler(reg *Registry, logger *slog.Logger, grace time.Duration, maxFrameBytes int64, metrics *UpgradeMetrics) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { serverID := r.Header.Get("X-Pyrycode-Server") versionHeader := r.Header.Get("X-Pyrycode-Version") userAgent := r.Header.Get("User-Agent") if serverID == "" || versionHeader == "" || userAgent == "" { http.Error(w, "", http.StatusBadRequest) + metrics.ServerHeaderReject() return } @@ -70,6 +74,7 @@ func ServerHandler(reg *Registry, logger *slog.Logger, grace time.Duration, maxF // "reach the connection only through WSConn methods" // invariant is preserved in spirit. _ = c.Close(websocket.StatusCode(4409), "server-id already claimed") + metrics.ServerIDConflict() logger.Info("server_id_conflict", "server_id", serverID, "remote", remoteHost(r)) @@ -82,6 +87,7 @@ func ServerHandler(reg *Registry, logger *slog.Logger, grace time.Duration, maxF return } + metrics.ServerAccept() logger.Info("server_claimed", "server_id", serverID, "binary_version", versionHeader, diff --git a/internal/relay/server_endpoint_test.go b/internal/relay/server_endpoint_test.go index 8e20c67..fd5498d 100644 --- a/internal/relay/server_endpoint_test.go +++ b/internal/relay/server_endpoint_test.go @@ -22,7 +22,7 @@ func startServer(t *testing.T, grace time.Duration) (*Registry, string, func()) t.Helper() reg := NewRegistry() logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - srv := httptest.NewServer(ServerHandler(reg, logger, grace, 256*1024)) + srv := httptest.NewServer(ServerHandler(reg, logger, grace, 256*1024, nil)) wsURL := "ws" + strings.TrimPrefix(srv.URL, "http") return reg, wsURL, srv.Close } @@ -103,7 +103,7 @@ func 
TestServerEndpoint_HeaderGate_400(t *testing.T) { t.Run(tc.name, func(t *testing.T) { reg := NewRegistry() logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - srv := httptest.NewServer(ServerHandler(reg, logger, 100*time.Millisecond, 256*1024)) + srv := httptest.NewServer(ServerHandler(reg, logger, 100*time.Millisecond, 256*1024, nil)) defer srv.Close() req, err := http.NewRequest(http.MethodGet, srv.URL+"/", nil) @@ -336,7 +336,7 @@ func TestServerEndpoint_WrongMethod_NoPanic(t *testing.T) { t.Run(tc.name, func(t *testing.T) { reg := NewRegistry() logger := slog.New(slog.NewTextHandler(io.Discard, nil)) - srv := httptest.NewServer(ServerHandler(reg, logger, 100*time.Millisecond, 256*1024)) + srv := httptest.NewServer(ServerHandler(reg, logger, 100*time.Millisecond, 256*1024, nil)) defer srv.Close() resp, err := tc.do(srv.URL)