diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c8a2ed56..28627e40c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Infra** runc rootfs is now a writable file system - **Matchmaker** Fix logs not shipping if lobby exits immediately +- **API** Fix 520 error when long polling ## [23.2.0-rc1] - 2023-12-01 diff --git a/docs/infrastructure/API_TIMEOUTS.md b/docs/infrastructure/API_TIMEOUTS.md new file mode 100644 index 000000000..b1839db9c --- /dev/null +++ b/docs/infrastructure/API_TIMEOUTS.md @@ -0,0 +1,20 @@ +# API Timeouts + +Many load balancers have a default timeout of 60s. Our API timeouts are designed to work within these bounds. + +## Long polling + +We use long polling (i.e. `watch_index`) to implement real-time functionality. This means we need to be cautious about existing timeouts. + +## Timeouts + +Current timeouts: + +- Cloudflare: 100s ([source](https://developers.cloudflare.com/support/troubleshooting/cloudflare-errors/troubleshooting-cloudflare-5xx-errors/#error-524-a-timeout-occurred)) + - **Behavior** Returns a 524 + - Cannot be configured unless paying for Cloudflare Enterprise +- Traefik: 60s ([source](https://github.com/rivet-gg/rivet/blob/c63067ce6e81f97b435e424e576fbd922b14f748/infra/tf/k8s_infra/traefik.tf#L65)) + - **Motivation** `api-helper` should always handle this error if everything is functioning correctly. This is meant to be lower than Cloudflare's timeout so that we can show a Traefik-specific response. 
+- `select_with_timeout!`: 40s ([source](https://github.com/rivet-gg/rivet/blob/9811ae11656d63e26b4814fe15f7f852f5479a48/lib/util/macros/src/lib.rs#L12)) + - **Behavior** Timeout handled by API endpoint, usually 200 + - **Motivation** This gives a 10s budget for any requests before/after the select statement diff --git a/infra/tf/k8s_infra/traefik.tf b/infra/tf/k8s_infra/traefik.tf index b93e5477d..9d200b6ea 100644 --- a/infra/tf/k8s_infra/traefik.tf +++ b/infra/tf/k8s_infra/traefik.tf @@ -61,9 +61,9 @@ resource "helm_release" "traefik" { additionalArguments = [ "--providers.http.endpoint=http://rivet-api-route.rivet-service.svc.cluster.local/traefik/config/core?token=${module.traefik_secrets.values["rivet/api_route/token"]}", "--providers.http.pollInterval=2.5s", - # 60s for the long polling requests to gracefully exit + 30s for padding - "--entryPoints.web.transport.lifeCycle.graceTimeOut=90s", - "--entryPoints.websecure.transport.lifeCycle.graceTimeOut=90s", + # See docs/infrastructure/API_TIMEOUTS.md + "--entryPoints.web.transport.lifeCycle.graceTimeOut=60s", + "--entryPoints.websecure.transport.lifeCycle.graceTimeOut=60s", ] logs = { diff --git a/lib/util/macros/src/lib.rs b/lib/util/macros/src/lib.rs index 70ecb4767..74ca09444 100644 --- a/lib/util/macros/src/lib.rs +++ b/lib/util/macros/src/lib.rs @@ -8,8 +8,10 @@ use syn::parse::{Parse, ParseStream}; use syn::token::Return; use syn::{braced, bracketed, Expr, Ident, Result, Token}; -// 1 min -const DEFAULT_TIMEOUT: u64 = 60 * 1000; +/// Represented in milliseconds. +/// +/// See docs/infrastructure/API_TIMEOUTS.md for reasoning. +const DEFAULT_TIMEOUT: u64 = 40 * 1000; mod kw { syn::custom_keyword!(JITTER);