6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -24,6 +24,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- **Bolt** Moved additional project roots to Bolt.toml
- **types** Support multiple project roots for reusing Protobuf types
- **Infra** Switch from AWS ELB to NLB to work around surge queue length limitation
- **pools** Allow infinite Redis reconnection attempts
- **pools** Set Redis client names
- **pools** Ping Redis every 15 seconds
- **pools** Enable `test_before_acquire` on SQLx
- **pools** Decrease SQLx `idle_timeout` to 3 minutes
- **pools** Set ClickHouse `idle_timeout` to 15 seconds

### Security

23 changes: 0 additions & 23 deletions docs/infrastructure/API_TIMEOUTS.md

This file was deleted.

55 changes: 55 additions & 0 deletions docs/infrastructure/TIMEOUTS.md
@@ -0,0 +1,55 @@
# Timeouts

Many load balancers are configured with a default timeout of 60s. Our API timeouts are designed to work within these bounds.

## Preface: How traffic reaches Rivet

```
Client -> Cloudflare -> NLB -> Traefik -> api-monolith
```

## Infra timeouts

These are the timeouts that our API servers are restricted to:

- Cloudflare: 100s ([source](https://developers.cloudflare.com/support/troubleshooting/cloudflare-errors/troubleshooting-cloudflare-5xx-errors/#error-524-a-timeout-occurred))
- **Behavior** Returns a 524
- Cannot be configured unless paying for Cloudflare Enterprise
- AWS NAT Gateway: 350 seconds idle (without keepalive) ([source](https://docs.aws.amazon.com/vpc/latest/userguide/nat-gateway-troubleshooting.html#nat-gateway-troubleshooting-timeout))
- **Behavior** Connection drop
- AWS NLB: 350 seconds ([source](https://docs.aws.amazon.com/elasticloadbalancing/latest/network/network-load-balancers.html#connection-idle-timeout))
- **Behavior** Connection drop
- Traefik: 60s ([source](https://github.com/rivet-gg/rivet/blob/c63067ce6e81f97b435e424e576fbd922b14f748/infra/tf/k8s_infra/traefik.tf#L65))
- **Behavior** _Unknown_
- Unlike the other timeouts, this is configurable by us

## Rivet API Timeouts

We use long polling (i.e. `watch_index`) to implement real-time functionality. This means we need to stay well within the infrastructure timeouts above.

Current timeouts:

- `api-helper`: 50s ([source](https://github.com/rivet-gg/rivet/blob/9811ae11656d63e26b4814fe15f7f852f5479a48/lib/api-helper/macros/src/lib.rs#L975))
- **Behavior** Returns `API_REQUEST_TIMEOUT`
- **Motivation** This leaves a 10s buffer under any 60s infrastructure timeout (e.g. Traefik)
- `select_with_timeout!`: 40s ([source](https://github.com/rivet-gg/rivet/blob/9811ae11656d63e26b4814fe15f7f852f5479a48/lib/util/macros/src/lib.rs#L12))
- **Behavior** Timeout handled by API endpoint, usually 200
- **Motivation** This leaves a 10s budget within the 50s `api-helper` timeout for any work before/after the select statement (see the sketch below)
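
A minimal sketch of how these budgets layer, using plain `tokio::time::timeout` instead of the internal `select_with_timeout!` macro; the types and function names here are hypothetical:

```rust
use std::time::Duration;
use tokio::time::timeout;

// Hypothetical stand-in for a `watch_index` response.
#[derive(Debug)]
enum WatchResponse {
    Changed(String),
    Unchanged,
}

// The inner long-poll wait gets a 40s budget; the surrounding handler is
// assumed to be wrapped by the 50s `api-helper` timeout, which in turn sits
// under the 60s Traefik limit.
async fn watch_for_update(
    update: impl std::future::Future<Output = String>,
) -> WatchResponse {
    match timeout(Duration::from_secs(40), update).await {
        Ok(value) => WatchResponse::Changed(value),
        // Timing out is the normal "no change" case for long polling and is
        // returned by the endpoint as a 200, not as an error.
        Err(_) => WatchResponse::Unchanged,
    }
}

#[tokio::main]
async fn main() {
    // Simulate an update arriving after 5 seconds.
    let update = async {
        tokio::time::sleep(Duration::from_secs(5)).await;
        "new value".to_string()
    };
    println!("{:?}", watch_for_update(update).await);
}
```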

## Database connections

### CockroachDB

- `idle_timeout` is set to 3 minutes, well under the NAT Gateway's 350s idle timeout
- `test_before_acquire` is left as `true` to ensure we don't run into timeouts, even though this adds significant overhead (see the sketch below)
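
A condensed sketch of the relevant SQLx settings (the full pool configuration lives in `lib/pools/src/lib.rs`); the function shape and `url` parameter are illustrative:

```rust
use std::time::Duration;
use sqlx::postgres::PgPoolOptions;

// `url` is assumed to come from the environment.
async fn build_crdb_pool(url: &str) -> Result<sqlx::PgPool, sqlx::Error> {
    PgPoolOptions::new()
        // Close idle connections after 3 minutes, well under the NAT
        // Gateway's 350s idle timeout.
        .idle_timeout(Some(Duration::from_secs(3 * 60)))
        // Validate each connection before handing it out; slower, but avoids
        // handing out connections whose TCP stream was silently dropped.
        .test_before_acquire(true)
        .connect(url)
        .await
}
```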

### Redis

- We manually ping the database every 15 seconds to keep connections alive (see the sketch below)
- Reconnection backoff is set to retry indefinitely so that `ConnectionManager` always returns to a valid state, no matter the connection issues
- Otherwise, the internal retry logic causes the Redis connection to fail permanently after 6 automatic reconnects, which would cause the cluster to fail if left idle for too long
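
A condensed sketch of the connection setup and keepalive, mirroring the pool code in `lib/pools/src/lib.rs`; the function shape and parameters here are illustrative:

```rust
use std::time::Duration;

// `url` and `client_name` are assumed to come from the environment.
async fn connect_redis(
    url: &str,
    client_name: String,
) -> redis::RedisResult<redis::aio::ConnectionManager> {
    let client = redis::Client::open(url)?;
    // Exponential backoff (base 2, factor 100ms) with effectively infinite
    // retries, so the manager always returns to a valid state.
    let conn =
        redis::aio::ConnectionManager::new_with_backoff(client, 2, 100, usize::MAX).await?;

    // Keepalive: touch the connection every 15s. Setting the client name
    // doubles as the ping, since `ConnectionManager` gives us no hook for
    // newly established connections.
    let mut ping_conn = conn.clone();
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(Duration::from_secs(15));
        loop {
            interval.tick().await;
            if let Err(err) = redis::cmd("CLIENT")
                .arg("SETNAME")
                .arg(&client_name)
                .query_async::<_, ()>(&mut ping_conn)
                .await
            {
                eprintln!("redis ping failed: {err}");
            }
        }
    });

    Ok(conn)
}
```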

### Misc Resources

- [Implementing long-running TCP Connections within VPC networking](https://aws.amazon.com/blogs/networking-and-content-delivery/implementing-long-running-tcp-connections-within-vpc-networking/)
- [Introducing configurable Idle timeout for Connection tracking](https://aws.amazon.com/blogs/networking-and-content-delivery/introducing-configurable-idle-timeout-for-connection-tracking/) (this is intentionally not configured)
120 changes: 59 additions & 61 deletions lib/pools/src/lib.rs
@@ -151,36 +151,38 @@ async fn crdb_from_env(client_name: String) -> Result<Option<CrdbPool>, Error> {
.max_lifetime(Duration::from_secs(30 * 60))
// Remove connections after a while in order to reduce load
// on CRDB after bursts
.idle_timeout(Some(Duration::from_secs(10 * 60)))
.idle_timeout(Some(Duration::from_secs(3 * 60)))
// Open connections immediately on startup
.min_connections(min_connections)
// Raise the cap, since this is effectively the amount of
// simultaneous requests we can handle. See
// https://www.cockroachlabs.com/docs/stable/connection-pooling.html
.max_connections(max_connections)
// Speeds up requests at the expense of potential
// failures. See `before_acquire`.
.test_before_acquire(false)
// Ping once per minute to validate the connection is still alive
.before_acquire(|conn, meta| {
Box::pin(async move {
if meta.idle_for.as_secs() < 60 {
Ok(true)
} else {
match sqlx::Connection::ping(conn).await {
Ok(_) => Ok(true),
Err(err) => {
// See https://docs.aws.amazon.com/vpc/latest/userguide/nat-gateway-troubleshooting.html#nat-gateway-troubleshooting-timeout
tracing::warn!(
?err,
"crdb ping failed, potential idle tcp connection drop"
);
Ok(false)
}
}
}
})
})
// NOTE: This is disabled until we can ensure that TCP connections stop getting dropped
// on AWS.
// // Speeds up requests at the expense of potential
// // failures. See `before_acquire`.
// .test_before_acquire(false)
// // Ping once per minute to validate the connection is still alive
// .before_acquire(|conn, meta| {
// Box::pin(async move {
// if meta.idle_for.as_secs() < 60 {
// Ok(true)
// } else {
// match sqlx::Connection::ping(conn).await {
// Ok(_) => Ok(true),
// Err(err) => {
// // See https://docs.aws.amazon.com/vpc/latest/userguide/nat-gateway-troubleshooting.html#nat-gateway-troubleshooting-timeout
// tracing::warn!(
// ?err,
// "crdb ping failed, potential idle tcp connection drop"
// );
// Ok(false)
// }
// }
// }
// })
// })
.connect(&url)
.await
.map_err(Error::BuildSqlx)?;
@@ -204,11 +206,11 @@ async fn redis_from_env() -> Result<HashMap<String, RedisPool>, Error> {
.name("redis_from_env")
.spawn(async move {
tracing::info!(%url, "redis connecting");
let conn = redis::Client::open(url.as_str())
.map_err(Error::BuildRedis)?
.get_tokio_connection_manager()
.await
.map_err(Error::BuildRedis)?;
let client = redis::Client::open(url.as_str()).map_err(Error::BuildRedis)?;
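// Exponential backoff (base 2, factor 100ms) with effectively infinite
// retries so the connection manager always recovers, no matter how long
// the connection sat idle before being dropped.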
let conn =
redis::aio::ConnectionManager::new_with_backoff(client, 2, 100, usize::MAX)
.await
.map_err(Error::BuildRedis)?;

tracing::info!(%url, "redis connected");

@@ -236,7 +238,7 @@ fn clickhouse_from_env() -> Result<Option<ClickHousePool>, Error> {
// Build HTTP client
let mut http_connector = hyper::client::connect::HttpConnector::new();
http_connector.enforce_http(false);
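// TCP keepalive interval; shortened to 15s so idle ClickHouse connections
// are not silently dropped (see docs/infrastructure/TIMEOUTS.md).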
http_connector.set_keepalive(Some(Duration::from_secs(60)));
http_connector.set_keepalive(Some(Duration::from_secs(15)));
let https_connector = hyper_tls::HttpsConnector::new_with_connector(http_connector);
let http_client = hyper::Client::builder()
.pool_idle_timeout(Duration::from_secs(2))
@@ -257,40 +259,36 @@ fn clickhouse_from_env() -> Result<Option<ClickHousePool>, Error> {
}
}

#[tracing::instrument(level = "trace", skip(_pools))]
async fn runtime(_pools: Pools, client_name: String) {
// TODO: Delete this once confirmed this is no longer an issue

#[tracing::instrument(level = "trace", skip(pools))]
async fn runtime(pools: Pools, client_name: String) {
// We have to manually ping the Redis connection since `ConnectionManager`
// doesn't do this for us. If we don't make a request on a Redis connection
// for a long time, we'll get a broken pipe error, so this keeps the
// connection alive.

// let mut interval = tokio::time::interval(Duration::from_secs(15));
// loop {
// interval.tick().await;

// // TODO: This will ping the same pool multiple times if it shares the
// // same URL
// for (db, conn) in &pools.redis {
// // HACK: Instead of sending `PING`, we test the connection by
// // updating the client's name. We do this because
// // `ConnectionManager` doesn't let us hook in to new connections, so
// // we have to manually update the client's name.
// let mut conn = conn.clone();
// let res = redis::cmd("CLIENT")
// .arg("SETNAME")
// .arg(&client_name)
// .query_async::<_, ()>(&mut conn)
// .await;
// match res {
// Ok(_) => {
// tracing::trace!(%db, "ping success");
// }
// Err(err) => {
// tracing::error!(%db, ?err, "redis ping failed");
// }
// }
// }
// }
let mut interval = tokio::time::interval(Duration::from_secs(15));
loop {
interval.tick().await;

for (db, conn) in &pools.redis {
// HACK: Instead of sending `PING`, we test the connection by
// updating the client's name. We do this because
// `ConnectionManager` doesn't let us hook in to new connections, so
// we have to manually update the client's name.
let mut conn = conn.clone();
let res = redis::cmd("CLIENT")
.arg("SETNAME")
.arg(&client_name)
.query_async::<_, ()>(&mut conn)
.await;
match res {
Ok(_) => {
tracing::trace!(%db, "ping success");
}
Err(err) => {
tracing::error!(%db, ?err, "redis ping failed");
}
}
}
}
}