Skip to content

Commit

Permalink
fix(clusters): dont delete servers immediately with linode
Browse files Browse the repository at this point in the history
  • Loading branch information
MasterPtato committed Aug 9, 2024
1 parent 514231a commit 1828b2c
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 28 deletions.
16 changes: 16 additions & 0 deletions docs/packages/cluster/GOTCHAS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Gotchas

## Provider Billing Intervals

Server providers bill for usage on different billing cycles. The cycle length matters when scaling:
if the cycle is long, scaling down quickly wastes money on billing time that has already been paid for but goes unused.

### Linode

Linode bills hourly. Servers running for 1 minute, 59 minutes, or 60 minutes all cost the user the same
amount. Internally, we do not immediately delete servers when scaling down and instead start draining them.
This means if we have to scale up again within an hour we can undrain an existing server instead of
provisioning a new one, saving the user money.

When using Linode, you should choose a drain timeout that is close to (but not over) a multiple of an
hour (3_600_000 ms), so a drained server is destroyed just before the next billing hour starts.
11 changes: 9 additions & 2 deletions svc/pkg/cluster/src/workflows/datacenter/scale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use chirp_workflow::prelude::*;
use futures_util::{FutureExt, StreamExt, TryStreamExt};
use serde_json::json;

use crate::types::{Datacenter, PoolType};
use crate::types::{Datacenter, PoolType, Provider};

#[derive(sqlx::FromRow)]
struct ServerRow {
Expand Down Expand Up @@ -76,6 +76,7 @@ enum DrainState {

/// Per-pool scaling context built for each pool of a datacenter before
/// computing scale-up/scale-down actions.
struct PoolCtx {
    datacenter_id: Uuid,
    // Cloud provider of the datacenter. Scale-down behavior is
    // provider-specific: with Linode, servers are never destroyed
    // immediately and are drained instead (hourly billing means an
    // undrained server can be reused within the hour at no extra cost).
    provider: Provider,
    pool_type: PoolType,
    // Target server count, already clamped to [min_count, max_count]
    // at construction (`desired_count.max(min_count).min(max_count)`).
    desired_count: usize,
}
Expand Down Expand Up @@ -271,6 +272,7 @@ async fn inner(
for pool in &dc.pools {
let pool_ctx = PoolCtx {
datacenter_id: dc.datacenter_id,
provider: dc.provider.clone(),
pool_type: pool.pool_type.clone(),
desired_count: pool.desired_count.max(pool.min_count).min(pool.max_count) as usize,
};
Expand Down Expand Up @@ -351,7 +353,12 @@ async fn scale_down_job_servers<'a, I: Iterator<Item = &'a Server>>(
let (nomad_servers, without_nomad_servers) =
active_servers.partition::<Vec<_>, _>(|server| server.has_nomad_node);

let destroy_count = (active_count - pctx.desired_count).min(without_nomad_servers.len());
let destroy_count = match pctx.provider {
// Never destroy servers when scaling down with Linode, always drain
Provider::Linode => 0,
#[allow(unreachable_patterns)]
_ => (active_count - pctx.desired_count).min(without_nomad_servers.len()),
};
let drain_count = active_count - pctx.desired_count - destroy_count;

// Destroy servers
Expand Down
36 changes: 23 additions & 13 deletions svc/pkg/cluster/src/workflows/server/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,19 +235,27 @@ pub(crate) async fn cluster_server(ctx: &mut WorkflowCtx, input: &Input) -> Glob
.await?;
}
Main::NomadDrainComplete(_) => {
ctx.activity(SetDrainCompleteInput {
server_id: input.server_id,
})
.await?;

// Scale
ctx.tagged_signal(
&json!({
"datacenter_id": input.datacenter_id,
}),
crate::workflows::datacenter::Scale {},
)
.await?;
match dc.provider {
Provider::Linode => {
// Job server draining is handled via cluster-gc for Linode
}
#[allow(unreachable_patterns)]
_ => {
ctx.activity(SetDrainCompleteInput {
server_id: input.server_id,
})
.await?;

// Scale
ctx.tagged_signal(
&json!({
"datacenter_id": input.datacenter_id,
}),
crate::workflows::datacenter::Scale {},
)
.await?;
}
}
}
Main::Drain(_) => {
ctx.workflow(drain::Input {
Expand All @@ -271,13 +279,15 @@ pub(crate) async fn cluster_server(ctx: &mut WorkflowCtx, input: &Input) -> Glob
}
}

// Cleanup DNS
if let PoolType::Gg = input.pool_type {
ctx.workflow(dns_delete::Input {
server_id: input.server_id,
})
.await?;
}

// Cleanup server
match dc.provider {
Provider::Linode => {
tracing::info!(server_id=?input.server_id, "destroying linode server");
Expand Down
20 changes: 7 additions & 13 deletions svc/pkg/cluster/standalone/gc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,31 +29,24 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
let ctx = ctx.clone();

async move {
let pool_types = [
serde_json::to_string(&PoolType::Gg)?,
serde_json::to_string(&PoolType::Ats)?,
];

// Select all draining gg and ats servers
// Select all draining servers
let servers = sql_fetch_all!(
[ctx, ServerRow, @tx tx]
"
SELECT server_id, datacenter_id, pool_type, pool_type2, drain_ts
FROM db_cluster.servers
WHERE
pool_type2 = ANY($1) AND
cloud_destroy_ts IS NULL AND
drain_ts IS NOT NULL
drain_ts IS NOT NULL AND
drain_complete_ts IS NULL AND
cloud_destroy_ts IS NULL
",
&pool_types,
ts,
)
.await?;

if servers.is_empty() {
return Ok(Vec::new());
}

// Fetch relevant datacenters
let datacenters_res = ctx
.op(cluster::ops::datacenter::get::Input {
datacenter_ids: servers
Expand All @@ -63,6 +56,7 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
})
.await?;

// Determine which servers are finished draining via their drain timeout
let drained_servers = servers
.into_iter()
.map(|server| {
Expand Down Expand Up @@ -100,7 +94,7 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
return Ok(Vec::new());
}

tracing::info!(count=%drained_servers.len(), "servers done draining");
tracing::info!("{} servers done draining", drained_servers.len());

// Update servers that have completed draining
sql_execute!(
Expand Down

0 comments on commit 1828b2c

Please sign in to comment.