diff --git a/docs/packages/cluster/GOTCHAS.md b/docs/packages/cluster/GOTCHAS.md
new file mode 100644
index 0000000000..76be73fe66
--- /dev/null
+++ b/docs/packages/cluster/GOTCHAS.md
@@ -0,0 +1,16 @@
+# Gotchas
+
+## Provider Billing Intervals
+
+Server providers bill for usage on different billing cycles. This matters for autoscaling: if the billing
+cycle is long, rapidly scaling servers up and down wastes money on billing time that goes unused.
+
+### Linode
+
+Linode bills hourly. Servers running for 1 minute, 59 minutes, or 60 minutes all cost the user the same
+amount. Internally, we do not immediately delete servers when scaling down and instead start draining them.
+This means that if we have to scale up again within the hour, we can undrain an existing server instead of
+provisioning a new one, saving the user money.
+
+When using Linode, you should choose your drain timeout to be close to (but not over) an interval of an hour
+(3_600_000 ms).
diff --git a/lib/bolt/core/src/context/project.rs b/lib/bolt/core/src/context/project.rs
index 7368eb51ef..66901dc9d5 100644
--- a/lib/bolt/core/src/context/project.rs
+++ b/lib/bolt/core/src/context/project.rs
@@ -321,6 +321,32 @@ impl ProjectContextData {
 				"invalid datacenter ({}): Job min > desired",
 				name_id,
 			);
+
+			// Validate Linode
+			#[allow(irrefutable_let_patterns)]
+			if let config::ns::ProvisioningProvider::Linode = datacenter.provider {
+				assert!(
+					ats_pool.drain_timeout >= 55 * 60 * 1000,
+					"invalid datacenter ({}): ATS drain timeout < 55 min (Linode bills hourly, drain timeout should be close to hour intervals)",
+					name_id,
+				);
+
+				if let Some(gg_pool) = &gg_pool {
+					assert!(
+						gg_pool.drain_timeout >= 55 * 60 * 1000,
+						"invalid datacenter ({}): GG drain timeout < 55 min (Linode bills hourly, drain timeout should be close to hour intervals)",
+						name_id,
+					);
+				}
+
+				if let Some(job_pool) = &job_pool {
+					assert!(
+						job_pool.drain_timeout >= 55 * 60 * 1000,
+						"invalid datacenter ({}): Job drain timeout < 55 min (Linode bills hourly, drain timeout should be close to hour intervals)",
+						name_id,
+					);
+				}
+			}
 		}
 	}
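To make the guidance above concrete, here is a small standalone sketch (illustrative only, not part of the diff; the constant and function names are invented) of the drain-timeout window implied for Linode pools: at least the 55-minute floor asserted in `project.rs`, and no more than the 3_600_000 ms billing hour recommended in `GOTCHAS.md`. Note that the actual validation only enforces the floor; the upper bound here comes from the doc's recommendation.

```rust
// Illustrative only: the drain-timeout window implied for Linode pools, in milliseconds.
const HOUR_MS: u64 = 60 * 60 * 1000; // 3_600_000 ms, one Linode billing interval
const LINODE_MIN_DRAIN_MS: u64 = 55 * 60 * 1000; // 55 min floor asserted in project.rs

/// Returns true if the drain timeout is close to (but not over) an hour.
fn linode_drain_timeout_ok(drain_timeout_ms: u64) -> bool {
    (LINODE_MIN_DRAIN_MS..=HOUR_MS).contains(&drain_timeout_ms)
}

fn main() {
    assert!(linode_drain_timeout_ok(59 * 60 * 1000)); // 59 min: uses most of the billed hour
    assert!(!linode_drain_timeout_ok(10 * 60 * 1000)); // 10 min: leaves ~50 min of paid time unused
    assert!(!linode_drain_timeout_ok(61 * 60 * 1000)); // 61 min: spills into a second billed hour
}
```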
diff --git a/svc/pkg/cluster/src/workflows/datacenter/scale.rs b/svc/pkg/cluster/src/workflows/datacenter/scale.rs
index f38e041ebf..ce67ef9416 100644
--- a/svc/pkg/cluster/src/workflows/datacenter/scale.rs
+++ b/svc/pkg/cluster/src/workflows/datacenter/scale.rs
@@ -19,7 +19,7 @@ use chirp_workflow::prelude::*;
 use futures_util::{FutureExt, StreamExt, TryStreamExt};
 use serde_json::json;
 
-use crate::types::{Datacenter, PoolType};
+use crate::types::{Datacenter, PoolType, Provider};
 
 #[derive(sqlx::FromRow)]
 struct ServerRow {
@@ -76,6 +76,7 @@ enum DrainState {
 
 struct PoolCtx {
 	datacenter_id: Uuid,
+	provider: Provider,
 	pool_type: PoolType,
 	desired_count: usize,
 }
@@ -271,6 +272,7 @@ async fn inner(
 	for pool in &dc.pools {
 		let pool_ctx = PoolCtx {
 			datacenter_id: dc.datacenter_id,
+			provider: dc.provider.clone(),
 			pool_type: pool.pool_type.clone(),
 			desired_count: pool.desired_count.max(pool.min_count).min(pool.max_count) as usize,
 		};
@@ -351,7 +353,12 @@ async fn scale_down_job_servers<'a, I: Iterator>(
 	let (nomad_servers, without_nomad_servers) =
 		active_servers.partition::<Vec<_>, _>(|server| server.has_nomad_node);
 
-	let destroy_count = (active_count - pctx.desired_count).min(without_nomad_servers.len());
+	let destroy_count = match pctx.provider {
+		// Never destroy servers when scaling down with Linode, always drain
+		Provider::Linode => 0,
+		#[allow(unreachable_patterns)]
+		_ => (active_count - pctx.desired_count).min(without_nomad_servers.len()),
+	};
 	let drain_count = active_count - pctx.desired_count - destroy_count;
 
 	// Destroy servers
diff --git a/svc/pkg/cluster/src/workflows/server/mod.rs b/svc/pkg/cluster/src/workflows/server/mod.rs
index b90401fe81..3173d871de 100644
--- a/svc/pkg/cluster/src/workflows/server/mod.rs
+++ b/svc/pkg/cluster/src/workflows/server/mod.rs
@@ -235,19 +235,27 @@ pub(crate) async fn cluster_server(ctx: &mut WorkflowCtx, input: &Input) -> Glob
 			.await?;
 		}
 		Main::NomadDrainComplete(_) => {
-			ctx.activity(SetDrainCompleteInput {
-				server_id: input.server_id,
-			})
-			.await?;
-
-			// Scale
-			ctx.tagged_signal(
-				&json!({
-					"datacenter_id": input.datacenter_id,
-				}),
-				crate::workflows::datacenter::Scale {},
-			)
-			.await?;
+			match dc.provider {
+				Provider::Linode => {
+					// Job server draining is handled via cluster-gc for Linode
+				}
+				#[allow(unreachable_patterns)]
+				_ => {
+					ctx.activity(SetDrainCompleteInput {
+						server_id: input.server_id,
+					})
+					.await?;
+
+					// Scale
+					ctx.tagged_signal(
+						&json!({
+							"datacenter_id": input.datacenter_id,
+						}),
+						crate::workflows::datacenter::Scale {},
+					)
+					.await?;
+				}
+			}
 		}
 		Main::Drain(_) => {
 			ctx.workflow(drain::Input {
@@ -271,6 +279,7 @@ pub(crate) async fn cluster_server(ctx: &mut WorkflowCtx, input: &Input) -> Glob
 		}
 	}
 
+	// Cleanup DNS
 	if let PoolType::Gg = input.pool_type {
 		ctx.workflow(dns_delete::Input {
 			server_id: input.server_id,
 		})
 		.await?;
 	}
 
+	// Cleanup server
 	match dc.provider {
 		Provider::Linode => {
 			tracing::info!(server_id=?input.server_id, "destroying linode server");
diff --git a/svc/pkg/cluster/standalone/gc/src/lib.rs b/svc/pkg/cluster/standalone/gc/src/lib.rs
index bb64e769d8..37e9c93436 100644
--- a/svc/pkg/cluster/standalone/gc/src/lib.rs
+++ b/svc/pkg/cluster/standalone/gc/src/lib.rs
@@ -29,31 +29,24 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
 		let ctx = ctx.clone();
 
 		async move {
-			let pool_types = [
-				serde_json::to_string(&PoolType::Gg)?,
-				serde_json::to_string(&PoolType::Ats)?,
-			];
-
-			// Select all draining gg and ats servers
+			// Select all draining servers
 			let servers = sql_fetch_all!(
 				[ctx, ServerRow, @tx tx]
 				"
 				SELECT server_id, datacenter_id, pool_type, pool_type2, drain_ts
 				FROM db_cluster.servers
 				WHERE
-					pool_type2 = ANY($1) AND
-					cloud_destroy_ts IS NULL AND
-					drain_ts IS NOT NULL
+					drain_ts IS NOT NULL AND
+					drain_complete_ts IS NULL AND
+					cloud_destroy_ts IS NULL
 				",
-				&pool_types,
-				ts,
 			)
 			.await?;
-
 			if servers.is_empty() {
 				return Ok(Vec::new());
 			}
 
+			// Fetch relevant datacenters
 			let datacenters_res = ctx
 				.op(cluster::ops::datacenter::get::Input {
 					datacenter_ids: servers
@@ -63,6 +56,7 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
 				})
 				.await?;
 
+			// Determine which servers are finished draining via their drain timeout
 			let drained_servers = servers
 				.into_iter()
 				.map(|server| {
@@ -100,7 +94,7 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
 				return Ok(Vec::new());
 			}
 
-			tracing::info!(count=%drained_servers.len(), "servers done draining");
+			tracing::info!("{} servers done draining", drained_servers.len());
 
 			// Update servers that have completed draining
 			sql_execute!(
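As a companion to the cluster-gc changes above, the drain-completion check it performs boils down to comparing `drain_ts` plus the pool's drain timeout against the current timestamp. The sketch below is illustrative only; the struct and field names are simplified stand-ins, not the real `ServerRow` or datacenter types.

```rust
// Illustrative sketch of the cluster-gc drain check; the real code joins ServerRow
// rows against their datacenter's pool config to find the matching drain timeout.
struct DrainingServer {
    drain_ts: i64,      // when draining started (ms since epoch)
    drain_timeout: i64, // the pool's drain timeout (ms)
}

/// A server is done draining once its drain timeout has fully elapsed.
fn is_drained(server: &DrainingServer, now_ts: i64) -> bool {
    now_ts >= server.drain_ts + server.drain_timeout
}

fn main() {
    let server = DrainingServer {
        drain_ts: 0,
        drain_timeout: 59 * 60 * 1000, // close to one Linode billing hour
    };
    assert!(!is_drained(&server, 30 * 60 * 1000)); // mid-drain: eligible for undrain on scale-up
    assert!(is_drained(&server, 60 * 60 * 1000)); // timeout elapsed: gc marks drain complete
}
```

Keeping the drain timeout near the billing hour means a draining server stays eligible for undrain for most of the time that has already been paid for.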