Skip to content

Commit

Permalink
fix(clusters): dont delete servers immediately with linode
Browse files Browse the repository at this point in the history
  • Loading branch information
MasterPtato committed Aug 9, 2024
1 parent 514231a commit 1828b2c
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 28 deletions.
16 changes: 16 additions & 0 deletions docs/packages/cluster/GOTCHAS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Gotchas

## Provider Billing Intervals

Server providers bill for usage on different billing cycles. The cycle length matters when scaling:
if the cycle is long, scaling down quickly wastes money on billing time that has already been paid for but goes unused.

### Linode

Linode bills hourly. Servers running for 1 minute, 59 minutes, or 60 minutes all cost the user the same
amount. Internally, we do not immediately delete servers when scaling down and instead start draining them.
This means if we have to scale up again within an hour we can undrain an existing server instead of
provisioning a new one, saving the user money.

When using Linode, you should choose a drain timeout that is close to (but not over) a multiple of an
hour (3_600_000 ms), so a drained server is destroyed just before the next billing hour starts.
11 changes: 9 additions & 2 deletions svc/pkg/cluster/src/workflows/datacenter/scale.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use chirp_workflow::prelude::*;
use futures_util::{FutureExt, StreamExt, TryStreamExt};
use serde_json::json;

use crate::types::{Datacenter, PoolType};
use crate::types::{Datacenter, PoolType, Provider};

#[derive(sqlx::FromRow)]
struct ServerRow {
Expand Down Expand Up @@ -76,6 +76,7 @@ enum DrainState {

/// Per-pool scaling context built for each pool of a datacenter before
/// computing scale-up/scale-down actions.
struct PoolCtx {
    datacenter_id: Uuid,
    // Cloud provider of the datacenter. Scale-down behavior is
    // provider-specific: with Linode, servers are never destroyed
    // immediately and are drained instead (hourly billing means an
    // undrained server can be reused within the hour at no extra cost).
    provider: Provider,
    pool_type: PoolType,
    // Target server count, already clamped to [min_count, max_count]
    // at construction (`desired_count.max(min_count).min(max_count)`).
    desired_count: usize,
}
Expand Down Expand Up @@ -271,6 +272,7 @@ async fn inner(
for pool in &dc.pools {
let pool_ctx = PoolCtx {
datacenter_id: dc.datacenter_id,
provider: dc.provider.clone(),
pool_type: pool.pool_type.clone(),
desired_count: pool.desired_count.max(pool.min_count).min(pool.max_count) as usize,
};
Expand Down Expand Up @@ -351,7 +353,12 @@ async fn scale_down_job_servers<'a, I: Iterator<Item = &'a Server>>(
let (nomad_servers, without_nomad_servers) =
active_servers.partition::<Vec<_>, _>(|server| server.has_nomad_node);

let destroy_count = (active_count - pctx.desired_count).min(without_nomad_servers.len());
let destroy_count = match pctx.provider {
// Never destroy servers when scaling down with Linode, always drain
Provider::Linode => 0,
#[allow(unreachable_patterns)]
_ => (active_count - pctx.desired_count).min(without_nomad_servers.len()),
};
let drain_count = active_count - pctx.desired_count - destroy_count;

// Destroy servers
Expand Down
36 changes: 23 additions & 13 deletions svc/pkg/cluster/src/workflows/server/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,19 +235,27 @@ pub(crate) async fn cluster_server(ctx: &mut WorkflowCtx, input: &Input) -> Glob
.await?;
}
Main::NomadDrainComplete(_) => {
ctx.activity(SetDrainCompleteInput {
server_id: input.server_id,
})
.await?;

// Scale
ctx.tagged_signal(
&json!({
"datacenter_id": input.datacenter_id,
}),
crate::workflows::datacenter::Scale {},
)
.await?;
match dc.provider {
Provider::Linode => {
// Job server draining is handled via cluster-gc for Linode
}
#[allow(unreachable_patterns)]
_ => {
ctx.activity(SetDrainCompleteInput {
server_id: input.server_id,
})
.await?;

// Scale
ctx.tagged_signal(
&json!({
"datacenter_id": input.datacenter_id,
}),
crate::workflows::datacenter::Scale {},
)
.await?;
}
}
}
Main::Drain(_) => {
ctx.workflow(drain::Input {
Expand All @@ -271,13 +279,15 @@ pub(crate) async fn cluster_server(ctx: &mut WorkflowCtx, input: &Input) -> Glob
}
}

// Cleanup DNS
if let PoolType::Gg = input.pool_type {
ctx.workflow(dns_delete::Input {
server_id: input.server_id,
})
.await?;
}

// Cleanup server
match dc.provider {
Provider::Linode => {
tracing::info!(server_id=?input.server_id, "destroying linode server");
Expand Down
20 changes: 7 additions & 13 deletions svc/pkg/cluster/standalone/gc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,31 +29,24 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
let ctx = ctx.clone();

async move {
let pool_types = [
serde_json::to_string(&PoolType::Gg)?,
serde_json::to_string(&PoolType::Ats)?,
];

// Select all draining gg and ats servers
// Select all draining servers
let servers = sql_fetch_all!(
[ctx, ServerRow, @tx tx]
"
SELECT server_id, datacenter_id, pool_type, pool_type2, drain_ts
FROM db_cluster.servers
WHERE
pool_type2 = ANY($1) AND
cloud_destroy_ts IS NULL AND
drain_ts IS NOT NULL
drain_ts IS NOT NULL AND
drain_complete_ts IS NULL AND
cloud_destroy_ts IS NULL
",
&pool_types,
ts,
)
.await?;

if servers.is_empty() {
return Ok(Vec::new());
}

// Fetch relevant datacenters
let datacenters_res = ctx
.op(cluster::ops::datacenter::get::Input {
datacenter_ids: servers
Expand All @@ -63,6 +56,7 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
})
.await?;

// Determine which servers are finished draining via their drain timeout
let drained_servers = servers
.into_iter()
.map(|server| {
Expand Down Expand Up @@ -100,7 +94,7 @@ pub async fn run_from_env(ts: i64, pools: rivet_pools::Pools) -> GlobalResult<()
return Ok(Vec::new());
}

tracing::info!(count=%drained_servers.len(), "servers done draining");
tracing::info!("{} servers done draining", drained_servers.len());

// Update servers that have completed draining
sql_execute!(
Expand Down

0 comments on commit 1828b2c

Please sign in to comment.