From 01ee21b5da0b46b8aa2356ced8af3b6d9966ba7e Mon Sep 17 00:00:00 2001 From: MasterPtato <23087326+MasterPtato@users.noreply.github.com> Date: Tue, 3 Sep 2024 23:00:54 +0000 Subject: [PATCH] fix(clusters): add drain padding to nomad (#1100) ## Changes --- fern/definition/admin/clusters/common.yml | 2 +- .../cli/src/commands/cluster/datacenter.rs | 3 ++ svc/pkg/cluster/src/workflows/server/drain.rs | 44 +++++++++++++++++-- svc/pkg/cluster/src/workflows/server/mod.rs | 1 - 4 files changed, 45 insertions(+), 5 deletions(-) diff --git a/fern/definition/admin/clusters/common.yml b/fern/definition/admin/clusters/common.yml index 210f67863b..5c64e49cc0 100644 --- a/fern/definition/admin/clusters/common.yml +++ b/fern/definition/admin/clusters/common.yml @@ -23,7 +23,7 @@ types: desired_count: integer min_count: integer max_count: integer - drain_timeout: long + drain_timeout_ms: long Hardware: properties: diff --git a/lib/bolt/cli/src/commands/cluster/datacenter.rs b/lib/bolt/cli/src/commands/cluster/datacenter.rs index ebc9250ea9..771227735b 100644 --- a/lib/bolt/cli/src/commands/cluster/datacenter.rs +++ b/lib/bolt/cli/src/commands/cluster/datacenter.rs @@ -282,6 +282,8 @@ mod render { #[tabled(display_with = "display_pool_type")] pub pool_type: Option, #[tabled(display_with = "display_option")] + pub drain_timeout: Option, + #[tabled(display_with = "display_option")] pub min_count: Option, #[tabled(display_with = "display_option")] pub desired_count: Option, @@ -303,6 +305,7 @@ mod render { .chain(d.pools.iter().cloned().map(|pool| DcTableRow { pool: PoolTableRow { pool_type: Some(pool.pool_type), + drain_timeout: Some(format!("{}s", pool.drain_timeout / 1000)), min_count: Some(pool.min_count), desired_count: Some(pool.desired_count), max_count: Some(pool.max_count), diff --git a/svc/pkg/cluster/src/workflows/server/drain.rs b/svc/pkg/cluster/src/workflows/server/drain.rs index 73c623fefa..87101b3aca 100644 --- a/svc/pkg/cluster/src/workflows/server/drain.rs +++ 
b/svc/pkg/cluster/src/workflows/server/drain.rs @@ -8,6 +8,10 @@ use serde_json::json; use crate::types::PoolType; +// In ms, a small amount of time to separate the completion of the drain in Nomad from the deletion of the +// cluster server. We want the Nomad drain to complete first. +const NOMAD_DRAIN_PADDING: u64 = 10000; + lazy_static::lazy_static! { static ref NOMAD_CONFIG: Configuration = nomad_util::new_config_from_env().unwrap(); } @@ -17,18 +21,24 @@ pub(crate) struct Input { pub datacenter_id: Uuid, pub server_id: Uuid, pub pool_type: PoolType, - pub drain_timeout: u64, } #[workflow] pub(crate) async fn cluster_server_drain(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResult<()> { + let drain_timeout = ctx + .activity(GetDrainTimeoutInput { + datacenter_id: input.datacenter_id, + pool_type: input.pool_type.clone(), + }) + .await?; + match input.pool_type { PoolType::Job => { let started_drain = ctx .activity(DrainNodeInput { datacenter_id: input.datacenter_id, server_id: input.server_id, - drain_timeout: input.drain_timeout, + drain_timeout, }) .await?; @@ -57,6 +67,32 @@ pub(crate) async fn cluster_server_drain(ctx: &mut WorkflowCtx, input: &Input) - Ok(()) } +#[derive(Debug, Serialize, Deserialize, Hash)] +pub(crate) struct GetDrainTimeoutInput { + pub datacenter_id: Uuid, + pub pool_type: PoolType, +} + +#[activity(GetDrainTimeout)] +pub(crate) async fn get_drain_timeout( + ctx: &ActivityCtx, + input: &GetDrainTimeoutInput, +) -> GlobalResult<u64> { + let dcs_res = ctx + .op(crate::ops::datacenter::get::Input { + datacenter_ids: vec![input.datacenter_id], + }) + .await?; + let dc = unwrap!(dcs_res.datacenters.into_iter().next()); + + let pool = unwrap!( + dc.pools.iter().find(|p| p.pool_type == input.pool_type), + "datacenter does not have this type of pool configured" + ); + + Ok(pool.drain_timeout) +} + #[derive(Debug, Serialize, Deserialize, Hash)] struct DrainNodeInput { datacenter_id: Uuid, @@ -85,7 +121,9 @@ async fn drain_node(ctx: &ActivityCtx,
input: &DrainNodeInput) -> GlobalResult Glob datacenter_id: input.datacenter_id, server_id: input.server_id, pool_type: input.pool_type.clone(), - drain_timeout: pool.drain_timeout, }) .await?; }