Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fern/definition/admin/clusters/common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ types:
desired_count: integer
min_count: integer
max_count: integer
drain_timeout: long
drain_timeout_ms: long

Hardware:
properties:
Expand Down
3 changes: 3 additions & 0 deletions lib/bolt/cli/src/commands/cluster/datacenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,8 @@ mod render {
#[tabled(display_with = "display_pool_type")]
pub pool_type: Option<models::AdminClustersPoolType>,
#[tabled(display_with = "display_option")]
pub drain_timeout: Option<String>,
#[tabled(display_with = "display_option")]
pub min_count: Option<i32>,
#[tabled(display_with = "display_option")]
pub desired_count: Option<i32>,
Expand All @@ -303,6 +305,7 @@ mod render {
.chain(d.pools.iter().cloned().map(|pool| DcTableRow {
pool: PoolTableRow {
pool_type: Some(pool.pool_type),
drain_timeout: Some(format!("{}s", pool.drain_timeout / 1000)),
min_count: Some(pool.min_count),
desired_count: Some(pool.desired_count),
max_count: Some(pool.max_count),
Expand Down
44 changes: 41 additions & 3 deletions svc/pkg/cluster/src/workflows/server/drain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ use serde_json::json;

use crate::types::PoolType;

// In ms, a small amount of time to separate the completion of the drain in Nomad to the deletion of the
// cluster server. We want the Nomad drain to complete first.
const NOMAD_DRAIN_PADDING: u64 = 10000;

lazy_static::lazy_static! {
static ref NOMAD_CONFIG: Configuration = nomad_util::new_config_from_env().unwrap();
}
Expand All @@ -17,18 +21,24 @@ pub(crate) struct Input {
pub datacenter_id: Uuid,
pub server_id: Uuid,
pub pool_type: PoolType,
pub drain_timeout: u64,
}

#[workflow]
pub(crate) async fn cluster_server_drain(ctx: &mut WorkflowCtx, input: &Input) -> GlobalResult<()> {
let drain_timeout = ctx
.activity(GetDrainTimeoutInput {
datacenter_id: input.datacenter_id,
pool_type: input.pool_type.clone(),
})
.await?;

match input.pool_type {
PoolType::Job => {
let started_drain = ctx
.activity(DrainNodeInput {
datacenter_id: input.datacenter_id,
server_id: input.server_id,
drain_timeout: input.drain_timeout,
drain_timeout,
})
.await?;

Expand Down Expand Up @@ -57,6 +67,32 @@ pub(crate) async fn cluster_server_drain(ctx: &mut WorkflowCtx, input: &Input) -
Ok(())
}

#[derive(Debug, Serialize, Deserialize, Hash)]
pub(crate) struct GetDrainTimeoutInput {
pub datacenter_id: Uuid,
pub pool_type: PoolType,
}

#[activity(GetDrainTimeout)]
pub(crate) async fn get_drain_timeout(
ctx: &ActivityCtx,
input: &GetDrainTimeoutInput,
) -> GlobalResult<u64> {
let dcs_res = ctx
.op(crate::ops::datacenter::get::Input {
datacenter_ids: vec![input.datacenter_id],
})
.await?;
let dc = unwrap!(dcs_res.datacenters.into_iter().next());

let pool = unwrap!(
dc.pools.iter().find(|p| p.pool_type == input.pool_type),
"datacenter does not have this type of pool configured"
);

Ok(pool.drain_timeout)
}

#[derive(Debug, Serialize, Deserialize, Hash)]
struct DrainNodeInput {
datacenter_id: Uuid,
Expand Down Expand Up @@ -85,7 +121,9 @@ async fn drain_node(ctx: &ActivityCtx, input: &DrainNodeInput) -> GlobalResult<b
models::NodeUpdateDrainRequest {
drain_spec: Some(Box::new(models::DrainSpec {
// In nanoseconds. `drain_timeout` must be below 292 years for this to not overflow
deadline: Some((input.drain_timeout * 1000000) as i64),
deadline: Some(
(input.drain_timeout.saturating_sub(NOMAD_DRAIN_PADDING) * 1000000) as i64,
),
ignore_system_jobs: None,
})),
mark_eligible: None,
Expand Down
1 change: 0 additions & 1 deletion svc/pkg/cluster/src/workflows/server/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ pub(crate) async fn cluster_server(ctx: &mut WorkflowCtx, input: &Input) -> Glob
datacenter_id: input.datacenter_id,
server_id: input.server_id,
pool_type: input.pool_type.clone(),
drain_timeout: pool.drain_timeout,
})
.await?;
}
Expand Down