From 0c58f835529da2cb0f8883425a6cbe824d12d759 Mon Sep 17 00:00:00 2001 From: MasterPtato <23087326+MasterPtato@users.noreply.github.com> Date: Mon, 19 Aug 2024 03:56:04 +0000 Subject: [PATCH] fix(servers): use correct timeout for sleeping (#1076) ## Changes --- svc/api/servers/src/route/servers.rs | 15 +++++++++++++- svc/pkg/ds/src/workflows/server/destroy.rs | 23 ++++++++++++++++------ svc/pkg/ds/src/workflows/server/mod.rs | 2 +- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/svc/api/servers/src/route/servers.rs b/svc/api/servers/src/route/servers.rs index a1743e86e3..33e5014865 100644 --- a/svc/api/servers/src/route/servers.rs +++ b/svc/api/servers/src/route/servers.rs @@ -180,6 +180,19 @@ pub async fn destroy( assert::server_for_env(&ctx, server_id, game_id, env_id).await?; + ensure_with!( + query.override_kill_timeout.unwrap_or(0) >= 0, + API_BAD_QUERY_PARAMETER, + parameter = "override_kill_timeout", + error = "must be positive" + ); + ensure_with!( + query.override_kill_timeout.unwrap_or(0) < 2 * 60 * 60 * 1000, + API_BAD_QUERY_PARAMETER, + parameter = "override_kill_timeout", + error = "cannot be longer than 2 hours" + ); + let mut sub = ctx .subscribe::(&json!({ "server_id": server_id, @@ -191,7 +204,7 @@ pub async fn destroy( "server_id": server_id, }), ds::workflows::server::Destroy { - override_kill_timeout_ms: query.override_kill_timeout.unwrap_or_default(), + override_kill_timeout_ms: query.override_kill_timeout, }, ) .await?; diff --git a/svc/pkg/ds/src/workflows/server/destroy.rs b/svc/pkg/ds/src/workflows/server/destroy.rs index 2ea61802f7..f42ca325d4 100644 --- a/svc/pkg/ds/src/workflows/server/destroy.rs +++ b/svc/pkg/ds/src/workflows/server/destroy.rs @@ -1,3 +1,5 @@ +use std::convert::TryInto; + use chirp_workflow::prelude::*; use futures_util::FutureExt; use serde_json::json; @@ -13,7 +15,7 @@ pub struct DestroyComplete {} #[derive(Debug, Serialize, Deserialize)] pub(crate) struct Input { pub server_id: Uuid, - pub override_kill_timeout_ms: i64, + pub override_kill_timeout_ms: Option, } #[workflow] @@ -43,6 +45,9 @@ pub(crate) async fn ds_server_destroy(ctx: &mut WorkflowCtx, input: &Input) -> G if let Some(alloc_id) = &dynamic_server.alloc_id { ctx.activity(KillAllocInput { alloc_id: alloc_id.clone(), + kill_timeout_ms: input + .override_kill_timeout_ms + .unwrap_or(dynamic_server.kill_timeout_ms), }) .await?; } @@ -69,6 +74,7 @@ struct UpdateDbInput { struct UpdateDbOutput { server_id: Uuid, datacenter_id: Uuid, + kill_timeout_ms: i64, dispatched_job_id: Option, alloc_id: Option, } @@ -96,6 +102,7 @@ async fn update_db(ctx: &ActivityCtx, input: &UpdateDbInput) -> GlobalResult GlobalResult GlobalResult<()> { - // Kills the allocation after 30 seconds - // - // See `docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md` - // TODO: Move this to a workflow sleep RVTEE-497 - tokio::time::sleep(util_job::JOB_STOP_TIMEOUT).await; + tokio::time::sleep(std::time::Duration::from_millis( + input.kill_timeout_ms.try_into()?, + )) + .await; // TODO: Handle 404 safely. See RVTEE-498 if let Err(err) = signal_allocation( diff --git a/svc/pkg/ds/src/workflows/server/mod.rs b/svc/pkg/ds/src/workflows/server/mod.rs index 3245de0344..f2ce75d247 100644 --- a/svc/pkg/ds/src/workflows/server/mod.rs +++ b/svc/pkg/ds/src/workflows/server/mod.rs @@ -1103,7 +1103,7 @@ pub struct CreateFailed {} #[signal("ds_server_destroy")] pub struct Destroy { - pub override_kill_timeout_ms: i64, + pub override_kill_timeout_ms: Option, } /// Choose which port to assign for a job's ingress port.