From 332f88c480a3d5686e0278a20cbd64d560c821b0 Mon Sep 17 00:00:00 2001 From: NathanFlurry Date: Thu, 18 Apr 2024 22:52:27 +0000 Subject: [PATCH] chore: doc drain & kill timeouts (#646) ## Changes --- .../job/JOB_DRAINING_AND_KILL_TIMEOUTS.md | 33 +++++++++++++++++++ svc/pkg/job-run/worker/src/workers/stop.rs | 7 ++-- 2 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md diff --git a/docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md b/docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md new file mode 100644 index 0000000000..e6885fb690 --- /dev/null +++ b/docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md @@ -0,0 +1,33 @@ +# Job draining & kill timeouts + +## Relevant Code + +| Name | Timeout | Reason | Location | +| ------------------------------ | ---------------------------------- | --------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | +| Nomad client config | Something really high | Always be higher than anything passed to `datacenter.drain_timeout` | `svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/nomad_configure.sh` (`max_kill_timeout`) | +| Drain nomad job | `datacenter.drain_timeout` | How long the Nomad jobs have to stop | `svc/pkg/mm/worker/src/workers/lobby_create/nomad_job.rs` (`nodes_api::update_node_drain`) | +| Nomad job kill timeout | Something really high | Always be higher than anything passed to `datacenter.drain_timeout`. We'll manually send `SIGKILL`. 
| `svc/pkg/mm/worker/src/workers/lobby_create/nomad_job.rs` (`kill_timeout`) | +| job-run-stop delete Nomad job | Nomad job kill timeout (see above) | This causes Nomad to send a `SIGTERM` | `svc/pkg/mm/worker/src/workers/lobby_create/nomad_job.rs` (`kill_timeout`) | +| job-run-stop manually kill job | `util_job::JOB_STOP_TIMEOUT` (30s) | This lets us configure a lower kill timeout when manually stopping a job | `svc/pkg/job-run/worker/src/workers/stop.rs` (`allocations_api::signal_allocation`) | + +## Signals 101 + +- `SIGTERM` = gracefully stop, jobs should handle this gracefully +- `SIGKILL` = hard stop, cannot be caught or handled by the process + +## Node draining vs manually stopping a job + +### Node draining + +1. `nodes_api::update_node_drain` +2. Sends `SIGTERM` to jobs + PROBLEM: jobs are only given 60s to shut down b/c of their `kill_timeout` +3. Waits until the timeout +4. Sends `SIGKILL` to any remaining jobs + +### Manually stopping a job + +1. `allocations_api::delete_job`, which causes Nomad to send `SIGTERM` +2. Manually send `SIGKILL` after `util_job::JOB_STOP_TIMEOUT` if alloc still running + - This is less than the job's kill timeout + - If the worker crashes, job-gc will clean up the job later diff --git a/svc/pkg/job-run/worker/src/workers/stop.rs b/svc/pkg/job-run/worker/src/workers/stop.rs index ba96e36966..5a234cea19 100644 --- a/svc/pkg/job-run/worker/src/workers/stop.rs +++ b/svc/pkg/job-run/worker/src/workers/stop.rs @@ -22,7 +22,8 @@ lazy_static::lazy_static! 
{ nomad_util::config_from_env().unwrap(); } -#[worker(name = "job-run-stop")] +// Update timeout to give time for the timeout in `kill_allocation` +#[worker(name = "job-run-stop", timeout = 90)] async fn worker(ctx: &OperationContext) -> GlobalResult<()> { // NOTE: Idempotent @@ -167,7 +168,9 @@ async fn update_db( Ok(Some((run_row, run_meta_nomad_row))) } -// Kills the allocation after 30 seconds +/// Kills the allocation after 30 seconds +/// +/// See `docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md` fn kill_allocation(nomad_region: String, alloc_id: String) { task::spawn(async move { tokio::time::sleep(util_job::JOB_STOP_TIMEOUT).await;