From 332f88c480a3d5686e0278a20cbd64d560c821b0 Mon Sep 17 00:00:00 2001
From: NathanFlurry <git@nathanflurry.com>
Date: Thu, 18 Apr 2024 22:52:27 +0000
Subject: [PATCH] chore: doc drain & kill timeouts (#646)

<!-- Please make sure there is an issue that this PR is correlated to. -->

## Changes

<!-- If there are frontend changes, please include screenshots. -->
---
 .../job/JOB_DRAINING_AND_KILL_TIMEOUTS.md     | 33 +++++++++++++++++++
 svc/pkg/job-run/worker/src/workers/stop.rs    |  7 ++--
 2 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md

diff --git a/docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md b/docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md
new file mode 100644
index 0000000000..e6885fb690
--- /dev/null
+++ b/docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md
@@ -0,0 +1,33 @@
+# Job draining & kill timeouts
+
+## Relevant Code
+
+| Name                           | Timeout                            | Reason                                                                                              | Location                                                                                                          |
+| ------------------------------ | ---------------------------------- | --------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
+| Nomad client config            | Something really high              | Always be higher than anything passed to `datacenter.drain_timeout`                                 | `svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/nomad_configure.sh` (`max_kill_timeout`) |
+| Drain nomad job                | `datacenter.drain_timeout`         | How long the Nomad jobs have to stop                                                                | `svc/pkg/mm/worker/src/workers/lobby_create/nomad_job.rs` (`nodes_api::update_node_drain`)                        |
+| Nomad job kill timeout         | Something really high              | Always be higher than anything passed to `datacenter.drain_timeout`. We'll manually send `SIGKILL`. | `svc/pkg/mm/worker/src/workers/lobby_create/nomad_job.rs` (`kill_timeout`)                                        |
+| job-run-stop delete Nomad job  | Nomad job kill timeout (see above) | This causes Nomad to send a `SIGTERM`                                                               | `svc/pkg/mm/worker/src/workers/lobby_create/nomad_job.rs` (`kill_timeout`)                                        |
+| job-run-stop manually kill job | `util_job::JOB_STOP_TIMEOUT` (30s) | This lets us configure a lower kill timeout when manually stopping a job                            | `svc/pkg/job-run/worker/src/workers/stop.rs` (`allocations_api::signal_allocation`)                               |
+
+## Signals 101
+
+-   `SIGTERM` = gracefully stop, jobs should handle this gracefully
+-   `SIGKILL` = hard stop, cannot be caught or handled by the process
+
+## Node draining vs manually stopping a job
+
+### Node draining
+
+1. `nodes_api::update_node_drain`
+2. Sends `SIGTERM` to jobs
+   PROBLEM: jobs are only given 60s to shut down b/c of their `kill_timeout`
+3. Waits until the timeout
+4. Sends `SIGKILL` to any remaining jobs
+
+### Manually stopping a job
+
+1. `allocations_api::delete_job`, which causes Nomad to send `SIGTERM`
+2. Manually send `SIGKILL` after `util_job::JOB_STOP_TIMEOUT` if alloc still running
+    - This is less than the job's kill timeout
+    - If the worker crashes, job-gc will clean up the job later
diff --git a/svc/pkg/job-run/worker/src/workers/stop.rs b/svc/pkg/job-run/worker/src/workers/stop.rs
index ba96e36966..5a234cea19 100644
--- a/svc/pkg/job-run/worker/src/workers/stop.rs
+++ b/svc/pkg/job-run/worker/src/workers/stop.rs
@@ -22,7 +22,8 @@ lazy_static::lazy_static! {
 		nomad_util::config_from_env().unwrap();
 }
 
-#[worker(name = "job-run-stop")]
+// Raise the worker timeout to leave room for the kill delay in `kill_allocation`
+#[worker(name = "job-run-stop", timeout = 90)]
 async fn worker(ctx: &OperationContext<job_run::msg::stop::Message>) -> GlobalResult<()> {
 	// NOTE: Idempotent
 
@@ -167,7 +168,9 @@ async fn update_db(
 	Ok(Some((run_row, run_meta_nomad_row)))
 }
 
-// Kills the allocation after 30 seconds
+/// Kills the allocation after 30 seconds
+///
+/// See `docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md`
 fn kill_allocation(nomad_region: String, alloc_id: String) {
 	task::spawn(async move {
 		tokio::time::sleep(util_job::JOB_STOP_TIMEOUT).await;