rivet-dev · graphite-app · Apr 18, 2024 · Apr 18, 2024
diff --git a/docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md b/docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md
@@ -0,0 +1,33 @@
+# Job draining & kill timeouts
+
+## Relevant Code
+
+| Name                           | Timeout                            | Reason                                                                                              | Location                                                                                                          |
+| ------------------------------ | ---------------------------------- | --------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------- |
+| Nomad client config            | Something really high              | Always be higher than anything passed to `datacenter.drain_timeout`                                 | `svc/pkg/cluster/worker/src/workers/server_install/install_scripts/files/nomad_configure.sh` (`max_kill_timeout`) |
+| Drain nomad job                | `datacenter.drain_timeout`         | How long the Nomad jobs have to stop                                                                | `svc/pkg/mm/worker/src/workers/lobby_create/nomad_job.rs` (`nodes_api::update_node_drain`)                        |
+| Nomad job kill timeout         | Something really high              | Always be higher than anything passed to `datacenter.drain_timeout`. We'll manually send `SIGKILL`. | `svc/pkg/mm/worker/src/workers/lobby_create/nomad_job.rs` (`kill_timeout`)                                        |
+| job-run-stop delete Nomad job  | Nomad job kill timeout (see above) | This causes Nomad to send a `SIGTERM`                                                               | `svc/pkg/mm/worker/src/workers/lobby_create/nomad_job.rs` (`kill_timeout`)                                        |
+| job-run-stop manually kill job | `util_job::JOB_STOP_TIMEOUT` (30s) | This lets us configure a lower kill timeout when manually stopping a job                            | `svc/pkg/job-run/worker/src/workers/stop.rs` (`allocations_api::signal_allocation`)                               |
+
+## Signals 101
+
+-   `SIGTERM` = graceful stop; jobs should catch this signal and shut down cleanly
+-   `SIGKILL` = hard stop; the process cannot catch, block, or ignore it
+
+## Node draining vs manually stopping a job
+
+### Node draining
+
+1. `nodes_api::update_node_drain`
+2. Sends `SIGTERM` to jobs
+   PROBLEM: jobs are only given 60s to shut down because of their `kill_timeout`
+3. Waits until the timeout
+4. Sends `SIGKILL` to any remaining jobs
+
+### Manually stopping a job
+
+1. `allocations_api::delete_job`, which causes Nomad to send `SIGTERM`
+2. Manually send `SIGKILL` after `util_job::JOB_STOP_TIMEOUT` if alloc still running
+    - This is less than the job's kill timeout
+    - If the worker crashes, job-gc will clean up the job later
diff --git a/svc/pkg/job-run/worker/src/workers/stop.rs b/svc/pkg/job-run/worker/src/workers/stop.rs
@@ -22,7 +22,8 @@ lazy_static::lazy_static! {
 		nomad_util::config_from_env().unwrap();
 }
 
-#[worker(name = "job-run-stop")]
+// Update timeout to give time for the timeout in `kill_allocation`
+#[worker(name = "job-run-stop", timeout = 90)]
 async fn worker(ctx: &OperationContext<job_run::msg::stop::Message>) -> GlobalResult<()> {
 	// NOTE: Idempotent
 
@@ -167,7 +168,9 @@ async fn update_db(
 	Ok(Some((run_row, run_meta_nomad_row)))
 }
 
-// Kills the allocation after 30 seconds
+/// Kills the allocation after `util_job::JOB_STOP_TIMEOUT` (30 seconds)
+///
+/// See `docs/packages/job/JOB_DRAINING_AND_KILL_TIMEOUTS.md`
 fn kill_allocation(nomad_region: String, alloc_id: String) {
 	task::spawn(async move {
 		tokio::time::sleep(util_job::JOB_STOP_TIMEOUT).await;