From 7fa7ebfe5a8749161577c839b654826a78645455 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Tue, 10 Sep 2024 04:02:21 +0000 Subject: [PATCH] chore: increase nomad heartbeat ttl --- infra/tf/k8s_infra/nomad.tf | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/infra/tf/k8s_infra/nomad.tf b/infra/tf/k8s_infra/nomad.tf index bdc5a63709..afea07de97 100644 --- a/infra/tf/k8s_infra/nomad.tf +++ b/infra/tf/k8s_infra/nomad.tf @@ -37,6 +37,23 @@ locals { retry_join = [${join(", ", local.nomad_server_addrs_escaped)}] retry_interval = "10s" } + + # Increase grace period for intermittent cross-region + # connection issues. These are usually compounded by the + # fact that everything is sent through a tunnel, requiring 2x + # the latency in order to reconnect to the server. + # + # We kill all jobs on a node when the jobs are lost, so we want to be + # extra generous about considering a node as lost. + # + # The default behavior is 10s for the heartbeat TTL + 10s for + # heartbeat grace. + # + # This behavior will send a heartbeat every 15s but wait 2m + # before considering a node as lost and killing all + # allocations. + heartbeat_grace = "2m" + min_heartbeat_ttl = "15s" } telemetry {