From f0b9a22d4d1a5e7336d84d6a30ee96956df496d4 Mon Sep 17 00:00:00 2001 From: Nathan Flurry Date: Thu, 18 Jan 2024 22:24:32 +0000 Subject: [PATCH] Dynamically sleep for Traefik config update in mm-lobby-ready-set (#363) ## Changes --- CHANGELOG.md | 1 + svc/pkg/job-run/worker/src/workers/create/mod.rs | 13 ++++++++++--- svc/pkg/mm/worker/src/workers/lobby_ready_set.rs | 16 ++++++++++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b36d5ec307..3056c7b9a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **api-helper** Box path futures for faster compile times - Upgrade `async-nats` - `test-mm-lobby-echo` now handles `SIGTERM` and exits immediately, allows for less resource consumption while testing lobbies +- **mm** Dynamically sleep based on lobby's `create_ts` for Traefik config to update ### Security diff --git a/svc/pkg/job-run/worker/src/workers/create/mod.rs b/svc/pkg/job-run/worker/src/workers/create/mod.rs index c30f588666..453ad8e9ac 100644 --- a/svc/pkg/job-run/worker/src/workers/create/mod.rs +++ b/svc/pkg/job-run/worker/src/workers/create/mod.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use chirp_worker::prelude::*; use proto::backend::{self, pkg::*}; +use tokio::time::Duration; mod create_job; @@ -10,6 +11,13 @@ mod create_job; const MAX_PARAMETER_KEY_LEN: usize = 64; const MAX_PARAMETER_VALUE_LEN: usize = 8_192; // 8 KB +/// HACK: Give the Traefik load balancer time to complete before considering the lobby ready. +/// +/// Traefik updates every 500 ms and we give an extra 500 ms for grace. +/// +/// See also svc/pkg/mm/worker/src/workers/lobby_ready_set.rs TRAEFIK_GRACE_MS. +const TRAEFIK_GRACE: Duration = Duration::from_millis(1_000); + lazy_static::lazy_static! 
{ static ref NOMAD_CONFIG: nomad_client::apis::configuration::Configuration = nomad_util::config_from_env().unwrap(); @@ -127,9 +135,8 @@ async fn worker(ctx: &OperationContext) -> Global write_to_db_after_run(&ctx, run_id, &nomad_dispatched_job_id).await?; db_write_perf.end(); - // HACK: Wait for Treafik to pick up the new job. 500 ms is the polling interval for the - // Traefik HTTP provider. - tokio::time::sleep(std::time::Duration::from_millis(500)).await; + // See TRAEFIK_GRACE + tokio::time::sleep(TRAEFIK_GRACE).await; msg!([ctx] job_run::msg::create_complete(run_id) { run_id: Some(run_id.into()), diff --git a/svc/pkg/mm/worker/src/workers/lobby_ready_set.rs b/svc/pkg/mm/worker/src/workers/lobby_ready_set.rs index b07eca7f58..417c9feeb6 100644 --- a/svc/pkg/mm/worker/src/workers/lobby_ready_set.rs +++ b/svc/pkg/mm/worker/src/workers/lobby_ready_set.rs @@ -1,6 +1,15 @@ use chirp_worker::prelude::*; use proto::backend::pkg::*; use serde_json::json; +use tokio::time::Duration; + +/// HACK: Give the Traefik load balancer time to complete before considering the lobby ready. +/// +/// Traefik updates every 500 ms and we give an extra 500 ms for grace. +/// +/// See also svc/pkg/job-run/worker/src/workers/create/mod.rs TRAEFIK_GRACE +// const TRAEFIK_GRACE_MS: i64 = 750; +const TRAEFIK_GRACE_MS: i64 = 100; lazy_static::lazy_static! { static ref REDIS_SCRIPT: redis::Script = redis::Script::new(include_str!("../../redis-scripts/lobby_ready_set.lua")); @@ -53,6 +62,13 @@ async fn worker(ctx: &OperationContext) -> Global } }; + // See TRAEFIK_GRACE_MS + let traefik_grace_ms = TRAEFIK_GRACE_MS - (util::timestamp::now() - lobby_row.create_ts); + if traefik_grace_ms > 0 { + tracing::info!(traefik_grace_ms, "sleeping for traefik grace"); + tokio::time::sleep(Duration::from_millis(traefik_grace_ms as u64)).await; + } + msg!([ctx] mm::msg::lobby_ready_complete(lobby_id) { lobby_id: Some(lobby_id.into()), })