From d10ca551238e4a0fbab1a87bb2aee4815bab0f16 Mon Sep 17 00:00:00 2001 From: Thanh Ha Date: Mon, 25 Aug 2025 10:22:05 -0400 Subject: [PATCH] Split out the Meta AWS H100 to a separate check The AWS H100 check regularly flaps due to it being a limited resource, and it seems it can regularly queue for 3 or 4 hrs, even occasionally going as high as 5-6 hrs. Split the check out and set the check time to 4 hrs so that it hopefully flaps less. Signed-off-by: Thanh Ha --- datadog-synthetics_tests.tf | 29 +++++++++++++++++++++++++++ scripts/check-long-queue-meta-h100.js | 19 ++++++++++++++++++ scripts/check-long-queue-meta.js | 2 +- 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 scripts/check-long-queue-meta-h100.js diff --git a/datadog-synthetics_tests.tf b/datadog-synthetics_tests.tf index 03ee55a..5a60395 100644 --- a/datadog-synthetics_tests.tf +++ b/datadog-synthetics_tests.tf @@ -428,6 +428,35 @@ EOT } } +resource "datadog_synthetics_test" "pytorch-gha-runners-queue-check-meta-h100" { + type = "api" + name = "GHA Runner Queue Check - Meta Runners - AWS H100" + message = < item.machine_type.startsWith(MACHINE_TYPE_FILTER) && item.avg_queue_s > 14400) + .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); + +if (highQueueItems.length > 0) { + const machineDetails = highQueueItems + .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) + .join(', '); + const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; + console.error(message); +} + +dd.expect(highQueueItems.length > 0).to.be.false; diff --git a/scripts/check-long-queue-meta.js b/scripts/check-long-queue-meta.js index aa9ae8d..e88e4a8 100644 --- a/scripts/check-long-queue-meta.js +++ b/scripts/check-long-queue-meta.js @@ -1,5 +1,5 @@ dd.expect(dd.response.statusCode).to.equal(200); -const EXCLUDED_MACHINE_PATTERNS = ['.dgx.', '.rocm.', '.s390x', '^lf\\.']; +const EXCLUDED_MACHINE_PATTERNS = ['.dgx.', 
'.rocm.', '.s390x', '^lf\\.', '^linux.aws.h100']; const jsonData = dd.response.body; const parsedData = JSON.parse(jsonData); const highQueueItems = parsedData