From e9e29820f7b10667372be97821c20f6e61e1f4cf Mon Sep 17 00:00:00 2001 From: Thanh Ha Date: Thu, 4 Sep 2025 10:39:08 -0400 Subject: [PATCH] Revert "Do not fail check if PyTorch HUD API is down (#37)" This change causes more flapping than it was intended to prevent. I think the change 739272347fa3ce268214516c3d92a17fd74ddef0 does a better job of handling the flapping, so let's revert this one. This reverts commit a928f56b0792aee10d1e9e9ff531e844593d4b71. --- scripts/check-long-queue-intel.js | 35 +++++++++---------- scripts/check-long-queue-lf.js | 35 +++++++++---------- scripts/check-long-queue-meta-h100.js | 35 +++++++++---------- scripts/check-long-queue-meta.js | 48 ++++++++++++--------------- scripts/check-long-queue-nvidia.js | 35 +++++++++---------- scripts/check-long-queue-rocm.js | 35 +++++++++---------- scripts/check-long-queue-s390x.js | 35 +++++++++---------- 7 files changed, 111 insertions(+), 147 deletions(-) diff --git a/scripts/check-long-queue-intel.js b/scripts/check-long-queue-intel.js index ae44dc4..40cc5a6 100644 --- a/scripts/check-long-queue-intel.js +++ b/scripts/check-long-queue-intel.js @@ -1,24 +1,19 @@ -if (dd.response.statusCode !== 200) { - // We do not want to fail due to hud.pytorch.org API failure. 
- console.log('Status code is not 200, stopping execution'); - dd.expect(true).to.equal(true); -} -else { - const MACHINE_TYPE_FILTER = '.idc.'; - const jsonData = dd.response.body; - const parsedData = JSON.parse(jsonData); +dd.expect(dd.response.statusCode).to.equal(200); - const highQueueItems = parsedData - .filter(item => item.machine_type.includes(MACHINE_TYPE_FILTER) && item.avg_queue_s > 10800) - .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); +const MACHINE_TYPE_FILTER = '.idc.'; +const jsonData = dd.response.body; +const parsedData = JSON.parse(jsonData); - if (highQueueItems.length > 0) { - const machineDetails = highQueueItems - .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) - .join(', '); - const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; - console.error(message); - } +const highQueueItems = parsedData + .filter(item => item.machine_type.includes(MACHINE_TYPE_FILTER) && item.avg_queue_s > 10800) + .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); - dd.expect(highQueueItems.length > 0).to.be.false; +if (highQueueItems.length > 0) { + const machineDetails = highQueueItems + .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) + .join(', '); + const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; + console.error(message); } + +dd.expect(highQueueItems.length > 0).to.be.false; diff --git a/scripts/check-long-queue-lf.js b/scripts/check-long-queue-lf.js index 682cde4..b090c29 100644 --- a/scripts/check-long-queue-lf.js +++ b/scripts/check-long-queue-lf.js @@ -1,24 +1,19 @@ -if (dd.response.statusCode !== 200) { - // We do not want to fail due to hud.pytorch.org API failure. 
- console.log('Status code is not 200, stopping execution'); - dd.expect(true).to.equal(true); -} -else { - const MACHINE_TYPE_FILTER = 'lf.'; - const jsonData = dd.response.body; - const parsedData = JSON.parse(jsonData); +dd.expect(dd.response.statusCode).to.equal(200); - const highQueueItems = parsedData - .filter(item => item.machine_type.startsWith(MACHINE_TYPE_FILTER) && item.avg_queue_s > 10800) - .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); +const MACHINE_TYPE_FILTER = 'lf.'; +const jsonData = dd.response.body; +const parsedData = JSON.parse(jsonData); - if (highQueueItems.length > 0) { - const machineDetails = highQueueItems - .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) - .join(', '); - const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; - console.error(message); - } +const highQueueItems = parsedData + .filter(item => item.machine_type.startsWith(MACHINE_TYPE_FILTER) && item.avg_queue_s > 10800) + .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); - dd.expect(highQueueItems.length > 0).to.be.false; +if (highQueueItems.length > 0) { + const machineDetails = highQueueItems + .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) + .join(', '); + const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; + console.error(message); } + +dd.expect(highQueueItems.length > 0).to.be.false; diff --git a/scripts/check-long-queue-meta-h100.js b/scripts/check-long-queue-meta-h100.js index 74f06b7..a284913 100644 --- a/scripts/check-long-queue-meta-h100.js +++ b/scripts/check-long-queue-meta-h100.js @@ -1,24 +1,19 @@ -if (dd.response.statusCode !== 200) { - // We do not want to fail due to hud.pytorch.org API failure. 
- console.log('Status code is not 200, stopping execution'); - dd.expect(true).to.equal(true); -} -else { - const MACHINE_TYPE_FILTER = 'linux.aws.h100'; - const jsonData = dd.response.body; - const parsedData = JSON.parse(jsonData); +dd.expect(dd.response.statusCode).to.equal(200); - const highQueueItems = parsedData - .filter(item => item.machine_type === MACHINE_TYPE_FILTER && item.avg_queue_s > 21600) - .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); +const MACHINE_TYPE_FILTER = 'linux.aws.h100'; +const jsonData = dd.response.body; +const parsedData = JSON.parse(jsonData); - if (highQueueItems.length > 0) { - const machineDetails = highQueueItems - .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) - .join(', '); - const message = `High queue detected for machine type ${MACHINE_TYPE_FILTER}: ${machineDetails}`; - console.error(message); - } +const highQueueItems = parsedData + .filter(item => item.machine_type === MACHINE_TYPE_FILTER && item.avg_queue_s > 21600) + .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); - dd.expect(highQueueItems.length > 0).to.be.false; +if (highQueueItems.length > 0) { + const machineDetails = highQueueItems + .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) + .join(', '); + const message = `High queue detected for machine type ${MACHINE_TYPE_FILTER}: ${machineDetails}`; + console.error(message); } + +dd.expect(highQueueItems.length > 0).to.be.false; diff --git a/scripts/check-long-queue-meta.js b/scripts/check-long-queue-meta.js index b1268d4..6ee37a6 100644 --- a/scripts/check-long-queue-meta.js +++ b/scripts/check-long-queue-meta.js @@ -1,28 +1,22 @@ -if (dd.response.statusCode !== 200) { - // We do not want to fail due to hud.pytorch.org API failure. 
- console.log('Status code is not 200, stopping execution'); - dd.expect(true).to.equal(true); -} -else { - const EXCLUDED_MACHINE_PATTERNS = ['.dgx.', '.idc.', '.rocm.', '.s390x', '^lf\\.', '^linux.aws.h100']; - const jsonData = dd.response.body; - const parsedData = JSON.parse(jsonData); - const highQueueItems = parsedData - .filter(item => { - const machineType = item.machine_type; - return !EXCLUDED_MACHINE_PATTERNS.some(pattern => - pattern.startsWith('^') ? - new RegExp(pattern).test(machineType) : - machineType.includes(pattern) - ) && item.avg_queue_s > 10800; - }) - .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); - if (highQueueItems.length > 0) { - const machineDetails = highQueueItems - .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) - .join(', '); - const message = `High queue detected for machine types: ${machineDetails}`; - console.error(message); - } - dd.expect(highQueueItems.length > 0).to.be.false; +dd.expect(dd.response.statusCode).to.equal(200); +const EXCLUDED_MACHINE_PATTERNS = ['.dgx.', '.idc.', '.rocm.', '.s390x', '^lf\\.', '^linux.aws.h100']; +const jsonData = dd.response.body; +const parsedData = JSON.parse(jsonData); +const highQueueItems = parsedData + .filter(item => { + const machineType = item.machine_type; + return !EXCLUDED_MACHINE_PATTERNS.some(pattern => + pattern.startsWith('^') ? 
+ new RegExp(pattern).test(machineType) : + machineType.includes(pattern) + ) && item.avg_queue_s > 10800; + }) + .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); +if (highQueueItems.length > 0) { + const machineDetails = highQueueItems + .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) + .join(', '); + const message = `High queue detected for machine types: ${machineDetails}`; + console.error(message); } +dd.expect(highQueueItems.length > 0).to.be.false; diff --git a/scripts/check-long-queue-nvidia.js b/scripts/check-long-queue-nvidia.js index 879fe43..514e234 100644 --- a/scripts/check-long-queue-nvidia.js +++ b/scripts/check-long-queue-nvidia.js @@ -1,24 +1,19 @@ -if (dd.response.statusCode !== 200) { - // We do not want to fail due to hud.pytorch.org API failure. - console.log('Status code is not 200, stopping execution'); - dd.expect(true).to.equal(true); -} -else { - const MACHINE_TYPE_FILTER = '.dgx.'; - const jsonData = dd.response.body; - const parsedData = JSON.parse(jsonData); +dd.expect(dd.response.statusCode).to.equal(200); - const highQueueItems = parsedData - .filter(item => item.machine_type.includes(MACHINE_TYPE_FILTER) && item.avg_queue_s > 10800) - .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); +const MACHINE_TYPE_FILTER = '.dgx.'; +const jsonData = dd.response.body; +const parsedData = JSON.parse(jsonData); - if (highQueueItems.length > 0) { - const machineDetails = highQueueItems - .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) - .join(', '); - const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; - console.error(message); - } +const highQueueItems = parsedData + .filter(item => item.machine_type.includes(MACHINE_TYPE_FILTER) && item.avg_queue_s > 10800) + .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); - dd.expect(highQueueItems.length > 0).to.be.false; +if 
(highQueueItems.length > 0) { + const machineDetails = highQueueItems + .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) + .join(', '); + const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; + console.error(message); } + +dd.expect(highQueueItems.length > 0).to.be.false; diff --git a/scripts/check-long-queue-rocm.js b/scripts/check-long-queue-rocm.js index e63dde6..d9c0e89 100644 --- a/scripts/check-long-queue-rocm.js +++ b/scripts/check-long-queue-rocm.js @@ -1,24 +1,19 @@ -if (dd.response.statusCode !== 200) { - // We do not want to fail due to hud.pytorch.org API failure. - console.log('Status code is not 200, stopping execution'); - dd.expect(true).to.equal(true); -} -else { - const MACHINE_TYPE_FILTER = '.rocm.'; - const jsonData = dd.response.body; - const parsedData = JSON.parse(jsonData); +dd.expect(dd.response.statusCode).to.equal(200); - const highQueueItems = parsedData - .filter(item => item.machine_type.includes(MACHINE_TYPE_FILTER) && item.avg_queue_s > 14400) - .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); +const MACHINE_TYPE_FILTER = '.rocm.'; +const jsonData = dd.response.body; +const parsedData = JSON.parse(jsonData); - if (highQueueItems.length > 0) { - const machineDetails = highQueueItems - .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) - .join(', '); - const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; - console.error(message); - } +const highQueueItems = parsedData + .filter(item => item.machine_type.includes(MACHINE_TYPE_FILTER) && item.avg_queue_s > 14400) + .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); - dd.expect(highQueueItems.length > 0).to.be.false; +if (highQueueItems.length > 0) { + const machineDetails = highQueueItems + .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) + .join(', '); + const message = `High 
queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; + console.error(message); } + +dd.expect(highQueueItems.length > 0).to.be.false; diff --git a/scripts/check-long-queue-s390x.js b/scripts/check-long-queue-s390x.js index 6b38317..8e4713b 100644 --- a/scripts/check-long-queue-s390x.js +++ b/scripts/check-long-queue-s390x.js @@ -1,24 +1,19 @@ -if (dd.response.statusCode !== 200) { - // We do not want to fail due to hud.pytorch.org API failure. - console.log('Status code is not 200, stopping execution'); - dd.expect(true).to.equal(true); -} -else { - const MACHINE_TYPE_FILTER = '.s390x'; - const jsonData = dd.response.body; - const parsedData = JSON.parse(jsonData); +dd.expect(dd.response.statusCode).to.equal(200); - const highQueueItems = parsedData - .filter(item => item.machine_type.includes(MACHINE_TYPE_FILTER) && item.avg_queue_s > 10800) - .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); +const MACHINE_TYPE_FILTER = '.s390x'; +const jsonData = dd.response.body; +const parsedData = JSON.parse(jsonData); - if (highQueueItems.length > 0) { - const machineDetails = highQueueItems - .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) - .join(', '); - const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; - console.error(message); - } +const highQueueItems = parsedData + .filter(item => item.machine_type.includes(MACHINE_TYPE_FILTER) && item.avg_queue_s > 10800) + .map(item => ({ machine_type: item.machine_type, avg_queue_s: item.avg_queue_s })); - dd.expect(highQueueItems.length > 0).to.be.false; +if (highQueueItems.length > 0) { + const machineDetails = highQueueItems + .map(item => `${item.machine_type} (${item.avg_queue_s}s)`) + .join(', '); + const message = `High queue detected for machine types containing ${MACHINE_TYPE_FILTER}: ${machineDetails}`; + console.error(message); } + +dd.expect(highQueueItems.length > 0).to.be.false;