From 01245b9fb7e6556c1e4f548fc89a3d883e2b7a11 Mon Sep 17 00:00:00 2001 From: Olivier Tardieu Date: Mon, 30 Sep 2024 13:35:47 -0400 Subject: [PATCH] Add warning: total GPU quota is greater than total GPU count --- tools/cluster-checker/checker.js | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tools/cluster-checker/checker.js b/tools/cluster-checker/checker.js index 697c60d..10326e5 100644 --- a/tools/cluster-checker/checker.js +++ b/tools/cluster-checker/checker.js @@ -4,10 +4,10 @@ const k8s = require('@kubernetes/client-node') const k8srp = require('kubernetes-resource-parser') const nodeResources = { - 'nvidia.com/gpu' : 8, - 'nvidia.com/roce_gdr' : 2, - 'cpu' : 80, - 'memory' : '800G' + 'nvidia.com/gpu': 8, + 'nvidia.com/roce_gdr': 2, + 'cpu': 80, + 'memory': '800G' } class Client { @@ -39,11 +39,11 @@ class Client { return res.body } - async readOperatorConfig() { + async readOperatorConfig () { const options = [ - {ns: 'redhat-ods-applications', cm: 'codeflare-operator-config', key: 'config.yaml', f: cm => cm.appwrapper?.Config }, - {ns: 'mlbatch-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper }, - {ns: 'appwrapper-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper } + { ns: 'redhat-ods-applications', cm: 'codeflare-operator-config', key: 'config.yaml', f: cm => cm.appwrapper?.Config }, + { ns: 'mlbatch-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper }, + { ns: 'appwrapper-system', cm: 'appwrapper-operator-config', key: 'config.yaml', f: cm => cm.appwrapper } ] for (const opt of options) { try { @@ -149,7 +149,7 @@ function reservation (pod) { } // check container resource requests against node_resources -function checkContainerResources(namespace, workload, workloadReplicas, container) { +function checkContainerResources (namespace, workload, workloadReplicas, container) { // selectively merge limits into requests const resources = {} for (const k in container.resources?.requests ?? []) { @@ -181,13 +181,13 @@ function checkContainerResources(namespace, workload, workloadReplicas, containe } // warn if the resource:GPU ratio is not proportional to Node resources - if (gdr > 0 && ((gpus == 0) || (gpus/gdr < nodeResources['nvidia.com/gpu']/nodeResources['nvidia.com/roce_gdr']))) { + if (gdr > 0 && ((gpus == 0) || (gpus / gdr < nodeResources['nvidia.com/gpu'] / nodeResources['nvidia.com/roce_gdr']))) { console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${gdr} roce_gdr but only ${gpus} GPUs`) } - if (gpus > 0 && (cpus > 0) && (cpus/gpus > nodeResources['cpu']/nodeResources['nvidia.com/gpu'])) { + if (gpus > 0 && (cpus > 0) && (cpus / gpus > nodeResources['cpu'] / nodeResources['nvidia.com/gpu'])) { console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${cpus} cpus but only ${gpus} GPUs`) } - if (gpus > 0 && (mem > 0) && (mem/gpus > k8srp.memoryParser(nodeResources['memory'])/nodeResources['nvidia.com/gpu'])) { + if (gpus > 0 && (mem > 0) && (mem / gpus > k8srp.memoryParser(nodeResources['memory']) / nodeResources['nvidia.com/gpu'])) { console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources['memory']} memory but only ${gpus} GPUs`) } @@ -376,6 +376,10 @@ async function main () { console.log('WARNING: nominal GPU quota is greater than schedulable GPU count') } + if (quotaGPUs + slackGPUs > clusterGPUs) { + console.log('WARNING: total GPU quota is greater than total GPU count') + } + // check all accessible namespaces const namespaces = await client.namespaces() for (const namespace of namespaces) {