diff --git a/tools/cluster-checker/checker.js b/tools/cluster-checker/checker.js
index 2a5bd65..5cd3cca 100644
--- a/tools/cluster-checker/checker.js
+++ b/tools/cluster-checker/checker.js
@@ -1,6 +1,15 @@
 'use strict'
 
 const k8s = require('@kubernetes/client-node')
+const k8srp = require('kubernetes-resource-parser')
+
+// resources a single Node is expected to provide; container requests are checked against these
+const nodeResources = {
+  'nvidia.com/gpu' : 8,
+  'nvidia.com/roce_gdr' : 2,
+  'cpu' : 80,
+  'memory' : '800G'
+}
 
 class Client {
   constructor () {
@@ -140,6 +149,57 @@ function reservation (pod) {
   return gpus
 }
 
+/**
+ * Check the resources requested by one container against nodeResources.
+ *
+ * Logs a WARNING for every request that exceeds what nodeResources provides
+ * and for every request that is out of proportion with the requested GPUs.
+ *
+ * @param {object} namespace - only metadata.name is read (used in messages)
+ * @param {object} workload - only metadata.name is read (used in messages)
+ * @param {object} container - only resources.requests and resources.limits are read
+ */
+function checkContainerResources (namespace, workload, container) {
+  // a limit is only used when there is no request for the same resource
+  const resources = { ...container.resources?.limits, ...container.resources?.requests }
+  const name = `${namespace.metadata.name}/${workload.metadata.name}`
+
+  const gpus = parseInt(resources['nvidia.com/gpu'] ?? '0', 10)
+  const gdr = parseInt(resources['nvidia.com/roce_gdr'] ?? '0', 10)
+  const cpus = k8srp.cpuParser(resources['cpu'] ?? '0')
+  const mem = k8srp.memoryParser(resources['memory'] ?? '0')
+
+  const nodeGpus = nodeResources['nvidia.com/gpu']
+  const nodeGdr = nodeResources['nvidia.com/roce_gdr']
+  const nodeCpus = nodeResources['cpu']
+  const nodeMem = k8srp.memoryParser(nodeResources['memory'])
+
+  // warn if the resource requests cannot be satisfied by a Node
+  if (gpus > nodeGpus) {
+    console.log(`WARNING: workload "${name}" has a container requesting "${gpus} GPUs"`)
+  }
+  if (gdr > nodeGdr) {
+    console.log(`WARNING: workload "${name}" has a container requesting "${gdr} roce_gdr interfaces"`)
+  }
+  if (cpus > nodeCpus) {
+    console.log(`WARNING: workload "${name}" has a container requesting "${cpus} CPUs"`)
+  }
+  if (mem > nodeMem) {
+    console.log(`WARNING: workload "${name}" has a container requesting ${resources['memory']} memory`)
+  }
+
+  // warn if the resource:GPU ratio is not proportional to Node resources
+  if (gdr > 0 && (gpus === 0 || gpus / gdr < nodeGpus / nodeGdr)) {
+    console.log(`WARNING: workload "${name}" has a container requesting ${gdr} roce_gdr but only ${gpus} GPUs`)
+  }
+  if (gpus > 0 && cpus > 0 && cpus / gpus > nodeCpus / nodeGpus) {
+    console.log(`WARNING: workload "${name}" has a container requesting ${cpus} cpus but only ${gpus} GPUs`)
+  }
+  if (gpus > 0 && mem > 0 && mem / gpus > nodeMem / nodeGpus) {
+    console.log(`WARNING: workload "${name}" has a container requesting ${resources['memory']} memory but only ${gpus} GPUs`)
+  }
+}
+
 // check user namespace
 async function checkUserNamespace (client, namespace, queues) {
   const workloads = await client.workloads(namespace.metadata.name)
@@ -171,6 +231,16 @@ async function checkUserNamespace (client, namespace, queues) {
     if (conditions['Evicted'] === 'True') {
       console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has condition Evicted=True`)
     }
+
+    // report misconfigured resource requests; a workload without podSets is skipped
+    for (const podSet of workload.spec?.podSets ?? []) {
+      for (const ic of podSet.template?.spec?.initContainers ?? []) {
+        checkContainerResources(namespace, workload, ic)
+      }
+      for (const c of podSet.template?.spec?.containers ?? []) {
+        checkContainerResources(namespace, workload, c)
+      }
+    }
   }
 }
 
diff --git a/tools/cluster-checker/package-lock.json b/tools/cluster-checker/package-lock.json
index 8dc360a..62097eb 100644
--- a/tools/cluster-checker/package-lock.json
+++ b/tools/cluster-checker/package-lock.json
@@ -5,7 +5,8 @@
   "packages": {
     "": {
       "dependencies": {
-        "@kubernetes/client-node": "^0.21.0"
+        "@kubernetes/client-node": "^0.21.0",
+        "kubernetes-resource-parser": "0.1.0"
       }
     },
     "node_modules/@isaacs/cliui": {
@@ -543,6 +544,12 @@
         "node": ">=0.6.0"
       }
     },
+    "node_modules/kubernetes-resource-parser": {
+      "version": "0.1.0",
+      "resolved": "https://registry.npmjs.org/kubernetes-resource-parser/-/kubernetes-resource-parser-0.1.0.tgz",
+      "integrity": "sha512-rr2K/4akDkY3oKgJ/KL3KAKw8Fb0VwBucGgKhvgqXluVhfn2BgEuJUXIDU+zt4eWaqOOjAC6ApUgnHF/SJ/iNw==",
+      "license": "MIT"
+    },
     "node_modules/lru-cache": {
       "version": "6.0.0",
       "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz",
diff --git a/tools/cluster-checker/package.json b/tools/cluster-checker/package.json
index 0488c7f..36a1453 100644
--- a/tools/cluster-checker/package.json
+++ b/tools/cluster-checker/package.json
@@ -1,5 +1,6 @@
 {
   "dependencies": {
-    "@kubernetes/client-node": "^0.21.0"
+    "@kubernetes/client-node": "^0.21.0",
+    "kubernetes-resource-parser": "0.1.0"
   }
 }