From 53a0b0cba6ec163a7d8bb6dc1b47bf5adffb8423 Mon Sep 17 00:00:00 2001 From: David Grove Date: Fri, 13 Sep 2024 18:40:48 -0400 Subject: [PATCH 1/2] basic checking of container resources --- tools/cluster-checker/checker.js | 60 +++++++++++++++++++++++++ tools/cluster-checker/package-lock.json | 9 +++- tools/cluster-checker/package.json | 3 +- 3 files changed, 70 insertions(+), 2 deletions(-) diff --git a/tools/cluster-checker/checker.js b/tools/cluster-checker/checker.js index 2a5bd65..61ac4c5 100644 --- a/tools/cluster-checker/checker.js +++ b/tools/cluster-checker/checker.js @@ -1,6 +1,14 @@ 'use strict' const k8s = require('@kubernetes/client-node') +const k8srp = require('kubernetes-resource-parser') + +const nodeResources = { + 'nvidia.com/gpu' : 8, + 'nvidia.com/roce_gdr' : 2, + 'cpu' : 80, + 'memory' : '800G' +} class Client { constructor () { @@ -140,6 +148,48 @@ function reservation (pod) { return gpus } +function checkContainerResources(namespace, workload, container) { + const resources = {} + for (const k in container.resources?.requests ?? []) { + resources[k] = container.resources.requests[k] + } + for (const k in container.resources?.limits ?? []) { + if (!(k in resources)) { + resources[k] = container.resources.limits[k] + } + } + + const gpus = parseInt(resources['nvidia.com/gpu'] ?? '0') + const gdr = parseInt(resources['nvidia.com/roce_gdr'] ?? '0') + const cpus = k8srp.cpuParser(resources['cpu'] ?? '0') + const mem = k8srp.memoryParser(resources['memory'] ?? 
'0') + + // Check that resources will fit on a node + if (gpus > nodeResources['nvidia.com/gpu']) { + console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting "${gpus} GPUs"`) + } + if (gdr > nodeResources['nvidia.com/roce_gdr']) { + console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting "${gdr} roce_gdr interfaces"`) + } + if (cpus > nodeResources['cpu']) { + console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting "${cpus} CPUs"`) + } + if (mem > k8srp.memoryParser(nodeResources['memory'])) { + console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources['memory']} memory`) + } + + // Check that resource:GPU ratio is proportional + if (gdr > 0 && ((gpus == 0) || (gpus/gdr < nodeResources['nvidia.com/gpu']/nodeResources['nvidia.com/roce_gdr']))) { + console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${gdr} roce_gdr but only ${gpus} GPUs`) + } + if (gpus > 0 && (cpus > 0) && (cpus/gpus > nodeResources['cpu']/nodeResources['nvidia.com/gpu'])) { + console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${cpus} cpus but only ${gpus} GPUs`) + } + if (gpus > 0 && (mem > 0) && (mem/gpus > k8srp.memoryParser(nodeResources['memory'])/nodeResources['nvidia.com/gpu'])) { + console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources['memory']} memory but only ${gpus} GPUs`) + } +} + // check user namespace async function checkUserNamespace (client, namespace, queues) { const workloads = await client.workloads(namespace.metadata.name) @@ -171,6 +221,16 @@ async function checkUserNamespace (client, namespace, queues) { if (conditions['Evicted'] === 'True') {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has condition Evicted=True`) } + + // report misconfigured resource requests + for (const podSet of workload.spec?.podSets ?? []) { + for (const ic of podSet.template?.spec?.initContainers ?? []) { + checkContainerResources(namespace, workload, ic) + } + for (const c of podSet.template?.spec?.containers ?? []) { + checkContainerResources(namespace, workload, c) + } + } } } diff --git a/tools/cluster-checker/package-lock.json b/tools/cluster-checker/package-lock.json index 8dc360a..62097eb 100644 --- a/tools/cluster-checker/package-lock.json +++ b/tools/cluster-checker/package-lock.json @@ -5,7 +5,8 @@ "packages": { "": { "dependencies": { - "@kubernetes/client-node": "^0.21.0" + "@kubernetes/client-node": "^0.21.0", + "kubernetes-resource-parser": "0.1.0" } }, "node_modules/@isaacs/cliui": { @@ -543,6 +544,12 @@ "node": ">=0.6.0" } }, + "node_modules/kubernetes-resource-parser": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/kubernetes-resource-parser/-/kubernetes-resource-parser-0.1.0.tgz", + "integrity": "sha512-rr2K/4akDkY3oKgJ/KL3KAKw8Fb0VwBucGgKhvgqXluVhfn2BgEuJUXIDU+zt4eWaqOOjAC6ApUgnHF/SJ/iNw==", + "license": "MIT" + }, "node_modules/lru-cache": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", diff --git a/tools/cluster-checker/package.json b/tools/cluster-checker/package.json index 0488c7f..36a1453 100644 --- a/tools/cluster-checker/package.json +++ b/tools/cluster-checker/package.json @@ -1,5 +1,6 @@ { "dependencies": { - "@kubernetes/client-node": "^0.21.0" + "@kubernetes/client-node": "^0.21.0", + "kubernetes-resource-parser": "0.1.0" } } From 391423ad898e4bd9536fef0437d35cf2fd4b8535 Mon Sep 17 00:00:00 2001 From: David Grove Date: Mon, 16 Sep 2024 10:07:04 -0400 Subject: [PATCH 2/2] adjust comments --- tools/cluster-checker/checker.js | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git
a/tools/cluster-checker/checker.js b/tools/cluster-checker/checker.js index 61ac4c5..5cd3cca 100644 --- a/tools/cluster-checker/checker.js +++ b/tools/cluster-checker/checker.js @@ -148,7 +148,9 @@ function reservation (pod) { return gpus } +// check container resource requests against nodeResources function checkContainerResources(namespace, workload, container) { + // selectively merge limits into requests const resources = {} for (const k in container.resources?.requests ?? []) { resources[k] = container.resources.requests[k] @@ -164,7 +166,7 @@ function checkContainerResources(namespace, workload, container) { const cpus = k8srp.cpuParser(resources['cpu'] ?? '0') const mem = k8srp.memoryParser(resources['memory'] ?? '0') - // Check that resources will fit on a node + // warn if the resource requests cannot be satisfied by a Node if (gpus > nodeResources['nvidia.com/gpu']) { console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting "${gpus} GPUs"`) } @@ -178,7 +180,7 @@ function checkContainerResources(namespace, workload, container) { console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources['memory']} memory`) } - // Check that resource:GPU ratio is proportional + // warn if the resource:GPU ratio is not proportional to Node resources if (gdr > 0 && ((gpus == 0) || (gpus/gdr < nodeResources['nvidia.com/gpu']/nodeResources['nvidia.com/roce_gdr']))) { console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${gdr} roce_gdr but only ${gpus} GPUs`) }