Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions tools/cluster-checker/checker.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
'use strict'

const k8s = require('@kubernetes/client-node')
const k8srp = require('kubernetes-resource-parser')

// Per-node resource capacities that container resource requests are
// compared against. Memory is kept as a quantity string and parsed at
// the point of use.
const nodeResources = {
  'nvidia.com/gpu': 8,
  'nvidia.com/roce_gdr': 2,
  cpu: 80,
  memory: '800G'
}

class Client {
constructor () {
Expand Down Expand Up @@ -140,6 +148,50 @@ function reservation (pod) {
return gpus
}

/**
 * Check one container's resource requests against the per-node capacities in
 * `nodeResources` and log a WARNING line for every problem found.
 *
 * Two families of checks are performed:
 *  1. absolute: a single container asks for more GPUs, roce_gdr interfaces,
 *     CPUs or memory than `nodeResources` lists for one node;
 *  2. proportional: the roce_gdr/CPU/memory amounts requested per GPU exceed
 *     the per-GPU share derived from `nodeResources`.
 *
 * @param {object} namespace - namespace object; only `metadata.name` is read
 * @param {object} workload - workload object; only `metadata.name` is read
 * @param {object} container - container spec; `resources.requests` and
 *   `resources.limits` are read when present
 * @returns {void} results are reported via `console.log` only
 */
function checkContainerResources (namespace, workload, container) {
  // Effective amounts: an explicit request wins, a limit is used only for
  // resources that have no request (later spread keys override earlier ones).
  const resources = {
    ...container.resources?.limits,
    ...container.resources?.requests
  }

  const gpus = Number.parseInt(resources['nvidia.com/gpu'] ?? '0', 10)
  const gdr = Number.parseInt(resources['nvidia.com/roce_gdr'] ?? '0', 10)
  // NOTE(review): cpuParser/memoryParser are assumed to return plain numbers
  // (cores and bytes respectively) -- confirm against kubernetes-resource-parser.
  const cpus = k8srp.cpuParser(resources['cpu'] ?? '0')
  const mem = k8srp.memoryParser(resources['memory'] ?? '0')

  const nodeGpus = nodeResources['nvidia.com/gpu']
  const nodeGdr = nodeResources['nvidia.com/roce_gdr']
  const nodeCpus = nodeResources['cpu']
  const nodeMem = k8srp.memoryParser(nodeResources['memory'])

  const name = `${namespace.metadata.name}/${workload.metadata.name}`

  // warn if the resource requests cannot be satisfied by a Node
  if (gpus > nodeGpus) {
    console.log(`WARNING: workload "${name}" has a container requesting "${gpus} GPUs"`)
  }
  // Compare against the real roce_gdr capacity key; the previous lookup used a
  // key that does not exist in nodeResources, so this warning never fired.
  if (gdr > nodeGdr) {
    console.log(`WARNING: workload "${name}" has a container requesting ${gdr} roce_gdr interfaces`)
  }
  if (cpus > nodeCpus) {
    console.log(`WARNING: workload "${name}" has a container requesting "${cpus} CPUs"`)
  }
  if (mem > nodeMem) {
    console.log(`WARNING: workload "${name}" has a container requesting ${resources['memory']} memory`)
  }

  // warn if the resource:GPU ratio is not proportional to Node resources
  if (gdr > 0 && (gpus === 0 || gpus / gdr < nodeGpus / nodeGdr)) {
    console.log(`WARNING: workload "${name}" has a container requesting ${gdr} roce_gdr but only ${gpus} GPUs`)
  }
  if (gpus > 0 && cpus > 0 && cpus / gpus > nodeCpus / nodeGpus) {
    console.log(`WARNING: workload "${name}" has a container requesting ${cpus} cpus but only ${gpus} GPUs`)
  }
  if (gpus > 0 && mem > 0 && mem / gpus > nodeMem / nodeGpus) {
    console.log(`WARNING: workload "${name}" has a container requesting ${resources['memory']} memory but only ${gpus} GPUs`)
  }
}

// check user namespace
async function checkUserNamespace (client, namespace, queues) {
const workloads = await client.workloads(namespace.metadata.name)
Expand Down Expand Up @@ -171,6 +223,16 @@ async function checkUserNamespace (client, namespace, queues) {
if (conditions['Evicted'] === 'True') {
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has condition Evicted=True`)
}

// report misconfigured resource requests
for (const podSet of workload.spec?.podSets) {
for (const ic of podSet.template?.spec?.initContainers ?? []) {
checkContainerResources(namespace, workload, ic)
}
for (const c of podSet.template?.spec?.containers ?? []) {
checkContainerResources(namespace, workload, c)
}
}
}
}

Expand Down
9 changes: 8 additions & 1 deletion tools/cluster-checker/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion tools/cluster-checker/package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"dependencies": {
"@kubernetes/client-node": "^0.21.0"
"@kubernetes/client-node": "^0.21.0",
"kubernetes-resource-parser": "0.1.0"
}
}