57 changes: 42 additions & 15 deletions docs/node-mixin/dashboards/node.libsonnet
@@ -16,25 +16,30 @@ local gauge = promgrafonnet.gauge;
datasource='$datasource',
span=6,
format='percentunit',
max=100,
max=1,
min=0,
stack=true,
)
.addTarget(prometheus.target(
// TODO: Consider using `${__interval}` as range and a 1m min step.
|||
1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])
(
(1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[$__interval]))
/ ignoring(cpu) group_left
Inline review comment (Member):
@beorn7 I just came across this while trying to figure out why my fully utilized system only shows 12.5% cpu usage for each cpu. I think this doesn't make sense, right? See also slack: https://cloud-native.slack.com/archives/C01AUBA4PFE/p1725300066105619

Reply (Member Author):
I have hardly any memories of this.

I can only speculate that this past version of myself either observed or merely assumed that each individual CPU would count seconds just for itself. So a fully utilized 8-core system should see a total of 8 CPU seconds counted per second.

Therefore, this query divides by the number of CPUs, so that the resulting value will be 1 for a system that fully utilizes all its CPUs.

Thinking about it, I think this should be the case. What system are you running your query on? Is it maybe some virtual machine that pretends to have 8 cores but is only backed by 1 physical core?

Reply (Member Author):
I could also see hyperthreading being an issue here.

count without (cpu)( node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"})
)
||| % $._config,
legendFormat='{{cpu}}',
intervalFactor=10,
intervalFactor=5,
interval='1m',
));
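
To make the arithmetic from the comment thread above concrete, here is a worked reading of the new CPU query. The numbers (an 8-core machine running fully busy) are hypothetical and chosen only to match the 12.5% observation; the query itself is the one from the diff, with the %(nodeExporterSelector)s placeholder dropped for brevity.

    # Hypothetical: 8 cores, all fully busy, so the per-CPU idle rate is 0.
    #   1 - rate(node_cpu_seconds_total{mode="idle", instance="$instance"}[$__interval])   evaluates to 1 for each CPU
    #   count without (cpu)(node_cpu_seconds_total{mode="idle", instance="$instance"})     evaluates to 8
    # Each per-CPU series is therefore 1 / 8 = 0.125, i.e. the 12.5% reported above, and
    # with stack=true the eight series sum to 1. That is also why the panel's max was
    # changed from 100 to 1: format='percentunit' expects a 0..1 scale.
    (
      (1 - rate(node_cpu_seconds_total{mode="idle", instance="$instance"}[$__interval]))
    / ignoring(cpu) group_left
      count without (cpu)(node_cpu_seconds_total{mode="idle", instance="$instance"})
    )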

// TODO: Is this panel useful?
local systemLoad =
graphPanel.new(
'Load Average',
datasource='$datasource',
span=6,
format='short',
fill=0,
)
.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average'))
.addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average'))
@@ -46,6 +51,8 @@ local gauge = promgrafonnet.gauge;
datasource='$datasource',
span=9,
format='bytes',
stack=true,
min=0,
)
.addTarget(prometheus.target(
|||
@@ -84,20 +91,32 @@ local gauge = promgrafonnet.gauge;
'Disk I/O',
datasource='$datasource',
span=9,
fill=0,
)
// TODO: Does it make sense to have those three in the same panel?
// TODO: Consider using `${__interval}` as range and a 1m min step.
.addTarget(prometheus.target('rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[1m])' % $._config, legendFormat='{{device}} read'))
.addTarget(prometheus.target('rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[1m])' % $._config, legendFormat='{{device}} written'))
.addTarget(prometheus.target('rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[1m])' % $._config, legendFormat='{{device}} io time')) +
.addTarget(prometheus.target(
'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__interval])' % $._config,
legendFormat='{{device}} read',
interval='1m',
))
.addTarget(prometheus.target(
'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__interval])' % $._config,
legendFormat='{{device}} written',
interval='1m',
))
.addTarget(prometheus.target(
'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__interval])' % $._config,
legendFormat='{{device}} io time',
interval='1m',
)) +
{
seriesOverrides: [
{
alias: 'read',
alias: '/ read| written/',
yaxis: 1,
},
{
alias: 'io time',
alias: '/ io time/',
yaxis: 2,
},
],
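
A note on the series overrides above (my reading; the PR does not spell it out): the targets' legends include the device name, for example "sda read", so an exact-match alias like 'read' cannot match them, while the regex forms match any legend ending in " read", " written", or " io time". Keeping io time on its own axis also fits the units, since its rate is effectively a 0-to-1 busy ratio rather than a byte rate:

    # node_disk_io_time_seconds_total counts the seconds a device spent with I/O in
    # flight, so rate() over it yields seconds per second: roughly a 0..1 value, not
    # bytes per second like the read/written series. Hence yaxis 2 in the overrides.
    # (diskDeviceSelector placeholder omitted; "sda" is just an example device.)
    rate(node_disk_io_time_seconds_total{instance="$instance", device="sda"}[$__interval])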
@@ -129,19 +148,27 @@ local gauge = promgrafonnet.gauge;
datasource='$datasource',
span=6,
format='bytes',
fill=0,
)
// TODO: Consider using `${__interval}` as range and a 1m min step.
.addTarget(prometheus.target('rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
.addTarget(prometheus.target(
'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__interval])' % $._config,
legendFormat='{{device}}',
interval='1m',
));

local networkTransmitted =
graphPanel.new(
'Network Transmitted',
datasource='$datasource',
span=6,
format='bytes',
fill=0,
)
// TODO: Consider using `${__interval}` as range and a 1m min step.
.addTarget(prometheus.target('rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
.addTarget(prometheus.target(
'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__interval])' % $._config,
legendFormat='{{device}}',
interval='1m',
));

dashboard.new('Nodes', time_from='now-1h')
.addTemplate(