57 changes: 42 additions & 15 deletions docs/node-mixin/dashboards/node.libsonnet
@@ -16,25 +16,30 @@ local gauge = promgrafonnet.gauge;
datasource='$datasource',
span=6,
format='percentunit',
max=100,
max=1,
min=0,
stack=true,
)
.addTarget(prometheus.target(
// TODO: Consider using `${__interval}` as range and a 1m min step.
|||
1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m])
(
(1 - rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[$__interval]))
/ ignoring(cpu) group_left
Inline review comment (Member):
@beorn7 I just came across this while trying to figure out why my fully utilized system only shows 12.5% cpu usage for each cpu. I think this doesn't make sense, right? See also slack: https://cloud-native.slack.com/archives/C01AUBA4PFE/p1725300066105619

Reply (Member Author):
I have hardly any memories of this.

I can only speculate that this past version of myself either observed or merely assumed that each individual CPU would count seconds just for itself. So a fully utilized 8-core system should see a total of 8 CPU seconds counted per second.

Therefore, this query divides by the number of CPUs, so that the resulting value will be 1 for a system that fully utilizes all its CPUs.

Thinking about it, I think this should be the case. What system are you running your query on? Is it maybe some virtual machine that pretends to have 8 cores but is only backed by 1 physical core?

Reply (Member Author):
I could also see hyperthreading being an issue here.

count without (cpu)( node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"})
)
||| % $._config,
legendFormat='{{cpu}}',
intervalFactor=10,
intervalFactor=5,
interval='1m',
));
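
To make the arithmetic from the comment thread above concrete, here is a worked reading of the new CPU query. The numbers (an 8-core machine running fully busy) are hypothetical and chosen only to match the 12.5% observation; the query itself is the one from the diff, with the %(nodeExporterSelector)s placeholder dropped for brevity.

    # Hypothetical: 8 cores, all fully busy, so the per-CPU idle rate is 0.
    #   1 - rate(node_cpu_seconds_total{mode="idle", instance="$instance"}[$__interval])   evaluates to 1 for each CPU
    #   count without (cpu)(node_cpu_seconds_total{mode="idle", instance="$instance"})     evaluates to 8
    # Each per-CPU series is therefore 1 / 8 = 0.125, i.e. the 12.5% reported above, and
    # with stack=true the eight series sum to 1. That is also why the panel's max was
    # changed from 100 to 1: format='percentunit' expects a 0..1 scale.
    (
      (1 - rate(node_cpu_seconds_total{mode="idle", instance="$instance"}[$__interval]))
    / ignoring(cpu) group_left
      count without (cpu)(node_cpu_seconds_total{mode="idle", instance="$instance"})
    )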

// TODO: Is this panel useful?
local systemLoad =
graphPanel.new(
'Load Average',
datasource='$datasource',
span=6,
format='short',
fill=0,
)
.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average'))
.addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average'))
@@ -46,6 +51,8 @@ local gauge = promgrafonnet.gauge;
datasource='$datasource',
span=9,
format='bytes',
stack=true,
min=0,
)
.addTarget(prometheus.target(
|||
@@ -84,20 +91,32 @@ local gauge = promgrafonnet.gauge;
'Disk I/O',
datasource='$datasource',
span=9,
fill=0,
)
// TODO: Does it make sense to have those three in the same panel?
// TODO: Consider using `${__interval}` as range and a 1m min step.
.addTarget(prometheus.target('rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[1m])' % $._config, legendFormat='{{device}} read'))
.addTarget(prometheus.target('rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[1m])' % $._config, legendFormat='{{device}} written'))
.addTarget(prometheus.target('rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[1m])' % $._config, legendFormat='{{device}} io time')) +
.addTarget(prometheus.target(
'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__interval])' % $._config,
legendFormat='{{device}} read',
interval='1m',
))
.addTarget(prometheus.target(
'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__interval])' % $._config,
legendFormat='{{device}} written',
interval='1m',
))
.addTarget(prometheus.target(
'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__interval])' % $._config,
legendFormat='{{device}} io time',
interval='1m',
)) +
{
seriesOverrides: [
{
alias: 'read',
alias: '/ read| written/',
yaxis: 1,
},
{
alias: 'io time',
alias: '/ io time/',
yaxis: 2,
},
],
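
A note on the series overrides above (my reading; the PR does not spell it out): the targets' legends include the device name, for example "sda read", so an exact-match alias like 'read' cannot match them, while the regex forms match any legend ending in " read", " written", or " io time". Keeping io time on its own axis also fits the units, since its rate is effectively a 0-to-1 busy ratio rather than a byte rate:

    # node_disk_io_time_seconds_total counts the seconds a device spent with I/O in
    # flight, so rate() over it yields seconds per second: roughly a 0..1 value, not
    # bytes per second like the read/written series. Hence yaxis 2 in the overrides.
    # (diskDeviceSelector placeholder omitted; "sda" is just an example device.)
    rate(node_disk_io_time_seconds_total{instance="$instance", device="sda"}[$__interval])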
@@ -129,19 +148,27 @@ local gauge = promgrafonnet.gauge;
datasource='$datasource',
span=6,
format='bytes',
fill=0,
)
// TODO: Consider using `${__interval}` as range and a 1m min step.
.addTarget(prometheus.target('rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
.addTarget(prometheus.target(
'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__interval])' % $._config,
legendFormat='{{device}}',
interval='1m',
));

local networkTransmitted =
graphPanel.new(
'Network Transmitted',
datasource='$datasource',
span=6,
format='bytes',
fill=0,
)
// TODO: Consider using `${__interval}` as range and a 1m min step.
.addTarget(prometheus.target('rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
.addTarget(prometheus.target(
'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__interval])' % $._config,
legendFormat='{{device}}',
interval='1m',
));

dashboard.new('Nodes', time_from='now-1h')
.addTemplate(