From d4b449cd30235c1bded079480c1eb1a93356a632 Mon Sep 17 00:00:00 2001
From: Pierre Riteau
Date: Mon, 5 Dec 2022 13:25:29 +0100
Subject: [PATCH] Replace hardcoded threshold by temp_max value

The temp_max value is dynamically gathered from the device [1]. With Xeon
CPUs (coretemp driver), it is often 90C, but sometimes lower. This can
help reduce the frequency of alerts with busy hypervisors.

[1] https://docs.kernel.org/hwmon/coretemp.html
---
 etc/kayobe/kolla/config/prometheus/system.rules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules
index fe3a2b9ac..ffc7d25a3 100644
--- a/etc/kayobe/kolla/config/prometheus/system.rules
+++ b/etc/kayobe/kolla/config/prometheus/system.rules
@@ -34,7 +34,7 @@ groups:
       description: "OOM kill detected"
 
   - alert: Overheating
-    expr: node_hwmon_temp_celsius >= 85
+    expr: node_hwmon_temp_celsius >= node_hwmon_temp_max_celsius
     for: 1m
     labels:
       severity: warning