From c287121f839dc1f792d9d667f7b027ce3288d130 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Thu, 24 Nov 2022 15:27:21 +0000 Subject: [PATCH 1/3] Switch to using absolute limit for low memory alert Nodes with a lot of RAM can end up triggering this low memory alert even when they have a significant amount of free memory, e.g. for a node with 512GiB RAM, when we hit the current alert (which is configured to be 80%) we still have 102.4GiB free. --- etc/kayobe/kolla/config/prometheus/system.rules | 4 ++-- etc/kayobe/kolla/globals.yml | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules index 76d301fbf..32b4a02f2 100644 --- a/etc/kayobe/kolla/config/prometheus/system.rules +++ b/etc/kayobe/kolla/config/prometheus/system.rules @@ -16,13 +16,13 @@ groups: description: "{{ $labels.device }} is {{ $value }}% full." - alert: LowMemory - expr: ( ( node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes - (node_memory_Hugepagesize_bytes * node_memory_HugePages_Total)) / (node_memory_MemTotal_bytes - (node_memory_Hugepagesize_bytes * node_memory_HugePages_Total))) * 100 >= 80 + expr: (node_memory_MemAvailable_bytes / 1024^3) < {{ alertmanager_low_memory_threshold_gb }} for: 1m labels: severity: alert annotations: summary: "Prometheus exporter at {{ $labels.instance }} reports low memory" - description: "Memory is {{ $value }}% full." + description: "Available memory is {{ $value }}." 
- alert: HostOomKillDetected expr: increase(node_vmstat_oom_kill[5m]) > 0 diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index 514bf6aad..f68e09693 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -7,6 +7,14 @@ docker_yum_gpgkey: "https://download.docker.com/linux/centos/gpg" bifrost_tag: xena-20221102T110017 {% endif %} +############################################################################# # Monitoring and alerting related settings + es_heap_size: 8g prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d" +# Threshold to trigger a LowMemory alert in Gibibytes (GiB). When the +# amount of free memory is lower that this value an alert will +# be triggered. +alertmanager_low_memory_threshold_gb: 5 + +############################################################################# \ No newline at end of file From 28ec6a8235481bfd8fdda9b3b98df8eef772fa81 Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 25 Nov 2022 09:41:00 +0000 Subject: [PATCH 2/3] Apply suggestions from code review --- etc/kayobe/kolla/config/prometheus/system.rules | 2 +- etc/kayobe/kolla/globals.yml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules index 32b4a02f2..a8fb59aaa 100644 --- a/etc/kayobe/kolla/config/prometheus/system.rules +++ b/etc/kayobe/kolla/config/prometheus/system.rules @@ -16,7 +16,7 @@ groups: description: "{{ $labels.device }} is {{ $value }}% full." 
- alert: LowMemory - expr: (node_memory_MemAvailable_bytes / 1024^3) < {{ alertmanager_low_memory_threshold_gb }} + expr: (node_memory_MemAvailable_bytes / 1024^3) < {{ alertmanager_low_memory_threshold_gib }} for: 1m labels: severity: alert diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index f68e09693..e91b1ed9d 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -13,8 +13,8 @@ bifrost_tag: xena-20221102T110017 es_heap_size: 8g prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d" # Threshold to trigger a LowMemory alert in Gibibytes (GiB). When the -# amount of free memory is lower that this value an alert will +# amount of free memory is lower than this value an alert will # be triggered. -alertmanager_low_memory_threshold_gb: 5 +alertmanager_low_memory_threshold_gib: 5 -############################################################################# \ No newline at end of file +############################################################################# From a6dae70db0d87f245a7cf9cc36ea7b51dc01717e Mon Sep 17 00:00:00 2001 From: Will Szumski Date: Fri, 25 Nov 2022 09:51:13 +0000 Subject: [PATCH 3/3] Rewrap comment --- etc/kayobe/kolla/globals.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index e91b1ed9d..f52d530e0 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -12,9 +12,8 @@ bifrost_tag: xena-20221102T110017 es_heap_size: 8g prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d" -# Threshold to trigger a LowMemory alert in Gibibytes (GiB). When the -# amount of free memory is lower than this value an alert will -# be triggered. +# Threshold to trigger a LowMemory alert in Gibibytes (GiB). When the amount +# of free memory is lower than this value an alert will be triggered. 
alertmanager_low_memory_threshold_gib: 5 #############################################################################