From 4b22c12be46314c339b7118c2b642c5ec1fbbedb Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 15 May 2024 09:14:24 +0100 Subject: [PATCH 1/4] Add Prometheus alerts test --- stackhpc_openstack_tests/test_prometheus.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/stackhpc_openstack_tests/test_prometheus.py b/stackhpc_openstack_tests/test_prometheus.py index 7478ffa..0b294b7 100644 --- a/stackhpc_openstack_tests/test_prometheus.py +++ b/stackhpc_openstack_tests/test_prometheus.py @@ -39,3 +39,21 @@ def test_prometheus_node_exporter_metrics(prom): """Check that expected node exporter metrics exist.""" metrics = prom.all_metrics() assert "node_cpu_seconds_total" in metrics + + +def test_prometheus_alerts_inactive(prom): + """Check that no Prometheus alerts are active.""" + # https://prometheus.io/docs/prometheus/latest/querying/api/#alerts + response = prom._session.get( + "{0}/api/v1/alerts".format(prom.url), + verify=prom._session.verify, + headers=prom.headers, + auth=prom.auth, + cert=prom._session.cert, + ) + assert response.ok + response = response.json() + assert "status" in response + assert response["status"] == "success" + alerts = response["data"]["alerts"] + assert not alerts From 8c89c35677e457f70f0d5f4c132906e24a6b93e4 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 2 Apr 2025 14:43:08 +0100 Subject: [PATCH 2/4] Allow for Prometheus alerts that can occur in an AIO when ideally testing for no alerts. --- .../monitoring/test_prometheus.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/stackhpc_cloud_tests/monitoring/test_prometheus.py b/stackhpc_cloud_tests/monitoring/test_prometheus.py index 0b294b7..82de463 100644 --- a/stackhpc_cloud_tests/monitoring/test_prometheus.py +++ b/stackhpc_cloud_tests/monitoring/test_prometheus.py @@ -55,5 +55,21 @@ def test_prometheus_alerts_inactive(prom): response = response.json() assert "status" in response assert response["status"] == "success" - alerts = response["data"]["alerts"] - assert not alerts + assert "data" in response + alerts = response["data"]["alerts"] or [] + # (MaxN) Allow for, and filter out, alerts we'd expect to see in an AIO environment. + # TODO - find a way of configuring this for SCT runs in other environments. + alerts_to_ignore = [ + # We know our volumes are small. + "StorageFillingUp", + # This is probably due to storage space.. + "ElasticsearchClusterYellow", + # ..or because we're running in a single instance and it wants to be clustered across multiple nodes. + "ElasticsearchUnassignedShards", + # It's a small AIO! + "LowMemory", + # It's only one node and expects three, see https://github.com/stackhpc/stackhpc-kayobe-config/pull/1579 + "RabbitMQNodeDown" + ] + alerts = [ alert for alert in alerts if alert["labels"]["alertname"] not in alerts_to_ignore ] + assert len(alerts) == 0 From 430733c3de4ac4fd8bf1642a17e3539cda185c61 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Thu, 3 Apr 2025 12:32:59 +0100 Subject: [PATCH 3/4] Ignore ContainerKilled prometheus alert - in the context of tempest --- .../monitoring/test_prometheus.py | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/stackhpc_cloud_tests/monitoring/test_prometheus.py b/stackhpc_cloud_tests/monitoring/test_prometheus.py index 82de463..aec9da6 100644 --- a/stackhpc_cloud_tests/monitoring/test_prometheus.py +++ b/stackhpc_cloud_tests/monitoring/test_prometheus.py @@ -57,19 +57,33 @@ def test_prometheus_alerts_inactive(prom): assert response["status"] == "success" assert "data" in response alerts = response["data"]["alerts"] or [] + # (MaxN) Allow for, and filter out, alerts we'd expect to see in an AIO environment. - # TODO - find a way of configuring this for SCT runs in other environments. - alerts_to_ignore = [ + # TODO - find a way of configuring this for SCT running in other environments. + aio_alerts_to_ignore = [ # We know our volumes are small. - "StorageFillingUp", + { "alertname": "StorageFillingUp", "instance": "controller0" }, # This is probably due to storage space.. - "ElasticsearchClusterYellow", + { "alertname": "ElasticsearchClusterYellow", "instance": "controller0" }, # ..or because we're running in a single instance and it wants to be clustered across multiple nodes. - "ElasticsearchUnassignedShards", + { "alertname": "ElasticsearchUnassignedShards", "instance": "controller0" }, # It's a small AIO! - "LowMemory", + { "alertname": "LowMemory", "instance": "controller0" }, # It's only one node and expects three, see https://github.com/stackhpc/stackhpc-kayobe-config/pull/1579 - "RabbitMQNodeDown" + { "alertname": "RabbitMQNodeDown" }, + # This is probably because Tempest runs before pytest so the container has been recently stopped. + { "alertname": "ContainerKilled", "name": "tempest" } ] - alerts = [ alert for alert in alerts if alert["labels"]["alertname"] not in alerts_to_ignore ] + + def alert_is_ignored(alert, alerts_to_ignore): + alert_items = alert.items() + for alert_to_ignore in alerts_to_ignore: + alert_to_ignore_items = alert_to_ignore.items() + # alert has more items than alerts_to_ignore + # so here we can return True if alert_to_ignore is a subset of alerts + if alert_to_ignore_items <= alert_items: + return True + return False + + alerts = [ alert for alert in alerts if not alert_is_ignored(alert["labels"], aio_alerts_to_ignore) ] assert len(alerts) == 0 From 7756b4f17621a07e5a05abc0c2a112db94b8af80 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Thu, 3 Apr 2025 13:49:50 +0100 Subject: [PATCH 4/4] Improve the alert filtering based on Alex's PR suggestion. --- stackhpc_cloud_tests/monitoring/test_prometheus.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/stackhpc_cloud_tests/monitoring/test_prometheus.py b/stackhpc_cloud_tests/monitoring/test_prometheus.py index aec9da6..5868de5 100644 --- a/stackhpc_cloud_tests/monitoring/test_prometheus.py +++ b/stackhpc_cloud_tests/monitoring/test_prometheus.py @@ -76,14 +76,8 @@ def test_prometheus_alerts_inactive(prom): ] def alert_is_ignored(alert, alerts_to_ignore): - alert_items = alert.items() - for alert_to_ignore in alerts_to_ignore: - alert_to_ignore_items = alert_to_ignore.items() - # alert has more items than alerts_to_ignore - # so here we can return True if alert_to_ignore is a subset of alerts - if alert_to_ignore_items <= alert_items: - return True - return False + # Check if any of the "ignore cases" match the alert + return any(alert_to_ignore.items() <= alert.items() for alert_to_ignore in alerts_to_ignore) alerts = [ alert for alert in alerts if not alert_is_ignored(alert["labels"], aio_alerts_to_ignore) ] assert len(alerts) == 0