Skip to content
42 changes: 42 additions & 0 deletions stackhpc_cloud_tests/monitoring/test_prometheus.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,45 @@ def test_prometheus_node_exporter_metrics(prom):
"""Check that expected node exporter metrics exist."""
metrics = prom.all_metrics()
assert "node_cpu_seconds_total" in metrics


def _alert_is_ignored(labels, alerts_to_ignore):
    """Return True if ``labels`` matches any entry of ``alerts_to_ignore``.

    An entry matches when every one of its key/value pairs is also present
    in ``labels``; any additional labels on the alert are not considered.
    """
    return any(
        ignore_rule.items() <= labels.items()
        for ignore_rule in alerts_to_ignore
    )


def test_prometheus_alerts_inactive(prom):
    """Check that no Prometheus alerts are active.

    Alerts whose labels match one of the known exceptions listed in
    ``aio_alerts_to_ignore`` are filtered out before the check.
    """
    # https://prometheus.io/docs/prometheus/latest/querying/api/#alerts
    response = prom._session.get(
        "{0}/api/v1/alerts".format(prom.url),
        verify=prom._session.verify,
        headers=prom.headers,
        auth=prom.auth,
        cert=prom._session.cert,
    )
    assert response.ok, "Alerts API request failed: {0!r}".format(response)

    # Keep the decoded body under its own name rather than rebinding
    # ``response`` to a different type.
    payload = response.json()
    assert "status" in payload
    assert payload["status"] == "success", (
        "Alerts API returned status {0!r}".format(payload["status"])
    )
    assert "data" in payload
    # Fall back to an empty list when the API reports a falsy "alerts" value.
    alerts = payload["data"]["alerts"] or []

    # (MaxN) Allow for, and filter out, alerts we'd expect to see in an AIO
    # environment.
    # TODO - find a way of configuring this for SCT running in other
    # environments.
    aio_alerts_to_ignore = [
        # We know our volumes are small.
        {"alertname": "StorageFillingUp", "instance": "controller0"},
        # This is probably due to storage space..
        {"alertname": "ElasticsearchClusterYellow", "instance": "controller0"},
        # ..or because we're running in a single instance and it wants to be
        # clustered across multiple nodes.
        {"alertname": "ElasticsearchUnassignedShards", "instance": "controller0"},
        # It's a small AIO!
        {"alertname": "LowMemory", "instance": "controller0"},
        # It's only one node and expects three, see
        # https://github.com/stackhpc/stackhpc-kayobe-config/pull/1579
        {"alertname": "RabbitMQNodeDown"},
        # This is probably because Tempest runs before pytest so the container
        # has been recently stopped.
        {"alertname": "ContainerKilled", "name": "tempest"},
    ]

    unexpected_alerts = [
        alert
        for alert in alerts
        if not _alert_is_ignored(alert["labels"], aio_alerts_to_ignore)
    ]
    # Name the offending alerts so a failure is actionable without re-querying
    # Prometheus.
    assert not unexpected_alerts, "Unexpected active alerts: {0}".format(
        [alert["labels"] for alert in unexpected_alerts]
    )