From 20c8342a0b1a46e4a26e913c42a86cd6a8c04175 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stephan=20He=C3=9Felmann?=
Date: Thu, 27 Jun 2024 04:22:18 +0200
Subject: [PATCH] feat(alerts): add weekly exhaustion alert

---

Notes:
    Reviewer notes; this section sits after the "---" separator and is
    not part of the commit message.

    * Adds the warning-severity alert "Central availability weekly
      exhaustion" for service "central". Its expression subtracts the
      value of central:slo:availability:error_budget_exhaustion from one
      week earlier (offset 1w) from the current value and keeps results
      above 0.1. In PromQL "-" binds tighter than ">" and "offset"
      attaches to the selector, so the expression reads as
      (a - (a offset 1w)) > 0.1, and $value in the message is that
      difference rendered with humanizePercentage.
    * Adds a promtool unit test at a 30m sample interval: no alert is
      expected at eval_time 100m (presumably because the "offset 1w"
      operand has no samples yet - confirm), and one alert with the
      message "... increased by 21.83% ..." is expected at 11000m.
    * NOTE(review): the line structure of this patch was restored from a
      whitespace-collapsed copy. Hunk line counts match the hunk headers,
      but the leading indentation of every hunk line was inferred from
      the YAML nesting - verify it against the target files before
      applying.

 resources/prometheus/prometheus-rules.yaml    | 12 ++++++++++
 .../unit_tests/RHACSCentralSLISLO.yaml        | 22 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index a9f9f332..7db4d4d8 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -597,6 +597,18 @@ spec:
             namespace: "{{ $labels.namespace }}"
             rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
 
+        - alert: Central availability weekly exhaustion
+          annotations:
+            message: "Availability error budget exhaustion has increased by {{ $value | humanizePercentage }} over the last week."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
+          expr: |
+            central:slo:availability:error_budget_exhaustion - central:slo:availability:error_budget_exhaustion offset 1w > 0.1
+          labels:
+            service: central
+            severity: warning
+            namespace: "{{ $labels.namespace }}"
+            rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
+
         - alert: Central high availability burn rate
           annotations:
             message: "High availability burn rate for central. Current burn rate per hour: {{ $value | humanize }}."
diff --git a/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml b/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
index 8933a82d..2fa3e971 100644
--- a/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
+++ b/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
@@ -148,6 +148,28 @@ tests:
               message: "High availability error budget exhaustion for central. Current exhaustion: 50.84%."
               sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
 
+  # Central weekly exhaustion
+  - interval: 30m
+    input_series:
+      - series: kube_deployment_status_replicas_ready{deployment="central", namespace="rhacs-ffffgggghhhhiiiijjjj"}
+        values: "0+0x5 1+0x265 0+0x15 1+0x100"
+    alert_rule_test:
+      - eval_time: 100m
+        alertname: Central availability weekly exhaustion
+        exp_alerts: []
+      - eval_time: 11000m
+        alertname: Central availability weekly exhaustion
+        exp_alerts:
+          - exp_labels:
+              alertname: Central availability weekly exhaustion
+              service: central
+              severity: warning
+              namespace: rhacs-ffffgggghhhhiiiijjjj
+              rhacs_instance_id: ffffgggghhhhiiiijjjj
+            exp_annotations:
+              message: "Availability error budget exhaustion has increased by 21.83% over the last week."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
+
   # Central high availability burn rate
   - interval: 5m
     input_series: