From 20c8342a0b1a46e4a26e913c42a86cd6a8c04175 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stephan=20He=C3=9Felmann?= <shesselm@redhat.com>
Date: Thu, 27 Jun 2024 04:22:18 +0200
Subject: [PATCH] feat(alerts): add weekly exhaustion alert

---
 resources/prometheus/prometheus-rules.yaml    | 12 ++++++++++
 .../unit_tests/RHACSCentralSLISLO.yaml        | 22 +++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml
index a9f9f332..7db4d4d8 100644
--- a/resources/prometheus/prometheus-rules.yaml
+++ b/resources/prometheus/prometheus-rules.yaml
@@ -597,6 +597,18 @@ spec:
             namespace: "{{ $labels.namespace }}"
             rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
 
+        - alert: Central availability weekly exhaustion  # warns when error budget exhaustion grew by more than 0.1 versus one week earlier
+          annotations:  # message renders $value, the week-over-week difference produced by expr
+            message: "Availability error budget exhaustion has increased by {{ $value | humanizePercentage }} over the last week."
+            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
+          expr: |
+            central:slo:availability:error_budget_exhaustion - central:slo:availability:error_budget_exhaustion offset 1w > 0.1
+          labels:  # namespace and rhacs_instance_id are copied from the labels of the alerting series
+            service: central
+            severity: warning
+            namespace: "{{ $labels.namespace }}"
+            rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"
+
         - alert: Central high availability burn rate
           annotations:
             message: "High availability burn rate for central. Current burn rate per hour: {{ $value | humanize }}."
diff --git a/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml b/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
index 8933a82d..2fa3e971 100644
--- a/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
+++ b/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
@@ -148,6 +148,28 @@ tests:
               message: "High availability error budget exhaustion for central. Current exhaustion: 50.84%."
               sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
 
+  # Central availability weekly exhaustion: no alert early in the series, one warning alert after more than a week of data
+  - interval: 30m  # spacing between consecutive samples of the input series
+    input_series:
+      - series: kube_deployment_status_replicas_ready{deployment="central", namespace="rhacs-ffffgggghhhhiiiijjjj"}
+        values: "0+0x5 1+0x265 0+0x15 1+0x100"  # ready replicas: 0 for 6 samples, 1 for 266, 0 for 16, then 1 for 101
+    alert_rule_test:
+      - eval_time: 100m  # still inside the initial 0-replica window and less than one week of data
+        alertname: Central availability weekly exhaustion
+        exp_alerts: []
+      - eval_time: 11000m  # past one week (10080m), so the `offset 1w` operand reads the series at 920m
+        alertname: Central availability weekly exhaustion
+        exp_alerts:
+          - exp_labels:
+              alertname: Central availability weekly exhaustion
+              service: central
+              severity: warning
+              namespace: rhacs-ffffgggghhhhiiiijjjj
+              rhacs_instance_id: ffffgggghhhhiiiijjjj  # NOTE(review): presumably derived from the namespace suffix by rules outside this hunk; confirm
+            exp_annotations:
+              message: "Availability error budget exhaustion has increased by 21.83% over the last week."
+              sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
+
   # Central high availability burn rate
   - interval: 5m
     input_series: