Skip to content

Commit

Permalink
ROX-16887: always extend SLO calculation over entire 28 days (#87)
Browse files Browse the repository at this point in the history
* always extend SLO calculation over entire 28 days

Extended average over time refers to the time series effectively being extended
over the entire time interval. This is in contrast to avg_over_time, which
only averages over time intervals where the time series is not nil.
This is important during the initial 28 days of Central instances. For example,
consider a Central instance that lived for 5 minutes and was down for 2 minutes.
Using avg_over_time, the availability would be 3 min / 5 min = 60%. The
extended average over 28 days would yield 1 - 2 min / 28 days ~ 99.995%.
After the initial 28 days, both averages are equivalent.

* only alert if Central still exists

* fix dashboard
  • Loading branch information
stehessel committed May 5, 2023
1 parent 2fb9938 commit ae9c16b
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 48 deletions.
2 changes: 1 addition & 1 deletion resources/grafana/rhacs-central-slo-dashboard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ spec:
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "avg(central:sli:availability:avg_over_time28d{rhacs_instance_id=~\"$instance_id\"})",
"expr": "avg(central:sli:availability:extended_avg_over_time28d{rhacs_instance_id=~\"$instance_id\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
51 changes: 41 additions & 10 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -350,32 +350,63 @@ spec:
record: central:sli:availability
- expr: |
avg_over_time(central:sli:availability[1h])
record: central:sli:availability:avg_over_time1h
count_over_time(central:sli:availability[1h])
record: central:sli:availability:count_over_time1h
- expr: |
avg_over_time(central:sli:availability[28d])
record: central:sli:availability:avg_over_time28d
count_over_time(central:sli:availability[28d])
record: central:sli:availability:count_over_time28d
- expr: |
sum_over_time(central:sli:availability[1h])
record: central:sli:availability:sum_over_time1h
- expr: |
sum_over_time(central:sli:availability[28d])
record: central:sli:availability:sum_over_time28d
# Extended average over time refers to the time series effectively being extended
# over the entire time interval. This is in contrast to `avg_over_time`, which
# only averages over time intervals where the time series is not nil.
# This is important during the initial 28 days of Central instances. For example,
# consider a Central instance that lived for 5 minutes and was down for 2 minutes.
# Using `avg_over_time`, the availability would be `3 min / 5 min = 60%`. The
# extended average over 28 days would yield `1 - 2 min / 28 days ~ 99.995%`.
# After the initial 28 days, both averages are equivalent.
- expr: |
1 - (central:sli:availability:count_over_time1h - central:sli:availability:sum_over_time1h) / scalar(central:slo:scrapes1h)
record: central:sli:availability:extended_avg_over_time1h
- expr: |
1 - (central:sli:availability:count_over_time28d - central:sli:availability:sum_over_time28d) / scalar(central:slo:scrapes28d)
record: central:sli:availability:extended_avg_over_time28d
- name: rhacs-central.slo
rules:
# Availability SLO
- expr: "0.99"
record: central:slo:availability

# Based on 30s scrape intervals.
- expr: "60 * 2"
record: central:slo:scrapes1h

- expr: "28 * 24 * 60 * 2"
record: central:slo:scrapes28d

# 0% exhaustion means no recorded failures.
# 100% exhaustion means the SLO target has been reached.
# >100% exhaustion means the SLO target has been violated.
- expr: |
(1 - central:sli:availability:avg_over_time28d) / (1 - scalar(central:slo:availability))
(1 - central:sli:availability:extended_avg_over_time28d) / (1 - scalar(central:slo:availability))
record: central:slo:availability:error_budget_exhaustion
# A burn rate of 1 corresponds to full error budget exhaustion after the SLO window W.
# A burn rate of n corresponds to full error budget exhaustion after W/n - in other words,
# it measures the exhaustion velocity. To keep the SLO target, a temporary burn rate larger
# than 1 must be compensated with a burn rate smaller than 1.
- expr: |
(1 - central:sli:availability:avg_over_time1h) / (1 - scalar(central:slo:availability))
(1 - central:sli:availability:extended_avg_over_time1h) / (1 - scalar(central:slo:availability))
record: central:slo:availability:burnrate1h
- name: rhacs-central.alerts
Expand All @@ -385,7 +416,7 @@ spec:
message: "High availability error budget exhaustion for central. Current exhaustion: {{ $value | humanizePercentage }}."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
expr: |
central:slo:availability:error_budget_exhaustion >= 0.9
central:slo:availability:error_budget_exhaustion >= 0.9 and central:sli:availability >= 0
labels:
service: central
severity: critical
Expand All @@ -401,7 +432,7 @@ spec:
message: "High availability error budget exhaustion for central. Current exhaustion: {{ $value | humanizePercentage }}."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
expr: |
central:slo:availability:error_budget_exhaustion >= 0.7
central:slo:availability:error_budget_exhaustion >= 0.7 and central:sli:availability >= 0
labels:
service: central
severity: warning
Expand All @@ -417,7 +448,7 @@ spec:
message: "High availability error budget exhaustion for central. Current exhaustion: {{ $value | humanizePercentage }}."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
expr: |
central:slo:availability:error_budget_exhaustion >= 0.5
central:slo:availability:error_budget_exhaustion >= 0.5 and central:sli:availability >= 0
labels:
service: central
severity: warning
Expand All @@ -435,7 +466,7 @@ spec:
# Corresponds to less than 50% up time over 1 hour assuming a 99% SLO target.
# See recording rules for how burn rates relate to SLI and SLO in general.
expr: |
central:slo:availability:burnrate1h >= 50
central:slo:availability:burnrate1h >= 50 and central:sli:availability >= 0
labels:
service: central
severity: critical
Expand Down
76 changes: 39 additions & 37 deletions resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
rule_files:
- /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m
evaluation_interval: 30s

group_eval_order:
- rhacs-central.sli
Expand All @@ -12,20 +12,20 @@ tests:
# Central availability error budget exhaustion - 90%
- interval: 5m
input_series:
# 15m downtime due to pod not ready.
# 200m downtime due to pod not ready. Out of 28 days, this equates to ~0.5% downtime.
- series: kube_pod_container_status_ready{container="central", pod="central-test1", namespace="rhacs-test1"}
values: "1+0x300 0 0 0 1+0x100"
values: "1+0x260 0+0x40 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "1+1x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "1+1x397 399+2x1"
values: "1+1x360 362+2x40"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "4+4x400"
# 5m downtime due to NOK or 5xx responses.
# 200m downtime due to NOK or 5xx responses. Out of 28 days, this equates to ~0.5% downtime.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "0+0x397 0+1x1"
values: "0+0x360 0+1x40"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "0+0x397 0+4x1"
values: "0+0x360 0+4x40"
alert_rule_test:
- eval_time: 100m
alertname: Central availability error budget exhaustion - 90%
Expand All @@ -40,26 +40,26 @@ tests:
namespace: rhacs-test1
rhacs_instance_id: test
exp_annotations:
message: "High availability error budget exhaustion for central. Current exhaustion: 100.2%."
message: "High availability error budget exhaustion for central. Current exhaustion: 97.97%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

# Central availability error budget exhaustion - 70%
- interval: 5m
input_series:
# 10m downtime due to pod not ready.
# 175m downtime due to pod not ready. Out of 28 days, this equates to ~0.43% downtime.
- series: kube_pod_container_status_ready{container="central", pod="central-test2", namespace="rhacs-test2"}
values: "1+0x300 0 0 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test2"}
values: "2+2x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test2"}
values: "2+2x397 798+4x1"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test2"}
values: "3+3x400"
# 5m downtime due to NOK or 5xx responses.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test2"}
values: "0+0x397 0+2x1"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test2"}
values: "0+0x397 0+3x1"
values: "1+0x265 0+0x35 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "1+1x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "1+1x365 367+2x35"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "4+4x400"
# 175m downtime due to NOK or 5xx responses. Out of 28 days, this equates to ~0.43% downtime.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "0+0x365 0+1x35"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "0+0x365 0+4x35"
alert_rule_test:
- eval_time: 100m
alertname: Central availability error budget exhaustion - 70%
Expand All @@ -72,27 +72,28 @@ tests:
service: central
severity: warning
namespace: rhacs-test2
rhacs_instance_id: test
exp_annotations:
message: "High availability error budget exhaustion for central. Current exhaustion: 75.15%."
message: "High availability error budget exhaustion for central. Current exhaustion: 85.57%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

# Central availability error budget exhaustion - 50%
- interval: 5m
input_series:
# 5m downtime due to pod not ready.
# 105m downtime due to pod not ready. Out of 28 days, this equates to ~0.25% downtime.
- series: kube_pod_container_status_ready{container="central", pod="central-test3", namespace="rhacs-test3"}
values: "1+0x300 0 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test3"}
values: "3+3x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test3"}
values: "3+3x397 1197+6x1"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test3"}
values: "2+2x400"
# 5m downtime due to 5xx responses.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test3"}
values: "0+0x397 0+3x1"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test3"}
values: "0+0x397 0+2x1"
values: "1+0x279 0+0x21 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "1+1x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "1+1x379 381+2x21"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "4+4x400"
# 105m downtime due to NOK or 5xx responses. Out of 28 days, this equates to ~0.25% downtime.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "0+0x379 0+1x21"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "0+0x379 0+4x21"
alert_rule_test:
- eval_time: 100m
alertname: Central availability error budget exhaustion - 50%
Expand All @@ -105,8 +106,9 @@ tests:
service: central
severity: warning
namespace: rhacs-test3
rhacs_instance_id: test
exp_annotations:
message: "High availability error budget exhaustion for central. Current exhaustion: 50.1%."
message: "High availability error budget exhaustion for central. Current exhaustion: 50.84%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

# Central high availability burn rate
Expand Down Expand Up @@ -134,5 +136,5 @@ tests:
severity: critical
namespace: rhacs-test4
exp_annotations:
message: "High availability burn rate for central. Current burn rate per hour: 59.02."
message: "High availability burn rate for central. Current burn rate per hour: 59.17."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

0 comments on commit ae9c16b

Please sign in to comment.