Skip to content

Commit

Permalink
ROX-16887: always extend SLO calculation over entire 28 days (#87)
Browse files Browse the repository at this point in the history
* always extend SLO calculation over entire 28 days

Extended average over time refers to the time series effectively being extended
over the entire time interval. This is in contrast to avg_over_time, which
only averages over time intervals where the time series is not nil.
This is important during the initial 28 days of Central instances. For example,
consider a Central instance that lived for 5 minutes and was down for 2 minutes.
Using avg_over_time, the availability would be 3 min / 5 min = 60%. The
extended average over 28 days would yield 1 - 2 min / 28 days ~ 99.995%.
After the initial 28 days, both averages are equivalent.

* only alert if Central still exists

* fix dashboard
  • Loading branch information
stehessel committed May 5, 2023
1 parent 2fb9938 commit ae9c16b
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 48 deletions.
2 changes: 1 addition & 1 deletion resources/grafana/rhacs-central-slo-dashboard.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ spec:
"uid": "PBFA97CFB590B2093"
},
"editorMode": "code",
"expr": "avg(central:sli:availability:avg_over_time28d{rhacs_instance_id=~\"$instance_id\"})",
"expr": "avg(central:sli:availability:extended_avg_over_time28d{rhacs_instance_id=~\"$instance_id\"})",
"legendFormat": "__auto",
"range": true,
"refId": "A"
Expand Down
51 changes: 41 additions & 10 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -350,32 +350,63 @@ spec:
record: central:sli:availability
- expr: |
avg_over_time(central:sli:availability[1h])
record: central:sli:availability:avg_over_time1h
count_over_time(central:sli:availability[1h])
record: central:sli:availability:count_over_time1h
- expr: |
avg_over_time(central:sli:availability[28d])
record: central:sli:availability:avg_over_time28d
count_over_time(central:sli:availability[28d])
record: central:sli:availability:count_over_time28d
- expr: |
sum_over_time(central:sli:availability[1h])
record: central:sli:availability:sum_over_time1h
- expr: |
sum_over_time(central:sli:availability[28d])
record: central:sli:availability:sum_over_time28d
# Extended average over time refers to the time series effectively being extended
# over the entire time interval. This is in contrast to `avg_over_time`, which
# only averages over time intervals where the time series is not nil.
# This is important during the initial 28 days of Central instances. For example,
# consider a Central instance that lived for 5 minutes and was down for 2 minutes.
# Using `avg_over_time`, the availability would be `3 min / 5 min = 60%`. The
# extended average over 28 days would yield `1 - 2 min / 28 days ~ 99.995%`.
# After the initial 28 days, both averages are equivalent.
- expr: |
1 - (central:sli:availability:count_over_time1h - central:sli:availability:sum_over_time1h) / scalar(central:slo:scrapes1h)
record: central:sli:availability:extended_avg_over_time1h
- expr: |
1 - (central:sli:availability:count_over_time28d - central:sli:availability:sum_over_time28d) / scalar(central:slo:scrapes28d)
record: central:sli:availability:extended_avg_over_time28d
- name: rhacs-central.slo
rules:
# Availability SLO
- expr: "0.99"
record: central:slo:availability

# Based on 30s scrape intervals.
- expr: "60 * 2"
record: central:slo:scrapes1h

- expr: "28 * 24 * 60 * 2"
record: central:slo:scrapes28d

# 0% exhaustion means no recorded failures.
# 100% exhaustion means the SLO target has been reached.
# >100% exhaustion means the SLO target has been violated.
- expr: |
(1 - central:sli:availability:avg_over_time28d) / (1 - scalar(central:slo:availability))
(1 - central:sli:availability:extended_avg_over_time28d) / (1 - scalar(central:slo:availability))
record: central:slo:availability:error_budget_exhaustion
# A burn rate of 1 corresponds to full error budget exhaustion after the SLO window W.
# A burn rate of n corresponds to full error budget exhaustion after W/n - in other words,
# it measures the exhaustion velocity. To keep the SLO target, a temporary burn rate larger
# than 1 must be compensated with a burn rate smaller than 1.
- expr: |
(1 - central:sli:availability:avg_over_time1h) / (1 - scalar(central:slo:availability))
(1 - central:sli:availability:extended_avg_over_time1h) / (1 - scalar(central:slo:availability))
record: central:slo:availability:burnrate1h
- name: rhacs-central.alerts
Expand All @@ -385,7 +416,7 @@ spec:
message: "High availability error budget exhaustion for central. Current exhaustion: {{ $value | humanizePercentage }}."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
expr: |
central:slo:availability:error_budget_exhaustion >= 0.9
central:slo:availability:error_budget_exhaustion >= 0.9 and central:sli:availability >= 0
labels:
service: central
severity: critical
Expand All @@ -401,7 +432,7 @@ spec:
message: "High availability error budget exhaustion for central. Current exhaustion: {{ $value | humanizePercentage }}."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
expr: |
central:slo:availability:error_budget_exhaustion >= 0.7
central:slo:availability:error_budget_exhaustion >= 0.7 and central:sli:availability >= 0
labels:
service: central
severity: warning
Expand All @@ -417,7 +448,7 @@ spec:
message: "High availability error budget exhaustion for central. Current exhaustion: {{ $value | humanizePercentage }}."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
expr: |
central:slo:availability:error_budget_exhaustion >= 0.5
central:slo:availability:error_budget_exhaustion >= 0.5 and central:sli:availability >= 0
labels:
service: central
severity: warning
Expand All @@ -435,7 +466,7 @@ spec:
# Corresponds to less than 50% up time over 1 hour assuming a 99% SLO target.
# See recording rules for how burn rates relate to SLI and SLO in general.
expr: |
central:slo:availability:burnrate1h >= 50
central:slo:availability:burnrate1h >= 50 and central:sli:availability >= 0
labels:
service: central
severity: critical
Expand Down
76 changes: 39 additions & 37 deletions resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
rule_files:
- /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m
evaluation_interval: 30s

group_eval_order:
- rhacs-central.sli
Expand All @@ -12,20 +12,20 @@ tests:
# Central availability error budget exhaustion - 90%
- interval: 5m
input_series:
# 15m downtime due to pod not ready.
# 200m downtime due to pod not ready. Out of 28 days, this equates to ~0.5% downtime.
- series: kube_pod_container_status_ready{container="central", pod="central-test1", namespace="rhacs-test1"}
values: "1+0x300 0 0 0 1+0x100"
values: "1+0x260 0+0x40 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "1+1x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "1+1x397 399+2x1"
values: "1+1x360 362+2x40"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "4+4x400"
# 5m downtime due to NOK or 5xx responses.
# 200m downtime due to NOK or 5xx responses. Out of 28 days, this equates to ~0.5% downtime.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "0+0x397 0+1x1"
values: "0+0x360 0+1x40"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test1", rhacs_instance_id="test"}
values: "0+0x397 0+4x1"
values: "0+0x360 0+4x40"
alert_rule_test:
- eval_time: 100m
alertname: Central availability error budget exhaustion - 90%
Expand All @@ -40,26 +40,26 @@ tests:
namespace: rhacs-test1
rhacs_instance_id: test
exp_annotations:
message: "High availability error budget exhaustion for central. Current exhaustion: 100.2%."
message: "High availability error budget exhaustion for central. Current exhaustion: 97.97%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

# Central availability error budget exhaustion - 70%
- interval: 5m
input_series:
# 10m downtime due to pod not ready.
# 175m downtime due to pod not ready. Out of 28 days, this equates to ~0.43% downtime.
- series: kube_pod_container_status_ready{container="central", pod="central-test2", namespace="rhacs-test2"}
values: "1+0x300 0 0 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test2"}
values: "2+2x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test2"}
values: "2+2x397 798+4x1"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test2"}
values: "3+3x400"
# 5m downtime due to NOK or 5xx responses.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test2"}
values: "0+0x397 0+2x1"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test2"}
values: "0+0x397 0+3x1"
values: "1+0x265 0+0x35 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "1+1x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "1+1x365 367+2x35"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "4+4x400"
# 175m downtime due to NOK or 5xx responses. Out of 28 days, this equates to ~0.43% downtime.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "0+0x365 0+1x35"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test2", rhacs_instance_id="test"}
values: "0+0x365 0+4x35"
alert_rule_test:
- eval_time: 100m
alertname: Central availability error budget exhaustion - 70%
Expand All @@ -72,27 +72,28 @@ tests:
service: central
severity: warning
namespace: rhacs-test2
rhacs_instance_id: test
exp_annotations:
message: "High availability error budget exhaustion for central. Current exhaustion: 75.15%."
message: "High availability error budget exhaustion for central. Current exhaustion: 85.57%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

# Central availability error budget exhaustion - 50%
- interval: 5m
input_series:
# 5m downtime due to pod not ready.
# 105m downtime due to pod not ready. Out of 28 days, this equates to ~0.25% downtime.
- series: kube_pod_container_status_ready{container="central", pod="central-test3", namespace="rhacs-test3"}
values: "1+0x300 0 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test3"}
values: "3+3x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test3"}
values: "3+3x397 1197+6x1"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test3"}
values: "2+2x400"
# 5m downtime due to 5xx responses.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test3"}
values: "0+0x397 0+3x1"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test3"}
values: "0+0x397 0+2x1"
values: "1+0x279 0+0x21 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "1+1x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "1+1x379 381+2x21"
- series: http_incoming_requests_total{job="central", code="200", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "4+4x400"
# 105m downtime due to NOK or 5xx responses. Out of 28 days, this equates to ~0.25% downtime.
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="NOK", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "0+0x379 0+1x21"
- series: http_incoming_requests_total{job="central", code="500", namespace="rhacs-test3", rhacs_instance_id="test"}
values: "0+0x379 0+4x21"
alert_rule_test:
- eval_time: 100m
alertname: Central availability error budget exhaustion - 50%
Expand All @@ -105,8 +106,9 @@ tests:
service: central
severity: warning
namespace: rhacs-test3
rhacs_instance_id: test
exp_annotations:
message: "High availability error budget exhaustion for central. Current exhaustion: 50.1%."
message: "High availability error budget exhaustion for central. Current exhaustion: 50.84%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

# Central high availability burn rate
Expand Down Expand Up @@ -134,5 +136,5 @@ tests:
severity: critical
namespace: rhacs-test4
exp_annotations:
message: "High availability burn rate for central. Current burn rate per hour: 59.02."
message: "High availability burn rate for central. Current burn rate per hour: 59.17."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

0 comments on commit ae9c16b

Please sign in to comment.