Skip to content

Commit

Permalink
feat(alerts): add weekly exhaustion alert
Browse files Browse the repository at this point in the history
  • Loading branch information
stehessel committed Jun 27, 2024
1 parent 5cc3799 commit f8b857b
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 0 deletions.
12 changes: 12 additions & 0 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,18 @@ spec:
namespace: "{{ $labels.namespace }}"
rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"

# Warns when the availability error budget exhaustion of a Central instance
# has grown by more than 0.1 (rendered as 10% by humanizePercentage) relative
# to its value one week earlier.
- alert: Central availability weekly exhaustion
  annotations:
    message: "Availability error budget exhaustion has increased by {{ $value | humanizePercentage }} over the last week."
    sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"
  # `offset 1w` modifies only the second selector, and `-` binds tighter than
  # `>`, so this evaluates as (current value - value one week ago) > 0.1.
  # $value in the message above is therefore the week-over-week difference.
  expr: |
    central:slo:availability:error_budget_exhaustion - central:slo:availability:error_budget_exhaustion offset 1w > 0.1
  labels:
    service: central
    severity: warning
    namespace: "{{ $labels.namespace }}"
    rhacs_instance_id: "{{ $labels.rhacs_instance_id }}"

- alert: Central high availability burn rate
annotations:
message: "High availability burn rate for central. Current burn rate per hour: {{ $value | humanize }}."
Expand Down
22 changes: 22 additions & 0 deletions resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,28 @@ tests:
message: "High availability error budget exhaustion for central. Current exhaustion: 50.84%."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

# Central weekly exhaustion
# Unit test for the "Central availability weekly exhaustion" alert.
# The input series (one sample every 30m, promtool expanding notation) is
# 0 for 6 samples, 1 for 266 samples, 0 for 16 samples, then 1 for 101 samples.
# NOTE(review): presumably the ready-replica count feeds the
# central:slo:availability:error_budget_exhaustion recording rule; that rule
# is not visible here - confirm.
- interval: 30m
  input_series:
    - series: kube_deployment_status_replicas_ready{deployment="central", namespace="rhacs-ffffgggghhhhiiiijjjj"}
      values: "0+0x5 1+0x265 0+0x15 1+0x100"
  alert_rule_test:
    # Early in the series: the alert must not fire.
    # NOTE(review): presumably because no sample exists one week earlier for
    # the `offset 1w` operand - confirm.
    - eval_time: 100m
      alertname: Central availability weekly exhaustion
      exp_alerts: []
    # More than one week (10080m) into the series, after the second run of
    # zero samples: the alert must fire with the labels and message below.
    - eval_time: 11000m
      alertname: Central availability weekly exhaustion
      exp_alerts:
        - exp_labels:
            alertname: Central availability weekly exhaustion
            service: central
            severity: warning
            namespace: rhacs-ffffgggghhhhiiiijjjj
            rhacs_instance_id: ffffgggghhhhiiiijjjj
          exp_annotations:
            message: "Availability error budget exhaustion has increased by 21.83% over the last week."
            sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-018-rhacs-central-slo-alerts.md"

# Central high availability burn rate
- interval: 5m
input_series:
Expand Down

0 comments on commit f8b857b

Please sign in to comment.