From 6c3f34f2dde46ccba7630da9c0c0bac5a2e5956e Mon Sep 17 00:00:00 2001 From: Stephan Hesselmann Date: Tue, 3 Oct 2023 15:11:36 +0200 Subject: [PATCH] fix(sli): allow `Unknown` status code in SLI error rate (#149) --- resources/prometheus/prometheus-rules.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 0c9c253..86d62a4 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -351,9 +351,10 @@ spec: # The error rate over the last 10 minutes must be smaller than 35% to count as available. # GRPC + # TODO(ROX-19917): Re-add `grpc_code="Unknown"` to the list of server errors. - expr: | sum by (namespace, rhacs_instance_id, rhacs_org_id, rhacs_org_name, rhacs_cluster_name, rhacs_environment) - (rate(grpc_server_handled_total{namespace=~"rhacs-.*", job="central", grpc_type="unary", grpc_service!="v1.PingService", grpc_code!~"DeadlineExceeded|Internal|Unavailable|Unknown"}[10m])) + (rate(grpc_server_handled_total{namespace=~"rhacs-.*", job="central", grpc_type="unary", grpc_service!="v1.PingService", grpc_code!~"DeadlineExceeded|Internal|Unavailable"}[10m])) record: central:grpc_server_handled:server_available_code:rate10m - expr: |