diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index b6b1a8ad..d86ad796 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -690,7 +690,7 @@ spec: description: "During the last 15 minutes, the average memory request commitment on worker nodes was {{ $value | humanizePercentage }}. This could make pods unschedulable." sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-027-cluster-scale-up.md" - alert: WorkerNodesCPUQuotaOverCommit - expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.1 + expr: avg(availability_zone:acscs_worker_nodes:cpu_request_ratio) > 0.99 for: 15m labels: severity: warning diff --git a/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml b/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml index dfa86aad..b28b2506 100644 --- a/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml +++ b/resources/prometheus/unit_tests/WorkerNodesCPUQuotaOverCommit.yaml @@ -7,18 +7,21 @@ tests: - interval: 1m input_series: - series: kube_node_role{node="worker-1", role="acscs-worker"} - values: "1+0x20" + values: "1+0x40" - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} - values: "1+0x20" + values: "1+0x40" - series: kube_node_status_allocatable{node="worker-1", resource="cpu", job="kube-state-metrics"} - values: "200+0x20" + values: "200+0x40" - series: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{node="worker-1", resource="cpu", job="kube-state-metrics"} - values: "199+0x20" + values: "196+0x20 199+0x20" alert_rule_test: - eval_time: 1m alertname: WorkerNodesCPUQuotaOverCommit exp_alerts: [] - eval_time: 16m + alertname: WorkerNodesCPUQuotaOverCommit + exp_alerts: [] + - eval_time: 36m alertname: WorkerNodesCPUQuotaOverCommit exp_alerts: - exp_labels: diff --git a/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml b/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml index 3b74cc0d..c5f71ce9 100644 --- a/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml +++ b/resources/prometheus/unit_tests/WorkerNodesMemoryQuotaOverCommit.yaml @@ -7,18 +7,21 @@ tests: - interval: 1m input_series: - series: kube_node_role{node="worker-1", role="acscs-worker"} - values: "1+0x20" + values: "1+0x40" - series: kube_node_labels{node="worker-1", label_failure_domain_beta_kubernetes_io_zone="us-east-1a"} - values: "1+0x20" + values: "1+0x40" - series: kube_node_status_allocatable{node="worker-1", resource="memory", job="kube-state-metrics"} - values: "200+0x20" + values: "200+0x40" - series: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests{node="worker-1", resource="memory", job="kube-state-metrics"} - values: "199+0x20" + values: "196+0x20 199+0x20" alert_rule_test: - eval_time: 1m alertname: WorkerNodesMemoryQuotaOverCommit exp_alerts: [] - eval_time: 16m + alertname: WorkerNodesMemoryQuotaOverCommit + exp_alerts: [] + - eval_time: 36m alertname: WorkerNodesMemoryQuotaOverCommit exp_alerts: - exp_labels: