Skip to content

Commit

Permalink
Merge pull request #14321 from rook/mergify/bp/release-1.14/pr-14312
Browse files Browse the repository at this point in the history
monitoring: update to the latest ceph prometheus rules (backport #14312)
  • Loading branch information
travisn committed Jun 10, 2024
2 parents 3f06a77 + 4253a04 commit dbe2413
Show file tree
Hide file tree
Showing 2 changed files with 551 additions and 19 deletions.
288 changes: 278 additions & 10 deletions deploy/charts/rook-ceph-cluster/prometheus/localrules.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
# copied from https://github.com/ceph/ceph/blob/master/monitoring/ceph-mixin/prometheus_alerts.yml
# Copied from https://github.com/ceph/ceph/blob/master/monitoring/ceph-mixin/prometheus_alerts.yml
# Attention: This is not a 1:1 copy of ceph-mixin alerts. This file contains several Rook-related adjustments.
# List of main adjustments:
# - Alerts related to cephadm are excluded
# - The PrometheusJobMissing alert is adjusted for the rook-ceph-mgr job, and the PrometheusJobExporterMissing alert is added
groups:
- name: "cluster health"
rules:
Expand Down Expand Up @@ -198,7 +202,7 @@ groups:
type: "ceph_default"
- alert: "CephDeviceFailurePredictionTooHigh"
annotations:
description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availabililty. Prevent data integrity issues by adding new OSDs so that data may be relocated."
description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany"
summary: "Too many devices are predicted to fail, unable to resolve"
expr: "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1"
Expand Down Expand Up @@ -395,7 +399,7 @@ groups:
oid: "1.3.6.1.4.1.50495.1.2.1.7.5"
severity: "critical"
type: "ceph_default"
- alert: "CephPGUnavilableBlockingIO"
- alert: "CephPGUnavailableBlockingIO"
annotations:
description: "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O."
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability"
Expand Down Expand Up @@ -499,8 +503,8 @@ groups:
type: "ceph_default"
- alert: "CephNodeNetworkBondDegraded"
annotations:
summary: "Degraded Bond on Node {{ $labels.instance }}"
description: "Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}."
summary: "Degraded Bond on Node {{ $labels.instance }}"
expr: |
node_bonding_slaves - node_bonding_active != 0
labels:
Expand All @@ -525,6 +529,15 @@ groups:
type: "ceph_default"
- name: "pools"
rules:
- alert: "CephPoolGrowthWarning"
annotations:
description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
summary: "Pool growth rate may soon exceed capacity"
expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
severity: "warning"
type: "ceph_default"
- alert: "CephPoolBackfillFull"
annotations:
description: "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity."
Expand Down Expand Up @@ -566,15 +579,99 @@ groups:
severity: "warning"
type: "ceph_default"
- alert: "CephDaemonSlowOps"
for: "30s"
expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
labels:
severity: 'warning'
type: 'ceph_default'
annotations:
summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
description: "{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)"
documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops"
summary: "{{ $labels.ceph_daemon }} operations are slow to complete"
expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0"
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- name: "hardware"
rules:
- alert: "HardwareStorageError"
annotations:
description: "Some storage devices are in error. Check `ceph health detail`."
summary: "Storage devices error(s) detected"
expr: "ceph_health_detail{name=\"HARDWARE_STORAGE\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.13.1"
severity: "critical"
type: "ceph_default"
- alert: "HardwareMemoryError"
annotations:
description: "DIMM error(s) detected. Check `ceph health detail`."
summary: "DIMM error(s) detected"
expr: "ceph_health_detail{name=\"HARDWARE_MEMORY\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.13.2"
severity: "critical"
type: "ceph_default"
- alert: "HardwareProcessorError"
annotations:
description: "Processor error(s) detected. Check `ceph health detail`."
summary: "Processor error(s) detected"
expr: "ceph_health_detail{name=\"HARDWARE_PROCESSOR\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.13.3"
severity: "critical"
type: "ceph_default"
- alert: "HardwareNetworkError"
annotations:
description: "Network error(s) detected. Check `ceph health detail`."
summary: "Network error(s) detected"
expr: "ceph_health_detail{name=\"HARDWARE_NETWORK\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.13.4"
severity: "critical"
type: "ceph_default"
- alert: "HardwarePowerError"
annotations:
description: "Power supply error(s) detected. Check `ceph health detail`."
summary: "Power supply error(s) detected"
expr: "ceph_health_detail{name=\"HARDWARE_POWER\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.13.5"
severity: "critical"
type: "ceph_default"
- alert: "HardwareFanError"
annotations:
description: "Fan error(s) detected. Check `ceph health detail`."
summary: "Fan error(s) detected"
expr: "ceph_health_detail{name=\"HARDWARE_FANS\"} > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.13.6"
severity: "critical"
type: "ceph_default"
- name: "PrometheusServer"
rules:
- alert: "PrometheusJobMissing"
annotations:
description: "The prometheus job that scrapes from Ceph MGR is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance."
summary: "The scrape job for Ceph MGR is missing from Prometheus"
expr: "absent(up{job=\"rook-ceph-mgr\"})"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.12.1"
severity: "critical"
type: "ceph_default"
- alert: "PrometheusJobExporterMissing"
annotations:
description: "The prometheus job that scrapes from Ceph Exporter is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance."
summary: "The scrape job for Ceph Exporter is missing from Prometheus"
expr: "sum(absent(up{job=\"rook-ceph-exporter\"})) and sum(ceph_osd_metadata{ceph_version=~\"^ceph version (1[89]|[2-9][0-9]).*\"}) > 0"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.12.1"
severity: "critical"
type: "ceph_default"
- name: "rados"
rules:
- alert: "CephObjectMissing"
Expand All @@ -601,3 +698,174 @@ groups:
oid: "1.3.6.1.4.1.50495.1.2.1.1.2"
severity: "critical"
type: "ceph_default"
- name: "rbdmirror"
rules:
- alert: "CephRBDMirrorImagesPerDaemonHigh"
annotations:
        description: "The number of image replications per daemon should not exceed the threshold of 100"
        summary: "Number of image replications is now above 100"
expr: "sum by (ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.2"
severity: "critical"
type: "ceph_default"
- alert: "CephRBDMirrorImagesNotInSync"
annotations:
description: "Both local and remote RBD mirror images should be in sync."
        summary: "Some of the RBD mirror images are not in sync with the remote counterparts."
expr: "sum by (ceph_daemon, image, namespace, pool) (topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.3"
severity: "critical"
type: "ceph_default"
- alert: "CephRBDMirrorImagesNotInSyncVeryHigh"
annotations:
description: "More than 10% of the images have synchronization problems"
        summary: "Number of unsynchronized images is very high."
expr: "count by (ceph_daemon) ((topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon) (ceph_rbd_mirror_snapshot_snapshots)*.1)"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.4"
severity: "critical"
type: "ceph_default"
- alert: "CephRBDMirrorImageTransferBandwidthHigh"
annotations:
description: "Detected a heavy increase in bandwidth for rbd replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously"
summary: "The replication network usage has been increased over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleaned automatically after 30 minutes"
expr: "rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80"
for: "1m"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.10.5"
severity: "warning"
type: "ceph_default"
- name: "nvmeof"
rules:
- alert: "NVMeoFSubsystemNamespaceLimit"
annotations:
description: "Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}"
summary: "{{ $labels.nqn }} subsystem has reached its maximum number of namespaces "
expr: "(count by(nqn) (ceph_nvmeof_subsystem_namespace_metadata)) >= ceph_nvmeof_subsystem_namespace_limit"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFTooManyGateways"
annotations:
description: "You may create many gateways, but 4 is the tested limit"
summary: "Max supported gateways exceeded "
expr: "count(ceph_nvmeof_gateway_info) > 4.00"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFMaxGatewayGroupSize"
annotations:
description: "You may create many gateways in a gateway group, but 2 is the tested limit"
summary: "Max gateways within a gateway group ({{ $labels.group }}) exceeded "
expr: "count by(group) (ceph_nvmeof_gateway_info) > 2.00"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFSingleGatewayGroup"
annotations:
description: "Although a single member gateway group is valid, it should only be used for test purposes"
summary: "The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible "
expr: "count by(group) (ceph_nvmeof_gateway_info) == 1"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFHighGatewayCPU"
annotations:
description: "Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores"
summary: "CPU used by {{ $labels.instance }} NVMe-oF Gateway is high "
expr: "label_replace(avg by(instance) (rate(ceph_nvmeof_reactor_seconds_total{mode=\"busy\"}[1m])),\"instance\",\"$1\",\"instance\",\"(.*):.*\") > 80.00"
for: "10m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFGatewayOpenSecurity"
annotations:
description: "It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss"
summary: "Subsystem {{ $labels.nqn }} has been defined without host level security "
expr: "ceph_nvmeof_subsystem_metadata{allow_any_host=\"yes\"}"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFTooManySubsystems"
annotations:
description: "Although you may continue to create subsystems in {{ $labels.gateway_host }}, the configuration may not be supported"
summary: "The number of subsystems defined to the gateway exceeds supported values "
expr: "count by(gateway_host) (label_replace(ceph_nvmeof_subsystem_metadata,\"gateway_host\",\"$1\",\"instance\",\"(.*):.*\")) > 16.00"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFVersionMismatch"
annotations:
description: "This may indicate an issue with deployment. Check cephadm logs"
summary: "The cluster has different NVMe-oF gateway releases active "
expr: "count(count by(version) (ceph_nvmeof_gateway_info)) > 1"
for: "1h"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFHighClientCount"
annotations:
description: "The supported limit for clients connecting to a subsystem is 32"
summary: "The number of clients connected to {{ $labels.nqn }} is too high "
expr: "ceph_nvmeof_subsystem_host_count > 32.00"
for: "1m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFHighHostCPU"
annotations:
description: "High CPU on a gateway host can lead to CPU contention and performance degradation"
summary: "The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) "
expr: "100-((100*(avg by(host) (label_replace(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]),\"host\",\"$1\",\"instance\",\"(.*):.*\")) * on(host) group_right label_replace(ceph_nvmeof_gateway_info,\"host\",\"$1\",\"instance\",\"(.*):.*\")))) >= 80.00"
for: "10m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFInterfaceDown"
annotations:
description: "A NIC used by one or more subsystems is in a down state"
summary: "Network interface {{ $labels.device }} is down "
expr: "ceph_nvmeof_subsystem_listener_iface_info{operstate=\"down\"}"
for: "30s"
labels:
oid: "1.3.6.1.4.1.50495.1.2.1.14.1"
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFInterfaceDuplex"
annotations:
description: "Until this is resolved, performance from the gateway will be degraded"
summary: "Network interface {{ $labels.device }} is not running in full duplex mode "
expr: "ceph_nvmeof_subsystem_listener_iface_info{duplex!=\"full\"}"
for: "30s"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFHighReadLatency"
annotations:
description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
summary: "The average read latency over the last 5 mins has reached 10 ms or more on {{ $labels.gateway }}"
expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.01"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"
- alert: "NVMeoFHighWriteLatency"
annotations:
description: "High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate"
summary: "The average write latency over the last 5 mins has reached 20 ms or more on {{ $labels.gateway }}"
expr: "label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),\"gateway\",\"$1\",\"instance\",\"(.*):.*\") > 0.02"
for: "5m"
labels:
severity: "warning"
type: "ceph_default"

0 comments on commit dbe2413

Please sign in to comment.