diff --git a/components/ironic/kustomization.yaml b/components/ironic/kustomization.yaml
index bc272f40f..cb86508cd 100644
--- a/components/ironic/kustomization.yaml
+++ b/components/ironic/kustomization.yaml
@@ -17,3 +17,6 @@ resources:
   # Graphical consoles
   - role-ironic-graphical-console.yaml
   - role-binding-ironic-graphical-console.yaml
+  # Alerting
+  - pr-clean-failed-servers.yaml
+  - pr-resource-availability.yaml
diff --git a/components/ironic/pr-clean-failed-servers.yaml b/components/ironic/pr-clean-failed-servers.yaml
new file mode 100644
index 000000000..103c201f3
--- /dev/null
+++ b/components/ironic/pr-clean-failed-servers.yaml
@@ -0,0 +1,21 @@
+---
+# Alerts when an Ironic node is reported in the "clean failed" provision state.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: ironic-clean-failed-servers
+  namespace: openstack
+spec:
+  groups:
+    - name: ironic.rules
+      interval: 30s
+      rules:
+        - alert: IronicCleanFailedServer
+          # Fires for each matching series once the condition has held for 30s.
+          expr: openstack_ironic_node{provision_state="clean failed"} == 1
+          for: 30s
+          labels:
+            severity: warning
+          annotations:
+            summary: "Ironic node {{ $labels.name }} in clean failed state"
+            description: "Ironic node {{ $labels.name }} (ID: {{ $labels.id }}) is in 'clean failed' provision state"
diff --git a/components/ironic/pr-resource-availability.yaml b/components/ironic/pr-resource-availability.yaml
new file mode 100644
index 000000000..5bc85b85a
--- /dev/null
+++ b/components/ironic/pr-resource-availability.yaml
@@ -0,0 +1,36 @@
+---
+# Warns when the share of zero-usage series in a placement resource class
+# drops below 15%.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: ironic-low-resource-availability
+  namespace: openstack
+spec:
+  groups:
+    - name: ironic.rules
+      interval: 30s
+      rules:
+        - alert: ServerPoolLowResourceAvailability
+          # Ratio of series with zero usage to all series, per resourcetype.
+          # `== bool 0` yields 0/1 for every series instead of filtering, so a
+          # class with no zero-usage series evaluates to 0 and still alerts;
+          # a filtered count() would return no sample there and stay silent.
+          # label_replace copies the part after "CUSTOM_" into resource_name.
+          # NOTE(review): resourcetype values without that prefix get no
+          # resource_name label (blank in annotations) - confirm none exist.
+          expr: |
+            label_replace(
+              (
+                sum by (resourcetype) (openstack_placement_resource_usage == bool 0)
+                /
+                count by (resourcetype) (openstack_placement_resource_usage)
+              ) < 0.15,
+              "resource_name", "$1", "resourcetype", "CUSTOM_(.*)"
+            )
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Server pool {{ $labels.resource_name }} nearly exhausted."
+            description: "{{ $labels.resource_name }} server pool has only {{ $value | humanizePercentage }} available machines (threshold: 15%)."