-
Notifications
You must be signed in to change notification settings - Fork 57
/
metal-ironic.alerts
72 lines (68 loc) · 4.13 KB
/
metal-ironic.alerts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Alerting rule group "metal-ironic.alerts"; its alert rules are listed under `rules`.
# NOTE(review): `rules` is expected to be a key of the same group entry as
# `name` (i.e. nested under the `- name` item) — confirm the nesting.
groups:
- name: metal-ironic.alerts
rules:
# MetalIronicSensorCritical (severity: critical)
# Matches ipmi_sensor_state series of type Memory, Processor or Critical
# Interrupt whose value equals 2, restricted to maintenance="false" and
# provision_state deploy, active or available. The condition must hold for
# 15m before the alert fires.
# NOTE(review): value 2 is presumably the exporter's "critical" state — confirm.
- alert: MetalIronicSensorCritical
  # Folded scalar (>-): the lines are joined with single spaces, so the query
  # string is identical to the one-line form. The `by (...)` clause keeps
  # every label that the label/annotation templates below reference.
  expr: >-
    count(ipmi_sensor_state{type=~"(Memory|Processor|Critical Interrupt)",
    maintenance="false",
    provision_state=~"(deploy|active|available)"} == 2)
    by (instance, type, name, manufacturer, model, provision_state, server_id, project_id)
  for: 15m
  labels:
    severity: critical
    tier: metal
    service: ironic
    context: "{{ $labels.instance }}"
    meta: "ironic node {{ $labels.instance }} hardware error. Type: {{ $labels.type }} Name: {{ $labels.name }} / provision_state: {{ $labels.provision_state }} / project_id: {{ $labels.project_id }} / server_id: {{ $labels.server_id }}"
    playbook: docs/devops/alert/baremetal
  annotations:
    # Duration wording matches the `for: 15m` hold time above.
    description: "ironic node {{ $labels.instance }} hardware error for 15 min. Type: {{ $labels.type }} / Name: {{ $labels.name }} / Project ID: {{ $labels.project_id }} / Server ID: {{ $labels.server_id }}"
    summary: "Hardware error for instance: {{ $labels.instance }}"
# MetalIronicSensorWarning (severity: warning)
# Matches either of two sets of ipmi_sensor_state series whose value equals 2:
#   * type Memory, Processor or Critical Interrupt with maintenance="false"
#     and provision_state "manageable";
#   * type Drive Slot with maintenance="false" (any provision_state).
# The condition must hold for 15m before the alert fires.
# NOTE(review): value 2 is presumably the exporter's "critical" state — confirm.
- alert: MetalIronicSensorWarning
  # Folded scalar (>-): the lines are joined with single spaces, so the query
  # string is identical to the one-line form. The `by (...)` clause keeps
  # every label that the label/annotation templates below reference.
  expr: >-
    count((ipmi_sensor_state{type=~"(Memory|Processor|Critical Interrupt)",
    maintenance="false", provision_state=~"(manageable)"} == 2)
    or
    (ipmi_sensor_state{type=~"(Drive Slot)", maintenance="false"} == 2))
    by (instance, type, name, manufacturer, model, provision_state, server_id, project_id)
  for: 15m
  labels:
    severity: warning
    tier: metal
    service: ironic
    context: "{{ $labels.instance }}"
    meta: "ironic node {{ $labels.instance }} hardware error. Type: {{ $labels.type }} Name: {{ $labels.name }} / provision_state: {{ $labels.provision_state }} / project_id: {{ $labels.project_id }} / server_id: {{ $labels.server_id }}"
    playbook: docs/devops/alert/baremetal
  annotations:
    # Duration wording matches the `for: 15m` hold time above.
    description: "ironic node {{ $labels.instance }} hardware error for 15 min. Type: {{ $labels.type }} / Name: {{ $labels.name }} / Project ID: {{ $labels.project_id }} / Server ID: {{ $labels.server_id }}"
    summary: "Hardware error for instance: {{ $labels.instance }}"
# MetalIronicSensorInfo (severity: info)
# Matches ipmi_sensor_state series of type Memory, Drive Slot, Processor,
# Power Supply or Critical Interrupt whose value equals 2, restricted to
# maintenance="true". The condition must hold for 15m before the alert fires.
# NOTE(review): value 2 is presumably the exporter's "critical" state — confirm.
- alert: MetalIronicSensorInfo
  # Folded scalar (>-): the lines are joined with single spaces, so the query
  # string is identical to the one-line form. The `by (...)` clause keeps
  # every label that the label/annotation templates below reference.
  expr: >-
    count(ipmi_sensor_state{type=~"(Memory|Drive Slot|Processor|Power Supply|Critical Interrupt)",
    maintenance="true"} == 2)
    by (instance, type, name, manufacturer, model, provision_state, server_id, project_id)
  for: 15m
  labels:
    severity: info
    tier: metal
    service: ironic
    context: "{{ $labels.instance }}"
    meta: "ironic node {{ $labels.instance }} hardware error. Type: {{ $labels.type }} Name: {{ $labels.name }} / provision_state: {{ $labels.provision_state }} / project_id: {{ $labels.project_id }} / server_id: {{ $labels.server_id }}"
    playbook: docs/devops/alert/baremetal
  annotations:
    # Duration wording matches the `for: 15m` hold time above.
    description: "ironic node {{ $labels.instance }} hardware error for 15 min. Type: {{ $labels.type }} / Name: {{ $labels.name }} / Project ID: {{ $labels.project_id }} / Server ID: {{ $labels.server_id }}"
    summary: "Hardware error for instance: {{ $labels.instance }}"
# MetalIronicMetricsDown (severity: warning)
# Fires when, for 15m, either ipmi_up equals 0 or the `up` series of job
# "baremetal/ironic" equals 0. Both sides are counted per
# (server_name, maintenance), the two labels used by the templates below.
- alert: MetalIronicMetricsDown
  # Folded scalar (>-): lines are joined with single spaces into one query.
  expr: >-
    count(ipmi_up == 0) by (server_name, maintenance)
    or
    count(up{job="baremetal/ironic"} == 0) by (server_name, maintenance)
  for: 15m
  labels:
    tier: metal
    service: ironic
    severity: warning
    playbook: docs/devops/alert/baremetal
    context: '{{ $labels.server_name }}'
    meta: 'ipmi metrics cannot be fetched from node {{ $labels.server_name }}, maintenance {{ $labels.maintenance }}'
  annotations:
    # summary and description intentionally carry the same text.
    summary: 'ipmi metrics cannot be fetched from node {{ $labels.server_name }}, maintenance {{ $labels.maintenance }}'
    description: 'ipmi metrics cannot be fetched from node {{ $labels.server_name }}, maintenance {{ $labels.maintenance }}'
# MetalIronicSDDown (severity: info)
# Fires when, for 15m, the ipmi_sd_up series is either missing altogether
# (absent) or present with value 0.
- alert: MetalIronicSDDown
  expr: 'absent(ipmi_sd_up) or ipmi_sd_up == 0'
  for: 15m
  labels:
    tier: metal
    service: ironic
    severity: info
    playbook: docs/devops/alert/baremetal
    # Fixed context string; this rule uses no per-series label templates.
    context: 'ipmi_sd'
    meta: 'ipmi service discovery failure'
  annotations:
    # summary and description intentionally carry the same text.
    summary: 'ipmi service discovery failed to get ironic nodes'
    description: 'ipmi service discovery failed to get ironic nodes'