Skip to content

Commit

Permalink
[RFC] Disable MaaS alarms/checks with regex
Browse files Browse the repository at this point in the history
This patch allows for regular expressions to be used in the
`maas_excluded_checks` variable.

It also adds `maas_excluded_alarms` and allows deployers to use a
regular expression to exclude **only** alarms for a particular check.
The check itself (and any related graphing) is maintained.

This comes from a discussion in rcbops/u-suk-dev#948 where some of the
checks are good to have, but the alarms aren't needed.

Connects rcbops/u-suk-dev#1019
  • Loading branch information
major committed Jan 20, 2017
1 parent bb1a13f commit df26d1b
Show file tree
Hide file tree
Showing 80 changed files with 147 additions and 20 deletions.
14 changes: 13 additions & 1 deletion rpcd/playbooks/roles/rpc_maas/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -506,10 +506,22 @@ maas_source_plugin_dir: /opt/rpc-openstack/maas/plugins/
maas_plugin_dir: /usr/lib/rackspace-monitoring-agent/plugins/

maas_rpc_scripts_dir: /opt/rpc-openstack/scripts

# The following two variables control which checks and alarms are excluded from
# MaaS. Here is an explanation of what each variable does:
#
# 1) When you add a check to `maas_excluded_checks`, the check (and any
# associated alarms) are not provisioned. This disables graphing,
# monitoring, alarms and alerts for that particular check.
#
# maas_excluded_checks: List of checks and alarms to exclude from this deploy
# 2) When you add an alarm to `maas_excluded_alarms`, the alarm is not
# provisioned. However, the check itself (and any associated graphing) is
# maintained.
#
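# Both lists can contain simple strings or regular expressions. Each entry is
# matched from the start of the check/alarm name, so a simple string also
# excludes any name that begins with it; end the entry with `$` to require an
# exact match.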
maas_excluded_checks: []
maas_excluded_alarms: []

# openrc definitions from OSA
# This is necessary until LP #1537117 is implemented
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@
with_dict: "{{ cinder_backends | default({}) }}"
when:
- inventory_hostname in groups["cinder_volume"]
- "'cinder_volume_{{ item.key }}_check' not in maas_excluded_checks"
- not 'cinder_volume_{{ item.key }}_check' | match(maas_excluded_checks_regex)
delegate_to: "{{ physical_host }}"
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,5 @@
when:
- item.group in group_names
- inventory_hostname in groups["{{ item.group }}"]
- item.name not in maas_excluded_checks
- not item.name | match(maas_excluded_checks_regex)
delegate_to: "{{ physical_host }}"
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,5 @@
- groups["{{ item.group }}"]|length > 0
- inventory_hostname in groups["{{ item.group }}"]
- inventory_hostname != groups["{{ item.group }}"][0]
- item.name not in maas_excluded_checks
- not item.name | match(maas_excluded_checks_regex)
delegate_to: "{{ physical_host }}"
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,5 @@
- item.group in group_names
- groups["{{ item.group }}"]|length > 0
- inventory_hostname == groups["{{ item.group }}"][0]
- item.name not in maas_excluded_checks
- not item.name | match(maas_excluded_checks_regex)
delegate_to: "{{ physical_host }}"
8 changes: 8 additions & 0 deletions rpcd/playbooks/roles/rpc_maas/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Merge regular expressions for excluded checks/alarms into a single
# regular expression string. If the lists are empty, use '^$' as the regular
# expression since it only matches an empty string and therefore never matches
# a real check or alarm name.
- name: Gather regular expressions for excluded checks and alarms
set_fact:
maas_excluded_checks_regex: "{{ (maas_excluded_checks | length > 0) | ternary('('+maas_excluded_checks | join(')|(')+')', '^$') }}"
maas_excluded_alarms_regex: "{{ (maas_excluded_alarms | length > 0) | ternary('('+maas_excluded_alarms | join(')|(')+')', '^$') }}"

- include: host_setup.yml
when: >
inventory_hostname in groups['hosts']
Expand Down
4 changes: 2 additions & 2 deletions rpcd/playbooks/roles/rpc_maas/tasks/network.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,12 @@
- "{{ discover_nic_speed.results }}"
when:
- inventory_hostname in groups["{{ item.0.group }}"]
- "'network_throughput-{{ item.0.name }}' not in maas_excluded_checks"
- not 'network_throughput-{{ item.0.name }}' | match(maas_excluded_checks_regex)

- name: Remove checks that are excluded
file:
path: "/etc/rackspace-monitoring-agent.conf.d/network_throughput-{{ item.name }}-{{ inventory_hostname }}.yaml"
state: absent
with_items: "{{ network_checks_list }}"
when:
- "'network_throughput-{{ item.name }}' in maas_excluded_checks"
# Remove the check file only when the check name matches the exclusion regex.
- "('network_throughput-' ~ item.name) | match(maas_excluded_checks_regex)"
2 changes: 1 addition & 1 deletion rpcd/playbooks/roles/rpc_maas/tasks/process.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,5 @@
when:
- item.group in groups
- inventory_hostname in groups['{{ item.group }}']
- item.name not in maas_excluded_checks
- not item.name | match(maas_excluded_checks_regex)
delegate_to: "{{ physical_host }}"
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
ceph_health_err :
label : ceph_health_err--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('ceph_health_err--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["cluster_health"] == 0) {
Expand All @@ -19,6 +20,7 @@ alarms :
ceph_health_warn :
label : ceph_health_warn--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('ceph_health_warn--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["cluster_health"] == 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
mon_health_err :
label : mon_health_err--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('mon_health_err--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["mon_health"] == 0) {
Expand All @@ -19,6 +20,7 @@ alarms :
mon_health_warn :
label : mon_health_warn--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('mon_health_warn--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["mon_health"] == 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ alarms :
ceph_warn_osd.{{ osd_id }} :
label : ceph_warn_osd.{{ osd_id }}--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('ceph_warn_osd.' ~ osd_id ~ '--' ~ ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["osd.{{ osd_id }}_up"] == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
cinder_api_local_status :
label : cinder_api_local_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('cinder_api_local_status--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["cinder_api_local_status"] != 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
cinder_backup_status :
label : cinder_backup_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('cinder_backup_status--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["cinder-backup_status"] != 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
cinder_scheduler_status :
label : cinder_scheduler_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('cinder_scheduler_status--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["cinder-scheduler_status"] != 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
cinder_vg_space_status :
label : cinder_vg_space_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('cinder_vg_space_status--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (percentage(metric["{{ item.cinder_vg_name }}_vg_used_space"], metric["{{ item.cinder_vg_name }}_vg_total_space"]) > {{ cinder_volumes_vg_critical_threshold }}) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
cinder_volume_{{ item.key }}_status :
label : cinder_volume_{{ item.key }}_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('cinder_volume_' ~ item.key ~ '_status--' ~ ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["cinder-volume-{{ item.key}}_status"] != 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
conntrack_count_status :
label : conntrack_count_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('conntrack_count_status--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (percentage(metric["nf_conntrack_count"] , metric["nf_conntrack_max"]) > {{ nf_conntrack_critical_threshold }}) {
Expand Down
1 change: 1 addition & 0 deletions rpcd/playbooks/roles/rpc_maas/templates/cpu_check.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ alarms :
idle_percent_average :
label : idle_percent_average--{{ inventory_hostname|quote }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('idle_percent_average--'+inventory_hostname|quote) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["idle_percent_average"] <= {{ cpu_idle_percent_avg_critical_threshold }}) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ alarms :
percentage_disk_utilisation_{{ device }}:
label : percentage_disk_utilisation_{{ device }}--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('percentage_disk_utilisation_' ~ device ~ '--' ~ ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["disk_utilisation_{{ device }}"] > {{ disk_utilisation_critical_threshold }}) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ alarms :
{{ process }}_process_status:
label : {{ process }}_process_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ ((process ~ '_process_status--' ~ ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["{{ process }}_process_status"] != 1 ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ alarms :
{{ process }}_process_status:
label : {{ process }}_process_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ ((process ~ '_process_status--' ~ ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["{{ process }}_process_status"] != 1 ) {
Expand Down
1 change: 1 addition & 0 deletions rpcd/playbooks/roles/rpc_maas/templates/filesystem.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ alarms :
filesystem_{{ item.filesystem }}_check :
label : "Disk space used on {{ item.filesystem }}--{{ ansible_hostname }}"
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('Disk space used on ' ~ item.filesystem ~ '--' ~ ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (percentage(metric['used'], metric['total']) >= {{ item.critical_threshold }}) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ alarms :
filesystem_{{ item }}_check :
label : "Disk space used on {{ item }}--{{ ansible_hostname }}"
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('Disk space used on ' ~ item ~ '--' ~ ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (percentage(metric['used'], metric['total']) >= {{ maas_filesystem_critical_threshold }}) {
Expand Down
9 changes: 9 additions & 0 deletions rpcd/playbooks/roles/rpc_maas/templates/galera_check.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
wsrep_cluster_size :
label : wsrep_cluster_size--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('wsrep_cluster_size--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["wsrep_cluster_size"] < {{ groups["galera"] | length }}) {
Expand All @@ -18,6 +19,7 @@ alarms :
wsrep_local_state :
label : wsrep_local_state--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('wsrep_local_state--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["wsrep_local_state_comment"] != "Synced" ) {
Expand All @@ -26,6 +28,7 @@ alarms :
percentage_used_mysql_connections :
label : percentage_used_mysql_connections--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('percentage_used_mysql_connections--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (percentage(metric["mysql_current_connections"], metric["mysql_max_configured_connections"]) > {{ mysql_connection_critical_threshold }} ) {
Expand All @@ -37,6 +40,7 @@ alarms :
open_file_size_limit_reached :
label : open_file_size_limit_reached--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('open_file_size_limit_reached--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (percentage(metric["num_of_open_files"], metric["open_files_limit"]) > {{ mysql_open_files_percentage_critical_threshold }}) {
Expand All @@ -48,6 +52,7 @@ alarms :
innodb_row_lock_time_avg :
label : innodb_row_lock_time_avg--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('innodb_row_lock_time_avg--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["innodb_row_lock_time_avg"] > {{ innodb_row_lock_time_avg_critical_threshold }}) {
Expand All @@ -59,6 +64,7 @@ alarms :
innodb_deadlocks :
label : innodb_deadlocks--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('innodb_deadlocks--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["innodb_deadlocks"] != 0) {
Expand All @@ -67,6 +73,7 @@ alarms :
access_denied_errors :
label : access_denied_errors--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('access_denied_errors--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (rate(metric["access_denied_errors"]) > {{ mysql_access_denied_errors_rate_warning_threshold }}) {
Expand All @@ -78,6 +85,7 @@ alarms :
aborted_clients :
label : aborted_clients--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('aborted_clients--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (rate(metric["aborted_clients"]) > {{ mysql_aborted_clients_rate_warning_threshold }}) {
Expand All @@ -89,6 +97,7 @@ alarms :
aborted_connects :
label : aborted_connects--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('aborted_connects--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (rate(metric["aborted_connects"]) > {{ mysql_aborted_connects_rate_warning_threshold }}) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
glance_api_local_status :
label : glance_api_local_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('glance_api_local_status--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["glance_api_local_status"] != 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
glance_registry_local_status :
label : glance_registry_local_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('glance_registry_local_status--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["glance_registry_local_status"] != 1) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ alarms :
heat_api_local_status :
label : heat_api_local_status--{{ ansible_hostname }}
notification_plan_id : "{{ maas_notification_plan }}"
disabled : {{ (('heat_api_local_status--'+ansible_hostname) | match(maas_excluded_alarms_regex)) | ternary('true', 'false') }}
criteria : |
:set consecutiveCount={{ maas_alarm_local_consecutive_count }}
if (metric["heat_api_local_status"] != 1) {
Expand Down

0 comments on commit df26d1b

Please sign in to comment.