From 6791190cd311974fa733ed7213c1d4fa7cf0ec28 Mon Sep 17 00:00:00 2001 From: Dawud M <7688823+technowhizz@users.noreply.github.com> Date: Fri, 18 Nov 2022 17:05:51 +0000 Subject: [PATCH 1/4] Add SMART Monitoring with dash and alerts Enabled Textfile collector in node exporter in kolla/globals.yml Added smartmon script as is from the prometheus-community github and then removed NVME support from this script in favour of using the nvme-cli script, which has also been added in. This is because the nvme-cli script provides better metrics than the smartmon script does. The script also adds the serial number of the disk as a label to all SMART metrics. Added a Kayobe custom playbook to easily deploy the script and associated cron job. This playbook installs smartmontool and nvmecli then copies these over to the hosts and sets up a cronjob which runs the scripts and stores the metrics in the docker volume for node exporter. The playbook changes the way the metrics are saved to a file by making use of the mv command as it is atomic. This was needed as at times prometheus would read a partially completed file. Added a prometheus alert to alert when a drive is reported as not healthy for more than 10 minutes. Added a Grafana dashboard to display the number of healthy and unhealthy drives reported in prometheus. 
(cherry picked from commit d83ecde1c8f38fe79da70041c637140c58f9dc9b) Add docs for SMART Monitoring (cherry picked from commit 595429ad15b77ee9777899e2593c41431ce37a82) Update doc/source/configuration/monitoring.rst Fix kayobe command Co-authored-by: Will Szumski (cherry picked from commit 9a5fc53c05de12b30dd360acfdcb023aedb84491) Update doc/source/configuration/monitoring.rst Fix Spelling Co-authored-by: Will Szumski (cherry picked from commit ef25d6f4d556809cfdc4a0be42300cd1249fdde1) Add release note (cherry picked from commit 3d4d01117dce76d668ac351a61f4843efe72ca0c) Amend docs and add release note (cherry picked from commit b6cb511d911f031de4e2b3c08fa5afb95e0f6d9a) Move SMART prometheus alert to own file (cherry picked from commit b353fd32c4799f325d1a5a0c8476690a12e30ff2) Fix typo (cherry picked from commit 611f2fb891802f2ff4d7248bcd7363acd244154c) fixup --- doc/source/configuration/monitoring.rst | 46 +- etc/kayobe/ansible/scripts/nvmemon.sh | 97 ++++ etc/kayobe/ansible/scripts/smartmon.sh | 202 +++++++ etc/kayobe/ansible/smartmon-tools.yml | 43 ++ .../openstack/hardware_overview.json | 543 ++++++++++++++++++ .../kolla/config/prometheus/smart.rules | 12 + etc/kayobe/kolla/globals.yml | 7 + .../notes/smart-mon-db8fa642c3af74b1.yaml | 4 + 8 files changed, 952 insertions(+), 2 deletions(-) create mode 100644 etc/kayobe/ansible/scripts/nvmemon.sh create mode 100644 etc/kayobe/ansible/scripts/smartmon.sh create mode 100644 etc/kayobe/ansible/smartmon-tools.yml create mode 100644 etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json create mode 100644 etc/kayobe/kolla/config/prometheus/smart.rules create mode 100644 releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index 7e53629f1..f358ea084 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -1,4 +1,7 @@ -======================== +=========== 
+Monitoring +=========== + Monitoring Configuration ======================== @@ -10,4 +13,43 @@ The configuration options can be found in ``etc/kayobe/stackhpc-monitoring.yml``: .. literalinclude:: ../../../etc/kayobe/stackhpc-monitoring.yml - :language: yaml \ No newline at end of file + :language: yaml + +SMART Drive Monitoring +======================= + +StackHPC kayobe config also includes drive monitoring for spinning disks and +NVMe drives. + +By default, node exporter doesn't provide SMART metrics, hence we make use +of 2 scripts (one for NVMe drives and one for spinning drives), which are run by +a cronjob, to output the metrics and we use node exporter's Textfile collector +to report the metrics output by the scripts to Prometheus. These metrics can +then be visualised in Grafana with the bundled dashboard. + +After pulling in the latest changes into your local kayobe config, reconfigure +Prometheus and Grafana: + +.. code-block:: console + + kayobe overcloud service reconfigure -kt grafana,prometheus + +(Note: If you run into an error when reconfiguring Grafana, it could be due to +`this `__ bug and at +present, the workaround is to go into each node running Grafana and manually +restart the process with ``docker restart grafana`` and then try the reconfigure +command again.)  + +Once the reconfigure has completed you can now run the custom playbook which +copies over the scripts and sets up the cron jobs to start SMART monitoring +on the overcloud hosts: + +.. code-block:: console + + (kayobe) [stack@node ~]$ cd etc/kayobe + (kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmon-tools.yml + +SMART reporting should now be enabled along with a Prometheus alert for +unhealthy disks and a Grafana dashboard called ``Hardware Overview``.  
+ + diff --git a/etc/kayobe/ansible/scripts/nvmemon.sh b/etc/kayobe/ansible/scripts/nvmemon.sh new file mode 100644 index 000000000..9ab727b0b --- /dev/null +++ b/etc/kayobe/ansible/scripts/nvmemon.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -eu + +# Dependencies: nvme-cli, jq (packages) +# Based on code from +# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/smartmon.sh +# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/mellanox_hca_temp +# - https://github.com/vorlon/check_nvme/blob/master/check_nvme.sh +# +# Author: Henk + +# Check if we are root +if [ "$EUID" -ne 0 ]; then + echo "${0##*/}: Please run as root!" >&2 + exit 1 +fi + +# Check if programs are installed +if ! command -v nvme >/dev/null 2>&1; then + echo "${0##*/}: nvme is not installed. Aborting." >&2 + exit 1 +fi + +output_format_awk="$( + cat <<'OUTPUTAWK' +BEGIN { v = "" } +v != $1 { + print "# HELP nvme_" $1 " SMART metric " $1; + if ($1 ~ /_total$/) + print "# TYPE nvme_" $1 " counter"; + else + print "# TYPE nvme_" $1 " gauge"; + v = $1 +} +{print "nvme_" $0} +OUTPUTAWK +)" + +format_output() { + sort | awk -F'{' "${output_format_awk}" +} + +# Get the nvme-cli version +nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')" +echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output + +# Get devices +device_list="$(nvme list -o json | jq -r '.Devices | .[].DevicePath')" + +# Loop through the NVMe devices +for device in ${device_list}; do + json_check="$(nvme smart-log -o json "${device}")" + disk="${device##*/}" + + # The temperature value in JSON is in Kelvin, we want Celsius + value_temperature="$(echo "$json_check" | jq '.temperature - 273')" + echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}" + + value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')" + echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}" + + value_available_spare_threshold="$(echo 
"$json_check" | jq '.spare_thresh / 100')" + echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}" + + value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')" + echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}" + + value_critical_warning="$(echo "$json_check" | jq '.critical_warning')" + echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}" + + value_media_errors="$(echo "$json_check" | jq '.media_errors')" + echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}" + + value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')" + echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}" + + value_power_cycles="$(echo "$json_check" | jq '.power_cycles')" + echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}" + + value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')" + echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}" + + value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')" + echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}" + + value_data_units_written="$(echo "$json_check" | jq '.data_units_written')" + echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}" + + value_data_units_read="$(echo "$json_check" | jq '.data_units_read')" + echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}" + + value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')" + echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}" + + value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')" + echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}" +done | format_output diff --git a/etc/kayobe/ansible/scripts/smartmon.sh b/etc/kayobe/ansible/scripts/smartmon.sh new file mode 100644 index 
000000000..bcac8b8b3 --- /dev/null +++ b/etc/kayobe/ansible/scripts/smartmon.sh @@ -0,0 +1,202 @@ +#!/bin/bash +# Script informed by the collectd monitoring script for smartmontools (using smartctl) +# by Samuel B. (c) 2012 +# source at: http://devel.dob.sk/collectd-scripts/ + +# TODO: This probably needs to be a little more complex. The raw numbers can have more +# data in them than you'd think. +# http://arstechnica.com/civis/viewtopic.php?p=22062211 + +# Formatting done via shfmt -i 2 +# https://github.com/mvdan/sh + +parse_smartctl_attributes_awk="$( + cat <<'SMARTCTLAWK' +$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { + gsub(/-/, "_"); + printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4 + printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5 + printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6 + printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10 +} +SMARTCTLAWK +)" + +smartmon_attrs="$( + cat <<'SMARTMONATTRS' +airflow_temperature_cel +command_timeout +current_pending_sector +end_to_end_error +erase_fail_count +g_sense_error_rate +hardware_ecc_recovered +host_reads_32mib +host_reads_mib +host_writes_32mib +host_writes_mib +load_cycle_count +media_wearout_indicator +nand_writes_1gib +offline_uncorrectable +power_cycle_count +power_on_hours +program_fail_cnt_total +program_fail_count +raw_read_error_rate +reallocated_event_count +reallocated_sector_ct +reported_uncorrect +runtime_bad_block +sata_downshift_count +seek_error_rate +spin_retry_count +spin_up_time +start_stop_count +temperature_case +temperature_celsius +temperature_internal +total_lbas_read +total_lbas_written +udma_crc_error_count +unsafe_shutdown_count +unused_rsvd_blk_cnt_tot +wear_leveling_count +workld_host_reads_perc +workld_media_wear_indic +workload_minutes +SMARTMONATTRS +)" +smartmon_attrs="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')" + +parse_smartctl_attributes() { + local disk="$1" + local disk_type="$2" + local serial="$3" + 
local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" + sed 's/^ \+//g' | + awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null | + tr '[:upper:]' '[:lower:]' | + grep -E "(${smartmon_attrs})" +} + +parse_smartctl_scsi_attributes() { + local disk="$1" + local disk_type="$2" + local serial="$3" + local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" + while read -r line; do + attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')" + attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')" + case "${attr_type}" in + number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; + Blocks_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Blocks_received_from_initiator_) lbas_written="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + esac + done + [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}" + [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}" + [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}" + [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"241\"} ${lbas_written}" + [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}" + [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"-1\"} ${grown_defects}" +} + +parse_smartctl_info() { + local -i smart_available=0 smart_enabled=0 smart_healthy= + 
local disk="$1" disk_type="$2" + local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id='' + while read -r line; do + info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')" + info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')" + case "${info_type}" in + Model_Family) model_family="${info_value}" ;; + Device_Model) device_model="${info_value}" ;; + Serial_Number) serial_number="${info_value}" ;; + Firmware_Version) fw_version="${info_value}" ;; + Vendor) vendor="${info_value}" ;; + Product) product="${info_value}" ;; + Revision) revision="${info_value}" ;; + Logical_Unit_id) lun_id="${info_value}" ;; + esac + if [[ "${info_type}" == 'SMART_support_is' ]]; then + case "${info_value:0:7}" in + Enabled) smart_available=1; smart_enabled=1 ;; + Availab) smart_available=1; smart_enabled=0 ;; + Unavail) smart_available=0; smart_enabled=0 ;; + esac + fi + if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then + case "${info_value:0:6}" in + PASSED) smart_healthy=1 ;; + *) smart_healthy=0 ;; + esac + elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then + case "${info_value:0:2}" in + OK) smart_healthy=1 ;; + *) smart_healthy=0 ;; + esac + fi + done + echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1" + echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_available}" + [[ "${smart_available}" == "1" ]] && echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_enabled}" + [[ "${smart_available}" == "1" ]] && [[ "${smart_healthy}" != "" ]] && echo 
"device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_healthy}" +} + +output_format_awk="$( + cat <<'OUTPUTAWK' +BEGIN { v = "" } +v != $1 { + print "# HELP smartmon_" $1 " SMART metric " $1; + print "# TYPE smartmon_" $1 " gauge"; + v = $1 +} +{print "smartmon_" $0} +OUTPUTAWK +)" + +format_output() { + sort | + awk -F'{' "${output_format_awk}" +} + +smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" + +echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output + +if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then + exit +fi + +device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')" + +for device in ${device_list}; do + disk="$(echo "${device}" | cut -f1 -d'|')" + type="$(echo "${device}" | cut -f2 -d'|')" + # Use REGEX to extract the serial number from the parsed information and save that to a variable + serial_number="$(/usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"| sed -E ':a;N;$!ba;s/.*serial_number=\"([^"]+)\".*/\1/g' | sed -E 's/^device_info\{.*//g')" + active=1 + echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')" + # Check if the device is in a low-power mode + /usr/sbin/smartctl -n standby -d "${type}" "${disk}" > /dev/null || active=0 + echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}" + # Skip further metrics to prevent the disk from spinning up + test ${active} -eq 0 && continue + # Get the SMART information and health + /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}" + # Get the SMART attributes + case ${type} in + sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; + sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; + scsi) 
/usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; + megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; + *) + (>&2 echo "disk type is not sat, scsi or megaraid but ${type}") + exit + ;; + esac +done | format_output \ No newline at end of file diff --git a/etc/kayobe/ansible/smartmon-tools.yml b/etc/kayobe/ansible/smartmon-tools.yml new file mode 100644 index 000000000..6b275c264 --- /dev/null +++ b/etc/kayobe/ansible/smartmon-tools.yml @@ -0,0 +1,43 @@ +--- +- hosts: overcloud + + tasks: + - name: Ensure smartmon-tools and nvme-cli is installed + package: + name: + - smartmontools + - nvme-cli + - jq + state: present + become: true + + - name: Copy smartmon.sh and nvmemon.sh from scripts folder + copy: + src: "scripts/{{ item }}" + dest: /usr/local/bin/ + owner: 'root' + group: 'root' + mode: '0700' + loop: + - smartmon.sh + - nvmemon.sh + become: yes + + - name: Set PATH Variable for cron + cron: + name: PATH + user: root + env: yes + job: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + become: yes + + - name: Schedule cronjob to run both scripts every 5 minutes and save output to file + cron: + name: "SMART metrics for drive monitoring using {{ item }}" + user: root + minute: "*/5" + job: "/usr/local/bin/{{ item }}.sh > /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp && mv /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp /var/lib/docker/volumes/textfile/_data/{{ item }}.prom" + loop: + - smartmon + - nvmemon + become: yes diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json new file mode 100644 index 000000000..e4f78aee3 --- /dev/null +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json @@ -0,0 +1,543 @@ +{% raw %} +{ + "annotations": { + 
"list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of healthy drives", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "hideTimeOverride": false, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(smartmon_device_smart_healthy > 0)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Healthy Drives", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of healthy drives", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, 
+ "x": 6, + "y": 0 + }, + "hideTimeOverride": false, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(smartmon_device_smart_healthy < 1) ", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Unhealthy Drives", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Total number of drives reporting SMART health", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 0 + }, + "hideTimeOverride": false, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(smartmon_device_smart_healthy)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Drives", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "align": "center", 
"displayMode": "auto", + "filterable": false, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "Failed" + }, + "1": { + "color": "dark-green", + "index": 0, + "text": "Ok" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "color-background-solid" + }, + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "type" + }, + "properties": [ + { + "id": "custom.width", + "value": 153 + }, + { + "id": "displayName", + "value": "Type" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "serial_number" + }, + "properties": [ + { + "id": "custom.width", + "value": 208 + }, + { + "id": "displayName", + "value": "Serial Number" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "disk" + }, + "properties": [ + { + "id": "custom.width", + "value": 146 + }, + { + "id": "displayName", + "value": "Disk" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "custom.width", + "value": 203 + }, + { + "id": "displayName", + "value": "Hostname" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Type" + }, + "properties": [ + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Serial Number" + }, + "properties": [ + { + "id": 
"custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hostname" + }, + "properties": [ + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Disk" + }, + "properties": [ + { + "id": "custom.width" + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 18, + "x": 0, + "y": 7 + }, + "id": 2, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "$$hashKey": "object:40", + "aggregation": "Last", + "alias": "Healthy", + "crit": 0, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "decimals": 0, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "exemplar": false, + "expr": "smartmon_device_smart_healthy", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "range": false, + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 0 + } + ], + "title": "Panel Title", + "transparent": true, + "type": "table" + } + ], + "refresh": false, + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "filters": [], + "hide": 0, + "name": "Filters", + "skipUrlSync": false, + "type": "adhoc" + }, + { + "current": { + "selected": true, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Hardware Overview", + "uid": "TCN51Y25P", + "version": 1, + "weekStart": "" +} +{% endraw %} \ No newline at end of 
file diff --git a/etc/kayobe/kolla/config/prometheus/smart.rules b/etc/kayobe/kolla/config/prometheus/smart.rules new file mode 100644 index 000000000..0b6552598 --- /dev/null +++ b/etc/kayobe/kolla/config/prometheus/smart.rules @@ -0,0 +1,12 @@ +{% raw %} + +- alert: DiskSmartStatusUnhealthy + expr: smartmon_device_smart_healthy < 1 + for: 10m + labels: + severity: alert + annotations: + summary: "SMART monitor reports bad disk on (instance {{ $labels.instance }})" + description: "{{ $labels.instance }} is reporting unhealthy for the disk at {{ $labels.disk }}. Disk serial number is: {{ $labels.serial_number }}" + +{% endraw %} diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index b30ddd013..3081a2e5e 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -17,4 +17,11 @@ bifrost_tag: xena-20221128T101757 es_heap_size: 8g prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d" +# Additional command line flags for node exporter to enable textfile collector for disk metrics and create textfile docker volume +prometheus_node_exporter_extra_volumes: + - "textfile:/var/lib/node_exporter/textfile_collector" +prometheus_node_exporter_cmdline_extras: "--collector.textfile.directory=/var/lib/node_exporter/textfile_collector" + + ############################################################################# + diff --git a/releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml b/releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml new file mode 100644 index 000000000..feaec4dbe --- /dev/null +++ b/releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml @@ -0,0 +1,4 @@ +--- + +features: + - Enables SMART monitoring. Manual action is required, please see the monitoring documentation for the procedure. 
From 5a7d68d3d41d33f5b9449a1d30f2e33a82acdeb3 Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Fri, 11 Nov 2022 09:17:02 +0000 Subject: [PATCH 2/4] Increase job timeout for kolla image build GHA (cherry picked from commit 807e935dd4dc622ffbaf0a1df2a86c93bacf8b4c) --- .github/workflows/stackhpc-container-image-build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc-container-image-build.yml b/.github/workflows/stackhpc-container-image-build.yml index 65428c039..f69626d27 100644 --- a/.github/workflows/stackhpc-container-image-build.yml +++ b/.github/workflows/stackhpc-container-image-build.yml @@ -34,6 +34,7 @@ jobs: name: Build Kolla container images if: github.repository == 'stackhpc/stackhpc-kayobe-config' runs-on: [self-hosted, stackhpc-kayobe-config-kolla-builder] + timeout-minutes: 720 steps: - uses: actions/checkout@v3 with: From a089c3ebcc1aecae7a1a508cf5cceec361da420b Mon Sep 17 00:00:00 2001 From: Dawud M <7688823+technowhizz@users.noreply.github.com> Date: Fri, 2 Dec 2022 16:04:05 +0000 Subject: [PATCH 3/4] Fix oom-killer graph Changes the oom-killer graph from a smoothed irate to a discrete delta function. 
Change-Id: I2e4a8576c628610409ade4aad2bd98754bec3860 (cherry picked from commit ef1a449034bb4501e333ae41910e8d77cb5b4ad8) --- .../grafana/dashboards/openstack/node_exporter_full.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json index 08078c31d..66d630b8d 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json @@ -4886,7 +4886,7 @@ "Total Swap": "#614D93", "VmallocUsed": "#EA6460" }, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": { @@ -4921,7 +4921,7 @@ "total": false, "values": true }, - "lines": true, + "lines": false, "linewidth": 1, "links": [], "maxPerRow": 6, @@ -4940,9 +4940,9 @@ "steppedLine": false, "targets": [ { - "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[5m])", + "expr": "max_over_time(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval:]) - (min_over_time(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval:]))", "format": "time_series", - "interval": "", + "interval": "30s", "intervalFactor": 2, "legendFormat": "oom killer invocations ", "refId": "A", From de1be4842e9e3291077a6bf80dbdf7369f74b39b Mon Sep 17 00:00:00 2001 From: Stig Telfer Date: Thu, 10 Nov 2022 11:05:00 +0000 Subject: [PATCH 4/4] Rephrase the match logic for interfaces monitored for packet drops OVS bridge interfaces drop packets during normal operation. Change the regex to filter out interfaces that don't matter for packet drops. 
(cherry picked from commit 9c3f15a1f374d24d08a0bdac63886220247d2a4e) --- etc/kayobe/kolla/config/prometheus/ceph.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/kolla/config/prometheus/ceph.rules b/etc/kayobe/kolla/config/prometheus/ceph.rules index 7df171501..52b9841a9 100644 --- a/etc/kayobe/kolla/config/prometheus/ceph.rules +++ b/etc/kayobe/kolla/config/prometheus/ceph.rules @@ -154,7 +154,7 @@ groups: # alert on nic packet errors and drops rates > 1 packet/s - alert: NetworkPacketsDropped - expr: irate(node_network_receive_drop_total{device=~"en.*|eth.*"}[5m]) + irate(node_network_transmit_drop_total{device=~"en.*|eth.*"}[5m]) > 1 + expr: irate(node_network_receive_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) > 1 labels: severity: warning annotations: