diff --git a/.github/workflows/stackhpc-container-image-build.yml b/.github/workflows/stackhpc-container-image-build.yml index 65428c039..f69626d27 100644 --- a/.github/workflows/stackhpc-container-image-build.yml +++ b/.github/workflows/stackhpc-container-image-build.yml @@ -34,6 +34,7 @@ jobs: name: Build Kolla container images if: github.repository == 'stackhpc/stackhpc-kayobe-config' runs-on: [self-hosted, stackhpc-kayobe-config-kolla-builder] + timeout-minutes: 720 steps: - uses: actions/checkout@v3 with: diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index 7e53629f1..f358ea084 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -1,4 +1,7 @@ -======================== +=========== +Monitoring +=========== + Monitoring Configuration ======================== @@ -10,4 +13,43 @@ The configuration options can be found in ``etc/kayobe/stackhpc-monitoring.yml``: .. literalinclude:: ../../../etc/kayobe/stackhpc-monitoring.yml - :language: yaml \ No newline at end of file + :language: yaml + +SMART Drive Monitoring +======================= + +StackHPC kayobe config also includes drive monitoring for spinning disks and +NVME's. + +By default, node exporter doesn't provide SMART metrics, hence we make use +of 2 scripts (one for NVME’s and one for spinning drives), which are run by +a cronjob, to output the metrics and we use node exporter's Textfile collector +to report the metrics output by the scripts to Prometheus. These metrics can +then be visualised in Grafana with the bundled dashboard. + +After pulling in the latest changes into your local kayobe config, reconfigure +Prometheus and Grafana + +.. code-block:: console + + kayobe overcloud service reconfigure -kt grafana,prometheus + +(Note: If you run into an error when reconfiguring Grafana, it could be due to +`this `__ bug and at +present, the workaround is to go into each node running Grafana and manually +restart the process with ``docker restart grafana`` and then try the reconfigure +command again.)  + +Once the reconfigure has completed you can now run the custom playbook which +copies over the scripts and sets up the cron jobs to start SMART monitoring +on the overcloud hosts: + +.. code-block:: console + + (kayobe) [stack@node ~]$ cd etc/kayobe + (kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmontools.yml + +SMART reporting should now be enabled along with a Prometheus alert for +unhealthy disks and a Grafana dashboard called ``Hardware Overview``.  + + diff --git a/etc/kayobe/ansible/scripts/nvmemon.sh b/etc/kayobe/ansible/scripts/nvmemon.sh new file mode 100644 index 000000000..9ab727b0b --- /dev/null +++ b/etc/kayobe/ansible/scripts/nvmemon.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -eu + +# Dependencies: nvme-cli, jq (packages) +# Based on code from +# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/smartmon.sh +# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/mellanox_hca_temp +# - https://github.com/vorlon/check_nvme/blob/master/check_nvme.sh +# +# Author: Henk + +# Check if we are root +if [ "$EUID" -ne 0 ]; then + echo "${0##*/}: Please run as root!" >&2 + exit 1 +fi + +# Check if programs are installed +if ! command -v nvme >/dev/null 2>&1; then + echo "${0##*/}: nvme is not installed. Aborting." >&2 + exit 1 +fi + +output_format_awk="$( + cat <<'OUTPUTAWK' +BEGIN { v = "" } +v != $1 { + print "# HELP nvme_" $1 " SMART metric " $1; + if ($1 ~ /_total$/) + print "# TYPE nvme_" $1 " counter"; + else + print "# TYPE nvme_" $1 " gauge"; + v = $1 +} +{print "nvme_" $0} +OUTPUTAWK +)" + +format_output() { + sort | awk -F'{' "${output_format_awk}" +} + +# Get the nvme-cli version +nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')" +echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output + +# Get devices +device_list="$(nvme list -o json | jq -r '.Devices | .[].DevicePath')" + +# Loop through the NVMe devices +for device in ${device_list}; do + json_check="$(nvme smart-log -o json "${device}")" + disk="${device##*/}" + + # The temperature value in JSON is in Kelvin, we want Celsius + value_temperature="$(echo "$json_check" | jq '.temperature - 273')" + echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}" + + value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')" + echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}" + + value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')" + echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}" + + value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')" + echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}" + + value_critical_warning="$(echo "$json_check" | jq '.critical_warning')" + echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}" + + value_media_errors="$(echo "$json_check" | jq '.media_errors')" + echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}" + + value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')" + echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}" + + value_power_cycles="$(echo "$json_check" | jq '.power_cycles')" + echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}" + + value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')" + echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}" + + value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')" + echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}" + + value_data_units_written="$(echo "$json_check" | jq '.data_units_written')" + echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}" + + value_data_units_read="$(echo "$json_check" | jq '.data_units_read')" + echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}" + + value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')" + echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}" + + value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')" + echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}" +done | format_output diff --git a/etc/kayobe/ansible/scripts/smartmon.sh b/etc/kayobe/ansible/scripts/smartmon.sh new file mode 100644 index 000000000..bcac8b8b3 --- /dev/null +++ b/etc/kayobe/ansible/scripts/smartmon.sh @@ -0,0 +1,202 @@ +#!/bin/bash +# Script informed by the collectd monitoring script for smartmontools (using smartctl) +# by Samuel B. (c) 2012 +# source at: http://devel.dob.sk/collectd-scripts/ + +# TODO: This probably needs to be a little more complex. The raw numbers can have more +# data in them than you'd think. +# http://arstechnica.com/civis/viewtopic.php?p=22062211 + +# Formatting done via shfmt -i 2 +# https://github.com/mvdan/sh + +parse_smartctl_attributes_awk="$( + cat <<'SMARTCTLAWK' +$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { + gsub(/-/, "_"); + printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4 + printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5 + printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6 + printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10 +} +SMARTCTLAWK +)" + +smartmon_attrs="$( + cat <<'SMARTMONATTRS' +airflow_temperature_cel +command_timeout +current_pending_sector +end_to_end_error +erase_fail_count +g_sense_error_rate +hardware_ecc_recovered +host_reads_32mib +host_reads_mib +host_writes_32mib +host_writes_mib +load_cycle_count +media_wearout_indicator +nand_writes_1gib +offline_uncorrectable +power_cycle_count +power_on_hours +program_fail_cnt_total +program_fail_count +raw_read_error_rate +reallocated_event_count +reallocated_sector_ct +reported_uncorrect +runtime_bad_block +sata_downshift_count +seek_error_rate +spin_retry_count +spin_up_time +start_stop_count +temperature_case +temperature_celsius +temperature_internal +total_lbas_read +total_lbas_written +udma_crc_error_count +unsafe_shutdown_count +unused_rsvd_blk_cnt_tot +wear_leveling_count +workld_host_reads_perc +workld_media_wear_indic +workload_minutes +SMARTMONATTRS +)" +smartmon_attrs="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')" + +parse_smartctl_attributes() { + local disk="$1" + local disk_type="$2" + local serial="$3" + local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" + sed 's/^ \+//g' | + awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null | + tr '[:upper:]' '[:lower:]' | + grep -E "(${smartmon_attrs})" +} + +parse_smartctl_scsi_attributes() { + local disk="$1" + local disk_type="$2" + local serial="$3" + local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" + while read -r line; do + attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')" + attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')" + case "${attr_type}" in + number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; + Blocks_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Blocks_received_from_initiator_) lbas_written="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + esac + done + [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}" + [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}" + [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}" + [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"241\"} ${lbas_written}" + [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}" + [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"-1\"} ${grown_defects}" +} + +parse_smartctl_info() { + local -i smart_available=0 smart_enabled=0 smart_healthy= + local disk="$1" disk_type="$2" + local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id='' + while read -r line; do + info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')" + info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')" + case "${info_type}" in + Model_Family) model_family="${info_value}" ;; + Device_Model) device_model="${info_value}" ;; + Serial_Number) serial_number="${info_value}" ;; + Firmware_Version) fw_version="${info_value}" ;; + Vendor) vendor="${info_value}" ;; + Product) product="${info_value}" ;; + Revision) revision="${info_value}" ;; + Logical_Unit_id) lun_id="${info_value}" ;; + esac + if [[ "${info_type}" == 'SMART_support_is' ]]; then + case "${info_value:0:7}" in + Enabled) smart_available=1; smart_enabled=1 ;; + Availab) smart_available=1; smart_enabled=0 ;; + Unavail) smart_available=0; smart_enabled=0 ;; + esac + fi + if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then + case "${info_value:0:6}" in + PASSED) smart_healthy=1 ;; + *) smart_healthy=0 ;; + esac + elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then + case "${info_value:0:2}" in + OK) smart_healthy=1 ;; + *) smart_healthy=0 ;; + esac + fi + done + echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1" + echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_available}" + [[ "${smart_available}" == "1" ]] && echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_enabled}" + [[ "${smart_available}" == "1" ]] && [[ "${smart_healthy}" != "" ]] && echo "device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_healthy}" +} + +output_format_awk="$( + cat <<'OUTPUTAWK' +BEGIN { v = "" } +v != $1 { + print "# HELP smartmon_" $1 " SMART metric " $1; + print "# TYPE smartmon_" $1 " gauge"; + v = $1 +} +{print "smartmon_" $0} +OUTPUTAWK +)" + +format_output() { + sort | + awk -F'{' "${output_format_awk}" +} + +smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" + +echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output + +if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then + exit +fi + +device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')" + +for device in ${device_list}; do + disk="$(echo "${device}" | cut -f1 -d'|')" + type="$(echo "${device}" | cut -f2 -d'|')" + # Use REGEX to extract the serial number from the parsed information and save that to a variable + serial_number="$(/usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"| sed -E ':a;N;$!ba;s/.*serial_number=\"([^"]+)\".*/\1/g' | sed -E 's/^device_info\{.*//g')" + active=1 + echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')" + # Check if the device is in a low-power mode + /usr/sbin/smartctl -n standby -d "${type}" "${disk}" > /dev/null || active=0 + echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}" + # Skip further metrics to prevent the disk from spinning up + test ${active} -eq 0 && continue + # Get the SMART information and health + /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}" + # Get the SMART attributes + case ${type} in + sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; + sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; + scsi) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; + megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; + *) + (>&2 echo "disk type is not sat, scsi or megaraid but ${type}") + exit + ;; + esac +done | format_output \ No newline at end of file diff --git a/etc/kayobe/ansible/smartmon-tools.yml b/etc/kayobe/ansible/smartmon-tools.yml new file mode 100644 index 000000000..6b275c264 --- /dev/null +++ b/etc/kayobe/ansible/smartmon-tools.yml @@ -0,0 +1,43 @@ +--- +- hosts: overcloud + + tasks: + - name: Ensure smartmon-tools and nvme-cli is installed + package: + name: + - smartmontools + - nvme-cli + - jq + state: present + become: true + + - name: Copy smartmon.sh and nvmemon.sh from scripts folder + copy: + src: "scripts/{{ item }}" + dest: /usr/local/bin/ + owner: 'root' + group: 'root' + mode: '0700' + loop: + - smartmon.sh + - nvmemon.sh + become: yes + + - name: Set PATH Variable for cron + cron: + name: PATH + user: root + env: yes + job: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + become: yes + + - name: Schedule cronjob to run both scripts every 5 minutes and save output to file + cron: + name: "SMART metrics for drive monitoring using {{ item }}" + user: root + minute: "*/5" + job: "/usr/local/bin/{{ item }}.sh > /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp && mv /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp /var/lib/docker/volumes/textfile/_data/{{ item }}.prom" + loop: + - smartmon + - nvmemon + become: yes diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json new file mode 100644 index 000000000..e4f78aee3 --- /dev/null +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json @@ -0,0 +1,543 @@ +{% raw %} +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of healthy drives", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "hideTimeOverride": false, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(smartmon_device_smart_healthy > 0)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Healthy Drives", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of healthy drives", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "hideTimeOverride": false, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(smartmon_device_smart_healthy < 1) ", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Unhealthy Drives", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of healthy drives", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 0 + }, + "hideTimeOverride": false, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(smartmon_device_smart_healthy)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Drives", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "align": "center", + "displayMode": "auto", + "filterable": false, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "Failed" + }, + "1": { + "color": "dark-green", + "index": 0, + "text": "Ok" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "color-background-solid" + }, + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "type" + }, + "properties": [ + { + "id": "custom.width", + "value": 153 + }, + { + "id": "displayName", + "value": "Type" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "serial_number" + }, + "properties": [ + { + "id": "custom.width", + "value": 208 + }, + { + "id": "displayName", + "value": "Serial Number" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "disk" + }, + "properties": [ + { + "id": "custom.width", + "value": 146 + }, + { + "id": "displayName", + "value": "Disk" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "custom.width", + "value": 203 + }, + { + "id": "displayName", + "value": "Hostname" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Type" + }, + "properties": [ + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Serial Number" + }, + "properties": [ + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hostname" + }, + "properties": [ + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Disk" + }, + "properties": [ + { + "id": "custom.width" + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 18, + "x": 0, + "y": 7 + }, + "id": 2, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "$$hashKey": "object:40", + "aggregation": "Last", + "alias": "Healthy", + "crit": 0, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "decimals": 0, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "exemplar": false, + "expr": "smartmon_device_smart_healthy", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "range": false, + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 0 + } + ], + "title": "Panel Title", + "transparent": true, + "type": "table" + } + ], + "refresh": false, + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "filters": [], + "hide": 0, + "name": "Filters", + "skipUrlSync": false, + "type": "adhoc" + }, + { + "current": { + "selected": true, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Hardware Overview", + "uid": "TCN51Y25P", + "version": 1, + "weekStart": "" +} +{% endraw %} \ No newline at end of file diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json index 08078c31d..66d630b8d 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json @@ -4886,7 +4886,7 @@ "Total Swap": "#614D93", "VmallocUsed": "#EA6460" }, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": { @@ -4921,7 +4921,7 @@ "total": false, "values": true }, - "lines": true, + "lines": false, "linewidth": 1, "links": [], "maxPerRow": 6, @@ -4940,9 +4940,9 @@ "steppedLine": false, "targets": [ { - "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[5m])", + "expr": "max_over_time(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval:]) - (min_over_time(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval:]))", "format": "time_series", - "interval": "", + "interval": "30s", "intervalFactor": 2, "legendFormat": "oom killer invocations ", "refId": "A", diff --git a/etc/kayobe/kolla/config/prometheus/ceph.rules b/etc/kayobe/kolla/config/prometheus/ceph.rules index 7df171501..52b9841a9 100644 --- a/etc/kayobe/kolla/config/prometheus/ceph.rules +++ b/etc/kayobe/kolla/config/prometheus/ceph.rules @@ -154,7 +154,7 @@ groups: # alert on nic packet errors and drops rates > 1 packet/s - alert: NetworkPacketsDropped - expr: irate(node_network_receive_drop_total{device=~"en.*|eth.*"}[5m]) + irate(node_network_transmit_drop_total{device=~"en.*|eth.*"}[5m]) > 1 + expr: irate(node_network_receive_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) > 1 labels: severity: warning annotations: diff --git a/etc/kayobe/kolla/config/prometheus/smart.rules b/etc/kayobe/kolla/config/prometheus/smart.rules new file mode 100644 index 000000000..0b6552598 --- /dev/null +++ b/etc/kayobe/kolla/config/prometheus/smart.rules @@ -0,0 +1,12 @@ +{% raw %} + +- alert: DiskSmartStatusUnhealthy + expr: smartmon_device_smart_healthy < 1 + for: 10m + labels: + severity: alert + annotations: + summary: "SMART monitor reports bad disk on (instance {{ $labels.instance }})" + description: "{{ $labels.instance }} is reporting unhealthy for the disk at {{ $labels.disk }}. Disk serial number is: {{ $labels.serial_number }}" + +{% endraw %} diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index b30ddd013..3081a2e5e 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -17,4 +17,11 @@ bifrost_tag: xena-20221128T101757 es_heap_size: 8g prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d" +# Additional command line flags for node exporter to enable texfile collector for disk metrics and create textfile docker volume +prometheus_node_exporter_extra_volumes: + - "textfile:/var/lib/node_exporter/textfile_collector" +prometheus_node_exporter_cmdline_extras: "--collector.textfile.directory=/var/lib/node_exporter/textfile_collector" + + ############################################################################# + diff --git a/releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml b/releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml new file mode 100644 index 000000000..feaec4dbe --- /dev/null +++ b/releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml @@ -0,0 +1,4 @@ +--- + +features: + - Enables SMART monitoring. Manual action is required, please see the monitoring documentation for the procedure.