From a8c0cc122c4924fdaa9a5fb1cc4cdb925dac4eab Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 19 Oct 2022 13:24:41 +0100 Subject: [PATCH 01/28] Ubuntu: bump OVS and OVN packages The Open vSwitch and OVN packages in Ubuntu Wallaby UCA repository are quite old - 2.15 and 20.12 respectively. Pull in these packages from the Yoga UCA, which are 2.17 and 22.03, to more closely match the CentOS packages. --- etc/kayobe/kolla.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml index ff7cb0f05..24f1b2222 100644 --- a/etc/kayobe/kolla.yml +++ b/etc/kayobe/kolla.yml @@ -221,6 +221,27 @@ kolla_build_blocks: -e 's/^[# ]*\(baseurl *=.*\)/#\1/g' \ -e '/#baseurl.*/a baseurl={{ repo.url }}' /etc/yum.repos.d/{{ repo.file }}{% if not loop.last %} &&{% endif %} \ {% endfor %} + # NOTE: The Open vSwitch and OVN packages in Ubuntu Wallaby UCA repository + # are quite old - 2.15 and 20.12 respectively. Pull in these packages from + # the Yoga UCA, which are 2.17 and 22.03, to more closely match the CentOS + # packages. + base_debian_after_sources_list: | + RUN echo "\ + deb http://ubuntu-cloud.archive.canonical.com/ubuntu focal-updates/yoga main"\ + > /etc/apt/sources.list.d/uca-yoga.list + RUN echo "\ + Package: *\n\ + Pin: release focal-updates/yoga\n\ + Pin-Priority: -1\n\ + \n\ + Package: ovn*\n\ + Pin: release focal-updates/yoga\n\ + Pin-Priority: 500\n\ + \n\ + Package: openvswitch* python3-openvswitch\n\ + Pin: release focal-updates/yoga\n\ + Pin-Priority: 500"\ + > /etc/apt/preferences.d/uca-yoga # NOTE: Not currently syncing Ubuntu packages, since the on_demand mirror in # Ark does not work if the upstream mirror pulls packages (which it does # sometimes). 
From d83cceb2c41c18c2406032dac36cf90e57f37107 Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Mon, 12 Dec 2022 22:08:51 +0000 Subject: [PATCH 02/28] Don't use interactive docker cmds in rabbitmq-reset.yml Remove --interactive and --tty args to docker exec commands in rabbitmq-reset.yml. --- etc/kayobe/ansible/rabbitmq-reset.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/etc/kayobe/ansible/rabbitmq-reset.yml b/etc/kayobe/ansible/rabbitmq-reset.yml index df6d0c4ca..23dde1971 100644 --- a/etc/kayobe/ansible/rabbitmq-reset.yml +++ b/etc/kayobe/ansible/rabbitmq-reset.yml @@ -29,20 +29,20 @@ delay: 6 - name: Wait for the rabbitmq node to automatically start on container start - command: "docker exec -it {{ container_name }} /bin/bash -c 'rabbitmqctl wait /var/lib/rabbitmq/mnesia/rabbitmq.pid --timeout 60'" + command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl wait /var/lib/rabbitmq/mnesia/rabbitmq.pid --timeout 60'" when: inspection.stdout == 'false' - name: Stop app - command: "docker exec -it {{ container_name }} /bin/bash -c 'rabbitmqctl stop_app'" + command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl stop_app'" - name: Force reset app - command: "docker exec -it {{ container_name }} /bin/bash -c 'rabbitmqctl force_reset'" + command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl force_reset'" - name: Start app - command: "docker exec -it {{ container_name }} /bin/bash -c 'rabbitmqctl start_app'" + command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl start_app'" - name: Wait for all nodes to join the cluster - command: "docker exec -it {{ container_name }} /bin/bash -c 'rabbitmqctl await_online_nodes {{ groups['controllers'] | length }}'" + command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl await_online_nodes {{ groups['controllers'] | length }}'" - name: Restart OpenStack services hosts: controllers:compute From fc547a3f6e88c5dc49657fd2860de4e8efd40e31 
Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Wed, 14 Dec 2022 07:47:58 +0000 Subject: [PATCH 03/28] Add Ubuntu image tags --- etc/kayobe/kolla.yml | 2 +- etc/kayobe/kolla/globals.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml index 61b7253e7..59fc4d50e 100644 --- a/etc/kayobe/kolla.yml +++ b/etc/kayobe/kolla.yml @@ -96,7 +96,7 @@ kolla_docker_registry_password: "{{ stackhpc_docker_registry_password }}" # Kolla OpenStack release version. This should be a Docker image tag. # Default is {{ openstack_release }}. -kolla_openstack_release: xena-20221010T103511 +kolla_openstack_release: "{% if kolla_base_distro == 'centos' %}xena-20221010T103511{% else %}xena-20221213T204703{% endif %}" # Docker tag applied to built container images. Default is # {{ kolla_openstack_release }}. diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index 774da38ba..c8fc2413a 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -5,6 +5,8 @@ docker_yum_gpgkey: "https://download.docker.com/linux/centos/gpg" {% if kolla_base_distro == 'centos' %} bifrost_tag: xena-20221128T101757 +{% else %} +bifrost_tag: xena-20221213T224057 {% endif %} ############################################################################# From 9773d874c5dca4da275407c7ba966ffce3985511 Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Wed, 14 Dec 2022 07:48:22 +0000 Subject: [PATCH 04/28] Enable Ubuntu AIO CI --- .github/workflows/stackhpc-pull-request.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/stackhpc-pull-request.yml b/.github/workflows/stackhpc-pull-request.yml index 4b027f44f..a534d413d 100644 --- a/.github/workflows/stackhpc-pull-request.yml +++ b/.github/workflows/stackhpc-pull-request.yml @@ -107,8 +107,7 @@ jobs: vm_image: Ubuntu-20.04 OS_CLOUD: sms-lab-release secrets: inherit - # NOTE: Ubuntu images not available yet. 
- if: false && github.repository == 'stackhpc/stackhpc-kayobe-config' + if: github.repository == 'stackhpc/stackhpc-kayobe-config' all-in-one-ubuntu-ovn: name: aio (Ubuntu OVN) @@ -122,5 +121,4 @@ jobs: vm_image: Ubuntu-20.04 OS_CLOUD: sms-lab-release secrets: inherit - # NOTE: Ubuntu images not available yet. - if: false && github.repository == 'stackhpc/stackhpc-kayobe-config' + if: github.repository == 'stackhpc/stackhpc-kayobe-config' From 6791190cd311974fa733ed7213c1d4fa7cf0ec28 Mon Sep 17 00:00:00 2001 From: Dawud M <7688823+technowhizz@users.noreply.github.com> Date: Fri, 18 Nov 2022 17:05:51 +0000 Subject: [PATCH 05/28] Add SMART Monitoring with dash and alerts Enabled Textfile collector in node exporter in kolla/globals.yml Added smartmon script as is from the prometheus-community github and then removed NVME support from this script in favour of using the nvme-cli script, which has also been added in. This is because the nvme-cli script provides better metrics than the smartmon script does. The script also adds the serial number of the disk as a label to all SMART metrics. Added a Kayobe custom playbook to easily deploy the script and associated cron job. This playbook installs smartmontool and nvmecli then copies these over to the hosts and sets up a cronjob which runs the scripts and stores the metrics in the docker volume for node exporter. The playbook changes the way the metrics are saved to a file by making use of the mv command as it is atomic. This was needed as at times prometheus would read a partially completed file. Added a prometheus alert to alert when a drive is reported as not healthy for more than 10 minutes. Added a Grafana dashboard to display the number of healthy and unhealthy drives reported in prometheus. 
(cherry picked from commit d83ecde1c8f38fe79da70041c637140c58f9dc9b) Add docs for SMART Monitoring (cherry picked from commit 595429ad15b77ee9777899e2593c41431ce37a82) Update doc/source/configuration/monitoring.rst Fix kayobe command Co-authored-by: Will Szumski (cherry picked from commit 9a5fc53c05de12b30dd360acfdcb023aedb84491) Update doc/source/configuration/monitoring.rst Fix Spelling Co-authored-by: Will Szumski (cherry picked from commit ef25d6f4d556809cfdc4a0be42300cd1249fdde1) Add release note (cherry picked from commit 3d4d01117dce76d668ac351a61f4843efe72ca0c) Amend docs and add release note (cherry picked from commit b6cb511d911f031de4e2b3c08fa5afb95e0f6d9a) Move SMART prometheus alert to own file (cherry picked from commit b353fd32c4799f325d1a5a0c8476690a12e30ff2) Fix typo (cherry picked from commit 611f2fb891802f2ff4d7248bcd7363acd244154c) fixup --- doc/source/configuration/monitoring.rst | 46 +- etc/kayobe/ansible/scripts/nvmemon.sh | 97 ++++ etc/kayobe/ansible/scripts/smartmon.sh | 202 +++++++ etc/kayobe/ansible/smartmon-tools.yml | 43 ++ .../openstack/hardware_overview.json | 543 ++++++++++++++++++ .../kolla/config/prometheus/smart.rules | 12 + etc/kayobe/kolla/globals.yml | 7 + .../notes/smart-mon-db8fa642c3af74b1.yaml | 4 + 8 files changed, 952 insertions(+), 2 deletions(-) create mode 100644 etc/kayobe/ansible/scripts/nvmemon.sh create mode 100644 etc/kayobe/ansible/scripts/smartmon.sh create mode 100644 etc/kayobe/ansible/smartmon-tools.yml create mode 100644 etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json create mode 100644 etc/kayobe/kolla/config/prometheus/smart.rules create mode 100644 releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index 7e53629f1..f358ea084 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -1,4 +1,7 @@ -======================== +=========== 
+Monitoring +=========== + Monitoring Configuration ======================== @@ -10,4 +13,43 @@ The configuration options can be found in ``etc/kayobe/stackhpc-monitoring.yml``: .. literalinclude:: ../../../etc/kayobe/stackhpc-monitoring.yml - :language: yaml \ No newline at end of file + :language: yaml + +SMART Drive Monitoring +======================= + +StackHPC kayobe config also includes drive monitoring for spinning disks and +NVMe drives. + +By default, node exporter doesn't provide SMART metrics, hence we make use +of 2 scripts (one for NVMe drives and one for spinning drives), which are run by +a cronjob, to output the metrics and we use node exporter's Textfile collector +to report the metrics output by the scripts to Prometheus. These metrics can +then be visualised in Grafana with the bundled dashboard. + +After pulling in the latest changes into your local kayobe config, reconfigure +Prometheus and Grafana: + +.. code-block:: console + + kayobe overcloud service reconfigure -kt grafana,prometheus + +(Note: If you run into an error when reconfiguring Grafana, it could be due to +`this `__ bug and at +present, the workaround is to go into each node running Grafana and manually +restart the process with ``docker restart grafana`` and then try the reconfigure +command again.) + +Once the reconfigure has completed you can now run the custom playbook which +copies over the scripts and sets up the cron jobs to start SMART monitoring +on the overcloud hosts: + +.. code-block:: console + + (kayobe) [stack@node ~]$ cd etc/kayobe + (kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmon-tools.yml + +SMART reporting should now be enabled along with a Prometheus alert for +unhealthy disks and a Grafana dashboard called ``Hardware Overview``.
+ + diff --git a/etc/kayobe/ansible/scripts/nvmemon.sh b/etc/kayobe/ansible/scripts/nvmemon.sh new file mode 100644 index 000000000..9ab727b0b --- /dev/null +++ b/etc/kayobe/ansible/scripts/nvmemon.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +set -eu + +# Dependencies: nvme-cli, jq (packages) +# Based on code from +# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/smartmon.sh +# - https://github.com/prometheus/node_exporter/blob/master/text_collector_examples/mellanox_hca_temp +# - https://github.com/vorlon/check_nvme/blob/master/check_nvme.sh +# +# Author: Henk + +# Check if we are root +if [ "$EUID" -ne 0 ]; then + echo "${0##*/}: Please run as root!" >&2 + exit 1 +fi + +# Check if programs are installed +if ! command -v nvme >/dev/null 2>&1; then + echo "${0##*/}: nvme is not installed. Aborting." >&2 + exit 1 +fi + +output_format_awk="$( + cat <<'OUTPUTAWK' +BEGIN { v = "" } +v != $1 { + print "# HELP nvme_" $1 " SMART metric " $1; + if ($1 ~ /_total$/) + print "# TYPE nvme_" $1 " counter"; + else + print "# TYPE nvme_" $1 " gauge"; + v = $1 +} +{print "nvme_" $0} +OUTPUTAWK +)" + +format_output() { + sort | awk -F'{' "${output_format_awk}" +} + +# Get the nvme-cli version +nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')" +echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output + +# Get devices +device_list="$(nvme list -o json | jq -r '.Devices | .[].DevicePath')" + +# Loop through the NVMe devices +for device in ${device_list}; do + json_check="$(nvme smart-log -o json "${device}")" + disk="${device##*/}" + + # The temperature value in JSON is in Kelvin, we want Celsius + value_temperature="$(echo "$json_check" | jq '.temperature - 273')" + echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}" + + value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')" + echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}" + + value_available_spare_threshold="$(echo 
"$json_check" | jq '.spare_thresh / 100')" + echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}" + + value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')" + echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}" + + value_critical_warning="$(echo "$json_check" | jq '.critical_warning')" + echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}" + + value_media_errors="$(echo "$json_check" | jq '.media_errors')" + echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}" + + value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')" + echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}" + + value_power_cycles="$(echo "$json_check" | jq '.power_cycles')" + echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}" + + value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')" + echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}" + + value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')" + echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}" + + value_data_units_written="$(echo "$json_check" | jq '.data_units_written')" + echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}" + + value_data_units_read="$(echo "$json_check" | jq '.data_units_read')" + echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}" + + value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')" + echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}" + + value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')" + echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}" +done | format_output diff --git a/etc/kayobe/ansible/scripts/smartmon.sh b/etc/kayobe/ansible/scripts/smartmon.sh new file mode 100644 index 
000000000..bcac8b8b3 --- /dev/null +++ b/etc/kayobe/ansible/scripts/smartmon.sh @@ -0,0 +1,202 @@ +#!/bin/bash +# Script informed by the collectd monitoring script for smartmontools (using smartctl) +# by Samuel B. (c) 2012 +# source at: http://devel.dob.sk/collectd-scripts/ + +# TODO: This probably needs to be a little more complex. The raw numbers can have more +# data in them than you'd think. +# http://arstechnica.com/civis/viewtopic.php?p=22062211 + +# Formatting done via shfmt -i 2 +# https://github.com/mvdan/sh + +parse_smartctl_attributes_awk="$( + cat <<'SMARTCTLAWK' +$1 ~ /^ *[0-9]+$/ && $2 ~ /^[a-zA-Z0-9_-]+$/ { + gsub(/-/, "_"); + printf "%s_value{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $4 + printf "%s_worst{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $5 + printf "%s_threshold{%s,smart_id=\"%s\"} %d\n", $2, labels, $1, $6 + printf "%s_raw_value{%s,smart_id=\"%s\"} %e\n", $2, labels, $1, $10 +} +SMARTCTLAWK +)" + +smartmon_attrs="$( + cat <<'SMARTMONATTRS' +airflow_temperature_cel +command_timeout +current_pending_sector +end_to_end_error +erase_fail_count +g_sense_error_rate +hardware_ecc_recovered +host_reads_32mib +host_reads_mib +host_writes_32mib +host_writes_mib +load_cycle_count +media_wearout_indicator +nand_writes_1gib +offline_uncorrectable +power_cycle_count +power_on_hours +program_fail_cnt_total +program_fail_count +raw_read_error_rate +reallocated_event_count +reallocated_sector_ct +reported_uncorrect +runtime_bad_block +sata_downshift_count +seek_error_rate +spin_retry_count +spin_up_time +start_stop_count +temperature_case +temperature_celsius +temperature_internal +total_lbas_read +total_lbas_written +udma_crc_error_count +unsafe_shutdown_count +unused_rsvd_blk_cnt_tot +wear_leveling_count +workld_host_reads_perc +workld_media_wear_indic +workload_minutes +SMARTMONATTRS +)" +smartmon_attrs="$(echo "${smartmon_attrs}" | xargs | tr ' ' '|')" + +parse_smartctl_attributes() { + local disk="$1" + local disk_type="$2" + local serial="$3" + 
local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" + sed 's/^ \+//g' | + awk -v labels="${labels}" "${parse_smartctl_attributes_awk}" 2>/dev/null | + tr '[:upper:]' '[:lower:]' | + grep -E "(${smartmon_attrs})" +} + +parse_smartctl_scsi_attributes() { + local disk="$1" + local disk_type="$2" + local serial="$3" + local labels="disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial}\"" + while read -r line; do + attr_type="$(echo "${line}" | tr '=' ':' | cut -f1 -d: | sed 's/^ \+//g' | tr ' ' '_')" + attr_value="$(echo "${line}" | tr '=' ':' | cut -f2 -d: | sed 's/^ \+//g')" + case "${attr_type}" in + number_of_hours_powered_up_) power_on="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Current_Drive_Temperature) temp_cel="$(echo "${attr_value}" | cut -f1 -d' ' | awk '{ printf "%e\n", $1 }')" ;; + Blocks_sent_to_initiator_) lbas_read="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Blocks_received_from_initiator_) lbas_written="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Accumulated_start-stop_cycles) power_cycle="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + Elements_in_grown_defect_list) grown_defects="$(echo "${attr_value}" | awk '{ printf "%e\n", $1 }')" ;; + esac + done + [ -n "$power_on" ] && echo "power_on_hours_raw_value{${labels},smart_id=\"9\"} ${power_on}" + [ -n "$temp_cel" ] && echo "temperature_celsius_raw_value{${labels},smart_id=\"194\"} ${temp_cel}" + [ -n "$lbas_read" ] && echo "total_lbas_read_raw_value{${labels},smart_id=\"242\"} ${lbas_read}" + [ -n "$lbas_written" ] && echo "total_lbas_written_raw_value{${labels},smart_id=\"241\"} ${lbas_written}" + [ -n "$power_cycle" ] && echo "power_cycle_count_raw_value{${labels},smart_id=\"12\"} ${power_cycle}" + [ -n "$grown_defects" ] && echo "grown_defects_count_raw_value{${labels},smart_id=\"-1\"} ${grown_defects}" +} + +parse_smartctl_info() { + local -i smart_available=0 smart_enabled=0 smart_healthy= + 
local disk="$1" disk_type="$2" + local model_family='' device_model='' serial_number='' fw_version='' vendor='' product='' revision='' lun_id='' + while read -r line; do + info_type="$(echo "${line}" | cut -f1 -d: | tr ' ' '_')" + info_value="$(echo "${line}" | cut -f2- -d: | sed 's/^ \+//g' | sed 's/"/\\"/')" + case "${info_type}" in + Model_Family) model_family="${info_value}" ;; + Device_Model) device_model="${info_value}" ;; + Serial_Number) serial_number="${info_value}" ;; + Firmware_Version) fw_version="${info_value}" ;; + Vendor) vendor="${info_value}" ;; + Product) product="${info_value}" ;; + Revision) revision="${info_value}" ;; + Logical_Unit_id) lun_id="${info_value}" ;; + esac + if [[ "${info_type}" == 'SMART_support_is' ]]; then + case "${info_value:0:7}" in + Enabled) smart_available=1; smart_enabled=1 ;; + Availab) smart_available=1; smart_enabled=0 ;; + Unavail) smart_available=0; smart_enabled=0 ;; + esac + fi + if [[ "${info_type}" == 'SMART_overall-health_self-assessment_test_result' ]]; then + case "${info_value:0:6}" in + PASSED) smart_healthy=1 ;; + *) smart_healthy=0 ;; + esac + elif [[ "${info_type}" == 'SMART_Health_Status' ]]; then + case "${info_value:0:2}" in + OK) smart_healthy=1 ;; + *) smart_healthy=0 ;; + esac + fi + done + echo "device_info{disk=\"${disk}\",type=\"${disk_type}\",vendor=\"${vendor}\",product=\"${product}\",revision=\"${revision}\",lun_id=\"${lun_id}\",model_family=\"${model_family}\",device_model=\"${device_model}\",serial_number=\"${serial_number}\",firmware_version=\"${fw_version}\"} 1" + echo "device_smart_available{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_available}" + [[ "${smart_available}" == "1" ]] && echo "device_smart_enabled{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_enabled}" + [[ "${smart_available}" == "1" ]] && [[ "${smart_healthy}" != "" ]] && echo 
"device_smart_healthy{disk=\"${disk}\",type=\"${disk_type}\",serial_number=\"${serial_number}\"} ${smart_healthy}" +} + +output_format_awk="$( + cat <<'OUTPUTAWK' +BEGIN { v = "" } +v != $1 { + print "# HELP smartmon_" $1 " SMART metric " $1; + print "# TYPE smartmon_" $1 " gauge"; + v = $1 +} +{print "smartmon_" $0} +OUTPUTAWK +)" + +format_output() { + sort | + awk -F'{' "${output_format_awk}" +} + +smartctl_version="$(/usr/sbin/smartctl -V | head -n1 | awk '$1 == "smartctl" {print $2}')" + +echo "smartctl_version{version=\"${smartctl_version}\"} 1" | format_output + +if [[ "$(expr "${smartctl_version}" : '\([0-9]*\)\..*')" -lt 6 ]]; then + exit +fi + +device_list="$(/usr/sbin/smartctl --scan-open | awk '/^\/dev/{print $1 "|" $3}')" + +for device in ${device_list}; do + disk="$(echo "${device}" | cut -f1 -d'|')" + type="$(echo "${device}" | cut -f2 -d'|')" + # Use REGEX to extract the serial number from the parsed information and save that to a variable + serial_number="$(/usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}"| sed -E ':a;N;$!ba;s/.*serial_number=\"([^"]+)\".*/\1/g' | sed -E 's/^device_info\{.*//g')" + active=1 + echo "smartctl_run{disk=\"${disk}\",type=\"${type}\"}" "$(TZ=UTC date '+%s')" + # Check if the device is in a low-power mode + /usr/sbin/smartctl -n standby -d "${type}" "${disk}" > /dev/null || active=0 + echo "device_active{disk=\"${disk}\",type=\"${type}\"}" "${active}" + # Skip further metrics to prevent the disk from spinning up + test ${active} -eq 0 && continue + # Get the SMART information and health + /usr/sbin/smartctl -i -H -d "${type}" "${disk}" | parse_smartctl_info "${disk}" "${type}" + # Get the SMART attributes + case ${type} in + sat) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; + sat+megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_attributes "${disk}" "${type}" "${serial_number}" ;; + scsi) 
/usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; + megaraid*) /usr/sbin/smartctl -A -d "${type}" "${disk}" | parse_smartctl_scsi_attributes "${disk}" "${type}" "${serial_number}" ;; + *) + (>&2 echo "disk type is not sat, scsi or megaraid but ${type}") + exit + ;; + esac +done | format_output \ No newline at end of file diff --git a/etc/kayobe/ansible/smartmon-tools.yml b/etc/kayobe/ansible/smartmon-tools.yml new file mode 100644 index 000000000..6b275c264 --- /dev/null +++ b/etc/kayobe/ansible/smartmon-tools.yml @@ -0,0 +1,43 @@ +--- +- hosts: overcloud + + tasks: + - name: Ensure smartmon-tools and nvme-cli is installed + package: + name: + - smartmontools + - nvme-cli + - jq + state: present + become: true + + - name: Copy smartmon.sh and nvmemon.sh from scripts folder + copy: + src: "scripts/{{ item }}" + dest: /usr/local/bin/ + owner: 'root' + group: 'root' + mode: '0700' + loop: + - smartmon.sh + - nvmemon.sh + become: yes + + - name: Set PATH Variable for cron + cron: + name: PATH + user: root + env: yes + job: /usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + become: yes + + - name: Schedule cronjob to run both scripts every 5 minutes and save output to file + cron: + name: "SMART metrics for drive monitoring using {{ item }}" + user: root + minute: "*/5" + job: "/usr/local/bin/{{ item }}.sh > /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp && mv /var/lib/docker/volumes/textfile/_data/{{ item }}.prom.temp /var/lib/docker/volumes/textfile/_data/{{ item }}.prom" + loop: + - smartmon + - nvmemon + become: yes diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json new file mode 100644 index 000000000..e4f78aee3 --- /dev/null +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json @@ -0,0 +1,543 @@ +{% raw %} +{ + "annotations": { + 
"list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of healthy drives", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 0 + }, + "hideTimeOverride": false, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(smartmon_device_smart_healthy > 0)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Healthy Drives", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Number of unhealthy drives", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6,
+ "x": 6, + "y": 0 + }, + "hideTimeOverride": false, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(smartmon_device_smart_healthy < 1) ", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Unhealthy Drives", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Total number of drives reporting a SMART health status", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 0 + }, + "hideTimeOverride": false, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(smartmon_device_smart_healthy)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total Drives", + "transformations": [], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "align": "center",
"displayMode": "auto", + "filterable": false, + "inspect": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 1, + "text": "Failed" + }, + "1": { + "color": "dark-green", + "index": 0, + "text": "Ok" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.displayMode", + "value": "color-background-solid" + }, + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "type" + }, + "properties": [ + { + "id": "custom.width", + "value": 153 + }, + { + "id": "displayName", + "value": "Type" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "serial_number" + }, + "properties": [ + { + "id": "custom.width", + "value": 208 + }, + { + "id": "displayName", + "value": "Serial Number" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "disk" + }, + "properties": [ + { + "id": "custom.width", + "value": 146 + }, + { + "id": "displayName", + "value": "Disk" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "custom.width", + "value": 203 + }, + { + "id": "displayName", + "value": "Hostname" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Type" + }, + "properties": [ + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Serial Number" + }, + "properties": [ + { + "id": 
"custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Hostname" + }, + "properties": [ + { + "id": "custom.width" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Disk" + }, + "properties": [ + { + "id": "custom.width" + } + ] + } + ] + }, + "gridPos": { + "h": 13, + "w": 18, + "x": 0, + "y": 7 + }, + "id": 2, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "9.1.2", + "targets": [ + { + "$$hashKey": "object:40", + "aggregation": "Last", + "alias": "Healthy", + "crit": 0, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "decimals": 0, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "exemplar": false, + "expr": "smartmon_device_smart_healthy", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "range": false, + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 0 + } + ], + "title": "Panel Title", + "transparent": true, + "type": "table" + } + ], + "refresh": false, + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "filters": [], + "hide": 0, + "name": "Filters", + "skipUrlSync": false, + "type": "adhoc" + }, + { + "current": { + "selected": true, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Hardware Overview", + "uid": "TCN51Y25P", + "version": 1, + "weekStart": "" +} +{% endraw %} \ No newline at end of 
file diff --git a/etc/kayobe/kolla/config/prometheus/smart.rules b/etc/kayobe/kolla/config/prometheus/smart.rules new file mode 100644 index 000000000..0b6552598 --- /dev/null +++ b/etc/kayobe/kolla/config/prometheus/smart.rules @@ -0,0 +1,12 @@ +{% raw %} + +- alert: DiskSmartStatusUnhealthy + expr: smartmon_device_smart_healthy < 1 + for: 10m + labels: + severity: alert + annotations: + summary: "SMART monitor reports bad disk on (instance {{ $labels.instance }})" + description: "{{ $labels.instance }} is reporting unhealthy for the disk at {{ $labels.disk }}. Disk serial number is: {{ $labels.serial_number }}" + +{% endraw %} diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index b30ddd013..3081a2e5e 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -17,4 +17,11 @@ bifrost_tag: xena-20221128T101757 es_heap_size: 8g prometheus_cmdline_extras: "--storage.tsdb.retention.time=30d" +# Additional command line flags for node exporter to enable textfile collector for disk metrics and create textfile docker volume +prometheus_node_exporter_extra_volumes: + - "textfile:/var/lib/node_exporter/textfile_collector" +prometheus_node_exporter_cmdline_extras: "--collector.textfile.directory=/var/lib/node_exporter/textfile_collector" + + ############################################################################# + diff --git a/releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml b/releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml new file mode 100644 index 000000000..feaec4dbe --- /dev/null +++ b/releasenotes/notes/smart-mon-db8fa642c3af74b1.yaml @@ -0,0 +1,4 @@ +--- + +features: + - Enables SMART monitoring. Manual action is required, please see the monitoring documentation for the procedure. 
From 5a7d68d3d41d33f5b9449a1d30f2e33a82acdeb3 Mon Sep 17 00:00:00 2001 From: Matt Anson Date: Fri, 11 Nov 2022 09:17:02 +0000 Subject: [PATCH 06/28] Increase job timeout for kolla image build GHA (cherry picked from commit 807e935dd4dc622ffbaf0a1df2a86c93bacf8b4c) --- .github/workflows/stackhpc-container-image-build.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stackhpc-container-image-build.yml b/.github/workflows/stackhpc-container-image-build.yml index 65428c039..f69626d27 100644 --- a/.github/workflows/stackhpc-container-image-build.yml +++ b/.github/workflows/stackhpc-container-image-build.yml @@ -34,6 +34,7 @@ jobs: name: Build Kolla container images if: github.repository == 'stackhpc/stackhpc-kayobe-config' runs-on: [self-hosted, stackhpc-kayobe-config-kolla-builder] + timeout-minutes: 720 steps: - uses: actions/checkout@v3 with: From a089c3ebcc1aecae7a1a508cf5cceec361da420b Mon Sep 17 00:00:00 2001 From: Dawud M <7688823+technowhizz@users.noreply.github.com> Date: Fri, 2 Dec 2022 16:04:05 +0000 Subject: [PATCH 07/28] Fix oom-killer graph Changes the oom-killer graph from a smoothed irate to a discrete delta function. 
Change-Id: I2e4a8576c628610409ade4aad2bd98754bec3860 (cherry picked from commit ef1a449034bb4501e333ae41910e8d77cb5b4ad8) --- .../grafana/dashboards/openstack/node_exporter_full.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json index 08078c31d..66d630b8d 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/node_exporter_full.json @@ -4886,7 +4886,7 @@ "Total Swap": "#614D93", "VmallocUsed": "#EA6460" }, - "bars": false, + "bars": true, "dashLength": 10, "dashes": false, "datasource": { @@ -4921,7 +4921,7 @@ "total": false, "values": true }, - "lines": true, + "lines": false, "linewidth": 1, "links": [], "maxPerRow": 6, @@ -4940,9 +4940,9 @@ "steppedLine": false, "targets": [ { - "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[5m])", + "expr": "max_over_time(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval:]) - (min_over_time(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval:]))", "format": "time_series", - "interval": "", + "interval": "30s", "intervalFactor": 2, "legendFormat": "oom killer invocations ", "refId": "A", From de1be4842e9e3291077a6bf80dbdf7369f74b39b Mon Sep 17 00:00:00 2001 From: Stig Telfer Date: Thu, 10 Nov 2022 11:05:00 +0000 Subject: [PATCH 08/28] Rephrase the match logic for interfaces monitored for package drops OVS bridge interfaces drop packets during normal operation. Change the regex to filter out interfaces that don't matter for packet drops. 
(cherry picked from commit 9c3f15a1f374d24d08a0bdac63886220247d2a4e) --- etc/kayobe/kolla/config/prometheus/ceph.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/kolla/config/prometheus/ceph.rules b/etc/kayobe/kolla/config/prometheus/ceph.rules index 7df171501..52b9841a9 100644 --- a/etc/kayobe/kolla/config/prometheus/ceph.rules +++ b/etc/kayobe/kolla/config/prometheus/ceph.rules @@ -154,7 +154,7 @@ groups: # alert on nic packet errors and drops rates > 1 packet/s - alert: NetworkPacketsDropped - expr: irate(node_network_receive_drop_total{device=~"en.*|eth.*"}[5m]) + irate(node_network_transmit_drop_total{device=~"en.*|eth.*"}[5m]) > 1 + expr: irate(node_network_receive_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) + irate(node_network_transmit_drop_total{device!~"lo|br.*|.*-ovs|tap.*"}[5m]) > 1 labels: severity: warning annotations: From 73bf07968f62bcaf0b16813ffd4bd28cf19c3d65 Mon Sep 17 00:00:00 2001 From: Dawud M <7688823+technowhizz@users.noreply.github.com> Date: Tue, 29 Nov 2022 16:18:48 +0000 Subject: [PATCH 09/28] Add docs for SMART Monitoring --- doc/source/configuration/monitoring.rst | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index f358ea084..2c8e253fb 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -1,6 +1,6 @@ -=========== +========== Monitoring -=========== +========== Monitoring Configuration ======================== @@ -16,7 +16,7 @@ The configuration options can be found in :language: yaml SMART Drive Monitoring -======================= +====================== StackHPC kayobe config also includes drive monitoring for spinning disks and NVME's. @@ -27,7 +27,7 @@ a cronjob, to output the metrics and we use node exporter's Textfile collector to report the metrics output by the scripts to Prometheus. 
These metrics can then be visualised in Grafana with the bundled dashboard. -After pulling in the latest changes into your local kayobe config, reconfigure +After pulling in the latest changes into your local kayobe config, reconfigure Prometheus and Grafana .. code-block:: console @@ -38,7 +38,7 @@ Prometheus and Grafana `this `__ bug and at present, the workaround is to go into each node running Grafana and manually restart the process with ``docker restart grafana`` and then try the reconfigure -command again.)  +command again.) Once the reconfigure has completed you can now run the custom playbook which copies over the scripts and sets up the cron jobs to start SMART monitoring @@ -50,6 +50,4 @@ on the overcloud hosts: (kayobe) [stack@node kayobe]$ kayobe playbook run ansible/smartmontools.yml SMART reporting should now be enabled along with a Prometheus alert for -unhealthy disks and a Grafana dashboard called ``Hardware Overview``.  - - +unhealthy disks and a Grafana dashboard called ``Hardware Overview``. From 2dcec472ce3db4cc5fd00df94df6dce194b1f9c3 Mon Sep 17 00:00:00 2001 From: Piotr Parczewski Date: Mon, 19 Dec 2022 10:54:21 +0100 Subject: [PATCH 10/28] Add note on enabling standard configuration --- doc/source/configuration/monitoring.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index 2c8e253fb..43ee5f4fc 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -15,6 +15,17 @@ The configuration options can be found in .. literalinclude:: ../../../etc/kayobe/stackhpc-monitoring.yml :language: yaml +In order to enable stock monitoring configuration within a particular +environment, create the following symbolic links: + +.. 
code-block:: console + + cd $KAYOBE_CONFIG_PATH + ln -s kolla/config/grafana/ environments/$KAYOBE_ENVIRONMENT/kolla/config/ + ln -s kolla/config/prometheus/ environments/$KAYOBE_ENVIRONMENT/kolla/config/ + +and commit them to the config repository. + SMART Drive Monitoring ====================== From 74b27c515a92a3b51cfe93a91b6372003c5d6ad3 Mon Sep 17 00:00:00 2001 From: Piotr Parczewski Date: Mon, 19 Dec 2022 16:09:38 +0100 Subject: [PATCH 11/28] Docs edit --- doc/source/configuration/monitoring.rst | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index 43ee5f4fc..6045f3c29 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -5,9 +5,25 @@ Monitoring Monitoring Configuration ======================== -StackHPC kayobe config includes a reference monitoring stack based on -Prometheus. Whilst this often works out of the box, there are some tunables -which can be customised to adapt the configuration to a particular deployment. +StackHPC kayobe config includes a reference monitoring and alerting stack based +on Prometheus, Alertmanager, Grafana, Fluentd, Elasticsearch & Kibana. These +services by default come enabled and configured. Central Elasticsearch cluster +collects OpenStack logs, with an option to receive operating system logs too. +In order to enable this, execute custom playbook after deployment: + +.. code-block:: console + + cd $KAYOBE_CONFIG_PATH + kayobe playbook run ansible/rsyslog.yml + +`Prometheus `__ comes with a comprehensive set of +metrics gathered from enabled exporters; every exporter's data is visualised +by at least one `Grafana `__ dashboard. Standard set of +alerting rules is present as well. + +While the default configuration often works out of the box, there +are some tunables which can be customised to adapt the configuration to a +particular deployment's needs. 
The configuration options can be found in ``etc/kayobe/stackhpc-monitoring.yml``: From 097c98565dd6bd0eb16d49b87e4da7e2f2be3a5c Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Mon, 19 Dec 2022 16:32:58 +0000 Subject: [PATCH 12/28] Fail if the controller clocks are not synced --- etc/kayobe/ansible/rabbitmq-reset.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/etc/kayobe/ansible/rabbitmq-reset.yml b/etc/kayobe/ansible/rabbitmq-reset.yml index 23dde1971..d1a03eeb1 100644 --- a/etc/kayobe/ansible/rabbitmq-reset.yml +++ b/etc/kayobe/ansible/rabbitmq-reset.yml @@ -11,6 +11,21 @@ vars: - container_name: rabbitmq tasks: + - name: Checking timedatectl status + become: true + command: timedatectl status + register: timedatectl_status + changed_when: false + + - name: Fail if the clock is not synchronized + fail: + msg: >- + timedatectl sees the system clock as unsynchronized. + You may need to force synchronisation using `chronyc makestep`. + Otherwise, please wait for synchronization. + when: + - "'synchronized: yes' not in timedatectl_status.stdout" + - name: Inspect the {{ container_name }} container shell: cmd: "docker container inspect --format '{{ '{{' }} .State.Running {{ '}}' }}' {{ container_name }}" From 5753017ad6261ee0f74ab5248c9caed0b50da196 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Mon, 19 Dec 2022 22:00:40 +0100 Subject: [PATCH 13/28] Bump cloudkitty tag The cloudkitty image was missing our latest backports. Only Ubuntu images have been rebuilt so far. 
--- etc/kayobe/kolla/globals.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/etc/kayobe/kolla/globals.yml b/etc/kayobe/kolla/globals.yml index 84cd97511..b6071fe1b 100644 --- a/etc/kayobe/kolla/globals.yml +++ b/etc/kayobe/kolla/globals.yml @@ -7,4 +7,5 @@ docker_yum_gpgkey: "https://download.docker.com/linux/centos/gpg" bifrost_tag: wallaby-20220921T100954 {% else %} bifrost_tag: wallaby-20220825T112231 +cloudkitty_tag: wallaby-20221215T220154 {% endif %} From b5a0c2f0dcacf98fa06bcbfd48813107843438d6 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 12 Dec 2022 09:21:24 +0000 Subject: [PATCH 14/28] docs: Add current_series replacement and extlinks This allows us to reference the current release series in the documentation. --- doc/source/conf.py | 26 +++++++++++++++++++ doc/source/configuration/release-train.rst | 9 +++---- .../contributor/environments/ci-builder.rst | 2 +- doc/source/usage.rst | 5 ++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index f7214fed0..aaeb990f5 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -26,12 +26,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +# -- StackHPC Kayobe configuration -------------------------------------- +# Variables to override + +current_series = "xena" +branch = f"stackhpc/{current_series}" + +# Substitutions loader +rst_epilog = """ +.. |current_release| replace:: {current_release} +.. |current_release_git_branch_name| replace:: {current_release_git_branch_name} +""".format( # noqa: E501 + current_release_git_branch_name=branch, + current_release=current_series, +) + # -- General configuration ---------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
extensions = [ #'sphinx.ext.autodoc', + 'sphinx.ext.extlinks', #'sphinx.ext.intersphinx', 'sphinxcontrib.rsvgconverter', ] @@ -91,3 +107,13 @@ # Disable usage of xindy https://bugzilla.redhat.com/show_bug.cgi?id=1643664 latex_use_xindy = False +extlinks_projects = { + "kayobe", + "kolla", + "kolla-ansible", +} + +extlinks = { + f"{project}-doc": (f"https://docs.openstack.org/{project}/{current_series}/", "%s documentation") + for project in extlinks_projects +} diff --git a/doc/source/configuration/release-train.rst b/doc/source/configuration/release-train.rst index 04b583410..6e841b8f1 100644 --- a/doc/source/configuration/release-train.rst +++ b/doc/source/configuration/release-train.rst @@ -57,15 +57,14 @@ The distribution name for the environment should be configured as either Usage ===== -The local Pulp service will be deployed as a `Seed custom container -`__ +The local Pulp service will be deployed as a :kayobe-doc:`Seed custom container +` on next ``kayobe seed service deploy`` or ``kayobe seed service upgrade``. The following custom playbooks are provided in ``etc/kayobe/ansible/``: -See the Kayobe `custom playbook documentation -`__ -for information on how to run them. +See the Kayobe :kayobe-doc:`custom playbook documentation +` for information on how to run them. * ``pulp-repo-sync.yml``: Pull packages from Ark to the local Pulp. This will create a new repository version (snapshot) for each repository in the local diff --git a/doc/source/contributor/environments/ci-builder.rst b/doc/source/contributor/environments/ci-builder.rst index f146b2107..025c6bb65 100644 --- a/doc/source/contributor/environments/ci-builder.rst +++ b/doc/source/contributor/environments/ci-builder.rst @@ -105,7 +105,7 @@ At this point you are ready to build and push some container images. kayobe seed container image build --push kayobe overcloud container image build --push -The container images are tagged as ``xena-``. 
This Kayobe +The container images are tagged as |current_release|-. This Kayobe configuration includes a hook that writes the tag to ``~/kolla_tag``, since it is not always simple to determine which tag was last applied to built images. diff --git a/doc/source/usage.rst b/doc/source/usage.rst index f65b98af1..867d2983c 100644 --- a/doc/source/usage.rst +++ b/doc/source/usage.rst @@ -12,9 +12,8 @@ based on the upstream kayobe-config, with some opinionated configuration changes applied. Since this repository makes changes to the base configuration, it works best -when used with Kayobe's `multiple environments -`__ -feature. +when used with Kayobe's :kayobe-doc:`multiple environments +` feature. This configuration should be consumed using the `StackHPC Kayobe fork `__, which includes From ce21921dbe9a1d0e1d8d5071355910887e61247e Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 12 Dec 2022 16:02:46 +0000 Subject: [PATCH 15/28] docs: overview --- doc/source/index.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/source/index.rst b/doc/source/index.rst index a9906b35f..884629113 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -11,7 +11,16 @@ Welcome to StackHPC's Kayobe Config documentation! Overview ======== +This documentation covers the StackHPC Kayobe configuration. It is intended to +complement, rather than replace, the upstream :kayobe-doc:`Kayobe `, +:kolla-doc:`Kolla ` and :kolla-ansible-doc:`Kolla Ansible ` +documentation. 
+The configuration includes various things, such as: + +* Opinionated configuration +* Custom playbooks +* Continuous Integration (CI) workflows Contents ======== From 1b66e364832e596f448ff3474445b132a91f47e8 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 12 Dec 2022 19:55:04 +0000 Subject: [PATCH 16/28] Xena: batched release notes --- .../notes/xena-batch-bc1da4e4d0f6257e.yaml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 releasenotes/notes/xena-batch-bc1da4e4d0f6257e.yaml diff --git a/releasenotes/notes/xena-batch-bc1da4e4d0f6257e.yaml b/releasenotes/notes/xena-batch-bc1da4e4d0f6257e.yaml new file mode 100644 index 000000000..1c9446dd5 --- /dev/null +++ b/releasenotes/notes/xena-batch-bc1da4e4d0f6257e.yaml @@ -0,0 +1,21 @@ +--- +features: + - | + Adds a custom playbook to run the `Anomaly Detection Visualiser (ADVise) + `_, ``advise-run.yml``. + - | + Adds a custom playbook to reset the RabbitMQ cluster and restart OpenStack + services that use it, ``rabbitmq-reset.yml``. + - | + Adds a custom playbook to configure swap, ``swap.yml``. + - | + Adds the `Kayobe Automation + `__ Git repository as a + submodule, and provides some basic configuration for it in an + ``.automation.conf`` directory. + - | + Adds support for deploying a Squid caching proxy as a custom container on + the seed. + - | + Enables Elasticsearch, Grafana, Kibana, Prometheus by default. Provides + standard dashboards for Grafana and alerting rules for Prometheus. 
From 3367a28a58847b63108117142887570403671592 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 12 Dec 2022 20:13:51 +0000 Subject: [PATCH 17/28] docs: Add release notes into main docs --- doc/requirements.txt | 2 +- doc/source/conf.py | 1 + doc/source/index.rst | 1 + doc/source/release-notes.rst | 6 ++++++ 4 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 doc/source/release-notes.rst diff --git a/doc/requirements.txt b/doc/requirements.txt index 842c709a9..650e0def0 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -2,6 +2,6 @@ # of appearance. Changing the order has an impact on the overall integration # process, which may cause wedges in the gate later. -reno>=3.1.0 # Apache-2.0 +reno>=3.4.0 # Apache-2.0 sphinx>=4.2.0 # BSD sphinxcontrib-svg2pdfconverter>=0.1.0 # BSD diff --git a/doc/source/conf.py b/doc/source/conf.py index aaeb990f5..f52a21696 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -46,6 +46,7 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ + 'reno.sphinxext', #'sphinx.ext.autodoc', 'sphinx.ext.extlinks', #'sphinx.ext.intersphinx', diff --git a/doc/source/index.rst b/doc/source/index.rst index 884629113..a7f770bc4 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -30,4 +30,5 @@ Contents usage configuration/index + release-notes contributor/index diff --git a/doc/source/release-notes.rst b/doc/source/release-notes.rst new file mode 100644 index 000000000..e5cdefbdb --- /dev/null +++ b/doc/source/release-notes.rst @@ -0,0 +1,6 @@ +========================= +Xena Series Release Notes +========================= + +.. 
release-notes:: + :branch: stackhpc/xena From 80b5565e0c647e0171f4762dba3b277616fb61bf Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 12 Dec 2022 20:14:36 +0000 Subject: [PATCH 18/28] reno: Match on version-specific tag This seems to help reno collect the right notes. --- releasenotes/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/releasenotes/config.yaml b/releasenotes/config.yaml index 55f94b723..5dd9a157a 100644 --- a/releasenotes/config.yaml +++ b/releasenotes/config.yaml @@ -1,4 +1,4 @@ --- # This needs to be updated to the latest release. default_branch: stackhpc/xena -release_tag_re: stackhpc/\d+\.\d+\.\d+\.\d +release_tag_re: stackhpc/11\.\d+\.\d+\.\d From 63412e378e5fcd0016c39c22dc32c3174a9726a1 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 12 Dec 2022 16:03:05 +0000 Subject: [PATCH 19/28] docs: overcloud host image --- doc/source/configuration/host-images.rst | 66 ++++++++++++++++++++++++ doc/source/configuration/index.rst | 1 + 2 files changed, 67 insertions(+) create mode 100644 doc/source/configuration/host-images.rst diff --git a/doc/source/configuration/host-images.rst b/doc/source/configuration/host-images.rst new file mode 100644 index 000000000..cfa5fe678 --- /dev/null +++ b/doc/source/configuration/host-images.rst @@ -0,0 +1,66 @@ +.. _host-images: + +=========== +Host Images +=========== + +StackHPC Kayobe configuration provides configuration for some standard +overcloud host images, built using the :kayobe-doc:`overcloud DIB +` functionality of Kayobe. + +The overcloud DIB configuration is provided in +``etc/kayobe/stackhpc-overcloud-dib.yml``. It is not used by default, and must +be actively opted into. This can be done as follows: + +.. 
code-block:: yaml + :caption: ``etc/kayobe/overcloud-dib.yml`` + + overcloud_dib_build_host_images: true + + overcloud_dib_host_images: + - "{{ stackhpc_overcloud_dib_host_image }}" + +The image name is configured via ``stackhpc_overcloud_dib_name``, and is +``deployment_image`` by default. + +The list of DIB elements is configured via ``stackhpc_overcloud_dib_elements``. +The default value depends on the ``os_distribution`` variable. See the YAML +file for details. + +The DIB environment variables are configured via +``stackhpc_overcloud_dib_env_vars``. See the YAML file for details. + +A list of packages to install is configured via +``stackhpc_overcloud_dib_packages``. + +By default, a UEFI-compatible image is built that uses separate LVM volumes for +different mount points. This is done to pass Centre for Internet Security (CIS) +partition benchmarks. The block device YAML configuration is configured via +``stackhpc_overcloud_dib_block_device_config_uefi_lvm``. + +The 3 partitions are: + +* p0: EFI ESP bootloader +* p1: EFI BSP +* p2: LVM PV (``rootpv``) + +The LVM Logical Volumes are: + +============== ================== ========= +LV Mount point Size (GB) +============== ================== ========= +``lv_root`` ``/`` 5G +``lv_tmp`` ``/tmp`` 1G +``lv_var`` ``/var`` 1G +``lv_var_tmp`` ``/var/tmp`` 1G +``lv_log`` ``/var/log`` 1G +``lv_audit`` ``/var/log/audit`` 128M +``lv_home`` ``/home`` 128M +============== ================== ========= + +A compatible LVM configuration is provided, and covered in :ref:`lvm`. +The Logical Volumes in the image are defined with small sizes, with the +intention that they will be grown after provisioning. + +For RedHat family distributions, Dracut modules are configured via +``stackhpc_overcloud_dib_dracut_enabled_modules_default_config``. 
diff --git a/doc/source/configuration/index.rst b/doc/source/configuration/index.rst index 74d145e29..cd0a784be 100644 --- a/doc/source/configuration/index.rst +++ b/doc/source/configuration/index.rst @@ -10,5 +10,6 @@ the various features provided. walled-garden release-train + host-images cephadm monitoring From 8386215d441aff034e81b001b429aaddfe018ecf Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 19 Dec 2022 12:23:24 +0000 Subject: [PATCH 20/28] docs: LVM --- doc/source/configuration/index.rst | 1 + doc/source/configuration/lvm.rst | 98 ++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 doc/source/configuration/lvm.rst diff --git a/doc/source/configuration/index.rst b/doc/source/configuration/index.rst index cd0a784be..942591b2d 100644 --- a/doc/source/configuration/index.rst +++ b/doc/source/configuration/index.rst @@ -11,5 +11,6 @@ the various features provided. walled-garden release-train host-images + lvm cephadm monitoring diff --git a/doc/source/configuration/lvm.rst b/doc/source/configuration/lvm.rst new file mode 100644 index 000000000..65b892aa8 --- /dev/null +++ b/doc/source/configuration/lvm.rst @@ -0,0 +1,98 @@ +.. _lvm: + +=== +LVM +=== + +StackHPC Kayobe configuration provides Logical Volume Manager (LVM) +configuration that is compatible with the included :ref:`host-images` +configuration. The configuration uses the :kayobe-doc:`LVM +` host configuration functionality of +Kayobe. + +The LVM configuration is provided in +``etc/kayobe/inventory/group_vars/all/stackhpc/lvm``. This allows configuration +variables to be overridden on a per-group or per-host basis (which would not be +possible for an "extra variable" in ``etc/kayobe/*.yml``). This configuration +is not used by default, and must be actively opted into. This can be done as +follows: + +.. 
code-block:: yaml + + controller_lvm_groups: + - "{{ stackhpc_lvm_group_rootvg }}" + +This will configure the standard set of logical volumes for the ``rootvg`` +volume group on controller hosts. + +The disks in this volume group are configured via +``stackhpc_lvm_group_rootvg_disks``, and by default this contains a single +disk, matched by a partition label of ``root`` (as used in the standard +:ref:`host-images`). + +The size of each LV is configurable via the following variables: + +.. code-block:: yaml + + # StackHPC LVM lv_swap LV size. + stackhpc_lvm_lv_swap_size: 16g + + # StackHPC LVM lv_root LV size. + stackhpc_lvm_lv_root_size: 50g + + # StackHPC LVM lv_tmp LV size. + stackhpc_lvm_lv_tmp_size: 10g + + # StackHPC LVM lv_var LV size. + stackhpc_lvm_lv_var_size: 20g + + # StackHPC LVM lv_var_tmp LV size. + stackhpc_lvm_lv_var_tmp_size: 2g + + # StackHPC LVM lv_log LV size. + stackhpc_lvm_lv_log_size: 20g + + # StackHPC LVM lv_audit LV size. + stackhpc_lvm_lv_audit_size: 10g + + # StackHPC LVM lv_home LV size. + stackhpc_lvm_lv_home_size: 10g + +Additional LVs may be configured via ``stackhpc_lvm_group_rootvg_lvs_extra``. A +common requirement is to have ``/var/lib/docker/`` mounted on a separate LV, +so this has been made convenient to achieve: + +.. code-block:: yaml + + stackhpc_lvm_group_rootvg_lvs_extra: + - "{{ stackhpc_lvm_lv_docker }}" + + # StackHPC LVM lv_docker LV size. + stackhpc_lvm_lv_docker_size: 100%FREE + +It may be desirable to use a lower percentage of the free space, in case +another LV needs to be grown at a later date. + +Growroot playbook +================= + +A ``growroot.yml`` custom playbook is provided that can be used to grow the +partition and LVM Physical Volume (PV) of the root Volume Group (VG). This +allows for expansion of Logical Volumes (LVs) in that VG. + +The following variables may be used to configure the playbook: + +``growroot_group`` + Host pattern against which to target the playbook. Default is ``overcloud``. 
+``growroot_vg`` + Name of the VG containing the PV to grow. Default is ``rootvg`` to match the + standard :ref:`host image configuration `. + +This playbook may be used as a host configure pre hook, e.g. for overcloud +hosts: + +.. code-block:: console + + mkdir -p ${KAYOBE_CONFIG_PATH}/hooks/overcloud-host-configure/pre.d + cd ${KAYOBE_CONFIG_PATH}/hooks/overcloud-host-configure/pre.d + ln -s ../../../ansible/growroot.yml 30-growroot.yml From 31262bd3617f472211aeb102e66a402f01d556bc Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Tue, 20 Dec 2022 16:44:15 +0000 Subject: [PATCH 21/28] docs: swap --- doc/source/configuration/index.rst | 1 + doc/source/configuration/swap.rst | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 doc/source/configuration/swap.rst diff --git a/doc/source/configuration/index.rst b/doc/source/configuration/index.rst index 942591b2d..ebaab804e 100644 --- a/doc/source/configuration/index.rst +++ b/doc/source/configuration/index.rst @@ -12,5 +12,6 @@ the various features provided. release-train host-images lvm + swap cephadm monitoring diff --git a/doc/source/configuration/swap.rst b/doc/source/configuration/swap.rst new file mode 100644 index 000000000..20bae501b --- /dev/null +++ b/doc/source/configuration/swap.rst @@ -0,0 +1,23 @@ +==== +Swap +==== + +StackHPC Kayobe configuration provides a ``swap.yml`` custom playbook that may +be used to configure a swap device. + +The following variables may be used to configure the playbook: + +``swap_group`` + Host pattern against which to target the playbook. Default is ``overcloud``. +``swap_device`` + Name of the swap device to configure. Default is ``/dev/rootvg/lv_swap`` to + match the standard :ref:`host image configuration `. + +This playbook may be used as a host configure post hook, e.g. for overcloud +hosts: + +.. 
code-block:: console + + mkdir -p ${KAYOBE_CONFIG_PATH}/hooks/overcloud-host-configure/post.d + cd ${KAYOBE_CONFIG_PATH}/hooks/overcloud-host-configure/post.d + ln -s ../../../ansible/swap.yml 10-swap.yml From 65aabdec34a93176ef93a204c8d08849369b1e53 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Mon, 12 Dec 2022 16:02:28 +0000 Subject: [PATCH 22/28] docs: Current branch variable --- .../contributor/environments/ci-aio.rst | 22 +++++++++---------- .../contributor/environments/ci-builder.rst | 20 ++++++++--------- doc/source/usage.rst | 19 ++++++++-------- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/doc/source/contributor/environments/ci-aio.rst b/doc/source/contributor/environments/ci-aio.rst index b34bad4d6..7107da6f4 100644 --- a/doc/source/contributor/environments/ci-aio.rst +++ b/doc/source/contributor/environments/ci-aio.rst @@ -20,31 +20,31 @@ Install package dependencies. On CentOS: -.. code-block:: console +.. parsed-literal:: sudo dnf install -y python3-virtualenv On Ubuntu: -.. code-block:: console +.. parsed-literal:: sudo apt update sudo apt install -y python3-virtualenv Clone the Kayobe and Kayobe configuration repositories (this one): -.. code-block:: console +.. parsed-literal:: cd mkdir -p src pushd src - git clone https://github.com/stackhpc/kayobe.git -b stackhpc/xena - git clone https://github.com/stackhpc/stackhpc-kayobe-config -b stackhpc/xena kayobe-config + git clone https://github.com/stackhpc/kayobe.git -b |current_release_git_branch_name| + git clone https://github.com/stackhpc/stackhpc-kayobe-config -b |current_release_git_branch_name| kayobe-config popd Create a virtual environment and install Kayobe: -.. code-block:: console +.. parsed-literal:: cd mkdir -p venvs @@ -57,7 +57,7 @@ Create a virtual environment and install Kayobe: Add initial network configuration: -.. code-block:: console +.. 
parsed-literal:: sudo ip l add breth1 type bridge sudo ip l set breth1 up @@ -75,7 +75,7 @@ Acquire the Ansible Vault password for this repository, and store a copy at The following commands install Kayobe and its dependencies, and prepare the Ansible control host. -.. code-block:: console +.. parsed-literal:: export KAYOBE_VAULT_PASSWORD=$(cat ~/vault-pw) pushd ~/venvs/kayobe @@ -90,13 +90,13 @@ Deployment Next, configure the host OS & services. -.. code-block:: console +.. parsed-literal:: kayobe overcloud host configure Finally, deploy the overcloud services. -.. code-block:: console +.. parsed-literal:: kayobe overcloud service deploy @@ -107,7 +107,7 @@ Testing Run a smoke test: -.. code-block:: console +.. parsed-literal:: cd ~/kayobe ./dev/overcloud-test-vm.sh diff --git a/doc/source/contributor/environments/ci-builder.rst b/doc/source/contributor/environments/ci-builder.rst index 025c6bb65..6cd4878c3 100644 --- a/doc/source/contributor/environments/ci-builder.rst +++ b/doc/source/contributor/environments/ci-builder.rst @@ -21,31 +21,31 @@ Install package dependencies. On CentOS: -.. code-block:: console +.. parsed-literal:: sudo dnf install -y python3-virtualenv On Ubuntu: -.. code-block:: console +.. parsed-literal:: sudo apt update sudo apt install -y python3-virtualenv Clone the Kayobe and Kayobe configuration repositories (this one): -.. code-block:: console +.. parsed-literal:: cd mkdir -p src pushd src - git clone https://github.com/stackhpc/kayobe.git -b stackhpc/xena - git clone https://github.com/stackhpc/stackhpc-kayobe-config -b stackhpc/xena kayobe-config + git clone https://github.com/stackhpc/kayobe.git -b |current_release_git_branch_name| + git clone https://github.com/stackhpc/stackhpc-kayobe-config -b |current_release_git_branch_name| kayobe-config popd Create a virtual environment and install Kayobe: -.. code-block:: console +.. 
parsed-literal:: cd mkdir -p venvs @@ -58,7 +58,7 @@ Create a virtual environment and install Kayobe: Add initial network configuration: -.. code-block:: console +.. parsed-literal:: sudo ip l add breth1 type bridge sudo ip l set breth1 up @@ -76,7 +76,7 @@ Acquire the Ansible Vault password for this repository, and store a copy at The following commands install Kayobe and its dependencies, and prepare the Ansible control host. -.. code-block:: console +.. parsed-literal:: export KAYOBE_VAULT_PASSWORD=$(cat ~/vault-pw) pushd ~/venvs/kayobe @@ -91,7 +91,7 @@ Deployment Next, configure the host OS & services. -.. code-block:: console +.. parsed-literal:: kayobe seed host configure @@ -100,7 +100,7 @@ Building images At this point you are ready to build and push some container images. -.. code-block:: console +.. parsed-literal:: kayobe seed container image build --push kayobe overcloud container image build --push diff --git a/doc/source/usage.rst b/doc/source/usage.rst index 867d2983c..8be28f9eb 100644 --- a/doc/source/usage.rst +++ b/doc/source/usage.rst @@ -23,35 +23,36 @@ New deployments --------------- If starting a new deployment, clone this repository as the starting point for -your configuration. +your configuration: -.. code-block:: console +.. parsed-literal:: - git clone https://github.com/stackhpc/stackhpc-kayobe-config -b stackhpc/xena + git clone https://github.com/stackhpc/stackhpc-kayobe-config -b |current_release_git_branch_name| Existing deployments -------------------- If migrating an existing deployment to StackHPC Kayobe configuration, you will -need to merge the changes in this repository into your repository. +need to merge the changes in this repository into your repository: -.. code-block:: console +.. 
parsed-literal:: git remote add stackhpc https://github.com/stackhpc/stackhpc-kayobe-config git fetch stackhpc - git merge stackhpc/stackhpc/xena + git merge stackhpc/|current_release_git_branch_name| Updating -------- This base configuration will be updated over time, to update repository versions, container image tags, and other configuration. Deployments may -consume these updates by merging in the changes with their local configuration. +consume these updates by merging in the changes with their local +configuration: -.. code-block:: console +.. parsed-literal:: git fetch stackhpc - git merge stackhpc/stackhpc/xena + git merge stackhpc/|current_release_git_branch_name| The intention is to avoid merge conflicts where possible, but there may be cases where this is difficult. We are open to discussion on how best to From 19b2e1aef6c6ac35bf55062cfda4917a1f481ff1 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 21 Dec 2022 10:49:26 +0000 Subject: [PATCH 23/28] docs: update ci-aio & ci-builder prerequisites --- doc/source/contributor/environments/ci-aio.rst | 2 +- doc/source/contributor/environments/ci-builder.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/contributor/environments/ci-aio.rst b/doc/source/contributor/environments/ci-aio.rst index 7107da6f4..de1d2d8fb 100644 --- a/doc/source/contributor/environments/ci-aio.rst +++ b/doc/source/contributor/environments/ci-aio.rst @@ -9,7 +9,7 @@ Prerequisites ============= * a CentOS Stream 8 or Ubuntu Focal 20.04 host -* access to the local Pulp server +* access to the Test Pulp server on SMS lab Setup ===== diff --git a/doc/source/contributor/environments/ci-builder.rst b/doc/source/contributor/environments/ci-builder.rst index 6cd4878c3..17b73ad1c 100644 --- a/doc/source/contributor/environments/ci-builder.rst +++ b/doc/source/contributor/environments/ci-builder.rst @@ -10,7 +10,7 @@ Prerequisites ============= * a CentOS Stream 8 or Ubuntu Focal 20.04 host -* access to the 
local Pulp server +* access to the Test Pulp server on SMS lab Setup ===== From ddc4bb0a2e71fbd3177a0ece319513b254261227 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 21 Dec 2022 13:48:01 +0000 Subject: [PATCH 24/28] README: link to rtd.io --- README.rst | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index c441e9eda..b64ef97c1 100644 --- a/README.rst +++ b/README.rst @@ -5,14 +5,9 @@ StackHPC Kayobe Configuration This repository provides a base Kayobe configuration for the Xena release of StackHPC OpenStack. -Documentation for this project is provided in the ``doc`` directory. To build -it, run: - -.. code-block:: console - - tox -e docs - -The HTML will be written to ``doc/build``. +Documentation is hosted on `readthedocs.io +`__, +and includes release notes. Resources ========= From 546c95abf6804c76de9df438bbcbbc8fc4a51b8a Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 21 Dec 2022 13:48:41 +0000 Subject: [PATCH 25/28] docs: improve release train docs --- doc/source/_static/images/release-train.svg | 1 + doc/source/configuration/release-train.rst | 117 ++++++++++++++++++-- 2 files changed, 106 insertions(+), 12 deletions(-) create mode 100644 doc/source/_static/images/release-train.svg diff --git a/doc/source/_static/images/release-train.svg b/doc/source/_static/images/release-train.svg new file mode 100644 index 000000000..aaf2f2303 --- /dev/null +++ b/doc/source/_static/images/release-train.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/source/configuration/release-train.rst b/doc/source/configuration/release-train.rst index 6e841b8f1..af1e7b3f9 100644 --- a/doc/source/configuration/release-train.rst +++ b/doc/source/configuration/release-train.rst @@ -3,33 +3,52 @@ StackHPC Release Train ====================== StackHPC provides packages and container images for OpenStack via `Ark -`__. +`__. 
These artifacts are built and released using a +process known as the `Release Train +`__. Deployments should use a local `Pulp `__ repository -server to synchronise content from Ark and serve it locally. Access to the -repositories on Ark is controlled via user accounts issued by StackHPC. +server to synchronise content from Ark and serve it locally. This reduces +Internet bandwidth requirements for package and container downloads. Content is +synced on demand from Ark to the local Pulp, meaning that the local Pulp acts +like a pull-through cache. -This configuration is a base, and should be merged with any existing Kayobe -configuration. It currently provides the following: +Access to the repositories on Ark is controlled via user accounts issued by +StackHPC. -* Configuration to deploy a local Pulp service as a container on the seed -* Pulp repository definitions for CentOS Stream 8 -* Playbooks to synchronise a local Pulp service with Ark -* Configuration to use the local Pulp repository mirrors on control plane hosts -* Configuration to use the local Pulp container registry on control plane hosts +.. image:: /_static/images/release-train.svg + :width: 75% + +All content on Ark is versioned, meaning that a deployment may continue to use +older package repository snapshots and container images when newer content is +released. This allows for improved reliability & repeatability of deployments. This configuration defines two `Pulp distributions `__ for packages, ``development`` and ``production``. This allows packages to be updated and tested in a development or staging environment before rolling them -out to production. +out to production. Typically a given environment will always use the same +distribution, meaning that package repository configuration files do not need +to be updated on control plane hosts in order to consume a package update. 
Configuration ============= +This configuration provides the following: + +* Configuration to deploy a local Pulp service as a container on the seed +* Pulp repository definitions for CentOS Stream 8 and Rocky Linux 8 +* Playbooks to synchronise a local Pulp service with Ark +* Configuration to use the local Pulp repository mirrors on control plane hosts +* Configuration to use the local Pulp container registry on control plane hosts + Local Pulp server ----------------- +The Pulp container is deployed on the seed by default, but may be disabled by +setting ``seed_pulp_container_enabled`` to ``false`` in +``etc/kayobe/seed.yml``. + The URL and credentials of the local Pulp server are configured in ``etc/kayobe/pulp.yml`` via ``pulp_url``, ``pulp_username`` and ``pulp_password``. In most cases, the default values should be sufficient. @@ -38,6 +57,9 @@ An admin password must be generated and set as the value of a ``etc/kayobe/secrets.yml`` file. This password will be automatically set on Pulp startup. +If a proxy is required to access the Internet from the seed, ``pulp_proxy_url`` +may be used. + StackHPC Ark ------------ @@ -49,11 +71,49 @@ The Ark pulp credentials issued by StackHPC should be configured in stackhpc_release_pulp_username: stackhpc_release_pulp_password: +Package repositories +-------------------- + +Currently, Ark does not provide package repositories for Ubuntu - only +container images. For this reason, ``stackhpc_pulp_sync_ubuntu_focal`` in +``etc/kayobe/pulp.yml`` is set to ``false`` by default. + +CentOS Stream 8 and Rocky Linux 8 package repositories are synced based on the +value of ``os_distribution``. If you need to sync multiple distributions, +``stackhpc_pulp_sync_centos_stream8`` and ``stackhpc_pulp_sync_rocky_8`` in +``etc/kayobe/pulp.yml`` may be set to ``true``. + +On Ark, each package repository provides versioned snapshots using a datetime +stamp (e.g. ``20220817T082321``). 
The current set of tested versions is defined +in ``etc/kayobe/pulp-repo-versions.yml``. This file is managed by the StackHPC +Release Train and should generally not be modified by consumers of this +repository. + +Package managers +---------------- + +No configuration is provided for APT, since Ark does not currently provide +package repositories for Ubuntu - only container images. + +For CentOS and Rocky Linux based systems, package manager configuration is +provided by ``stackhpc_dnf_repos`` in ``etc/kayobe/dnf.yml``, which points to +package repositories on the local Pulp server. To use this configuration, the +``dnf_custom_repos`` variable must be set, and this is done for hosts in the +``overcloud`` group via the group_vars file +``etc/kayobe/inventory/group_vars/overcloud/stackhpc-dnf-repos``. Similar +configuration may be added for other groups; however, there may be ordering +issues during initial deployment when Pulp has not yet been deployed. The distribution name for the environment should be configured as either ``development`` or ``production`` via ``stackhpc_repo_distribution`` in ``etc/kayobe/stackhpc.yml``. +Ceph container images +--------------------- + +By default, Ceph images are not synced from quay.io to the local Pulp. To sync +these images, set ``stackhpc_sync_ceph_images`` to ``true``. + Usage ===== @@ -91,6 +151,39 @@ See the Kayobe :kayobe-doc:`custom playbook documentation local Pulp. This will make synchronised container images available to cloud nodes. +Syncing content +--------------- + +A typical workflow to sync all packages and containers is as follows: + +..
code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-repo-sync.yml + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-repo-publish.yml + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-container-sync.yml + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-container-publish.yml + +Once the content has been tested in a test/staging environment, it may be +promoted to production: + +.. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-repo-promote-production.yml + +Initial seed deployment +----------------------- + +During the initial seed deployment, there is an ordering issue: the Bifrost +container image will not yet have been synced, because the local Pulp container +has not yet been deployed. This can be avoided with the following workflow: + +.. code-block:: console + + kayobe seed service deploy --tags seed-deploy-containers --kolla-tags none + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-container-sync.yml + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-container-publish.yml + kayobe seed service deploy + Working with pulp ================= @@ -149,7 +242,7 @@ with the push repository using the pulp CLI: .Done.
HTTP Error 404: Not Found -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~ If your login credentials are incorrect, or lack the required permissions, you will see a 404 error during ``pulp-repo-sync.yml``: From d73b5630534fe4cf6e64f6d36323aa6ba300ffd2 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 21 Dec 2022 13:52:30 +0000 Subject: [PATCH 26/28] docs: add info on how to build docs --- doc/source/contributor/documentation.rst | 18 ++++++++++++++++++ doc/source/contributor/index.rst | 1 + 2 files changed, 19 insertions(+) create mode 100644 doc/source/contributor/documentation.rst diff --git a/doc/source/contributor/documentation.rst b/doc/source/contributor/documentation.rst new file mode 100644 index 000000000..99accac42 --- /dev/null +++ b/doc/source/contributor/documentation.rst @@ -0,0 +1,18 @@ +============= +Documentation +============= + +This documentation is provided in the ``doc`` directory. To build it, run: + +.. code-block:: console + + tox -e docs + +The HTML will be written to ``doc/build``, and may be viewed in a web browser. + +Documentation is hosted on readthedocs.io. Multiple maintainers can be added +for the project. GitHub integration ensures that each version is built when the +corresponding branch is pushed to. + +Each release is hosted as a separate version, and new versions will need to be +added manually. diff --git a/doc/source/contributor/index.rst b/doc/source/contributor/index.rst index d80f0c4ce..9d05da2e2 100644 --- a/doc/source/contributor/index.rst +++ b/doc/source/contributor/index.rst @@ -7,4 +7,5 @@ This guide is for contributors of the StackHPC Kayobe configuration project. .. 
toctree:: :maxdepth: 1 + documentation environments From 0e6c6d6d9300374180c751e0e2740373922c9247 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 21 Dec 2022 13:58:58 +0000 Subject: [PATCH 27/28] docs: move environments index --- .../{environments.rst => environments/index.rst} | 6 +++--- doc/source/contributor/index.rst | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) rename doc/source/contributor/{environments.rst => environments/index.rst} (62%) diff --git a/doc/source/contributor/environments.rst b/doc/source/contributor/environments/index.rst similarity index 62% rename from doc/source/contributor/environments.rst rename to doc/source/contributor/environments/index.rst index 9d99d7a79..7192cae6a 100644 --- a/doc/source/contributor/environments.rst +++ b/doc/source/contributor/environments/index.rst @@ -6,6 +6,6 @@ The following Kayobe environments are provided with this configuration: .. toctree:: :maxdepth: 1 - environments/ci-aio - environments/ci-builder - environments/ci-multinode + ci-aio + ci-builder + ci-multinode diff --git a/doc/source/contributor/index.rst b/doc/source/contributor/index.rst index 9d05da2e2..b6fcb982a 100644 --- a/doc/source/contributor/index.rst +++ b/doc/source/contributor/index.rst @@ -8,4 +8,4 @@ This guide is for contributors of the StackHPC Kayobe configuration project. 
:maxdepth: 1 documentation - environments + environments/index From 0ef2a51f0320fff36aa58671fc32f09b9c12d81a Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Wed, 21 Dec 2022 14:07:15 +0000 Subject: [PATCH 28/28] docs: add info on generating release notes --- doc/source/contributor/documentation.rst | 2 ++ doc/source/contributor/index.rst | 1 + doc/source/contributor/release-notes.rst | 38 ++++++++++++++++++++++++ 3 files changed, 41 insertions(+) create mode 100644 doc/source/contributor/release-notes.rst diff --git a/doc/source/contributor/documentation.rst b/doc/source/contributor/documentation.rst index 99accac42..b8a263f6c 100644 --- a/doc/source/contributor/documentation.rst +++ b/doc/source/contributor/documentation.rst @@ -1,3 +1,5 @@ +.. _documentation: + ============= Documentation ============= diff --git a/doc/source/contributor/index.rst b/doc/source/contributor/index.rst index b6fcb982a..1e1bdd2d6 100644 --- a/doc/source/contributor/index.rst +++ b/doc/source/contributor/index.rst @@ -8,4 +8,5 @@ This guide is for contributors of the StackHPC Kayobe configuration project. 
:maxdepth: 1 documentation + release-notes environments/index diff --git a/doc/source/contributor/release-notes.rst b/doc/source/contributor/release-notes.rst new file mode 100644 index 000000000..8deb85730 --- /dev/null +++ b/doc/source/contributor/release-notes.rst @@ -0,0 +1,38 @@ +============= +Release notes +============= + +StackHPC Kayobe configuration uses the following release notes sections: + +- ``features`` --- for new features or functionality; these should ideally + refer to the blueprint being implemented; +- ``fixes`` --- for fixes closing bugs; these must refer to the bug being + closed; +- ``upgrade`` --- for notes relevant when upgrading from a previous version; + these should ideally be added only between major versions; required when + the proposed change affects behaviour in a non-backwards compatible way or + generally changes something impactful; +- ``deprecations`` --- to track deprecated features; relevant changes may + consist of only the commit message and the release note; +- ``prelude`` --- filled in by the PTL before each release or RC. + +Other release note types may be applied per common sense. +Each change should include a release note unless it is a ``TrivialFix`` +change or affects only docs or CI. Such changes should *not* include +a release note to avoid confusion. +Remember release notes are mostly for end users who, in the case of StackHPC +Kayobe configuration, are OpenStack administrators/operators. + +To add a release note, install the ``reno`` package in a Python virtual +environment, then run the following command: + +.. code-block:: console + + reno new <summary-line-with-dashes> + +Release notes for the current release are included in the :ref:`documentation`. +Note that a note won't be included in the generated documentation until it is +tracked by ``git``. + +All release notes can be inspected by browsing the ``releasenotes/notes`` +directory.