From 39fa6433b4eee5a75beabd6b993913a53d1394d6 Mon Sep 17 00:00:00 2001 From: Alex-Welsh Date: Mon, 15 Apr 2024 10:57:31 +0100 Subject: [PATCH 1/5] Fix tox whitespace warning --- etc/kayobe/kolla.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml index 110ee750f..b4dc649a5 100644 --- a/etc/kayobe/kolla.yml +++ b/etc/kayobe/kolla.yml @@ -369,7 +369,7 @@ kolla_build_customizations: "{{ kolla_build_customizations_common | combine(koll # Dict mapping Kolla Dockerfile ARG names to their values. kolla_build_args: - node_exporter_version: "1.5.0" # kolla has 1.4.0 + node_exporter_version: "1.5.0" # kolla has 1.4.0 node_exporter_sha256sum: "af999fd31ab54ed3a34b9f0b10c28e9acee9ef5ac5a5d5edfdde85437db7acbb" ############################################################################### From c2bd71ba51c4f73ce83e1f0bd12dba7c6060cca6 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Thu, 16 Nov 2023 11:42:17 +0000 Subject: [PATCH 2/5] Add retries to overcloud host image pulp tasks Retries have been added to the stackhpc.pulp collection to improve reliability. Adding the same here. 
--- .../ansible/pulp-host-image-promote.yml | 7 +++++ etc/kayobe/ansible/pulp-host-image-upload.yml | 26 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/etc/kayobe/ansible/pulp-host-image-promote.yml b/etc/kayobe/ansible/pulp-host-image-promote.yml index d93d71d51..42f98b423 100644 --- a/etc/kayobe/ansible/pulp-host-image-promote.yml +++ b/etc/kayobe/ansible/pulp-host-image-promote.yml @@ -19,6 +19,9 @@ name: "{{ repository_name }}_{{ promotion_tag }}" base_path: "{{ base_path }}/{{ promotion_tag }}" register: distribution_details + until: distribution_details is success + retries: 3 + delay: 5 - name: Fail if the image does not exist fail: @@ -34,6 +37,10 @@ base_path: "{{ base_path }}/{{ promotion_tag }}" content_guard: release state: present + register: content_guard_result + until: content_guard_result is success + retries: 3 + delay: 5 - name: Print version tag and os debug: diff --git a/etc/kayobe/ansible/pulp-host-image-upload.yml b/etc/kayobe/ansible/pulp-host-image-upload.yml index d3a44f133..cc4876080 100644 --- a/etc/kayobe/ansible/pulp-host-image-upload.yml +++ b/etc/kayobe/ansible/pulp-host-image-upload.yml @@ -25,6 +25,10 @@ password: "{{ remote_pulp_password }}" file: "{{ found_files.files[0].path }}" state: present + register: upload_result + until: upload_result is success + retries: 3 + delay: 60 - name: Get sha256 hash ansible.builtin.stat: @@ -40,6 +44,10 @@ sha256: "{{ file_stats.stat.checksum }}" relative_path: "{{ found_files.files[0].path | basename }}" state: present + register: file_content_result + until: file_content_result is success + retries: 3 + delay: 5 - name: Ensure file repo exists pulp.squeezer.file_repository: @@ -48,6 +56,10 @@ password: "{{ remote_pulp_password }}" name: "{{ repository_name }}" state: present + register: file_repo_result + until: file_repo_result is success + retries: 3 + delay: 5 - name: Add content to file repo pulp.squeezer.file_repository_content: @@ -58,6 +70,10 @@ present_content: - 
relative_path: "{{ found_files.files[0].path | basename }}" sha256: "{{ file_stats.stat.checksum }}" + register: file_repo_content_result + until: file_repo_content_result is success + retries: 3 + delay: 5 - name: Create a new publication to point to this version pulp.squeezer.file_publication: @@ -67,6 +83,9 @@ repository: "{{ repository_name }}" state: present register: publication_details + until: publication_details is success + retries: 3 + delay: 5 - name: Update distribution for latest version pulp.squeezer.file_distribution: @@ -79,6 +98,9 @@ content_guard: development state: present register: latest_distribution_details + until: latest_distribution_details is success + retries: 3 + delay: 5 - name: Create distribution for given version pulp.squeezer.file_distribution: @@ -91,6 +113,10 @@ content_guard: development state: present when: latest_distribution_details.changed + register: distribution_result + until: distribution_result is success + retries: 3 + delay: 5 - name: Update new images file with versioned path lineinfile: From c2edb3825a467cbad6e9713c17ec67f185b286bd Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Tue, 16 Apr 2024 13:37:55 +0200 Subject: [PATCH 3/5] Raise alert on degraded network bonds This will raise an alert when at least one of the bond members is down. Adapted from awesome-prometheus-alerts [1]. 
[1] https://samber.github.io/awesome-prometheus-alerts/rules.html#rule-host-and-hardware-1-34 --- etc/kayobe/kolla/config/prometheus/system.rules | 9 +++++++++ .../network-bond-degraded-alert-d2a0b05002609ac1.yaml | 5 +++++ 2 files changed, 14 insertions(+) create mode 100644 releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules index c82bed16e..6ee3eed3c 100644 --- a/etc/kayobe/kolla/config/prometheus/system.rules +++ b/etc/kayobe/kolla/config/prometheus/system.rules @@ -96,6 +96,15 @@ groups: summary: Host clock not synchronising (instance {{ $labels.instance }}) description: "Clock not synchronising. Ensure NTP is configured on this host." + - alert: HostNetworkBondDegraded + expr: (node_bonding_active - node_bonding_slaves) != 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host network bond degraded (instance {{ $labels.instance }}) + description: "Bond {{ $labels.master }} degraded on {{ $labels.instance }}" + - alert: HostConntrackLimit expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 for: 5m diff --git a/releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml b/releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml new file mode 100644 index 000000000..c987c7959 --- /dev/null +++ b/releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + Adds a new Prometheus alert ``HostNetworkBondDegraded`` which will be + raised when at least one bond member is down. From 6cf594d304cd1936d99a5c0ada70d4cd6861a4b1 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Tue, 23 Apr 2024 11:07:18 +0200 Subject: [PATCH 4/5] Add alert to detect bonds with a single link This change adds a new Prometheus alert HostNetworkBondSingleLink which will be raised when a bond is configured with only one member. 
This can happen when NetworkManager detects that a bond member is down at boot time. This would fail to be detected by the HostNetworkBondDegraded alert. --- etc/kayobe/kolla/config/prometheus/system.rules | 15 +++++++++++++++ etc/kayobe/stackhpc-monitoring.yml | 4 ++++ ...network-bond-single-link-766adf41a3c2fd4e.yaml | 8 ++++++++ 3 files changed, 27 insertions(+) create mode 100644 releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules index 6ee3eed3c..613368be6 100644 --- a/etc/kayobe/kolla/config/prometheus/system.rules +++ b/etc/kayobe/kolla/config/prometheus/system.rules @@ -104,7 +104,22 @@ groups: annotations: summary: Host network bond degraded (instance {{ $labels.instance }}) description: "Bond {{ $labels.master }} degraded on {{ $labels.instance }}" +{% endraw %} +{% if alertmanager_warn_network_bond_single_link | bool %} +{% raw %} + - alert: HostNetworkBondSingleLink + expr: node_bonding_slaves == 1 + for: 2m + labels: + severity: warning + annotations: + summary: Host network bond with a single link (instance {{ $labels.instance }}) + description: "Bond {{ $labels.master }} configured with a single link on {{ $labels.instance }}" +{% endraw %} +{% endif %} + +{% raw %} - alert: HostConntrackLimit expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 for: 5m diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index f08e552c3..e8e0bb91f 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -8,6 +8,10 @@ # of free memory is lower than this value an alert will be triggered. alertmanager_low_memory_threshold_gib: 5 +# Whether to raise an alert if any network bond is configured with a single +# link. Change to false to disable this alert. 
+alertmanager_warn_network_bond_single_link: true + ############################################################################### # Exporter configuration diff --git a/releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml b/releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml new file mode 100644 index 000000000..66d66f40b --- /dev/null +++ b/releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Adds a new Prometheus alert ``HostNetworkBondSingleLink`` which will be + raised when a bond is configured with only one member. This can happen when + NetworkManager detects that a bond member is down at boot time. This alert + can be disabled by setting ``alertmanager_warn_network_bond_single_link`` + to ``false``. From 2d8c15b78f723c4874c304acbdca5b2d809c409e Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Thu, 25 Apr 2024 10:01:22 +0100 Subject: [PATCH 5/5] Correct backup for seed images in RL9 migration Current instructions have a recursive copy: ``cp: cannot copy a directory, '/var/lib/libvirt/images', into itself, '/var/lib/libvirt/images/backup/images'`` --- doc/source/operations/rocky-linux-9.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/operations/rocky-linux-9.rst b/doc/source/operations/rocky-linux-9.rst index 22b9323ae..99c827a87 100644 --- a/doc/source/operations/rocky-linux-9.rst +++ b/doc/source/operations/rocky-linux-9.rst @@ -745,8 +745,8 @@ Full procedure .. code:: console - sudo mkdir /var/lib/libvirt/images/backup - sudo cp -r /var/lib/libvirt/images /var/lib/libvirt/images/backup + sudo mkdir /var/lib/libvirt/images-backup + sudo cp -r /var/lib/libvirt/images /var/lib/libvirt/images-backup 9. Delete the seed root volume (check the structure & naming conventions first)