diff --git a/etc/kayobe/ansible/pulp-host-image-promote.yml b/etc/kayobe/ansible/pulp-host-image-promote.yml index d93d71d51..42f98b423 100644 --- a/etc/kayobe/ansible/pulp-host-image-promote.yml +++ b/etc/kayobe/ansible/pulp-host-image-promote.yml @@ -19,6 +19,9 @@ name: "{{ repository_name }}_{{ promotion_tag }}" base_path: "{{ base_path }}/{{ promotion_tag }}" register: distribution_details + until: distribution_details is success + retries: 3 + delay: 5 - name: Fail if the image does not exist fail: @@ -34,6 +37,10 @@ base_path: "{{ base_path }}/{{ promotion_tag }}" content_guard: release state: present + register: content_guard_result + until: content_guard_result is success + retries: 3 + delay: 5 - name: Print version tag and os debug: diff --git a/etc/kayobe/ansible/pulp-host-image-upload.yml b/etc/kayobe/ansible/pulp-host-image-upload.yml index d3a44f133..cc4876080 100644 --- a/etc/kayobe/ansible/pulp-host-image-upload.yml +++ b/etc/kayobe/ansible/pulp-host-image-upload.yml @@ -25,6 +25,10 @@ password: "{{ remote_pulp_password }}" file: "{{ found_files.files[0].path }}" state: present + register: upload_result + until: upload_result is success + retries: 3 + delay: 60 - name: Get sha256 hash ansible.builtin.stat: @@ -40,6 +44,10 @@ sha256: "{{ file_stats.stat.checksum }}" relative_path: "{{ found_files.files[0].path | basename }}" state: present + register: file_content_result + until: file_content_result is success + retries: 3 + delay: 5 - name: Ensure file repo exists pulp.squeezer.file_repository: @@ -48,6 +56,10 @@ password: "{{ remote_pulp_password }}" name: "{{ repository_name }}" state: present + register: file_repo_result + until: file_repo_result is success + retries: 3 + delay: 5 - name: Add content to file repo pulp.squeezer.file_repository_content: @@ -58,6 +70,10 @@ present_content: - relative_path: "{{ found_files.files[0].path | basename }}" sha256: "{{ file_stats.stat.checksum }}" + register: file_repo_content_result + until: file_repo_content_result is success + retries: 3 + delay: 5 - name: Create a new publication to point to this version pulp.squeezer.file_publication: @@ -67,6 +83,9 @@ repository: "{{ repository_name }}" state: present register: publication_details + until: publication_details is success + retries: 3 + delay: 5 - name: Update distribution for latest version pulp.squeezer.file_distribution: @@ -79,6 +98,9 @@ content_guard: development state: present register: latest_distribution_details + until: latest_distribution_details is success + retries: 3 + delay: 5 - name: Create distribution for given version pulp.squeezer.file_distribution: @@ -91,6 +113,10 @@ content_guard: development state: present when: latest_distribution_details.changed + register: distribution_result + until: distribution_result is success + retries: 3 + delay: 5 - name: Update new images file with versioned path lineinfile: diff --git a/etc/kayobe/kolla.yml b/etc/kayobe/kolla.yml index adf3081cf..9479ba6c1 100644 --- a/etc/kayobe/kolla.yml +++ b/etc/kayobe/kolla.yml @@ -367,7 +367,7 @@ kolla_build_customizations: "{{ kolla_build_customizations_common | combine(koll # Dict mapping Kolla Dockerfile ARG names to their values. kolla_build_args: - node_exporter_version: "1.5.0" # kolla has 1.4.0 + node_exporter_version: "1.5.0" # kolla has 1.4.0 node_exporter_sha256sum: "af999fd31ab54ed3a34b9f0b10c28e9acee9ef5ac5a5d5edfdde85437db7acbb" ############################################################################### diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules index c2caa9898..b7c757a56 100644 --- a/etc/kayobe/kolla/config/prometheus/system.rules +++ b/etc/kayobe/kolla/config/prometheus/system.rules @@ -96,6 +96,30 @@ groups: summary: Host clock not synchronising (instance {{ $labels.instance }}) description: "Clock not synchronising. Ensure NTP is configured on this host." + - alert: HostNetworkBondDegraded + expr: (node_bonding_active - node_bonding_slaves) != 0 + for: 2m + labels: + severity: warning + annotations: + summary: Host network bond degraded (instance {{ $labels.instance }}) + description: "Bond {{ $labels.master }} degraded on {{ $labels.instance }}" +{% endraw %} + +{% if alertmanager_warn_network_bond_single_link | bool %} +{% raw %} + - alert: HostNetworkBondSingleLink + expr: node_bonding_slaves == 1 + for: 2m + labels: + severity: warning + annotations: + summary: Host network bond with a single link (instance {{ $labels.instance }}) + description: "Bond {{ $labels.master }} configured with a single link on {{ $labels.instance }}" +{% endraw %} +{% endif %} + +{% raw %} - alert: HostConntrackLimit expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 for: 5m diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index f08e552c3..e8e0bb91f 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -8,6 +8,10 @@ # of free memory is lower than this value an alert will be triggered. alertmanager_low_memory_threshold_gib: 5 +# Whether to raise an alert if any network bond is configured with a single +# link. Change to false to disable this alert. +alertmanager_warn_network_bond_single_link: true + ############################################################################### # Exporter configuration diff --git a/releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml b/releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml new file mode 100644 index 000000000..c987c7959 --- /dev/null +++ b/releasenotes/notes/network-bond-degraded-alert-d2a0b05002609ac1.yaml @@ -0,0 +1,5 @@ +--- +features: + - | + Adds a new Prometheus alert ``HostNetworkBondDegraded`` which will be + raised when at least one bond member is down. diff --git a/releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml b/releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml new file mode 100644 index 000000000..66d66f40b --- /dev/null +++ b/releasenotes/notes/network-bond-single-link-766adf41a3c2fd4e.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Adds a new Prometheus alert ``HostNetworkBondSingleLink`` which will be + raised when a bond is configured with only one member. This can happen when + NetworkManager detects that a bond member is down at boot time. This alert + can be disabled by setting ``alertmanager_warn_network_bond_single_link`` + to ``false``.