diff --git a/etc/kayobe/ansible/advise-run.yml b/etc/kayobe/ansible/advise-run.yml
new file mode 100644
index 000000000..d0ad2eee1
--- /dev/null
+++ b/etc/kayobe/ansible/advise-run.yml
@@ -0,0 +1,48 @@
+---
+# Extract the JSON files in $PWD/overcloud-introspection-data with m2-extract,
+# then run advise-process and advise-visualise with $PWD/review as output.
+- name: ADVise run
+  hosts: localhost
+  gather_facts: false
+  tags:
+    - advise
+  vars:
+    venv: "~/venvs/advise-review"
+    input_dir: "{{ lookup('env', 'PWD') }}/overcloud-introspection-data"
+    output_dir: "{{ lookup('env', 'PWD') }}/review"
+    advise_pattern: ".*.eval"  # Uses regex
+  tasks:
+    - name: Install dependencies
+      pip:
+        virtualenv: "{{ venv }}"
+        name:
+          - git+https://github.com/stackhpc/ADVise
+        state: latest
+
+    - name: Create data directory
+      file:
+        path: '{{ output_dir }}/data'
+        state: directory
+
+    - name: Extract data
+      shell:
+        cmd: >
+          {{ venv }}/bin/m2-extract {{ input_dir }}/*.json --output_dir {{ output_dir }}/data
+
+    - name: Create review directory
+      file:
+        path: '{{ output_dir }}/results'
+        state: directory
+
+    - name: Process data
+      shell:
+        cmd: >
+          {{ venv }}/bin/advise-process
+          -I ipmi
+          -p '{{ output_dir }}/data/extra-hardware/{{ advise_pattern }}'
+          -o '{{ output_dir }}'
+
+    - name: Visualise data
+      command: >
+        {{ venv }}/bin/advise-visualise
+        --output_dir '{{ output_dir }}'
diff --git a/etc/kayobe/ansible/rabbitmq-reset.yml b/etc/kayobe/ansible/rabbitmq-reset.yml
new file mode 100644
index 000000000..df6d0c4ca
--- /dev/null
+++ b/etc/kayobe/ansible/rabbitmq-reset.yml
@@ -0,0 +1,64 @@
+---
+# Reset a broken RabbitMQ cluster.
+# Also restarts OpenStack services which may be broken.
+
+- name: Reset RabbitMQ
+  hosts: controllers
+  become: true
+  gather_facts: false
+  tags:
+    - rabbitmq-reset
+  vars:
+    container_name: rabbitmq
+  tasks:
+    # The nested Jinja expressions emit literal braces, so the Go template
+    # is evaluated by Docker rather than by Ansible.
+    - name: Inspect the {{ container_name }} container
+      shell:
+        cmd: "docker container inspect --format '{{ '{{' }} .State.Running {{ '}}' }}' {{ container_name }}"
+      register: inspection
+      changed_when: false
+
+    - name: Ensure the {{ container_name }} container is running
+      command: "docker start {{ container_name }}"
+      when: inspection.stdout == 'false'
+
+    - name: Wait for the {{ container_name }} container to reach state 'Running'
+      shell:
+        cmd: "docker container inspect --format '{{ '{{' }} .State.Running {{ '}}' }}' {{ container_name }}"
+      register: result
+      until: result.stdout == 'true'
+      retries: 10
+      delay: 6
+      changed_when: false
+
+    # docker exec is run without -i/-t throughout: these commands are not
+    # interactive, and -t fails when Ansible has not allocated a TTY.
+    - name: Wait for the rabbitmq node to automatically start on container start
+      command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl wait /var/lib/rabbitmq/mnesia/rabbitmq.pid --timeout 60'"
+      when: inspection.stdout == 'false'
+
+    - name: Stop app
+      command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl stop_app'"
+
+    - name: Force reset app
+      command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl force_reset'"
+
+    - name: Start app
+      command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl start_app'"
+
+    - name: Wait for all nodes to join the cluster
+      command: "docker exec {{ container_name }} /bin/bash -c 'rabbitmqctl await_online_nodes {{ groups['controllers'] | length }}'"
+
+- name: Restart OpenStack services
+  hosts: controllers:compute
+  become: true
+  gather_facts: false
+  tags:
+    - restart-openstack
+  tasks:
+    # The following services can have problems if the cluster gets broken.
+    # xargs -r skips 'docker restart' when no container name matches.
+    - name: Restart OpenStack services
+      shell: >-
+        docker ps -a | grep -E '(cinder|heat|ironic|keystone|magnum|neutron|nova)' | awk '{ print $NF }' | xargs -r docker restart
diff --git a/etc/kayobe/environments/ci-aio/kolla/config/grafana b/etc/kayobe/environments/ci-aio/kolla/config/grafana
new file mode 120000
index 000000000..0e711c2ae
--- /dev/null
+++ b/etc/kayobe/environments/ci-aio/kolla/config/grafana
@@ -0,0 +1 @@
+../../../../kolla/config/grafana/
\ No newline at end of file
diff --git a/etc/kayobe/environments/ci-aio/kolla/config/prometheus b/etc/kayobe/environments/ci-aio/kolla/config/prometheus
new file mode 120000
index 000000000..9a40a2c64
--- /dev/null
+++ b/etc/kayobe/environments/ci-aio/kolla/config/prometheus
@@ -0,0 +1 @@
+../../../../kolla/config/prometheus/
\ No newline at end of file
diff --git a/etc/kayobe/environments/ci-aio/stackhpc-ci.yml b/etc/kayobe/environments/ci-aio/stackhpc-ci.yml
index 9238a290c..77e01909e 100644
--- a/etc/kayobe/environments/ci-aio/stackhpc-ci.yml
+++ b/etc/kayobe/environments/ci-aio/stackhpc-ci.yml
@@ -48,9 +48,16 @@ stackhpc_repo_ubuntu_cloud_archive_version: "{{ stackhpc_pulp_repo_ubuntu_cloud_
 stackhpc_repo_ubuntu_focal_version: "{{ stackhpc_pulp_repo_ubuntu_focal_version }}"
 stackhpc_repo_ubuntu_focal_security_version: "{{ stackhpc_pulp_repo_ubuntu_focal_security_version }}"
 stackhpc_repo_docker_ce_ubuntu_version: "{{ stackhpc_pulp_repo_docker_ce_ubuntu_version }}"
-stackhpc_repo_rocky_baseos_version: "{{ stackhpc_pulp_repo_rocky_8_6_baseos_version }}"
-stackhpc_repo_rocky_appstream_version: "{{ stackhpc_pulp_repo_rocky_8_6_appstream_version }}"
-stackhpc_repo_rocky_extras_version: "{{ stackhpc_pulp_repo_rocky_8_6_extras_version }}"
+## Use derived vars from etc/kayobe/pulp.yml to switch between
+## minor Rocky versions using stackhpc_pulp_repo_rocky_8_minor_version
+stackhpc_repo_rocky_baseos_version: "{{ stackhpc_pulp_repo_rocky_8_baseos_version }}"
+stackhpc_repo_rocky_appstream_version: "{{ stackhpc_pulp_repo_rocky_8_appstream_version }}"
+stackhpc_repo_rocky_extras_version: "{{ stackhpc_pulp_repo_rocky_8_extras_version }}"
+
+# Rocky-and-CI-specific Pulp URLs
+stackhpc_repo_rocky_baseos_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/rocky/8.{{ stackhpc_pulp_repo_rocky_8_minor_version }}/BaseOS/x86_64/os/{{ stackhpc_repo_rocky_baseos_version }}/"
+stackhpc_repo_rocky_appstream_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/rocky/8.{{ stackhpc_pulp_repo_rocky_8_minor_version }}/AppStream/x86_64/os/{{ stackhpc_repo_rocky_appstream_version }}/"
+stackhpc_repo_rocky_extras_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/rocky/8.{{ stackhpc_pulp_repo_rocky_8_minor_version }}/extras/x86_64/os/{{ stackhpc_repo_rocky_extras_version }}/"
 
 # Host and port of container registry.
 # Push built images to the development Pulp service registry.
diff --git a/etc/kayobe/environments/ci-multinode/kolla/config/grafana b/etc/kayobe/environments/ci-multinode/kolla/config/grafana
new file mode 120000
index 000000000..0e711c2ae
--- /dev/null
+++ b/etc/kayobe/environments/ci-multinode/kolla/config/grafana
@@ -0,0 +1 @@
+../../../../kolla/config/grafana/
\ No newline at end of file
diff --git a/etc/kayobe/environments/ci-multinode/kolla/config/prometheus b/etc/kayobe/environments/ci-multinode/kolla/config/prometheus
new file mode 120000
index 000000000..9a40a2c64
--- /dev/null
+++ b/etc/kayobe/environments/ci-multinode/kolla/config/prometheus
@@ -0,0 +1 @@
+../../../../kolla/config/prometheus/
\ No newline at end of file
diff --git a/etc/kayobe/kolla/config/prometheus/elasticsearch.rules b/etc/kayobe/kolla/config/prometheus/elasticsearch.rules
index 974bf4e99..42a196b9a 100644
--- a/etc/kayobe/kolla/config/prometheus/elasticsearch.rules
+++ b/etc/kayobe/kolla/config/prometheus/elasticsearch.rules
@@ -44,7 +44,7 @@ groups:
 
   - alert: ElasticsearchClusterRed
     expr: elasticsearch_cluster_health_status{color="red"} == 1
-    for: 0m
+    for: 5m
     labels:
       severity: critical
     annotations:
@@ -53,7 +53,7 @@ groups:
 
   - alert: ElasticsearchClusterYellow
     expr: elasticsearch_cluster_health_status{color="yellow"} == 1
-    for: 5m
+    for: 15m
     labels:
       severity: warning
     annotations:
@@ -80,7 +80,7 @@ groups:
 
   - alert: ElasticsearchUnassignedShards
     expr: elasticsearch_cluster_health_unassigned_shards > 0
-    for: 0m
+    for: 5m
     labels:
       severity: critical
     annotations:
diff --git a/etc/kayobe/kolla/config/prometheus/system.rules b/etc/kayobe/kolla/config/prometheus/system.rules
index fe3a2b9ac..ffc7d25a3 100644
--- a/etc/kayobe/kolla/config/prometheus/system.rules
+++ b/etc/kayobe/kolla/config/prometheus/system.rules
@@ -34,7 +34,7 @@ groups:
       description: "OOM kill detected"
 
   - alert: Overheating
-    expr: node_hwmon_temp_celsius >= 85
+    expr: node_hwmon_temp_celsius >= node_hwmon_temp_max_celsius
     for: 1m
     labels:
       severity: warning
diff --git a/etc/kayobe/pulp.yml b/etc/kayobe/pulp.yml
index 55d0e0384..ad475a56d 100644
--- a/etc/kayobe/pulp.yml
+++ b/etc/kayobe/pulp.yml
@@ -19,7 +19,7 @@ pulp_proxy_url: "{{ omit }}"
 # Base URL of the StackHPC Pulp service.
 stackhpc_release_pulp_url: "https://ark.stackhpc.com"
 
-# Credentials used to access the StackHPC Ark container image registry.
+# Credentials used to access the StackHPC Ark Pulp server.
 stackhpc_release_pulp_username:
 stackhpc_release_pulp_password:
 
@@ -248,7 +248,7 @@ stackhpc_pulp_repository_rpm_repos:
     state: present
     required: "{{ stackhpc_pulp_sync_rocky_8 | bool }}"
   - name: Rocky Linux 8 - PowerTools
-    url: "{{ stackhpc_release_pulp_content_url }}/rocky/8.{{ stackhpc_pulp_repo_rocky_8_minor_version }}/PowerTools/x86_64/os/{{ stackhpc_pulp_repo_rocky_8_6_powertools_version }}"
+    url: "{{ stackhpc_release_pulp_content_url }}/rocky/8.{{ stackhpc_pulp_repo_rocky_8_minor_version }}/PowerTools/x86_64/os/{{ stackhpc_pulp_repo_rocky_8_powertools_version }}"
     remote_username: "{{ stackhpc_release_pulp_username }}"
     remote_password: "{{ stackhpc_release_pulp_password }}"
     policy: on_demand
diff --git a/etc/kayobe/seed.yml b/etc/kayobe/seed.yml
index ce16140fa..33e2637ac 100644
--- a/etc/kayobe/seed.yml
+++ b/etc/kayobe/seed.yml
@@ -106,8 +106,11 @@ seed_pulp_container:
     image: pulp/pulp
     pre: "{{ kayobe_config_path }}/containers/pulp/pre.yml"
     post: "{{ kayobe_config_path }}/containers/pulp/post.yml"
-    tag: "3.16"
+    tag: "3.21"
     network_mode: host
+    # Override deploy_containers_defaults.init == true to ensure
+    # s6-overlay-suexec starts as pid 1
+    init: false
     volumes:
       - /opt/kayobe/containers/pulp:/etc/pulp
       - pulp_storage:/var/lib/pulp
diff --git a/etc/kayobe/stackhpc.yml b/etc/kayobe/stackhpc.yml
index 8cc9ccf8a..613810928 100644
--- a/etc/kayobe/stackhpc.yml
+++ b/etc/kayobe/stackhpc.yml
@@ -104,15 +104,15 @@ stackhpc_repo_treasuredata_4_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/t
 stackhpc_repo_treasuredata_4_version: "{{ stackhpc_repo_distribution }}"
 
 # Rocky 8 BaseOS
-stackhpc_repo_rocky_baseos_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/rocky/8.6/BaseOS/x86_64/os/{{ stackhpc_repo_rocky_baseos_version }}"
+stackhpc_repo_rocky_baseos_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/rocky/8/BaseOS/x86_64/os/{{ stackhpc_repo_rocky_baseos_version }}/"
 stackhpc_repo_rocky_baseos_version: "{{ stackhpc_repo_distribution }}"
 
 # Rocky 8 AppStream
-stackhpc_repo_rocky_appstream_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/rocky/8.6/AppStream/x86_64/os/{{ stackhpc_repo_rocky_appstream_version }}"
+stackhpc_repo_rocky_appstream_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/rocky/8/AppStream/x86_64/os/{{ stackhpc_repo_rocky_appstream_version }}/"
 stackhpc_repo_rocky_appstream_version: "{{ stackhpc_repo_distribution }}"
 
 # Rocky 8 extras
-stackhpc_repo_rocky_extras_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/rocky/8.6/extras/x86_64/os/{{ stackhpc_repo_rocky_extras_version }}"
+stackhpc_repo_rocky_extras_url: "{{ stackhpc_repo_mirror_url }}/pulp/content/rocky/8/extras/x86_64/os/{{ stackhpc_repo_rocky_extras_version }}/"
 stackhpc_repo_rocky_extras_version: "{{ stackhpc_repo_distribution }}"
 
 ###############################################################################