From dd18412655368c5f6e0e575f54122599af7fa0dd Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 7 Feb 2024 17:11:14 +0000 Subject: [PATCH 01/11] Deprecate network configuration in environments --- etc/kayobe/environments/ci-aio/stackhpc-ci.yml | 7 ------- etc/kayobe/environments/ci-builder/stackhpc-ci.yml | 7 ------- etc/kayobe/environments/ci-multinode/stackhpc-ci.yml | 7 ------- 3 files changed, 21 deletions(-) diff --git a/etc/kayobe/environments/ci-aio/stackhpc-ci.yml b/etc/kayobe/environments/ci-aio/stackhpc-ci.yml index 9740da775..00ea5bc10 100644 --- a/etc/kayobe/environments/ci-aio/stackhpc-ci.yml +++ b/etc/kayobe/environments/ci-aio/stackhpc-ci.yml @@ -5,13 +5,6 @@ # Docker namespace to use for Kolla images. Default is 'kolla'. kolla_docker_namespace: stackhpc-dev -############################################################################### -# Network configuration. - -# Don't touch resolv.conf: use Neutron DNS for accessing Pulp server via -# hostname. -resolv_is_managed: false - ############################################################################### # StackHPC configuration. diff --git a/etc/kayobe/environments/ci-builder/stackhpc-ci.yml b/etc/kayobe/environments/ci-builder/stackhpc-ci.yml index efe4236e9..e0421f97d 100644 --- a/etc/kayobe/environments/ci-builder/stackhpc-ci.yml +++ b/etc/kayobe/environments/ci-builder/stackhpc-ci.yml @@ -30,13 +30,6 @@ kolla_enable_prometheus: true kolla_enable_redis: true kolla_enable_skydive: true -############################################################################### -# Network configuration. - -# Don't touch resolv.conf: use Neutron DNS for accessing Pulp server via -# hostname. -resolv_is_managed: false - ############################################################################### # StackHPC configuration. 
diff --git a/etc/kayobe/environments/ci-multinode/stackhpc-ci.yml b/etc/kayobe/environments/ci-multinode/stackhpc-ci.yml index b37db2ee7..cdb6eb810 100644 --- a/etc/kayobe/environments/ci-multinode/stackhpc-ci.yml +++ b/etc/kayobe/environments/ci-multinode/stackhpc-ci.yml @@ -5,13 +5,6 @@ # Docker namespace to use for Kolla images. Default is 'kolla'. kolla_docker_namespace: stackhpc-dev -############################################################################### -# Network configuration. - -# Don't touch resolv.conf: use Neutron DNS for accessing Pulp server via -# hostname. -resolv_is_managed: false - ############################################################################### # StackHPC configuration. From f2ba8ec2a10cf8c2cb99a8a0a4c73b4c3d5ded09 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Sat, 17 Feb 2024 08:54:29 +0100 Subject: [PATCH 02/11] Fix growroot playbook for NVMe devices The disk_tmp variable uses a device path rather than a device name. --- etc/kayobe/ansible/growroot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/ansible/growroot.yml b/etc/kayobe/ansible/growroot.yml index cdd7293d9..aa3dcc05a 100644 --- a/etc/kayobe/ansible/growroot.yml +++ b/etc/kayobe/ansible/growroot.yml @@ -77,7 +77,7 @@ vars: pv: "{{ pvs.stdout | from_json }}" disk_tmp: "{{ pv.report[0].pv[0].pv_name[:-1] }}" - disk: "{{ disk_tmp[:-1] if disk_tmp[-1] == 'p' and disk_tmp[:4] == 'nvme' else disk_tmp }}" + disk: "{{ disk_tmp[:-1] if disk_tmp[-1] == 'p' and disk_tmp[:9] == '/dev/nvme' else disk_tmp }}" part_num: "{{ pv.report[0].pv[0].pv_name[-1] }}" become: true failed_when: "growpart.rc != 0 and 'NOCHANGE' not in growpart.stdout" From 880d0b2a5ec7acd543c7bade60dae7be47f89a87 Mon Sep 17 00:00:00 2001 From: Bartosz Bezak Date: Mon, 19 Feb 2024 15:35:50 +0100 Subject: [PATCH 03/11] docs: guide for migrating to containerized libvirt in R8/R9 migration --- doc/source/operations/rocky-linux-9.rst | 30 ++++++++++++++++++++----- 1 file 
changed, 25 insertions(+), 5 deletions(-) diff --git a/doc/source/operations/rocky-linux-9.rst b/doc/source/operations/rocky-linux-9.rst index 5b13c2807..b174d11e7 100644 --- a/doc/source/operations/rocky-linux-9.rst +++ b/doc/source/operations/rocky-linux-9.rst @@ -429,13 +429,33 @@ Full procedure for one batch of hosts kayobe overcloud provision -l -5. Host configure: +5. If the compute node is using Libvirt on the Host, and one wants to transition to containerized Libvirt. + + 1. Update kolla.yml + + .. code-block:: yaml + + kolla_enable_nova_libvirt_container: "{{ inventory_hostname != 'localhost' and ansible_facts.distribution_major_version == '9' }}" + + 2. Update kolla/globals.yml + + .. code-block:: yaml + + enable_nova_libvirt_container: "{% raw %}{{ ansible_facts.distribution_major_version == '9' }}{% endraw %}" + + .. note:: + + Those settings are needed only for the timeframe of migration to Rocky Linux 9, + when CentOS Stream 8 or Rocky Linux 8 hosts with Libvirt on the hosts exists + in the environment. + +6. Host configure: .. code:: console kayobe overcloud host configure -l -kl -6. If the compute node is running Ceph OSD services: +7. If the compute node is running Ceph OSD services: 1. Make sure the cephadm public key is in ``authorized_keys`` for stack or root user - depends on your setup. For example, your SSH key may @@ -460,13 +480,13 @@ Full procedure for one batch of hosts ceph -s ceph -w -7. Service deploy: +8. Service deploy: .. code:: console kayobe overcloud service deploy -kl -8. If you are using Wazuh, you will need to deploy the agent again. +9. If you are using Wazuh, you will need to deploy the agent again. Note that CIS benchmarks do not run on RL9 out-the-box. See `our Wazuh docs `__ for details. @@ -475,7 +495,7 @@ Full procedure for one batch of hosts kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/wazuh-agent.yml -l -9. Restore the system to full health. +10. Restore the system to full health. 1. 
If any VMs were powered off, they may now be powered back on. From 5931ff07b7cf585c87e2d6987ad43137bb60c2d5 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Mon, 9 Oct 2023 10:23:13 +0100 Subject: [PATCH 04/11] Various os_capacity fixes --- .yamllint | 1 + doc/source/configuration/monitoring.rst | 43 ++- .../ansible/deploy-os-capacity-exporter.yml | 7 + .../templates/os_capacity-clouds.yml.j2 | 11 +- .../openstack/grafana_cloud_dashboard.json | 37 +- .../openstack/grafana_project_dashboard.json | 343 +++++++++++++++--- .../{os_exporter.cfg => os_capacity.cfg} | 6 +- .../prometheus.yml.d/70-oscapacity.yml | 5 +- etc/kayobe/stackhpc-monitoring.yml | 10 + .../notes/os-capacity-94006f03f16583e4.yaml | 19 +- 10 files changed, 408 insertions(+), 74 deletions(-) rename etc/kayobe/kolla/config/haproxy/services.d/{os_exporter.cfg => os_capacity.cfg} (74%) diff --git a/.yamllint b/.yamllint index 96b2b10dd..1c115e29b 100644 --- a/.yamllint +++ b/.yamllint @@ -20,3 +20,4 @@ ignore: | .github/ .gitlab/ .gitlab-ci.yml + etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml diff --git a/doc/source/configuration/monitoring.rst b/doc/source/configuration/monitoring.rst index 880ca0032..819da9769 100644 --- a/doc/source/configuration/monitoring.rst +++ b/doc/source/configuration/monitoring.rst @@ -140,33 +140,58 @@ enable the ceph mgr exporter. OpenStack Capacity ================== -OpenStack Capacity allows you to see how much space you have avaliable -in your cloud. StackHPC Kayobe Config includes this exporter by default -and it's necessary that some variables are set to allow deployment. +OpenStack Capacity allows you to see how much space you have available +in your cloud. StackHPC Kayobe Config includes a playbook for manual +deployment, and it's necessary that some variables are set before +running this playbook. To successfully deploy OpenStack Capacity, you are required to specify the OpenStack application credentials in ``kayobe/secrets.yml`` as: .. 
code-block:: yaml - secrets_os_exporter_auth_url: - secrets_os_exporter_credential_id: - secrets_os_exporter_credential_secret: + secrets_os_capacity_credential_id: + secrets_os_capacity_credential_secret: -After defining your credentials, You may deploy OpenStack Capacity +The Keystone authentication URL and OpenStack region can be changed +from their defaults in ``stackhpc-monitoring.yml`` should you need to +set a different OpenStack region for your cloud. The authentication +URL is set to use ``kolla_internal_fqdn`` by default: + +.. code-block:: yaml + + stackhpc_os_capacity_auth_url: + stackhpc_os_capacity_openstack_region_name: + +Additionally, you are required to enable a conditional flag to allow +HAProxy and Prometheus configuration to be templated during deployment. + +.. code-block:: yaml + + stackhpc_enable_os_capacity: true + +If you are deploying in a cloud with internal TLS, you may be required +to disable certificate verification for the OpenStack Capacity exporter +if your certificate is not signed by a trusted CA. + +.. code-block:: yaml + + stackhpc_os_capacity_openstack_verify: false + +After defining your credentials, you may deploy OpenStack Capacity using the ``ansible/deploy-os-capacity-exporter.yml`` Ansible playbook via Kayobe. .. code-block:: console - kayobe playbook run ansible/deploy-os-capacity-exporter.yml + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/deploy-os-capacity-exporter.yml It is required that you re-configure the Prometheus, Grafana and HAProxy services following deployment, to do this run the following Kayobe command. .. 
code-block:: console - kayobe overcloud service reconfigure -kt grafana,prometheus,haproxy + kayobe overcloud service reconfigure -kt grafana,prometheus,loadbalancer If you notice ``HaproxyServerDown`` or ``HaproxyBackendDown`` prometheus alerts after deployment it's likely the os_exporter secrets have not been diff --git a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml index 4eeb69431..e5cde5676 100644 --- a/etc/kayobe/ansible/deploy-os-capacity-exporter.yml +++ b/etc/kayobe/ansible/deploy-os-capacity-exporter.yml @@ -3,6 +3,13 @@ gather_facts: false tasks: + - name: Ensure legacy os_exporter.cfg config file is deleted + ansible.builtin.file: + path: /etc/kolla/haproxy/services.d/os_exporter.cfg + state: absent + delegate_to: network + become: true + - name: Create os-capacity directory ansible.builtin.file: path: /opt/kayobe/os-capacity/ diff --git a/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 b/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 index 89d66c0bc..a821d6dcb 100644 --- a/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 +++ b/etc/kayobe/ansible/templates/os_capacity-clouds.yml.j2 @@ -1,10 +1,13 @@ clouds: openstack: auth: - auth_url: "{{ secrets_os_exporter_auth_url }}" - application_credential_id: "{{ secrets_os_exporter_credential_id }}" - application_credential_secret: "{{ secrets_os_exporter_credential_secret }}" - region_name: "RegionOne" + auth_url: "{{ stackhpc_os_capacity_auth_url }}" + application_credential_id: "{{ secrets_os_capacity_credential_id }}" + application_credential_secret: "{{ secrets_os_capacity_credential_secret }}" + region_name: "{{ stackhpc_os_capacity_openstack_region_name }}" interface: "internal" identity_api_version: 3 auth_type: "v3applicationcredential" +{% if not stackhpc_os_capacity_openstack_verify | bool %} + verify: False +{% endif %} diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_cloud_dashboard.json 
b/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_cloud_dashboard.json index a777c332e..7bdbdee9f 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_cloud_dashboard.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_cloud_dashboard.json @@ -25,7 +25,6 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 2084495, "links": [], "liveNow": false, "panels": [ @@ -66,7 +65,7 @@ }, "gridPos": { "h": 4, - "w": 2.4, + "w": 4.8, "x": 0, "y": 1 }, @@ -86,7 +85,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.4.7", + "pluginVersion": "10.1.5", "repeat": "flavors", "repeatDirection": "h", "targets": [ @@ -96,7 +95,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "openstack_free_capacity_by_flavor_total{flavor_name=~\"$flavors\"}", + "expr": "round(avg_over_time(openstack_free_capacity_by_flavor_total{flavor_name=~\"$flavors\"}[30m]), 1)", "legendFormat": "__auto", "range": true, "refId": "A" @@ -424,6 +423,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -465,6 +465,7 @@ "y": 17 }, "id": 5, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -489,7 +490,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "openstack_project_usage{placement_resource=\"MEMORY_MB\"}", + "expr": "avg_over_time(openstack_project_usage{placement_resource=\"MEMORY_MB\"}[30m])", "legendFormat": "{{project_name}}", "range": true, "refId": "A" @@ -522,6 +523,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -552,7 +554,7 @@ } ] }, - "unit": "decmbytes" + "unit": "none" }, "overrides": [] }, @@ -563,6 +565,7 @@ "y": 17 }, "id": 16, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -587,7 +590,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "openstack_project_usage{placement_resource=\"VCPU\"}", + "expr": 
"avg_over_time(openstack_project_usage{placement_resource=\"VCPU\"}[30m])", "legendFormat": "VCPU {{project_name}}", "range": true, "refId": "A" @@ -598,7 +601,7 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "openstack_project_usage{placement_resource=\"PCPU\"}", + "expr": "avg_over_time(openstack_project_usage{placement_resource=\"PCPU\"}[30m])", "hide": false, "legendFormat": "PCPU {{project_name}}", "range": true, @@ -646,6 +649,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "smooth", "lineStyle": { "fill": "solid" @@ -689,6 +693,7 @@ "y": 26 }, "id": 6, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -715,15 +720,15 @@ }, "editorMode": "code", "exemplar": false, - "expr": "openstack_free_capacity_hypervisor_by_flavor{flavor_name=~\"$flavors\"}", + "expr": "avg_over_time(openstack_free_capacity_hypervisor_by_flavor{flavor_name=~\"$flavors\"}[30m])", "format": "time_series", "instant": false, "legendFormat": "{{flavor_name}} on {{hypervisor}}", "range": true, - "refId": "Avaliable Capacity on Hypervisors" + "refId": "Available Capacity on Hypervisors" } ], - "title": "Avaliable Capacity for $flavors", + "title": "Available Capacity for $flavors", "type": "timeseries" }, { @@ -750,6 +755,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -791,6 +797,7 @@ "y": 26 }, "id": 4, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -814,8 +821,8 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "editorMode": "builder", - "expr": "openstack_hypervisor_placement_allocatable_capacity{resource=\"MEMORY_MB\"} - on(hypervisor) openstack_hypervisor_placement_allocated{resource=\"MEMORY_MB\"}", + "editorMode": "code", + "expr": "avg_over_time(openstack_hypervisor_placement_allocatable_capacity{resource=\"MEMORY_MB\"}[30m]) - on(hypervisor) 
avg_over_time(openstack_hypervisor_placement_allocated{resource=\"MEMORY_MB\"}[30m])", "legendFormat": "{{hypervisor}}", "range": true, "refId": "A" @@ -885,7 +892,7 @@ ] }, "time": { - "from": "now-24h", + "from": "now-2d", "to": "now" }, "timepicker": {}, @@ -895,4 +902,4 @@ "version": 1, "weekStart": "" } -{% endraw %} +{% endraw %} \ No newline at end of file diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_project_dashboard.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_project_dashboard.json index c3a483cf9..acb37f195 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_project_dashboard.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/grafana_project_dashboard.json @@ -25,7 +25,6 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 2084480, "links": [], "liveNow": false, "panels": [ @@ -89,9 +88,10 @@ "fields": "", "values": false }, - "showUnfilled": true + "showUnfilled": true, + "valueMode": "color" }, - "pluginVersion": "9.4.7", + "pluginVersion": "10.1.5", "targets": [ { "datasource": { @@ -134,6 +134,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -175,6 +176,7 @@ "y": 11 }, "id": 5, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -199,7 +201,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(openstack_project_usage{project_id=~\"${project_id}\"}) by (placement_resource)", + "expr": "sum(avg_over_time(openstack_project_usage{project_id=~\"${project_id}\"}[30m:])) by (placement_resource)", "hide": false, "legendFormat": "{{placement_resource}}", "range": true, @@ -234,6 +236,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -275,6 +278,7 @@ "y": 11 }, "id": 19, + "interval": "10m", "options": { "legend": { "calcs": [ @@ -299,7 +303,7 @@ "uid": "${datasource}" }, 
"editorMode": "code", - "expr": "openstack_project_quota{project_id=~\"${project_id}\"}", + "expr": "avg_over_time(openstack_project_quota{project_id=~\"${project_id}\"}[30m:])", "hide": false, "legendFormat": "{{project_name}}:{{quota_resource}}", "range": true, @@ -333,6 +337,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -433,6 +438,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -474,6 +480,7 @@ "y": 20 }, "id": 20, + "interval": "30m", "options": { "legend": { "calcs": [ @@ -498,7 +505,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(irate(libvirt_domain_vcpu_time_seconds_total{}[5m]) / ignoring(instance,vcpu) group_left(domain) libvirt_domain_info_virtual_cpus{}) by (domain) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "expr": "avg(sum(irate(libvirt_domain_vcpu_time_seconds_total{}[5m]) / ignoring(instance,vcpu) group_left(domain) libvirt_domain_info_virtual_cpus{}) by (domain) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}) by (project_name)", "hide": false, "legendFormat": "{{instance_name}}", "range": true, @@ -532,6 +539,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -598,7 +606,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "libvirt_domain_memory_stats_used_percent * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "expr": "avg(libvirt_domain_memory_stats_used_percent * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}) by (project_name)", "hide": false, "legendFormat": 
"{{instance_name}}", "range": true, @@ -633,6 +641,7 @@ "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -700,9 +709,9 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(libvirt_domain_block_stats_read_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "expr": "avg(rate(libvirt_domain_block_stats_read_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}) by (project_name)", "hide": false, - "legendFormat": "{{instance_name}} : read {{target_device}}", + "legendFormat": "read: {{project_name}}", "range": true, "refId": "B" }, @@ -712,9 +721,9 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(libvirt_domain_block_stats_write_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"} * -1", + "expr": "avg(rate(libvirt_domain_block_stats_write_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"} * -1) by (project_name)", "hide": false, - "legendFormat": "{{instance_name}} : write {{target_device}}", + "legendFormat": " write: {{project_name}}", "range": true, "refId": "C" } @@ -732,7 +741,7 @@ }, "id": 15, "panels": [], - "title": "Per Hypervisor Free Capacity", + "title": "Per Instance Utilization", "type": "row" }, { @@ -740,67 +749,319 @@ "type": "prometheus", "uid": "${datasource}" }, - "description": "", "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 1, + "barAlignment": 0, + "drawStyle": "line", + 
"fillOpacity": 23, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { - "color": "semi-dark-yellow", + "color": "green", "value": null }, { - "color": "green", - "value": 4 + "color": "red", + "value": 80 } ] - } + }, + "unit": "percentunit" }, "overrides": [] }, "gridPos": { - "h": 10, - "w": 24, + "h": 9, + "w": 8, "x": 0, "y": 30 }, - "id": 2, + "id": 23, + "interval": "30m", "options": { - "displayMode": "basic", - "minVizHeight": 10, - "minVizWidth": 0, - "orientation": "horizontal", - "reduceOptions": { + "legend": { "calcs": [ - "lastNotNull" + "min", + "max" ], - "fields": "", - "values": false + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true }, - "showUnfilled": true + "tooltip": { + "mode": "single", + "sort": "none" + } }, - "pluginVersion": "9.4.7", "targets": [ { "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "builder", - "expr": "openstack_free_capacity_by_flavor_total", - "format": "time_series", - "legendFormat": "{{flavor_name}}", + "editorMode": "code", + "expr": "sum(irate(libvirt_domain_vcpu_time_seconds_total{}[5m]) / ignoring(instance,vcpu) group_left(domain) libvirt_domain_info_virtual_cpus{}) by (domain) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "hide": false, + "legendFormat": "{{instance_name}}", + "range": true, + "refId": "B" + } + ], + "title": "CPU utilization per instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 23, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 100, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 30 + }, + "id": 24, + "interval": "30m", + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "libvirt_domain_memory_stats_used_percent * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "hide": false, + "legendFormat": "{{instance_name}}", "range": true, "refId": "A" } ], - "title": "Free Capacity by Flavor", - "type": "bargauge" + "title": "Memory utilization per instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": true, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 23, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 1, + "min": -1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 30 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "min", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(libvirt_domain_block_stats_read_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"}", + "hide": false, + "legendFormat": "{{instance_name}} : read {{target_device}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "avg(rate(libvirt_domain_block_stats_write_time_seconds_total[5m]) * on(domain) group_left(instance_name,project_name,project_uuid) libvirt_domain_info_meta{project_uuid=~\"${project_id}\"} * -1) by (project_name)", + "hide": false, + "legendFormat": "{{instance_name}} : write {{target_device}}", + "range": true, + "refId": "C" + } + ], + "title": "Disk utilization per instance", + "type": 
"timeseries" } ], "refresh": "", @@ -817,7 +1078,7 @@ "current": { "selected": false, "text": "Prometheus", - "value": "Prometheus" + "value": "PBFA97CFB590B2093" }, "description": "The prometheus datasource used for queries.", "hide": 0, @@ -867,14 +1128,14 @@ ] }, "time": { - "from": "now-3h", + "from": "now-2d", "to": "now" }, "timepicker": {}, "timezone": "", "title": "OpenStack Project Metrics", "uid": "mXiuBDe7z", - "version": 2, + "version": 1, "weekStart": "" } -{% endraw %} +{% endraw %} \ No newline at end of file diff --git a/etc/kayobe/kolla/config/haproxy/services.d/os_exporter.cfg b/etc/kayobe/kolla/config/haproxy/services.d/os_capacity.cfg similarity index 74% rename from etc/kayobe/kolla/config/haproxy/services.d/os_exporter.cfg rename to etc/kayobe/kolla/config/haproxy/services.d/os_capacity.cfg index e40c27a38..4326265ca 100644 --- a/etc/kayobe/kolla/config/haproxy/services.d/os_exporter.cfg +++ b/etc/kayobe/kolla/config/haproxy/services.d/os_capacity.cfg @@ -6,7 +6,11 @@ frontend os_capacity_frontend option httplog option forwardfor http-request set-header X-Forwarded-Proto https if { ssl_fc } - bind {{ kolla_internal_vip_address }}:9000 +{% if kolla_enable_tls_internal | bool %} + bind {{ kolla_internal_vip_address }}:9090 ssl crt /etc/haproxy/haproxy-internal.pem +{% else %} + bind {{ kolla_internal_vip_address }}:9090 +{% endif %} default_backend os_capacity_backend backend os_capacity_backend diff --git a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml index 659c26047..afed8d915 100644 --- a/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml +++ b/etc/kayobe/kolla/config/prometheus/prometheus.yml.d/70-oscapacity.yml @@ -6,8 +6,11 @@ scrape_configs: - job_name: os-capacity static_configs: - targets: - - '{{ kolla_internal_vip_address | put_address_in_context('url') }}:9000' + - '{{ kolla_internal_fqdn | put_address_in_context('url') 
}}:9090' scrape_interval: 15m scrape_timeout: 10m +{% if kolla_enable_tls_internal | bool %} + scheme: https +{% endif %} {% endraw %} {% endif %} diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index b48646e79..8d0771e13 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -15,3 +15,13 @@ alertmanager_low_memory_threshold_gib: 5 # Enabling this flag will result in HAProxy configuration and Prometheus scrape # targets being templated during deployment. stackhpc_enable_os_capacity: false + +# Keystone authentication URL for OpenStack Capacity +stackhpc_os_capacity_auth_url: "http{% if kolla_enable_tls_internal | bool %}s{% endif %}://{{ kolla_internal_fqdn }}:5000" + +# OpenStack region for OpenStack Capacity +stackhpc_os_capacity_openstack_region_name: "{{ openstack_region_name | default(RegionOne) }}" + +# Whether TLS certificate verification is enabled for the OpenStack Capacity +# exporter during Keystone authentication. +stackhpc_os_capacity_openstack_verify: true diff --git a/releasenotes/notes/os-capacity-94006f03f16583e4.yaml b/releasenotes/notes/os-capacity-94006f03f16583e4.yaml index f9d76b7f4..ca317682b 100644 --- a/releasenotes/notes/os-capacity-94006f03f16583e4.yaml +++ b/releasenotes/notes/os-capacity-94006f03f16583e4.yaml @@ -9,7 +9,20 @@ upgrade: - | To deploy the OpenStack Capacity Grafana dashboard, you must define OpenStack application credential variables: - ``secrets_os_exporter_auth_url``, - ``secrets_os_exporter_credential_id`` and - ``secrets_os_exporter_credential_secret`` as laid out in the + ``secrets_os_capacity_credential_id`` and + ``secrets_os_capacity_credential_secret`` as laid out in the 'Monitoring' documentation. + + You must also enable the ``stackhpc_enable_os_capacity`` + flag for OpenStack Capacity HAProxy and Prometheus configuration + to be templated. 
+ + You may also change the default authentication URL from the + kolla_internal_fqdn and change the default OpenStack region + from RegionOne with the variables: + ``stackhpc_os_capacity_auth_url`` and + ``stackhpc_os_capacity_openstack_region_name``. + + To disable certificate verification for the OpenStack Capacity + exporter, you can set ``stackhpc_os_capacity_openstack_verify`` + to false. From 75f48577750030ba6720bcf2060734a67331d181 Mon Sep 17 00:00:00 2001 From: Seunghun Lee <45145778+seunghun1ee@users.noreply.github.com> Date: Wed, 21 Feb 2024 10:24:29 +0000 Subject: [PATCH 05/11] Update Kolla container images for Ubuntu Jammy Zed (#904) * Update ubuntu jammy kolla container tags --- etc/kayobe/kolla-image-tags.yml | 12 ++---------- ...-jammy-zed-kolla-containers-0774af3c590b89d0.yaml | 4 ++++ 2 files changed, 6 insertions(+), 10 deletions(-) create mode 100644 releasenotes/notes/update-ubuntu-jammy-zed-kolla-containers-0774af3c590b89d0.yaml diff --git a/etc/kayobe/kolla-image-tags.yml b/etc/kayobe/kolla-image-tags.yml index 0144c8179..5cd94a2c3 100644 --- a/etc/kayobe/kolla-image-tags.yml +++ b/etc/kayobe/kolla-image-tags.yml @@ -5,15 +5,7 @@ kolla_image_tags: openstack: rocky-9: zed-rocky-9-20240202T105829 - ubuntu-jammy: zed-ubuntu-jammy-20230921T153510 - bifrost: - ubuntu-jammy: zed-ubuntu-jammy-20231101T132522 - ovn: - ubuntu-jammy: zed-ubuntu-jammy-20230821T155947 - cloudkitty: - ubuntu-jammy: zed-ubuntu-jammy-20231114T124701 + ubuntu-jammy: zed-ubuntu-jammy-20240129T151534 neutron: rocky-9: zed-rocky-9-20240202T141530 - ubuntu-jammy: zed-ubuntu-jammy-20231115T094053 - opensearch: - ubuntu-jammy: zed-ubuntu-jammy-20231214T095452 + ubuntu-jammy: zed-ubuntu-jammy-20240202T143208 diff --git a/releasenotes/notes/update-ubuntu-jammy-zed-kolla-containers-0774af3c590b89d0.yaml b/releasenotes/notes/update-ubuntu-jammy-zed-kolla-containers-0774af3c590b89d0.yaml new file mode 100644 index 000000000..81562019d --- /dev/null +++ 
b/releasenotes/notes/update-ubuntu-jammy-zed-kolla-containers-0774af3c590b89d0.yaml @@ -0,0 +1,4 @@ +--- +upgrade: + - | + Update Ubuntu Jammy Zed Kolla container tags. From 0e7ea643f1f4a82fb3382b81cc3172bb7e85f802 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 28 Dec 2023 09:48:15 +0000 Subject: [PATCH 06/11] Ensure cron service is started for smartmon --- etc/kayobe/ansible/smartmon-tools.yml | 7 +++++++ releasenotes/notes/smartmontools-bc8176f45d58a75d.yaml | 6 ++++++ 2 files changed, 13 insertions(+) create mode 100644 releasenotes/notes/smartmontools-bc8176f45d58a75d.yaml diff --git a/etc/kayobe/ansible/smartmon-tools.yml b/etc/kayobe/ansible/smartmon-tools.yml index bb5cf5dca..b4a064b63 100644 --- a/etc/kayobe/ansible/smartmon-tools.yml +++ b/etc/kayobe/ansible/smartmon-tools.yml @@ -12,6 +12,13 @@ state: present become: true + - name: Ensure the cron/crond service is running + service: + name: "{{ 'cron' if ansible_facts['distribution'] == 'Ubuntu' else 'crond' }}" + state: started + enabled: true + become: true + - name: Copy smartmon.sh and nvmemon.sh from scripts folder copy: src: "scripts/{{ item }}" diff --git a/releasenotes/notes/smartmontools-bc8176f45d58a75d.yaml b/releasenotes/notes/smartmontools-bc8176f45d58a75d.yaml new file mode 100644 index 000000000..ac3451347 --- /dev/null +++ b/releasenotes/notes/smartmontools-bc8176f45d58a75d.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + The smartmon-tools playbook now ensures that the cron service is running as + in some cases it may not be running by default. 
+ From 3fca4743f02f877ded36d9bcee763019733e1f73 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Wed, 21 Feb 2024 11:22:21 +0000 Subject: [PATCH 07/11] Bump RL9 host image to RL9.3 (#897) Co-authored-by: Mark Goddard --- etc/kayobe/pulp-host-image-versions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/pulp-host-image-versions.yml b/etc/kayobe/pulp-host-image-versions.yml index 9c55926e5..40819e32b 100644 --- a/etc/kayobe/pulp-host-image-versions.yml +++ b/etc/kayobe/pulp-host-image-versions.yml @@ -1,5 +1,5 @@ --- # Overcloud host image versioning tags # These images must be in SMS, since they are used by our AIO CI runners -stackhpc_rocky_9_overcloud_host_image_version: "zed-20231106T151621" +stackhpc_rocky_9_overcloud_host_image_version: "zed-20240126T093155" stackhpc_ubuntu_jammy_overcloud_host_image_version: "zed-20231013T123933" From ffe3ed3183d046c8ec98690765190bda1cb4c243 Mon Sep 17 00:00:00 2001 From: Alex-Welsh Date: Thu, 15 Feb 2024 17:27:39 +0000 Subject: [PATCH 08/11] Add Ubuntu Jammy upgrade doc --- doc/source/operations/index.rst | 1 + doc/source/operations/ubuntu-jammy.rst | 531 +++++++++++++++++++++++++ 2 files changed, 532 insertions(+) create mode 100644 doc/source/operations/ubuntu-jammy.rst diff --git a/doc/source/operations/index.rst b/doc/source/operations/index.rst index 284795d6a..f547f13bb 100644 --- a/doc/source/operations/index.rst +++ b/doc/source/operations/index.rst @@ -11,4 +11,5 @@ This guide is for operators of the StackHPC Kayobe configuration project. 
octavia hotfix-playbook rocky-linux-9 + ubuntu-jammy secret-rotation diff --git a/doc/source/operations/ubuntu-jammy.rst b/doc/source/operations/ubuntu-jammy.rst new file mode 100644 index 000000000..4f684aa07 --- /dev/null +++ b/doc/source/operations/ubuntu-jammy.rst @@ -0,0 +1,531 @@ +========================= +Upgrading to Ubuntu Jammy +========================= + +Overview +======== + +This document describes how to upgrade systems from Ubuntu Focal 20.04 to +Ubuntu Jammy 22.04. This procedure must be performed on Ubuntu Focal 20.04 +OpenStack Yoga systems before it is possible to upgrade to OpenStack Zed. It is +possible to perform a rolling upgrade to ensure service is not disrupted. + +Upgrades are performed in-place with a script using the ``do-release-upgrade`` +tool provided by Canonical, rather than reprovisioning. The scripts are found +at ``tools/ubuntu-upgrade-*.sh``. For overcloud and infrastructure VM upgrades, +the script takes one argument - the host(s) to upgrade. The scripts execute a +playbook to upgrade the host, then run the appropriate ``kayobe * host +configure`` command. + +The guide assumes a local pulp instance is deployed and all hosts use it +to pull ``apt`` packages. To upgrade a host using upstream packages, see the +manual upgrade process at the bottom of this page. + +While it is technically possible to upgrade hosts in any order, it is +recommended that upgrades for one type of node be completed before moving on +to the next i.e. all compute node upgrades are performed before all storage +node upgrades. + +The order of node groups is less important however it is arguably safest to +perform controller node upgrades first, given that they are the most complex +and it is easiest to revert their state in the event of a failure. 
+This guide covers the following types of hosts: + +- Controllers +- Compute hosts +- Storage hosts +- Seed +- Other hosts not managed by Kayobe + +The following types of hosts will be covered in the future: + +- Ansible control host +- Seed hypervisor (an upgrade script exists but has not been tested) +- Infrastructure VMs (an upgrade script exists but has not been tested) + +.. warning:: + + Ceph node upgrades have not yet been performed outside of a virtualised test + environment. Proceed with caution. + +Prerequisites +============= + +Before starting the upgrade, ensure any appropriate prerequisites are +satisfied. These will be specific to each deployment, but here are some +suggestions: + +* Merge in the latest ``stackhpc-kayobe-config`` ``stackhpc/yoga`` branch. +* Ensure that there is sufficient hypervisor capacity to drain + at least one node. +* If using Ironic for bare metal compute, ensure that at least one node is + available for testing provisioning. +* Ensure that expected test suites are passing, e.g. Tempest. +* Resolve any Prometheus alerts. +* Check for unexpected ``ERROR`` or ``CRITICAL`` messages in Kibana/OpenSearch + Dashboard. +* Check Grafana dashboards. + +Sync Release Train artifacts +---------------------------- + +New `StackHPC Release Train <../configuration/release-train.html>`__ content +should be synced to the local Pulp server. This includes host packages +(Deb/RPM) and container images. + +To sync host packages: + +.. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-repo-sync.yml -e stackhpc_pulp_sync_ubuntu_focal=true -e stackhpc_pulp_sync_ubuntu_jammy=true + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-repo-publish.yml + +Once the host package content has been tested in a test/staging environment, it +may be promoted to production: + +.. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-repo-promote-production.yml + +To sync container images: + +.. 
code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-container-sync.yml + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-container-publish.yml + +Build locally customised container images +----------------------------------------- + +.. note:: + + The container images provided by StackHPC Release Train are suitable for + most deployments. In this case, this step can be skipped. + +In some cases, it is necessary to build some or all images locally to apply +customisations. To do this, set +``stackhpc_pulp_sync_for_local_container_build`` to ``true`` before syncing +container images. + +To build the overcloud images locally and push them to the local Pulp server: + +.. code-block:: console + + kayobe overcloud container image build --push + +It is possible to build a specific set of images by supplying one or more +image name regular expressions: + +.. code-block:: console + + kayobe overcloud container image build --push ironic- nova-api + +Deploy the latest container images +---------------------------------- + +Make sure you deploy the latest containers before this upgrade: + +.. code-block:: console + + kayobe seed service deploy + kayobe overcloud service deploy + +Common issues for all host types +================================ + +- Interface names regularly change during upgrades, usually gaining the + ``np0`` suffix. This cannot easily be resolved. The upgrade script + configures networking both before and after rebooting to apply the upgrade. + Setting the interface statically in a kayobe-config fails during one of + these. This can be worked around by adding a ``sed`` command to the upgrade + script between the upgrade playbook step and the host configure step e.g. + + .. 
code-block:: bash + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/ubuntu-upgrade.yml -e os_release=jammy --limit $1 + sed -i -e 's/"ens1"/"ens1np0"/g' -e 's/"ens2"/"ens2np0"/g' $KAYOBE_CONFIG_PATH/environments/production/inventory/group_vars/compute/network-interfaces + kayobe overcloud host configure --limit $1 --kolla-limit $1 -e os_release=jammy + + Remember to reset the change before upgrading another host (or add a + second ``sed`` command to automate the process) +- Disk names can change during upgrades. This can be resolved in kayobe-config + once the new name is known (i.e. after the first upgrade) and applied by + re-running ``host configure`` for the affected host. +- Timeouts can become an issue with some hardware. The host will reboot once + or twice depending on whether it needs to apply package updates. Edit the + timeouts in the upgrade playbook (``ubuntu-upgrade.yml``) where required. + +Controllers +=========== + +Upgrade controllers *one by one*, ideally upgrading the host with the Kolla +Virtual IP (VIP) last. Before upgrading a host with the VIP, stop the +``keepalived`` container for a few seconds to fail it over to another +controller (restarting the container does not always stop the container for +long enough). + +.. code-block:: bash + + sudo docker stop keepalived + sudo docker start keepalived + +Always back up the overcloud DB before starting: + +.. code-block:: bash + + kayobe overcloud database backup + +Potential issues +---------------- + +- In both testing and production, RabbitMQ has fallen into an error state + during controller upgrades. Keep an eye on the RabbitMQ Grafana dashboard and + if errors begin to increase, use the ``rabbitmq-reset`` playbook: + + .. code-block:: bash + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/rabbitmq-reset.yml + +- If you are using hyper-converged Ceph, please also note the potential issues + in the Storage section below. 
+ +Full procedure for one controller +--------------------------------- + +1. Export the ``KAYOBE_PATH`` environment variable e.g. + + .. code-block:: console + + export KAYOBE_PATH=~/src/kayobe + +2. If the controller is running Ceph services: + + 1. Set host in maintenance mode: + + .. code-block:: console + + ceph orch host maintenance enter + + 2. Check nothing remains on the host: + + .. code-block:: console + + ceph orch ps + +3. Run the upgrade script: + + .. code-block:: console + + $KAYOBE_CONFIG_PATH/../../tools/ubuntu-upgrade-overcloud.sh + +4. If the controller is running Ceph OSD services: + + 1. Make sure the cephadm public key is in ``authorized_keys`` for stack or + root user - depends on your setup. For example, your SSH key may + already be defined in ``users.yml``. If in doubt, run the cephadm + deploy playbook to copy the SSH key and install the cephadm binary. + + .. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/cephadm-deploy.yml + + 2. Take the host out of maintenance mode: + + .. code-block:: console + + ceph orch host maintenance exit + + 3. Make sure that everything is back in working condition before moving + on to the next host: + + .. code-block:: console + + ceph -s + ceph -w + +5. Some RabbitMQ instability has been observed. Check the RabbitMQ dashboard + in Grafana if the cluster is unhealthy run the ``rabbitmq-reset`` playbook. + + .. code:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/rabbitmq-reset.yml + +After each controller has been upgraded you may wish to perform some smoke +testing, run Tempest, check for alerts and errors etc. + +Compute +======= + +Compute nodes can be upgraded in batches. +The possible batches depend on: + +* willingness for instance reboots and downtime +* available spare hypervisor capacity +* sizes of groups of compatible hypervisors + +Potential issues +---------------- + +- VMs cannot be live migrated between Focal and Jammy hypervisors using AMD + CPUs. 
Any affected VMs must be cold-migrated. It may be possible to disable + ``xsave``, reboot the VM, then live-migrate, however this process has not + been tested. + +Full procedure for one batch of hosts +------------------------------------- + +1. Export the ``KAYOBE_PATH`` environment variable e.g. + + .. code-block:: console + + export KAYOBE_PATH=~/src/kayobe + +2. Disable the Nova compute service and drain it of VMs using live migration. + If any VMs fail to migrate, they may be cold migrated or powered off: + + .. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/nova-compute-{disable,drain}.yml --limit + +3. If the compute node is running Ceph OSD services: + + 1. Set host in maintenance mode: + + .. code-block:: console + + ceph orch host maintenance enter + + 2. Check there's nothing remaining on the host: + + .. code-block:: console + + ceph orch ps + +4. Run the upgrade script: + + .. code-block:: console + + $KAYOBE_CONFIG_PATH/../../tools/ubuntu-upgrade-overcloud.sh + +5. If the compute node is running Ceph OSD services: + + 1. Make sure the cephadm public key is in ``authorized_keys`` for stack or + root user - depends on your setup. For example, your SSH key may + already be defined in ``users.yml`` . If in doubt, run the cephadm + deploy playbook to copy the SSH key and install the cephadm binary. + + .. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/cephadm-deploy.yml + + 2. Take the host out of maintenance mode: + + .. code-block:: console + + ceph orch host maintenance exit + + 3. Make sure that everything is back in working condition before moving + on to the next host: + + .. code-block:: console + + ceph -s + ceph -w + +6. Restore the system to full health. + + 1. If any VMs were powered off, they may now be powered back on. + + 2. Wait for Prometheus alerts and errors in Kibana/OpenSearch Dashboard to + resolve, or address them. + + 3. 
Once happy that the system has been restored to full health, enable the + hypervisor in Nova if it is still disabled and then move on to the next + host or batch of hosts. + + .. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/nova-compute-enable.yml --limit <hypervisor> + +Storage +======= + +Potential issues +---------------- + +- It is recommended that you upgrade the bootstrap host last. +- Before upgrading the bootstrap host, it can be beneficial to backup + ``/etc/ceph`` and ``/var/lib/ceph``, as sometimes the keys, config, etc. + stored here will not be moved/recreated correctly. +- When a host is taken out of maintenance, you may see errors relating to + permissions of /tmp/etc and /tmp/var. These issues should be resolved in + Ceph version 17.2.7. See issue: https://github.com/ceph/ceph/pull/50736. In + the meantime, you can work around this by running the command below. You may + need to omit one or the other of ``/tmp/etc`` and ``/tmp/var``. You will + likely need to run this multiple times. Run ``ceph -W cephadm`` to monitor + the logs and see when permissions issues are hit. + + .. code-block:: console + + kayobe overcloud host command run --command "chown -R stack:stack /tmp/etc /tmp/var" -b -l storage + +- It has been seen that sometimes the Ceph containers do not come up after + upgrading. This seems to be related to having ``/var/lib/ceph`` persisted + through the reprovision (e.g. seen at a customer in a volume with software + RAID). Further investigation is needed for the root cause. When this + occurs, you will need to redeploy the daemons: + + List the daemons on the host: + + .. code-block:: console + + ceph orch ps + + Redeploy the daemons, one at a time. It is recommended that you start with + the crash daemon, as this will have the least impact if unexpected issues + occur. + + .. code-block:: console + + ceph orch daemon redeploy <daemon name> to redeploy a daemon. 
+ +- Commands starting with ``ceph`` are all run on the cephadm bootstrap + host in a cephadm shell unless stated otherwise. + +Full procedure for a storage host +--------------------------------- + +1. Export the ``KAYOBE_PATH`` environment variable e.g. + + .. code-block:: console + + export KAYOBE_PATH=~/src/kayobe + +2. Set host in maintenance mode: + + .. code-block:: console + + ceph orch host maintenance enter + +3. Check there's nothing remaining on the host: + + .. code-block:: console + + ceph orch ps + +4. Run the upgrade script: + + .. code-block:: console + + $KAYOBE_CONFIG_PATH/../../tools/ubuntu-upgrade-overcloud.sh + +5. Make sure the cephadm public key is in ``authorized_keys`` for stack or + root user - depends on your setup. For example, your SSH key may + already be defined in ``users.yml``. If in doubt, run the cephadm + deploy playbook to copy the SSH key and install the cephadm binary. + + .. code-block:: console + + kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/cephadm-deploy.yml + +6. Take the host out of maintenance mode: + + .. code-block:: console + + ceph orch host maintenance exit + +7. Make sure that everything is back in working condition before moving + on to the next host: + + .. code-block:: console + + ceph -s + ceph -w + +Seed +==== + +Potential issues +---------------- + +- The process has not been tested as well as for other hosts. Proceed with + caution. +- The Seed can take significantly longer to upgrade than other hosts. + ``do-release-upgrade`` has been observed taking more than 45 minutes to + complete. + +Full procedure +-------------- + +1. Export the ``KAYOBE_PATH`` environment variable e.g. + + .. code-block:: console + + export KAYOBE_PATH=~/src/kayobe + +2. Run the upgrade script: + + .. 
code-block:: console + + $KAYOBE_CONFIG_PATH/../../tools/ubuntu-upgrade-seed.sh + +Wazuh manager +============= + +TODO + +Seed hypervisor +=============== + +TODO + +Ansible control host +==================== + +TODO + +Manual Process +============== + +Sometimes it is necessary to upgrade a system that is not managed by Kayobe +(and therefore does not use packages from pulp). Below is a set of instructions +to manually execute the upgrade process. + +Full procedure +-------------- + +1. Update all packages to the latest available versions + + .. code-block:: console + + sudo apt update -y && sudo apt upgrade -y + +2. Install the upgrade tool + + .. code-block:: console + + sudo apt install ubuntu-release-upgrader-core + +3. Check whether a reboot is required + + .. code-block:: console + + cat /var/run/reboot-required + +4. Where required, reboot to apply updates + + .. code-block:: console + + sudo reboot + +5. Run ``do-release-upgrade`` + + .. code-block:: console + + do-release-upgrade -f DistUpgradeViewNonInteractive + +6. Reboot to apply the upgrade + + .. 
code-block:: console + + sudo reboot From 27ebaa5fd65d6e940b9fa3aa9cd3ebb4543aef6b Mon Sep 17 00:00:00 2001 From: Seunghun Lee Date: Thu, 15 Feb 2024 10:20:22 +0000 Subject: [PATCH 09/11] Set kolla_build_neutron_ovs to true if regex empty --- .github/workflows/stackhpc-container-image-build.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/stackhpc-container-image-build.yml b/.github/workflows/stackhpc-container-image-build.yml index b8afea93e..af9b3cc79 100644 --- a/.github/workflows/stackhpc-container-image-build.yml +++ b/.github/workflows/stackhpc-container-image-build.yml @@ -171,6 +171,9 @@ jobs: if ${{ inputs.push }} == 'true'; then args="$args --push" fi + if [[ ${{ github.event.inputs.regexes }} == "" ]]; then + args="$args -e kolla_build_neutron_ovs=true" + fi source venvs/kayobe/bin/activate && source src/kayobe-config/kayobe-env --environment ci-builder && kayobe overcloud container image build $args From a9a39806b1fa10d2321d34452fbe00bbe50b0136 Mon Sep 17 00:00:00 2001 From: Seunghun Lee Date: Wed, 21 Feb 2024 13:30:00 +0000 Subject: [PATCH 10/11] Add missing quotes --- .github/workflows/stackhpc-container-image-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stackhpc-container-image-build.yml b/.github/workflows/stackhpc-container-image-build.yml index af9b3cc79..be09f8b4f 100644 --- a/.github/workflows/stackhpc-container-image-build.yml +++ b/.github/workflows/stackhpc-container-image-build.yml @@ -171,7 +171,7 @@ jobs: if ${{ inputs.push }} == 'true'; then args="$args --push" fi - if [[ ${{ github.event.inputs.regexes }} == "" ]]; then + if [[ "${{ github.event.inputs.regexes }}" == "" ]]; then args="$args -e kolla_build_neutron_ovs=true" fi source venvs/kayobe/bin/activate && From 2249baf6d3c76bfa9940b927fc7e7571c5902fdd Mon Sep 17 00:00:00 2001 From: Seunghun Lee Date: Wed, 21 Feb 2024 15:14:43 +0000 Subject: [PATCH 11/11] Change the variable definition location --- 
.github/workflows/stackhpc-container-image-build.yml | 3 --- etc/kayobe/environments/ci-builder/stackhpc-ci.yml | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/stackhpc-container-image-build.yml b/.github/workflows/stackhpc-container-image-build.yml index be09f8b4f..b8afea93e 100644 --- a/.github/workflows/stackhpc-container-image-build.yml +++ b/.github/workflows/stackhpc-container-image-build.yml @@ -171,9 +171,6 @@ jobs: if ${{ inputs.push }} == 'true'; then args="$args --push" fi - if [[ "${{ github.event.inputs.regexes }}" == "" ]]; then - args="$args -e kolla_build_neutron_ovs=true" - fi source venvs/kayobe/bin/activate && source src/kayobe-config/kayobe-env --environment ci-builder && kayobe overcloud container image build $args diff --git a/etc/kayobe/environments/ci-builder/stackhpc-ci.yml b/etc/kayobe/environments/ci-builder/stackhpc-ci.yml index 868668e33..27ac8d858 100644 --- a/etc/kayobe/environments/ci-builder/stackhpc-ci.yml +++ b/etc/kayobe/environments/ci-builder/stackhpc-ci.yml @@ -29,6 +29,7 @@ kolla_enable_ovn: true kolla_enable_prometheus: true kolla_enable_redis: true kolla_enable_skydive: true +kolla_build_neutron_ovs: true ############################################################################### # StackHPC configuration.