From 3cfce2fb14e528861f53b79aed7b4785340bf452 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Fri, 8 Jul 2022 15:52:07 +0100 Subject: [PATCH 1/5] Move included files to an excluded directory This avoids weird issues with duplicate labels and headings --- source/ceph_storage.rst | 4 ++-- source/conf.py | 2 +- source/{ => include}/ceph_ansible.rst | 0 source/{ => include}/ceph_troubleshooting.rst | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename source/{ => include}/ceph_ansible.rst (100%) rename source/{ => include}/ceph_troubleshooting.rst (100%) diff --git a/source/ceph_storage.rst b/source/ceph_storage.rst index 0f468e3..241bb11 100644 --- a/source/ceph_storage.rst +++ b/source/ceph_storage.rst @@ -21,9 +21,9 @@ Ceph Storage Ceph Ansible ============ - .. include:: ceph_ansible.rst + .. include:: include/ceph_ansible.rst Ceph Troubleshooting ==================== - .. include:: ceph_troubleshooting.rst + .. include:: include/ceph_troubleshooting.rst diff --git a/source/conf.py b/source/conf.py index 31f4f03..96f97bf 100644 --- a/source/conf.py +++ b/source/conf.py @@ -39,7 +39,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = [] +exclude_patterns = ['include/*'] # -- Options for HTML output ------------------------------------------------- diff --git a/source/ceph_ansible.rst b/source/include/ceph_ansible.rst similarity index 100% rename from source/ceph_ansible.rst rename to source/include/ceph_ansible.rst diff --git a/source/ceph_troubleshooting.rst b/source/include/ceph_troubleshooting.rst similarity index 100% rename from source/ceph_troubleshooting.rst rename to source/include/ceph_troubleshooting.rst From c69002e8e1bda7a2c46c40f4f8e646db713ed1db Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Fri, 8 Jul 2022 15:50:37 +0100 Subject: [PATCH 2/5] Add baremetal management section Uses new deployment flags to make inclusion optional. Co-Authored-By: Will Szumski --- source/baremetal_management.rst | 18 ++ source/data/deployment.yml | 9 + source/include/baremetal_management.rst | 268 ++++++++++++++++++++++++ source/index.rst | 1 + source/vars.rst | 1 + 5 files changed, 297 insertions(+) create mode 100644 source/baremetal_management.rst create mode 100644 source/include/baremetal_management.rst diff --git a/source/baremetal_management.rst b/source/baremetal_management.rst new file mode 100644 index 0000000..3df9f1b --- /dev/null +++ b/source/baremetal_management.rst @@ -0,0 +1,18 @@ +.. include:: vars.rst + +====================================== +Bare Metal Compute Hardware Management +====================================== + +.. ifconfig:: deployment['ironic'] + + The |project_name| cloud includes bare metal compute nodes managed by the + Ironic services. This section describes elements of the configuration of + this service. + + .. include:: include/baremetal_management.rst + +.. ifconfig:: not deployment['ironic'] + + The |project_name| cloud does not include bare metal compute nodes managed + by the Ironic services. 
diff --git a/source/data/deployment.yml b/source/data/deployment.yml index e4bb386..209d2fd 100644 --- a/source/data/deployment.yml +++ b/source/data/deployment.yml @@ -7,3 +7,12 @@ ceph_ansible: false # Whether the Ceph deployment is managed by StackHPC ceph_managed: false + +# Whether the OpenStack deployment includes Ironic for bare metal compute. +ironic: false + +# Whether Ironic automated cleaning is enabled. +ironic_automated_cleaning: true + +# Whether Kayobe manages physical network devices. +kayobe_manages_physical_network: true diff --git a/source/include/baremetal_management.rst b/source/include/baremetal_management.rst new file mode 100644 index 0000000..87d6f93 --- /dev/null +++ b/source/include/baremetal_management.rst @@ -0,0 +1,268 @@ +.. _ironic-node-lifecycle: + +Ironic node life cycle +---------------------- + +The deployment process is documented in the `Ironic User Guide `__. +The |project_name| OpenStack deployment uses the +`direct deploy method `__. + +The Ironic state machine can be found `here `__. The rest of +this documentation refers to these states and assumes that you have familiarity. + +High level overview of state transitions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following section attempts to describe the state transitions for various Ironic operations at a high level. +It focuses on trying to describe the steps where dynamic switch reconfiguration is triggered. +For a more detailed overview, refer to the :ref:`ironic-node-lifecycle` section. + +Provisioning +~~~~~~~~~~~~ + +Provisioning starts when an instance is created in Nova using a bare metal flavor. 
+ +- Node starts in the available state (available) +- User provisions an instance (deploying) +- Ironic will switch the node onto the provisioning network (deploying) +- Ironic will power on the node and will await a callback (wait-callback) +- Ironic will image the node with an operating system using the image provided at creation (deploying) +- Ironic switches the node onto the tenant network(s) via neutron (deploying) +- Transition node to active state (active) + +.. _baremetal-management-deprovisioning: + +Deprovisioning +~~~~~~~~~~~~~~ + +Deprovisioning starts when an instance created in Nova using a bare metal flavor is destroyed. + +.. ifconfig:: deployment['ironic_automated_cleaning'] + + Automated cleaning is enabled, and occurs when nodes are deprovisioned. + + - Node starts in active state (active) + - User deletes instance (deleting) + - Ironic will remove the node from any tenant network(s) (deleting) + - Ironic will switch the node onto the cleaning network (deleting) + - Ironic will power on the node and will await a callback (clean-wait) + - Node boots into Ironic Python Agent and issues callback, Ironic starts cleaning (cleaning) + - Ironic removes node from cleaning network (cleaning) + - Node transitions to available (available) + +.. ifconfig:: not deployment['ironic_automated_cleaning'] + + Automated cleaning is currently disabled. + + - Node starts in active state (active) + - User deletes instance (deleting) + - Ironic will remove the node from any tenant network(s) (deleting) + - Node transitions to available (available) + +Cleaning +~~~~~~~~ + +Manual cleaning is not part of the regular state transitions when using Nova, however nodes can be manually cleaned by administrators. 
+ +- Node starts in the manageable state (manageable) +- User triggers cleaning with API (cleaning) +- Ironic will switch the node onto the cleaning network (cleaning) +- Ironic will power on the node and will await a callback (clean-wait) +- Node boots into Ironic Python Agent and issues callback, Ironic starts cleaning (cleaning) +- Ironic removes node from cleaning network (cleaning) +- Node transitions back to the manageable state (manageable) + +.. ifconfig:: deployment['ironic_automated_cleaning'] + + See :ref:`baremetal-management-deprovisioning` for information about + automated cleaning. + +Rescuing +~~~~~~~~ + +Feature not used. The required rescue network is not currently configured. + +Baremetal networking +-------------------- + +Baremetal networking with the Neutron Networking Generic Switch ML2 driver requires a combination of static and dynamic switch configuration. + +.. _static-switch-config: + +Static switch configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ifconfig:: deployment['kayobe_manages_physical_network'] + + Static physical network configuration is managed via Kayobe. + + .. TODO: Fill in the switch configuration + + - Some initial switch configuration is required before networking generic switch can take over the management of an interface. + First, LACP must be configured on the switch ports attached to the baremetal node, e.g.: + + .. code-block:: shell + + The interface is then partially configured: + + .. code-block:: shell + + For :ref:`ironic-node-discovery` to work, you need to manually switch the port to the provisioning network: + + .. code-block:: shell + + **NOTE**: You only need to do this if Ironic isn't aware of the node. + + Configuration with kayobe + ^^^^^^^^^^^^^^^^^^^^^^^^^ + + Kayobe can be used to apply the :ref:`static-switch-config`. + + - Upstream documentation can be found `here `__. + - Kayobe does all the switch configuration that isn't :ref:`dynamically updated using Ironic <dynamic-switch-configuration>`. 
+ - Optionally switches the node onto the provisioning network (when using ``--enable-discovery``) + + + NOTE: This is a dangerous operation as it can wipe out the dynamic VLAN configuration applied by neutron/ironic. + You should only run this when initially enrolling a node. It is possible to use the ``interface-description-limit``. For example: + + .. code-block:: + + kayobe physical network configure --interface-description-limit --group switches --display --enable-discovery + + In this example, ``--display`` is used to preview the switch configuration without applying it. + + .. TODO: Fill in information about how switches are configured in kayobe-config, with links + + - Configuration is done using a combination of ``group_vars`` and ``host_vars`` + +.. ifconfig:: not deployment['kayobe_manages_physical_network'] + + .. TODO: Fill in details about how physical network configuration is managed. + + Static physical network configuration is not managed via Kayobe. + +.. _dynamic-switch-configuration: + +Dynamic switch configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ironic dynamically configures the switches using the Neutron `Networking Generic Switch `_ ML2 driver. + +- Used to toggle the baremetal nodes onto different networks + + + Can use any VLAN network defined in OpenStack, providing that the VLAN has been trunked to the controllers + as this is required for DHCP to function. + + See :ref:`ironic-node-lifecycle`. This attempts to illustrate when any switch reconfigurations happen. + +- Only configures VLAN membership of the switch interfaces or port groups. To prevent conflicts with the static switch configuration, + the convention used is: after the node is in service in Ironic, VLAN membership should not be manually adjusted and + should be left to be controlled by ironic i.e *don't* use ``--enable-discovery`` without a limit when configuring the + switches with kayobe. +- Ironic is configured to use the neutron networking driver. + +.. 
_ngs-commands: + +Commands that NGS will execute +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Networking Generic Switch is mainly concerned with toggling the ports onto different VLANs. It +cannot fully configure the switch. + +.. TODO: Fill in the switch configuration + +- Switching the port onto the provisioning network + + .. code-block:: shell + +- Switching the port onto the tenant network. + + .. code-block:: shell + +- When deleting the instance, the VLANs are removed from the port. Using: + + .. code-block:: shell + +NGS will save the configuration after each reconfiguration (by default). + +Ports managed by NGS +^^^^^^^^^^^^^^^^^^^^ + +The command below extracts a list of port UUID, node UUID and switch port information. + +.. code-block:: bash + + admin# openstack baremetal port list --field uuid --field node_uuid --field local_link_connection --format value + +NGS will manage VLAN membership for ports when the ``local_link_connection`` fields match one of the switches in ``ml2_conf.ini``. +The rest of the switch configuration is static. +The switch configuration that NGS will apply to these ports is detailed in :ref:`dynamic-switch-configuration`. + +.. _ironic-node-discovery: + +Ironic node discovery +--------------------- + +Discovery is the process of PXE booting the nodes into the Ironic Python Agent (IPA) ramdisk. This ramdisk will collect hardware and networking configuration from the node in a process known as introspection. This data is used to populate the baremetal node object in Ironic. The series of steps you need to take to enrol a new node is as follows: + +- Configure credentials on the |bmc|. These are needed for Ironic to be able to perform power control actions. + +- Controllers should have network connectivity with the target |bmc|. + +.. ifconfig:: deployment['kayobe_manages_physical_network'] + + - Add any additional switch configuration to kayobe config. 
+ The minimal switch configuration that kayobe needs to know about is described in :ref:`tor-switch-configuration`. + +- Apply any :ref:`static switch configuration <static-switch-config>`. This performs the initial + setup of the switchports that is needed before Ironic can take over. The static configuration + will not be modified by Ironic, so it should be safe to reapply at any point. See :ref:`ngs-commands` + for details about the switch configuration that Networking Generic Switch will apply. + +.. ifconfig:: deployment['kayobe_manages_physical_network'] + + - Put the node onto the provisioning network by using the ``--enable-discovery`` flag. See :ref:`static-switch-config`. + + * This is only necessary to initially discover the node. Once the node is registered in Ironic, + it will take over control of the VLAN membership. See :ref:`dynamic-switch-configuration`. + + * This provides ethernet connectivity with the controllers over the `workload provisioning` network + +.. ifconfig:: not deployment['kayobe_manages_physical_network'] + + - Put the node onto the provisioning network. + +.. TODO: link to the relevant file in kayobe config + +- Add node to the kayobe inventory. + +.. TODO: Fill in details about necessary BIOS & RAID config + +- Apply any necessary BIOS & RAID configuration. + +.. TODO: Fill in details about how to trigger a PXE boot + +- PXE boot the node. + +.. _tor-switch-configuration: + +Top of Rack (ToR) switch configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Networking Generic Switch must be aware of the Top-of-Rack switch connected to the new node. +Switches managed by NGS are configured in ``ml2_conf.ini``. + +.. TODO: Fill in details about how switches are added to NGS config in kayobe-config + +After adding switches to the NGS configuration, Neutron must be redeployed. 
+ +Considerations when booting baremetal compared to VMs +------------------------------------------------------ + +- You can only use networks of type: vlan +- Without using trunk ports, it is only possible to directly attach one network to each port or port group of an instance. + + * To access other networks you can use routers + * You can still attach floating IPs + +- Instances take much longer to provision (expect at least 15 mins) +- When booting an instances use one of the flavors that maps to a baremetal node via the RESOURCE_CLASS configured on the flavor. diff --git a/source/index.rst b/source/index.rst index bce742a..05cdd86 100644 --- a/source/index.rst +++ b/source/index.rst @@ -26,6 +26,7 @@ Contents operations_and_monitoring customising_deployment gpus_in_openstack + baremetal_management rally_and_tempest Indices and search diff --git a/source/vars.rst b/source/vars.rst index 41bdeb5..b95f4ce 100644 --- a/source/vars.rst +++ b/source/vars.rst @@ -1,5 +1,6 @@ .. |alertmanager_url| replace:: https://openstack.acme.example:9093 .. |base_path| replace:: ~/kayobe-env +.. |bmc| replace:: BMC .. |chat_system| replace:: Slack .. |control_host_access| replace:: |control_host| is used as the Ansible control host. Each operator uses their own account on this host, but with a shared SSH key stored as ``~/.ssh/id_rsa``. .. 
|control_host| replace:: acme-seed-hypervisor From d713993ffc72e610bc936b597b0235aff6c9d1ed Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Fri, 8 Jul 2022 17:14:37 +0100 Subject: [PATCH 3/5] CI: Add workflow to build the docs --- .github/workflows/pull_request.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/workflows/pull_request.yml diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml new file mode 100644 index 0000000..42116ec --- /dev/null +++ b/.github/workflows/pull_request.yml @@ -0,0 +1,20 @@ +--- +name: Build OpenStack admin guide +on: + - pull_request +jobs: + build: + name: Build OpenStack admin guide + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.x' + + - name: Install Python dependencies + run: pip3 install -r requirements.txt + + - name: Build HTML + run: make html From 2a140c9add3dbe1aa064d57e591fbc8a63a0d475 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Fri, 8 Jul 2022 17:24:31 +0100 Subject: [PATCH 4/5] CI: Add matrix for several deployment configs --- .github/workflows/deployment_yaml/disable-ceph.yml | 7 +++++++ .../enable-ironic-no-cleaning-or-physnet.yml | 7 +++++++ .github/workflows/deployment_yaml/enable-ironic.yml | 7 +++++++ .github/workflows/pull_request.yml | 11 +++++++++++ 4 files changed, 32 insertions(+) create mode 100644 .github/workflows/deployment_yaml/disable-ceph.yml create mode 100644 .github/workflows/deployment_yaml/enable-ironic-no-cleaning-or-physnet.yml create mode 100644 .github/workflows/deployment_yaml/enable-ironic.yml diff --git a/.github/workflows/deployment_yaml/disable-ceph.yml b/.github/workflows/deployment_yaml/disable-ceph.yml new file mode 100644 index 0000000..d80388f --- /dev/null +++ b/.github/workflows/deployment_yaml/disable-ceph.yml @@ -0,0 +1,7 @@ +--- +ceph: false +ceph_ansible: false +ceph_managed: false +ironic: false 
+ironic_automated_cleaning: true +kayobe_manages_physical_network: true diff --git a/.github/workflows/deployment_yaml/enable-ironic-no-cleaning-or-physnet.yml b/.github/workflows/deployment_yaml/enable-ironic-no-cleaning-or-physnet.yml new file mode 100644 index 0000000..3f4b5da --- /dev/null +++ b/.github/workflows/deployment_yaml/enable-ironic-no-cleaning-or-physnet.yml @@ -0,0 +1,7 @@ +--- +ceph: true +ceph_ansible: false +ceph_managed: false +ironic: true +ironic_automated_cleaning: false +kayobe_manages_physical_network: false diff --git a/.github/workflows/deployment_yaml/enable-ironic.yml b/.github/workflows/deployment_yaml/enable-ironic.yml new file mode 100644 index 0000000..29038b8 --- /dev/null +++ b/.github/workflows/deployment_yaml/enable-ironic.yml @@ -0,0 +1,7 @@ +--- +ceph: true +ceph_ansible: false +ceph_managed: false +ironic: true +ironic_automated_cleaning: true +kayobe_manages_physical_network: true diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 42116ec..f6dc71c 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -6,6 +6,13 @@ jobs: build: name: Build OpenStack admin guide runs-on: ubuntu-latest + strategy: + matrix: + deployment_yaml: + - __default__ + - disable-ceph + - enable-ironic + - enable-ironic-no-cleaning-or-physnet steps: - uses: actions/checkout@v3 @@ -16,5 +23,9 @@ jobs: - name: Install Python dependencies run: pip3 install -r requirements.txt + - name: Copy deployment.yml into place + run: cp .github/workflows/deployment_yaml/${{ matrix.deployment_yaml }}.yml source/data/deployment.yml + if: matrix.deployment_yaml != '__default__' + - name: Build HTML run: make html From 3bd135f0a2df869efa2f33579332048d11c2f92f Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Fri, 8 Jul 2022 17:48:16 +0100 Subject: [PATCH 5/5] Baremetal: fix up grammar --- source/include/baremetal_management.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/source/include/baremetal_management.rst b/source/include/baremetal_management.rst index 87d6f93..8fe4967 100644 --- a/source/include/baremetal_management.rst +++ b/source/include/baremetal_management.rst @@ -265,4 +265,4 @@ Considerations when booting baremetal compared to VMs * You can still attach floating IPs - Instances take much longer to provision (expect at least 15 mins) -- When booting an instances use one of the flavors that maps to a baremetal node via the RESOURCE_CLASS configured on the flavor. +- When booting an instance use one of the flavors that maps to a baremetal node via the RESOURCE_CLASS configured on the flavor.