From 506030cf2c7403af52132cb80c1141a91fda873a Mon Sep 17 00:00:00 2001 From: haseeb Date: Thu, 7 Aug 2025 20:16:56 +0530 Subject: [PATCH] feat: patch https://review.opendev.org/c/openstack/ironic/+/954311 Skip initial reboot to IPA when updating firmware out-of-band --- .../0002_skip_reboot_firmware_update.patch | 331 ++++++++++++++++++ containers/ironic/patches/series | 1 + 2 files changed, 332 insertions(+) create mode 100644 containers/ironic/patches/0002_skip_reboot_firmware_update.patch diff --git a/containers/ironic/patches/0002_skip_reboot_firmware_update.patch b/containers/ironic/patches/0002_skip_reboot_firmware_update.patch new file mode 100644 index 000000000..1c9bcbe30 --- /dev/null +++ b/containers/ironic/patches/0002_skip_reboot_firmware_update.patch @@ -0,0 +1,331 @@ +From 1b2c01185c7907a1d17c4d90cf3bcd73fe345658 Mon Sep 17 00:00:00 2001 +From: Jacob Anders +Date: Tue, 08 Jul 2025 16:25:40 +1000 +Subject: [PATCH] Skip initial reboot to IPA when updating firmware out-of-band + +This change enables Ironic to skip initial reboot to IPA when +performing out-of-band firmware. + +Change-Id: Id055a4ddbde3dbe336717e5f06ca6eb024b90c9f +Signed-off-by: Jacob Anders +--- + +diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py +index 8b385d7..6aaa4a5 100644 +--- a/ironic/conductor/servicing.py ++++ b/ironic/conductor/servicing.py +@@ -38,6 +38,7 @@ + :param disable_ramdisk: Whether to skip booting ramdisk for servicing. + """ + node = task.node ++ ramdisk_needed = None + try: + # NOTE(ghe): Valid power and network values are needed to perform + # a service operation. +@@ -54,42 +55,53 @@ + node.set_driver_internal_info('service_disable_ramdisk', + disable_ramdisk) + task.node.save() +- + utils.node_update_cache(task) + +- # Allow the deploy driver to set up the ramdisk again (necessary for IPA) +- try: +- if not disable_ramdisk: +- prepare_result = task.driver.deploy.prepare_service(task) +- else: +- LOG.info('Skipping preparing for service in-band service since ' +- 'out-of-band only service has been requested for node ' +- '%s', node.uuid) +- prepare_result = None +- except Exception as e: +- msg = (_('Failed to prepare node %(node)s for service: %(e)s') +- % {'node': node.uuid, 'e': e}) +- return utils.servicing_error_handler(task, msg, traceback=True) +- +- if prepare_result == states.SERVICEWAIT: +- # Prepare is asynchronous, the deploy driver will need to +- # set node.driver_internal_info['service_steps'] and +- # node.service_step and then make an RPC call to +- # continue_node_service to start service operations. +- task.process_event('wait') +- return + try: + conductor_steps.set_node_service_steps( + task, disable_ramdisk=disable_ramdisk) ++ except exception.InvalidParameterValue: ++ if disable_ramdisk: ++ # NOTE(janders) raising log severity since this will result ++ # in servicing failure ++ LOG.error('Failed to compose list of service steps for node ' ++ '%s', node.uuid) ++ raise ++ LOG.debug('Unable to compose list of service steps for node %(node)s ' ++ 'will retry once ramdisk is booted.', {'node': node.uuid}) ++ ramdisk_needed = True + except Exception as e: +- # Catch all exceptions and follow the error handling +- # path so things are cleaned up properly. + msg = (_('Cannot service node %(node)s: %(msg)s') + % {'node': node.uuid, 'msg': e}) + return utils.servicing_error_handler(task, msg) + + steps = node.driver_internal_info.get('service_steps', []) + step_index = 0 if steps else None ++ for step in steps: ++ step_requires_ramdisk = step.get('requires_ramdisk') ++ if step_requires_ramdisk: ++ LOG.debug('Found service step %s requiring ramdisk ' ++ 'for node %s', step, task.node.uuid) ++ ramdisk_needed = True ++ ++ if ramdisk_needed and not disable_ramdisk: ++ try: ++ prepare_result = task.driver.deploy.prepare_service(task) ++ except Exception as e: ++ msg = (_('Failed to prepare node %(node)s for service: %(e)s') ++ % {'node': node.uuid, 'e': e}) ++ return utils.servicing_error_handler(task, msg, traceback=True) ++ if prepare_result == states.SERVICEWAIT: ++ # Prepare is asynchronous, the deploy driver will need to ++ # set node.driver_internal_info['service_steps'] and ++ # node.service_step and then make an RPC call to ++ # continue_node_service to start service operations. ++ task.process_event('wait') ++ return ++ else: ++ LOG.debug('Will proceed with servicing node %(node)s ' ++ 'without booting the ramdisk.', {'node': node.uuid}) ++ + do_next_service_step(task, step_index, disable_ramdisk=disable_ramdisk) + + +diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py +index 39f75fbe..9112e6d 100644 +--- a/ironic/conductor/steps.py ++++ b/ironic/conductor/steps.py +@@ -496,9 +496,11 @@ + are accepted. + :raises: InvalidParameterValue if there is a problem with the user's + clean steps. +- :raises: NodeCleaningFailure if there was a problem getting the ++ :raises: NodeServicingFailure if there was a problem getting the + service steps. + """ ++ # FIXME(janders) it seems NodeServicingFailure is never actually raised, ++ # perhaps we should remove it? + node = task.node + steps = _validate_user_service_steps( + task, node.driver_internal_info.get('service_steps', []), +diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py +index 8afb909..f501b33 100644 +--- a/ironic/drivers/modules/deploy_utils.py ++++ b/ironic/drivers/modules/deploy_utils.py +@@ -1750,7 +1750,7 @@ + task.driver.boot.prepare_ramdisk(task, deploy_opts) + + +-def reboot_to_finish_step(task, timeout=None): ++def reboot_to_finish_step(task, timeout=None, disable_ramdisk=None): + """Reboot the node into IPA to finish a deploy/clean step. + + :param task: a TaskManager instance. +@@ -1759,8 +1759,14 @@ + :returns: states.CLEANWAIT if cleaning operation in progress + or states.DEPLOYWAIT if deploy operation in progress. + """ +- disable_ramdisk = task.node.driver_internal_info.get( +- 'cleaning_disable_ramdisk') ++ if disable_ramdisk is None: ++ if task.node.provision_state in [states.CLEANING, states.CLEANWAIT]: ++ disable_ramdisk = task.node.driver_internal_info.get( ++ 'cleaning_disable_ramdisk') ++ elif task.node.provision_state in [states.SERVICING, ++ states.SERVICEWAIT]: ++ disable_ramdisk = task.node.driver_internal_info.get( ++ 'service_disable_ramdisk') + if not disable_ramdisk: + if (manager_utils.is_fast_track(task) + and not task.node.disable_power_off): +diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py +index 2500cd5..555a921 100644 +--- a/ironic/drivers/modules/redfish/firmware.py ++++ b/ironic/drivers/modules/redfish/firmware.py +@@ -146,7 +146,7 @@ + requires_ramdisk=True) + @base.service_step(priority=0, abortable=False, + argsinfo=_FW_SETTINGS_ARGSINFO, +- requires_ramdisk=True) ++ requires_ramdisk=False) + @base.cache_firmware_components + def update(self, task, settings): + """Update the Firmware on the node using the settings for components. +@@ -181,7 +181,8 @@ + polling=True + ) + +- return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval) ++ return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval, ++ disable_ramdisk=True) + + def _execute_firmware_update(self, node, update_service, settings): + """Executes the next firmware update to the node +diff --git a/ironic/tests/unit/conductor/test_servicing.py b/ironic/tests/unit/conductor/test_servicing.py +index 22c2612..b46394d 100644 +--- a/ironic/tests/unit/conductor/test_servicing.py ++++ b/ironic/tests/unit/conductor/test_servicing.py +@@ -109,14 +109,21 @@ + mock_validate.side_effect = exception.NetworkError() + self.__do_node_service_validate_fail(mock_validate) + ++ @mock.patch.object(conductor_steps, 'set_node_service_steps', ++ autospec=True) + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', + autospec=True) + def test__do_node_service_prepare_service_fail(self, mock_prep, + mock_validate, +- service_steps=None): +- # Exception from task.driver.deploy.prepare_cleaning should cause node ++ mock_steps, ++ service_steps=[]): ++ # NOTE(janders) after removing unconditional initial reboot into ++ # ramdisk, set_node_service_steps needs to InvalidParameterValue ++ # to force boot into ramdisk ++ mock_steps.side_effect = exception.InvalidParameterValue('error') ++ # Exception from task.driver.deploy.prepare_service should cause node + # to go to SERVICEFAIL + mock_prep.side_effect = exception.InvalidParameterValue('error') + tgt_prov_state = states.ACTIVE +@@ -135,17 +142,76 @@ + self.assertFalse(node.maintenance) + self.assertIsNone(node.fault) + ++ @mock.patch.object(conductor_steps, 'set_node_service_steps', ++ autospec=True) + @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', + autospec=True) + @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', + autospec=True) + def test__do_node_service_prepare_service_wait(self, mock_prep, +- mock_validate): ++ mock_validate, ++ mock_steps): + service_steps = [ + {'step': 'trigger_servicewait', 'priority': 10, + 'interface': 'vendor'} + ] ++ mock_steps.side_effect = exception.InvalidParameterValue('error') ++ mock_prep.return_value = states.SERVICEWAIT ++ tgt_prov_state = states.ACTIVE ++ node = obj_utils.create_test_node( ++ self.context, driver='fake-hardware', ++ provision_state=states.SERVICING, ++ target_provision_state=tgt_prov_state, ++ vendor_interface='fake') ++ with task_manager.acquire( ++ self.context, node.uuid, shared=False) as task: ++ servicing.do_node_service(task, service_steps=service_steps) ++ node.refresh() ++ self.assertEqual(states.SERVICEWAIT, node.provision_state) ++ self.assertEqual(tgt_prov_state, node.target_provision_state) ++ mock_prep.assert_called_once_with(mock.ANY, mock.ANY) ++ mock_validate.assert_called_once_with(mock.ANY, mock.ANY) + ++ @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', ++ autospec=True) ++ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', ++ autospec=True) ++ def test__do_node_service_out_of_band(self, mock_prep, ++ mock_validate): ++ # NOTE(janders) this test ensures ramdisk isn't prepared if ++ # steps do not require it ++ service_steps = [ ++ {'step': 'trigger_servicewait', 'priority': 10, ++ 'interface': 'vendor'} ++ ] ++ mock_prep.return_value = states.SERVICEWAIT ++ tgt_prov_state = states.ACTIVE ++ node = obj_utils.create_test_node( ++ self.context, driver='fake-hardware', ++ provision_state=states.SERVICING, ++ target_provision_state=tgt_prov_state, ++ vendor_interface='fake') ++ with task_manager.acquire( ++ self.context, node.uuid, shared=False) as task: ++ servicing.do_node_service(task, service_steps=service_steps) ++ node.refresh() ++ self.assertEqual(states.SERVICEWAIT, node.provision_state) ++ self.assertEqual(tgt_prov_state, node.target_provision_state) ++ mock_prep.assert_not_called() ++ mock_validate.assert_called_once_with(mock.ANY, mock.ANY) ++ ++ @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate', ++ autospec=True) ++ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service', ++ autospec=True) ++ def test__do_node_service_requires_ramdisk_fallback(self, mock_prep, ++ mock_validate): ++ # NOTE(janders): here we set 'requires_ramdisk': 'True' ++ # to force ramdisk_needed to be set and trigger prepare ramdisk ++ service_steps = [ ++ {'step': 'trigger_servicewait', 'priority': 10, ++ 'interface': 'vendor', 'requires_ramdisk': 'True'} ++ ] + mock_prep.return_value = states.SERVICEWAIT + tgt_prov_state = states.ACTIVE + node = obj_utils.create_test_node( +@@ -194,11 +260,8 @@ + @mock.patch.object(conductor_steps, 'set_node_service_steps', + autospec=True) + def __do_node_service_steps_fail(self, mock_steps, mock_validate, +- service_steps=None, invalid_exc=True): +- if invalid_exc: +- mock_steps.side_effect = exception.InvalidParameterValue('invalid') +- else: +- mock_steps.side_effect = exception.NodeCleaningFailure('failure') ++ service_steps=None): ++ mock_steps.side_effect = exception.NodeCleaningFailure('failure') + tgt_prov_state = states.ACTIVE + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', +@@ -224,12 +287,8 @@ + def test_do_node_service_steps_fail_poweroff(self, mock_steps, + mock_validate, + mock_power, +- service_steps=None, +- invalid_exc=True): +- if invalid_exc: +- mock_steps.side_effect = exception.InvalidParameterValue('invalid') +- else: +- mock_steps.side_effect = exception.NodeCleaningFailure('failure') ++ service_steps=None): ++ mock_steps.side_effect = exception.NodeCleaningFailure('failure') + tgt_prov_state = states.ACTIVE + self.config(poweroff_in_cleanfail=True, group='conductor') + node = obj_utils.create_test_node( +@@ -249,9 +308,7 @@ + self.assertFalse(mock_power.called) + + def test__do_node_service_steps_fail(self): +- for invalid in (True, False): +- self.__do_node_service_steps_fail(service_steps=[self.deploy_raid], +- invalid_exc=invalid) ++ self.__do_node_service_steps_fail(service_steps=[self.deploy_raid]) + + @mock.patch.object(conductor_steps, 'set_node_service_steps', + autospec=True) +diff --git a/releasenotes/notes/servicing-remove-initial-reboot-into-ramdisk-c1840524832435c2.yaml b/releasenotes/notes/servicing-remove-initial-reboot-into-ramdisk-c1840524832435c2.yaml +new file mode 100644 +index 0000000..296779a +--- /dev/null ++++ b/releasenotes/notes/servicing-remove-initial-reboot-into-ramdisk-c1840524832435c2.yaml +@@ -0,0 +1,7 @@ ++--- ++fixes: ++ - | ++ Removes initial, unconditional reboot into ramdisk during servicing when ++ not required by specified service steps. This reduces the total number of ++ reboots needed when performing servicing, speeding up the process. ++ diff --git a/containers/ironic/patches/series b/containers/ironic/patches/series index 2b61b766e..492d82d09 100644 --- a/containers/ironic/patches/series +++ b/containers/ironic/patches/series @@ -1 +1,2 @@ 0001_grub_config.patch +0002_skip_reboot_firmware_update.patch