Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
331 changes: 331 additions & 0 deletions containers/ironic/patches/0002_skip_reboot_firmware_update.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,331 @@
From 1b2c01185c7907a1d17c4d90cf3bcd73fe345658 Mon Sep 17 00:00:00 2001
From: Jacob Anders <janders@redhat.com>
Date: Tue, 08 Jul 2025 16:25:40 +1000
Subject: [PATCH] Skip initial reboot to IPA when updating firmware out-of-band

This change enables Ironic to skip initial reboot to IPA when
performing out-of-band firmware.

Change-Id: Id055a4ddbde3dbe336717e5f06ca6eb024b90c9f
Signed-off-by: Jacob Anders <janders@redhat.com>
---

diff --git a/ironic/conductor/servicing.py b/ironic/conductor/servicing.py
index 8b385d7..6aaa4a5 100644
--- a/ironic/conductor/servicing.py
+++ b/ironic/conductor/servicing.py
@@ -38,6 +38,7 @@
:param disable_ramdisk: Whether to skip booting ramdisk for servicing.
"""
node = task.node
+ ramdisk_needed = None
try:
# NOTE(ghe): Valid power and network values are needed to perform
# a service operation.
@@ -54,42 +55,53 @@
node.set_driver_internal_info('service_disable_ramdisk',
disable_ramdisk)
task.node.save()
-
utils.node_update_cache(task)

- # Allow the deploy driver to set up the ramdisk again (necessary for IPA)
- try:
- if not disable_ramdisk:
- prepare_result = task.driver.deploy.prepare_service(task)
- else:
- LOG.info('Skipping preparing for service in-band service since '
- 'out-of-band only service has been requested for node '
- '%s', node.uuid)
- prepare_result = None
- except Exception as e:
- msg = (_('Failed to prepare node %(node)s for service: %(e)s')
- % {'node': node.uuid, 'e': e})
- return utils.servicing_error_handler(task, msg, traceback=True)
-
- if prepare_result == states.SERVICEWAIT:
- # Prepare is asynchronous, the deploy driver will need to
- # set node.driver_internal_info['service_steps'] and
- # node.service_step and then make an RPC call to
- # continue_node_service to start service operations.
- task.process_event('wait')
- return
try:
conductor_steps.set_node_service_steps(
task, disable_ramdisk=disable_ramdisk)
+ except exception.InvalidParameterValue:
+ if disable_ramdisk:
+ # NOTE(janders) raising log severity since this will result
+ # in servicing failure
+ LOG.error('Failed to compose list of service steps for node '
+ '%s', node.uuid)
+ raise
+ LOG.debug('Unable to compose list of service steps for node %(node)s '
+ 'will retry once ramdisk is booted.', {'node': node.uuid})
+ ramdisk_needed = True
except Exception as e:
- # Catch all exceptions and follow the error handling
- # path so things are cleaned up properly.
msg = (_('Cannot service node %(node)s: %(msg)s')
% {'node': node.uuid, 'msg': e})
return utils.servicing_error_handler(task, msg)

steps = node.driver_internal_info.get('service_steps', [])
step_index = 0 if steps else None
+ for step in steps:
+ step_requires_ramdisk = step.get('requires_ramdisk')
+ if step_requires_ramdisk:
+ LOG.debug('Found service step %s requiring ramdisk '
+ 'for node %s', step, task.node.uuid)
+ ramdisk_needed = True
+
+ if ramdisk_needed and not disable_ramdisk:
+ try:
+ prepare_result = task.driver.deploy.prepare_service(task)
+ except Exception as e:
+ msg = (_('Failed to prepare node %(node)s for service: %(e)s')
+ % {'node': node.uuid, 'e': e})
+ return utils.servicing_error_handler(task, msg, traceback=True)
+ if prepare_result == states.SERVICEWAIT:
+ # Prepare is asynchronous, the deploy driver will need to
+ # set node.driver_internal_info['service_steps'] and
+ # node.service_step and then make an RPC call to
+ # continue_node_service to start service operations.
+ task.process_event('wait')
+ return
+ else:
+ LOG.debug('Will proceed with servicing node %(node)s '
+ 'without booting the ramdisk.', {'node': node.uuid})
+
do_next_service_step(task, step_index, disable_ramdisk=disable_ramdisk)


diff --git a/ironic/conductor/steps.py b/ironic/conductor/steps.py
index 39f75fbe..9112e6d 100644
--- a/ironic/conductor/steps.py
+++ b/ironic/conductor/steps.py
@@ -496,9 +496,11 @@
are accepted.
:raises: InvalidParameterValue if there is a problem with the user's
clean steps.
- :raises: NodeCleaningFailure if there was a problem getting the
+ :raises: NodeServicingFailure if there was a problem getting the
service steps.
"""
+ # FIXME(janders) it seems NodeServicingFailure is never actually raised,
+ # perhaps we should remove it?
node = task.node
steps = _validate_user_service_steps(
task, node.driver_internal_info.get('service_steps', []),
diff --git a/ironic/drivers/modules/deploy_utils.py b/ironic/drivers/modules/deploy_utils.py
index 8afb909..f501b33 100644
--- a/ironic/drivers/modules/deploy_utils.py
+++ b/ironic/drivers/modules/deploy_utils.py
@@ -1750,7 +1750,7 @@
task.driver.boot.prepare_ramdisk(task, deploy_opts)


-def reboot_to_finish_step(task, timeout=None):
+def reboot_to_finish_step(task, timeout=None, disable_ramdisk=None):
"""Reboot the node into IPA to finish a deploy/clean step.

:param task: a TaskManager instance.
@@ -1759,8 +1759,14 @@
:returns: states.CLEANWAIT if cleaning operation in progress
or states.DEPLOYWAIT if deploy operation in progress.
"""
- disable_ramdisk = task.node.driver_internal_info.get(
- 'cleaning_disable_ramdisk')
+ if disable_ramdisk is None:
+ if task.node.provision_state in [states.CLEANING, states.CLEANWAIT]:
+ disable_ramdisk = task.node.driver_internal_info.get(
+ 'cleaning_disable_ramdisk')
+ elif task.node.provision_state in [states.SERVICING,
+ states.SERVICEWAIT]:
+ disable_ramdisk = task.node.driver_internal_info.get(
+ 'service_disable_ramdisk')
if not disable_ramdisk:
if (manager_utils.is_fast_track(task)
and not task.node.disable_power_off):
diff --git a/ironic/drivers/modules/redfish/firmware.py b/ironic/drivers/modules/redfish/firmware.py
index 2500cd5..555a921 100644
--- a/ironic/drivers/modules/redfish/firmware.py
+++ b/ironic/drivers/modules/redfish/firmware.py
@@ -146,7 +146,7 @@
requires_ramdisk=True)
@base.service_step(priority=0, abortable=False,
argsinfo=_FW_SETTINGS_ARGSINFO,
- requires_ramdisk=True)
+ requires_ramdisk=False)
@base.cache_firmware_components
def update(self, task, settings):
"""Update the Firmware on the node using the settings for components.
@@ -181,7 +181,8 @@
polling=True
)

- return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval)
+ return deploy_utils.reboot_to_finish_step(task, timeout=wait_interval,
+ disable_ramdisk=True)

def _execute_firmware_update(self, node, update_service, settings):
"""Executes the next firmware update to the node
diff --git a/ironic/tests/unit/conductor/test_servicing.py b/ironic/tests/unit/conductor/test_servicing.py
index 22c2612..b46394d 100644
--- a/ironic/tests/unit/conductor/test_servicing.py
+++ b/ironic/tests/unit/conductor/test_servicing.py
@@ -109,14 +109,21 @@
mock_validate.side_effect = exception.NetworkError()
self.__do_node_service_validate_fail(mock_validate)

+ @mock.patch.object(conductor_steps, 'set_node_service_steps',
+ autospec=True)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True)
def test__do_node_service_prepare_service_fail(self, mock_prep,
mock_validate,
- service_steps=None):
- # Exception from task.driver.deploy.prepare_cleaning should cause node
+ mock_steps,
+ service_steps=[]):
+ # NOTE(janders) after removing unconditional initial reboot into
+ # ramdisk, set_node_service_steps needs to InvalidParameterValue
+ # to force boot into ramdisk
+ mock_steps.side_effect = exception.InvalidParameterValue('error')
+ # Exception from task.driver.deploy.prepare_service should cause node
# to go to SERVICEFAIL
mock_prep.side_effect = exception.InvalidParameterValue('error')
tgt_prov_state = states.ACTIVE
@@ -135,17 +142,76 @@
self.assertFalse(node.maintenance)
self.assertIsNone(node.fault)

+ @mock.patch.object(conductor_steps, 'set_node_service_steps',
+ autospec=True)
@mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
autospec=True)
def test__do_node_service_prepare_service_wait(self, mock_prep,
- mock_validate):
+ mock_validate,
+ mock_steps):
service_steps = [
{'step': 'trigger_servicewait', 'priority': 10,
'interface': 'vendor'}
]
+ mock_steps.side_effect = exception.InvalidParameterValue('error')
+ mock_prep.return_value = states.SERVICEWAIT
+ tgt_prov_state = states.ACTIVE
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.SERVICING,
+ target_provision_state=tgt_prov_state,
+ vendor_interface='fake')
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ servicing.do_node_service(task, service_steps=service_steps)
+ node.refresh()
+ self.assertEqual(states.SERVICEWAIT, node.provision_state)
+ self.assertEqual(tgt_prov_state, node.target_provision_state)
+ mock_prep.assert_called_once_with(mock.ANY, mock.ANY)
+ mock_validate.assert_called_once_with(mock.ANY, mock.ANY)

+ @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
+ autospec=True)
+ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
+ autospec=True)
+ def test__do_node_service_out_of_band(self, mock_prep,
+ mock_validate):
+ # NOTE(janders) this test ensures ramdisk isn't prepared if
+ # steps do not require it
+ service_steps = [
+ {'step': 'trigger_servicewait', 'priority': 10,
+ 'interface': 'vendor'}
+ ]
+ mock_prep.return_value = states.SERVICEWAIT
+ tgt_prov_state = states.ACTIVE
+ node = obj_utils.create_test_node(
+ self.context, driver='fake-hardware',
+ provision_state=states.SERVICING,
+ target_provision_state=tgt_prov_state,
+ vendor_interface='fake')
+ with task_manager.acquire(
+ self.context, node.uuid, shared=False) as task:
+ servicing.do_node_service(task, service_steps=service_steps)
+ node.refresh()
+ self.assertEqual(states.SERVICEWAIT, node.provision_state)
+ self.assertEqual(tgt_prov_state, node.target_provision_state)
+ mock_prep.assert_not_called()
+ mock_validate.assert_called_once_with(mock.ANY, mock.ANY)
+
+ @mock.patch('ironic.drivers.modules.network.flat.FlatNetwork.validate',
+ autospec=True)
+ @mock.patch('ironic.drivers.modules.fake.FakeDeploy.prepare_service',
+ autospec=True)
+ def test__do_node_service_requires_ramdisk_fallback(self, mock_prep,
+ mock_validate):
+ # NOTE(janders): here we set 'requires_ramdisk': 'True'
+ # to force ramdisk_needed to be set and trigger prepare ramdisk
+ service_steps = [
+ {'step': 'trigger_servicewait', 'priority': 10,
+ 'interface': 'vendor', 'requires_ramdisk': 'True'}
+ ]
mock_prep.return_value = states.SERVICEWAIT
tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node(
@@ -194,11 +260,8 @@
@mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True)
def __do_node_service_steps_fail(self, mock_steps, mock_validate,
- service_steps=None, invalid_exc=True):
- if invalid_exc:
- mock_steps.side_effect = exception.InvalidParameterValue('invalid')
- else:
- mock_steps.side_effect = exception.NodeCleaningFailure('failure')
+ service_steps=None):
+ mock_steps.side_effect = exception.NodeCleaningFailure('failure')
tgt_prov_state = states.ACTIVE
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
@@ -224,12 +287,8 @@
def test_do_node_service_steps_fail_poweroff(self, mock_steps,
mock_validate,
mock_power,
- service_steps=None,
- invalid_exc=True):
- if invalid_exc:
- mock_steps.side_effect = exception.InvalidParameterValue('invalid')
- else:
- mock_steps.side_effect = exception.NodeCleaningFailure('failure')
+ service_steps=None):
+ mock_steps.side_effect = exception.NodeCleaningFailure('failure')
tgt_prov_state = states.ACTIVE
self.config(poweroff_in_cleanfail=True, group='conductor')
node = obj_utils.create_test_node(
@@ -249,9 +308,7 @@
self.assertFalse(mock_power.called)

def test__do_node_service_steps_fail(self):
- for invalid in (True, False):
- self.__do_node_service_steps_fail(service_steps=[self.deploy_raid],
- invalid_exc=invalid)
+ self.__do_node_service_steps_fail(service_steps=[self.deploy_raid])

@mock.patch.object(conductor_steps, 'set_node_service_steps',
autospec=True)
diff --git a/releasenotes/notes/servicing-remove-initial-reboot-into-ramdisk-c1840524832435c2.yaml b/releasenotes/notes/servicing-remove-initial-reboot-into-ramdisk-c1840524832435c2.yaml
new file mode 100644
index 0000000..296779a
--- /dev/null
+++ b/releasenotes/notes/servicing-remove-initial-reboot-into-ramdisk-c1840524832435c2.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+ - |
+ Removes initial, unconditional reboot into ramdisk during servicing when
+ not required by specified service steps. This reduces the total number of
+ reboots needed when performing servicing, speeding up the process.
+
1 change: 1 addition & 0 deletions containers/ironic/patches/series
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
0001_grub_config.patch
0002_skip_reboot_firmware_update.patch
Loading