From 010acce606f5c013b9753701fa1fba8fd4e54092 Mon Sep 17 00:00:00 2001 From: Artom Lifshitz Date: Tue, 13 Aug 2024 11:29:10 -0400 Subject: [PATCH 1/2] libvirt: call get_capabilities() with all CPUs online While we do cache the hosts's capabilities in self._caps in the libvirt Host object, if we happen to fist call get_capabilities() with some of our dedicated CPUs offline, libvirt erroneously reports them as being on socket 0 regardless of their real socket. We would then cache that topology, thus breaking pretty much all of our NUMA accounting. To fix this, this patch makes sure to call get_capabilities() immediately upon host init, and to power up all our dedicated CPUs before doing so. That way, we cache their real socket ID. For testing, because we don't really want to implement a libvirt bug in our Python libvirt fixture, we make due with a simple unit tests that asserts that init_host() has powered on the correct CPUs. Closes-bug: 2077228 Change-Id: I9a2a7614313297f11a55d99fb94916d3583a9504 (cherry picked from commit 79d1f06094599249e6e30ebba2488b8b7a10834e) (cherry picked from commit 294444b360f547ab7a7e0c53f076b976a43f248d) (cherry picked from commit 6448e382a7b0cba6bafce1652878c55d86a305ee) --- nova/tests/unit/virt/libvirt/test_driver.py | 10 +++++++++ nova/virt/libvirt/cpu/api.py | 6 +++--- nova/virt/libvirt/driver.py | 23 +++++++++++++++++++-- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 9815cde0888..0c12e57a3d7 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -894,6 +894,16 @@ def test_driver_capabilities_secure_boot(self, mock_supports): ) mock_supports.assert_called_once_with() + @mock.patch.object(hardware, 'get_cpu_dedicated_set', + return_value=set([0, 42, 1337])) + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_register_all_undefined_instance_details') + def test_init_host_topology(self, mock_get_cpu_dedicated_set, _): + driver = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + with mock.patch.object(driver.cpu_api, 'power_up') as mock_power_up: + driver.init_host('goat') + mock_power_up.assert_called_with(set([0, 42, 1337])) + @mock.patch.object( libvirt_driver.LibvirtDriver, '_register_all_undefined_instance_details', diff --git a/nova/virt/libvirt/cpu/api.py b/nova/virt/libvirt/cpu/api.py index 104a3b4d277..5d7dccd47ed 100644 --- a/nova/virt/libvirt/cpu/api.py +++ b/nova/virt/libvirt/cpu/api.py @@ -91,7 +91,7 @@ def core(self, i): """ return Core(i) - def _power_up(self, cpus: ty.Set[int]) -> None: + def power_up(self, cpus: ty.Set[int]) -> None: if not CONF.libvirt.cpu_power_management: return cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set() @@ -111,7 +111,7 @@ def power_up_for_instance(self, instance: objects.Instance) -> None: return pcpus = instance.numa_topology.cpu_pinning.union( instance.numa_topology.cpuset_reserved) - self._power_up(pcpus) + self.power_up(pcpus) def power_up_for_migration( self, dst_numa_info: objects.LibvirtLiveMigrateNUMAInfo @@ -121,7 +121,7 @@ def power_up_for_migration( pcpus = dst_numa_info.emulator_pins for pins in dst_numa_info.cpu_pins.values(): pcpus = pcpus.union(pins) - self._power_up(pcpus) + self.power_up(pcpus) def _power_down(self, cpus: ty.Set[int]) -> None: if not CONF.libvirt.cpu_power_management: diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 39add9f24e4..96e8ec9b276 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -735,12 +735,31 @@ def _handle_conn_event(self, enabled, reason): {'enabled': enabled, 'reason': reason}) self._set_host_enabled(enabled, reason) + def _init_host_topology(self): + """To work around a bug in libvirt that reports offline CPUs as always + being on socket 0 regardless of their real socket, power up all + dedicated CPUs (the only ones whose socket we actually care about), + then call get_capabilities() to initialize the topology with the + correct socket values. get_capabilities()'s implementation will reuse + these initial socket value, and avoid clobbering them with 0 for + offline CPUs. + """ + cpus = hardware.get_cpu_dedicated_set() + if cpus: + self.cpu_api.power_up(cpus) + self._host.get_capabilities() + def init_host(self, host): self._host.initialize() - self._update_host_specific_capabilities() - + # NOTE(artom) Do this first to make sure our first call to + # get_capabilities() happens with all dedicated CPUs online and caches + # their correct socket ID. Unused dedicated CPUs will be powered down + # further down in this method. self._check_cpu_set_configuration() + self._init_host_topology() + + self._update_host_specific_capabilities() self._do_quality_warnings() From 9367b93e92f4bf2f5661328f9cb260a9217d662a Mon Sep 17 00:00:00 2001 From: Julien Le Jeune Date: Tue, 30 Jul 2024 15:45:48 +0200 Subject: [PATCH 2/2] Skip snapshot test when missing qemu-img Since the commit the remove AMI snapshot format special casing has merged, we're now running the libvirt snapshot tests as expected. However, for those tests qemu-img binary needs to be installed. Because these tests have been silently and incorrectly skipped for so long, they didn't receive the same maintenance as other tests as the failures went unnoticed. Change-Id: Ia90eedbe35f4ab2b200bdc90e0e35e5a86cc2110 Closes-bug: #2075178 Signed-off-by: Julien Le Jeune (cherry picked from commit 0809f75d7921fe01a6832211081e756a11b3ad4e) (cherry picked from commit cd4e58173a1533878eccc6efabbda0560dfde613) (cherry picked from commit f58e8df67c98403a65f47fcf9add788dacaea7c1) --- nova/tests/unit/virt/test_virt_drivers.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/nova/tests/unit/virt/test_virt_drivers.py b/nova/tests/unit/virt/test_virt_drivers.py index 802ea4f027f..8057b68b9e9 100644 --- a/nova/tests/unit/virt/test_virt_drivers.py +++ b/nova/tests/unit/virt/test_virt_drivers.py @@ -21,6 +21,7 @@ import netaddr import os_resource_classes as orc import os_vif +from oslo_concurrency import processutils from oslo_log import log as logging from oslo_serialization import jsonutils from oslo_utils import importutils @@ -239,8 +240,16 @@ def test_snapshot_not_running(self): def test_snapshot_running(self): img_ref = self.image_service.create(self.ctxt, {'name': 'snap-1'}) instance_ref, network_info = self._get_running_instance() - self.connection.snapshot(self.ctxt, instance_ref, img_ref['id'], - lambda *args, **kwargs: None) + # this test depends on qemu-img + # being installed and in the path, + # if it is not installed, skip + try: + self.connection.snapshot(self.ctxt, instance_ref, img_ref['id'], + lambda *args, **kwargs: None) + except processutils.ProcessExecutionError as e: + if 'qemu-img' in e.stderr and 'No such file' in e.stderr: + self.skipTest("qemu-img not installed") + raise e @catch_notimplementederror def test_reboot(self):