From 23c190a35839b396418d3e98af1e67587f9e9296 Mon Sep 17 00:00:00 2001
From: Bence Romsics
Date: Mon, 14 Aug 2023 13:03:13 +0200
Subject: [PATCH 1/2] Reproduce bug #2025480 in a functional test

Written by gibi, I just cleaned it up.

Change-Id: I8386a846b3685b8d03c59334ccfb2efbd4afe427
Co-Authored-By: Balazs Gibizer
Related-Bug: #2025480
(cherry picked from commit 62300d4885549368f874b3e07b756017ff96c659)
(cherry picked from commit 477ff2667d7ecd218fa5163d86d2719979dcdcd3)
---
 .../regressions/test_bug_2025480.py           | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 nova/tests/functional/regressions/test_bug_2025480.py

diff --git a/nova/tests/functional/regressions/test_bug_2025480.py b/nova/tests/functional/regressions/test_bug_2025480.py
new file mode 100644
index 00000000000..f6c87109f79
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_2025480.py
@@ -0,0 +1,87 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+from unittest import mock
+
+from nova import context
+from nova.objects import compute_node
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import fixtures as func_fixtures
+from nova.tests.functional import integrated_helpers
+
+
+class UnshelveUpdateAvailableResourcesPeriodicRace(
+        test.TestCase, integrated_helpers.InstanceHelperMixin):
+    def setUp(self):
+        super(UnshelveUpdateAvailableResourcesPeriodicRace, self).setUp()
+
+        placement = func_fixtures.PlacementFixture()
+        self.useFixture(placement)
+        self.placement = placement.api
+        self.neutron = nova_fixtures.NeutronFixture(self)
+        self.useFixture(self.neutron)
+        self.useFixture(nova_fixtures.GlanceFixture(self))
+        # Start nova services.
+        self.api = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1')).admin_api
+        self.api.microversion = 'latest'
+        self.notifier = self.useFixture(
+            nova_fixtures.NotificationFixture(self))
+
+        self.start_service('conductor')
+        self.start_service('scheduler')
+
+    def test_unshelve_spawning_update_available_resources(self):
+        compute = self._start_compute('compute1')
+
+        server = self._create_server(
+            networks=[{'port': self.neutron.port_1['id']}])
+
+        node = compute_node.ComputeNode.get_by_nodename(
+            context.get_admin_context(), 'compute1')
+        self.assertEqual(1, node.vcpus_used)
+
+        # with default config shelve means immediate offload as well
+        req = {
+            'shelve': {}
+        }
+        self.api.post_server_action(server['id'], req)
+        self._wait_for_server_parameter(
+            server, {'status': 'SHELVED_OFFLOADED',
+                     'OS-EXT-SRV-ATTR:host': None})
+
+        node = compute_node.ComputeNode.get_by_nodename(
+            context.get_admin_context(), 'compute1')
+        self.assertEqual(0, node.vcpus_used)
+
+        def fake_spawn(*args, **kwargs):
+            self._run_periodics()
+
+        with mock.patch.object(
+                compute.driver, 'spawn', side_effect=fake_spawn):
+            req = {'unshelve': None}
+            self.api.post_server_action(server['id'], req)
+            self.notifier.wait_for_versioned_notifications(
+                'instance.unshelve.start')
+            self._wait_for_server_parameter(
+                server,
+                {
+                    'status': 'ACTIVE',
+                    'OS-EXT-STS:task_state': None,
+                    'OS-EXT-SRV-ATTR:host': 'compute1',
+                })
+
+        node = compute_node.ComputeNode.get_by_nodename(
+            context.get_admin_context(), 'compute1')
+        # This is the bug, the instance should have resources claimed
+        # self.assertEqual(1, node.vcpus_used)
+        self.assertEqual(0, node.vcpus_used)

From 683ecc060e3bca818b9fb514d297e323bc8cb220 Mon Sep 17 00:00:00 2001
From: Bence Romsics
Date: Wed, 2 Aug 2023 16:22:55 +0200
Subject: [PATCH 2/2] Do not untrack resources of a server being unshelved

This patch concerns the window when a VM is being unshelved: the
compute manager has set the task_state to spawning, claimed resources
for the VM and called driver.spawn(). At this point the instance is in
vm_state SHELVED_OFFLOADED and task_state spawning.

If a new update_available_resource periodic job starts now, it collects
all the instances assigned to the node to calculate resource usage.
However, the calculation assumed that a VM in the SHELVED_OFFLOADED
state needs no resource allocation on the node (presumably because it
is being removed from the node as it is offloaded) and therefore
deleted the resource claim.

As a result we ended up with the VM spawned successfully but with its
resource claim on the node lost.

This patch changes what we do in vm_state SHELVED_OFFLOADED, task_state
spawning: we no longer delete the resource claim in this state and keep
tracking the resources in stats.
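
For illustration, the behaviour change boils down to the predicate in
the following self-contained sketch. The constants here are stand-ins
for the real values in nova.compute.vm_states and
nova.compute.task_states; the authoritative change is the
allow_resource_removal() helper added to nova/compute/vm_states.py in
the diff below.

    # Sketch only: mirrors the intent of the new helper, using local
    # stand-in constants instead of the nova modules.
    DELETED = 'deleted'
    SHELVED_OFFLOADED = 'shelved_offloaded'
    SPAWNING = 'spawning'

    def allow_resource_removal(vm_state, task_state=None):
        # Resources may be freed for deleted instances, and for
        # shelved-offloaded instances unless they are currently being
        # unshelved (task_state spawning).
        return (
            vm_state == DELETED or
            vm_state == SHELVED_OFFLOADED and task_state != SPAWNING
        )

    # An offloaded instance can be untracked, but not while it spawns:
    assert allow_resource_removal(SHELVED_OFFLOADED)
    assert not allow_resource_removal(SHELVED_OFFLOADED, SPAWNING)
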
Change-Id: I8c9944810c09d501a6d3f60f095d9817b756872d
Closes-Bug: #2025480
(cherry picked from commit f1dc4ec39bcfda1bd4b97e233a9da498b6378c4f)
(cherry picked from commit 4239d1fec2814c074482b740a2fd38a5d5ce6942)
---
 nova/compute/manager.py                            |  6 +++---
 nova/compute/resource_tracker.py                   |  7 +++++--
 nova/compute/stats.py                              |  3 ++-
 nova/compute/vm_states.py                          | 13 ++++++++++---
 .../functional/regressions/test_bug_2025480.py     |  5 ++---
 nova/tests/unit/compute/test_stats.py              | 16 ++++++++++++++++
 6 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index c887b3ae47c..e4df90b981c 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -6918,9 +6918,9 @@ def _shelve_offload_instance(self, context, instance, clean_shutdown,
 
         instance.power_state = current_power_state
         # NOTE(mriedem): The vm_state has to be set before updating the
-        # resource tracker, see vm_states.ALLOW_RESOURCE_REMOVAL. The host/node
-        # values cannot be nulled out until after updating the resource tracker
-        # though.
+        # resource tracker, see vm_states.allow_resource_removal(). The
+        # host/node values cannot be nulled out until after updating the
+        # resource tracker though.
         instance.vm_state = vm_states.SHELVED_OFFLOADED
         instance.task_state = None
         instance.save(expected_task_state=[task_states.SHELVING,
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 3f911f37084..a4e7dbacc64 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1539,7 +1539,8 @@ def _update_usage_from_instance(self, context, instance, nodename,
         # NOTE(sfinucan): Both brand new instances as well as instances that
         # are being unshelved will have is_new_instance == True
         is_removed_instance = not is_new_instance and (is_removed or
-            instance['vm_state'] in vm_states.ALLOW_RESOURCE_REMOVAL)
+            vm_states.allow_resource_removal(
+                vm_state=instance['vm_state'], task_state=instance.task_state))
 
         if is_new_instance:
             self.tracked_instances.add(uuid)
@@ -1598,7 +1599,9 @@ def _update_usage_from_instances(self, context, instances, nodename):
 
         instance_by_uuid = {}
         for instance in instances:
-            if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
+            if not vm_states.allow_resource_removal(
+                    vm_state=instance['vm_state'],
+                    task_state=instance.task_state):
                 self._update_usage_from_instance(context, instance, nodename)
             instance_by_uuid[instance.uuid] = instance
         return instance_by_uuid
diff --git a/nova/compute/stats.py b/nova/compute/stats.py
index cfbee2e6bc1..e9180ec6d6d 100644
--- a/nova/compute/stats.py
+++ b/nova/compute/stats.py
@@ -105,7 +105,8 @@ def update_stats_for_instance(self, instance, is_removed=False):
         (vm_state, task_state, os_type, project_id) = \
             self._extract_state_from_instance(instance)
 
-        if is_removed or vm_state in vm_states.ALLOW_RESOURCE_REMOVAL:
+        if is_removed or vm_states.allow_resource_removal(
+                vm_state=vm_state, task_state=task_state):
             self._decrement("num_instances")
             self.states.pop(uuid)
         else:
diff --git a/nova/compute/vm_states.py b/nova/compute/vm_states.py
index 1a916ea59a6..393a3ebbf2b 100644
--- a/nova/compute/vm_states.py
+++ b/nova/compute/vm_states.py
@@ -27,6 +27,7 @@
 See http://wiki.openstack.org/VMState
 """
 
+from nova.compute import task_states
 from nova.objects import fields
 
 
@@ -74,8 +75,14 @@
 # states we allow to trigger crash dump
 ALLOW_TRIGGER_CRASH_DUMP = [ACTIVE, PAUSED, RESCUED, RESIZED, ERROR]
 
-# states we allow resources to be freed in
-ALLOW_RESOURCE_REMOVAL = [DELETED, SHELVED_OFFLOADED]
-
 # states we allow for evacuate instance
 ALLOW_TARGET_STATES = [STOPPED]
+
+
+def allow_resource_removal(vm_state, task_state=None):
+    """(vm_state, task_state) combinations we allow resources to be freed in"""
+
+    return (
+        vm_state == DELETED or
+        vm_state == SHELVED_OFFLOADED and task_state != task_states.SPAWNING
+    )
diff --git a/nova/tests/functional/regressions/test_bug_2025480.py b/nova/tests/functional/regressions/test_bug_2025480.py
index f6c87109f79..c707a40a846 100644
--- a/nova/tests/functional/regressions/test_bug_2025480.py
+++ b/nova/tests/functional/regressions/test_bug_2025480.py
@@ -82,6 +82,5 @@ def fake_spawn(*args, **kwargs):
 
         node = compute_node.ComputeNode.get_by_nodename(
             context.get_admin_context(), 'compute1')
-        # This is the bug, the instance should have resources claimed
-        # self.assertEqual(1, node.vcpus_used)
-        self.assertEqual(0, node.vcpus_used)
+        # After the fix, the instance should have resources claimed
+        self.assertEqual(1, node.vcpus_used)
diff --git a/nova/tests/unit/compute/test_stats.py b/nova/tests/unit/compute/test_stats.py
index e713794a19a..b95475f09db 100644
--- a/nova/tests/unit/compute/test_stats.py
+++ b/nova/tests/unit/compute/test_stats.py
@@ -208,6 +208,22 @@ def test_update_stats_for_instance_offloaded(self):
         self.assertEqual(0, self.stats.num_os_type("Linux"))
         self.assertEqual(0, self.stats["num_vm_" + vm_states.BUILDING])
 
+    def test_update_stats_for_instance_being_unshelved(self):
+        instance = self._create_instance()
+        self.stats.update_stats_for_instance(instance)
+        self.assertEqual(1, self.stats.num_instances_for_project("1234"))
+
+        instance["vm_state"] = vm_states.SHELVED_OFFLOADED
+        instance["task_state"] = task_states.SPAWNING
+        self.stats.update_stats_for_instance(instance)
+
+        self.assertEqual(1, self.stats.num_instances)
+        self.assertEqual(1, self.stats.num_instances_for_project(1234))
+        self.assertEqual(1, self.stats["num_os_type_Linux"])
+        self.assertEqual(1, self.stats["num_vm_%s" %
+                                       vm_states.SHELVED_OFFLOADED])
+        self.assertEqual(1, self.stats["num_task_%s" % task_states.SPAWNING])
+
     def test_io_workload(self):
         vms = [vm_states.ACTIVE, vm_states.BUILDING, vm_states.PAUSED]
         tasks = [task_states.RESIZE_MIGRATING, task_states.REBUILDING,