Commit 0eeac39

Merge pull request #72 from stackhpc/upstream/wallaby-2024-02-19
Synchronise wallaby with upstream
2 parents 428772e + 05848fb

5 files changed: +74 -90 lines changed

nova/scheduler/client/report.py

Lines changed: 26 additions & 6 deletions
@@ -36,6 +36,7 @@
 from nova import exception
 from nova.i18n import _
 from nova import objects
+from nova.objects import fields
 from nova import utils


@@ -2218,6 +2219,22 @@ def get_allocations_for_provider_tree(self, context, nodename):
         return {consumer: self.get_allocs_for_consumer(context, consumer)
                 for consumer in consumers}

+    def _remove_allocations_for_evacuated_instances(self, context,
+                                                    compute_node):
+        filters = {
+            'source_compute': compute_node.host,
+            'status': ['done'],
+            'migration_type': fields.MigrationType.EVACUATION,
+        }
+        evacuations = objects.MigrationList.get_by_filters(context, filters)
+
+        for evacuation in evacuations:
+            if not self.remove_provider_tree_from_instance_allocation(
+                    context, evacuation.instance_uuid, compute_node.uuid):
+                LOG.error("Failed to clean allocation of evacuated "
+                          "instance on the source node %s",
+                          compute_node.uuid, instance=evacuation.instance)
+
     def delete_resource_provider(self, context, compute_node, cascade=False):
         """Deletes the ResourceProvider record for the compute_node.

@@ -2236,16 +2253,19 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
             # Delete any allocations for this resource provider.
             # Since allocations are by consumer, we get the consumers on this
             # host, which are its instances.
-            # NOTE(mriedem): This assumes the only allocations on this node
-            # are instances, but there could be migration consumers if the
-            # node is deleted during a migration or allocations from an
-            # evacuated host (bug 1829479). Obviously an admin shouldn't
-            # do that but...you know. I guess the provider deletion should fail
-            # in that case which is what we'd want to happen.
             instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
                 context, host, nodename)
             for instance_uuid in instance_uuids:
                 self.delete_allocation_for_instance(context, instance_uuid)
+
+            # When an instance is evacuated, its allocation remains in
+            # the source compute node until the node recovers again.
+            # If the broken compute never recovered but instead it is
+            # decommissioned, then we should delete the allocations of
+            # successfully evacuated instances during service delete.
+            self._remove_allocations_for_evacuated_instances(context,
+                                                             compute_node)
+
         # Ensure to delete resource provider in tree by top-down
         # traversable order.
         rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
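
The logic of the new helper, read outside the diff: look up completed evacuation migrations whose source is the node being deleted, then drop each evacuated instance's allocation from that node's resource provider tree, logging any failure. The sketch below restates that flow with stand-in objects and a callback so it runs on its own; it is an illustration of the approach, not nova's actual objects or report-client API.

    # Standalone sketch of the cleanup added above. Migration and the
    # remove_allocation callback are simplified stand-ins for nova's
    # objects.Migration and remove_provider_tree_from_instance_allocation.
    import logging
    from dataclasses import dataclass

    LOG = logging.getLogger(__name__)


    @dataclass
    class Migration:
        instance_uuid: str
        source_compute: str
        status: str
        migration_type: str


    def cleanup_evacuated_allocations(compute_host, compute_rp_uuid,
                                      migrations, remove_allocation):
        """Drop source-node allocations left behind by finished evacuations.

        remove_allocation(instance_uuid, rp_uuid) returns False on failure,
        mirroring the real report-client call.
        """
        evacuations = [m for m in migrations
                       if m.source_compute == compute_host
                       and m.status == 'done'
                       and m.migration_type == 'evacuation']
        for evacuation in evacuations:
            if not remove_allocation(evacuation.instance_uuid, compute_rp_uuid):
                LOG.error("Failed to clean allocation of evacuated instance "
                          "on the source node %s", compute_rp_uuid)


    # Example: only the finished evacuation away from "host1" is cleaned up.
    migrations = [Migration("inst-1", "host1", "done", "evacuation"),
                  Migration("inst-2", "host1", "running", "live-migration")]
    cleanup_evacuated_allocations("host1", "rp-uuid-host1", migrations,
                                  lambda inst, rp: True)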

nova/tests/functional/test_nova_manage.py

Lines changed: 0 additions & 65 deletions
@@ -1573,71 +1573,6 @@ def test_audit_orphaned_allocations_from_confirmed_resize(self):
         self.assertIn('Processed 1 allocation.', output)
         self.assertEqual(4, ret)

-    # TODO(sbauza): Remove this test once bug #1829479 is fixed
-    def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
-        """Evacuate a server and the delete the source node so that it will
-        leave a source allocation that the audit command will find.
-        """
-
-        source_hostname = self.compute1.host
-        dest_hostname = self.compute2.host
-
-        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
-        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
-
-        server = self._boot_and_check_allocations(self.flavor, source_hostname)
-
-        # Stop the service and fake it down
-        self.compute1.stop()
-        source_service_id = self.admin_api.get_services(
-            host=source_hostname, binary='nova-compute')[0]['id']
-        self.admin_api.put_service(source_service_id, {'forced_down': 'true'})
-
-        # evacuate the instance to the target
-        self._evacuate_server(
-            server, {'host': dest_hostname}, expected_host=dest_hostname)
-
-        # Now the instance is gone, we can delete the compute service
-        self.admin_api.api_delete('/os-services/%s' % source_service_id)
-
-        # Since the compute is deleted, we should have in theory a single
-        # allocation against the destination resource provider, but evacuated
-        # instances are not having their allocations deleted. See bug #1829479.
-        # We have two allocations for the same consumer, source and destination
-        self._check_allocation_during_evacuate(
-            self.flavor, server['id'], source_rp_uuid, dest_rp_uuid)
-
-        # Now, run the audit command that will find this orphaned allocation
-        ret = self.cli.audit(verbose=True)
-        output = self.output.getvalue()
-        self.assertIn(
-            'Allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s can be deleted' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(3, ret)
-
-        # Now we want to delete the orphaned allocation that is duplicate
-        ret = self.cli.audit(delete=True, verbose=True)
-
-        # We finally should only have the target allocations
-        self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor)
-        self.assertRequestMatchesUsage({'VCPU': 0,
-                                        'MEMORY_MB': 0,
-                                        'DISK_GB': 0}, source_rp_uuid)
-
-        output = self.output.getvalue()
-        self.assertIn(
-            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(4, ret)
-

 class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase):
     """Functional tests for the "nova-manage db archive_deleted_rows" CLI."""

nova/tests/functional/wsgi/test_services.py

Lines changed: 8 additions & 17 deletions
@@ -120,8 +120,7 @@ def test_evacuate_then_delete_compute_service(self):
         """Tests a scenario where a server is created on a host, the host
         goes down, the server is evacuated to another host, and then the
         source host compute service is deleted. After that the deleted
-        compute service is restarted. Related placement resources are checked
-        throughout.
+        compute service is restarted and starts successfully.
         """
         # Create our source host that we will evacuate *from* later.
         host1 = self._start_compute('host1')

@@ -152,23 +151,15 @@ def test_evacuate_then_delete_compute_service(self):
         services = self.admin_api.get_services(
             binary='nova-compute', host='host1')
         self.assertEqual(0, len(services), services)
-        # FIXME(mriedem): This is bug 1829479 where the compute service is
-        # deleted but the resource provider is not because there are still
-        # allocations against the provider from the evacuated server.
+        # Then the resource provider is also deleted.
         resp = self.placement.get('/resource_providers/%s' % rp_uuid)
-        self.assertEqual(200, resp.status)
-        self.assertFlavorMatchesUsage(rp_uuid, flavor)
-        # Try to restart the host1 compute service to create a new resource
-        # provider.
+        self.assertEqual(404, resp.status)
+        # Try to restart the host1 compute service to create a new service
+        # and a new resource provider.
         self.restart_compute_service(host1)
-        # FIXME(mriedem): This is bug 1817833 where restarting the now-deleted
-        # compute service attempts to create a new resource provider with a
-        # new uuid but the same name which results in a conflict. The service
-        # does not die, however, because _update_available_resource_for_node
-        # catches and logs but does not re-raise the error.
-        log_output = self.stdlog.logger.output
-        self.assertIn('Error updating resources for node host1.', log_output)
-        self.assertIn('Failed to create resource provider host1', log_output)
+        # Make sure the compute service record for host1 is recreated.
+        service = self.admin_api.get_services(
+            binary='nova-compute', host='host1')[0]

     def test_migrate_confirm_after_deleted_source_compute(self):
         """Tests a scenario where a server is cold migrated and while in

nova/tests/unit/scheduler/client/test_report.py

Lines changed: 32 additions & 2 deletions
@@ -3184,15 +3184,41 @@ def test_refresh_associations_disabled(self):

 class TestAllocations(SchedulerReportClientTestCase):

+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "remove_provider_tree_from_instance_allocation")
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    def test_remove_allocations_for_evacuated_instances(self,
+            mock_get_migrations, mock_rm_pr_tree):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname", )
+        migrations = [mock.Mock(instance_uuid=uuids.inst1, status='done'),
+                      mock.Mock(instance_uuid=uuids.inst2, status='done')]
+        mock_get_migrations.return_value = migrations
+        mock_rm_pr_tree.return_value = True
+        self.client._remove_allocations_for_evacuated_instances(self.context,
+                                                                cn)
+        mock_get_migrations.assert_called_once_with(
+            self.context,
+            {'source_compute': cn.host, 'status': ['done'],
+             'migration_type': 'evacuation'})
+        mock_rm_pr_tree.assert_has_calls(
+            [mock.call(self.context, uuids.inst1, cn.uuid),
+             mock.call(self.context, uuids.inst2, cn.uuid)])
+        # status of migrations should be kept
+        for migration in migrations:
+            self.assertEqual('done', migration.status)
+
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
         mock_by_host.return_value = [uuids.inst1, uuids.inst2]

@@ -3208,6 +3234,7 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
         mock_by_host.assert_called_once_with(
             self.context, cn.host, cn.hypervisor_hostname)
         self.assertEqual(2, mock_del_alloc.call_count)
+        mock_evacuate.assert_called_once_with(self.context, cn)
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)

@@ -3217,11 +3244,13 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_no_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         self.client._association_refresh_time[uuids.cn] = mock.Mock()
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )

@@ -3236,6 +3265,7 @@ def test_delete_resource_provider_no_cascade(self, mock_by_host,
         mock_delete.return_value = resp_mock
         self.client.delete_resource_provider(self.context, cn)
         mock_del_alloc.assert_not_called()
+        mock_evacuate.assert_not_called()
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    `Bug #1829479 <https://bugs.launchpad.net/nova/+bug/1829479>`_: Now
+    deleting a nova-compute service removes allocations of successfully
+    evacuated instances. This allows the associated resource provider to be
+    deleted automatically even if the nova-compute service cannot recover after
+    all instances on the node have been successfully evacuated.
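
To make the release note concrete, here is roughly how the evacuated server's allocations in placement change; the provider UUIDs and resource amounts are invented for illustration and other response fields are omitted.

    # Illustrative data only: the shape follows placement's per-consumer
    # allocations (GET /allocations/{consumer_uuid}), but the UUIDs and
    # values are made up and extra response fields are left out.

    # While the source host is down and the server has been evacuated, the
    # consumer holds allocations against BOTH providers:
    during_evacuation = {
        "allocations": {
            "source-rp-uuid": {"resources": {"VCPU": 1, "MEMORY_MB": 512}},
            "dest-rp-uuid": {"resources": {"VCPU": 1, "MEMORY_MB": 512}},
        }
    }

    # After the source nova-compute service is deleted, this fix removes the
    # stale source-side allocation, leaving only the destination:
    after_service_delete = {
        "allocations": {
            "dest-rp-uuid": {"resources": {"VCPU": 1, "MEMORY_MB": 512}},
        }
    }

    assert "source-rp-uuid" not in after_service_delete["allocations"]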
