Commit 0eeac39

Merge pull request #72 from stackhpc/upstream/wallaby-2024-02-19
Synchronise wallaby with upstream
2 parents 428772e + 05848fb

5 files changed: +74 -90 lines changed

nova/scheduler/client/report.py

Lines changed: 26 additions & 6 deletions
@@ -36,6 +36,7 @@
 from nova import exception
 from nova.i18n import _
 from nova import objects
+from nova.objects import fields
 from nova import utils


@@ -2218,6 +2219,22 @@ def get_allocations_for_provider_tree(self, context, nodename):
         return {consumer: self.get_allocs_for_consumer(context, consumer)
                 for consumer in consumers}

+    def _remove_allocations_for_evacuated_instances(self, context,
+                                                    compute_node):
+        filters = {
+            'source_compute': compute_node.host,
+            'status': ['done'],
+            'migration_type': fields.MigrationType.EVACUATION,
+        }
+        evacuations = objects.MigrationList.get_by_filters(context, filters)
+
+        for evacuation in evacuations:
+            if not self.remove_provider_tree_from_instance_allocation(
+                    context, evacuation.instance_uuid, compute_node.uuid):
+                LOG.error("Failed to clean allocation of evacuated "
+                          "instance on the source node %s",
+                          compute_node.uuid, instance=evacuation.instance)
+
     def delete_resource_provider(self, context, compute_node, cascade=False):
         """Deletes the ResourceProvider record for the compute_node.

@@ -2236,16 +2253,19 @@ def delete_resource_provider(self, context, compute_node, cascade=False):
             # Delete any allocations for this resource provider.
             # Since allocations are by consumer, we get the consumers on this
             # host, which are its instances.
-            # NOTE(mriedem): This assumes the only allocations on this node
-            # are instances, but there could be migration consumers if the
-            # node is deleted during a migration or allocations from an
-            # evacuated host (bug 1829479). Obviously an admin shouldn't
-            # do that but...you know. I guess the provider deletion should fail
-            # in that case which is what we'd want to happen.
             instance_uuids = objects.InstanceList.get_uuids_by_host_and_node(
                 context, host, nodename)
             for instance_uuid in instance_uuids:
                 self.delete_allocation_for_instance(context, instance_uuid)
+
+            # When an instance is evacuated, its allocation remains in
+            # the source compute node until the node recovers again.
+            # If the broken compute never recovered but instead it is
+            # decommissioned, then we should delete the allocations of
+            # successfully evacuated instances during service delete.
+            self._remove_allocations_for_evacuated_instances(context,
+                                                             compute_node)
+
         # Ensure to delete resource provider in tree by top-down
         # traversable order.
         rps_to_refresh = self.get_providers_in_tree(context, rp_uuid)
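
The logic of the new helper, read outside the diff: look up completed evacuation migrations whose source is the node being deleted, then drop each evacuated instance's allocation from that node's resource provider tree, logging any failure. The sketch below restates that flow with stand-in objects and a callback so it runs on its own; it is an illustration of the approach, not nova's actual objects or report-client API.

    # Standalone sketch of the cleanup added above. Migration and the
    # remove_allocation callback are simplified stand-ins for nova's
    # objects.Migration and remove_provider_tree_from_instance_allocation.
    import logging
    from dataclasses import dataclass

    LOG = logging.getLogger(__name__)


    @dataclass
    class Migration:
        instance_uuid: str
        source_compute: str
        status: str
        migration_type: str


    def cleanup_evacuated_allocations(compute_host, compute_rp_uuid,
                                      migrations, remove_allocation):
        """Drop source-node allocations left behind by finished evacuations.

        remove_allocation(instance_uuid, rp_uuid) returns False on failure,
        mirroring the real report-client call.
        """
        evacuations = [m for m in migrations
                       if m.source_compute == compute_host
                       and m.status == 'done'
                       and m.migration_type == 'evacuation']
        for evacuation in evacuations:
            if not remove_allocation(evacuation.instance_uuid, compute_rp_uuid):
                LOG.error("Failed to clean allocation of evacuated instance "
                          "on the source node %s", compute_rp_uuid)


    # Example: only the finished evacuation away from "host1" is cleaned up.
    migrations = [Migration("inst-1", "host1", "done", "evacuation"),
                  Migration("inst-2", "host1", "running", "live-migration")]
    cleanup_evacuated_allocations("host1", "rp-uuid-host1", migrations,
                                  lambda inst, rp: True)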

nova/tests/functional/test_nova_manage.py

Lines changed: 0 additions & 65 deletions
@@ -1573,71 +1573,6 @@ def test_audit_orphaned_allocations_from_confirmed_resize(self):
         self.assertIn('Processed 1 allocation.', output)
         self.assertEqual(4, ret)

-    # TODO(sbauza): Remove this test once bug #1829479 is fixed
-    def test_audit_orphaned_allocations_from_deleted_compute_evacuate(self):
-        """Evacuate a server and the delete the source node so that it will
-        leave a source allocation that the audit command will find.
-        """
-
-        source_hostname = self.compute1.host
-        dest_hostname = self.compute2.host
-
-        source_rp_uuid = self._get_provider_uuid_by_host(source_hostname)
-        dest_rp_uuid = self._get_provider_uuid_by_host(dest_hostname)
-
-        server = self._boot_and_check_allocations(self.flavor, source_hostname)
-
-        # Stop the service and fake it down
-        self.compute1.stop()
-        source_service_id = self.admin_api.get_services(
-            host=source_hostname, binary='nova-compute')[0]['id']
-        self.admin_api.put_service(source_service_id, {'forced_down': 'true'})
-
-        # evacuate the instance to the target
-        self._evacuate_server(
-            server, {'host': dest_hostname}, expected_host=dest_hostname)
-
-        # Now the instance is gone, we can delete the compute service
-        self.admin_api.api_delete('/os-services/%s' % source_service_id)
-
-        # Since the compute is deleted, we should have in theory a single
-        # allocation against the destination resource provider, but evacuated
-        # instances are not having their allocations deleted. See bug #1829479.
-        # We have two allocations for the same consumer, source and destination
-        self._check_allocation_during_evacuate(
-            self.flavor, server['id'], source_rp_uuid, dest_rp_uuid)
-
-        # Now, run the audit command that will find this orphaned allocation
-        ret = self.cli.audit(verbose=True)
-        output = self.output.getvalue()
-        self.assertIn(
-            'Allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s can be deleted' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(3, ret)
-
-        # Now we want to delete the orphaned allocation that is duplicate
-        ret = self.cli.audit(delete=True, verbose=True)
-
-        # We finally should only have the target allocations
-        self.assertFlavorMatchesUsage(dest_rp_uuid, self.flavor)
-        self.assertRequestMatchesUsage({'VCPU': 0,
-                                        'MEMORY_MB': 0,
-                                        'DISK_GB': 0}, source_rp_uuid)
-
-        output = self.output.getvalue()
-        self.assertIn(
-            'Deleted allocations for consumer UUID %(consumer_uuid)s on '
-            'Resource Provider %(rp_uuid)s' %
-            {'consumer_uuid': server['id'],
-             'rp_uuid': source_rp_uuid},
-            output)
-        self.assertIn('Processed 1 allocation.', output)
-        self.assertEqual(4, ret)
-

 class TestDBArchiveDeletedRows(integrated_helpers._IntegratedTestBase):
     """Functional tests for the "nova-manage db archive_deleted_rows" CLI."""

nova/tests/functional/wsgi/test_services.py

Lines changed: 8 additions & 17 deletions
@@ -120,8 +120,7 @@ def test_evacuate_then_delete_compute_service(self):
         """Tests a scenario where a server is created on a host, the host
         goes down, the server is evacuated to another host, and then the
         source host compute service is deleted. After that the deleted
-        compute service is restarted. Related placement resources are checked
-        throughout.
+        compute service is restarted and starts successfully.
         """
         # Create our source host that we will evacuate *from* later.
         host1 = self._start_compute('host1')

@@ -152,23 +151,15 @@ def test_evacuate_then_delete_compute_service(self):
         services = self.admin_api.get_services(
             binary='nova-compute', host='host1')
         self.assertEqual(0, len(services), services)
-        # FIXME(mriedem): This is bug 1829479 where the compute service is
-        # deleted but the resource provider is not because there are still
-        # allocations against the provider from the evacuated server.
+        # Then the resource provider is also deleted.
         resp = self.placement.get('/resource_providers/%s' % rp_uuid)
-        self.assertEqual(200, resp.status)
-        self.assertFlavorMatchesUsage(rp_uuid, flavor)
-        # Try to restart the host1 compute service to create a new resource
-        # provider.
+        self.assertEqual(404, resp.status)
+        # Try to restart the host1 compute service to create a new service
+        # and a new resource provider.
         self.restart_compute_service(host1)
-        # FIXME(mriedem): This is bug 1817833 where restarting the now-deleted
-        # compute service attempts to create a new resource provider with a
-        # new uuid but the same name which results in a conflict. The service
-        # does not die, however, because _update_available_resource_for_node
-        # catches and logs but does not re-raise the error.
-        log_output = self.stdlog.logger.output
-        self.assertIn('Error updating resources for node host1.', log_output)
-        self.assertIn('Failed to create resource provider host1', log_output)
+        # Make sure the compute service record for host1 is recreated.
+        service = self.admin_api.get_services(
+            binary='nova-compute', host='host1')[0]

     def test_migrate_confirm_after_deleted_source_compute(self):
         """Tests a scenario where a server is cold migrated and while in

nova/tests/unit/scheduler/client/test_report.py

Lines changed: 32 additions & 2 deletions
@@ -3184,15 +3184,41 @@ def test_refresh_associations_disabled(self):

 class TestAllocations(SchedulerReportClientTestCase):

+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "remove_provider_tree_from_instance_allocation")
+    @mock.patch('nova.objects.MigrationList.get_by_filters')
+    def test_remove_allocations_for_evacuated_instances(self,
+            mock_get_migrations, mock_rm_pr_tree):
+        cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
+                                 hypervisor_hostname="fake_hostname", )
+        migrations = [mock.Mock(instance_uuid=uuids.inst1, status='done'),
+                      mock.Mock(instance_uuid=uuids.inst2, status='done')]
+        mock_get_migrations.return_value = migrations
+        mock_rm_pr_tree.return_value = True
+        self.client._remove_allocations_for_evacuated_instances(self.context,
+                                                                cn)
+        mock_get_migrations.assert_called_once_with(
+            self.context,
+            {'source_compute': cn.host, 'status': ['done'],
+             'migration_type': 'evacuation'})
+        mock_rm_pr_tree.assert_has_calls(
+            [mock.call(self.context, uuids.inst1, cn.uuid),
+             mock.call(self.context, uuids.inst2, cn.uuid)])
+        # status of migrations should be kept
+        for migration in migrations:
+            self.assertEqual('done', migration.status)
+
     @mock.patch('nova.scheduler.client.report.SchedulerReportClient.'
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )
         mock_by_host.return_value = [uuids.inst1, uuids.inst2]

@@ -3208,6 +3234,7 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
         mock_by_host.assert_called_once_with(
             self.context, cn.host, cn.hypervisor_hostname)
         self.assertEqual(2, mock_del_alloc.call_count)
+        mock_evacuate.assert_called_once_with(self.context, cn)
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)

@@ -3217,11 +3244,13 @@ def test_delete_resource_provider_cascade(self, mock_by_host,
                 'get_providers_in_tree')
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete")
+    @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
+                "_remove_allocations_for_evacuated_instances")
     @mock.patch("nova.scheduler.client.report.SchedulerReportClient."
                 "delete_allocation_for_instance")
     @mock.patch("nova.objects.InstanceList.get_uuids_by_host_and_node")
     def test_delete_resource_provider_no_cascade(self, mock_by_host,
-            mock_del_alloc, mock_delete, mock_get_rpt):
+            mock_del_alloc, mock_evacuate, mock_delete, mock_get_rpt):
         self.client._association_refresh_time[uuids.cn] = mock.Mock()
         cn = objects.ComputeNode(uuid=uuids.cn, host="fake_host",
                                  hypervisor_hostname="fake_hostname", )

@@ -3236,6 +3265,7 @@ def test_delete_resource_provider_no_cascade(self, mock_by_host,
         mock_delete.return_value = resp_mock
         self.client.delete_resource_provider(self.context, cn)
         mock_del_alloc.assert_not_called()
+        mock_evacuate.assert_not_called()
         exp_url = "/resource_providers/%s" % uuids.cn
         mock_delete.assert_called_once_with(
             exp_url, global_request_id=self.context.global_id)
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    `Bug #1829479 <https://bugs.launchpad.net/nova/+bug/1829479>`_: Now
+    deleting a nova-compute service removes allocations of successfully
+    evacuated instances. This allows the associated resource provider to be
+    deleted automatically even if the nova-compute service cannot recover after
+    all instances on the node have been successfully evacuated.
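
To make the release note concrete, here is roughly how the evacuated server's allocations in placement change; the provider UUIDs and resource amounts are invented for illustration and other response fields are omitted.

    # Illustrative data only: the shape follows placement's per-consumer
    # allocations (GET /allocations/{consumer_uuid}), but the UUIDs and
    # values are made up and extra response fields are left out.

    # While the source host is down and the server has been evacuated, the
    # consumer holds allocations against BOTH providers:
    during_evacuation = {
        "allocations": {
            "source-rp-uuid": {"resources": {"VCPU": 1, "MEMORY_MB": 512}},
            "dest-rp-uuid": {"resources": {"VCPU": 1, "MEMORY_MB": 512}},
        }
    }

    # After the source nova-compute service is deleted, this fix removes the
    # stale source-side allocation, leaving only the destination:
    after_service_delete = {
        "allocations": {
            "dest-rp-uuid": {"resources": {"VCPU": 1, "MEMORY_MB": 512}},
        }
    }

    assert "source-rp-uuid" not in after_service_delete["allocations"]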
