diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 988de9a0350..da7efe0c0ed 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -580,8 +580,12 @@ def _init_compute_node(self, context, resources):
         cn = objects.ComputeNode(context)
         cn.host = self.host
         self._copy_resources(cn, resources)
-        self.compute_nodes[nodename] = cn
         cn.create()
+        # Only map the ComputeNode into compute_nodes if create() was OK
+        # because if create() fails, on the next run through here nodename
+        # would be in compute_nodes and we won't try to create again (because
+        # of the logic above).
+        self.compute_nodes[nodename] = cn
         LOG.info('Compute node record created for '
                  '%(host)s:%(node)s with uuid: %(uuid)s',
                  {'host': self.host, 'node': nodename, 'uuid': cn.uuid})
diff --git a/nova/db/sqlalchemy/api.py b/nova/db/sqlalchemy/api.py
index 37d1beed29a..9cbc2c974fe 100644
--- a/nova/db/sqlalchemy/api.py
+++ b/nova/db/sqlalchemy/api.py
@@ -30,6 +30,7 @@
 from oslo_db.sqlalchemy import update_match
 from oslo_db.sqlalchemy import utils as sqlalchemyutils
 from oslo_log import log as logging
+from oslo_utils import excutils
 from oslo_utils import importutils
 from oslo_utils import timeutils
 from oslo_utils import uuidutils
@@ -693,11 +694,54 @@ def compute_node_create(context, values):

     compute_node_ref = models.ComputeNode()
     compute_node_ref.update(values)
-    compute_node_ref.save(context.session)
+    try:
+        compute_node_ref.save(context.session)
+    except db_exc.DBDuplicateEntry:
+        with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
+            # Check to see if we have a (soft) deleted ComputeNode with the
+            # same UUID and if so just update it and mark as no longer (soft)
+            # deleted. See bug 1839560 for details.
+            if 'uuid' in values:
+                # Get a fresh context for a new DB session and allow it to
+                # get a deleted record.
+                ctxt = nova.context.get_admin_context(read_deleted='yes')
+                compute_node_ref = _compute_node_get_and_update_deleted(
+                    ctxt, values)
+                # If we didn't get anything back we failed to find the node
+                # by uuid and update it so re-raise the DBDuplicateEntry.
+                if compute_node_ref:
+                    err_ctx.reraise = False

     return compute_node_ref


+@pick_context_manager_writer
+def _compute_node_get_and_update_deleted(context, values):
+    """Find a ComputeNode by uuid, update and un-delete it.
+
+    This is a special case from the ``compute_node_create`` method which
+    needs to be separate to get a new Session.
+
+    This method will update the ComputeNode, if found, to have deleted=0 and
+    deleted_at=None values.
+
+    :param context: request auth context which should be able to read deleted
+        records
+    :param values: values used to update the ComputeNode record - must include
+        uuid
+    :return: updated ComputeNode sqlalchemy model object if successfully found
+        and updated, None otherwise
+    """
+    cn = model_query(
+        context, models.ComputeNode).filter_by(uuid=values['uuid']).first()
+    if cn:
+        # Update with the provided values but un-soft-delete.
+        update_values = copy.deepcopy(values)
+        update_values['deleted'] = 0
+        update_values['deleted_at'] = None
+        return compute_node_update(context, cn.id, update_values)
+
+
 @oslo_db_api.wrap_db_retry(max_retries=5, retry_on_deadlock=True)
 @pick_context_manager_writer
 def compute_node_update(context, compute_id, values):
diff --git a/nova/tests/functional/regressions/test_bug_1839560.py b/nova/tests/functional/regressions/test_bug_1839560.py
new file mode 100644
index 00000000000..02786c11f83
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1839560.py
@@ -0,0 +1,113 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from oslo_log import log as logging
+
+from nova import context
+from nova.db import api as db_api
+from nova import objects
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import integrated_helpers
+from nova import utils
+from nova.virt import fake as fake_virt
+
+LOG = logging.getLogger(__name__)
+
+
+class PeriodicNodeRecreateTestCase(test.TestCase,
+                                   integrated_helpers.InstanceHelperMixin):
+    """Regression test for bug 1839560 introduced in Rocky.
+
+    When an ironic node is undergoing maintenance the driver will not report
+    it as an available node to the ComputeManager.update_available_resource
+    periodic task. The ComputeManager will then (soft) delete a ComputeNode
+    record for that no-longer-available node. If/when the ironic node is
+    available again and the driver reports it, the ResourceTracker will
+    attempt to create a ComputeNode record for the ironic node.
+
+    The regression with change Ia69fabce8e7fd7de101e291fe133c6f5f5f7056a is
+    that the ironic node uuid is used as the ComputeNode.uuid and there is
+    a unique constraint on the ComputeNode.uuid value in the database. So
+    trying to create a ComputeNode with the same uuid (after the ironic node
+    comes back from being unavailable) fails with a DuplicateEntry error since
+    there is a (soft) deleted version of the ComputeNode with the same uuid
+    in the database.
+    """
+    def setUp(self):
+        super(PeriodicNodeRecreateTestCase, self).setUp()
+        # We need the PlacementFixture for the compute nodes to report in but
+        # otherwise don't care about placement for this test.
+        self.useFixture(nova_fixtures.PlacementFixture())
+        # Start up the API so we can query the os-hypervisors API.
+        self.api = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1')).admin_api
+        # Make sure we're using the fake driver that has predictable uuids
+        # for each node.
+        self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
+
+    def test_update_available_resource_node_recreate(self):
+        # First we create a compute service to manage a couple of fake nodes.
+        # When start_service runs, it will create the node1 and node2
+        # ComputeNodes.
+        fake_virt.set_nodes(['node1', 'node2'])
+        self.addCleanup(fake_virt.restore_nodes)
+        compute = self.start_service('compute', 'node1')
+        # Now we should have two compute nodes, make sure the hypervisors API
+        # shows them.
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(2, len(hypervisors), hypervisors)
+        self.assertEqual({'node1', 'node2'},
+                         set([hyp['hypervisor_hostname']
+                              for hyp in hypervisors]))
+        # Now stub the driver to only report node1. This is making it look like
+        # node2 is no longer available when update_available_resource runs.
+        compute.manager.driver._nodes = ['node1']
+        ctxt = context.get_admin_context()
+        compute.manager.update_available_resource(ctxt)
+        # node2 should have been deleted, check the logs and API.
+        log = self.stdlog.logger.output
+        self.assertIn('Deleting orphan compute node', log)
+        self.assertIn('hypervisor host is node2', log)
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(1, len(hypervisors), hypervisors)
+        self.assertEqual('node1', hypervisors[0]['hypervisor_hostname'])
+        # But the node2 ComputeNode is still in the database with deleted!=0.
+        with utils.temporary_mutation(ctxt, read_deleted='yes'):
+            cn = objects.ComputeNode.get_by_host_and_nodename(
+                ctxt, 'node1', 'node2')
+            self.assertTrue(cn.deleted)
+        # Now stub the driver again to report node2 as being back and run
+        # the periodic task.
+        compute.manager.driver._nodes = ['node1', 'node2']
+        LOG.info('Running update_available_resource which should bring back '
+                 'node2.')
+        compute.manager.update_available_resource(ctxt)
+        # The DBDuplicateEntry error should have been handled and resulted in
+        # updating the (soft) deleted record to no longer be deleted.
+        log = self.stdlog.logger.output
+        self.assertNotIn('DBDuplicateEntry', log)
+        # Should have two reported hypervisors again.
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(2, len(hypervisors), hypervisors)
+        # Now that the node2 record was un-soft-deleted, archiving should not
+        # remove any compute_nodes.
+        LOG.info('Archiving the database.')
+        archived = db_api.archive_deleted_rows(1000)[0]
+        self.assertNotIn('compute_nodes', archived)
+        cn2 = objects.ComputeNode.get_by_host_and_nodename(
+            ctxt, 'node1', 'node2')
+        self.assertFalse(cn2.deleted)
+        self.assertIsNone(cn2.deleted_at)
+        # The node2 id and uuid should not have changed in the DB.
+        self.assertEqual(cn.id, cn2.id)
+        self.assertEqual(cn.uuid, cn2.uuid)
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index 1d31fec2e9a..2ab2aac7a39 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -1074,23 +1074,18 @@ def fake_get_all(_ctx, nodename):
         self.assertEqual(_HOSTNAME, self.rt.compute_nodes[_NODENAME].host)

     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_compute_node_created_on_empty(self, update_mock, get_mock,
-                                           create_mock, pci_tracker_mock,
+                                           create_mock,
                                            get_by_hypervisor_mock):
         get_by_hypervisor_mock.return_value = []
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock)

     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
@@ -1098,32 +1093,26 @@ def test_compute_node_created_on_empty(self, update_mock, get_mock,
     def test_compute_node_created_on_empty_rebalance(self, update_mock,
                                                      get_mock,
                                                      create_mock,
-                                                     pci_tracker_mock,
                                                      get_by_hypervisor_mock):
         get_by_hypervisor_mock.return_value = []
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock,
                                         rebalances_nodes=True)

     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_compute_node_created_too_many(self, update_mock, get_mock,
-                                           create_mock, pci_tracker_mock,
+                                           create_mock,
                                            get_by_hypervisor_mock):
         get_by_hypervisor_mock.return_value = ["fake_node_1", "fake_node_2"]
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock,
                                         rebalances_nodes=True)

-    def _test_compute_node_created(self, update_mock, get_mock,
-                                   create_mock, pci_tracker_mock,
+    def _test_compute_node_created(self, update_mock, get_mock, create_mock,
                                    get_by_hypervisor_mock,
                                    rebalances_nodes=False):
         self.flags(cpu_allocation_ratio=1.0, ram_allocation_ratio=1.0,
@@ -1152,12 +1141,12 @@ def _test_compute_node_created(self, update_mock, get_mock,
             'current_workload': 0,
             'vcpus': 4,
             'running_vms': 0,
-            'pci_passthrough_devices': '[]'
+            'pci_passthrough_devices': '[]',
+            'uuid': uuids.compute_node_uuid
         }
         # The expected compute represents the initial values used
         # when creating a compute node.
         expected_compute = objects.ComputeNode(
-            id=42,
             host_ip=resources['host_ip'],
             vcpus=resources['vcpus'],
             memory_mb=resources['memory_mb'],
@@ -1177,20 +1166,11 @@ def _test_compute_node_created(self, update_mock, get_mock,
             cpu_allocation_ratio=1.0,
             disk_allocation_ratio=1.0,
             stats={'failed_builds': 0},
-            pci_device_pools=objects.PciDevicePoolList(objects=[]),
             uuid=uuids.compute_node_uuid
         )

-        def set_cn_id():
-            # The PCI tracker needs the compute node's ID when starting up, so
-            # make sure that we set the ID value so we don't get a Cannot load
-            # 'id' in base class error
-            cn = self.rt.compute_nodes[_NODENAME]
-            cn.id = 42  # Has to be a number, not a mock
-            cn.uuid = uuids.compute_node_uuid
-
-        create_mock.side_effect = set_cn_id
-        self.rt._init_compute_node(mock.sentinel.ctx, resources)
+        with mock.patch.object(self.rt, '_setup_pci_tracker') as setup_pci:
+            self.rt._init_compute_node(mock.sentinel.ctx, resources)

         cn = self.rt.compute_nodes[_NODENAME]
         get_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
@@ -1202,22 +1182,47 @@ def set_cn_id():
             get_by_hypervisor_mock.assert_not_called()
         create_mock.assert_called_once_with()
         self.assertTrue(obj_base.obj_equal_prims(expected_compute, cn))
-        pci_tracker_mock.assert_called_once_with(mock.sentinel.ctx,
-                                                 42)
+        setup_pci.assert_called_once_with(mock.sentinel.ctx, cn, resources)
         self.assertFalse(update_mock.called)

+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
+                '_setup_pci_tracker')
+    @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename',
+                side_effect=exc.ComputeHostNotFound(host=_HOSTNAME))
+    @mock.patch('nova.objects.ComputeNode.create',
+                side_effect=(test.TestingException, None))
+    def test_compute_node_create_fail_retry_works(self, mock_create, mock_get,
+                                                  mock_setup_pci):
+        """Tests that _init_compute_node will not save the ComputeNode object
+        in the compute_nodes dict if create() fails.
+        """
+        self._setup_rt()
+        self.assertEqual({}, self.rt.compute_nodes)
+        ctxt = context.get_context()
+        # The first ComputeNode.create fails so rt.compute_nodes should
+        # remain empty.
+        resources = copy.deepcopy(_VIRT_DRIVER_AVAIL_RESOURCES)
+        resources['uuid'] = uuids.cn_uuid  # for the LOG.info message
+        self.assertRaises(test.TestingException,
+                          self.rt._init_compute_node, ctxt, resources)
+        self.assertEqual({}, self.rt.compute_nodes)
+        # Second create works so compute_nodes should have a mapping.
+        self.rt._init_compute_node(ctxt, resources)
+        self.assertIn(_NODENAME, self.rt.compute_nodes)
+        mock_get.assert_has_calls([mock.call(
+            ctxt, _HOSTNAME, _NODENAME)] * 2)
+        self.assertEqual(2, mock_create.call_count)
+        mock_setup_pci.assert_called_once_with(
+            ctxt, test.MatchType(objects.ComputeNode), resources)
+
     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_node_removed(self, update_mock, get_mock,
-                          create_mock, pci_tracker_mock,
-                          get_by_hypervisor_mock):
+                          create_mock, get_by_hypervisor_mock):
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock)
         self.rt.old_resources[_NODENAME] = mock.sentinel.foo
         self.assertIn(_NODENAME, self.rt.compute_nodes)
diff --git a/nova/tests/unit/db/test_db_api.py b/nova/tests/unit/db/test_db_api.py
index 0433f126b1b..5b269140429 100644
--- a/nova/tests/unit/db/test_db_api.py
+++ b/nova/tests/unit/db/test_db_api.py
@@ -6882,6 +6882,18 @@ def test_compute_node_create(self):
         new_stats = jsonutils.loads(self.item['stats'])
         self.assertEqual(self.stats, new_stats)

+    def test_compute_node_create_duplicate_host_hypervisor_hostname(self):
+        """Tests to make sure that DBDuplicateEntry is raised when trying to
+        create a duplicate ComputeNode with the same host and
+        hypervisor_hostname values but different uuid values. This makes
+        sure that when _compute_node_get_and_update_deleted returns None
+        the DBDuplicateEntry is re-raised.
+        """
+        other_node = dict(self.compute_node_dict)
+        other_node['uuid'] = uuidutils.generate_uuid()
+        self.assertRaises(db_exc.DBDuplicateEntry,
+                          db.compute_node_create, self.ctxt, other_node)
+
     def test_compute_node_get_all(self):
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(1, len(nodes))
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 300c2510e97..1271a7463e8 100644
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -27,6 +27,7 @@
 import contextlib
 import copy
 import time
+import uuid

 from oslo_log import log as logging
 from oslo_serialization import jsonutils
@@ -674,6 +675,19 @@ class MediumFakeDriver(FakeDriver):
     local_gb = 1028


+class PredictableNodeUUIDDriver(SmallFakeDriver):
+    """SmallFakeDriver variant that reports a predictable node uuid in
+    get_available_resource, like IronicDriver.
+    """
+    def get_available_resource(self, nodename):
+        resources = super(
+            PredictableNodeUUIDDriver, self).get_available_resource(nodename)
+        # This is used in ComputeNode.update_from_virt_driver which is called
+        # from the ResourceTracker when creating a ComputeNode.
+        resources['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS, nodename)
+        return resources
+
+
 class FakeRescheduleDriver(FakeDriver):
     """FakeDriver derivative that triggers a reschedule on the first spawn
     attempt. This is expected to only be used in tests that have more than