6 changes: 5 additions & 1 deletion nova/compute/resource_tracker.py
@@ -580,8 +580,12 @@ def _init_compute_node(self, context, resources):
         cn = objects.ComputeNode(context)
         cn.host = self.host
         self._copy_resources(cn, resources)
-        self.compute_nodes[nodename] = cn
         cn.create()
+        # Only map the ComputeNode into compute_nodes if create() was OK
+        # because if create() fails, on the next run through here nodename
+        # would be in compute_nodes and we won't try to create again (because
+        # of the logic above).
+        self.compute_nodes[nodename] = cn
         LOG.info('Compute node record created for '
                  '%(host)s:%(node)s with uuid: %(uuid)s',
                  {'host': self.host, 'node': nodename, 'uuid': cn.uuid})
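Note: the "logic above" that the new comment references is the early-return path at the top of _init_compute_node, which skips creation entirely once a nodename is already mapped. A paraphrased sketch (simplified, not the exact Nova code) of why mapping before a failed create() would poison retries:

    def _init_compute_node(self, context, resources):
        nodename = resources['hypervisor_hostname']
        # Once a nodename is mapped we return early and never reach
        # ComputeNode.create() again, so mapping an uncreated node would
        # leave an object that exists in memory but not in the database.
        if nodename in self.compute_nodes:
            cn = self.compute_nodes[nodename]
            self._copy_resources(cn, resources)
            self._setup_pci_tracker(context, cn, resources)
            return
        # ...otherwise look up by host/nodename and fall through to create()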
46 changes: 45 additions & 1 deletion nova/db/sqlalchemy/api.py
@@ -30,6 +30,7 @@
 from oslo_db.sqlalchemy import update_match
 from oslo_db.sqlalchemy import utils as sqlalchemyutils
 from oslo_log import log as logging
+from oslo_utils import excutils
 from oslo_utils import importutils
 from oslo_utils import timeutils
 from oslo_utils import uuidutils
@@ -693,11 +694,54 @@ def compute_node_create(context, values):
 
     compute_node_ref = models.ComputeNode()
     compute_node_ref.update(values)
-    compute_node_ref.save(context.session)
+    try:
+        compute_node_ref.save(context.session)
+    except db_exc.DBDuplicateEntry:
+        with excutils.save_and_reraise_exception(logger=LOG) as err_ctx:
+            # Check to see if we have a (soft) deleted ComputeNode with the
+            # same UUID and if so just update it and mark as no longer (soft)
+            # deleted. See bug 1839560 for details.
+            if 'uuid' in values:
+                # Get a fresh context for a new DB session and allow it to
+                # get a deleted record.
+                ctxt = nova.context.get_admin_context(read_deleted='yes')
+                compute_node_ref = _compute_node_get_and_update_deleted(
+                    ctxt, values)
+                # If we didn't get anything back we failed to find the node
+                # by uuid and update it so re-raise the DBDuplicateEntry.
+                if compute_node_ref:
+                    err_ctx.reraise = False
 
     return compute_node_ref
 
 
+@pick_context_manager_writer
+def _compute_node_get_and_update_deleted(context, values):
+    """Find a ComputeNode by uuid, update and un-delete it.
+
+    This is a special case from the ``compute_node_create`` method which
+    needs to be separate to get a new Session.
+
+    This method will update the ComputeNode, if found, to have deleted=0 and
+    deleted_at=None values.
+
+    :param context: request auth context which should be able to read deleted
+        records
+    :param values: values used to update the ComputeNode record - must include
+        uuid
+    :return: updated ComputeNode sqlalchemy model object if successfully found
+        and updated, None otherwise
+    """
+    cn = model_query(
+        context, models.ComputeNode).filter_by(uuid=values['uuid']).first()
+    if cn:
+        # Update with the provided values but un-soft-delete.
+        update_values = copy.deepcopy(values)
+        update_values['deleted'] = 0
+        update_values['deleted_at'] = None
+        return compute_node_update(context, cn.id, update_values)
+
+
 @oslo_db_api.wrap_db_retry(max_retries=5, retry_on_deadlock=True)
 @pick_context_manager_writer
 def compute_node_update(context, compute_id, values):
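The handler is built on oslo.utils' save_and_reraise_exception context manager: the caught exception is re-raised when the with block exits unless the handler flips reraise to False. A minimal standalone sketch of the pattern (toy names, not the Nova code):

    from oslo_utils import excutils

    def create_or_recover(save, recover):
        try:
            return save()
        except ValueError:  # stand-in for db_exc.DBDuplicateEntry
            with excutils.save_and_reraise_exception() as ctxt:
                record = recover()
                if record is not None:
                    ctxt.reraise = False  # recovered, swallow the error
            return record  # only reached when reraise was set to False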
113 changes: 113 additions & 0 deletions nova/tests/functional/regressions/test_bug_1839560.py
@@ -0,0 +1,113 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from oslo_log import log as logging
+
+from nova import context
+from nova.db import api as db_api
+from nova import objects
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import integrated_helpers
+from nova import utils
+from nova.virt import fake as fake_virt
+
+LOG = logging.getLogger(__name__)
+
+
+class PeriodicNodeRecreateTestCase(test.TestCase,
+                                   integrated_helpers.InstanceHelperMixin):
+    """Regression test for bug 1839560 introduced in Rocky.
+
+    When an ironic node is undergoing maintenance the driver will not report
+    it as an available node to the ComputeManager.update_available_resource
+    periodic task. The ComputeManager will then (soft) delete a ComputeNode
+    record for that no-longer-available node. If/when the ironic node is
+    available again and the driver reports it, the ResourceTracker will
+    attempt to create a ComputeNode record for the ironic node.
+
+    The regression with change Ia69fabce8e7fd7de101e291fe133c6f5f5f7056a is
+    that the ironic node uuid is used as the ComputeNode.uuid and there is
+    a unique constraint on the ComputeNode.uuid value in the database. So
+    trying to create a ComputeNode with the same uuid (after the ironic node
+    comes back from being unavailable) fails with a DuplicateEntry error since
+    there is a (soft) deleted version of the ComputeNode with the same uuid
+    in the database.
+    """
+    def setUp(self):
+        super(PeriodicNodeRecreateTestCase, self).setUp()
+        # We need the PlacementFixture for the compute nodes to report in but
+        # otherwise don't care about placement for this test.
+        self.useFixture(nova_fixtures.PlacementFixture())
+        # Start up the API so we can query the os-hypervisors API.
+        self.api = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1')).admin_api
+        # Make sure we're using the fake driver that has predictable uuids
+        # for each node.
+        self.flags(compute_driver='fake.PredictableNodeUUIDDriver')
+
+    def test_update_available_resource_node_recreate(self):
+        # First we create a compute service to manage a couple of fake nodes.
+        # When start_service runs, it will create the node1 and node2
+        # ComputeNodes.
+        fake_virt.set_nodes(['node1', 'node2'])
+        self.addCleanup(fake_virt.restore_nodes)
+        compute = self.start_service('compute', 'node1')
+        # Now we should have two compute nodes, make sure the hypervisors API
+        # shows them.
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(2, len(hypervisors), hypervisors)
+        self.assertEqual({'node1', 'node2'},
+                         set([hyp['hypervisor_hostname']
+                              for hyp in hypervisors]))
+        # Now stub the driver to only report node1. This is making it look
+        # like node2 is no longer available when update_available_resource
+        # runs.
+        compute.manager.driver._nodes = ['node1']
+        ctxt = context.get_admin_context()
+        compute.manager.update_available_resource(ctxt)
+        # node2 should have been deleted, check the logs and API.
+        log = self.stdlog.logger.output
+        self.assertIn('Deleting orphan compute node', log)
+        self.assertIn('hypervisor host is node2', log)
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(1, len(hypervisors), hypervisors)
+        self.assertEqual('node1', hypervisors[0]['hypervisor_hostname'])
+        # But the node2 ComputeNode is still in the database with deleted!=0.
+        with utils.temporary_mutation(ctxt, read_deleted='yes'):
+            cn = objects.ComputeNode.get_by_host_and_nodename(
+                ctxt, 'node1', 'node2')
+            self.assertTrue(cn.deleted)
+        # Now stub the driver again to report node2 as being back and run
+        # the periodic task.
+        compute.manager.driver._nodes = ['node1', 'node2']
+        LOG.info('Running update_available_resource which should bring back '
+                 'node2.')
+        compute.manager.update_available_resource(ctxt)
+        # The DBDuplicateEntry error should have been handled and resulted in
+        # updating the (soft) deleted record to no longer be deleted.
+        log = self.stdlog.logger.output
+        self.assertNotIn('DBDuplicateEntry', log)
+        # Should have two reported hypervisors again.
+        hypervisors = self.api.api_get('/os-hypervisors').body['hypervisors']
+        self.assertEqual(2, len(hypervisors), hypervisors)
+        # Now that the node2 record was un-soft-deleted, archiving should not
+        # remove any compute_nodes.
+        LOG.info('Archiving the database.')
+        archived = db_api.archive_deleted_rows(1000)[0]
+        self.assertNotIn('compute_nodes', archived)
+        cn2 = objects.ComputeNode.get_by_host_and_nodename(
+            ctxt, 'node1', 'node2')
+        self.assertFalse(cn2.deleted)
+        self.assertIsNone(cn2.deleted_at)
+        # The node2 id and uuid should not have changed in the DB.
+        self.assertEqual(cn.id, cn2.id)
+        self.assertEqual(cn.uuid, cn2.uuid)
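The failure mode this test exercises, a soft-deleted row still occupying a unique index, can be reproduced with a toy SQLAlchemy schema (illustrative only; not Nova's real compute_nodes table):

    import sqlalchemy as sa

    engine = sa.create_engine('sqlite://')
    meta = sa.MetaData()
    nodes = sa.Table(
        'nodes', meta,
        sa.Column('id', sa.Integer, primary_key=True),
        sa.Column('uuid', sa.String(36), unique=True),
        sa.Column('deleted', sa.Integer, default=0))
    meta.create_all(engine)

    with engine.connect() as conn:
        conn.execute(nodes.insert().values(uuid='node2-uuid'))
        conn.execute(nodes.update().values(deleted=1))  # "soft delete"
        try:
            # Re-creating the node with the same uuid trips the unique
            # constraint even though the only existing row is soft-deleted.
            conn.execute(nodes.insert().values(uuid='node2-uuid'))
        except sa.exc.IntegrityError:
            print('duplicate uuid rejected despite the soft delete')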
73 changes: 39 additions & 34 deletions nova/tests/unit/compute/test_resource_tracker.py
@@ -1074,56 +1074,45 @@ def fake_get_all(_ctx, nodename):
         self.assertEqual(_HOSTNAME, self.rt.compute_nodes[_NODENAME].host)
 
     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_compute_node_created_on_empty(self, update_mock, get_mock,
-                                           create_mock, pci_tracker_mock,
+                                           create_mock,
                                            get_by_hypervisor_mock):
         get_by_hypervisor_mock.return_value = []
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock)
 
     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_compute_node_created_on_empty_rebalance(self, update_mock,
                                                      get_mock,
                                                      create_mock,
-                                                     pci_tracker_mock,
                                                      get_by_hypervisor_mock):
         get_by_hypervisor_mock.return_value = []
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock,
                                         rebalances_nodes=True)
 
     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_compute_node_created_too_many(self, update_mock, get_mock,
-                                           create_mock, pci_tracker_mock,
+                                           create_mock,
                                            get_by_hypervisor_mock):
         get_by_hypervisor_mock.return_value = ["fake_node_1", "fake_node_2"]
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock,
                                         rebalances_nodes=True)
 
-    def _test_compute_node_created(self, update_mock, get_mock,
-                                   create_mock, pci_tracker_mock,
+    def _test_compute_node_created(self, update_mock, get_mock, create_mock,
                                    get_by_hypervisor_mock,
                                    rebalances_nodes=False):
         self.flags(cpu_allocation_ratio=1.0, ram_allocation_ratio=1.0,
@@ -1152,12 +1141,12 @@ def _test_compute_node_created(self, update_mock, get_mock,
             'current_workload': 0,
             'vcpus': 4,
             'running_vms': 0,
-            'pci_passthrough_devices': '[]'
+            'pci_passthrough_devices': '[]',
+            'uuid': uuids.compute_node_uuid
         }
         # The expected compute represents the initial values used
         # when creating a compute node.
         expected_compute = objects.ComputeNode(
-            id=42,
             host_ip=resources['host_ip'],
             vcpus=resources['vcpus'],
             memory_mb=resources['memory_mb'],
@@ -1177,20 +1166,11 @@ def _test_compute_node_created(self, update_mock, get_mock,
             cpu_allocation_ratio=1.0,
             disk_allocation_ratio=1.0,
             stats={'failed_builds': 0},
-            pci_device_pools=objects.PciDevicePoolList(objects=[]),
             uuid=uuids.compute_node_uuid
         )
 
-        def set_cn_id():
-            # The PCI tracker needs the compute node's ID when starting up, so
-            # make sure that we set the ID value so we don't get a Cannot load
-            # 'id' in base class error
-            cn = self.rt.compute_nodes[_NODENAME]
-            cn.id = 42  # Has to be a number, not a mock
-            cn.uuid = uuids.compute_node_uuid
-
-        create_mock.side_effect = set_cn_id
-        self.rt._init_compute_node(mock.sentinel.ctx, resources)
+        with mock.patch.object(self.rt, '_setup_pci_tracker') as setup_pci:
+            self.rt._init_compute_node(mock.sentinel.ctx, resources)
 
         cn = self.rt.compute_nodes[_NODENAME]
         get_mock.assert_called_once_with(mock.sentinel.ctx, _HOSTNAME,
@@ -1202,22 +1182,47 @@ def set_cn_id():
         get_by_hypervisor_mock.assert_not_called()
         create_mock.assert_called_once_with()
         self.assertTrue(obj_base.obj_equal_prims(expected_compute, cn))
-        pci_tracker_mock.assert_called_once_with(mock.sentinel.ctx,
-                                                 42)
+        setup_pci.assert_called_once_with(mock.sentinel.ctx, cn, resources)
         self.assertFalse(update_mock.called)
 
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
+                '_setup_pci_tracker')
+    @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename',
+                side_effect=exc.ComputeHostNotFound(host=_HOSTNAME))
+    @mock.patch('nova.objects.ComputeNode.create',
+                side_effect=(test.TestingException, None))
+    def test_compute_node_create_fail_retry_works(self, mock_create, mock_get,
+                                                  mock_setup_pci):
+        """Tests that _init_compute_node will not save the ComputeNode object
+        in the compute_nodes dict if create() fails.
+        """
+        self._setup_rt()
+        self.assertEqual({}, self.rt.compute_nodes)
+        ctxt = context.get_context()
+        # The first ComputeNode.create fails so rt.compute_nodes should
+        # remain empty.
+        resources = copy.deepcopy(_VIRT_DRIVER_AVAIL_RESOURCES)
+        resources['uuid'] = uuids.cn_uuid  # for the LOG.info message
+        self.assertRaises(test.TestingException,
+                          self.rt._init_compute_node, ctxt, resources)
+        self.assertEqual({}, self.rt.compute_nodes)
+        # Second create works so compute_nodes should have a mapping.
+        self.rt._init_compute_node(ctxt, resources)
+        self.assertIn(_NODENAME, self.rt.compute_nodes)
+        mock_get.assert_has_calls([mock.call(
+            ctxt, _HOSTNAME, _NODENAME)] * 2)
+        self.assertEqual(2, mock_create.call_count)
+        mock_setup_pci.assert_called_once_with(
+            ctxt, test.MatchType(objects.ComputeNode), resources)
+
     @mock.patch('nova.objects.ComputeNodeList.get_by_hypervisor')
-    @mock.patch('nova.objects.PciDeviceList.get_by_compute_node',
-                return_value=objects.PciDeviceList(objects=[]))
     @mock.patch('nova.objects.ComputeNode.create')
     @mock.patch('nova.objects.ComputeNode.get_by_host_and_nodename')
     @mock.patch('nova.compute.resource_tracker.ResourceTracker.'
                 '_update')
     def test_node_removed(self, update_mock, get_mock,
-                          create_mock, pci_tracker_mock,
-                          get_by_hypervisor_mock):
+                          create_mock, get_by_hypervisor_mock):
         self._test_compute_node_created(update_mock, get_mock, create_mock,
-                                        pci_tracker_mock,
                                         get_by_hypervisor_mock)
         self.rt.old_resources[_NODENAME] = mock.sentinel.foo
         self.assertIn(_NODENAME, self.rt.compute_nodes)
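The retry test above leans on a detail of unittest.mock: an iterable side_effect acts as a per-call script, and any item that is an exception class or instance is raised instead of returned. A standalone illustration:

    from unittest import mock

    create = mock.Mock(side_effect=(RuntimeError, None))
    try:
        create()  # first call raises RuntimeError
    except RuntimeError:
        pass
    create()  # second call returns None
    assert create.call_count == 2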
12 changes: 12 additions & 0 deletions nova/tests/unit/db/test_db_api.py
@@ -6882,6 +6882,18 @@ def test_compute_node_create(self):
         new_stats = jsonutils.loads(self.item['stats'])
         self.assertEqual(self.stats, new_stats)
 
+    def test_compute_node_create_duplicate_host_hypervisor_hostname(self):
+        """Tests to make sure that DBDuplicateEntry is raised when trying to
+        create a duplicate ComputeNode with the same host and
+        hypervisor_hostname values but different uuid values. This makes
+        sure that when _compute_node_get_and_update_deleted returns None
+        the DBDuplicateEntry is re-raised.
+        """
+        other_node = dict(self.compute_node_dict)
+        other_node['uuid'] = uuidutils.generate_uuid()
+        self.assertRaises(db_exc.DBDuplicateEntry,
+                          db.compute_node_create, self.ctxt, other_node)
+
     def test_compute_node_get_all(self):
         nodes = db.compute_node_get_all(self.ctxt)
         self.assertEqual(1, len(nodes))
14 changes: 14 additions & 0 deletions nova/virt/fake.py
@@ -27,6 +27,7 @@
 import contextlib
 import copy
 import time
+import uuid
 
 from oslo_log import log as logging
 from oslo_serialization import jsonutils
@@ -674,6 +675,19 @@ class MediumFakeDriver(FakeDriver):
     local_gb = 1028
 
 
+class PredictableNodeUUIDDriver(SmallFakeDriver):
+    """SmallFakeDriver variant that reports a predictable node uuid in
+    get_available_resource, like IronicDriver.
+    """
+    def get_available_resource(self, nodename):
+        resources = super(
+            PredictableNodeUUIDDriver, self).get_available_resource(nodename)
+        # This is used in ComputeNode.update_from_virt_driver which is called
+        # from the ResourceTracker when creating a ComputeNode.
+        resources['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS, nodename)
+        return resources
+
+
 class FakeRescheduleDriver(FakeDriver):
     """FakeDriver derivative that triggers a reschedule on the first spawn
     attempt. This is expected to only be used in tests that have more than
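uuid.uuid5 generates deterministic, name-based (SHA-1) UUIDs: the same namespace and name always yield the same value, which is what makes the fake driver's node uuids "predictable" across service restarts. For example:

    import uuid

    a = uuid.uuid5(uuid.NAMESPACE_DNS, 'node1')
    b = uuid.uuid5(uuid.NAMESPACE_DNS, 'node1')
    assert a == b  # stable for a given (namespace, name) pair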