Commit cd0403d

yusuke-okada authored and gibizer committed
Fix failed count for anti-affinity check
The late anti-affinity check runs in the compute manager to prevent
parallel scheduling requests from invalidating the anti-affinity
server group policy. When the check fails, the instance is
re-scheduled. However, this failure was counted as a real instance
boot failure of the compute host, which can lead to de-prioritization
of the compute host in the scheduler via the BuildFailureWeigher. As
the late anti-affinity check does not indicate any fault of the
compute host itself, it should not be counted towards the build
failure counter. This patch adds new build results to handle this
case.

Closes-Bug: #1996732
Change-Id: I2ba035c09ace20e9835d9d12a5c5bee17d616718
Signed-off-by: Yusuke Okada <okada.yusuke@fujitsu.com>
(cherry picked from commit 56d320a)
(cherry picked from commit 1b56714)
(cherry picked from commit 2f1d657)
1 parent 8653767 commit cd0403d
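In outline, the patch keys the per-node failure accounting off the build result. A minimal sketch of that rule follows; the names record_build_result and node_stats are illustrative, not Nova's real helpers:

    # Policy-driven results return early and never touch the counter
    # consumed by the scheduler's BuildFailureWeigher.
    FAILED = 'failed'
    RESCHEDULED = 'rescheduled'
    FAILED_BY_POLICY = 'failed_by_policy'
    RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

    def record_build_result(result, node_stats):
        if result in (FAILED_BY_POLICY, RESCHEDULED_BY_POLICY):
            # A late (anti-)affinity violation says nothing about the
            # health of the compute host, so skip the failure counter.
            return
        if result in (FAILED, RESCHEDULED):
            node_stats['failed_builds'] = (
                node_stats.get('failed_builds', 0) + 1)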

File tree

5 files changed: +265 additions, -14 deletions


nova/compute/build_results.py

Lines changed: 8 additions & 0 deletions
@@ -24,3 +24,11 @@
 ACTIVE = 'active'  # Instance is running
 FAILED = 'failed'  # Instance failed to build and was not rescheduled
 RESCHEDULED = 'rescheduled'  # Instance failed to build, but was rescheduled
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# and was not rescheduled. In this case, the node's failed count won't be
+# increased.
+FAILED_BY_POLICY = 'failed_by_policy'
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# but was rescheduled. In this case, the node's failed count won't be
+# increased.
+RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

nova/compute/manager.py

Lines changed: 24 additions & 9 deletions
@@ -1804,11 +1804,8 @@ def _do_validation(context, instance, group):
                 else:
                     max_server = 1
                 if len(members_on_host) >= max_server:
-                    msg = _("Anti-affinity instance group policy "
-                            "was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Anti-affinity')
 
             # NOTE(ganso): The check for affinity below does not work and it
             # can easily be violated because the lock happens in different
@@ -1818,10 +1815,8 @@ def _do_validation(context, instance, group):
             elif group.policy and 'affinity' == group.policy:
                 group_hosts = group.get_hosts(exclude=[instance.uuid])
                 if group_hosts and self.host not in group_hosts:
-                    msg = _("Affinity instance group policy was violated.")
-                    raise exception.RescheduledException(
-                        instance_uuid=instance.uuid,
-                        reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Affinity')
 
         _do_validation(context, instance, group)
 
@@ -2256,6 +2251,9 @@ def _locked_do_build_and_run_instance(*args, **kwargs):
                 self.reportclient.delete_allocation_for_instance(
                     context, instance.uuid, force=True)
 
+            if result in (build_results.FAILED_BY_POLICY,
+                          build_results.RESCHEDULED_BY_POLICY):
+                return
             if result in (build_results.FAILED,
                           build_results.RESCHEDULED):
                 self._build_failed(node)
@@ -2354,6 +2352,8 @@ def _do_build_and_run_instance(self, context, instance, image,
                 self._nil_out_instance_obj_host_and_node(instance)
                 self._set_instance_obj_error_state(instance,
                                                    clean_task_state=True)
+                if isinstance(e, exception.RescheduledByPolicyException):
+                    return build_results.FAILED_BY_POLICY
                 return build_results.FAILED
             LOG.debug(e.format_message(), instance=instance)
             # This will be used for logging the exception
@@ -2380,6 +2380,10 @@ def _do_build_and_run_instance(self, context, instance, image,
                 injected_files, requested_networks, security_groups,
                 block_device_mapping, request_spec=request_spec,
                 host_lists=[host_list])
+
+            if isinstance(e, exception.RescheduledByPolicyException):
+                return build_results.RESCHEDULED_BY_POLICY
+
             return build_results.RESCHEDULED
         except (exception.InstanceNotFound,
                 exception.UnexpectedDeletingTaskStateError):
@@ -2597,6 +2601,17 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
                     bdms=block_device_mapping)
             raise exception.BuildAbortException(instance_uuid=instance.uuid,
                                                 reason=e.format_message())
+        except exception.GroupAffinityViolation as e:
+            LOG.exception('Failed to build and run instance',
+                          instance=instance)
+            self._notify_about_instance_usage(context, instance,
+                                              'create.error', fault=e)
+            compute_utils.notify_about_instance_create(
+                context, instance, self.host,
+                phase=fields.NotificationPhase.ERROR, exception=e,
+                bdms=block_device_mapping)
+            raise exception.RescheduledByPolicyException(
+                instance_uuid=instance.uuid, reason=str(e))
         except Exception as e:
             LOG.exception('Failed to build and run instance',
                           instance=instance)
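Design note: RescheduledByPolicyException subclasses RescheduledException (see nova/exception.py below), so the existing except clause keeps catching the policy case and only the returned build result changes. A runnable sketch of that selection logic, with simplified stand-in names:

    # Constants mirror nova/compute/build_results.py; the exception
    # classes are stand-ins for the real ones in nova/exception.py.
    ACTIVE = 'active'
    RESCHEDULED = 'rescheduled'
    RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'

    class RescheduledException(Exception):
        pass

    class RescheduledByPolicyException(RescheduledException):
        pass

    def do_build_and_run(build, reschedule):
        try:
            build()
        except RescheduledException as e:
            reschedule()  # hand the request back to the scheduler
            if isinstance(e, RescheduledByPolicyException):
                return RESCHEDULED_BY_POLICY  # not counted as a host failure
            return RESCHEDULED
        return ACTIVE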

nova/exception.py

Lines changed: 9 additions & 0 deletions
@@ -1477,6 +1477,15 @@ class RescheduledException(NovaException):
             "%(reason)s")
 
 
+class RescheduledByPolicyException(RescheduledException):
+    msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
+                "%(reason)s")
+
+
+class GroupAffinityViolation(NovaException):
+    msg_fmt = _("%(policy)s instance group policy was violated")
+
+
 class InstanceFaultRollback(NovaException):
     def __init__(self, inner_exception=None):
         message = _("Instance rollback performed due to: %s")
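For reference, NovaException interpolates msg_fmt with the keyword arguments given at raise time, so GroupAffinityViolation(policy='Anti-affinity') renders as "Anti-affinity instance group policy was violated"; extra kwargs such as instance_uuid are tolerated by %-formatting against a dict. A hypothetical standalone illustration (not Nova code):

    msg_fmt = "%(policy)s instance group policy was violated"
    print(msg_fmt % {'policy': 'Anti-affinity',
                     'instance_uuid': 'fake-uuid'})  # extra key is ignored
    # -> Anti-affinity instance group policy was violated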

nova/tests/functional/test_server_group.py

Lines changed: 80 additions & 0 deletions
@@ -19,6 +19,7 @@
 from nova.compute import instance_actions
 from nova import context
 from nova.db.main import api as db
+from nova import objects
 from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
@@ -494,6 +495,85 @@ def test_soft_affinity_not_supported(self):
         self.assertIn('Invalid input', ex.response.text)
         self.assertIn('soft-affinity', ex.response.text)
 
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_affinity_violation(self, mock_host_passes):
+        """Check that the failed count is not incremented after a
+        violation of the late affinity check.
+        https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure the first instance is on compute1
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Ensure the second instance is on compute2
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'enabled'})
+            compute1_service_id = self.admin_api.get_services(
+                host=self.compute.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute1_service_id,
+                                       {'status': 'disabled'})
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+                'ServerGroupAntiAffinityFilter.host_passes',
+                return_value=True)
+    def test_failed_count_with_anti_affinity_violation(self,
+                                                       mock_host_passes):
+        """Check that the failed count is not incremented after a
+        violation of the late anti-affinity check.
+        https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.anti_affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure two instances are scheduled on the same host
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+                host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                       {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
 
 class ServerGroupAffinityConfTest(ServerGroupTestBase):
     api_major_version = 'v2.1'
