From 2691056d6fa3e7db0cf9966b082af51ae6b5dda9 Mon Sep 17 00:00:00 2001
From: Johannes Kulik
Date: Wed, 31 Aug 2022 11:03:50 +0200
Subject: [PATCH] Allow switching resources on overused providers

When a resource-provider gets overused, e.g. because a host in the
cluster it represents goes down or because one of multiple memory
DIMMs fails, we need to be able to (offline) migrate/resize a VM to
reduce the pressure on the host.

For that, the call setting the allocations now checks whether exactly
two consumers are involved and every allocation that puts a value on a
resource is matched by an allocation removing that resource from the
other consumer - i.e. the allocations are only switched from one
consumer to another, as is done for migrations. Only then is the
over-capacity check skipped.
---
 placement/objects/allocation.py               |  28 +++-
 .../tests/functional/db/test_allocation.py    | 127 ++++++++++++++++++
 2 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/placement/objects/allocation.py b/placement/objects/allocation.py
index 4c32954d8..71d347981 100644
--- a/placement/objects/allocation.py
+++ b/placement/objects/allocation.py
@@ -11,6 +11,7 @@
 #    under the License.
 
 import collections
+import itertools
 
 from oslo_db import api as oslo_db_api
 from oslo_log import log as logging
@@ -176,6 +177,28 @@ def _check_capacity_exceeded(ctx, allocs):
         raise exception.InvalidInventory(
             resource_class=class_str, resource_provider=provider_str)
 
+    # check if this is a complete replacement of one consumer with another
+    # TODO(jkulik): Only allow replacing with the same amount or less.
+    consumer_resources = collections.defaultdict(list)
+    for alloc in allocs:
+        d = (alloc.resource_class, alloc.used)
+        consumer_resources[alloc.consumer.id].append(d)
+    is_replacement = len(consumer_resources) == 2
+    if is_replacement:
+        # sort the resources so we get matching resource pairs (if possible)
+        # in the next step
+        for k in consumer_resources:
+            consumer_resources[k].sort()
+        # check that every resource change has a matching second change
+        # removing it from another consumer, so the whole process is a
+        # replacement
+        for items in itertools.zip_longest(*consumer_resources.values(),
+                                           fillvalue=(None, None)):
+            (cls1, used1), (cls2, used2) = items
+            if cls1 != cls2 or 0 not in (used1, used2):
+                is_replacement = False
+                break
+
     res_providers = {}
     rp_resource_class_sum = collections.defaultdict(
         lambda: collections.defaultdict(int))
@@ -223,8 +246,9 @@ def _check_capacity_exceeded(ctx, allocs):
         # usage["used"] can be returned as None
         used = usage['used'] or 0
         capacity = (usage['total'] - usage['reserved']) * allocation_ratio
-        if (capacity < (used + amount_needed) or
-                capacity < (used + rp_resource_class_sum[rp_uuid][rc_id])):
+        if ((capacity < (used + amount_needed) or
+                capacity < (used + rp_resource_class_sum[rp_uuid][rc_id])) and
+                not is_replacement):
             LOG.warning(
                 "Over capacity for %(rc)s on resource provider %(rp)s. "
                 "Needed: %(needed)s, Used: %(used)s, Capacity: %(cap)s",
diff --git a/placement/tests/functional/db/test_allocation.py b/placement/tests/functional/db/test_allocation.py
index 2e839e6ef..a2434a991 100644
--- a/placement/tests/functional/db/test_allocation.py
+++ b/placement/tests/functional/db/test_allocation.py
@@ -678,3 +678,130 @@ def side_effect(*args, **kwargs):
         new_rp = alloc_list[0].resource_provider
         self.assertEqual(original_generation, rp1.generation)
         self.assertEqual(original_generation + 1, new_rp.generation)
+
+    def test_switch_allocations_overbooked(self):
+        """A resource-provider which is already overbooked gets an allocation
+        switched from one consumer to another one. This is necessary to
+        migrate away resources to make the resource-provider less full.
+        """
+        rp = self._create_provider('full_rp')
+
+        cpu_inv = tb.add_inventory(rp, orc.VCPU, 24,
+                                   allocation_ratio=16.0)
+        mem_inv = tb.add_inventory(rp, orc.MEMORY_MB, 1024,
+                                   min_unit=64,
+                                   max_unit=1024,
+                                   step_size=64)
+
+        # Create a consumer already filling half the provider
+        filling_consumer = consumer_obj.Consumer(
+            self.ctx, uuid=uuidsentinel.filler, user=self.user_obj,
+            project=self.project_obj)
+        filling_consumer.create()
+
+        alloc_list = [
+            alloc_obj.Allocation(
+                consumer=filling_consumer,
+                resource_provider=rp,
+                resource_class=orc.VCPU,
+                used=12),
+            alloc_obj.Allocation(
+                consumer=filling_consumer,
+                resource_provider=rp,
+                resource_class=orc.MEMORY_MB,
+                used=512)
+        ]
+        alloc_obj.replace_all(self.ctx, alloc_list)
+
+        # Create a consumer representing the instance, filling the other half
+        inst_consumer = consumer_obj.Consumer(
+            self.ctx, uuid=uuidsentinel.instance, user=self.user_obj,
+            project=self.project_obj)
+        inst_consumer.create()
+
+        alloc_list = [
+            alloc_obj.Allocation(
+                consumer=inst_consumer,
+                resource_provider=rp,
+                resource_class=orc.VCPU,
+                used=12),
+            alloc_obj.Allocation(
+                consumer=inst_consumer,
+                resource_provider=rp,
+                resource_class=orc.MEMORY_MB,
+                used=512)
+        ]
+        alloc_obj.replace_all(self.ctx, alloc_list)
+
+        # inventory of the provider changes (because a host in the cluster
+        # it represents goes down or a memory-module fails)
+        mem_inv.total = 768
+        rp.set_inventory([cpu_inv, mem_inv])
+
+        # Create a consumer representing the migration
+        mig_consumer = consumer_obj.Consumer(
+            self.ctx, uuid=uuidsentinel.migration, user=self.user_obj,
+            project=self.project_obj)
+        mig_consumer.create()
+
+        # requesting additional resources fails
+        alloc_list = [
+            alloc_obj.Allocation(
+                consumer=mig_consumer,
+                resource_provider=rp,
+                resource_class=orc.VCPU,
+                used=12),
+            alloc_obj.Allocation(
+                consumer=mig_consumer,
+                resource_provider=rp,
+                resource_class=orc.MEMORY_MB,
+                used=512)
+        ]
+        self.assertRaises(exception.InvalidAllocationCapacityExceeded,
+                          alloc_obj.replace_all, self.ctx, alloc_list)
+
+        # switching over only some resources from one consumer to another fails
+        alloc_list.append(
+            alloc_obj.Allocation(
+                consumer=inst_consumer,
+                resource_provider=rp,
+                resource_class=orc.MEMORY_MB,
+                used=0)
+        )
+        self.assertRaises(exception.InvalidAllocationCapacityExceeded,
+                          alloc_obj.replace_all, self.ctx, alloc_list)
+
+        # switching over all resources works
+        alloc_list.append(
+            alloc_obj.Allocation(
+                consumer=inst_consumer,
+                resource_provider=rp,
+                resource_class=orc.VCPU,
+                used=0)
+        )
+        alloc_obj.replace_all(self.ctx, alloc_list)
+
+        # check that we still have the same number of resource allocations
+        allocations = alloc_obj.get_all_by_resource_provider(self.ctx, rp)
+        self.assertEqual(4, len(allocations))
+
+        # instance now has no allocations
+        allocations = alloc_obj.get_all_by_consumer_id(self.ctx,
+                                                       inst_consumer.uuid)
+        self.assertEqual(0, len(allocations))
+
+        # migration has the resources of the instance
+        allocations = alloc_obj.get_all_by_consumer_id(self.ctx,
+                                                       mig_consumer.uuid)
+        self.assertEqual(2, len(allocations))
+        expected = sorted([(orc.VCPU, 12), (orc.MEMORY_MB, 512)])
+        actual = sorted([(a.resource_class, a.used) for a in allocations])
+        self.assertEqual(expected, actual)
+
+        # filling consumer stayed the same
+        allocations = alloc_obj.get_all_by_consumer_id(self.ctx,
+                                                       filling_consumer.uuid)
+        self.assertEqual(2, len(allocations))
+        expected = sorted([(orc.VCPU, 12), (orc.MEMORY_MB, 512)])
+        actual = sorted([(a.resource_class, a.used) for a in allocations])
+        self.assertEqual(expected, actual)
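
Note on the added check: what follows is a minimal, standalone sketch of
the consumer-switch detection the patch adds to _check_capacity_exceeded,
included only to make the pairing logic easy to try outside of placement.
It works on plain (consumer, resource_class, used) tuples instead of
Allocation objects; the helper name is_consumer_switch and the sample
data are illustrative and not part of the patch.

import collections
import itertools


def is_consumer_switch(allocs):
    """Return True if ``allocs`` only moves resources between two consumers.

    ``allocs`` is a list of (consumer_id, resource_class, used) tuples.
    Mirrors the patch's check: exactly two consumers are involved, and
    every (resource_class, used) entry on one side is paired with an entry
    for the same class whose usage drops to zero on the other side.
    """
    consumer_resources = collections.defaultdict(list)
    for consumer_id, resource_class, used in allocs:
        consumer_resources[consumer_id].append((resource_class, used))
    if len(consumer_resources) != 2:
        return False
    # sort both lists so matching resource classes line up pairwise
    for resources in consumer_resources.values():
        resources.sort()
    for pair in itertools.zip_longest(*consumer_resources.values(),
                                      fillvalue=(None, None)):
        (cls1, used1), (cls2, used2) = pair
        # a pure switch needs the same class on both sides and one side
        # giving up its usage entirely (used == 0)
        if cls1 != cls2 or 0 not in (used1, used2):
            return False
    return True


# all of the instance's usage moves to the migration consumer -> True
print(is_consumer_switch([
    ('migration', 'VCPU', 12), ('migration', 'MEMORY_MB', 512),
    ('instance', 'VCPU', 0), ('instance', 'MEMORY_MB', 0),
]))

# VCPU is requested without being freed on the other consumer -> False
print(is_consumer_switch([
    ('migration', 'VCPU', 12), ('migration', 'MEMORY_MB', 512),
    ('instance', 'MEMORY_MB', 0),
]))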