Skip to content

Commit

Permalink
Allow switching resources on overused providers
Browse files Browse the repository at this point in the history
When a resource-provider gets overused, e.g. because a host in the cluster it
represents goes down or because one of multiple memory DIMMs fails, we need to
be able to (offline) migrate/resize a VM to reduce the pressure on the host.

For that, the call setting the allocations now checks whether every
allocation adding usage for a resource is matched by an allocation removing
that same resource from another consumer — i.e. the allocations are merely
being switched from one consumer to another, as is done for migrations.
  • Loading branch information
joker-at-work committed Sep 8, 2022
1 parent 4d3df47 commit 2691056
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 2 deletions.
28 changes: 26 additions & 2 deletions placement/objects/allocation.py
Expand Up @@ -11,6 +11,7 @@
# under the License.

import collections
import itertools

from oslo_db import api as oslo_db_api
from oslo_log import log as logging
Expand Down Expand Up @@ -176,6 +177,28 @@ def _check_capacity_exceeded(ctx, allocs):
raise exception.InvalidInventory(
resource_class=class_str, resource_provider=provider_str)

# check if this is a complete replacement of one consumer with another one
# TODO(jkulik): Make this only work if it's replacing to same or less.
# Collect the (resource_class, used) pairs requested per consumer.
consumer_resources = collections.defaultdict(list)
for alloc in allocs:
d = (alloc.resource_class, alloc.used)
consumer_resources[alloc.consumer.id].append(d)
# NOTE(review): a replacement is assumed to involve exactly two consumers
# (source and target); a request touching three or more consumers is never
# treated as a replacement — confirm that is intended.
is_replacement = len(consumer_resources) == 2
if is_replacement:
# sort the resources so we get matching resource pairs (if possible) in
# the next step
for k in consumer_resources:
consumer_resources[k].sort()
# check that every resource change has a matching second change
# removing it from another consumer so the whole process is a
# replacement
# zip_longest's (None, None) fillvalue guarantees a class mismatch
# (and thus is_replacement = False) when the two consumers request a
# different number of resource classes.
for items in itertools.zip_longest(*consumer_resources.values(),
fillvalue=(None, None)):
(cls1, used1), (cls2, used2) = items
# a matching pair must name the same resource class, and exactly one
# side must zero out its usage (the "removing" half of the switch)
if cls1 != cls2 or 0 not in (used1, used2):
is_replacement = False
break

# cache of resource providers touched by this request, keyed by uuid
res_providers = {}
# per-provider, per-resource-class sum of the amounts requested across the
# whole allocation set — used below to validate capacity for the request
# as a whole, not just allocation by allocation
rp_resource_class_sum = collections.defaultdict(
lambda: collections.defaultdict(int))
Expand Down Expand Up @@ -223,8 +246,9 @@ def _check_capacity_exceeded(ctx, allocs):
# usage["used"] can be returned as None
used = usage['used'] or 0
# effective capacity honours the reserved amount and the overcommit ratio
capacity = (usage['total'] - usage['reserved']) * allocation_ratio
# NOTE(review): when is_replacement is True the capacity check is skipped
# entirely — even a replacement that would *increase* usage passes; see
# the TODO above about restricting this to same-or-less usage.
if ((capacity < (used + amount_needed) or
capacity < (used + rp_resource_class_sum[rp_uuid][rc_id])) and
not is_replacement):
LOG.warning(
"Over capacity for %(rc)s on resource provider %(rp)s. "
"Needed: %(needed)s, Used: %(used)s, Capacity: %(cap)s",
Expand Down
127 changes: 127 additions & 0 deletions placement/tests/functional/db/test_allocation.py
Expand Up @@ -678,3 +678,130 @@ def side_effect(*args, **kwargs):
new_rp = alloc_list[0].resource_provider
self.assertEqual(original_generation, rp1.generation)
self.assertEqual(original_generation + 1, new_rp.generation)

def test_switch_allocations_overbooked(self):
    """Switching allocations between consumers works on a full provider.

    A resource-provider that is already overbooked (e.g. after its
    inventory shrank because a host or memory module failed) must still
    accept a request that merely moves allocations from one consumer to
    another, so the pressure on the provider can be migrated away.
    """
    rp = self._create_provider('full_rp')

    cpu_inv = tb.add_inventory(rp, orc.VCPU, 24,
                               allocation_ratio=16.0)
    mem_inv = tb.add_inventory(rp, orc.MEMORY_MB, 1024,
                               min_unit=64,
                               max_unit=1024,
                               step_size=64)

    def _make_consumer(uuid):
        # one consumer per role: filler, instance, migration
        consumer = consumer_obj.Consumer(
            self.ctx, uuid=uuid, user=self.user_obj,
            project=self.project_obj)
        consumer.create()
        return consumer

    def _make_allocs(consumer, vcpu_used, mem_used):
        # a VCPU + MEMORY_MB allocation pair against rp for one consumer
        return [
            alloc_obj.Allocation(
                consumer=consumer,
                resource_provider=rp,
                resource_class=orc.VCPU,
                used=vcpu_used),
            alloc_obj.Allocation(
                consumer=consumer,
                resource_provider=rp,
                resource_class=orc.MEMORY_MB,
                used=mem_used),
        ]

    # a consumer already occupying half of the provider
    filling_consumer = _make_consumer(uuidsentinel.filler)
    alloc_obj.replace_all(self.ctx,
                          _make_allocs(filling_consumer, 12, 512))

    # the instance occupies the other half
    inst_consumer = _make_consumer(uuidsentinel.instance)
    alloc_obj.replace_all(self.ctx, _make_allocs(inst_consumer, 12, 512))

    # the provider's inventory shrinks (because a host in the cluster it
    # represents goes down or a memory-module fails), so it is overbooked
    mem_inv.total = 768
    rp.set_inventory([cpu_inv, mem_inv])

    # a consumer representing the migration
    mig_consumer = _make_consumer(uuidsentinel.migration)

    # plainly requesting additional resources must fail
    alloc_list = _make_allocs(mig_consumer, 12, 512)
    self.assertRaises(exception.InvalidAllocationCapacityExceeded,
                      alloc_obj.replace_all, self.ctx, alloc_list)

    # a partial hand-over (only memory zeroed on the instance) also fails
    alloc_list.append(
        alloc_obj.Allocation(
            consumer=inst_consumer,
            resource_provider=rp,
            resource_class=orc.MEMORY_MB,
            used=0)
    )
    self.assertRaises(exception.InvalidAllocationCapacityExceeded,
                      alloc_obj.replace_all, self.ctx, alloc_list)

    # handing over all of the instance's resources works
    alloc_list.append(
        alloc_obj.Allocation(
            consumer=inst_consumer,
            resource_provider=rp,
            resource_class=orc.VCPU,
            used=0)
    )
    alloc_obj.replace_all(self.ctx, alloc_list)

    # the provider still carries the same number of allocations
    provider_allocs = alloc_obj.get_all_by_resource_provider(self.ctx, rp)
    self.assertEqual(4, len(provider_allocs))

    # the instance has been emptied out
    inst_allocs = alloc_obj.get_all_by_consumer_id(self.ctx,
                                                   inst_consumer.uuid)
    self.assertEqual(0, len(inst_allocs))

    expected = sorted([(orc.VCPU, 12), (orc.MEMORY_MB, 512)])

    # the migration took over the instance's resources
    mig_allocs = alloc_obj.get_all_by_consumer_id(self.ctx,
                                                  mig_consumer.uuid)
    self.assertEqual(2, len(mig_allocs))
    actual = sorted([(a.resource_class, a.used) for a in mig_allocs])
    self.assertEqual(expected, actual)

    # the filling consumer was left untouched
    fill_allocs = alloc_obj.get_all_by_consumer_id(self.ctx,
                                                   filling_consumer.uuid)
    self.assertEqual(2, len(fill_allocs))
    actual = sorted([(a.resource_class, a.used) for a in fill_allocs])
    self.assertEqual(expected, actual)

0 comments on commit 2691056

Please sign in to comment.