Skip to content

Commit c6e1b44

Browse files
committed
pwr mgmt: handle live migrations correctly
Previously, live migrations completely ignored CPU power management. This patch makes sure that we correctly: * Power up the cores on the destination during pre_live_migration, as we need them powered up before the instance starts on the destination. * If the live migration is successful, power down the vacated cores on the source. * In case of a rollback, power down the cores previously powered up on pre_live_migration. Closes-bug: 2056613 Change-Id: I787bd7807950370cd865f29b95989d489d4826d0 (cherry picked from commit c1ccc1a) (cherry picked from commit c5a73e6)
1 parent 6d48c12 commit c6e1b44

File tree

5 files changed

+137
-39
lines changed

5 files changed

+137
-39
lines changed

nova/compute/manager.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9095,12 +9095,16 @@ def _live_migration_cleanup_flags(self, migrate_data, migr_ctxt=None):
90959095
objects.LibvirtVPMEMDevice)):
90969096
has_vpmem = True
90979097
break
9098+
power_management_possible = (
9099+
'dst_numa_info' in migrate_data and
9100+
migrate_data.dst_numa_info is not None)
90989101
# No instance booting at source host, but instance dir
90999102
# must be deleted for preparing next block migration
91009103
# must be deleted for preparing next live migration w/o shared
91019104
# storage
91029105
# vpmem must be cleaned
9103-
do_cleanup = not migrate_data.is_shared_instance_path or has_vpmem
9106+
do_cleanup = (not migrate_data.is_shared_instance_path or
9107+
has_vpmem or power_management_possible)
91049108
destroy_disks = not migrate_data.is_shared_block_storage
91059109
elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData):
91069110
# NOTE(claudiub): We need to cleanup any zombie Planned VM.

nova/tests/functional/libvirt/test_power_manage.py

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,15 @@ def setUp(self):
5959
'hw:cpu_policy': 'dedicated',
6060
'hw:cpu_thread_policy': 'prefer',
6161
}
62+
self.isolate_extra_spec = {
63+
'hw:cpu_policy': 'dedicated',
64+
'hw:cpu_thread_policy': 'prefer',
65+
'hw:emulator_threads_policy': 'isolate',
66+
}
6267
self.pcpu_flavor_id = self._create_flavor(
6368
vcpu=4, extra_spec=self.extra_spec)
6469
self.isolate_flavor_id = self._create_flavor(
65-
vcpu=4, extra_spec={'hw:cpu_policy': 'dedicated',
66-
'hw:cpu_thread_policy': 'prefer',
67-
'hw:emulator_threads_policy': 'isolate'})
70+
vcpu=4, extra_spec=self.isolate_extra_spec)
6871

6972
def _assert_server_cpus_state(self, server, expected='online'):
7073
inst = objects.Instance.get_by_uuid(self.ctxt, server['id'])
@@ -117,8 +120,8 @@ def __call__(self, i):
117120
return self.cores[i]
118121

119122

120-
class PowerManagementLiveMigrationTests(base.LibvirtMigrationMixin,
121-
PowerManagementTestsBase):
123+
class PowerManagementLiveMigrationTestsBase(base.LibvirtMigrationMixin,
124+
PowerManagementTestsBase):
122125

123126
def setUp(self):
124127
super().setUp()
@@ -129,10 +132,13 @@ def setUp(self):
129132
self.flags(vcpu_pin_set=None)
130133
self.flags(cpu_power_management=True, group='libvirt')
131134

132-
# NOTE(artom) Fill up all dedicated CPUs. This makes the assertions
133-
# further down easier.
135+
# NOTE(artom) Fill up all dedicated CPUs (either with only the
136+
# instance's CPUs, or instance CPUs + 1 emulator thread). This makes
137+
# the assertions further down easier.
134138
self.pcpu_flavor_id = self._create_flavor(
135139
vcpu=9, extra_spec=self.extra_spec)
140+
self.isolate_flavor_id = self._create_flavor(
141+
vcpu=8, extra_spec=self.isolate_extra_spec)
136142

137143
self.start_compute(
138144
host_info=fakelibvirt.HostInfo(cpu_nodes=1, cpu_sockets=1,
@@ -156,14 +162,61 @@ def assert_cores(self, host, cores, online=True):
156162
for i in cores:
157163
self.assertEqual(online, host.driver.cpu_api.core(i).online)
158164

165+
166+
class PowerManagementLiveMigrationTests(PowerManagementLiveMigrationTestsBase):
167+
159168
def test_live_migrate_server(self):
160169
self.server = self._create_server(
161170
flavor_id=self.pcpu_flavor_id,
162171
expected_state='ACTIVE', host='src')
163172
server = self._live_migrate(self.server)
164173
self.assertEqual('dest', server['OS-EXT-SRV-ATTR:host'])
165-
# FIXME(artom) We've not powered up the dest cores, and left the src
166-
# cores powered on.
174+
# We've powered down the source cores, and powered up the destination
175+
# ones.
176+
self.assert_cores(self.src, range(1, 10), online=False)
177+
self.assert_cores(self.dest, range(1, 10), online=True)
178+
179+
def test_live_migrate_server_with_emulator_threads_isolate(self):
180+
self.server = self._create_server(
181+
flavor_id=self.isolate_flavor_id,
182+
expected_state='ACTIVE', host='src')
183+
server = self._live_migrate(self.server)
184+
self.assertEqual('dest', server['OS-EXT-SRV-ATTR:host'])
185+
# We're using a flavor with 8 CPUs, but with the extra dedicated CPU
186+
# for the emulator threads, we expect all 9 cores to be powered up on
187+
# the dest, and down on the source.
188+
self.assert_cores(self.src, range(1, 10), online=False)
189+
self.assert_cores(self.dest, range(1, 10), online=True)
190+
191+
192+
class PowerManagementLiveMigrationRollbackTests(
193+
PowerManagementLiveMigrationTestsBase):
194+
195+
def _migrate_stub(self, domain, destination, params, flags):
196+
conn = self.src.driver._host.get_connection()
197+
dom = conn.lookupByUUIDString(self.server['id'])
198+
dom.fail_job()
199+
200+
def test_live_migrate_server_rollback(self):
201+
self.server = self._create_server(
202+
flavor_id=self.pcpu_flavor_id,
203+
expected_state='ACTIVE', host='src')
204+
server = self._live_migrate(self.server,
205+
migration_expected_state='failed')
206+
self.assertEqual('src', server['OS-EXT-SRV-ATTR:host'])
207+
self.assert_cores(self.src, range(1, 10), online=True)
208+
self.assert_cores(self.dest, range(1, 10), online=False)
209+
210+
def test_live_migrate_server_with_emulator_threads_isolate_rollback(self):
211+
self.server = self._create_server(
212+
flavor_id=self.isolate_flavor_id,
213+
expected_state='ACTIVE', host='src')
214+
server = self._live_migrate(self.server,
215+
migration_expected_state='failed')
216+
self.assertEqual('src', server['OS-EXT-SRV-ATTR:host'])
217+
# We're using a flavor with 8 CPUs, but with the extra dedicated CPU
218+
# for the emulator threads, we expect all 9 cores to be powered back
219+
# down on the dest, and up on the source.
167220
self.assert_cores(self.src, range(1, 10), online=True)
168221
self.assert_cores(self.dest, range(1, 10), online=False)
169222

nova/tests/unit/virt/libvirt/cpu/test_api.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def test_power_up_online(self, mock_online):
8383
self.flags(cpu_power_management=True, group='libvirt')
8484
self.flags(cpu_dedicated_set='1-2', group='compute')
8585

86-
self.api.power_up(self.fake_inst)
86+
self.api.power_up_for_instance(self.fake_inst)
8787
# only core #2 can be set as core #0 is not on the dedicated set
8888
# As a reminder, core(i).online calls set_online(i)
8989
mock_online.assert_called_once_with(2)
@@ -94,29 +94,29 @@ def test_power_up_governor(self, mock_set_governor):
9494
self.flags(cpu_power_management_strategy='governor', group='libvirt')
9595
self.flags(cpu_dedicated_set='1-2', group='compute')
9696

97-
self.api.power_up(self.fake_inst)
97+
self.api.power_up_for_instance(self.fake_inst)
9898
# only core #2 can be set as core #1 is not on the dedicated set
9999
# As a reminder, core(i).set_high_governor calls set_governor(i)
100100
mock_set_governor.assert_called_once_with(2, 'performance')
101101

102102
@mock.patch.object(core, 'set_online')
103103
def test_power_up_skipped(self, mock_online):
104104
self.flags(cpu_power_management=False, group='libvirt')
105-
self.api.power_up(self.fake_inst)
105+
self.api.power_up_for_instance(self.fake_inst)
106106
mock_online.assert_not_called()
107107

108108
@mock.patch.object(core, 'set_online')
109109
def test_power_up_skipped_if_standard_instance(self, mock_online):
110110
self.flags(cpu_power_management=True, group='libvirt')
111-
self.api.power_up(objects.Instance(numa_topology=None))
111+
self.api.power_up_for_instance(objects.Instance(numa_topology=None))
112112
mock_online.assert_not_called()
113113

114114
@mock.patch.object(core, 'set_offline')
115115
def test_power_down_offline(self, mock_offline):
116116
self.flags(cpu_power_management=True, group='libvirt')
117117
self.flags(cpu_dedicated_set='1-2', group='compute')
118118

119-
self.api.power_down(self.fake_inst)
119+
self.api.power_down_for_instance(self.fake_inst)
120120
# only core #2 can be set as core #1 is not on the dedicated set
121121
# As a reminder, core(i).online calls set_online(i)
122122
mock_offline.assert_called_once_with(2)
@@ -127,7 +127,7 @@ def test_power_down_governor_cpu0_ignored(self, mock_set_governor):
127127
self.flags(cpu_power_management_strategy='governor', group='libvirt')
128128
self.flags(cpu_dedicated_set='0-1', group='compute')
129129

130-
self.api.power_down(self.fake_inst)
130+
self.api.power_down_for_instance(self.fake_inst)
131131

132132
# Make sure that core #0 is ignored, since it is special and cannot
133133
# be powered down.
@@ -139,7 +139,7 @@ def test_power_down_governor(self, mock_set_governor):
139139
self.flags(cpu_power_management_strategy='governor', group='libvirt')
140140
self.flags(cpu_dedicated_set='1-2', group='compute')
141141

142-
self.api.power_down(self.fake_inst)
142+
self.api.power_down_for_instance(self.fake_inst)
143143

144144
# only core #2 can be set as core #0 is not on the dedicated set
145145
# As a reminder, core(i).set_high_governor calls set_governor(i)
@@ -148,13 +148,13 @@ def test_power_down_governor(self, mock_set_governor):
148148
@mock.patch.object(core, 'set_offline')
149149
def test_power_down_skipped(self, mock_offline):
150150
self.flags(cpu_power_management=False, group='libvirt')
151-
self.api.power_down(self.fake_inst)
151+
self.api.power_down_for_instance(self.fake_inst)
152152
mock_offline.assert_not_called()
153153

154154
@mock.patch.object(core, 'set_offline')
155155
def test_power_down_skipped_if_standard_instance(self, mock_offline):
156156
self.flags(cpu_power_management=True, group='libvirt')
157-
self.api.power_down(objects.Instance(numa_topology=None))
157+
self.api.power_down_for_instance(objects.Instance(numa_topology=None))
158158
mock_offline.assert_not_called()
159159

160160
@mock.patch.object(core, 'set_offline')

nova/virt/libvirt/cpu/api.py

Lines changed: 41 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -91,46 +91,70 @@ def core(self, i):
9191
"""
9292
return Core(i)
9393

94-
def power_up(self, instance: objects.Instance) -> None:
94+
def _power_up(self, cpus: ty.Set[int]) -> None:
9595
if not CONF.libvirt.cpu_power_management:
9696
return
97-
if instance.numa_topology is None:
98-
return
99-
10097
cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set()
101-
pcpus = instance.numa_topology.cpu_pinning.union(
102-
instance.numa_topology.cpuset_reserved)
10398
powered_up = set()
104-
for pcpu in pcpus:
105-
if pcpu in cpu_dedicated_set:
106-
pcpu = self.core(pcpu)
99+
for cpu in cpus:
100+
if cpu in cpu_dedicated_set:
101+
pcpu = self.core(cpu)
107102
if CONF.libvirt.cpu_power_management_strategy == 'cpu_state':
108103
pcpu.online = True
109104
else:
110105
pcpu.set_high_governor()
111106
powered_up.add(str(pcpu))
112107
LOG.debug("Cores powered up : %s", powered_up)
113108

114-
def power_down(self, instance: objects.Instance) -> None:
115-
if not CONF.libvirt.cpu_power_management:
116-
return
109+
def power_up_for_instance(self, instance: objects.Instance) -> None:
117110
if instance.numa_topology is None:
118111
return
119-
120-
cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set()
121112
pcpus = instance.numa_topology.cpu_pinning.union(
122113
instance.numa_topology.cpuset_reserved)
114+
self._power_up(pcpus)
115+
116+
def power_up_for_migration(
117+
self, dst_numa_info: objects.LibvirtLiveMigrateNUMAInfo
118+
) -> None:
119+
pcpus = set()
120+
if 'emulator_pins' in dst_numa_info and dst_numa_info.emulator_pins:
121+
pcpus = dst_numa_info.emulator_pins
122+
for pins in dst_numa_info.cpu_pins.values():
123+
pcpus = pcpus.union(pins)
124+
self._power_up(pcpus)
125+
126+
def _power_down(self, cpus: ty.Set[int]) -> None:
127+
if not CONF.libvirt.cpu_power_management:
128+
return
129+
cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set()
123130
powered_down = set()
124-
for pcpu in pcpus:
125-
if pcpu in cpu_dedicated_set:
126-
pcpu = self.core(pcpu)
131+
for cpu in cpus:
132+
if cpu in cpu_dedicated_set:
133+
pcpu = self.core(cpu)
127134
if CONF.libvirt.cpu_power_management_strategy == 'cpu_state':
128135
pcpu.online = False
129136
else:
130137
pcpu.set_low_governor()
131138
powered_down.add(str(pcpu))
132139
LOG.debug("Cores powered down : %s", powered_down)
133140

141+
def power_down_for_migration(
142+
self, dst_numa_info: objects.LibvirtLiveMigrateNUMAInfo
143+
) -> None:
144+
pcpus = set()
145+
if 'emulator_pins' in dst_numa_info and dst_numa_info.emulator_pins:
146+
pcpus = dst_numa_info.emulator_pins
147+
for pins in dst_numa_info.cpu_pins.values():
148+
pcpus = pcpus.union(pins)
149+
self._power_down(pcpus)
150+
151+
def power_down_for_instance(self, instance: objects.Instance) -> None:
152+
if instance.numa_topology is None:
153+
return
154+
pcpus = instance.numa_topology.cpu_pinning.union(
155+
instance.numa_topology.cpuset_reserved)
156+
self._power_down(pcpus)
157+
134158
def power_down_all_dedicated_cpus(self) -> None:
135159
if not CONF.libvirt.cpu_power_management:
136160
return

nova/virt/libvirt/driver.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1535,7 +1535,7 @@ def _wait_for_destroy(expected_domid):
15351535
if CONF.libvirt.virt_type == 'lxc':
15361536
self._teardown_container(instance)
15371537
# We're sure the instance is gone, we can shutdown the core if so
1538-
self.cpu_api.power_down(instance)
1538+
self.cpu_api.power_down_for_instance(instance)
15391539

15401540
def destroy(self, context, instance, network_info, block_device_info=None,
15411541
destroy_disks=True, destroy_secrets=True):
@@ -3189,7 +3189,7 @@ def _resume_guest_after_snapshot(
31893189

31903190
current_power_state = guest.get_power_state(self._host)
31913191

3192-
self.cpu_api.power_up(instance)
3192+
self.cpu_api.power_up_for_instance(instance)
31933193
# TODO(stephenfin): Any reason we couldn't use 'self.resume' here?
31943194
guest.launch(pause=current_power_state == power_state.PAUSED)
31953195

@@ -7670,7 +7670,7 @@ def _create_guest(
76707670
post_xml_callback()
76717671

76727672
if power_on or pause:
7673-
self.cpu_api.power_up(instance)
7673+
self.cpu_api.power_up_for_instance(instance)
76747674
guest.launch(pause=pause)
76757675

76767676
return guest
@@ -10757,6 +10757,16 @@ def rollback_live_migration_at_destination(self, context, instance,
1075710757
serial_console.release_port(
1075810758
host=migrate_data.serial_listen_addr, port=port)
1075910759

10760+
if (
10761+
'dst_numa_info' in migrate_data and
10762+
migrate_data.dst_numa_info
10763+
):
10764+
self.cpu_api.power_down_for_migration(
10765+
migrate_data.dst_numa_info)
10766+
else:
10767+
LOG.debug('No dst_numa_info in migrate_data, '
10768+
'no cores to power down in rollback.')
10769+
1076010770
if not is_shared_instance_path:
1076110771
instance_dir = libvirt_utils.get_instance_path_at_destination(
1076210772
instance, migrate_data)
@@ -10923,6 +10933,12 @@ def pre_live_migration(self, context, instance, block_device_info,
1092310933

1092410934
migrate_data.bdms.append(bdmi)
1092510935

10936+
if 'dst_numa_info' in migrate_data and migrate_data.dst_numa_info:
10937+
self.cpu_api.power_up_for_migration(migrate_data.dst_numa_info)
10938+
else:
10939+
LOG.debug('No dst_numa_info in migrate_data, '
10940+
'no cores to power up in pre_live_migration.')
10941+
1092610942
return migrate_data
1092710943

1092810944
def _try_fetch_image_cache(self, image, fetch_func, context, filename,
@@ -11086,6 +11102,7 @@ def post_live_migration_at_source(self, context, instance, network_info):
1108611102
:param network_info: instance network information
1108711103
"""
1108811104
self.unplug_vifs(instance, network_info)
11105+
self.cpu_api.power_down_for_instance(instance)
1108911106

1109011107
def _qemu_monitor_announce_self(self, instance):
1109111108
"""Send announce_self command to QEMU monitor.

0 commit comments

Comments
 (0)