savevm: New save live migration method: pending
The code currently does this (simplified for clarity):

    if (qemu_savevm_state_iterate(s->file) == 1) {
        vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
        qemu_savevm_state_complete(s->file);
    }

The problem is that qemu_savevm_state_iterate() returns 1 when it
estimates that the remaining memory can be sent in less than the
maximum downtime.

But this means that we could end up spending up to 2x max_downtime:
one downtime's worth in qemu_savevm_state_iterate(), and another in
qemu_savevm_state_complete() (e.g. a 30 ms limit could become 60 ms
of actual downtime).

Changed code to:

    pending_size = qemu_savevm_state_pending(s->file, max_size);
    DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
    if (pending_size >= max_size) {
        ret = qemu_savevm_state_iterate(s->file);
    } else {
        vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
        qemu_savevm_state_complete(s->file);
    }

So what we do is: at the current network speed, we calculate the
maximum number of bytes we can send within the allowed downtime:
max_size.

Then we ask every save_live section how much it has pending.  If the
total is less than max_size, we move to the complete phase; otherwise
we do another iteration.

This makes things much simpler, because now individual sections don't
have to calculate the bandwidth (it was impossible to do right from
there).
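
As an illustration of the new contract, here is a stand-alone toy
model of the loop (not QEMU code; the helper names and numbers are
invented for the example):

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for qemu_savevm_state_pending()/_iterate(): a fake RAM
       section with 512 MB left to send that dirties 4 MB per round. */
    static uint64_t dirty_bytes = 512u << 20;

    static uint64_t state_pending(void)
    {
        return dirty_bytes;
    }

    static void state_iterate(uint64_t sent)
    {
        dirty_bytes -= sent < dirty_bytes ? sent : dirty_bytes;
        dirty_bytes += 4u << 20;            /* guest keeps dirtying memory */
    }

    int main(void)
    {
        uint64_t bandwidth = 1u << 20;      /* measured: bytes per ms */
        uint64_t max_downtime_ms = 30;
        /* max bytes that fit in the downtime budget at current speed */
        uint64_t max_size = bandwidth * max_downtime_ms;

        while (state_pending() >= max_size) {
            state_iterate(bandwidth * 100); /* one 100 ms round */
        }
        /* pending < max_size: stop the guest and run the complete phase */
        printf("completing with %llu bytes pending (max %llu)\n",
               (unsigned long long)state_pending(),
               (unsigned long long)max_size);
        return 0;
    }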

Signed-off-by: Juan Quintela <quintela@redhat.com>

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Juan Quintela committed Dec 20, 2012
1 parent f50b498 commit e4ed154
Showing 8 changed files with 83 additions and 84 deletions.

arch_init.c: 18 additions & 30 deletions

@@ -582,22 +582,17 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 
 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
-    uint64_t bytes_transferred_last;
-    double bwidth = 0;
     int ret;
     int i;
-    uint64_t expected_downtime;
-    MigrationState *s = migrate_get_current();
+    int64_t t0;
 
     qemu_mutex_lock_ramlist();
 
     if (ram_list.version != last_version) {
         reset_ram_globals();
     }
 
-    bytes_transferred_last = bytes_transferred;
-    bwidth = qemu_get_clock_ns(rt_clock);
-
+    t0 = qemu_get_clock_ns(rt_clock);
     i = 0;
     while ((ret = qemu_file_rate_limit(f)) == 0) {
         int bytes_sent;
@@ -615,7 +610,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
            iterations
         */
         if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - bwidth) / 1000000;
+            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
             if (t1 > MAX_WAIT) {
                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
@@ -629,31 +624,10 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         return ret;
     }
 
-    bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
-    bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
-
-    /* if we haven't transferred anything this round, force
-     * expected_downtime to a very high value, but without
-     * crashing */
-    if (bwidth == 0) {
-        bwidth = 0.000001;
-    }
-
     qemu_mutex_unlock_ramlist();
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
 
-    expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-    DPRINTF("ram_save_live: expected(%" PRIu64 ") <= max(%" PRIu64 ")?\n",
-            expected_downtime, migrate_max_downtime());
-
-    if (expected_downtime <= migrate_max_downtime()) {
-        migration_bitmap_sync();
-        expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-        s->expected_downtime = expected_downtime / 1000000; /* ns -> ms */
-
-        return expected_downtime <= migrate_max_downtime();
-    }
-    return 0;
+    return i;
 }
 
 static int ram_save_complete(QEMUFile *f, void *opaque)
@@ -683,6 +657,19 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
     return 0;
 }
 
+static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    uint64_t remaining_size;
+
+    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+
+    if (remaining_size < max_size) {
+        migration_bitmap_sync();
+        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+    }
+    return remaining_size;
+}
+
 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
 {
     int ret, rc = 0;
@@ -869,6 +856,7 @@ SaveVMHandlers savevm_ram_handlers = {
     .save_live_setup = ram_save_setup,
     .save_live_iterate = ram_save_iterate,
     .save_live_complete = ram_save_complete,
+    .save_live_pending = ram_save_pending,
     .load_state = ram_load,
     .cancel = ram_migration_cancel,
 };

block-migration.c: 10 additions & 39 deletions

@@ -77,9 +77,7 @@ typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
-    long double total_time;
     long double prev_time_offset;
-    int reads;
 } BlkMigState;
 
 static BlkMigState block_mig_state;
@@ -132,12 +130,6 @@ uint64_t blk_mig_bytes_total(void)
     return sum << BDRV_SECTOR_BITS;
 }
 
-static inline long double compute_read_bwidth(void)
-{
-    assert(block_mig_state.total_time != 0);
-    return (block_mig_state.reads / block_mig_state.total_time) * BLOCK_SIZE;
-}
-
 static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
 {
     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
@@ -191,8 +183,6 @@ static void blk_mig_read_cb(void *opaque, int ret)
 
     blk->ret = ret;
 
-    block_mig_state.reads++;
-    block_mig_state.total_time += (curr_time - block_mig_state.prev_time_offset);
     block_mig_state.prev_time_offset = curr_time;
 
     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
@@ -310,8 +300,6 @@ static void init_blk_migration(QEMUFile *f)
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
     block_mig_state.bulk_completed = 0;
-    block_mig_state.total_time = 0;
-    block_mig_state.reads = 0;
 
     bdrv_iterate(init_blk_migration_it, NULL);
 }
@@ -493,32 +481,6 @@ static int64_t get_remaining_dirty(void)
     return dirty * BLOCK_SIZE;
 }
 
-static int is_stage2_completed(void)
-{
-    int64_t remaining_dirty;
-    long double bwidth;
-
-    if (block_mig_state.bulk_completed == 1) {
-
-        remaining_dirty = get_remaining_dirty();
-        if (remaining_dirty == 0) {
-            return 1;
-        }
-
-        bwidth = compute_read_bwidth();
-
-        if ((remaining_dirty / bwidth) <=
-            migrate_max_downtime()) {
-            /* finish stage2 because we think that we can finish remaining work
-               below max_downtime */
-
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
 static void blk_mig_cleanup(void)
 {
     BlkMigDevState *bmds;
@@ -619,7 +581,7 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
 
     qemu_put_be64(f, BLK_MIG_FLAG_EOS);
 
-    return is_stage2_completed();
+    return 0;
 }
 
 static int block_save_complete(QEMUFile *f, void *opaque)
@@ -659,6 +621,14 @@ static int block_save_complete(QEMUFile *f, void *opaque)
     return 0;
 }
 
+static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+
+    DPRINTF("Enter save live pending %ld\n", get_remaining_dirty());
+
+    return get_remaining_dirty();
+}
+
 static int block_load(QEMUFile *f, void *opaque, int version_id)
 {
     static int banner_printed;
@@ -755,6 +725,7 @@ SaveVMHandlers savevm_block_handlers = {
     .save_live_setup = block_save_setup,
     .save_live_iterate = block_save_iterate,
     .save_live_complete = block_save_complete,
+    .save_live_pending = block_save_pending,
     .load_state = block_load,
     .cancel = block_migration_cancel,
     .is_active = block_is_active,

buffered_file.c: 18 additions & 7 deletions

@@ -181,27 +181,38 @@ static int64_t buffered_get_rate_limit(void *opaque)
     return s->xfer_limit;
 }
 
-/* 10ms xfer_limit is the limit that we should write each 10ms */
+/* 100ms xfer_limit is the limit that we should write each 100ms */
 #define BUFFER_DELAY 100
 
 static void *buffered_file_thread(void *opaque)
 {
     QEMUFileBuffered *s = opaque;
-    int64_t expire_time = qemu_get_clock_ms(rt_clock) + BUFFER_DELAY;
+    int64_t initial_time = qemu_get_clock_ms(rt_clock);
+    int64_t max_size = 0;
+    bool last_round = false;
 
     while (true) {
         int64_t current_time = qemu_get_clock_ms(rt_clock);
 
         if (s->migration_state->complete) {
             break;
         }
-        if (current_time >= expire_time) {
+        if (current_time >= initial_time + BUFFER_DELAY) {
+            uint64_t transferred_bytes = s->bytes_xfer;
+            uint64_t time_spent = current_time - initial_time;
+            double bandwidth = transferred_bytes / time_spent;
+            max_size = bandwidth * migrate_max_downtime() / 1000000;
+
+            DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
+                    " bandwidth %g max_size %" PRId64 "\n",
+                    transferred_bytes, time_spent, bandwidth, max_size);
+
             s->bytes_xfer = 0;
-            expire_time = current_time + BUFFER_DELAY;
+            initial_time = current_time;
         }
-        if (s->bytes_xfer >= s->xfer_limit) {
+        if (!last_round && (s->bytes_xfer >= s->xfer_limit)) {
             /* usleep expects microseconds */
-            g_usleep((expire_time - current_time)*1000);
+            g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
         }
         if (buffered_flush(s) < 0) {
             break;
@@ -210,7 +221,7 @@ static void *buffered_file_thread(void *opaque)
         DPRINTF("file is ready\n");
        if (s->bytes_xfer < s->xfer_limit) {
             DPRINTF("notifying client\n");
-            migrate_fd_put_ready(s->migration_state);
+            last_round = migrate_fd_put_ready(s->migration_state, max_size);
         }
     }
 
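
A note on units in the hunk above: time_spent is in milliseconds, so
bandwidth comes out in bytes/ms, while migrate_max_downtime() returns
nanoseconds; the division by 1000000 converts it to milliseconds, so
max_size comes out in bytes. A stand-alone check with made-up numbers
(illustration only, not part of this commit):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t transferred_bytes = 12u * 1000 * 1000; /* 12 MB flushed... */
        uint64_t time_spent = 100;                /* ...in a 100 ms window */
        double bandwidth = transferred_bytes / (double)time_spent; /* B/ms */
        uint64_t downtime_ns = 30u * 1000 * 1000; /* 30 ms budget, in ns */
        int64_t max_size = bandwidth * downtime_ns / 1000000;      /* bytes */

        /* 120000 bytes/ms * 30 ms = 3600000 bytes may remain pending */
        printf("max_size = %lld bytes\n", (long long)max_size);
        return 0;
    }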

include/migration/migration.h: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ void migrate_fd_connect(MigrationState *s);
 
 ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
                               size_t size);
-void migrate_fd_put_ready(MigrationState *s);
+bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size);
 int migrate_fd_close(MigrationState *s);
 
 void add_migration_state_change_notifier(Notifier *notify);

include/migration/vmstate.h: 1 addition & 0 deletions

@@ -35,6 +35,7 @@ typedef struct SaveVMHandlers {
     int (*save_live_setup)(QEMUFile *f, void *opaque);
     int (*save_live_iterate)(QEMUFile *f, void *opaque);
     int (*save_live_complete)(QEMUFile *f, void *opaque);
+    uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
     void (*cancel)(void *opaque);
     LoadStateHandler *load_state;
     bool (*is_active)(void *opaque);
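
To show what a section implementing the new hook might look like, a
minimal self-contained model (FooState and every name here are
hypothetical, invented for illustration; real handlers register
through SaveVMHandlers as above):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct QEMUFile QEMUFile;       /* opaque in this model */

    /* Hypothetical section state: dirty pages of a fixed page size */
    typedef struct {
        uint64_t dirty_pages;
        uint64_t page_size;
    } FooState;

    /* Same signature as the new save_live_pending hook */
    static uint64_t foo_save_pending(QEMUFile *f, void *opaque,
                                     uint64_t max_size)
    {
        FooState *s = opaque;
        (void)f;
        (void)max_size;             /* a simple section may ignore it */
        return s->dirty_pages * s->page_size;
    }

    int main(void)
    {
        FooState ram = { .dirty_pages = 1000, .page_size = 4096 };
        FooState blk = { .dirty_pages = 8, .page_size = 1u << 20 };

        /* savevm-style summation across all registered sections */
        uint64_t pending = foo_save_pending(NULL, &ram, 0) +
                           foo_save_pending(NULL, &blk, 0);
        printf("total pending: %llu bytes\n", (unsigned long long)pending);
        return 0;
    }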

include/sysemu/sysemu.h: 1 addition & 0 deletions

@@ -78,6 +78,7 @@ int qemu_savevm_state_begin(QEMUFile *f,
 int qemu_savevm_state_iterate(QEMUFile *f);
 int qemu_savevm_state_complete(QEMUFile *f);
 void qemu_savevm_state_cancel(QEMUFile *f);
+uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size);
 int qemu_loadvm_state(QEMUFile *f);
 
 /* SLIRP */

migration.c: 15 additions & 7 deletions

@@ -316,15 +316,17 @@ ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
     return ret;
 }
 
-void migrate_fd_put_ready(MigrationState *s)
+bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
 {
     int ret;
+    uint64_t pending_size;
+    bool last_round = false;
 
     qemu_mutex_lock_iothread();
     if (s->state != MIG_STATE_ACTIVE) {
         DPRINTF("put_ready returning because of non-active state\n");
         qemu_mutex_unlock_iothread();
-        return;
+        return false;
     }
     if (s->first_time) {
         s->first_time = false;
@@ -334,15 +336,19 @@ void migrate_fd_put_ready(MigrationState *s)
             DPRINTF("failed, %d\n", ret);
             migrate_fd_error(s);
             qemu_mutex_unlock_iothread();
-            return;
+            return false;
         }
     }
 
     DPRINTF("iterate\n");
-    ret = qemu_savevm_state_iterate(s->file);
-    if (ret < 0) {
-        migrate_fd_error(s);
-    } else if (ret == 1) {
+    pending_size = qemu_savevm_state_pending(s->file, max_size);
+    DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
+    if (pending_size >= max_size) {
+        ret = qemu_savevm_state_iterate(s->file);
+        if (ret < 0) {
+            migrate_fd_error(s);
+        }
+    } else {
         int old_vm_running = runstate_is_running();
         int64_t start_time, end_time;
 
@@ -368,9 +374,11 @@ void migrate_fd_put_ready(MigrationState *s)
                 vm_start();
             }
         }
+        last_round = true;
     }
     qemu_mutex_unlock_iothread();
 
+    return last_round;
 }
 
 static void migrate_fd_cancel(MigrationState *s)

savevm.c: 19 additions & 0 deletions

@@ -1753,6 +1753,25 @@ int qemu_savevm_state_complete(QEMUFile *f)
     return qemu_file_get_error(f);
 }
 
+uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
+{
+    SaveStateEntry *se;
+    uint64_t ret = 0;
+
+    QTAILQ_FOREACH(se, &savevm_handlers, entry) {
+        if (!se->ops || !se->ops->save_live_pending) {
+            continue;
+        }
+        if (se->ops && se->ops->is_active) {
+            if (!se->ops->is_active(se->opaque)) {
+                continue;
+            }
+        }
+        ret += se->ops->save_live_pending(f, se->opaque, max_size);
+    }
+    return ret;
+}
+
 void qemu_savevm_state_cancel(QEMUFile *f)
 {
     SaveStateEntry *se;
