Skip to content

Commit

Permalink
migration: Refactor error handling in source return path
Browse files Browse the repository at this point in the history
rp_state.error was a boolean used to show error happened in return path
thread.  That's not only duplicating error reporting (migrate_set_error),
but also not good enough in that we only do error_report() and set it to
true, we never can keep a history of the exact error and show it in
query-migrate.

To make this better, a few things done:

  - Use error_setg() rather than error_report() across the whole lifecycle
    of return path thread, keeping the error in an Error*.

  - With above, no need to have mark_source_rp_bad(), remove it, alongside
    with rp_state.error itself.

  - Use migrate_set_error() to apply that captured error to the global
    migration object when error occured in this thread.

  - Do the same when detected qemufile error in source return path

We need to re-export qemu_file_get_error_obj() to do the last one.

Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Reviewed-by: Juan Quintela <quintela@redhat.com>
Signed-off-by: Juan Quintela <quintela@redhat.com>
Message-ID: <20231017202633.296756-2-peterx@redhat.com>
  • Loading branch information
xzpeter authored and Juan Quintela committed Nov 2, 2023
1 parent e7b428d commit 7aa6070
Show file tree
Hide file tree
Showing 7 changed files with 80 additions and 95 deletions.
121 changes: 53 additions & 68 deletions migration/migration.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ static int migration_maybe_pause(MigrationState *s,
int *current_active_state,
int new_state);
static void migrate_fd_cancel(MigrationState *s);
static int close_return_path_on_source(MigrationState *s);
static bool close_return_path_on_source(MigrationState *s);

static void migration_downtime_start(MigrationState *s)
{
Expand Down Expand Up @@ -1475,7 +1475,6 @@ int migrate_init(MigrationState *s, Error **errp)
s->to_dst_file = NULL;
s->state = MIGRATION_STATUS_NONE;
s->rp_state.from_dst_file = NULL;
s->rp_state.error = false;
s->mbps = 0.0;
s->pages_per_second = 0.0;
s->downtime = 0;
Expand Down Expand Up @@ -1883,16 +1882,6 @@ void qmp_migrate_continue(MigrationStatus state, Error **errp)
qemu_sem_post(&s->pause_sem);
}

/* migration thread support */
/*
* Something bad happened to the RP stream, mark an error
* The caller shall print or trace something to indicate why
*/
static void mark_source_rp_bad(MigrationState *s)
{
s->rp_state.error = true;
}

void migration_rp_wait(MigrationState *s)
{
qemu_sem_wait(&s->rp_state.rp_sem);
Expand Down Expand Up @@ -1923,8 +1912,9 @@ static struct rp_cmd_args {
* We're allowed to send more than requested (e.g. to round to our page size)
* and we don't need to send pages that have already been sent.
*/
static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
ram_addr_t start, size_t len)
static void
migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
ram_addr_t start, size_t len, Error **errp)
{
long our_host_ps = qemu_real_host_page_size();

Expand All @@ -1936,37 +1926,36 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
*/
if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
!QEMU_IS_ALIGNED(len, our_host_ps)) {
error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
" len: %zd", __func__, start, len);
mark_source_rp_bad(ms);
error_setg(errp, "MIG_RP_MSG_REQ_PAGES: Misaligned page request, start:"
RAM_ADDR_FMT " len: %zd", start, len);
return;
}

if (ram_save_queue_pages(rbname, start, len)) {
mark_source_rp_bad(ms);
}
ram_save_queue_pages(rbname, start, len, errp);
}

static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name,
Error **errp)
{
RAMBlock *block = qemu_ram_block_by_name(block_name);

if (!block) {
error_report("%s: invalid block name '%s'", __func__, block_name);
error_setg(errp, "MIG_RP_MSG_RECV_BITMAP has invalid block name '%s'",
block_name);
return -EINVAL;
}

/* Fetch the received bitmap and refresh the dirty bitmap */
return ram_dirty_bitmap_reload(s, block);
return ram_dirty_bitmap_reload(s, block, errp);
}

static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
static int migrate_handle_rp_resume_ack(MigrationState *s,
uint32_t value, Error **errp)
{
trace_source_return_path_thread_resume_ack(value);

if (value != MIGRATION_RESUME_ACK_VALUE) {
error_report("%s: illegal resume_ack value %"PRIu32,
__func__, value);
error_setg(errp, "illegal resume_ack value %"PRIu32, value);
return -1;
}

Expand Down Expand Up @@ -2025,48 +2014,46 @@ static void *source_return_path_thread(void *opaque)
uint32_t tmp32, sibling_error;
ram_addr_t start = 0; /* =0 to silence warning */
size_t len = 0, expected_len;
Error *err = NULL;
int res;

trace_source_return_path_thread_entry();
rcu_register_thread();

while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
migration_is_setup_or_active(ms->state)) {
while (migration_is_setup_or_active(ms->state)) {
trace_source_return_path_thread_loop_top();

header_type = qemu_get_be16(rp);
header_len = qemu_get_be16(rp);

if (qemu_file_get_error(rp)) {
mark_source_rp_bad(ms);
qemu_file_get_error_obj(rp, &err);
goto out;
}

if (header_type >= MIG_RP_MSG_MAX ||
header_type == MIG_RP_MSG_INVALID) {
error_report("RP: Received invalid message 0x%04x length 0x%04x",
header_type, header_len);
mark_source_rp_bad(ms);
error_setg(&err, "Received invalid message 0x%04x length 0x%04x",
header_type, header_len);
goto out;
}

if ((rp_cmd_args[header_type].len != -1 &&
header_len != rp_cmd_args[header_type].len) ||
header_len > sizeof(buf)) {
error_report("RP: Received '%s' message (0x%04x) with"
"incorrect length %d expecting %zu",
rp_cmd_args[header_type].name, header_type, header_len,
(size_t)rp_cmd_args[header_type].len);
mark_source_rp_bad(ms);
error_setg(&err, "Received '%s' message (0x%04x) with"
"incorrect length %d expecting %zu",
rp_cmd_args[header_type].name, header_type, header_len,
(size_t)rp_cmd_args[header_type].len);
goto out;
}

/* We know we've got a valid header by this point */
res = qemu_get_buffer(rp, buf, header_len);
if (res != header_len) {
error_report("RP: Failed reading data for message 0x%04x"
" read %d expected %d",
header_type, res, header_len);
mark_source_rp_bad(ms);
error_setg(&err, "Failed reading data for message 0x%04x"
" read %d expected %d",
header_type, res, header_len);
goto out;
}

Expand All @@ -2076,8 +2063,7 @@ static void *source_return_path_thread(void *opaque)
sibling_error = ldl_be_p(buf);
trace_source_return_path_thread_shut(sibling_error);
if (sibling_error) {
error_report("RP: Sibling indicated error %d", sibling_error);
mark_source_rp_bad(ms);
error_setg(&err, "Sibling indicated error %d", sibling_error);
}
/*
* We'll let the main thread deal with closing the RP
Expand All @@ -2095,7 +2081,10 @@ static void *source_return_path_thread(void *opaque)
case MIG_RP_MSG_REQ_PAGES:
start = ldq_be_p(buf);
len = ldl_be_p(buf + 8);
migrate_handle_rp_req_pages(ms, NULL, start, len);
migrate_handle_rp_req_pages(ms, NULL, start, len, &err);
if (err) {
goto out;
}
break;

case MIG_RP_MSG_REQ_PAGES_ID:
Expand All @@ -2110,32 +2099,32 @@ static void *source_return_path_thread(void *opaque)
expected_len += tmp32;
}
if (header_len != expected_len) {
error_report("RP: Req_Page_id with length %d expecting %zd",
header_len, expected_len);
mark_source_rp_bad(ms);
error_setg(&err, "Req_Page_id with length %d expecting %zd",
header_len, expected_len);
goto out;
}
migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len,
&err);
if (err) {
goto out;
}
migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
break;

case MIG_RP_MSG_RECV_BITMAP:
if (header_len < 1) {
error_report("%s: missing block name", __func__);
mark_source_rp_bad(ms);
error_setg(&err, "MIG_RP_MSG_RECV_BITMAP missing block name");
goto out;
}
/* Format: len (1B) + idstr (<255B). This ends the idstr. */
buf[buf[0] + 1] = '\0';
if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
mark_source_rp_bad(ms);
if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1), &err)) {
goto out;
}
break;

case MIG_RP_MSG_RESUME_ACK:
tmp32 = ldl_be_p(buf);
if (migrate_handle_rp_resume_ack(ms, tmp32)) {
mark_source_rp_bad(ms);
if (migrate_handle_rp_resume_ack(ms, tmp32, &err)) {
goto out;
}
break;
Expand All @@ -2151,13 +2140,15 @@ static void *source_return_path_thread(void *opaque)
}

out:
if (qemu_file_get_error(rp)) {
if (err) {
migrate_set_error(ms, err);
error_free(err);
trace_source_return_path_thread_bad_end();
mark_source_rp_bad(ms);
}

trace_source_return_path_thread_end();
rcu_unregister_thread();

return NULL;
}

Expand All @@ -2179,12 +2170,11 @@ static int open_return_path_on_source(MigrationState *ms)
return 0;
}

static int close_return_path_on_source(MigrationState *ms)
/* Return true if error detected, or false otherwise */
static bool close_return_path_on_source(MigrationState *ms)
{
int ret;

if (!ms->rp_state.rp_thread_created) {
return 0;
return false;
}

trace_migration_return_path_end_before();
Expand All @@ -2202,18 +2192,13 @@ static int close_return_path_on_source(MigrationState *ms)
}
}

trace_await_return_path_close_on_source_joining();
qemu_thread_join(&ms->rp_state.rp_thread);
ms->rp_state.rp_thread_created = false;
trace_await_return_path_close_on_source_close();

ret = ms->rp_state.error;
ms->rp_state.error = false;

migration_release_dst_files(ms);
trace_migration_return_path_end_after();

trace_migration_return_path_end_after(ret);
return ret;
/* Return path will persist the error in MigrationState when quit */
return migrate_has_error(ms);
}

static inline void
Expand Down
1 change: 0 additions & 1 deletion migration/migration.h
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,6 @@ struct MigrationState {
/* Protected by qemu_file_lock */
QEMUFile *from_dst_file;
QemuThread rp_thread;
bool error;
/*
* We can also check non-zero of rp_thread, but there's no "official"
* way to do this, so this bool makes it slightly more elegant.
Expand Down
2 changes: 1 addition & 1 deletion migration/qemu-file.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ QEMUFile *qemu_file_new_input(QIOChannel *ioc)
*
* If errp is specified, a verbose error message will be copied over.
*/
static int qemu_file_get_error_obj(QEMUFile *f, Error **errp)
int qemu_file_get_error_obj(QEMUFile *f, Error **errp)
{
if (!f->last_error) {
return 0;
Expand Down
1 change: 1 addition & 0 deletions migration/qemu-file.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ int coroutine_mixed_fn qemu_peek_byte(QEMUFile *f, int offset);
void qemu_file_skip(QEMUFile *f, int size);
int qemu_file_get_error_obj_any(QEMUFile *f1, QEMUFile *f2, Error **errp);
void qemu_file_set_error_obj(QEMUFile *f, int ret, Error *err);
int qemu_file_get_error_obj(QEMUFile *f, Error **errp);
void qemu_file_set_error(QEMUFile *f, int ret);
int qemu_file_shutdown(QEMUFile *f);
QEMUFile *qemu_file_get_return_path(QEMUFile *f);
Expand Down

0 comments on commit 7aa6070

Please sign in to comment.