coroutine: resize pool periodically instead of limiting size
It was reported that enabling SafeStack reduces IOPS significantly
(>25%) with the following fio benchmark on virtio-blk using an NVMe host
block device:

  # fio --rw=randrw --bs=4k --iodepth=64 --runtime=1m --direct=1 \
        --filename=/dev/vdb --name=job1 --ioengine=libaio --thread \
        --group_reporting --numjobs=16 --time_based \
        --output=/tmp/fio_result

Serge Guelton and I found that SafeStack is not really at fault; it
just increases the cost of coroutine creation. This fio workload
exhausts the coroutine pool, so coroutine creation becomes a
bottleneck. Previous work by Honghao Wang also pointed to excessive
coroutine creation.

Creating new coroutines is expensive due to allocating new stacks with
mmap(2) and mprotect(2). Currently there are thread-local and global
pools that recycle old Coroutine objects and their stacks but the
hardcoded size limit of 64 for thread-local pools and 128 for the global
pool is insufficient for the fio benchmark shown above.
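
To illustrate the cost, allocating a stack for a new coroutine takes
two system calls, roughly along these lines (a minimal sketch assuming
a 4 KiB page size, not QEMU's exact allocator):

    #include <stddef.h>
    #include <sys/mman.h>

    static void *alloc_coroutine_stack(size_t size)
    {
        size_t guard = 4096; /* one guard page catches stack overflows */
        char *ptr = mmap(NULL, size + guard, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (ptr == MAP_FAILED) {
            return NULL;
        }
        mprotect(ptr, guard, PROT_NONE); /* second syscall per coroutine */
        return ptr + guard;
    }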

This patch changes the coroutine pool algorithm to a simple thread-local
pool without a maximum size limit. Threads periodically shrink the pool
down to a size sufficient for the maximum observed number of coroutines.
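
In sketch form, the periodic resize amounts to the following
(simplified from the util/qemu-coroutine.c hunk below; the identifiers
match the patch, while free_one_pooled_coroutine() is a stand-in for
the actual QSLIST operations):

    /* Example: if this thread peaked at 10,000 simultaneous coroutines
     * during the last slice and 2,000 are still in flight, up to 8,000
     * free coroutines are kept.
     */
    unsigned target = MAX(ALLOC_POOL_MIN, max_coroutines_this_slice) -
                      num_coroutines;
    max_coroutines_this_slice = num_coroutines; /* start a new slice */

    while (alloc_pool_size > target) {
        free_one_pooled_coroutine();
    }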

The global pool is removed by this patch. It can help to hide the fact
that local pools are easily exhausted, but it doesn't fix the root
cause. I don't think there is a need for a global pool because QEMU's
threads are long-lived, so let's keep things simple.

Performance of the above fio benchmark is as follows:

      Before   After
IOPS     60k     97k

Memory usage varies over time as needed by the workload:

            VSZ (KB)             RSS (KB)
Before fio  4705248              843128
During fio  5747668 (+ ~1 GB)    849280
After fio   4694996 (- ~1 GB)    845184

This confirms that coroutines are indeed being freed when no longer
needed.

Thanks to Serge Guelton for working on identifying the bottleneck with
me!

Reported-by: Tingting Mao <timao@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20210913153524.1190696-1-stefanha@redhat.com
Cc: Serge Guelton <sguelton@redhat.com>
Cc: Honghao Wang <wanghonghao@bytedance.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Daniele Buono <dbuono@linux.vnet.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>

[Moved atexit notifier to coroutine_delete() after GitLab CI reported a
memory leak in tests/unit/test-aio-multithread because the Coroutine
object was created in the main thread but runs in an IOThread (where
it's also deleted).
--Stefan]
stefanhaRH committed Oct 21, 2021
1 parent afc9fcd commit 4b2b3d2
Showing 7 changed files with 125 additions and 29 deletions.
36 changes: 36 additions & 0 deletions include/qemu/coroutine-pool-timer.h
@@ -0,0 +1,36 @@
/*
 * QEMU coroutine pool timer
 *
 * Copyright (c) 2021 Red Hat, Inc.
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */
#ifndef COROUTINE_POOL_TIMER_H
#define COROUTINE_POOL_TIMER_H

#include "qemu/osdep.h"
#include "block/aio.h"

/**
 * A timer that periodically resizes this thread's coroutine pool, freeing
 * memory if there are too many unused coroutines.
 *
 * Threads that make heavy use of coroutines should use this. Failure to resize
 * the coroutine pool can lead to large amounts of memory sitting idle and
 * never being used after the first time.
 */
typedef struct {
    QEMUTimer *timer;
} CoroutinePoolTimer;

/* Call this before the thread runs the AioContext */
void coroutine_pool_timer_init(CoroutinePoolTimer *pt, AioContext *ctx);

/* Call this before the AioContext from the init function is destroyed */
void coroutine_pool_timer_cleanup(CoroutinePoolTimer *pt);

#endif /* COROUTINE_POOL_TIMER_H */
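
A minimal usage sketch for this API (example_thread_fn and
thread_is_running are hypothetical; the iothread.c and util/main-loop.c
hunks below are the real in-tree users):

    #include "qemu/coroutine-pool-timer.h"

    static void *example_thread_fn(void *opaque)
    {
        AioContext *ctx = opaque;
        CoroutinePoolTimer pool_timer;

        /* Arm the timer before the thread starts running the context */
        coroutine_pool_timer_init(&pool_timer, ctx);

        while (thread_is_running()) { /* hypothetical loop condition */
            aio_poll(ctx, true);
        }

        /* Tear down before the AioContext is destroyed */
        coroutine_pool_timer_cleanup(&pool_timer);
        return NULL;
    }
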
7 changes: 7 additions & 0 deletions include/qemu/coroutine.h
@@ -122,6 +122,13 @@ bool qemu_in_coroutine(void);
 */
bool qemu_coroutine_entered(Coroutine *co);

/**
 * Optionally call this function periodically to shrink the thread-local pool
 * down. Spiky workloads can create many coroutines and then never reach that
 * level again. Shrinking the pool reclaims memory in this case.
 */
void qemu_coroutine_pool_periodic_resize(void);

/**
 * Provides a mutex that can be used to synchronise coroutines
 */
6 changes: 6 additions & 0 deletions iothread.c
@@ -23,6 +23,7 @@
#include "qemu/error-report.h"
#include "qemu/rcu.h"
#include "qemu/main-loop.h"
#include "qemu/coroutine-pool-timer.h"

typedef ObjectClass IOThreadClass;

@@ -42,6 +43,7 @@ DECLARE_CLASS_CHECKERS(IOThreadClass, IOTHREAD,
static void *iothread_run(void *opaque)
{
    IOThread *iothread = opaque;
    CoroutinePoolTimer co_pool_timer;

    rcu_register_thread();
    /*
@@ -53,6 +55,8 @@ static void *iothread_run(void *opaque)
    iothread->thread_id = qemu_get_thread_id();
    qemu_sem_post(&iothread->init_done_sem);

    coroutine_pool_timer_init(&co_pool_timer, iothread->ctx);

    while (iothread->running) {
        /*
         * Note: from functional-wise the g_main_loop_run() below can
@@ -74,6 +78,8 @@ }
        }
    }

    coroutine_pool_timer_cleanup(&co_pool_timer);

    g_main_context_pop_thread_default(iothread->worker_context);
    rcu_unregister_thread();
    return NULL;
35 changes: 35 additions & 0 deletions util/coroutine-pool-timer.c
@@ -0,0 +1,35 @@
/*
 * QEMU coroutine pool timer
 *
 * Copyright (c) 2021 Red Hat, Inc.
 *
 * SPDX-License-Identifier: LGPL-2.1-or-later
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */
#include "qemu/coroutine-pool-timer.h"

static void coroutine_pool_timer_cb(void *opaque)
{
    CoroutinePoolTimer *pt = opaque;
    int64_t expiry_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
                             15 * NANOSECONDS_PER_SECOND;

    qemu_coroutine_pool_periodic_resize();
    timer_mod(pt->timer, expiry_time_ns);
}

void coroutine_pool_timer_init(CoroutinePoolTimer *pt, AioContext *ctx)
{
    pt->timer = aio_timer_new(ctx, QEMU_CLOCK_REALTIME, SCALE_NS,
                              coroutine_pool_timer_cb, pt);
    coroutine_pool_timer_cb(pt);
}

void coroutine_pool_timer_cleanup(CoroutinePoolTimer *pt)
{
    timer_free(pt->timer);
    pt->timer = NULL;
}
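
A note on the design above: the callback re-arms the timer each time it
runs, so the pool is resized every 15 seconds for the lifetime of the
thread, and coroutine_pool_timer_init() invokes the callback once by
hand to perform an initial resize and arm the first expiry.
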
5 changes: 5 additions & 0 deletions util/main-loop.c
@@ -33,6 +33,7 @@
#include "qemu/error-report.h"
#include "qemu/queue.h"
#include "qemu/compiler.h"
#include "qemu/coroutine-pool-timer.h"

#ifndef _WIN32
#include <sys/wait.h>
@@ -131,6 +132,7 @@ static int qemu_signal_init(Error **errp)

static AioContext *qemu_aio_context;
static QEMUBH *qemu_notify_bh;
static CoroutinePoolTimer main_loop_co_pool_timer;

static void notify_event_cb(void *opaque)
{
@@ -181,6 +183,9 @@ int qemu_init_main_loop(Error **errp)
    g_source_set_name(src, "io-handler");
    g_source_attach(src, NULL);
    g_source_unref(src);

    coroutine_pool_timer_init(&main_loop_co_pool_timer, qemu_aio_context);

    return 0;
}

1 change: 1 addition & 0 deletions util/meson.build
@@ -65,6 +65,7 @@ if have_block
  util_ss.add(files('buffer.c'))
  util_ss.add(files('bufferiszero.c'))
  util_ss.add(files('coroutine-@0@.c'.format(config_host['CONFIG_COROUTINE_BACKEND'])))
  util_ss.add(files('coroutine-pool-timer.c'))
  util_ss.add(files('hbitmap.c'))
  util_ss.add(files('hexdump.c'))
  util_ss.add(files('iova-tree.c'))
64 changes: 35 additions & 29 deletions util/qemu-coroutine.c
@@ -21,14 +21,19 @@
#include "block/aio.h"

enum {
    POOL_BATCH_SIZE = 64,
    /*
     * qemu_coroutine_pool_periodic_resize() keeps at least this many
     * coroutines around.
     */
    ALLOC_POOL_MIN = 64,
};


/** Free list to speed up creation */
static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
static unsigned int release_pool_size;
static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
static __thread unsigned int alloc_pool_size;
static __thread unsigned int num_coroutines;
static __thread unsigned int max_coroutines_this_slice;
static __thread Notifier coroutine_pool_cleanup_notifier;

static void coroutine_pool_cleanup(Notifier *n, void *value)
@@ -48,27 +53,15 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)

    if (CONFIG_COROUTINE_POOL) {
        co = QSLIST_FIRST(&alloc_pool);
        if (!co) {
            if (release_pool_size > POOL_BATCH_SIZE) {
                /* Slow path; a good place to register the destructor, too. */
                if (!coroutine_pool_cleanup_notifier.notify) {
                    coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
                    qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
                }

                /* This is not exact; there could be a little skew between
                 * release_pool_size and the actual size of release_pool. But
                 * it is just a heuristic, it does not need to be perfect.
                 */
                alloc_pool_size = qatomic_xchg(&release_pool_size, 0);
                QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
                co = QSLIST_FIRST(&alloc_pool);
            }
        }
        if (co) {
            QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
            alloc_pool_size--;
        }

        num_coroutines++;
        if (num_coroutines > max_coroutines_this_slice) {
            max_coroutines_this_slice = num_coroutines;
        }
    }

    if (!co) {
Expand All @@ -86,21 +79,34 @@ static void coroutine_delete(Coroutine *co)
    co->caller = NULL;

    if (CONFIG_COROUTINE_POOL) {
        if (release_pool_size < POOL_BATCH_SIZE * 2) {
            QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
            qatomic_inc(&release_pool_size);
            return;
        }
        if (alloc_pool_size < POOL_BATCH_SIZE) {
            QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
            alloc_pool_size++;
            return;
        if (!coroutine_pool_cleanup_notifier.notify) {
            coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
            qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
        }

        num_coroutines--;
        QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
        alloc_pool_size++;
        return;
    }

    qemu_coroutine_delete(co);
}

void qemu_coroutine_pool_periodic_resize(void)
{
    unsigned pool_size_target =
        MAX(ALLOC_POOL_MIN, max_coroutines_this_slice) - num_coroutines;
    max_coroutines_this_slice = num_coroutines;

    while (alloc_pool_size > pool_size_target) {
        Coroutine *co = QSLIST_FIRST(&alloc_pool);
        QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
        qemu_coroutine_delete(co);
        alloc_pool_size--;
    }
}

void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
{
    QSIMPLEQ_HEAD(, Coroutine) pending = QSIMPLEQ_HEAD_INITIALIZER(pending);