oslib-posix: initialize backend memory objects in parallel
QEMU initializes preallocated backend memory as the objects are parsed from
the command line. This is not optimal in some cases (e.g. memory spanning
multiple NUMA nodes) because the memory objects are initialized in series.

Allow the initialization to occur in parallel (asynchronously). In order to
ensure optimal thread placement, asynchronous initialization requires prealloc
context threads to be in use.

Signed-off-by: Mark Kanda <mark.kanda@oracle.com>
Message-ID: <20240131165327.3154970-2-mark.kanda@oracle.com>
Tested-by: Mario Casquero <mcasquer@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
mhkanda authored and davidhildenbrand committed Feb 6, 2024
1 parent 540a1ab commit 04accf4
Showing 7 changed files with 145 additions and 37 deletions.
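
The new call pattern in brief: a caller may pass async=true to qemu_prealloc_mem() while backends are being created, then drain all queued work with a single qemu_finish_async_prealloc_mem() call. Below is a minimal sketch of a hypothetical in-tree caller; only the two function signatures come from this commit, while the BackendInfo struct, the loop and the field names are invented for illustration.

#include "qemu/osdep.h"
#include "qapi/error.h"

/* Illustrative only; not a QEMU type. */
typedef struct {
    int fd;
    char *ptr;
    size_t size;
    int max_threads;
} BackendInfo;

static void prealloc_backends_sketch(BackendInfo *b, int n, ThreadContext *tc)
{
    for (int i = 0; i < n; i++) {
        /*
         * async=true: worker threads are created but parked, and the call
         * returns without waiting. This requires MADV_POPULATE_WRITE support
         * and a prealloc ThreadContext; otherwise the call falls back to the
         * old synchronous behaviour.
         */
        if (!qemu_prealloc_mem(b[i].fd, b[i].ptr, b[i].size, b[i].max_threads,
                               tc, true, &error_fatal)) {
            return;
        }
    }
    /* Release and join all queued preallocation threads in one go. */
    qemu_finish_async_prealloc_mem(&error_fatal);
}
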
7 changes: 5 additions & 2 deletions backends/hostmem.c
@@ -20,6 +20,7 @@
#include "qom/object_interfaces.h"
#include "qemu/mmap-alloc.h"
#include "qemu/madvise.h"
#include "hw/qdev-core.h"

#ifdef CONFIG_NUMA
#include <numaif.h>
@@ -237,7 +238,7 @@ static void host_memory_backend_set_prealloc(Object *obj, bool value,
uint64_t sz = memory_region_size(&backend->mr);

if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
backend->prealloc_context, errp)) {
backend->prealloc_context, false, errp)) {
return;
}
backend->prealloc = true;
@@ -323,6 +324,7 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
void *ptr;
uint64_t sz;
bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);

if (!bc->alloc) {
return;
@@ -402,7 +404,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
ptr, sz,
backend->prealloc_threads,
backend->prealloc_context, errp)) {
backend->prealloc_context,
async, errp)) {
return;
}
}
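
Taken together with the util/oslib-posix.c changes further down, the conditions under which preallocation actually runs asynchronously can be summarized as in the sketch below. The helper name is invented; each individual check appears in this commit.

/* Sketch only; consolidates checks from hostmem.c and util/oslib-posix.c. */
static bool prealloc_will_be_async(ThreadContext *tc, bool use_madv_populate_write)
{
    /*
     * Only backends completed before the late-backends phase is reached,
     * i.e. those created from the command line, request async prealloc.
     * Backends hot-added later (e.g. via object-add) stay synchronous.
     */
    bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);

    /*
     * touch_all_pages() additionally requires MADV_POPULATE_WRITE and a
     * prealloc thread context; without them it silently downgrades to
     * synchronous preallocation.
     */
    if (!use_madv_populate_write || !tc) {
        async = false;
    }
    return async;
}
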
4 changes: 2 additions & 2 deletions hw/virtio/virtio-mem.c
@@ -605,7 +605,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, uint64_t start_gpa,
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;

if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
static bool warned;

/*
@@ -1248,7 +1248,7 @@ static int virtio_mem_prealloc_range_cb(VirtIOMEM *vmem, void *arg,
int fd = memory_region_get_fd(&vmem->memdev->mr);
Error *local_err = NULL;

if (!qemu_prealloc_mem(fd, area, size, 1, NULL, &local_err)) {
if (!qemu_prealloc_mem(fd, area, size, 1, NULL, false, &local_err)) {
error_report_err(local_err);
return -ENOMEM;
}
5 changes: 5 additions & 0 deletions include/hw/qdev-core.h
@@ -1083,6 +1083,11 @@ typedef enum MachineInitPhase {
*/
PHASE_ACCEL_CREATED,

/*
* Late backend objects have been created and initialized.
*/
PHASE_LATE_BACKENDS_CREATED,

/*
* machine_class->init has been called, thus creating any embedded
* devices and validating machine properties. Devices created at
18 changes: 17 additions & 1 deletion include/qemu/osdep.h
@@ -680,17 +680,33 @@ typedef struct ThreadContext ThreadContext;
* @area: start address of the area to preallocate
* @sz: the size of the area to preallocate
* @max_threads: maximum number of threads to use
* @tc: prealloc context threads pointer, NULL if not in use
* @async: request asynchronous preallocation, requires @tc
* @errp: returns an error if this function fails
*
* Preallocate memory (populate/prefault page tables writable) for the virtual
* memory area starting at @area with the size of @sz. After a successful call,
* each page in the area was faulted in writable at least once, for example,
* after allocating file blocks for mapped files.
*
* When setting @async, allocation might be performed asynchronously.
* qemu_finish_async_prealloc_mem() must be called to finish any asynchronous
* preallocation.
*
* Return: true on success, else false setting @errp with error.
*/
bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp);
ThreadContext *tc, bool async, Error **errp);

/**
* qemu_finish_async_prealloc_mem:
* @errp: returns an error if this function fails
*
* Finish all outstanding asynchronous memory preallocation.
*
* Return: true on success, else false setting @errp with error.
*/
bool qemu_finish_async_prealloc_mem(Error **errp);

/**
* qemu_get_pid_name:
9 changes: 9 additions & 0 deletions system/vl.c
@@ -2013,6 +2013,14 @@ static void qemu_create_late_backends(void)

object_option_foreach_add(object_create_late);

/*
* Wait for any outstanding memory prealloc from created memory
* backends to complete.
*/
if (!qemu_finish_async_prealloc_mem(&error_fatal)) {
exit(1);
}

if (tpm_init() < 0) {
exit(1);
}
@@ -3699,6 +3707,7 @@ void qemu_init(int argc, char **argv)
* over memory-backend-file objects).
*/
qemu_create_late_backends();
phase_advance(PHASE_LATE_BACKENDS_CREATED);

/*
* Note: creates a QOM object, must run only after global and
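
For orientation, the resulting startup ordering in qemu_init() reduces to roughly the following (paraphrased from the two hunks above, not a verbatim excerpt):

qemu_create_late_backends();    /* each prealloc=on backend queues async work;
                                 * the function then drains it via
                                 * qemu_finish_async_prealloc_mem() */
phase_advance(PHASE_LATE_BACKENDS_CREATED);    /* later prealloc requests,
                                                * e.g. from object-add, run
                                                * synchronously */
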
131 changes: 100 additions & 31 deletions util/oslib-posix.c
@@ -42,6 +42,7 @@
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/thread-context.h"
#include "qemu/main-loop.h"

#ifdef CONFIG_LINUX
#include <sys/syscall.h>
@@ -63,11 +64,15 @@

struct MemsetThread;

static QLIST_HEAD(, MemsetContext) memset_contexts =
QLIST_HEAD_INITIALIZER(memset_contexts);

typedef struct MemsetContext {
bool all_threads_created;
bool any_thread_failed;
struct MemsetThread *threads;
int num_threads;
QLIST_ENTRY(MemsetContext) next;
} MemsetContext;

struct MemsetThread {
@@ -412,28 +417,56 @@ static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
return ret;
}

static int wait_and_free_mem_prealloc_context(MemsetContext *context)
{
int i, ret = 0, tmp;

for (i = 0; i < context->num_threads; i++) {
tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);

if (tmp) {
ret = tmp;
}
}
g_free(context->threads);
g_free(context);
return ret;
}

static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
int max_threads, ThreadContext *tc,
int max_threads, ThreadContext *tc, bool async,
bool use_madv_populate_write)
{
static gsize initialized = 0;
MemsetContext context = {
.num_threads = get_memset_num_threads(hpagesize, numpages, max_threads),
};
MemsetContext *context = g_malloc0(sizeof(MemsetContext));
size_t numpages_per_thread, leftover;
void *(*touch_fn)(void *);
int ret = 0, i = 0;
int ret, i = 0;
char *addr = area;

/*
* Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE
* and prealloc context for thread placement.
*/
if (!use_madv_populate_write || !tc) {
async = false;
}

context->num_threads =
get_memset_num_threads(hpagesize, numpages, max_threads);

if (g_once_init_enter(&initialized)) {
qemu_mutex_init(&page_mutex);
qemu_cond_init(&page_cond);
g_once_init_leave(&initialized, 1);
}

if (use_madv_populate_write) {
/* Avoid creating a single thread for MADV_POPULATE_WRITE */
if (context.num_threads == 1) {
/*
* Avoid creating a single thread for MADV_POPULATE_WRITE when
* preallocating synchronously.
*/
if (context->num_threads == 1 && !async) {
if (qemu_madvise(area, hpagesize * numpages,
QEMU_MADV_POPULATE_WRITE)) {
return -errno;
@@ -445,50 +478,86 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
touch_fn = do_touch_pages;
}

context.threads = g_new0(MemsetThread, context.num_threads);
numpages_per_thread = numpages / context.num_threads;
leftover = numpages % context.num_threads;
for (i = 0; i < context.num_threads; i++) {
context.threads[i].addr = addr;
context.threads[i].numpages = numpages_per_thread + (i < leftover);
context.threads[i].hpagesize = hpagesize;
context.threads[i].context = &context;
context->threads = g_new0(MemsetThread, context->num_threads);
numpages_per_thread = numpages / context->num_threads;
leftover = numpages % context->num_threads;
for (i = 0; i < context->num_threads; i++) {
context->threads[i].addr = addr;
context->threads[i].numpages = numpages_per_thread + (i < leftover);
context->threads[i].hpagesize = hpagesize;
context->threads[i].context = context;
if (tc) {
thread_context_create_thread(tc, &context.threads[i].pgthread,
thread_context_create_thread(tc, &context->threads[i].pgthread,
"touch_pages",
touch_fn, &context.threads[i],
touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
} else {
qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
touch_fn, &context.threads[i],
qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
touch_fn, &context->threads[i],
QEMU_THREAD_JOINABLE);
}
addr += context.threads[i].numpages * hpagesize;
addr += context->threads[i].numpages * hpagesize;
}

if (async) {
/*
* async requests currently require the BQL. Add it to the list and kick
* preallocation off during qemu_finish_async_prealloc_mem().
*/
assert(bql_locked());
QLIST_INSERT_HEAD(&memset_contexts, context, next);
return 0;
}

if (!use_madv_populate_write) {
sigbus_memset_context = &context;
sigbus_memset_context = context;
}

qemu_mutex_lock(&page_mutex);
context.all_threads_created = true;
context->all_threads_created = true;
qemu_cond_broadcast(&page_cond);
qemu_mutex_unlock(&page_mutex);

for (i = 0; i < context.num_threads; i++) {
int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
ret = wait_and_free_mem_prealloc_context(context);

if (!use_madv_populate_write) {
sigbus_memset_context = NULL;
}
return ret;
}

bool qemu_finish_async_prealloc_mem(Error **errp)
{
int ret = 0, tmp;
MemsetContext *context, *next_context;

/* Waiting for preallocation requires the BQL. */
assert(bql_locked());
if (QLIST_EMPTY(&memset_contexts)) {
return true;
}

qemu_mutex_lock(&page_mutex);
QLIST_FOREACH(context, &memset_contexts, next) {
context->all_threads_created = true;
}
qemu_cond_broadcast(&page_cond);
qemu_mutex_unlock(&page_mutex);

QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
QLIST_REMOVE(context, next);
tmp = wait_and_free_mem_prealloc_context(context);
if (tmp) {
ret = tmp;
}
}

if (!use_madv_populate_write) {
sigbus_memset_context = NULL;
if (ret) {
error_setg_errno(errp, -ret,
"qemu_prealloc_mem: preallocating memory failed");
return false;
}
g_free(context.threads);

return ret;
return true;
}

static bool madv_populate_write_possible(char *area, size_t pagesize)
Expand All @@ -498,7 +567,7 @@ static bool madv_populate_write_possible(char *area, size_t pagesize)
}

bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp)
ThreadContext *tc, bool async, Error **errp)
{
static gsize initialized;
int ret;
@@ -540,7 +609,7 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
}

/* touch pages simultaneously */
ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc,
ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
use_madv_populate_write);
if (ret) {
error_setg_errno(errp, -ret,
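
The synchronization at the heart of the change: touch_all_pages() creates the worker threads immediately but parks them on page_cond until all_threads_created is set, and the async path merely defers that release (and the joins) to qemu_finish_async_prealloc_mem(). The same pattern in standalone, plain-pthreads form (illustrative only; none of this code is from QEMU):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool all_threads_created;

static void *worker(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&mutex);
    while (!all_threads_created) {      /* park until released */
        pthread_cond_wait(&cond, &mutex);
    }
    pthread_mutex_unlock(&mutex);
    /* ... touch this worker's share of the pages here ... */
    return NULL;
}

int main(void)
{
    enum { NTHREADS = 4 };
    pthread_t threads[NTHREADS];

    /* "start" phase: threads exist but are parked on the condition variable */
    for (int i = 0; i < NTHREADS; i++) {
        pthread_create(&threads[i], NULL, worker, NULL);
    }

    /* "finish" phase: release every worker, then wait for them */
    pthread_mutex_lock(&mutex);
    all_threads_created = true;
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&mutex);

    for (int i = 0; i < NTHREADS; i++) {
        pthread_join(threads[i], NULL);
    }
    printf("all workers released and joined\n");
    return 0;
}
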
8 changes: 7 additions & 1 deletion util/oslib-win32.c
@@ -265,7 +265,7 @@ int getpagesize(void)
}

bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
ThreadContext *tc, Error **errp)
ThreadContext *tc, bool async, Error **errp)
{
int i;
size_t pagesize = qemu_real_host_page_size();
@@ -278,6 +278,12 @@ bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
return true;
}

bool qemu_finish_async_prealloc_mem(Error **errp)
{
/* async prealloc not supported, there is nothing to finish */
return true;
}

char *qemu_get_pid_name(pid_t pid)
{
/* XXX Implement me */
