Skip to content

Commit

Permalink
block: add BDRV_REQ_REGISTERED_BUF request flag
Browse files Browse the repository at this point in the history
Block drivers may optimize I/O requests accessing buffers previously
registered with bdrv_register_buf(). Checking whether all elements of a
request's QEMUIOVector are within previously registered buffers is
expensive, so we need a hint from the user to avoid costly checks.

Add a BDRV_REQ_REGISTERED_BUF request flag to indicate that all
QEMUIOVector elements in an I/O request are known to be within
previously registered buffers.

Always pass the flag through to driver read/write functions. There is
little harm in passing the flag to a driver that does not use it.
Passing the flag to drivers avoids changes across many block drivers.
Filter drivers would need to explicitly support the flag and pass
through to their children when the children support it. That's a lot of
code changes and it's hard to remember to do that everywhere, leading to
silent reduced performance when the flag is accidentally dropped.

The only problematic scenario with the approach in this patch is when a
driver passes the flag through to internal I/O requests that don't use
the same I/O buffer. In that case the hint may be set when it should
actually be clear. This is a rare case though so the risk is low.

Some drivers have assert(!flags), which no longer works when
BDRV_REQ_REGISTERED_BUF is passed in. These assertions aren't very
useful anyway since the functions are called almost exclusively by
bdrv_driver_preadv/pwritev() so if we get flags handling right there
then the assertion is not needed.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20221013185908.1297568-7-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
  • Loading branch information
stefanhaRH committed Oct 26, 2022
1 parent 98b3ddc commit e8b6535
Show file tree
Hide file tree
Showing 16 changed files with 69 additions and 37 deletions.
14 changes: 14 additions & 0 deletions block.c
Expand Up @@ -1641,6 +1641,20 @@ static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
goto open_failed;
}

assert(!(bs->supported_read_flags & ~BDRV_REQ_MASK));
assert(!(bs->supported_write_flags & ~BDRV_REQ_MASK));

/*
* Always allow the BDRV_REQ_REGISTERED_BUF optimization hint. This saves
* drivers that pass read/write requests through to a child the trouble of
* declaring support explicitly.
*
* Drivers must not propagate this flag accidentally when they initiate I/O
* to a bounce buffer. That case should be rare though.
*/
bs->supported_read_flags |= BDRV_REQ_REGISTERED_BUF;
bs->supported_write_flags |= BDRV_REQ_REGISTERED_BUF;

ret = refresh_total_sectors(bs, bs->total_sectors);
if (ret < 0) {
error_setg_errno(errp, -ret, "Could not refresh total sector count");
Expand Down
4 changes: 2 additions & 2 deletions block/blkverify.c
Expand Up @@ -235,8 +235,8 @@ blkverify_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
qemu_iovec_init(&raw_qiov, qiov->niov);
qemu_iovec_clone(&raw_qiov, qiov, buf);

ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov, flags,
false);
ret = blkverify_co_prwv(bs, &r, offset, bytes, qiov, &raw_qiov,
flags & ~BDRV_REQ_REGISTERED_BUF, false);

cmp_offset = qemu_iovec_compare(qiov, &raw_qiov);
if (cmp_offset != -1) {
Expand Down
4 changes: 2 additions & 2 deletions block/crypto.c
Expand Up @@ -410,7 +410,6 @@ block_crypto_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block);
uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block);

assert(!flags);
assert(payload_offset < INT64_MAX);
assert(QEMU_IS_ALIGNED(offset, sector_size));
assert(QEMU_IS_ALIGNED(bytes, sector_size));
Expand Down Expand Up @@ -473,7 +472,8 @@ block_crypto_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
uint64_t sector_size = qcrypto_block_get_sector_size(crypto->block);
uint64_t payload_offset = qcrypto_block_get_payload_offset(crypto->block);

assert(!(flags & ~BDRV_REQ_FUA));
flags &= ~BDRV_REQ_REGISTERED_BUF;

assert(payload_offset < INT64_MAX);
assert(QEMU_IS_ALIGNED(offset, sector_size));
assert(QEMU_IS_ALIGNED(bytes, sector_size));
Expand Down
1 change: 0 additions & 1 deletion block/file-posix.c
Expand Up @@ -2133,7 +2133,6 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
int64_t bytes, QEMUIOVector *qiov,
BdrvRequestFlags flags)
{
assert(flags == 0);
return raw_co_prw(bs, offset, bytes, qiov, QEMU_AIO_WRITE);
}

Expand Down
1 change: 0 additions & 1 deletion block/gluster.c
Expand Up @@ -1236,7 +1236,6 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
QEMUIOVector *qiov,
int flags)
{
assert(!flags);
return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1);
}

Expand Down
61 changes: 38 additions & 23 deletions block/io.c
Expand Up @@ -1130,8 +1130,7 @@ static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
int ret;

bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
assert(!(flags & ~BDRV_REQ_MASK));
assert(!(flags & BDRV_REQ_NO_FALLBACK));
assert(!(flags & ~bs->supported_read_flags));

if (!drv) {
return -ENOMEDIUM;
Expand Down Expand Up @@ -1195,23 +1194,29 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
BdrvRequestFlags flags)
{
BlockDriver *drv = bs->drv;
bool emulate_fua = false;
int64_t sector_num;
unsigned int nb_sectors;
QEMUIOVector local_qiov;
int ret;

bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
assert(!(flags & ~BDRV_REQ_MASK));
assert(!(flags & BDRV_REQ_NO_FALLBACK));

if (!drv) {
return -ENOMEDIUM;
}

if ((flags & BDRV_REQ_FUA) &&
(~bs->supported_write_flags & BDRV_REQ_FUA)) {
flags &= ~BDRV_REQ_FUA;
emulate_fua = true;
}

flags &= bs->supported_write_flags;

if (drv->bdrv_co_pwritev_part) {
ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
flags & bs->supported_write_flags);
flags &= ~bs->supported_write_flags;
flags);
goto emulate_flags;
}

Expand All @@ -1221,9 +1226,7 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
}

if (drv->bdrv_co_pwritev) {
ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
flags & bs->supported_write_flags);
flags &= ~bs->supported_write_flags;
ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov, flags);
goto emulate_flags;
}

Expand All @@ -1233,10 +1236,8 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
.coroutine = qemu_coroutine_self(),
};

acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
flags & bs->supported_write_flags,
acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov, flags,
bdrv_co_io_em_complete, &co);
flags &= ~bs->supported_write_flags;
if (acb == NULL) {
ret = -EIO;
} else {
Expand All @@ -1254,12 +1255,10 @@ static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
assert(bytes <= BDRV_REQUEST_MAX_BYTES);

assert(drv->bdrv_co_writev);
ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
flags & bs->supported_write_flags);
flags &= ~bs->supported_write_flags;
ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov, flags);

emulate_flags:
if (ret == 0 && (flags & BDRV_REQ_FUA)) {
if (ret == 0 && emulate_fua) {
ret = bdrv_co_flush(bs);
}

Expand Down Expand Up @@ -1487,11 +1486,14 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
align);

/* TODO: We would need a per-BDS .supported_read_flags and
/*
* TODO: We would need a per-BDS .supported_read_flags and
* potential fallback support, if we ever implement any read flags
* to pass through to drivers. For now, there aren't any
* passthrough flags. */
assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));
* passthrough flags except the BDRV_REQ_REGISTERED_BUF optimization hint.
*/
assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH |
BDRV_REQ_REGISTERED_BUF)));

/* Handle Copy on Read and associated serialisation */
if (flags & BDRV_REQ_COPY_ON_READ) {
Expand Down Expand Up @@ -1532,7 +1534,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
goto out;
}

assert(!(flags & ~bs->supported_read_flags));
assert(!(flags & ~(bs->supported_read_flags | BDRV_REQ_REGISTERED_BUF)));

max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
if (bytes <= max_bytes && bytes <= max_transfer) {
Expand Down Expand Up @@ -1721,7 +1723,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
static int bdrv_pad_request(BlockDriverState *bs,
QEMUIOVector **qiov, size_t *qiov_offset,
int64_t *offset, int64_t *bytes,
BdrvRequestPadding *pad, bool *padded)
BdrvRequestPadding *pad, bool *padded,
BdrvRequestFlags *flags)
{
int ret;

Expand Down Expand Up @@ -1749,6 +1752,10 @@ static int bdrv_pad_request(BlockDriverState *bs,
if (padded) {
*padded = true;
}
if (flags) {
/* Can't use optimization hint with bounce buffer */
*flags &= ~BDRV_REQ_REGISTERED_BUF;
}

return 0;
}
Expand Down Expand Up @@ -1803,7 +1810,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
}

ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
NULL);
NULL, &flags);
if (ret < 0) {
goto fail;
}
Expand Down Expand Up @@ -1848,6 +1855,11 @@ static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
return -ENOTSUP;
}

/* By definition there is no user buffer so this flag doesn't make sense */
if (flags & BDRV_REQ_REGISTERED_BUF) {
return -EINVAL;
}

/* Invalidate the cached block-status data range if this write overlaps */
bdrv_bsc_invalidate_range(bs, offset, bytes);

Expand Down Expand Up @@ -2133,6 +2145,9 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
bool padding;
BdrvRequestPadding pad;

/* This flag doesn't make sense for padding or zero writes */
flags &= ~BDRV_REQ_REGISTERED_BUF;

padding = bdrv_init_padding(bs, offset, bytes, &pad);
if (padding) {
assert(!(flags & BDRV_REQ_NO_WAIT));
Expand Down Expand Up @@ -2250,7 +2265,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
* alignment only if there is no ZERO flag.
*/
ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
&padded);
&padded, &flags);
if (ret < 0) {
return ret;
}
Expand Down
2 changes: 2 additions & 0 deletions block/mirror.c
Expand Up @@ -1486,6 +1486,8 @@ static int coroutine_fn bdrv_mirror_top_pwritev(BlockDriverState *bs,
qemu_iovec_init(&bounce_qiov, 1);
qemu_iovec_add(&bounce_qiov, bounce_buf, bytes);
qiov = &bounce_qiov;

flags &= ~BDRV_REQ_REGISTERED_BUF;
}

ret = bdrv_mirror_top_do_write(bs, MIRROR_METHOD_COPY, offset, bytes, qiov,
Expand Down
1 change: 0 additions & 1 deletion block/nbd.c
Expand Up @@ -1222,7 +1222,6 @@ static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t offse
};

assert(bytes <= NBD_MAX_BUFFER_SIZE);
assert(!flags);

if (!bytes) {
return 0;
Expand Down
1 change: 0 additions & 1 deletion block/parallels.c
Expand Up @@ -329,7 +329,6 @@ static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
QEMUIOVector hd_qiov;
int ret = 0;

assert(!flags);
qemu_iovec_init(&hd_qiov, qiov->niov);

while (nb_sectors > 0) {
Expand Down
2 changes: 0 additions & 2 deletions block/qcow.c
Expand Up @@ -628,7 +628,6 @@ static coroutine_fn int qcow_co_preadv(BlockDriverState *bs, int64_t offset,
uint8_t *buf;
void *orig_buf;

assert(!flags);
if (qiov->niov > 1) {
buf = orig_buf = qemu_try_blockalign(bs, qiov->size);
if (buf == NULL) {
Expand Down Expand Up @@ -725,7 +724,6 @@ static coroutine_fn int qcow_co_pwritev(BlockDriverState *bs, int64_t offset,
uint8_t *buf;
void *orig_buf;

assert(!flags);
s->cluster_cache_offset = -1; /* disable compressed cache */

/* We must always copy the iov when encrypting, so we
Expand Down
1 change: 0 additions & 1 deletion block/qed.c
Expand Up @@ -1395,7 +1395,6 @@ static int coroutine_fn bdrv_qed_co_writev(BlockDriverState *bs,
int64_t sector_num, int nb_sectors,
QEMUIOVector *qiov, int flags)
{
assert(!flags);
return qed_co_request(bs, sector_num, qiov, nb_sectors, QED_AIOCB_WRITE);
}

Expand Down
2 changes: 2 additions & 0 deletions block/raw-format.c
Expand Up @@ -258,6 +258,8 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, int64_t offset,
qemu_iovec_add(&local_qiov, buf, 512);
qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512);
qiov = &local_qiov;

flags &= ~BDRV_REQ_REGISTERED_BUF;
}

ret = raw_adjust_offset(bs, &offset, bytes, true);
Expand Down
1 change: 0 additions & 1 deletion block/replication.c
Expand Up @@ -261,7 +261,6 @@ static coroutine_fn int replication_co_writev(BlockDriverState *bs,
int ret;
int64_t n;

assert(!flags);
ret = replication_get_io_status(s);
if (ret < 0) {
goto out;
Expand Down
1 change: 0 additions & 1 deletion block/ssh.c
Expand Up @@ -1196,7 +1196,6 @@ static coroutine_fn int ssh_co_writev(BlockDriverState *bs,
BDRVSSHState *s = bs->opaque;
int ret;

assert(!flags);
qemu_co_mutex_lock(&s->lock);
ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE,
nb_sectors * BDRV_SECTOR_SIZE, qiov);
Expand Down
1 change: 0 additions & 1 deletion block/vhdx.c
Expand Up @@ -1342,7 +1342,6 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num,
uint64_t bat_prior_offset = 0;
bool bat_update = false;

assert(!flags);
qemu_iovec_init(&hd_qiov, qiov->niov);

qemu_co_mutex_lock(&s->lock);
Expand Down
9 changes: 9 additions & 0 deletions include/block/block-common.h
Expand Up @@ -80,6 +80,15 @@ typedef enum {
*/
BDRV_REQ_MAY_UNMAP = 0x4,

/*
* An optimization hint when all QEMUIOVector elements are within
* previously registered bdrv_register_buf() memory ranges.
*
* Code that replaces the user's QEMUIOVector elements with bounce buffers
* must take care to clear this flag.
*/
BDRV_REQ_REGISTERED_BUF = 0x8,

BDRV_REQ_FUA = 0x10,
BDRV_REQ_WRITE_COMPRESSED = 0x20,

Expand Down

0 comments on commit e8b6535

Please sign in to comment.