From 6cba5377f54d7ea859a29c1877785e7101794683 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 6 Apr 2018 18:41:08 +0200 Subject: [PATCH 01/21] iotests: Split 214 off of 122 Commit abd3622cc03cf41ed542126a540385f30a4c0175 added a case to 122 regarding how the qcow2 driver handles an incorrect compressed data length value. This does not really fit into 122, as that file is supposed to contain qemu-img convert test cases, which this case is not. So this patch splits it off into its own file; maybe we will even get more qcow2-only compression tests in the future. Also, that test case does not work with refcount_bits=1, so mark that option as unsupported. Signed-off-by: Max Reitz Message-id: 20180406164108.26118-1-mreitz@redhat.com Reviewed-by: Eric Blake Signed-off-by: Alberto Garcia Signed-off-by: Max Reitz --- tests/qemu-iotests/122 | 47 ------------------ tests/qemu-iotests/122.out | 33 ------------- tests/qemu-iotests/214 | 97 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/214.out | 35 ++++++++++++++ tests/qemu-iotests/group | 1 + 5 files changed, 133 insertions(+), 80 deletions(-) create mode 100755 tests/qemu-iotests/214 create mode 100644 tests/qemu-iotests/214.out diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122 index 6cf4fcb86632..45b359c2ba05 100755 --- a/tests/qemu-iotests/122 +++ b/tests/qemu-iotests/122 @@ -129,53 +129,6 @@ $QEMU_IO -c "read -P 0x44 1023k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _fil $QEMU_IO -c "read -P 0 1024k 1022k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -echo -echo "=== Corrupted size field in compressed cluster descriptor ===" -echo -# Create an empty image and fill half of it with compressed data. -# The L2 entries of the two compressed clusters are located at -# 0x800000 and 0x800008, their original values are 0x4008000000a00000 -# and 0x4008000000a00802 (5 sectors for compressed data each). 
-_make_test_img 8M -o cluster_size=2M -$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \ - 2>&1 | _filter_qemu_io | _filter_testdir - -# Reduce size of compressed data to 4 sectors: this corrupts the image. -poke_file "$TEST_IMG" $((0x800000)) "\x40\x06" -$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir - -# 'qemu-img check' however doesn't see anything wrong because it -# doesn't try to decompress the data and the refcounts are consistent. -# TODO: update qemu-img so this can be detected. -_check_test_img - -# Increase size of compressed data to the maximum (8192 sectors). -# This makes QEMU read more data (8192 sectors instead of 5, host -# addresses [0xa00000, 0xdfffff]), but the decompression algorithm -# stops once we have enough to restore the uncompressed cluster, so -# the rest of the data is ignored. -poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe" -# Do it also for the second compressed cluster (L2 entry at 0x800008). -# In this case the compressed data would span 3 host clusters -# (host addresses: [0xa00802, 0xe00801]) -poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe" - -# Here the image is too small so we're asking QEMU to read beyond the -# end of the image. -$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -# But if we grow the image we won't be reading beyond its end anymore. -$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir - -# The refcount data is however wrong because due to the increased size -# of the compressed data it now reaches the following host clusters. -# This can be repaired by qemu-img check by increasing the refcount of -# those clusters. -# TODO: update qemu-img to correct the compressed cluster size instead. 
-_check_test_img -r all -$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir -$QEMU_IO -c "read -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir - echo echo "=== Full allocation with -S 0 ===" echo diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out index a6b7fe007e20..47d8656db84f 100644 --- a/tests/qemu-iotests/122.out +++ b/tests/qemu-iotests/122.out @@ -99,39 +99,6 @@ read 1024/1024 bytes at offset 1047552 read 1046528/1046528 bytes at offset 1048576 1022 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -=== Corrupted size field in compressed cluster descriptor === - -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608 -wrote 2097152/2097152 bytes at offset 0 -2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 2097152/2097152 bytes at offset 2097152 -2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read failed: Input/output error -No errors were found on the image. -read 4194304/4194304 bytes at offset 0 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -wrote 4194304/4194304 bytes at offset 4194304 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4194304/4194304 bytes at offset 0 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -ERROR cluster 6 refcount=1 reference=3 -ERROR cluster 7 refcount=1 reference=2 -Repairing cluster 6 refcount=1 reference=3 -Repairing cluster 7 refcount=1 reference=2 -Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3 -Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2 -The following inconsistencies were found and repaired: - - 0 leaked clusters - 4 corruptions - -Double checking the fixed image now... -No errors were found on the image. 
-read 4194304/4194304 bytes at offset 0 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) -read 4194304/4194304 bytes at offset 4194304 -4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) - === Full allocation with -S 0 === Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 diff --git a/tests/qemu-iotests/214 b/tests/qemu-iotests/214 new file mode 100755 index 000000000000..c46ca2a6dde0 --- /dev/null +++ b/tests/qemu-iotests/214 @@ -0,0 +1,97 @@ +#!/bin/bash +# +# Test qcow2 image compression +# +# Copyright (C) 2018 Igalia, S.L. +# Author: Alberto Garcia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +seq=$(basename "$0") +echo "QA output created by $seq" + +here=$PWD +status=1 # failure is the default! + +_cleanup() +{ + _cleanup_test_img +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter + +_supported_fmt qcow2 +_supported_proto file +_supported_os Linux + +# Repairing the corrupted image requires qemu-img check to store a +# refcount up to 3, which requires at least two refcount bits. +_unsupported_imgopts 'refcount_bits=1[^0-9]' + + +echo +echo "=== Corrupted size field in compressed cluster descriptor ===" +echo +# Create an empty image and fill half of it with compressed data. 
+# The L2 entries of the two compressed clusters are located at +# 0x800000 and 0x800008, their original values are 0x4008000000a00000 +# and 0x4008000000a00802 (5 sectors for compressed data each). +_make_test_img 8M -o cluster_size=2M +$QEMU_IO -c "write -c -P 0x11 0 2M" -c "write -c -P 0x11 2M 2M" "$TEST_IMG" \ + 2>&1 | _filter_qemu_io | _filter_testdir + +# Reduce size of compressed data to 4 sectors: this corrupts the image. +poke_file "$TEST_IMG" $((0x800000)) "\x40\x06" +$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# 'qemu-img check' however doesn't see anything wrong because it +# doesn't try to decompress the data and the refcounts are consistent. +# TODO: update qemu-img so this can be detected. +_check_test_img + +# Increase size of compressed data to the maximum (8192 sectors). +# This makes QEMU read more data (8192 sectors instead of 5, host +# addresses [0xa00000, 0xdfffff]), but the decompression algorithm +# stops once we have enough to restore the uncompressed cluster, so +# the rest of the data is ignored. +poke_file "$TEST_IMG" $((0x800000)) "\x7f\xfe" +# Do it also for the second compressed cluster (L2 entry at 0x800008). +# In this case the compressed data would span 3 host clusters +# (host addresses: [0xa00802, 0xe00801]) +poke_file "$TEST_IMG" $((0x800008)) "\x7f\xfe" + +# Here the image is too small so we're asking QEMU to read beyond the +# end of the image. +$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +# But if we grow the image we won't be reading beyond its end anymore. +$QEMU_IO -c "write -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# The refcount data is however wrong because due to the increased size +# of the compressed data it now reaches the following host clusters. 
+# This can be repaired by qemu-img check by increasing the refcount of +# those clusters. +# TODO: update qemu-img to correct the compressed cluster size instead. +_check_test_img -r all +$QEMU_IO -c "read -P 0x11 0 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir +$QEMU_IO -c "read -P 0x22 4M 4M" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir + +# success, all done +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/214.out b/tests/qemu-iotests/214.out new file mode 100644 index 000000000000..0fcd8dc051ad --- /dev/null +++ b/tests/qemu-iotests/214.out @@ -0,0 +1,35 @@ +QA output created by 214 + +=== Corrupted size field in compressed cluster descriptor === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=8388608 +wrote 2097152/2097152 bytes at offset 0 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 2097152/2097152 bytes at offset 2097152 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read failed: Input/output error +No errors were found on the image. +read 4194304/4194304 bytes at offset 0 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 4194304/4194304 bytes at offset 4194304 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 4194304/4194304 bytes at offset 0 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +ERROR cluster 6 refcount=1 reference=3 +ERROR cluster 7 refcount=1 reference=2 +Repairing cluster 6 refcount=1 reference=3 +Repairing cluster 7 refcount=1 reference=2 +Repairing OFLAG_COPIED data cluster: l2_entry=8000000000c00000 refcount=3 +Repairing OFLAG_COPIED data cluster: l2_entry=8000000000e00000 refcount=2 +The following inconsistencies were found and repaired: + + 0 leaked clusters + 4 corruptions + +Double checking the fixed image now... +No errors were found on the image. 
+read 4194304/4194304 bytes at offset 0 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 4194304/4194304 bytes at offset 4194304 +4 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index 5daef2402083..aed024af05b3 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -212,4 +212,5 @@ 211 rw auto quick 212 rw auto quick 213 rw auto quick +214 rw auto 218 rw auto quick From 74c44a59348f7fac96c32621e37ee636546f26f8 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Tue, 10 Apr 2018 18:05:03 +0200 Subject: [PATCH 02/21] Fix error message about compressed clusters with OFLAG_COPIED Compressed clusters are not supposed to have the COPIED bit set. "qemu-img check" detects that and prints an error message reporting the number of the affected host cluster. This doesn't make much sense because compressed clusters are not aligned to host clusters, so it would be better to report the offset instead. Plus, the calculation is wrong and it uses the raw L2 entry as if it was simply an offset. This patch fixes the error message and reports the offset of the compressed cluster. 
Signed-off-by: Alberto Garcia Message-id: 0f687957feb72e80c740403191a47e607c2463fe.1523376013.git.berto@igalia.com Signed-off-by: Max Reitz --- block/qcow2-refcount.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 6b8b63514afc..2dc23005b7c6 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -1577,9 +1577,9 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, case QCOW2_CLUSTER_COMPRESSED: /* Compressed clusters don't have QCOW_OFLAG_COPIED */ if (l2_entry & QCOW_OFLAG_COPIED) { - fprintf(stderr, "ERROR: cluster %" PRId64 ": " + fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": " "copied flag must never be set for compressed " - "clusters\n", l2_entry >> s->cluster_bits); + "clusters\n", l2_entry & s->cluster_offset_mask); l2_entry &= ~QCOW_OFLAG_COPIED; res->corruptions++; } From 3c7d14b201ee4eeec2ca259b5a071a4599aa8847 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Tue, 10 Apr 2018 18:05:04 +0200 Subject: [PATCH 03/21] specs/qcow2: Clarify that compressed clusters have the COPIED bit reset Compressed clusters are not supposed to have the COPIED bit set, but this is not made explicit in the specs, so let's document it. Signed-off-by: Alberto Garcia Message-id: 74552e1d6e858d3159cb0c0e188e80bc9248e337.1523376013.git.berto@igalia.com Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- docs/interop/qcow2.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt index feb711fb6a88..8e1547ded27b 100644 --- a/docs/interop/qcow2.txt +++ b/docs/interop/qcow2.txt @@ -400,10 +400,10 @@ L2 table entry: 62: 0 for standard clusters 1 for compressed clusters - 63: 0 for a cluster that is unused or requires COW, 1 if its - refcount is exactly one. This information is only accurate - in L2 tables that are reachable from the active L1 - table. 
+ 63: 0 for clusters that are unused, compressed or require COW. 1 for standard clusters whose refcount is exactly one. This information is only accurate in L2 tables that are reachable from the active L1 table. Standard Cluster Descriptor: From 52253998ec3e523c9e20ae81e2a6431d8ff733ba Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Tue, 17 Apr 2018 15:37:04 +0300 Subject: [PATCH 04/21] qcow2: Give the refcount cache the minimum possible size by default The L2 and refcount caches have default sizes that can be overridden using the l2-cache-size and refcount-cache-size parameters (an additional parameter named cache-size sets the combined size of both caches). Unless forced by one of the aforementioned parameters, QEMU will set the unspecified sizes so that the L2 cache is 4 times larger than the refcount cache. This is based on the premise that the refcount metadata needs to be only a fourth of the L2 metadata to cover the same amount of disk space. This is incorrect for two reasons: a) The amount of disk covered by an L2 table depends solely on the cluster size, but in the case of a refcount block it depends on the cluster size *and* the width of each refcount entry. The 4/1 ratio is only valid with 16-bit entries (the default). b) When we talk about disk space and L2 tables we are talking about guest space (L2 tables map guest clusters to host clusters), whereas refcount blocks are used for host clusters (including L1/L2 tables and the refcount blocks themselves). On a fully populated (and uncompressed) qcow2 file, image size > virtual size so there are more refcount entries than L2 entries. Problem (a) could be fixed by adjusting the algorithm to take into account the refcount entry width. Problem (b) could be fixed by slightly increasing the refcount cache size to account for the clusters used for qcow2 metadata. 
However this patch takes a completely different approach and instead of keeping a ratio between both cache sizes it assigns as much as possible to the L2 cache and the remainder to the refcount cache. The reason is that L2 tables are used for every single I/O request from the guest and the effect of increasing the cache is significant and clearly measurable. Refcount blocks are however only used for cluster allocation and internal snapshots and in practice are accessed sequentially in most cases, so the effect of increasing the cache is negligible (even when doing random writes from the guest). So, make the refcount cache as small as possible unless the user explicitly asks for a larger one. Signed-off-by: Alberto Garcia Reviewed-by: Eric Blake Reviewed-by: Max Reitz Message-id: 9695182c2eb11b77cb319689a1ebaa4e7c9d6591.1523968389.git.berto@igalia.com Signed-off-by: Max Reitz --- block/qcow2.c | 31 +++++++++++++++++++------------ block/qcow2.h | 4 ---- tests/qemu-iotests/137.out | 2 +- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 2f36e632f93f..6d532470a8e8 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -802,23 +802,30 @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, } else if (refcount_cache_size_set) { *l2_cache_size = combined_cache_size - *refcount_cache_size; } else { - *refcount_cache_size = combined_cache_size - / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1); - *l2_cache_size = combined_cache_size - *refcount_cache_size; + uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8); + uint64_t min_refcount_cache = + (uint64_t) MIN_REFCOUNT_CACHE_SIZE * s->cluster_size; + + /* Assign as much memory as possible to the L2 cache, and + * use the remainder for the refcount cache */ + if (combined_cache_size >= max_l2_cache + min_refcount_cache) { + *l2_cache_size = max_l2_cache; + *refcount_cache_size = 
combined_cache_size - *l2_cache_size; + } else { + *refcount_cache_size = + MIN(combined_cache_size, min_refcount_cache); + *l2_cache_size = combined_cache_size - *refcount_cache_size; + } } } else { - if (!l2_cache_size_set && !refcount_cache_size_set) { + if (!l2_cache_size_set) { *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE, (uint64_t)DEFAULT_L2_CACHE_CLUSTERS * s->cluster_size); - *refcount_cache_size = *l2_cache_size - / DEFAULT_L2_REFCOUNT_SIZE_RATIO; - } else if (!l2_cache_size_set) { - *l2_cache_size = *refcount_cache_size - * DEFAULT_L2_REFCOUNT_SIZE_RATIO; - } else if (!refcount_cache_size_set) { - *refcount_cache_size = *l2_cache_size - / DEFAULT_L2_REFCOUNT_SIZE_RATIO; + } + if (!refcount_cache_size_set) { + *refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size; } } diff --git a/block/qcow2.h b/block/qcow2.h index adf5c3950fd3..01b5250415f0 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -77,10 +77,6 @@ #define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */ #define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */ -/* The refblock cache needs only a fourth of the L2 cache size to cover as many - * clusters */ -#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4 - #define DEFAULT_CLUSTER_SIZE 65536 diff --git a/tests/qemu-iotests/137.out b/tests/qemu-iotests/137.out index e28e1eadbad7..96724a6c3323 100644 --- a/tests/qemu-iotests/137.out +++ b/tests/qemu-iotests/137.out @@ -22,7 +22,7 @@ refcount-cache-size may not exceed cache-size L2 cache size too big L2 cache entry size must be a power of two between 512 and the cluster size (65536) L2 cache entry size must be a power of two between 512 and the cluster size (65536) -L2 cache size too big +Refcount cache size too big Conflicting values for qcow2 options 'overlap-check' ('constant') and 'overlap-check.template' ('all') Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all Unsupported value 'blubb' for qcow2 option 'overlap-check'. 
Allowed are any of the following: none, constant, cached, all From 603790ef3aec6a19b1c095188a1d2171934a27de Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Tue, 17 Apr 2018 15:37:05 +0300 Subject: [PATCH 05/21] docs: Document the new default sizes of the qcow2 caches We have just reduced the refcount cache size to the minimum unless the user explicitly requests a larger one, so we have to update the documentation to reflect this change. Signed-off-by: Alberto Garcia Message-id: c5f0bde23558dd9d33b21fffc76ac9953cc19c56.1523968389.git.berto@igalia.com Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- docs/qcow2-cache.txt | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt index 170191a242bf..8a09a5cc5ffc 100644 --- a/docs/qcow2-cache.txt +++ b/docs/qcow2-cache.txt @@ -116,31 +116,30 @@ There are three options available, and all of them take bytes: "refcount-cache-size": maximum size of the refcount block cache "cache-size": maximum size of both caches combined -There are two things that need to be taken into account: +There are a few things that need to be taken into account: - Both caches must have a size that is a multiple of the cluster size (or the cache entry size: see "Using smaller cache sizes" below). - - If you only set one of the options above, QEMU will automatically - adjust the others so that the L2 cache is 4 times bigger than the - refcount cache. + - The default L2 cache size is 8 clusters or 1MB (whichever is more), + and the minimum is 2 clusters (or 2 cache entries, see below). -This means that these options are equivalent: + - The default (and minimum) refcount cache size is 4 clusters. 
- -drive file=hd.qcow2,l2-cache-size=2097152 - -drive file=hd.qcow2,refcount-cache-size=524288 - -drive file=hd.qcow2,cache-size=2621440 + - If only "cache-size" is specified then QEMU will assign as much + memory as possible to the L2 cache before increasing the refcount + cache size. -The reason for this 1/4 ratio is to ensure that both caches cover the -same amount of disk space. Note however that this is only valid with -the default value of refcount_bits (16). If you are using a different -value you might want to calculate both cache sizes yourself since QEMU -will always use the same 1/4 ratio. +Unlike L2 tables, refcount blocks are not used during normal I/O but +only during allocations and internal snapshots. In most cases they are +accessed sequentially (even during random guest I/O) so increasing the +refcount cache size won't have any measurable effect on performance +(this can change if you are using internal snapshots, so you may want +to think about increasing the cache size if you use them heavily). -It's also worth mentioning that there's no strict need for both caches -to cover the same amount of disk space. The refcount cache is used -much less often than the L2 cache, so it's perfectly reasonable to -keep it small. +Before QEMU 2.12 the refcount cache had a default size of 1/4 of the +L2 cache size. This resulted in unnecessarily large caches, so now the +refcount cache is as small as possible unless overridden by the user. Using smaller cache entries From 81c6ddf49a76a663cea16c07a07d51b67c853209 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 6 Apr 2018 17:17:30 +0200 Subject: [PATCH 06/21] iotests: Add failure matching to common.qemu Currently, common.qemu only allows matching on results indicating success. The only way to fail is by provoking a timeout. However, sometimes we do have a defined failure output and can match for that, which saves us from having to wait for the timeout in case of failure. 
Because failure can sometimes just result in a _notrun in the test, it is actually important to care about being able to fail quickly. Also, sometimes we simply do not get any specific output in case of success. The only way to handle this currently would be to define an error message as the string to look for, which means that actual success results in a timeout. This is really bad because it unnecessarily slows down a succeeding test. Therefore, this patch adds a new parameter $success_or_failure to _timed_wait_for and _send_qemu_cmd. Setting this to a non-empty string makes both commands expect two match parameters: If the first matches, the function succeeds. If the second matches, the function fails. Signed-off-by: Max Reitz Message-id: 20180406151731.4285-2-mreitz@redhat.com Signed-off-by: Max Reitz --- tests/qemu-iotests/common.qemu | 58 ++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu index 85f66b852cd8..f2854849519c 100644 --- a/tests/qemu-iotests/common.qemu +++ b/tests/qemu-iotests/common.qemu @@ -52,11 +52,29 @@ _in_fd=4 # response is not echoed out. # If $mismatch_only is set, only non-matching responses will # be echoed. +# +# If $success_or_failure is set, the meaning of the arguments is +# changed as follows: +# $2: A string to search for in the response; if found, this indicates +# success and ${QEMU_STATUS[$1]} is set to 0. +# $3: A string to search for in the response; if found, this indicates +# failure and the test is either aborted (if $qemu_error_no_exit +# is not set) or ${QEMU_STATUS[$1]} is set to -1 (otherwise). 
function _timed_wait_for() { local h=${1} shift + if [ -z "${success_or_failure}" ]; then + success_match=${*} + failure_match= + else + success_match=${1} + failure_match=${2} + fi + + timeout=yes + QEMU_STATUS[$h]=0 while IFS= read -t ${QEMU_COMM_TIMEOUT} resp <&${QEMU_OUT[$h]} do @@ -64,10 +82,18 @@ function _timed_wait_for() echo "${resp}" | _filter_testdir | _filter_qemu \ | _filter_qemu_io | _filter_qmp | _filter_hmp fi - grep -q "${*}" < <(echo "${resp}") + if [ -n "${failure_match}" ]; then + grep -q "${failure_match}" < <(echo "${resp}") + if [ $? -eq 0 ]; then + timeout= + break + fi + fi + grep -q "${success_match}" < <(echo "${resp}") if [ $? -eq 0 ]; then return - elif [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then + fi + if [ -z "${silent}" ] && [ -n "${mismatch_only}" ]; then echo "${resp}" | _filter_testdir | _filter_qemu \ | _filter_qemu_io | _filter_qmp | _filter_hmp fi @@ -75,8 +101,12 @@ function _timed_wait_for() done QEMU_STATUS[$h]=-1 if [ -z "${qemu_error_no_exit}" ]; then - echo "Timeout waiting for ${*} on handle ${h}" - exit 1 # Timeout means the test failed + if [ -n "${timeout}" ]; then + echo "Timeout waiting for ${success_match} on handle ${h}" + else + echo "Wrong response matching ${failure_match} on handle ${h}" + fi + exit 1 # Timeout or wrong match mean the test failed fi } @@ -96,6 +126,11 @@ function _timed_wait_for() # If $qemu_error_no_exit is set, then even if the expected response # is not seen, we will not exit. $QEMU_STATUS[$1] will be set it -1 in # that case. +# +# If $success_or_failure is set, then the last two strings are the +# strings the response will be scanned for. The first of the two +# indicates success, the latter indicates failure. Failure is handled +# like a timeout. 
function _send_qemu_cmd() { local h=${1} @@ -109,14 +144,23 @@ function _send_qemu_cmd() use_error="no" fi # This array element extraction is done to accommodate pathnames with spaces - cmd=${@: 1:${#@}-1} - shift $(($# - 1)) + if [ -z "${success_or_failure}" ]; then + cmd=${@: 1:${#@}-1} + shift $(($# - 1)) + else + cmd=${@: 1:${#@}-2} + shift $(($# - 2)) + fi while [ ${count} -gt 0 ] do echo "${cmd}" >&${QEMU_IN[${h}]} if [ -n "${1}" ]; then - qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}" + if [ -z "${success_or_failure}" ]; then + qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}" + else + qemu_error_no_exit=${use_error} _timed_wait_for ${h} "${1}" "${2}" + fi if [ ${QEMU_STATUS[$h]} -eq 0 ]; then return fi From b05a2225d2e87a04697509219d00ced7c46ed34d Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 6 Apr 2018 17:17:31 +0200 Subject: [PATCH 07/21] iotests: Skip 181 and 201 without userfaultfd userfaultfd support depends on the host kernel, so it may not be available. If so, 181 and 201 should be skipped. 
Signed-off-by: Max Reitz Message-id: 20180406151731.4285-3-mreitz@redhat.com Signed-off-by: Max Reitz --- tests/qemu-iotests/181 | 13 +++++++++++++ tests/qemu-iotests/201 | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/tests/qemu-iotests/181 b/tests/qemu-iotests/181 index 5e767c6195d2..e02979378da4 100755 --- a/tests/qemu-iotests/181 +++ b/tests/qemu-iotests/181 @@ -96,6 +96,19 @@ echo # Enable postcopy-ram capability both on source and destination silent=yes _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)" + +qemu_error_no_exit=yes success_or_failure=yes \ + _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported" +if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then + _send_qemu_cmd $dest '' "(qemu)" + + _send_qemu_cmd $src 'quit' "" + _send_qemu_cmd $dest 'quit' "" + wait=1 _cleanup_qemu + + _notrun 'Postcopy is not supported' +fi + _send_qemu_cmd $src 'migrate_set_speed 4k' "(qemu)" _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)" _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)" diff --git a/tests/qemu-iotests/201 b/tests/qemu-iotests/201 index 11f640f5dfc5..c1a1e00077f8 100755 --- a/tests/qemu-iotests/201 +++ b/tests/qemu-iotests/201 @@ -82,6 +82,19 @@ echo silent=yes _send_qemu_cmd $dest 'migrate_set_capability postcopy-ram on' "(qemu)" + +qemu_error_no_exit=yes success_or_failure=yes \ + _send_qemu_cmd $dest '' "(qemu)" "Postcopy is not supported" +if [ ${QEMU_STATUS[$dest]} -lt 0 ]; then + _send_qemu_cmd $dest '' "(qemu)" + + _send_qemu_cmd $src 'quit' "" + _send_qemu_cmd $dest 'quit' "" + wait=1 _cleanup_qemu + + _notrun 'Postcopy is not supported' +fi + _send_qemu_cmd $src 'migrate_set_capability postcopy-ram on' "(qemu)" _send_qemu_cmd $src "migrate -d unix:${MIG_SOCKET}" "(qemu)" From 6c6f24fd84895d03baa898bbc4324dd4ccc97071 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 15:29:21 +0200 Subject: [PATCH 08/21] block: Add COR filter driver This adds a simple copy-on-read 
filter driver. It relies on the already existing COR functionality in the central block layer code, which may be moved here once we no longer need it there. Signed-off-by: Max Reitz Message-id: 20180421132929.21610-2-mreitz@redhat.com Reviewed-by: Alberto Garcia Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- block/Makefile.objs | 2 +- block/copy-on-read.c | 171 +++++++++++++++++++++++++++++++++++++++++++ qapi/block-core.json | 5 +- 3 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 block/copy-on-read.c diff --git a/block/Makefile.objs b/block/Makefile.objs index d644bac60a65..899bfb5e2cf0 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -26,7 +26,7 @@ block-obj-y += accounting.o dirty-bitmap.o block-obj-y += write-threshold.o block-obj-y += backup.o block-obj-$(CONFIG_REPLICATION) += replication.o -block-obj-y += throttle.o +block-obj-y += throttle.o copy-on-read.o block-obj-y += crypto.o diff --git a/block/copy-on-read.c b/block/copy-on-read.c new file mode 100644 index 000000000000..823ec751c403 --- /dev/null +++ b/block/copy-on-read.c @@ -0,0 +1,171 @@ +/* + * Copy-on-read filter block driver + * + * Copyright (c) 2018 Red Hat, Inc. + * + * Author: + * Max Reitz + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 or + * (at your option) version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#include "qemu/osdep.h" +#include "block/block_int.h" + + +static int cor_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false, + errp); + if (!bs->file) { + return -EINVAL; + } + + bs->supported_write_flags = BDRV_REQ_FUA & + bs->file->bs->supported_write_flags; + + bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags; + + return 0; +} + + +static void cor_close(BlockDriverState *bs) +{ +} + + +#define PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \ + | BLK_PERM_WRITE \ + | BLK_PERM_RESIZE) +#define PERM_UNCHANGED (BLK_PERM_ALL & ~PERM_PASSTHROUGH) + +static void cor_child_perm(BlockDriverState *bs, BdrvChild *c, + const BdrvChildRole *role, + BlockReopenQueue *reopen_queue, + uint64_t perm, uint64_t shared, + uint64_t *nperm, uint64_t *nshared) +{ + if (c == NULL) { + *nperm = (perm & PERM_PASSTHROUGH) | BLK_PERM_WRITE_UNCHANGED; + *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED; + return; + } + + *nperm = (perm & PERM_PASSTHROUGH) | + (c->perm & PERM_UNCHANGED); + *nshared = (shared & PERM_PASSTHROUGH) | + (c->shared_perm & PERM_UNCHANGED); +} + + +static int64_t cor_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file->bs); +} + + +static int cor_truncate(BlockDriverState *bs, int64_t offset, + PreallocMode prealloc, Error **errp) +{ + return bdrv_truncate(bs->file, offset, prealloc, errp); +} + + +static int coroutine_fn cor_co_preadv(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + return bdrv_co_preadv(bs->file, offset, bytes, qiov, + flags | BDRV_REQ_COPY_ON_READ); +} + + +static int coroutine_fn cor_co_pwritev(BlockDriverState *bs, + uint64_t offset, uint64_t bytes, + QEMUIOVector *qiov, int flags) +{ + + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +} + + +static int coroutine_fn cor_co_pwrite_zeroes(BlockDriverState *bs, + 
int64_t offset, int bytes, + BdrvRequestFlags flags) +{ + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); +} + + +static int coroutine_fn cor_co_pdiscard(BlockDriverState *bs, + int64_t offset, int bytes) +{ + return bdrv_co_pdiscard(bs->file->bs, offset, bytes); +} + + +static void cor_eject(BlockDriverState *bs, bool eject_flag) +{ + bdrv_eject(bs->file->bs, eject_flag); +} + + +static void cor_lock_medium(BlockDriverState *bs, bool locked) +{ + bdrv_lock_medium(bs->file->bs, locked); +} + + +static bool cor_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); +} + + +BlockDriver bdrv_copy_on_read = { + .format_name = "copy-on-read", + + .bdrv_open = cor_open, + .bdrv_close = cor_close, + .bdrv_child_perm = cor_child_perm, + + .bdrv_getlength = cor_getlength, + .bdrv_truncate = cor_truncate, + + .bdrv_co_preadv = cor_co_preadv, + .bdrv_co_pwritev = cor_co_pwritev, + .bdrv_co_pwrite_zeroes = cor_co_pwrite_zeroes, + .bdrv_co_pdiscard = cor_co_pdiscard, + + .bdrv_eject = cor_eject, + .bdrv_lock_medium = cor_lock_medium, + + .bdrv_co_block_status = bdrv_co_block_status_from_file, + + .bdrv_recurse_is_first_non_filter = cor_recurse_is_first_non_filter, + + .has_variable_length = true, + .is_filter = true, +}; + +static void bdrv_copy_on_read_init(void) +{ + bdrv_register(&bdrv_copy_on_read); +} + +block_init(bdrv_copy_on_read_init); diff --git a/qapi/block-core.json b/qapi/block-core.json index 17ffd44cce3b..55728cb82327 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2510,11 +2510,12 @@ # @vxhs: Since 2.10 # @throttle: Since 2.11 # @nvme: Since 2.12 +# @copy-on-read: Since 2.13 # # Since: 2.9 ## { 'enum': 'BlockdevDriver', - 'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop', + 'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop', 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom', 'host_device', 'http', 'https', 
'iscsi', 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed', @@ -3531,6 +3532,7 @@ 'blkverify': 'BlockdevOptionsBlkverify', 'bochs': 'BlockdevOptionsGenericFormat', 'cloop': 'BlockdevOptionsGenericFormat', + 'copy-on-read':'BlockdevOptionsGenericFormat', 'dmg': 'BlockdevOptionsGenericFormat', 'file': 'BlockdevOptionsFile', 'ftp': 'BlockdevOptionsCurlFtp', @@ -4058,6 +4060,7 @@ 'blkverify': 'BlockdevCreateNotSupported', 'bochs': 'BlockdevCreateNotSupported', 'cloop': 'BlockdevCreateNotSupported', + 'copy-on-read': 'BlockdevCreateNotSupported', 'dmg': 'BlockdevCreateNotSupported', 'file': 'BlockdevCreateOptionsFile', 'ftp': 'BlockdevCreateNotSupported', From 24b7c538fea86b598e2a335f4805a0ab50a30e98 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 15:29:22 +0200 Subject: [PATCH 09/21] block: BLK_PERM_WRITE includes ..._UNCHANGED Currently we never actually check whether the WRITE_UNCHANGED permission has been taken for unchanging writes. But the one check that is commented out checks both WRITE and WRITE_UNCHANGED; and considering that WRITE_UNCHANGED is already documented as being weaker than WRITE, we should probably explicitly document WRITE to include WRITE_UNCHANGED. Signed-off-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Alberto Garcia Message-id: 20180421132929.21610-3-mreitz@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- include/block/block.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/block/block.h b/include/block/block.h index cdec3639a353..397b5e8d44b5 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -205,6 +205,9 @@ enum { * This permission (which is weaker than BLK_PERM_WRITE) is both enough and * required for writes to the block node when the caller promises that * the visible disk content doesn't change. + * + * As the BLK_PERM_WRITE permission is strictly stronger, either is + * sufficient to perform an unchanging write. 
*/ BLK_PERM_WRITE_UNCHANGED = 0x04, From c6035964f8316b504060618d05b5dd434f18595b Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 15:29:23 +0200 Subject: [PATCH 10/21] block: Add BDRV_REQ_WRITE_UNCHANGED flag This flag signifies that a write request will not change the visible disk content. With this flag set, it is sufficient to have the BLK_PERM_WRITE_UNCHANGED permission instead of BLK_PERM_WRITE. Signed-off-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Alberto Garcia Message-id: 20180421132929.21610-4-mreitz@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- block/io.c | 6 +++++- include/block/block.h | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index 4fad5ac2fef6..9e8449e795b2 100644 --- a/block/io.c +++ b/block/io.c @@ -1504,7 +1504,11 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child, assert(!waited || !req->serialising); assert(req->overlap_offset <= offset); assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - assert(child->perm & BLK_PERM_WRITE); + if (flags & BDRV_REQ_WRITE_UNCHANGED) { + assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE)); + } else { + assert(child->perm & BLK_PERM_WRITE); + } assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE); ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); diff --git a/include/block/block.h b/include/block/block.h index 397b5e8d44b5..3894edda9de1 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -54,8 +54,12 @@ typedef enum { BDRV_REQ_FUA = 0x10, BDRV_REQ_WRITE_COMPRESSED = 0x20, + /* Signifies that this write request will not change the visible disk + * content. 
*/ + BDRV_REQ_WRITE_UNCHANGED = 0x40, + /* Mask of valid flags */ - BDRV_REQ_MASK = 0x3f, + BDRV_REQ_MASK = 0x7f, } BdrvRequestFlags; typedef struct BlockSizes { From 7adcf59fecf3c8ce9330430187350b53f9e50cf7 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 15:29:24 +0200 Subject: [PATCH 11/21] block: Set BDRV_REQ_WRITE_UNCHANGED for COR writes Signed-off-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Alberto Garcia Message-id: 20180421132929.21610-5-mreitz@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- block/io.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index 9e8449e795b2..ca96b487eb84 100644 --- a/block/io.c +++ b/block/io.c @@ -1118,13 +1118,15 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, /* FIXME: Should we (perhaps conditionally) be setting * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy * that still correctly reads as zero? */ - ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0); + ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, + BDRV_REQ_WRITE_UNCHANGED); } else { /* This does not change the data on the disk, it is not * necessary to flush even in cache=writethrough mode. */ ret = bdrv_driver_pwritev(bs, cluster_offset, pnum, - &local_qiov, 0); + &local_qiov, + BDRV_REQ_WRITE_UNCHANGED); } if (ret < 0) { From 1b1a920b713af6af795d49d0e3d2a8a65020bf82 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 15:29:25 +0200 Subject: [PATCH 12/21] block/quorum: Support BDRV_REQ_WRITE_UNCHANGED We just need to forward it to quorum's children (except in case of a rewrite because of corruption), but for that we first have to support flags in child requests at all. 
Signed-off-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Alberto Garcia Message-id: 20180421132929.21610-6-mreitz@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- block/quorum.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/block/quorum.c b/block/quorum.c index a5051da56e35..e448d7e384c0 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -115,6 +115,7 @@ struct QuorumAIOCB { /* Request metadata */ uint64_t offset; uint64_t bytes; + int flags; QEMUIOVector *qiov; /* calling IOV */ @@ -157,7 +158,8 @@ static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs, QEMUIOVector *qiov, uint64_t offset, - uint64_t bytes) + uint64_t bytes, + int flags) { BDRVQuorumState *s = bs->opaque; QuorumAIOCB *acb = g_new(QuorumAIOCB, 1); @@ -168,6 +170,7 @@ static QuorumAIOCB *quorum_aio_get(BlockDriverState *bs, .bs = bs, .offset = offset, .bytes = bytes, + .flags = flags, .qiov = qiov, .votes.compare = quorum_sha256_compare, .votes.vote_list = QLIST_HEAD_INITIALIZER(acb.votes.vote_list), @@ -271,9 +274,11 @@ static void quorum_rewrite_entry(void *opaque) BDRVQuorumState *s = acb->bs->opaque; /* Ignore any errors, it's just a correction attempt for already - * corrupted data. */ + * corrupted data. + * Mask out BDRV_REQ_WRITE_UNCHANGED because this overwrites the + * area with different data from the other children. 
*/ bdrv_co_pwritev(s->children[co->idx], acb->offset, acb->bytes, - acb->qiov, 0); + acb->qiov, acb->flags & ~BDRV_REQ_WRITE_UNCHANGED); /* Wake up the caller after the last rewrite */ acb->rewrite_count--; @@ -673,7 +678,7 @@ static int quorum_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes); + QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags); int ret; acb->is_read = true; @@ -699,7 +704,7 @@ static void write_quorum_entry(void *opaque) sacb->bs = s->children[i]->bs; sacb->ret = bdrv_co_pwritev(s->children[i], acb->offset, acb->bytes, - acb->qiov, 0); + acb->qiov, acb->flags); if (sacb->ret == 0) { acb->success_count++; } else { @@ -719,7 +724,7 @@ static int quorum_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes); + QuorumAIOCB *acb = quorum_aio_get(bs, qiov, offset, bytes, flags); int i, ret; for (i = 0; i < s->num_children; i++) { @@ -961,6 +966,8 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, } s->next_child_index = s->num_children; + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + g_free(opened); goto exit; From 228345bf5db8bc97d1c64f062e138d389065d1ab Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 15:29:26 +0200 Subject: [PATCH 13/21] block: Support BDRV_REQ_WRITE_UNCHANGED in filters Update the rest of the filter drivers to support BDRV_REQ_WRITE_UNCHANGED. They already forward write request flags to their children, so we just have to announce support for it. 
This patch does not cover the replication driver because that currently does not support flags at all, and because it just grabs the WRITE permission for its children when it can, so we should be fine just submitting the incoming WRITE_UNCHANGED requests as normal writes. It also does not cover format drivers for similar reasons. They all use bdrv_format_default_perms() as their .bdrv_child_perm() implementation so they just always grab the WRITE permission for their file children whenever possible. In addition, it often would be difficult to ascertain whether incoming unchanging writes end up as unchanging writes in their files. So we just leave them as normal potentially changing writes. Signed-off-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Alberto Garcia Message-id: 20180421132929.21610-7-mreitz@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- block/blkdebug.c | 9 +++++---- block/blkreplay.c | 3 +++ block/blkverify.c | 3 +++ block/copy-on-read.c | 10 ++++++---- block/mirror.c | 2 ++ block/raw-format.c | 9 +++++---- block/throttle.c | 6 ++++-- 7 files changed, 28 insertions(+), 14 deletions(-) diff --git a/block/blkdebug.c b/block/blkdebug.c index 053372c22ea2..526af2a80843 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -398,10 +398,11 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, goto out; } - bs->supported_write_flags = BDRV_REQ_FUA & - bs->file->bs->supported_write_flags; - bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & - bs->file->bs->supported_zero_flags; + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | + (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags); ret = -EINVAL; /* Set alignment overrides */ diff --git a/block/blkreplay.c b/block/blkreplay.c index fe5a9b4a984d..b016dbeee75a 100755 --- a/block/blkreplay.c +++ 
b/block/blkreplay.c @@ -35,6 +35,9 @@ static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED; + ret = 0; fail: return ret; diff --git a/block/blkverify.c b/block/blkverify.c index 754cc9e85769..da97ee592767 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -141,6 +141,9 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED; + ret = 0; fail: qemu_opts_del(opts); diff --git a/block/copy-on-read.c b/block/copy-on-read.c index 823ec751c403..6a972088889f 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -33,11 +33,13 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } - bs->supported_write_flags = BDRV_REQ_FUA & - bs->file->bs->supported_write_flags; + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | + (BDRV_REQ_FUA & + bs->file->bs->supported_write_flags); - bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & - bs->file->bs->supported_zero_flags; + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags); return 0; } diff --git a/block/mirror.c b/block/mirror.c index 6aa38db11400..a4197bb975c6 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -1134,6 +1134,8 @@ static void mirror_start_job(const char *job_id, BlockDriverState *bs, mirror_top_bs->implicit = true; } mirror_top_bs->total_sectors = bs->total_sectors; + mirror_top_bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED; + mirror_top_bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED; bdrv_set_aio_context(mirror_top_bs, bdrv_get_aio_context(bs)); /* bdrv_append takes ownership of the mirror_top_bs reference, need to keep diff --git a/block/raw-format.c 
b/block/raw-format.c index a378547c9983..fe33693a2da2 100644 --- a/block/raw-format.c +++ b/block/raw-format.c @@ -415,10 +415,11 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, } bs->sg = bs->file->bs->sg; - bs->supported_write_flags = BDRV_REQ_FUA & - bs->file->bs->supported_write_flags; - bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & - bs->file->bs->supported_zero_flags; + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | + (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) & + bs->file->bs->supported_zero_flags); if (bs->probed && !bdrv_is_read_only(bs)) { fprintf(stderr, diff --git a/block/throttle.c b/block/throttle.c index 95ed06acd8dd..e298827f95e2 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -81,8 +81,10 @@ static int throttle_open(BlockDriverState *bs, QDict *options, if (!bs->file) { return -EINVAL; } - bs->supported_write_flags = bs->file->bs->supported_write_flags; - bs->supported_zero_flags = bs->file->bs->supported_zero_flags; + bs->supported_write_flags = bs->file->bs->supported_write_flags | + BDRV_REQ_WRITE_UNCHANGED; + bs->supported_zero_flags = bs->file->bs->supported_zero_flags | + BDRV_REQ_WRITE_UNCHANGED; return throttle_configure_tgm(bs, tgm, options, errp); } From 5fdc0b73eb68d107944cfa65185fb155b511e496 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 15:29:27 +0200 Subject: [PATCH 14/21] iotests: Clean up wrap image in 197 Signed-off-by: Max Reitz Reviewed-by: Stefan Hajnoczi Reviewed-by: Alberto Garcia Message-id: 20180421132929.21610-8-mreitz@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- tests/qemu-iotests/197 | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/qemu-iotests/197 b/tests/qemu-iotests/197 index 5e869fe2b776..3ae4975eecdd 100755 --- a/tests/qemu-iotests/197 +++ b/tests/qemu-iotests/197 @@ -44,6 +44,7 @@ esac _cleanup() { 
_cleanup_test_img + rm -f "$TEST_WRAP" rm -f "$BLKDBG_CONF" } trap "_cleanup; exit \$status" 0 1 2 3 15 From a62cbac4ce2db79c14ff299e98ee556b57467c19 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 15:29:28 +0200 Subject: [PATCH 15/21] iotests: Copy 197 for COR filter driver iotest 197 tests copy-on-read using the (now old) copy-on-read flag. Copy it to 215 and modify it to use the COR filter driver instead. Signed-off-by: Max Reitz Message-id: 20180421132929.21610-9-mreitz@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- tests/qemu-iotests/215 | 120 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/215.out | 26 ++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 147 insertions(+) create mode 100755 tests/qemu-iotests/215 create mode 100644 tests/qemu-iotests/215.out diff --git a/tests/qemu-iotests/215 b/tests/qemu-iotests/215 new file mode 100755 index 000000000000..2e616ed659bf --- /dev/null +++ b/tests/qemu-iotests/215 @@ -0,0 +1,120 @@ +#!/bin/bash +# +# Test case for copy-on-read into qcow2, using the COR filter driver +# +# Copyright (C) 2018 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# + +seq="$(basename $0)" +echo "QA output created by $seq" + +here="$PWD" +status=1 # failure is the default! + +# get standard environment, filters and checks +. ./common.rc +. 
./common.filter + +TEST_WRAP="$TEST_DIR/t.wrap.qcow2" +BLKDBG_CONF="$TEST_DIR/blkdebug.conf" + +# Sanity check: our use of blkdebug fails if $TEST_DIR contains spaces +# or other problems +case "$TEST_DIR" in + *[^-_a-zA-Z0-9/]*) + _notrun "Suspicious TEST_DIR='$TEST_DIR', cowardly refusing to run" ;; +esac + +_cleanup() +{ + _cleanup_test_img + rm -f "$TEST_WRAP" + rm -f "$BLKDBG_CONF" +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# Test is supported for any backing file; but we force qcow2 for our wrapper. +_supported_fmt generic +_supported_proto generic +_supported_os Linux +# LUKS support may be possible, but it complicates things. +_unsupported_fmt luks + +echo +echo '=== Copy-on-read ===' +echo + +# Prep the images +# VPC rounds image sizes to a specific geometry, force a specific size. +if [ "$IMGFMT" = "vpc" ]; then + IMGOPTS=$(_optstr_add "$IMGOPTS" "force_size") +fi +_make_test_img 4G +$QEMU_IO -c "write -P 55 3G 1k" "$TEST_IMG" | _filter_qemu_io +IMGPROTO=file IMGFMT=qcow2 IMGOPTS= TEST_IMG_FILE="$TEST_WRAP" \ + _make_test_img -F "$IMGFMT" -b "$TEST_IMG" | _filter_img_create +$QEMU_IO -f qcow2 -c "write -z -u 1M 64k" "$TEST_WRAP" | _filter_qemu_io + +# Ensure that a read of two clusters, but where one is already allocated, +# does not re-write the allocated cluster +cat > "$BLKDBG_CONF" <&1 | _filter_qemu_io) +case $output in + *allocate*) + _notrun "Insufficent memory to run test" ;; + *) printf '%s\n' "$output" ;; +esac +$QEMU_IO \ + -c "open -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \ + -c "read -P 0 $((3*1024*1024*1024 + 1024)) 1k" \ + | _filter_qemu_io + +# Copy-on-read is incompatible with read-only +$QEMU_IO \ + -c "open -r -o driver=copy-on-read,file.driver=qcow2 $TEST_WRAP" \ + 2>&1 | _filter_testdir + +# Break the backing chain, and show that images are identical, and that +# we properly copied over explicit zeros. 
+$QEMU_IMG rebase -u -b "" -f qcow2 "$TEST_WRAP" +$QEMU_IO -f qcow2 -c map "$TEST_WRAP" +_check_test_img +$QEMU_IMG compare -f $IMGFMT -F qcow2 "$TEST_IMG" "$TEST_WRAP" + +# success, all done +echo '*** done' +status=0 diff --git a/tests/qemu-iotests/215.out b/tests/qemu-iotests/215.out new file mode 100644 index 000000000000..70b0f5fb1956 --- /dev/null +++ b/tests/qemu-iotests/215.out @@ -0,0 +1,26 @@ +QA output created by 215 + +=== Copy-on-read === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4294967296 +wrote 1024/1024 bytes at offset 3221225472 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +Formatting 'TEST_DIR/t.wrap.IMGFMT', fmt=IMGFMT size=4294967296 backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT +wrote 65536/65536 bytes at offset 1048576 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 131072/131072 bytes at offset 1048576 +128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 0/0 bytes at offset 0 +0 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 2147483136/2147483136 bytes at offset 1024 +2 GiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1024/1024 bytes at offset 3221226496 +1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +can't open device TEST_DIR/t.wrap.qcow2: Block node is read-only +2 GiB (0x80010000) bytes allocated at offset 0 bytes (0x0) +1023.938 MiB (0x3fff0000) bytes not allocated at offset 2 GiB (0x80010000) +64 KiB (0x10000) bytes allocated at offset 3 GiB (0xc0000000) +1023.938 MiB (0x3fff0000) bytes not allocated at offset 3 GiB (0xc0010000) +No errors were found on the image. +Images are identical. 
+*** done diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index aed024af05b3..b59bcea6407a 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -213,4 +213,5 @@ 212 rw auto quick 213 rw auto quick 214 rw auto +215 rw auto quick 218 rw auto quick From 3e7a95feb9b5d66cff7fee38b3c423135ed245f6 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 15:29:29 +0200 Subject: [PATCH 16/21] iotests: Add test for COR across nodes COR across nodes (that is, you have some filter node between the actual COR target and the node that performs the COR) cannot reliably work together with the permission system when there is no explicit COR node that can request the WRITE_UNCHANGED permission for its child. This is because COR (currently) sneaks its requests by the usual permission checks, so it can work without a WRITE* permission; but if there is a filter node in between, that will re-issue the request, which then passes through the usual check -- and if nobody has requested a WRITE_UNCHANGED permission, that check will fail. There is no real direct fix apart from hoping that there is someone who has requested that permission; in case of just the qemu-io HMP command (and no guest device), however, that is not the case. The real real fix is to implement the copy-on-read flag through an implicitly added COR node. Such a node can request the necessary permissions as shown in this test.
Signed-off-by: Max Reitz Message-id: 20180421132929.21610-10-mreitz@redhat.com Reviewed-by: Kevin Wolf Signed-off-by: Max Reitz --- tests/qemu-iotests/216 | 115 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/216.out | 28 +++++++++ tests/qemu-iotests/group | 1 + 3 files changed, 144 insertions(+) create mode 100755 tests/qemu-iotests/216 create mode 100644 tests/qemu-iotests/216.out diff --git a/tests/qemu-iotests/216 b/tests/qemu-iotests/216 new file mode 100755 index 000000000000..ca9b47a7fd5b --- /dev/null +++ b/tests/qemu-iotests/216 @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# +# Copy-on-read tests using a COR filter node +# +# Copyright (C) 2018 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# Creator/Owner: Max Reitz + +import iotests +from iotests import log, qemu_img_pipe, qemu_io, filter_qemu_io + +# Need backing file support +iotests.verify_image_format(supported_fmts=['qcow2', 'qcow', 'qed', 'vmdk']) +iotests.verify_platform(['linux']) + +log('') +log('=== Copy-on-read across nodes ===') +log('') + +# The old copy-on-read mechanism without a filter node cannot request +# WRITE_UNCHANGED permissions for its child. Therefore it just tries +# to sneak its write by the usual permission system and holds its +# fingers crossed. 
However, that sneaking does not work so well when +# there is a filter node in the way: That will receive the write +# request and re-issue a new one to its child, which this time is a +# proper write request that will make the permission system cough -- +# unless there is someone at the top (like a guest device) that has +# requested write permissions. +# +# A COR filter node, however, can request the proper permissions for +# its child and therefore is not hit by this issue. + +with iotests.FilePath('base.img') as base_img_path, \ + iotests.FilePath('top.img') as top_img_path, \ + iotests.VM() as vm: + + log('--- Setting up images ---') + log('') + + qemu_img_pipe('create', '-f', iotests.imgfmt, base_img_path, '64M') + + log(filter_qemu_io(qemu_io(base_img_path, '-c', 'write -P 1 0M 1M'))) + + qemu_img_pipe('create', '-f', iotests.imgfmt, '-b', base_img_path, + top_img_path) + + log(filter_qemu_io(qemu_io(top_img_path, '-c', 'write -P 2 1M 1M'))) + + log('') + log('--- Doing COR ---') + log('') + + # Compare with e.g. the following: + # vm.add_drive_raw('if=none,node-name=node0,copy-on-read=on,driver=raw,' \ + # 'file.driver=%s,file.file.filename=%s' % + # (iotests.imgfmt, top_img_path)) + # (Remove the blockdev-add instead.) 
+ # ((Not tested here because it hits an assertion in the permission + # system.)) + + vm.launch() + + log(vm.qmp('blockdev-add', + node_name='node0', + driver='copy-on-read', + file={ + 'driver': 'raw', + 'file': { + 'driver': 'copy-on-read', + 'file': { + 'driver': 'raw', + 'file': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': top_img_path + }, + 'backing': { + 'driver': iotests.imgfmt, + 'file': { + 'driver': 'file', + 'filename': base_img_path + } + } + } + } + } + })) + + # Trigger COR + log(vm.qmp('human-monitor-command', + command_line='qemu-io node0 "read 0 64M"')) + + vm.shutdown() + + log('') + log('--- Checking COR result ---') + log('') + + log(filter_qemu_io(qemu_io(base_img_path, '-c', 'discard 0 64M'))) + log(filter_qemu_io(qemu_io(top_img_path, '-c', 'read -P 1 0M 1M'))) + log(filter_qemu_io(qemu_io(top_img_path, '-c', 'read -P 2 1M 1M'))) diff --git a/tests/qemu-iotests/216.out b/tests/qemu-iotests/216.out new file mode 100644 index 000000000000..d3fc590d2966 --- /dev/null +++ b/tests/qemu-iotests/216.out @@ -0,0 +1,28 @@ + +=== Copy-on-read across nodes === + +--- Setting up images --- + +wrote 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +wrote 1048576/1048576 bytes at offset 1048576 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + + +--- Doing COR --- + +{u'return': {}} +{u'return': u''} + +--- Checking COR result --- + +discard 67108864/67108864 bytes at offset 0 +64 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +read 1048576/1048576 bytes at offset 0 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +read 1048576/1048576 bytes at offset 1048576 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group index b59bcea6407a..cc8cd8cc8e75 100644 --- a/tests/qemu-iotests/group +++ b/tests/qemu-iotests/group @@ -214,4 +214,5 @@ 213 rw auto quick 214 rw auto 215 rw auto quick +216 rw 
auto quick 218 rw auto quick From 5279b30392da7a3248b320c75f20c61e3a95863c Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Sat, 21 Apr 2018 18:39:57 +0200 Subject: [PATCH 17/21] qemu-img: Check post-truncation size Some block drivers (iscsi and file-posix when dealing with device files) do not actually support truncation, even though they provide a .bdrv_truncate() method and will happily return success when providing a new size that does not exceed the current size. This is because these drivers expect the user to resize the image outside of qemu and then provide qemu with that information through the block_resize command (compare cb1b83e740384b4e0d950f3d7c81c02b8ce86c2e). Of course, anyone using qemu-img resize will find that behavior useless. So we should check the actual size of the image after the supposedly successful truncation took place, emit an error if nothing changed and emit a warning if the target size was not met. Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1523065 Signed-off-by: Max Reitz Message-id: 20180421163957.29872-1-mreitz@redhat.com Reviewed-by: Stefan Hajnoczi Signed-off-by: Max Reitz --- qemu-img.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index ea62d2d61e85..62b29e7febae 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -3381,7 +3381,7 @@ static int img_resize(int argc, char **argv) Error *err = NULL; int c, ret, relative; const char *filename, *fmt, *size; - int64_t n, total_size, current_size; + int64_t n, total_size, current_size, new_size; bool quiet = false; BlockBackend *blk = NULL; PreallocMode prealloc = PREALLOC_MODE_OFF; @@ -3557,11 +3557,42 @@ static int img_resize(int argc, char **argv) } ret = blk_truncate(blk, total_size, prealloc, &err); - if (!ret) { - qprintf(quiet, "Image resized.\n"); - } else { + if (ret < 0) { error_report_err(err); + goto out; + } + + new_size = blk_getlength(blk); + if (new_size < 0) { + error_report("Failed 
to verify truncated image length: %s", + strerror(-new_size)); + ret = -1; + goto out; } + + /* Some block drivers implement a truncation method, but only so + * the user can cause qemu to refresh the image's size from disk. + * The idea is that the user resizes the image outside of qemu and + * then invokes block_resize to inform qemu about it. + * (This includes iscsi and file-posix for device files.) + * Of course, that is not the behavior someone invoking + * qemu-img resize would find useful, so we catch that behavior + * here and tell the user. */ + if (new_size != total_size && new_size == current_size) { + error_report("Image was not resized; resizing may not be supported " + "for this image"); + ret = -1; + goto out; + } + + if (new_size != total_size) { + warn_report("Image should have been resized to %" PRIi64 + " bytes, but was resized to %" PRIi64 " bytes", + total_size, new_size); + } + + qprintf(quiet, "Image resized.\n"); + out: blk_unref(blk); if (ret) { From c1e3489dfaa01d215e37b1412759b856c33d44ed Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Wed, 2 May 2018 16:03:59 +0200 Subject: [PATCH 18/21] block: Document BDRV_REQ_WRITE_UNCHANGED support Add BDRV_REQ_WRITE_UNCHANGED to the list of flags honored during pwrite and pwrite_zeroes, and also add a note on when you absolutely need to support it. Signed-off-by: Max Reitz Message-id: 20180502140359.18222-1-mreitz@redhat.com Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- include/block/block_int.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/include/block/block_int.h b/include/block/block_int.h index e3d6219f4e37..76b589da576a 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -656,10 +656,24 @@ struct BlockDriverState { /* I/O Limits */ BlockLimits bl; - /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */ + /* Flags honored during pwrite (so far: BDRV_REQ_FUA, + * BDRV_REQ_WRITE_UNCHANGED). 
+ * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those + * writes will be issued as normal writes without the flag set. + * This is important to note for drivers that do not explicitly + * request a WRITE permission for their children and instead take + * the same permissions as their parent did (this is commonly what + * block filters do). Such drivers have to be aware that the + * parent may have taken a WRITE_UNCHANGED permission only and is + * issuing such requests. Drivers either must make sure that + * these requests do not result in plain WRITE accesses (usually + * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding + * every incoming write request as-is, including potentially that + * flag), or they have to explicitly take the WRITE permission for + * their children. */ unsigned int supported_write_flags; /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA, - * BDRV_REQ_MAY_UNMAP) */ + * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */ unsigned int supported_zero_flags; /* the following member gives a name to every node on the bs graph. */ From 2a01c01f9ecb43af4c0a85fe6adc429ffc9c31b5 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Wed, 2 May 2018 22:20:49 +0200 Subject: [PATCH 19/21] qemu-io: Use purely string blockdev options Currently, qemu-io only uses string-valued blockdev options (as all are converted directly from QemuOpts) -- with one exception: -U adds the force-share option as a boolean. This in itself is already a bit questionable, but a real issue is that it also assumes the value already existing in the options QDict would be a boolean, which is wrong. That has the following effect: $ ./qemu-io -r -U --image-opts \ driver=file,filename=/dev/null,force-share=off [1] 15200 segmentation fault (core dumped) ./qemu-io -r -U --image-opts driver=file,filename=/dev/null,force-share=off Since @opts is converted from QemuOpts, the value must be a string, and we have to compare it as such. 
Consequently, it makes sense to also set it as a string instead of a boolean. Cc: qemu-stable@nongnu.org Signed-off-by: Max Reitz Message-id: 20180502202051.15493-2-mreitz@redhat.com Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- qemu-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qemu-io.c b/qemu-io.c index 72fee0d8b728..73c638ff8b8d 100644 --- a/qemu-io.c +++ b/qemu-io.c @@ -95,12 +95,12 @@ static int openfile(char *name, int flags, bool writethrough, bool force_share, opts = qdict_new(); } if (qdict_haskey(opts, BDRV_OPT_FORCE_SHARE) - && !qdict_get_bool(opts, BDRV_OPT_FORCE_SHARE)) { + && strcmp(qdict_get_str(opts, BDRV_OPT_FORCE_SHARE), "on")) { error_report("-U conflicts with image options"); qobject_unref(opts); return 1; } - qdict_put_bool(opts, BDRV_OPT_FORCE_SHARE, true); + qdict_put_str(opts, BDRV_OPT_FORCE_SHARE, "on"); } qemuio_blk = blk_new_open(name, NULL, opts, flags, &local_err); if (!qemuio_blk) { From 4615f87832d2fcb7a544bedeece2741bf8c21f94 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Wed, 2 May 2018 22:20:50 +0200 Subject: [PATCH 20/21] qemu-img: Use only string options in img_open_opts img_open_opts() takes a QemuOpts and converts them to a QDict, so all values therein are strings. Then it may try to call qdict_get_bool(), however, which will fail with a segmentation fault every time: $ ./qemu-img info -U --image-opts \ driver=file,filename=/dev/null,force-share=off [1] 27869 segmentation fault (core dumped) ./qemu-img info -U --image-opts driver=file,filename=/dev/null,force-share=off Fix this by using qdict_get_str() and comparing the value as a string. Also, when adding a force-share value to the QDict, add it as a string so it fits the rest of the dict. 
Cc: qemu-stable@nongnu.org Signed-off-by: Max Reitz Message-id: 20180502202051.15493-3-mreitz@redhat.com Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- qemu-img.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 62b29e7febae..60e45ec10365 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -277,12 +277,12 @@ static BlockBackend *img_open_opts(const char *optstr, options = qemu_opts_to_qdict(opts, NULL); if (force_share) { if (qdict_haskey(options, BDRV_OPT_FORCE_SHARE) - && !qdict_get_bool(options, BDRV_OPT_FORCE_SHARE)) { + && strcmp(qdict_get_str(options, BDRV_OPT_FORCE_SHARE), "on")) { error_report("--force-share/-U conflicts with image options"); qobject_unref(options); return NULL; } - qdict_put_bool(options, BDRV_OPT_FORCE_SHARE, true); + qdict_put_str(options, BDRV_OPT_FORCE_SHARE, "on"); } blk = blk_new_open(NULL, NULL, options, flags, &local_err); if (!blk) { From 4e7d73c5fbd97e55ffe5af02f24d1f7dbe3bbf20 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Wed, 2 May 2018 22:20:51 +0200 Subject: [PATCH 21/21] iotests: Add test for -U/force-share conflicts Signed-off-by: Max Reitz Message-id: 20180502202051.15493-4-mreitz@redhat.com Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- tests/qemu-iotests/153 | 17 +++++++++++++++++ tests/qemu-iotests/153.out | 16 ++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/tests/qemu-iotests/153 b/tests/qemu-iotests/153 index a0fd81548301..ec508c758f55 100755 --- a/tests/qemu-iotests/153 +++ b/tests/qemu-iotests/153 @@ -242,6 +242,23 @@ _run_cmd $QEMU_IO "${TEST_IMG}" -c 'write 0 512' _cleanup_qemu +echo +echo "== Detecting -U and force-share conflicts ==" + +echo +echo 'No conflict:' +$QEMU_IMG info -U --image-opts driver=null-co,force-share=on +echo +echo 'Conflict:' +$QEMU_IMG info -U --image-opts driver=null-co,force-share=off + +echo +echo 'No conflict:' +$QEMU_IO -c 'open -r -U -o driver=null-co,force-share=on' +echo +echo 'Conflict:' +$QEMU_IO -c 'open -r 
-U -o driver=null-co,force-share=off' + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/153.out b/tests/qemu-iotests/153.out index bb721cb747c8..2510762ba1f0 100644 --- a/tests/qemu-iotests/153.out +++ b/tests/qemu-iotests/153.out @@ -399,4 +399,20 @@ Is another process using the image? Closing the other _qemu_io_wrapper TEST_DIR/t.qcow2 -c write 0 512 + +== Detecting -U and force-share conflicts == + +No conflict: +image: null-co:// +file format: null-co +virtual size: 1.0G (1073741824 bytes) +disk size: unavailable + +Conflict: +qemu-img: --force-share/-U conflicts with image options + +No conflict: + +Conflict: +-U conflicts with image options *** done