Skip to content

Commit

Permalink
quorum: Add the rewrite-corrupted parameter to quorum
Browse files Browse the repository at this point in the history
On read operations, when this parameter is set and some replicas are corrupted
but quorum can still be reached, quorum will rewrite the correct version of
the data to fix the corrupted replicas.

This is particularly useful with SSDs, where the FTL will remap the same block
to another place on rewrite.

Signed-off-by: Benoit Canet <benoit@irqsave.net>
Reviewed-by: Max Reitz <mreitz@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
  • Loading branch information
Benoît Canet authored and kevmw committed Jun 27, 2014
1 parent d1fde4a commit cf29a57
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 8 deletions.
97 changes: 91 additions & 6 deletions block/quorum.c
Expand Up @@ -23,6 +23,7 @@

#define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold"
#define QUORUM_OPT_BLKVERIFY "blkverify"
#define QUORUM_OPT_REWRITE "rewrite-corrupted"

/* This union holds a vote hash value */
typedef union QuorumVoteValue {
Expand Down Expand Up @@ -70,6 +71,9 @@ typedef struct BDRVQuorumState {
* It is useful to debug other block drivers by
* comparing them with a reference one.
*/
bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted
* block if Quorum is reached.
*/
} BDRVQuorumState;

typedef struct QuorumAIOCB QuorumAIOCB;
Expand Down Expand Up @@ -105,13 +109,17 @@ struct QuorumAIOCB {
int count; /* number of completed AIOCB */
int success_count; /* number of successfully completed AIOCB */

int rewrite_count; /* number of replica to rewrite: count down to
* zero once writes are fired
*/

QuorumVotes votes;

bool is_read;
int vote_ret;
};

static void quorum_vote(QuorumAIOCB *acb);
static bool quorum_vote(QuorumAIOCB *acb);

static void quorum_aio_cancel(BlockDriverAIOCB *blockacb)
{
Expand Down Expand Up @@ -183,6 +191,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
acb->qcrs = g_new0(QuorumChildRequest, s->num_children);
acb->count = 0;
acb->success_count = 0;
acb->rewrite_count = 0;
acb->votes.compare = quorum_sha256_compare;
QLIST_INIT(&acb->votes.vote_list);
acb->is_read = false;
Expand Down Expand Up @@ -232,11 +241,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb)
return false;
}

/*
 * Completion callback for a single corrective rewrite.
 *
 * @opaque: the QuorumAIOCB the rewrite was issued for
 * @ret:    completion status of the rewrite; it is not examined here
 *
 * Every invocation decrements the request's rewrite_count. The request is
 * finalized by the invocation that brings the counter down to zero.
 */
static void quorum_rewrite_aio_cb(void *opaque, int ret)
{
    QuorumAIOCB *parent = opaque;

    /* this rewrite is done: take it off the outstanding total */
    parent->rewrite_count -= 1;

    /* only the last rewrite to complete finishes the request */
    if (parent->rewrite_count == 0) {
        quorum_aio_finalize(parent);
    }
}

static void quorum_aio_cb(void *opaque, int ret)
{
QuorumChildRequest *sacb = opaque;
QuorumAIOCB *acb = sacb->parent;
BDRVQuorumState *s = acb->common.bs->opaque;
bool rewrite = false;

sacb->ret = ret;
acb->count++;
Expand All @@ -253,12 +278,15 @@ static void quorum_aio_cb(void *opaque, int ret)

/* Do the vote on read */
if (acb->is_read) {
quorum_vote(acb);
rewrite = quorum_vote(acb);
} else {
quorum_has_too_much_io_failed(acb);
}

quorum_aio_finalize(acb);
/* if no rewrite is done the code will finish right away */
if (!rewrite) {
quorum_aio_finalize(acb);
}
}

static void quorum_report_bad_versions(BDRVQuorumState *s,
Expand All @@ -278,6 +306,43 @@ static void quorum_report_bad_versions(BDRVQuorumState *s,
}
}

/*
 * Issue a write of acb->qiov to every child whose vote does not match the
 * given value.
 *
 * @s:     quorum driver state; s->bs[] is indexed by the vote item index
 * @acb:   the request whose vote list, sector range and qiov are used
 * @value: the vote value the other versions are compared against
 *
 * Returns true if at least one rewrite was submitted, false otherwise.
 * Each submitted write completes through quorum_rewrite_aio_cb(), which
 * counts acb->rewrite_count back down to zero.
 */
static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb,
                                        QuorumVoteValue *value)
{
    QuorumVoteVersion *ver;
    QuorumVoteItem *replica;
    int pending = 0;

    /* Pass 1: tally the bad replicas. The total has to be in place before
     * any write is submitted, so that the completion callback counts down
     * from the final value.
     */
    QLIST_FOREACH(ver, &acb->votes.vote_list, next) {
        if (!acb->votes.compare(&ver->value, value)) {
            QLIST_FOREACH(replica, &ver->items, next) {
                pending++;
            }
        }
    }

    /* the completion callback decrements this once per finished write */
    acb->rewrite_count = pending;

    /* Pass 2: submit one corrective write per bad replica */
    QLIST_FOREACH(ver, &acb->votes.vote_list, next) {
        if (!acb->votes.compare(&ver->value, value)) {
            QLIST_FOREACH(replica, &ver->items, next) {
                bdrv_aio_writev(s->bs[replica->index], acb->sector_num,
                                acb->qiov, acb->nb_sectors,
                                quorum_rewrite_aio_cb, acb);
            }
        }
    }

    /* tell the caller whether any rewrite is now in flight */
    return pending != 0;
}

static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
{
int i;
Expand Down Expand Up @@ -468,16 +533,17 @@ static int quorum_vote_error(QuorumAIOCB *acb)
return ret;
}

static void quorum_vote(QuorumAIOCB *acb)
static bool quorum_vote(QuorumAIOCB *acb)
{
bool quorum = true;
bool rewrite = false;
int i, j, ret;
QuorumVoteValue hash;
BDRVQuorumState *s = acb->common.bs->opaque;
QuorumVoteVersion *winner;

if (quorum_has_too_much_io_failed(acb)) {
return;
return false;
}

/* get the index of the first successful read */
Expand Down Expand Up @@ -505,7 +571,7 @@ static void quorum_vote(QuorumAIOCB *acb)
/* Every successful read agrees */
if (quorum) {
quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov);
return;
return false;
}

/* compute hashes for each successful read, also store indexes */
Expand Down Expand Up @@ -538,9 +604,15 @@ static void quorum_vote(QuorumAIOCB *acb)
/* some versions are bad print them */
quorum_report_bad_versions(s, acb, &winner->value);

/* corruption correction is enabled */
if (s->rewrite_corrupted) {
rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value);
}

free_exit:
/* free lists */
quorum_free_vote_list(&acb->votes);
return rewrite;
}

static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
Expand Down Expand Up @@ -705,6 +777,11 @@ static QemuOptsList quorum_runtime_opts = {
.type = QEMU_OPT_BOOL,
.help = "Trigger block verify mode if set",
},
{
.name = QUORUM_OPT_REWRITE,
.type = QEMU_OPT_BOOL,
.help = "Rewrite corrupted block on read quorum",
},
{ /* end of list */ }
},
};
Expand Down Expand Up @@ -766,6 +843,14 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
"and using two files with vote_threshold=2\n");
}

s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, false);
if (s->rewrite_corrupted && s->is_blkverify) {
error_setg(&local_err,
"rewrite-corrupted=on cannot be used with blkverify=on");
ret = -EINVAL;
goto exit;
}

/* allocate the children BlockDriverState array */
s->bs = g_new0(BlockDriverState *, s->num_children);
opened = g_new0(bool, s->num_children);
Expand Down
5 changes: 4 additions & 1 deletion qapi/block-core.json
Expand Up @@ -1329,12 +1329,15 @@
#
# @vote-threshold: the vote limit under which a read will fail
#
# @rewrite-corrupted: #optional rewrite corrupted data when quorum is reached
# (Since 2.1)
#
# Since: 2.0
##
{ 'type': 'BlockdevOptionsQuorum',
'data': { '*blkverify': 'bool',
'children': [ 'BlockdevRef' ],
'vote-threshold': 'int' } }
'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } }

##
# @BlockdevOptions
Expand Down
15 changes: 14 additions & 1 deletion tests/qemu-iotests/081
Expand Up @@ -133,16 +133,29 @@ run_qemu -drive "file=$TEST_DIR/2.raw,format=$IMGFMT,if=none,id=drive2" <<EOF
{ "execute": "quit" }
EOF

echo
echo "== using quorum rewrite corrupted mode =="

quorum="$quorum,file.rewrite-corrupted=on"

$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io

echo
echo "== checking that quorum has corrected the corrupted file =="

$QEMU_IO -c "read -P 0x32 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io

echo
echo "== breaking quorum =="

$QEMU_IO -c "write -P 0x41 0 $size" "$TEST_DIR/1.raw" | _filter_qemu_io
$QEMU_IO -c "write -P 0x42 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io

echo
echo "== checking that quorum is broken =="

$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io


# success, all done
echo "*** done"
rm -f $seq.full
Expand Down
10 changes: 10 additions & 0 deletions tests/qemu-iotests/081.out
Expand Up @@ -40,9 +40,19 @@ read 10485760/10485760 bytes at offset 0
{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "DEVICE_TRAY_MOVED", "data": {"device": "floppy0", "tray-open": true}}


== using quorum rewrite corrupted mode ==
read 10485760/10485760 bytes at offset 0
10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

== checking that quorum has corrected the corrupted file ==
read 10485760/10485760 bytes at offset 0
10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

== breaking quorum ==
wrote 10485760/10485760 bytes at offset 0
10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 10485760/10485760 bytes at offset 0
10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

== checking that quorum is broken ==
qemu-io: can't open: Could not read image for determining its format: Input/output error
Expand Down

0 comments on commit cf29a57

Please sign in to comment.