Skip to content

Commit

Permalink
md/raid10: close race that lose writes lost when replacement completes.
Browse files Browse the repository at this point in the history
When a replacement operation completes there is a small window
when the original device is marked 'faulty' and the replacement
still looks like a replacement.  The faulty should be removed and
the replacement moved in place very quickly, bit it isn't instant.

So the code write out to the array must handle the possibility that
the only working device for some slot in the replacement - but it
doesn't.  If the primary device is faulty it just gives up.  This
can lead to corruption.

So make the code more robust: if either  the primary or the
replacement is present and working, write to them.  Only when
neither are present do we give up.

This bug has been present since replacement was introduced in
3.3, so it is suitable for any -stable kernel since then.

Reported-by: "George Spelvin" <linux@horizon.com>
Cc: stable@vger.kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
  • Loading branch information
neilbrown committed Nov 22, 2012
1 parent ca64cae commit e7c0c3f
Showing 1 changed file with 68 additions and 61 deletions.
129 changes: 68 additions & 61 deletions drivers/md/raid10.c
Expand Up @@ -1334,18 +1334,21 @@ static void make_request(struct mddev *mddev, struct bio * bio)
blocked_rdev = rrdev;
break;
}
if (rdev && (test_bit(Faulty, &rdev->flags)
|| test_bit(Unmerged, &rdev->flags)))
rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)
|| test_bit(Unmerged, &rrdev->flags)))
rrdev = NULL;

r10_bio->devs[i].bio = NULL;
r10_bio->devs[i].repl_bio = NULL;
if (!rdev || test_bit(Faulty, &rdev->flags) ||
test_bit(Unmerged, &rdev->flags)) {

if (!rdev && !rrdev) {
set_bit(R10BIO_Degraded, &r10_bio->state);
continue;
}
if (test_bit(WriteErrorSeen, &rdev->flags)) {
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
int bad_sectors;
Expand Down Expand Up @@ -1387,8 +1390,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
max_sectors = good_sectors;
}
}
r10_bio->devs[i].bio = bio;
atomic_inc(&rdev->nr_pending);
if (rdev) {
r10_bio->devs[i].bio = bio;
atomic_inc(&rdev->nr_pending);
}
if (rrdev) {
r10_bio->devs[i].repl_bio = bio;
atomic_inc(&rrdev->nr_pending);
Expand Down Expand Up @@ -1444,69 +1449,71 @@ static void make_request(struct mddev *mddev, struct bio * bio)
for (i = 0; i < conf->copies; i++) {
struct bio *mbio;
int d = r10_bio->devs[i].devnum;
if (!r10_bio->devs[i].bio)
continue;
if (r10_bio->devs[i].bio) {
struct md_rdev *rdev = conf->mirrors[d].rdev;
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
max_sectors);
r10_bio->devs[i].bio = mbio;

mbio->bi_sector = (r10_bio->devs[i].addr+
choose_data_offset(r10_bio,
rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_private = r10_bio;

mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
max_sectors);
r10_bio->devs[i].bio = mbio;
atomic_inc(&r10_bio->remaining);

mbio->bi_sector = (r10_bio->devs[i].addr+
choose_data_offset(r10_bio,
conf->mirrors[d].rdev));
mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_private = r10_bio;
cb = blk_check_plugged(raid10_unplug, mddev,
sizeof(*plug));
if (cb)
plug = container_of(cb, struct raid10_plug_cb,
cb);
else
plug = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
if (plug) {
bio_list_add(&plug->pending, mbio);
plug->pending_cnt++;
} else {
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
}
spin_unlock_irqrestore(&conf->device_lock, flags);
if (!plug)
md_wakeup_thread(mddev->thread);
}

atomic_inc(&r10_bio->remaining);
if (r10_bio->devs[i].repl_bio) {
struct md_rdev *rdev = conf->mirrors[d].replacement;
if (rdev == NULL) {
/* Replacement just got moved to main 'rdev' */
smp_mb();
rdev = conf->mirrors[d].rdev;
}
mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
max_sectors);
r10_bio->devs[i].repl_bio = mbio;

mbio->bi_sector = (r10_bio->devs[i].addr +
choose_data_offset(
r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_private = r10_bio;

cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug));
if (cb)
plug = container_of(cb, struct raid10_plug_cb, cb);
else
plug = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
if (plug) {
bio_list_add(&plug->pending, mbio);
plug->pending_cnt++;
} else {
atomic_inc(&r10_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
spin_unlock_irqrestore(&conf->device_lock, flags);
if (!mddev_check_plugged(mddev))
md_wakeup_thread(mddev->thread);
}
spin_unlock_irqrestore(&conf->device_lock, flags);
if (!plug)
md_wakeup_thread(mddev->thread);

if (!r10_bio->devs[i].repl_bio)
continue;

mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
max_sectors);
r10_bio->devs[i].repl_bio = mbio;

/* We are actively writing to the original device
* so it cannot disappear, so the replacement cannot
* become NULL here
*/
mbio->bi_sector = (r10_bio->devs[i].addr +
choose_data_offset(
r10_bio,
conf->mirrors[d].replacement));
mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
mbio->bi_private = r10_bio;

atomic_inc(&r10_bio->remaining);
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
spin_unlock_irqrestore(&conf->device_lock, flags);
if (!mddev_check_plugged(mddev))
md_wakeup_thread(mddev->thread);
}

/* Don't remove the bias on 'remaining' (one_write_done) until
Expand Down

0 comments on commit e7c0c3f

Please sign in to comment.