diff --git a/include/sys/zio.h b/include/sys/zio.h
index 69b00d0f4029..278b138e6cea 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -440,6 +440,7 @@ struct zio {
 	uint64_t	io_child_count;
 	uint64_t	io_phys_children;
 	uint64_t	io_parent_count;
+	uint64_t	io_recursion_count;
 	uint64_t	*io_stall;
 	zio_t		*io_gang_leader;
 	zio_gang_node_t	*io_gang_tree;
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 34e4420da733..15d2b11ad38f 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -701,18 +701,11 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
 	vdev_queue_io_remove(vq, zio);
 
 	/*
-	 * If the I/O is or was optional and therefore has no data, we need to
-	 * simply discard it. We need to drop the vdev queue's lock to avoid a
-	 * deadlock that we could encounter since this I/O will complete
-	 * immediately.
+	 * If the I/O is or was optional and therefore has no data, return it
+	 * to the caller to be discarded outside of the vq->vq_lock lock.
 	 */
-	if (zio->io_flags & ZIO_FLAG_NODATA) {
-		mutex_exit(&vq->vq_lock);
-		zio_vdev_io_bypass(zio);
-		zio_execute(zio);
-		mutex_enter(&vq->vq_lock);
-		goto again;
-	}
+	if (zio->io_flags & ZIO_FLAG_NODATA)
+		return (zio);
 
 	vdev_queue_pending_add(vq, zio);
 	vq->vq_last_offset = zio->io_offset;
@@ -783,6 +776,24 @@ vdev_queue_io_done(zio_t *zio)
 	while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
 		mutex_exit(&vq->vq_lock);
+		/*
+		 * If the I/O is or was optional and therefore has no data, we
+		 * need to simply discard it. We do it here instead of
+		 * vdev_queue_io_to_issue() because:
+		 *
+		 * 1. We need to propagate the value of ->io_recursion_count to
+		 *    avoid recursing too deeply under heavy I/O load.
+		 *
+		 * 2. We also need to drop the vdev queue's lock to avoid a
+		 *    deadlock that we could encounter since this I/O will
+		 *    complete immediately.
+		 */
+		if (zio->io_flags & ZIO_FLAG_NODATA) {
+			nio->io_recursion_count = zio->io_recursion_count + 1;
+			zio_vdev_io_bypass(nio);
+			zio_execute(nio);
+			continue;
+		}
 		if (nio->io_done == vdev_queue_agg_io_done) {
 			zio_nowait(nio);
 		} else {
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 0ba167c62b59..a7434eea20ec 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -60,6 +60,7 @@ kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 int zio_bulk_flags = 0;
 int zio_delay_max = ZIO_DELAY_MAX;
+int zio_recursion_count = 10; /* Point at which notification of parents is redispatched */
 
 /*
  * The following actions directly effect the spa's sync-to-convergence logic.
@@ -516,6 +517,7 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 
 	if (*countp == 0 && pio->io_stall == countp) {
 		pio->io_stall = NULL;
+		pio->io_recursion_count = zio->io_recursion_count + 1;
 		mutex_exit(&pio->io_lock);
 		__zio_execute(pio);
 	} else {
@@ -975,6 +977,8 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
 		zio->io_logical->io_phys_children++;
 
+	zio->io_recursion_count = pio->io_recursion_count + 1;
+
 	return (zio);
 }
 
@@ -1268,6 +1272,9 @@ zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
 	zio_type_t t = zio->io_type;
 	int flags = (cutinline ? TQ_FRONT : 0);
 
+	/* Reset the notify counter */
+	zio->io_recursion_count = 0;
+
 	/*
 	 * If we're a config writer or a probe, the normal issue and
 	 * interrupt threads may all be blocked waiting for the config lock.
@@ -1391,6 +1398,16 @@ __zio_execute(zio_t *zio)
 		cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
 		    zio_requeue_io_start_cut_in_line : B_FALSE;
 
+		/*
+		 * Deep notification chains can cause us to overrun the stack.
+		 * Redispatch ZIO when we hit zio_recursion_count.
+		 */
+		if (zio->io_recursion_count &&
+		    ((zio->io_recursion_count % zio_recursion_count) == 0)) {
+			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
+			return;
+		}
+
 		/*
 		 * If we are in interrupt context and this pipeline stage
 		 * will grab a config lock that is held across I/O,