From a3d9f3a9629bfb22864e83e73f047cd62e9ba283 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Thu, 8 Apr 2021 11:11:02 +0200 Subject: [PATCH 1/4] docs: add nvme emulation documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the docs/specs/nvme.txt and replace it with proper documentation in docs/system/nvme.rst. Signed-off-by: Klaus Jensen Reviewed-by: Philippe Mathieu-Daudé --- MAINTAINERS | 2 +- docs/specs/nvme.txt | 23 ----- docs/system/index.rst | 1 + docs/system/nvme.rst | 225 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 227 insertions(+), 24 deletions(-) delete mode 100644 docs/specs/nvme.txt create mode 100644 docs/system/nvme.rst diff --git a/MAINTAINERS b/MAINTAINERS index 58f342108e9e..04beb34e7ec4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1974,7 +1974,7 @@ S: Supported F: hw/block/nvme* F: include/block/nvme.h F: tests/qtest/nvme-test.c -F: docs/specs/nvme.txt +F: docs/system/nvme.rst T: git git://git.infradead.org/qemu-nvme.git nvme-next megasas diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt deleted file mode 100644 index 56d393884e7a..000000000000 --- a/docs/specs/nvme.txt +++ /dev/null @@ -1,23 +0,0 @@ -NVM Express Controller -====================== - -The nvme device (-device nvme) emulates an NVM Express Controller. - - -Reference Specifications ------------------------- - -The device currently implements most mandatory features of NVMe v1.3d, see - - https://nvmexpress.org/resources/specifications/ - -for the specification. - - -Known issues ------------- - -* The accounting numbers in the SMART/Health are reset across power cycles - -* Interrupt Coalescing is not supported and is disabled by default in volation - of the specification. diff --git a/docs/system/index.rst b/docs/system/index.rst index 02d07071810f..b05af716a973 100644 --- a/docs/system/index.rst +++ b/docs/system/index.rst @@ -23,6 +23,7 @@ Contents: net virtio-net-failover usb + nvme ivshmem linuxboot generic-loader diff --git a/docs/system/nvme.rst b/docs/system/nvme.rst new file mode 100644 index 000000000000..f7f63d6bf615 --- /dev/null +++ b/docs/system/nvme.rst @@ -0,0 +1,225 @@ +============== +NVMe Emulation +============== + +QEMU provides NVMe emulation through the ``nvme``, ``nvme-ns`` and +``nvme-subsys`` devices. + +See the following sections for specific information on + + * `Adding NVMe Devices`_, `additional namespaces`_ and `NVM subsystems`_. + * Configuration of `Optional Features`_ such as `Controller Memory Buffer`_, + `Simple Copy`_, `Zoned Namespaces`_, `metadata`_ and `End-to-End Data + Protection`_, + +Adding NVMe Devices +=================== + +Controller Emulation +-------------------- + +The QEMU emulated NVMe controller implements version 1.4 of the NVM Express +specification. All mandatory features are implement with a couple of exceptions +and limitations: + + * Accounting numbers in the SMART/Health log page are reset when the device + is power cycled. + * Interrupt Coalescing is not supported and is disabled by default. + +The simplest way to attach an NVMe controller on the QEMU PCI bus is to add the +following parameters: + +.. code-block:: console + + -drive file=nvm.img,if=none,id=nvm + -device nvme,serial=deadbeef,drive=nvm + +There are a number of optional general parameters for the ``nvme`` device. Some +are mentioned here, but see ``-device nvme,help`` to list all possible +parameters. + +``max_ioqpairs=UINT32`` (default: ``64``) + Set the maximum number of allowed I/O queue pairs. This replaces the + deprecated ``num_queues`` parameter. + +``msix_qsize=UINT16`` (default: ``65``) + The number of MSI-X vectors that the device should support. + +``mdts=UINT8`` (default: ``7``) + Set the Maximum Data Transfer Size of the device. + +``use-intel-id`` (default: ``off``) + Since QEMU 5.2, the device uses a QEMU allocated "Red Hat" PCI Device and + Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID + previously used. + +Additional Namespaces +--------------------- + +In the simplest possible invocation sketched above, the device only support a +single namespace with the namespace identifier ``1``. To support multiple +namespaces and additional features, the ``nvme-ns`` device must be used. + +.. code-block:: console + + -device nvme,id=nvme-ctrl-0,serial=deadbeef + -drive file=nvm-1.img,if=none,id=nvm-1 + -device nvme-ns,drive=nvm-1 + -drive file=nvm-2.img,if=none,id=nvm-2 + -device nvme-ns,drive=nvm-2 + +The namespaces defined by the ``nvme-ns`` device will attach to the most +recently defined ``nvme-bus`` that is created by the ``nvme`` device. Namespace +identifers are allocated automatically, starting from ``1``. + +There are a number of parameters available: + +``nsid`` (default: ``0``) + Explicitly set the namespace identifier. + +``uuid`` (default: *autogenerated*) + Set the UUID of the namespace. This will be reported as a "Namespace UUID" + descriptor in the Namespace Identification Descriptor List. + +``bus`` + If there are more ``nvme`` devices defined, this parameter may be used to + attach the namespace to a specific ``nvme`` device (identified by an ``id`` + parameter on the controller device). + +NVM Subsystems +-------------- + +Additional features becomes available if the controller device (``nvme``) is +linked to an NVM Subsystem device (``nvme-subsys``). + +The NVM Subsystem emulation allows features such as shared namespaces and +multipath I/O. + +.. code-block:: console + + -device nvme-subsys,id=nvme-subsys-0,nqn=subsys0 + -device nvme,serial=a,subsys=nvme-subsys-0 + -device nvme,serial=b,subsys=nvme-subsys-0 + +This will create an NVM subsystem with two controllers. Having controllers +linked to an ``nvme-subsys`` device allows additional ``nvme-ns`` parameters: + +``shared`` (default: ``off``) + Specifies that the namespace will be attached to all controllers in the + subsystem. If set to ``off`` (the default), the namespace will remain a + private namespace and may only be attached to a single controller at a time. + +``detached`` (default: ``off``) + If set to ``on``, the namespace will be be available in the subsystem, but + not attached to any controllers initially. + +Thus, adding + +.. code-block:: console + + -drive file=nvm-1.img,if=none,id=nvm-1 + -device nvme-ns,drive=nvm-1,nsid=1,shared=on + -drive file=nvm-2.img,if=none,id=nvm-2 + -device nvme-ns,drive=nvm-2,nsid=3,detached=on + +will cause NSID 1 will be a shared namespace (due to ``shared=on``) that is +initially attached to both controllers. NSID 3 will be a private namespace +(i.e. only attachable to a single controller at a time) and will not be +attached to any controller initially (due to ``detached=on``). + +Optional Features +================= + +Controller Memory Buffer +------------------------ + +``nvme`` device parameters related to the Controller Memory Buffer support: + +``cmb_size_mb=UINT32`` (default: ``0``) + This adds a Controller Memory Buffer of the given size at offset zero in BAR + 2. + +``legacy-cmb`` (default: ``off``) + By default, the device uses the "v1.4 scheme" for the Controller Memory + Buffer support (i.e, the CMB is initially disabled and must be explicitly + enabled by the host). Set this to ``on`` to behave as a v1.3 device wrt. the + CMB. + +Simple Copy +----------- + +The device includes support for TP 4065 ("Simple Copy Command"). A number of +additional ``nvme-ns`` device parameters may be used to control the Copy +command limits: + +``mssrl=UINT16`` (default: ``128``) + Set the Maximum Single Source Range Length (``MSSRL``). This is the maximum + number of logical blocks that may be specified in each source range. + +``mcl=UINT32`` (default: ``128``) + Set the Maximum Copy Length (``MCL``). This is the maximum number of logical + blocks that may be specified in a Copy command (the total for all source + ranges). + +``msrc=UINT8`` (default: ``127``) + Set the Maximum Source Range Count (``MSRC``). This is the maximum number of + source ranges that may be used in a Copy command. This is a 0's based value. + +Zoned Namespaces +---------------- + +A namespaces may be "Zoned" as defined by TP 4053 ("Zoned Namespaces"). Set +``zoned=on`` on an ``nvme-ns`` device to configure it as a zoned namespace. + +The namespace may be configured with additional parameters + +``zoned.zone_size=SIZE`` (default: ``128MiB``) + Define the zone size (``ZSZE``). + +``zoned.zone_capacity=SIZE`` (default: ``0``) + Define the zone capacity (``ZCAP``). If left at the default (``0``), the zone + capacity will equal the zone size. + +``zoned.descr_ext_size=UINT32`` (default: ``0``) + Set the Zone Descriptor Extension Size (``ZDES``). Must be a multiple of 64 + bytes. + +``zoned.cross_read=BOOL`` (default: ``off``) + Set to ``on`` to allow reads to cross zone boundaries. + +``zoned.max_active=UINT32`` (default: ``0``) + Set the maximum number of active resources (``MAR``). The default (``0``) + allows all zones to be active. + +``zoned.max_open=UINT32`` (default: ``0``) + Set the maximum number of open resources (``MOR``). The default (``0``) + allows all zones to be open. If ``zoned.max_active`` is specified, this value + must be less than or equal to that. + +Metadata +-------- + +The virtual namespace device supports LBA metadata in the form separate +metadata (``MPTR``-based) and extended LBAs. + +``ms=UINT16`` (default: ``0``) + Defines the number of metadata bytes per LBA. + +``mset=UINT8`` (default: ``0``) + Set to ``1`` to enable extended LBAs. + +End-to-End Data Protection +-------------------------- + +The virtual namespace device supports DIF- and DIX-based protection information +(depending on ``mset``). + +``pi=UINT8`` (default: ``0``) + Enable protection information of the specified type (type ``1``, ``2`` or + ``3``). + +``pil=UINT8`` (default: ``0``) + Controls the location of the protection information within the metadata. Set + to ``1`` to transfer protection information as the first eight bytes of + metadata. Otherwise, the protection information is transferred as the last + eight bytes. From d357230b20e5f238e5ff6084cdf6d3231d27f0c6 Mon Sep 17 00:00:00 2001 From: Padmakar Kalghatgi Date: Fri, 9 Apr 2021 12:55:48 +0530 Subject: [PATCH 2/4] hw/block/nvme: map prp fix if prp2 contains non-zero offset nvme_map_prp needs to calculate the number of list entries based on the offset value. For the subsequent PRP2 list, need to ensure the number of entries is within the MAX number of PRP entries for a page. Signed-off-by: Padmakar Kalghatgi Signed-off-by: Klaus Jensen --- hw/block/nvme.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 6b1f056a0ebc..86336152a378 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -655,7 +655,12 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1, uint32_t nents, prp_trans; int i = 0; - nents = (len + n->page_size - 1) >> n->page_bits; + /* + * The first PRP list entry, pointed to by PRP2 may contain offset. + * Hence, we need to calculate the number of entries in based on + * that offset. + */ + nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3; prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans); if (ret) { @@ -666,7 +671,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1, while (len != 0) { uint64_t prp_ent = le64_to_cpu(prp_list[i]); - if (i == n->max_prp_ents - 1 && len > n->page_size) { + if (i == nents - 1 && len > n->page_size) { if (unlikely(prp_ent & (n->page_size - 1))) { trace_pci_nvme_err_invalid_prplist_ent(prp_ent); status = NVME_INVALID_PRP_OFFSET | NVME_DNR; @@ -675,7 +680,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1, i = 0; nents = (len + n->page_size - 1) >> n->page_bits; - prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t); + nents = MIN(nents, n->max_prp_ents); + prp_trans = nents * sizeof(uint64_t); ret = nvme_addr_read(n, prp_ent, (void *)prp_list, prp_trans); if (ret) { From 5cefe2870874e6442eba62408bedbb4338b183a3 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Thu, 8 Apr 2021 13:46:03 +0200 Subject: [PATCH 3/4] hw/block/nvme: store aiocb in compare nvme_compare() fails to store the aiocb from the blk_aio_preadv() call. Fix this. Fixes: 0a384f923f51 ("hw/block/nvme: add compare command") Cc: Gollu Appalanaidu Signed-off-by: Klaus Jensen Reviewed-by: Gollu Appalanaidu Reviewed-by: Minwoo Im --- hw/block/nvme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 86336152a378..65bb2cfc21d7 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2843,7 +2843,8 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) block_acct_start(blk_get_stats(blk), &req->acct, data_len, BLOCK_ACCT_READ); - blk_aio_preadv(blk, offset, &ctx->data.iov, 0, nvme_compare_data_cb, req); + req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0, + nvme_compare_data_cb, req); return NVME_NO_COMPLETE; } From 98f84f5a4eca5c03e32fff20f246d9b4b96d6422 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Thu, 8 Apr 2021 12:44:05 +0200 Subject: [PATCH 4/4] hw/block/nvme: drain namespaces on sq deletion For most commands, when issuing an AIO, the BlockAIOCB is stored in the NvmeRequest aiocb pointer when the AIO is issued. The main use of this is cancelling AIOs when deleting submission queues (it is currently not used for Abort). However, some commands like Dataset Management Zone Management Send (zone reset) may involve more than one AIO and here the AIOs are issued without saving a reference to the BlockAIOCB. This is a problem since nvme_del_sq() will attempt to cancel outstanding AIOs, potentially with an invalid BlockAIOCB since the aiocb pointer is not NULL'ed when the request structure is recycled. Fix this by 1. making sure the aiocb pointer is NULL'ed when requests are recycled 2. only attempt to cancel the AIO if the aiocb is non-NULL 3. if any AIOs could not be cancelled, drain all aio as a last resort. Fixes: dc04d25e2f3f ("hw/block/nvme: add support for the format nvm command") Fixes: c94973288cd9 ("hw/block/nvme: add broadcast nsid support flush command") Fixes: e4e430b3d6ba ("hw/block/nvme: add simple copy command") Fixes: 5f5dc4c6a942 ("hw/block/nvme: zero out zones on reset") Fixes: 2605257a26b8 ("hw/block/nvme: add the dataset management command") Cc: Gollu Appalanaidu Cc: Minwoo Im Signed-off-by: Klaus Jensen Reviewed-by: Minwoo Im --- hw/block/nvme.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 65bb2cfc21d7..624a1431d072 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -470,6 +470,7 @@ static void nvme_req_clear(NvmeRequest *req) { req->ns = NULL; req->opaque = NULL; + req->aiocb = NULL; memset(&req->cqe, 0x0, sizeof(req->cqe)); req->status = NVME_SUCCESS; } @@ -3687,6 +3688,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) NvmeSQueue *sq; NvmeCQueue *cq; uint16_t qid = le16_to_cpu(c->qid); + uint32_t nsid; if (unlikely(!qid || nvme_check_sqid(n, qid))) { trace_pci_nvme_err_invalid_del_sq(qid); @@ -3698,9 +3700,26 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req) sq = n->sq[qid]; while (!QTAILQ_EMPTY(&sq->out_req_list)) { r = QTAILQ_FIRST(&sq->out_req_list); - assert(r->aiocb); - blk_aio_cancel(r->aiocb); + if (r->aiocb) { + blk_aio_cancel(r->aiocb); + } + } + + /* + * Drain all namespaces if there are still outstanding requests that we + * could not cancel explicitly. + */ + if (!QTAILQ_EMPTY(&sq->out_req_list)) { + for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) { + NvmeNamespace *ns = nvme_ns(n, nsid); + if (ns) { + nvme_ns_drain(ns); + } + } } + + assert(QTAILQ_EMPTY(&sq->out_req_list)); + if (!nvme_check_cqid(n, sq->cqid)) { cq = n->cq[sq->cqid]; QTAILQ_REMOVE(&cq->sq_list, sq, entry);