hw/nvme: Implement shadow doorbell buffer support
Implement the Doorbell Buffer Config command (Section 5.7 in NVMe Spec 1.3)
and the Shadow Doorbell buffer & EventIdx buffer handling logic (Section 7.13
in NVMe Spec 1.3). For queues created before the Doorbell Buffer Config
command, the nvme_dbbuf_config function tries to associate each existing
SQ and CQ with its Shadow Doorbell buffer and EventIdx buffer address.
Queues created after the Doorbell Buffer Config command will have the
doorbell buffers associated with them when they are initialized.
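
For illustration, with CAP.DSTRD fixed at 0 (a 4-byte doorbell stride), the
offset math used throughout the patch reduces to the following sketch (the
helper names are illustrative, not part of the patch):

    #include <stdint.h>

    /*
     * Each queue pair owns 8 bytes in the shadow doorbell buffer: a
     * 32-bit SQ tail at (qid << 3) and a 32-bit CQ head 4 bytes later.
     * The EventIdx buffer uses the same layout.
     */
    static inline uint64_t shadow_sq_tail_addr(uint64_t dbs_addr, uint32_t qid)
    {
        return dbs_addr + ((uint64_t)qid << 3);
    }

    static inline uint64_t shadow_cq_head_addr(uint64_t dbs_addr, uint32_t qid)
    {
        return dbs_addr + ((uint64_t)qid << 3) + (1 << 2);
    }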

In nvme_process_sq and nvme_post_cqes, proactively check for Shadow
Doorbell buffer changes instead of waiting for doorbell register changes.
This reduces the number of MMIOs.
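
The saving comes from the EventIdx handshake: the host rings the MMIO
doorbell only when its new shadow value passes the EventIdx the device
last published, so a device that polls the shadow buffer rarely needs to
be notified. A sketch of the host-side decision, mirroring the pseudo-code
in Section 7.13.1 of the spec (the helper name is ours):

    #include <stdint.h>

    /*
     * Returns nonzero when moving the doorbell from old to new_idx
     * crosses event_idx, i.e. the device asked to be notified in that
     * window. Unsigned 16-bit arithmetic handles wraparound.
     */
    static inline int nvme_need_event(uint16_t event_idx, uint16_t new_idx,
                                      uint16_t old)
    {
        return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
    }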

In nvme_process_db(), update the shadow doorbell buffer value with
the doorbell register value if it is the admin queue. This is a hack,
since hosts such as the Linux NVMe driver and SPDK do not use the shadow
doorbell buffer for the admin queue. Copying the doorbell register
value to the shadow doorbell buffer allows us to support these hosts
as well as spec-compliant hosts that use the shadow doorbell buffer for
the admin queue.
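
For reference, a spec-compliant host updates an SQ tail roughly as sketched
below (our illustration, reusing nvme_need_event() from above); drivers that
skip the shadow update for the admin queue are exactly why nvme_process_db()
writes the register value back into the shadow slot:

    /*
     * Hypothetical host-side doorbell update per Section 7.13.
     * shadow_db and event_idx point into the buffers registered with
     * Doorbell Buffer Config; mmio_db at the controller register.
     */
    static void host_ring_sq_doorbell(volatile uint32_t *shadow_db,
                                      const volatile uint32_t *event_idx,
                                      volatile uint32_t *mmio_db,
                                      uint32_t new_tail)
    {
        uint32_t old_tail = *shadow_db;

        *shadow_db = new_tail;   /* the step skipped for admin queues   */
        __sync_synchronize();    /* order store before reading EventIdx */
        if (nvme_need_event(*event_idx, new_tail, old_tail)) {
            *mmio_db = new_tail; /* MMIO only when the device asked     */
        }
    }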

Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
[k.jensen: rebased]
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
fandahao17 authored and birkelund committed Jul 15, 2022
1 parent 8482ab5 commit 3f7fe8d
Showing 3 changed files with 124 additions and 1 deletion.
115 changes: 114 additions & 1 deletion hw/nvme/ctrl.c
@@ -264,6 +264,7 @@ static const uint32_t nvme_cse_acs[256] = {
     [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
     [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
+    [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
     [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 };

@@ -1330,6 +1331,12 @@ static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
     }
 }
 
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+    pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
+                 sizeof(cq->head));
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -1342,6 +1349,10 @@ static void nvme_post_cqes(void *opaque)
         NvmeSQueue *sq;
         hwaddr addr;
 
+        if (n->dbbuf_enabled) {
+            nvme_update_cq_head(cq);
+        }
+
         if (nvme_cq_full(cq)) {
             break;
         }
@@ -4287,6 +4298,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
     }
     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
 
+    if (n->dbbuf_enabled) {
+        sq->db_addr = n->dbbuf_dbs + (sqid << 3);
+        sq->ei_addr = n->dbbuf_eis + (sqid << 3);
+    }
+
     assert(n->cq[cqid]);
     cq = n->cq[cqid];
     QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
@@ -4645,6 +4661,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     cq->head = cq->tail = 0;
     QTAILQ_INIT(&cq->req_list);
     QTAILQ_INIT(&cq->sq_list);
+    if (n->dbbuf_enabled) {
+        cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
+        cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
+    }
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
 }
@@ -5988,6 +6008,50 @@ static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
     }
 }
 
+static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
+{
+    uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
+    uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+    int i;
+
+    /* Address should be page aligned */
+    if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    /* Save shadow buffer base addr for use during queue creation */
+    n->dbbuf_dbs = dbs_addr;
+    n->dbbuf_eis = eis_addr;
+    n->dbbuf_enabled = true;
+
+    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
+        NvmeSQueue *sq = n->sq[i];
+        NvmeCQueue *cq = n->cq[i];
+
+        if (sq) {
+            /*
+             * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
+             * nvme_process_db() uses this hard-coded way to calculate
+             * doorbell offsets. Be consistent with that here.
+             */
+            sq->db_addr = dbs_addr + (i << 3);
+            sq->ei_addr = eis_addr + (i << 3);
+            pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+                          sizeof(sq->tail));
+        }
+
+        if (cq) {
+            /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
+            cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
+            cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
+            pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+                          sizeof(cq->head));
+        }
+    }
+
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -6032,6 +6096,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_ns_attachment(n, req);
     case NVME_ADM_CMD_VIRT_MNGMT:
         return nvme_virt_mngmt(n, req);
+    case NVME_ADM_CMD_DBBUF_CONFIG:
+        return nvme_dbbuf_config(n, req);
     case NVME_ADM_CMD_FORMAT_NVM:
         return nvme_format(n, req);
     default:
@@ -6041,6 +6107,18 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
     return NVME_INVALID_OPCODE | NVME_DNR;
 }
 
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+    pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
+                  sizeof(sq->tail));
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+    pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
+                 sizeof(sq->tail));
+}
+
 static void nvme_process_sq(void *opaque)
 {
     NvmeSQueue *sq = opaque;
@@ -6052,6 +6130,10 @@ static void nvme_process_sq(void *opaque)
     NvmeCmd cmd;
     NvmeRequest *req;
 
+    if (n->dbbuf_enabled) {
+        nvme_update_sq_tail(sq);
+    }
+
     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
         addr = sq->dma_addr + sq->head * n->sqe_size;
         if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
@@ -6075,6 +6157,11 @@ static void nvme_process_sq(void *opaque)
             req->status = status;
             nvme_enqueue_req_completion(cq, req);
         }
+
+        if (n->dbbuf_enabled) {
+            nvme_update_sq_eventidx(sq);
+            nvme_update_sq_tail(sq);
+        }
     }
 }

@@ -6184,6 +6271,10 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
     stl_le_p(&n->bar.intms, 0);
     stl_le_p(&n->bar.intmc, 0);
     stl_le_p(&n->bar.cc, 0);
+
+    n->dbbuf_dbs = 0;
+    n->dbbuf_eis = 0;
+    n->dbbuf_enabled = false;
 }

static void nvme_ctrl_shutdown(NvmeCtrl *n)
@@ -6694,6 +6785,10 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
 
         start_sqs = nvme_cq_full(cq) ? 1 : 0;
         cq->head = new_head;
+        if (!qid && n->dbbuf_enabled) {
+            pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+                          sizeof(cq->head));
+        }
         if (start_sqs) {
             NvmeSQueue *sq;
             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -6751,6 +6846,23 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
 
         sq->tail = new_tail;
+        if (!qid && n->dbbuf_enabled) {
+            /*
+             * The spec states "the host shall also update the controller's
+             * corresponding doorbell property to match the value of that entry
+             * in the Shadow Doorbell buffer."
+             *
+             * Since this context is currently a VM trap, we can safely enforce
+             * the requirement from the device side in case the host is
+             * misbehaving.
+             *
+             * Note, we shouldn't have to do this, but various drivers
+             * including ones that run on Linux, are not updating Admin Queues,
+             * so we can't trust reading it for an appropriate sq tail.
+             */
+            pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+                          sizeof(sq->tail));
+        }
         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
     }
 }
@@ -7231,7 +7343,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
 
     id->mdts = n->params.mdts;
     id->ver = cpu_to_le32(NVME_SPEC_VER);
-    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
+    id->oacs =
+        cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
     id->cntrltype = 0x1;
 
     /*
8 changes: 8 additions & 0 deletions hw/nvme/nvme.h
@@ -341,6 +341,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
     case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
     case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT";
     case NVME_ADM_CMD_VIRT_MNGMT: return "NVME_ADM_CMD_VIRT_MNGMT";
+    case NVME_ADM_CMD_DBBUF_CONFIG: return "NVME_ADM_CMD_DBBUF_CONFIG";
     case NVME_ADM_CMD_FORMAT_NVM: return "NVME_ADM_CMD_FORMAT_NVM";
     default: return "NVME_ADM_CMD_UNKNOWN";
     }
@@ -372,6 +373,8 @@ typedef struct NvmeSQueue {
     uint32_t tail;
     uint32_t size;
     uint64_t dma_addr;
+    uint64_t db_addr;
+    uint64_t ei_addr;
     QEMUTimer *timer;
     NvmeRequest *io_req;
     QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -389,6 +392,8 @@ typedef struct NvmeCQueue {
     uint32_t vector;
     uint32_t size;
     uint64_t dma_addr;
+    uint64_t db_addr;
+    uint64_t ei_addr;
     QEMUTimer *timer;
     QTAILQ_HEAD(, NvmeSQueue) sq_list;
     QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -445,6 +450,9 @@ typedef struct NvmeCtrl {
     uint8_t smart_critical_warning;
     uint32_t conf_msix_qsize;
     uint32_t conf_ioqpairs;
+    uint64_t dbbuf_dbs;
+    uint64_t dbbuf_eis;
+    bool dbbuf_enabled;
 
     struct {
         MemoryRegion mem;
2 changes: 2 additions & 0 deletions include/block/nvme.h
@@ -596,6 +596,7 @@ enum NvmeAdminCommands {
     NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
     NVME_ADM_CMD_NS_ATTACHMENT = 0x15,
     NVME_ADM_CMD_VIRT_MNGMT = 0x1c,
+    NVME_ADM_CMD_DBBUF_CONFIG = 0x7c,
     NVME_ADM_CMD_FORMAT_NVM = 0x80,
     NVME_ADM_CMD_SECURITY_SEND = 0x81,
     NVME_ADM_CMD_SECURITY_RECV = 0x82,
@@ -1141,6 +1142,7 @@ enum NvmeIdCtrlOacs {
     NVME_OACS_FORMAT = 1 << 1,
     NVME_OACS_FW = 1 << 2,
     NVME_OACS_NS_MGMT = 1 << 3,
+    NVME_OACS_DBBUF = 1 << 8,
 };
 
 enum NvmeIdCtrlOncs {
