From dc3b25f111c5d254fa9632153a5792de947742a4 Mon Sep 17 00:00:00 2001 From: Daniel Kranzdorf Date: Sun, 13 Jul 2025 08:46:09 +0000 Subject: [PATCH 01/66] verbs: Allow query only device support for QP data in order Add a flag to query directly the RDMA device support for QP data-in-order semantics without enforcing host CPU architecture restrictions. It is particularly useful in scenarios where the GPU performs data polling directly, with the application responsible for ensuring the GPU side support for data-in-order semantics. Reviewed-by: Michael Margolin Reviewed-by: Yonatan Nachum Signed-off-by: Daniel Kranzdorf --- .../man/ibv_query_qp_data_in_order.3.md | 4 ++++ libibverbs/verbs.c | 22 +++++++++++++------ libibverbs/verbs.h | 1 + 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/libibverbs/man/ibv_query_qp_data_in_order.3.md b/libibverbs/man/ibv_query_qp_data_in_order.3.md index 058646358..b96d7fc6b 100644 --- a/libibverbs/man/ibv_query_qp_data_in_order.3.md +++ b/libibverbs/man/ibv_query_qp_data_in_order.3.md @@ -45,6 +45,10 @@ This function describes ordering at the receiving side of the QP, not the sendin IBV_QUERY_QP_DATA_IN_ORDER_RETURN_CAPS - Query for supported capabilities and return a capabilities vector. +IBV_QUERY_QP_DATA_IN_ORDER_DEVICE_ONLY - Allows querying only the RDMA device side support for data-in-order semantics + without enforcing host CPU architecture restrictions. It is the responsibility of the application + to ensure data-in-order semantics support for the host CPU or receiver device. + Passing 0 is equivalent to using IBV_QUERY_QP_DATA_IN_ORDER_RETURN_CAPS and checking for IBV_QUERY_QP_DATA_IN_ORDER_WHOLE_MSG support. 
# RETURN VALUE diff --git a/libibverbs/verbs.c b/libibverbs/verbs.c index e506f8a8f..30e1d87d6 100644 --- a/libibverbs/verbs.c +++ b/libibverbs/verbs.c @@ -695,23 +695,31 @@ LATEST_SYMVER_FUNC(ibv_query_qp, 1_1, "IBVERBS_1.1", int ibv_query_qp_data_in_order(struct ibv_qp *qp, enum ibv_wr_opcode op, uint32_t flags) { + int result; + + if (!check_comp_mask(flags, + IBV_QUERY_QP_DATA_IN_ORDER_RETURN_CAPS | + IBV_QUERY_QP_DATA_IN_ORDER_DEVICE_ONLY)) + return 0; + #if !defined(__i386__) && !defined(__x86_64__) /* Currently this API is only supported for x86 architectures since most * non-x86 platforms are known to be OOO and need to do a per-platform study. + * However, it is possible to override this restriction to allow querying + * the device capability directly, regardless of the host CPU architecture. */ - return 0; -#else - int result; - - if (!check_comp_mask(flags, IBV_QUERY_QP_DATA_IN_ORDER_RETURN_CAPS)) + if (!(flags & IBV_QUERY_QP_DATA_IN_ORDER_DEVICE_ONLY)) return 0; +#endif result = get_ops(qp->context)->query_qp_data_in_order(qp, op, flags); if (result & IBV_QUERY_QP_DATA_IN_ORDER_WHOLE_MSG) result |= IBV_QUERY_QP_DATA_IN_ORDER_ALIGNED_128_BYTES; - return flags ? 
result : !!(result & IBV_QUERY_QP_DATA_IN_ORDER_WHOLE_MSG); -#endif + if (flags & IBV_QUERY_QP_DATA_IN_ORDER_RETURN_CAPS) + return result; + + return !!(result & IBV_QUERY_QP_DATA_IN_ORDER_WHOLE_MSG); } LATEST_SYMVER_FUNC(ibv_modify_qp, 1_1, "IBVERBS_1.1", diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 47cdc067e..645bb092c 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -1042,6 +1042,7 @@ enum ibv_qp_attr_mask { enum ibv_query_qp_data_in_order_flags { IBV_QUERY_QP_DATA_IN_ORDER_RETURN_CAPS = 1 << 0, + IBV_QUERY_QP_DATA_IN_ORDER_DEVICE_ONLY = 1 << 1, }; enum ibv_query_qp_data_in_order_caps { From a4f3f441e66f4e33e3f12b642243b1faa8143e7b Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Thu, 3 Jul 2025 11:32:05 +0000 Subject: [PATCH 02/66] efa: Add single sub CQ poll variant For the new polling API add an option to dynamically choose the CQ basic polling functions: start, next and end. This will allow for different optimizations with the first one being CQs with a single sub CQ. With this type of CQ we have an overhead of function calls and redundant for loop that we can drop. 
Signed-off-by: Yonatan Nachum --- providers/efa/verbs.c | 93 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 85 insertions(+), 8 deletions(-) diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index de950588b..a955cd13a 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -884,12 +884,11 @@ int efa_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc) return i ?: -ret; } -static int efa_start_poll(struct ibv_cq_ex *ibvcqx, - struct ibv_poll_cq_attr *attr) +static inline int efa_start_poll_comp_check(struct ibv_cq_ex *ibvcqx, + struct ibv_poll_cq_attr *attr) ALWAYS_INLINE; +static inline int efa_start_poll_comp_check(struct ibv_cq_ex *ibvcqx, + struct ibv_poll_cq_attr *attr) { - struct efa_cq *cq = to_efa_cq_ex(ibvcqx); - int ret; - if (unlikely(attr->comp_mask)) { verbs_err(verbs_get_ctx(ibvcqx->context), "Invalid comp_mask %u\n", @@ -897,6 +896,18 @@ static int efa_start_poll(struct ibv_cq_ex *ibvcqx, return EINVAL; } + return 0; +} + +static int efa_start_poll(struct ibv_cq_ex *ibvcqx, + struct ibv_poll_cq_attr *attr) +{ + struct efa_cq *cq = to_efa_cq_ex(ibvcqx); + int ret; + + if (efa_start_poll_comp_check(ibvcqx, attr)) + return EINVAL; + pthread_spin_lock(&cq->lock); ret = efa_poll_sub_cqs(cq, NULL, true); @@ -932,15 +943,81 @@ static void efa_end_poll(struct ibv_cq_ex *ibvcqx) pthread_spin_unlock(&cq->lock); } +static int efa_start_poll_single_sub_cq(struct ibv_cq_ex *ibvcqx, + struct ibv_poll_cq_attr *attr) +{ + struct efa_cq *cq = to_efa_cq_ex(ibvcqx); + struct efa_qp *qp = NULL; + int ret; + + if (efa_start_poll_comp_check(ibvcqx, attr)) + return EINVAL; + + pthread_spin_lock(&cq->lock); + ret = efa_poll_sub_cq(cq, cq->sub_cq_arr, &qp, NULL, true); + if (ret != ENOENT) + cq->cc++; + + if (ret) + pthread_spin_unlock(&cq->lock); + + return ret; +} + +static int efa_next_poll_single_sub_cq(struct ibv_cq_ex *ibvcqx) +{ + struct efa_cq *cq = to_efa_cq_ex(ibvcqx); + struct efa_qp *qp = NULL; + int ret; + + if (cq->cur_wq) + 
efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id); + + ret = efa_poll_sub_cq(cq, cq->sub_cq_arr, &qp, NULL, true); + if (ret != ENOENT) + cq->cc++; + + return ret; +} + +enum cq_pfns_attr { + SINGLE_SUB_CQ_PFNS = BIT(0), +}; + +#define efa_start_poll_name(single_sub_cq) efa_start_poll##single_sub_cq +#define efa_next_poll_name(single_sub_cq) efa_next_poll##single_sub_cq +#define efa_end_poll_name() efa_end_poll + +#define POLL_FN_ENTRY(single_sub_cq) { \ + .start_poll = efa_start_poll_name(single_sub_cq), \ + .next_poll = efa_next_poll_name(single_sub_cq), \ + .end_poll = efa_end_poll_name(), \ + } + +struct cq_base_ops { + int (*start_poll)(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_attr *attr); + int (*next_poll)(struct ibv_cq_ex *ibcq); + void (*end_poll)(struct ibv_cq_ex *ibcq); +} base_ops[SINGLE_SUB_CQ_PFNS + 1] = { + [0] = POLL_FN_ENTRY(), + [SINGLE_SUB_CQ_PFNS] = POLL_FN_ENTRY(_single_sub_cq), +}; + static void efa_cq_fill_pfns(struct efa_cq *cq, struct ibv_cq_init_attr_ex *attr, struct efadv_cq_init_attr *efa_attr) { struct ibv_cq_ex *ibvcqx = &cq->verbs_cq.cq_ex; + const struct cq_base_ops *cq_ops; + uint32_t cq_pfns_mask = 0; + + if (cq->num_sub_cqs == 1) + cq_pfns_mask |= SINGLE_SUB_CQ_PFNS; - ibvcqx->start_poll = efa_start_poll; - ibvcqx->end_poll = efa_end_poll; - ibvcqx->next_poll = efa_next_poll; + cq_ops = &base_ops[cq_pfns_mask]; + ibvcqx->start_poll = cq_ops->start_poll; + ibvcqx->next_poll = cq_ops->next_poll; + ibvcqx->end_poll = cq_ops->end_poll; ibvcqx->read_opcode = efa_wc_read_opcode; ibvcqx->read_vendor_err = efa_wc_read_vendor_err; From fbd0b88ee741c52e05cd6b2d39338aa46aea2ae7 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Thu, 3 Jul 2025 11:59:04 +0000 Subject: [PATCH 03/66] efa: Add option to create single threaded CQ Add an option to create a single threaded CQ, if a single threaded CQ is created the CQ lock isn't taken on completion polling functions. 
Signed-off-by: Yonatan Nachum --- providers/efa/verbs.c | 89 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 73 insertions(+), 16 deletions(-) diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index a955cd13a..708199355 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -899,6 +899,17 @@ static inline int efa_start_poll_comp_check(struct ibv_cq_ex *ibvcqx, return 0; } +static inline void efa_end_poll_common(struct efa_cq *cq) ALWAYS_INLINE; +static inline void efa_end_poll_common(struct efa_cq *cq) +{ + if (cq->cur_cqe) { + if (cq->cur_wq) + efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id); + if (cq->db) + efa_update_cq_doorbell(cq, false); + } +} + static int efa_start_poll(struct ibv_cq_ex *ibvcqx, struct ibv_poll_cq_attr *attr) { @@ -933,13 +944,7 @@ static void efa_end_poll(struct ibv_cq_ex *ibvcqx) { struct efa_cq *cq = to_efa_cq_ex(ibvcqx); - if (cq->cur_cqe) { - if (cq->cur_wq) - efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id); - if (cq->db) - efa_update_cq_doorbell(cq, false); - } - + efa_end_poll_common(cq); pthread_spin_unlock(&cq->lock); } @@ -980,27 +985,65 @@ static int efa_next_poll_single_sub_cq(struct ibv_cq_ex *ibvcqx) return ret; } +static int efa_start_poll_single_thread(struct ibv_cq_ex *ibvcqx, + struct ibv_poll_cq_attr *attr) +{ + struct efa_cq *cq = to_efa_cq_ex(ibvcqx); + + if (efa_start_poll_comp_check(ibvcqx, attr)) + return EINVAL; + + return efa_poll_sub_cqs(cq, NULL, true); +} + +static void efa_end_poll_single_thread(struct ibv_cq_ex *ibvcqx) +{ + struct efa_cq *cq = to_efa_cq_ex(ibvcqx); + + efa_end_poll_common(cq); +} + +static int efa_start_poll_single_sub_cq_single_thread(struct ibv_cq_ex *ibvcqx, + struct ibv_poll_cq_attr *attr) +{ + struct efa_cq *cq = to_efa_cq_ex(ibvcqx); + struct efa_qp *qp = NULL; + int ret; + + if (efa_start_poll_comp_check(ibvcqx, attr)) + return EINVAL; + + ret = efa_poll_sub_cq(cq, cq->sub_cq_arr, &qp, NULL, true); + if (ret != ENOENT) + 
cq->cc++; + + return ret; +} + enum cq_pfns_attr { SINGLE_SUB_CQ_PFNS = BIT(0), + SINGLE_THREAD_PFNS = BIT(1), }; -#define efa_start_poll_name(single_sub_cq) efa_start_poll##single_sub_cq +#define efa_start_poll_name(single_sub_cq, single_thread) efa_start_poll##single_sub_cq##single_thread #define efa_next_poll_name(single_sub_cq) efa_next_poll##single_sub_cq -#define efa_end_poll_name() efa_end_poll +#define efa_end_poll_name(single_thread) efa_end_poll##single_thread -#define POLL_FN_ENTRY(single_sub_cq) { \ - .start_poll = efa_start_poll_name(single_sub_cq), \ +#define POLL_FN_ENTRY(single_sub_cq, single_thread) { \ + .start_poll = efa_start_poll_name(single_sub_cq, single_thread), \ .next_poll = efa_next_poll_name(single_sub_cq), \ - .end_poll = efa_end_poll_name(), \ + .end_poll = efa_end_poll_name(single_thread), \ } struct cq_base_ops { int (*start_poll)(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_attr *attr); int (*next_poll)(struct ibv_cq_ex *ibcq); void (*end_poll)(struct ibv_cq_ex *ibcq); -} base_ops[SINGLE_SUB_CQ_PFNS + 1] = { - [0] = POLL_FN_ENTRY(), - [SINGLE_SUB_CQ_PFNS] = POLL_FN_ENTRY(_single_sub_cq), +} base_ops[] = { + [0] = POLL_FN_ENTRY(,), + [SINGLE_SUB_CQ_PFNS] = POLL_FN_ENTRY(_single_sub_cq,), + [SINGLE_THREAD_PFNS] = POLL_FN_ENTRY(, _single_thread), + [SINGLE_SUB_CQ_PFNS | SINGLE_THREAD_PFNS] = POLL_FN_ENTRY(_single_sub_cq, _single_thread) }; static void efa_cq_fill_pfns(struct efa_cq *cq, @@ -1014,6 +1057,9 @@ static void efa_cq_fill_pfns(struct efa_cq *cq, if (cq->num_sub_cqs == 1) cq_pfns_mask |= SINGLE_SUB_CQ_PFNS; + if (attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED) + cq_pfns_mask |= SINGLE_THREAD_PFNS; + cq_ops = &base_ops[cq_pfns_mask]; ibvcqx->start_poll = cq_ops->start_poll; ibvcqx->next_poll = cq_ops->next_poll; @@ -1074,7 +1120,10 @@ static struct ibv_cq_ex *create_cq(struct ibv_context *ibvctx, int err; int i; - if (!check_comp_mask(attr->comp_mask, IBV_CQ_INIT_ATTR_MASK_PD) || +#define EFA_CREATE_CQ_SUPP_ATTR_MASK \ + 
(IBV_CQ_INIT_ATTR_MASK_PD | IBV_CQ_INIT_ATTR_MASK_FLAGS) + + if (!check_comp_mask(attr->comp_mask, EFA_CREATE_CQ_SUPP_ATTR_MASK) || !check_comp_mask(attr->wc_flags, IBV_WC_STANDARD_FLAGS)) { verbs_err(verbs_get_ctx(ibvctx), "Invalid comp_mask or wc_flags\n"); @@ -1082,6 +1131,14 @@ static struct ibv_cq_ex *create_cq(struct ibv_context *ibvctx, return NULL; } + if (attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS && + !check_comp_mask(attr->flags, IBV_CREATE_CQ_ATTR_SINGLE_THREADED)) { + verbs_err(verbs_get_ctx(ibvctx), + "Invalid flags\n"); + errno = EOPNOTSUPP; + return NULL; + } + if (attr->channel && !EFA_DEV_CAP(ctx, CQ_NOTIFICATIONS)) { errno = EOPNOTSUPP; From 92ad54bff6071f08eb002f7add4b75712c65b406 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Tue, 12 Aug 2025 08:35:29 -0700 Subject: [PATCH 04/66] libibverbs: Rename ibv_reg_mr_in to ibv_mr_init_attr Rename the structure and input parameter to align with other libibverbs API calls. Signed-off-by: Sean Hefty --- libibverbs/cmd_mr.c | 41 ++++++++++++++++++++----------------- libibverbs/driver.h | 5 +++-- libibverbs/dummy_ops.c | 3 ++- libibverbs/man/ibv_reg_mr.3 | 8 ++++---- libibverbs/verbs.c | 27 ++++++++++++------------ libibverbs/verbs.h | 11 +++++----- providers/mlx5/mlx5.h | 2 +- providers/mlx5/verbs.c | 6 +++--- 8 files changed, 55 insertions(+), 48 deletions(-) diff --git a/libibverbs/cmd_mr.c b/libibverbs/cmd_mr.c index 1ec5416dd..bf0953253 100644 --- a/libibverbs/cmd_mr.c +++ b/libibverbs/cmd_mr.c @@ -158,49 +158,51 @@ int ibv_cmd_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, } int ibv_cmd_reg_mr_ex(struct ibv_pd *pd, struct verbs_mr *vmr, - struct ibv_reg_mr_in *in) + struct ibv_mr_init_attr *mr_init_attr) { DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_MR, UVERBS_METHOD_REG_MR, 11); - bool fd_based = (in->comp_mask & IBV_REG_MR_MASK_FD); + bool fd_based = (mr_init_attr->comp_mask & IBV_REG_MR_MASK_FD); struct ib_uverbs_attr *handle; - uint64_t length = in->length; + uint64_t length = 
mr_init_attr->length; uint32_t lkey, rkey; int ret; if (fd_based) { - if (!(in->comp_mask & IBV_REG_MR_MASK_FD_OFFSET) || - (in->comp_mask & IBV_REG_MR_MASK_ADDR)) { + if (!(mr_init_attr->comp_mask & IBV_REG_MR_MASK_FD_OFFSET) || + (mr_init_attr->comp_mask & IBV_REG_MR_MASK_ADDR)) { errno = EINVAL; return EINVAL; } - fill_attr_in_uint64(cmdb, UVERBS_ATTR_REG_MR_FD_OFFSET, in->fd_offset); + fill_attr_in_uint64(cmdb, UVERBS_ATTR_REG_MR_FD_OFFSET, + mr_init_attr->fd_offset); fill_attr_in_fd(cmdb, UVERBS_ATTR_REG_MR_FD, - in->fd); + mr_init_attr->fd); } else { - if ((in->comp_mask & IBV_REG_MR_MASK_FD_OFFSET) || - !(in->comp_mask & IBV_REG_MR_MASK_ADDR)) { + if ((mr_init_attr->comp_mask & IBV_REG_MR_MASK_FD_OFFSET) || + !(mr_init_attr->comp_mask & IBV_REG_MR_MASK_ADDR)) { errno = EINVAL; return EINVAL; } - fill_attr_in_uint64(cmdb, UVERBS_ATTR_REG_MR_ADDR, (uintptr_t)in->addr); - if (in->access & IBV_ACCESS_ON_DEMAND) { - if (in->length == SIZE_MAX && in->addr) { + fill_attr_in_uint64(cmdb, UVERBS_ATTR_REG_MR_ADDR, + (uintptr_t) mr_init_attr->addr); + if (mr_init_attr->access & IBV_ACCESS_ON_DEMAND) { + if (mr_init_attr->length == SIZE_MAX && mr_init_attr->addr) { errno = EINVAL; return EINVAL; } - if (in->length == SIZE_MAX) + if (mr_init_attr->length == SIZE_MAX) length = UINT64_MAX; } } - if (in->comp_mask & IBV_REG_MR_MASK_IOVA) { - fill_attr_in_uint64(cmdb, UVERBS_ATTR_REG_MR_IOVA, in->iova); + if (mr_init_attr->comp_mask & IBV_REG_MR_MASK_IOVA) { + fill_attr_in_uint64(cmdb, UVERBS_ATTR_REG_MR_IOVA, mr_init_attr->iova); } else { if (!fd_based) { fill_attr_in_uint64(cmdb, UVERBS_ATTR_REG_MR_IOVA, - (uintptr_t)in->addr); + (uintptr_t) mr_init_attr->addr); } else { /* iova is a must from kernel point of view */ errno = EINVAL; @@ -213,11 +215,12 @@ int ibv_cmd_reg_mr_ex(struct ibv_pd *pd, struct verbs_mr *vmr, fill_attr_out_ptr(cmdb, UVERBS_ATTR_REG_MR_RESP_RKEY, &rkey); fill_attr_in_obj(cmdb, UVERBS_ATTR_REG_MR_PD_HANDLE, pd->handle); fill_attr_in_uint64(cmdb, 
UVERBS_ATTR_REG_MR_LENGTH, length); - fill_attr_in_uint32(cmdb, UVERBS_ATTR_REG_MR_ACCESS_FLAGS, in->access); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + mr_init_attr->access); - if (in->comp_mask & IBV_REG_MR_MASK_DMAH) + if (mr_init_attr->comp_mask & IBV_REG_MR_MASK_DMAH) fill_attr_in_obj(cmdb, UVERBS_ATTR_REG_MR_DMA_HANDLE, - verbs_get_dmah(in->dmah)->handle); + verbs_get_dmah(mr_init_attr->dmah)->handle); ret = execute_ioctl(pd->context, cmdb); if (ret) diff --git a/libibverbs/driver.h b/libibverbs/driver.h index 30ea6fee2..98b862abc 100644 --- a/libibverbs/driver.h +++ b/libibverbs/driver.h @@ -466,7 +466,8 @@ struct verbs_context_ops { int fd, int access); struct ibv_mr *(*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access); - struct ibv_mr *(*reg_mr_ex)(struct ibv_pd *pd, struct ibv_reg_mr_in *in); + struct ibv_mr *(*reg_mr_ex)(struct ibv_pd *pd, + struct ibv_mr_init_attr *mr_init_attr); int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); int (*rereg_mr)(struct verbs_mr *vmr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access); @@ -590,7 +591,7 @@ int ibv_cmd_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, struct verbs_mr *vmr, struct ibv_command_buffer *driver); int ibv_cmd_reg_mr_ex(struct ibv_pd *pd, struct verbs_mr *vmr, - struct ibv_reg_mr_in *in); + struct ibv_mr_init_attr *mr_init_attr); int ibv_cmd_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type, struct ibv_mw *mw, struct ibv_alloc_mw *cmd, size_t cmd_size, diff --git a/libibverbs/dummy_ops.c b/libibverbs/dummy_ops.c index 4834d7197..1c18d0f18 100644 --- a/libibverbs/dummy_ops.c +++ b/libibverbs/dummy_ops.c @@ -461,7 +461,8 @@ static struct ibv_mr *reg_mr(struct ibv_pd *pd, void *addr, size_t length, return NULL; } -static struct ibv_mr *reg_mr_ex(struct ibv_pd *pd, struct ibv_reg_mr_in *in) +static struct ibv_mr *reg_mr_ex(struct ibv_pd *pd, + struct ibv_mr_init_attr *mr_init_attr) { errno = EOPNOTSUPP; 
return NULL; diff --git a/libibverbs/man/ibv_reg_mr.3 b/libibverbs/man/ibv_reg_mr.3 index 668daab04..9fcb69a09 100644 --- a/libibverbs/man/ibv_reg_mr.3 +++ b/libibverbs/man/ibv_reg_mr.3 @@ -19,7 +19,7 @@ ibv_reg_mr, ibv_reg_mr_iova, ibv_reg_dmabuf_mr, ibv_reg_mr_ex, ibv_dereg_mr \- r .BI " size_t " "length" ", uint64_t " "iova" , .BI " int " "fd" ", int " "access" ); .sp -.BI "struct ibv_mr *ibv_reg_mr_ex(struct ibv_pd " "*pd" ", struct ibv_reg_mr_in " in" ); +.BI "struct ibv_mr *ibv_reg_mr_ex(struct ibv_pd " "*pd" ", struct ibv_mr_init_attr "*mr_init_attr" ); .sp .BI "int ibv_dereg_mr(struct ibv_mr " "*mr" ); .fi @@ -124,10 +124,10 @@ describes the desired memory protection attributes; it is similar to the ibv_reg .B ibv_reg_mr_ex() ibv_reg_mr_ex is a an API that enables all the variants of the other ibv_reg_mr_xxx() that desecibed in that man page. It has the -.I in->comp_mask +.I mr_init_attr->comp_mask field to let application mark which fields are applicable. In addition, it includes the -.I in->dmah +.I mr_init_attr->dmah which can be used to include an ibv_dmah object that will be used for that MR. The other fields on the input pointer have the same meaning as of the fields that described in that man page for the other verbs. .PP @@ -153,7 +153,7 @@ returns 0 on success, or the value of errno on failure (which indicates the fail .B ibv_dereg_mr() fails if any memory window is still bound to this MR. .B ibv_dereg_mr_ex() -One among in->fd and in->addr is required, both can't come together. +One among mr_init_attr->fd and mr_init_attr->addr is required, both can't come together. 
.SH "SEE ALSO" .BR ibv_alloc_pd (3), .BR ibv_post_send (3), diff --git a/libibverbs/verbs.c b/libibverbs/verbs.c index f9f2a03bf..c09bff8eb 100644 --- a/libibverbs/verbs.c +++ b/libibverbs/verbs.c @@ -430,37 +430,38 @@ struct ibv_mr *ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, return mr; } -struct ibv_mr *ibv_reg_mr_ex(struct ibv_pd *pd, struct ibv_reg_mr_in *in) +/* Note: mr_init_attr may be modified during this call */ +struct ibv_mr *ibv_reg_mr_ex(struct ibv_pd *pd, struct ibv_mr_init_attr *mr_init_attr) { struct verbs_device *device = verbs_get_device(pd->context->device); struct ibv_mr *mr; - int in_access = in->access; - bool need_fork = !((in->access & IBV_ACCESS_ON_DEMAND) || - (in->comp_mask & IBV_REG_MR_MASK_FD)); + int in_access = mr_init_attr->access; + bool need_fork = !((mr_init_attr->access & IBV_ACCESS_ON_DEMAND) || + (mr_init_attr->comp_mask & IBV_REG_MR_MASK_FD)); - if (need_fork && ibv_dontfork_range(in->addr, in->length)) + if (need_fork && ibv_dontfork_range(mr_init_attr->addr, mr_init_attr->length)) return NULL; if (!(device->core_support & IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS)) - in->access &= ~IBV_ACCESS_OPTIONAL_RANGE; + mr_init_attr->access &= ~IBV_ACCESS_OPTIONAL_RANGE; - mr = get_ops(pd->context)->reg_mr_ex(pd, in); + mr = get_ops(pd->context)->reg_mr_ex(pd, mr_init_attr); if (mr) { mr->context = pd->context; - mr->length = in->length; + mr->length = mr_init_attr->length; mr->pd = pd; - if (in->comp_mask & IBV_REG_MR_MASK_ADDR) - mr->addr = in->addr; + if (mr_init_attr->comp_mask & IBV_REG_MR_MASK_ADDR) + mr->addr = mr_init_attr->addr; else /* Follows ibv_reg_dmabuf_mr logic */ - mr->addr = (void *)(uintptr_t)in->fd_offset; + mr->addr = (void *)(uintptr_t) mr_init_attr->fd_offset; } else { if (need_fork) - ibv_dofork_range(in->addr, in->length); + ibv_dofork_range(mr_init_attr->addr, mr_init_attr->length); } /* restore the input access flags */ - in->access = in_access; + mr_init_attr->access = in_access; return mr; } diff 
--git a/libibverbs/verbs.h b/libibverbs/verbs.h index 908c87f53..1cd6408c1 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -681,7 +681,7 @@ struct ibv_mr { uint32_t rkey; }; -enum ibv_reg_mr_in_mask { +enum ibv_mr_init_attr_mask { IBV_REG_MR_MASK_IOVA = 1 << 0, IBV_REG_MR_MASK_ADDR = 1 << 1, IBV_REG_MR_MASK_FD = 1 << 2, @@ -689,10 +689,10 @@ enum ibv_reg_mr_in_mask { IBV_REG_MR_MASK_DMAH = 1 << 4, }; -struct ibv_reg_mr_in { +struct ibv_mr_init_attr { size_t length; int access; - uint64_t comp_mask; /* Use enum ibv_reg_mr_in_mask */ + uint64_t comp_mask; /* Use enum ibv_mr_init_attr_mask */ uint64_t iova; void *addr; int fd; @@ -2181,7 +2181,8 @@ struct ibv_values_ex { struct verbs_context { /* "grows up" - new fields go here */ - struct ibv_mr *(*reg_mr_ex)(struct ibv_pd *pd, struct ibv_reg_mr_in *in); + struct ibv_mr *(*reg_mr_ex)(struct ibv_pd *pd, + struct ibv_mr_init_attr *mr_init_attr); int (*dealloc_dmah)(struct ibv_dmah *dmah); struct ibv_dmah *(*alloc_dmah)(struct ibv_context *context, struct ibv_dmah_init_attr *attr); @@ -2677,7 +2678,7 @@ __ibv_reg_mr_iova(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, struct ibv_mr *ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); -struct ibv_mr *ibv_reg_mr_ex(struct ibv_pd *pd, struct ibv_reg_mr_in *in); +struct ibv_mr *ibv_reg_mr_ex(struct ibv_pd *pd, struct ibv_mr_init_attr *mr_init_attr); enum ibv_rereg_mr_err_code { /* Old MR is valid, invalid input */ diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index 0038a9a53..d98db9aa1 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -1143,7 +1143,7 @@ struct ibv_mr *mlx5_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access); struct ibv_mr *mlx5_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); -struct ibv_mr *mlx5_reg_mr_ex(struct ibv_pd *pd, struct ibv_reg_mr_in *in); +struct ibv_mr 
*mlx5_reg_mr_ex(struct ibv_pd *pd, struct ibv_mr_init_attr *mr_init_attr); int mlx5_rereg_mr(struct verbs_mr *mr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access); int mlx5_dereg_mr(struct verbs_mr *mr); diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 5c9bd465d..49894ddc9 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -651,7 +651,7 @@ struct ibv_mr *mlx5_reg_mr(struct ibv_pd *pd, void *addr, size_t length, return &mr->vmr.ibv_mr; } -struct ibv_mr *mlx5_reg_mr_ex(struct ibv_pd *pd, struct ibv_reg_mr_in *in) +struct ibv_mr *mlx5_reg_mr_ex(struct ibv_pd *pd, struct ibv_mr_init_attr *mr_init_attr) { struct mlx5_mr *mr; int ret; @@ -660,12 +660,12 @@ struct ibv_mr *mlx5_reg_mr_ex(struct ibv_pd *pd, struct ibv_reg_mr_in *in) if (!mr) return NULL; - ret = ibv_cmd_reg_mr_ex(pd, &mr->vmr, in); + ret = ibv_cmd_reg_mr_ex(pd, &mr->vmr, mr_init_attr); if (ret) { free(mr); return NULL; } - mr->alloc_flags = in->access; + mr->alloc_flags = mr_init_attr->access; return &mr->vmr.ibv_mr; } From 091ddb56b18aa4da19322a10ac5525bc036714d7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 13 Aug 2025 09:12:55 -0400 Subject: [PATCH 05/66] Update library version to be 60.0 Signed-off-by: Leon Romanovsky --- CMakeLists.txt | 2 +- debian/changelog | 2 +- redhat/rdma-core.spec | 2 +- suse/rdma-core.spec | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b520a40a..0606e29c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,7 +79,7 @@ endif() set(PACKAGE_NAME "RDMA") # See Documentation/versioning.md -set(PACKAGE_VERSION "59.0") +set(PACKAGE_VERSION "60.0") # When this is changed the values in these files need changing too: # debian/control # debian/libibverbs1.symbols diff --git a/debian/changelog b/debian/changelog index bce59e3cf..70207d365 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -rdma-core (59.0-1) unstable; urgency=medium +rdma-core 
(60.0-1) unstable; urgency=medium * New upstream release. diff --git a/redhat/rdma-core.spec b/redhat/rdma-core.spec index aa9defae6..a57743d12 100644 --- a/redhat/rdma-core.spec +++ b/redhat/rdma-core.spec @@ -1,5 +1,5 @@ Name: rdma-core -Version: 59.0 +Version: 60.0 Release: 1%{?dist} Summary: RDMA core userspace libraries and daemons diff --git a/suse/rdma-core.spec b/suse/rdma-core.spec index 8c125ab54..869fa2580 100644 --- a/suse/rdma-core.spec +++ b/suse/rdma-core.spec @@ -28,7 +28,7 @@ %define git_ver %{nil} Name: rdma-core -Version: 59.0 +Version: 60.0 Release: 0 Summary: RDMA core userspace libraries and daemons License: BSD-2-Clause OR GPL-2.0-only From 003724a29ad7957f7564b7bbb6fdc20e253d06fb Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Tue, 12 Aug 2025 11:51:38 -0500 Subject: [PATCH 06/66] providers/mana: Add error code mappings for retry and rnr timeouts Add vendor error code definitions and corresponding work completion status mappings for retry and rnr timeout exceeded errors. 
Signed-off-by: Shiraz Saleem --- providers/mana/cq.c | 4 ++++ providers/mana/gdma.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/providers/mana/cq.c b/providers/mana/cq.c index dd8c54cc9..91b751948 100644 --- a/providers/mana/cq.c +++ b/providers/mana/cq.c @@ -383,6 +383,10 @@ static enum ibv_wc_status vendor_error_to_wc_error(uint32_t vendor_error) case VENDOR_ERR_RX_NOT_EMPTY_ON_DISABLE: case VENDOR_ERR_SW_FLUSHED: return IBV_WC_WR_FLUSH_ERR; + case VENDOR_ERR_TX_RETRY_LIMIT_EXCEEDED: + return IBV_WC_RETRY_EXC_ERR; + case VENDOR_ERR_RX_RNR_NAK: + return IBV_WC_RNR_RETRY_EXC_ERR; default: return IBV_WC_GENERAL_ERR; } diff --git a/providers/mana/gdma.h b/providers/mana/gdma.h index fd02c2311..ee0b80831 100644 --- a/providers/mana/gdma.h +++ b/providers/mana/gdma.h @@ -249,10 +249,12 @@ enum mana_error_code { VENDOR_ERR_RX_INVALID_REQ_NAK = 0x161, VENDOR_ERR_RX_REMOTE_ACCESS_NAK = 0x162, VENDOR_ERR_RX_REMOTE_OP_ERR_NAK = 0x163, + VENDOR_ERR_RX_RNR_NAK = 0x164, VENDOR_ERR_RX_ATB_SGE_ADDR_RIGHT = 0x183, VENDOR_ERR_RX_ATB_WQE_ADDR_RIGHT = 0x185, VENDOR_ERR_RX_ATB_SGE_ADDR_RANGE = 0x1c3, VENDOR_ERR_RX_ATB_WQE_ADDR_RANGE = 0x1c5, + VENDOR_ERR_TX_RETRY_LIMIT_EXCEEDED = 0x1c6, VENDOR_ERR_RX_NOT_EMPTY_ON_DISABLE = 0x1c7, VENDOR_ERR_TX_GDMA_CORRUPTED_WQE = 0x201, VENDOR_ERR_TX_ATB_WQE_ACCESS_VIOLATION = 0x202, From 3735e62cf6524eab70889da8bf22b57b91adaac5 Mon Sep 17 00:00:00 2001 From: Daniel Hayon Date: Wed, 23 Jul 2025 17:07:30 +0300 Subject: [PATCH 07/66] tests: Fix RDMA transport domain test capability validation Add missing TX capability check in test_flow_rdma_transport_domain_traffic. The test only validated RX flow table support, but should check both RX and TX flow table capabilities. 
Fixes: fb811423bb9d ("tests: Add test to cover RDMA transport domain") Signed-off-by: Daniel Hayon Signed-off-by: Edward Srouji --- tests/test_mlx5_flow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_mlx5_flow.py b/tests/test_mlx5_flow.py index 48386feef..9e934d4ae 100644 --- a/tests/test_mlx5_flow.py +++ b/tests/test_mlx5_flow.py @@ -86,7 +86,8 @@ def check_rdma_transport_domain_caps(agr_obj): query_adv_rdma_cap_out = QueryAdvRdmaCapOut(agr_obj.ctx.devx_general_cmd( query_adv_rdma_cap_in, len(QueryAdvRdmaCapOut()))) - if not query_adv_rdma_cap_out.capability.rdma_transport_rx_flow_table_properties.ft_support: + if not query_adv_rdma_cap_out.capability.rdma_transport_rx_flow_table_properties.ft_support or \ + not query_adv_rdma_cap_out.capability.rdma_transport_tx_flow_table_properties.ft_support: raise unittest.SkipTest("The device doesn't support the RDMA transport domain") From 648c9514834b1864e53531aeaefcef73351308a6 Mon Sep 17 00:00:00 2001 From: Shachar Kagan Date: Tue, 29 Oct 2024 15:04:24 +0200 Subject: [PATCH 08/66] tests: Update PCIE mapping flag of mlx5 DMABUF Set the appropriate CUDA flag when creating FD for mlx5 DMABUF. This flag is mandatory for data-direct traffic. 
Signed-off-by: Shachar Kagan Signed-off-by: Edward Srouji --- tests/test_mlx5_dmabuf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_mlx5_dmabuf.py b/tests/test_mlx5_dmabuf.py index 1e0915179..a4de7fa26 100644 --- a/tests/test_mlx5_dmabuf.py +++ b/tests/test_mlx5_dmabuf.py @@ -72,11 +72,13 @@ def create_mr(self): int(self.cuda_addr))) cuda_flag = cuda.CUmemRangeHandleType.CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD + cuda_dma_mapping_type = cuda.CUmemRangeFlags.CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE \ + if self.mlx5_access else 0 dmabuf_fd = cu.check_cuda_errors( cuda.cuMemGetHandleForAddressRange(self.cuda_addr, GPU_PAGE_SIZE, cuda_flag, - 0)) + cuda_dma_mapping_type)) try: self.mr = Mlx5DmaBufMR(self.pd, offset=0, length=self.msg_size, access=self.mr_access, fd=dmabuf_fd, mlx5_access=self.mlx5_access) From d1669a9cc3ac5dd98d182bb7fdbbdab178ee79a2 Mon Sep 17 00:00:00 2001 From: Elyashiv Cohen Date: Mon, 10 Feb 2025 15:46:23 +0200 Subject: [PATCH 09/66] tests: Update CmdHcaCap in mlx5 PRM struct Field relaxed_ordering_read is deprecated in the older position and was changed to relaxed_ordering_read_pci_enabled. A new field was added for relaxed_ordering_read.
Signed-off-by: Elyashiv Cohen Signed-off-by: Edward Srouji --- tests/mlx5_prm_structs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/mlx5_prm_structs.py b/tests/mlx5_prm_structs.py index 16eaada16..8dca2e432 100644 --- a/tests/mlx5_prm_structs.py +++ b/tests/mlx5_prm_structs.py @@ -878,7 +878,7 @@ class CmdHcaCap(PRMPacket): BitField('log_max_cq', 0, 5), ByteField('log_max_eq_sz', 0), BitField('relaxed_ordering_write', 0, 1), - BitField('relaxed_ordering_read', 0, 1), + BitField('relaxed_ordering_read_pci_enabled', 0, 1), BitField('log_max_mkey', 0, 6), BitField('tunneled_atomic', 0, 1), BitField('as_notify', 0, 1), @@ -1114,7 +1114,9 @@ class CmdHcaCap(PRMPacket): ByteField('log_max_mcg', 0), BitField('reserved25', 0, 3), BitField('log_max_transport_domain', 0, 5), - BitField('reserved26', 0, 3), + BitField('tir_esw_lb_filter_disable', 0, 1), + BitField('reserved26', 0, 1), + BitField('relaxed_ordering_read', 0, 1), BitField('log_max_pd', 0, 5), BitField('reserved27', 0, 11), BitField('log_max_xrcd', 0, 5), From 5f6cefd10c1259744ed1286a508c3808c3b58696 Mon Sep 17 00:00:00 2001 From: Maxim Chicherin Date: Tue, 19 Mar 2024 15:02:00 +0200 Subject: [PATCH 10/66] pyverbs: Add DevX events API Add all necessary support for DevX events in pyverbs. 
Signed-off-by: Maxim Chicherin Signed-off-by: Edward Srouji --- pyverbs/device.pxd | 1 + pyverbs/device.pyx | 3 +- pyverbs/providers/mlx5/CMakeLists.txt | 1 + pyverbs/providers/mlx5/libmlx5.pxd | 17 +++ pyverbs/providers/mlx5/mlx5_enums.pxd | 3 + pyverbs/providers/mlx5/mlx5dv_event.pxd | 18 +++ pyverbs/providers/mlx5/mlx5dv_event.pyx | 188 ++++++++++++++++++++++++ 7 files changed, 230 insertions(+), 1 deletion(-) create mode 100644 pyverbs/providers/mlx5/mlx5dv_event.pxd create mode 100644 pyverbs/providers/mlx5/mlx5dv_event.pyx diff --git a/pyverbs/device.pxd b/pyverbs/device.pxd index 7143ab0bf..4f9c7ca94 100644 --- a/pyverbs/device.pxd +++ b/pyverbs/device.pxd @@ -27,6 +27,7 @@ cdef class Context(PyverbsCM): cdef object wqs cdef object rwq_ind_tbls cdef object crypto_logins + cdef object event_channels cdef class DeviceAttr(PyverbsObject): cdef v.ibv_device_attr dev_attr diff --git a/pyverbs/device.pyx b/pyverbs/device.pyx index 82baefcb6..aecfb9357 100644 --- a/pyverbs/device.pyx +++ b/pyverbs/device.pyx @@ -122,6 +122,7 @@ cdef class Context(PyverbsCM): self.wqs = weakref.WeakSet() self.rwq_ind_tbls = weakref.WeakSet() self.crypto_logins = weakref.WeakSet() + self.event_channels = weakref.WeakSet() self.name = kwargs.get('name') provider_attr = kwargs.get('attr') @@ -178,7 +179,7 @@ cdef class Context(PyverbsCM): self.logger.debug('Closing Context') close_weakrefs([self.qps, self.crypto_logins, self.rwq_ind_tbls, self.wqs, self.ccs, self.cqs, self.dms, self.pds, self.xrcds, self.vars, self.sched_leafs, - self.sched_nodes, self.dr_domains]) + self.sched_nodes, self.dr_domains, self.event_channels]) rc = v.ibv_close_device(self.context) if rc != 0: raise PyverbsRDMAErrno(f'Failed to close device {self.name}') diff --git a/pyverbs/providers/mlx5/CMakeLists.txt b/pyverbs/providers/mlx5/CMakeLists.txt index 75967fff2..03266c504 100644 --- a/pyverbs/providers/mlx5/CMakeLists.txt +++ b/pyverbs/providers/mlx5/CMakeLists.txt @@ -13,6 +13,7 @@ 
rdma_cython_module(pyverbs/providers/mlx5 mlx5 mlx5dv.pyx mlx5dv_crypto.pyx mlx5dv_dmabuf.pyx + mlx5dv_event.pyx mlx5dv_flow.pyx mlx5dv_mkey.pyx mlx5dv_objects.pyx diff --git a/pyverbs/providers/mlx5/libmlx5.pxd b/pyverbs/providers/mlx5/libmlx5.pxd index 74eb41e86..bef34bfc4 100644 --- a/pyverbs/providers/mlx5/libmlx5.pxd +++ b/pyverbs/providers/mlx5/libmlx5.pxd @@ -356,6 +356,13 @@ cdef extern from 'infiniband/mlx5dv.h': uint8_t signature uint8_t op_own + cdef struct mlx5dv_devx_event_channel: + int fd + + cdef struct mlx5dv_devx_async_event_hdr: + uint64_t cookie + uint8_t *out_data + void mlx5dv_set_ctrl_seg(mlx5_wqe_ctrl_seg *seg, uint16_t pi, uint8_t opcode, uint8_t opmod, uint32_t qp_num, uint8_t fm_ce_se, @@ -567,6 +574,16 @@ cdef extern from 'infiniband/mlx5dv.h': mlx5dv_devx_eq *mlx5dv_devx_create_eq(v.ibv_context *context, const void *_in, size_t inlen, void *out, size_t outlen) int mlx5dv_devx_destroy_eq(mlx5dv_devx_eq *eq) + mlx5dv_devx_event_channel *mlx5dv_devx_create_event_channel(v.ibv_context *context, + mlx5dv_devx_create_event_channel_flags flags) + void mlx5dv_devx_destroy_event_channel(mlx5dv_devx_event_channel *dv_event_channel) + int mlx5dv_devx_subscribe_devx_event(mlx5dv_devx_event_channel *event_channel, + mlx5dv_devx_obj *obj, uint16_t events_sz, + uint16_t events_num[], uint64_t cookie) + int mlx5dv_devx_subscribe_devx_event_fd(mlx5dv_devx_event_channel *event_channel, int fd, + mlx5dv_devx_obj *obj, uint16_t event_num) + ssize_t mlx5dv_devx_get_event(mlx5dv_devx_event_channel *event_channel, + mlx5dv_devx_async_event_hdr *event_data, size_t event_resp_len) # Mkey setters void mlx5dv_wr_mkey_configure(mlx5dv_qp_ex *mqp, mlx5dv_mkey *mkey, diff --git a/pyverbs/providers/mlx5/mlx5_enums.pxd b/pyverbs/providers/mlx5/mlx5_enums.pxd index 70b81d52f..ee1b7e7e2 100644 --- a/pyverbs/providers/mlx5/mlx5_enums.pxd +++ b/pyverbs/providers/mlx5/mlx5_enums.pxd @@ -334,3 +334,6 @@ cdef extern from 'infiniband/mlx5_api.h': cdef int 
MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL cdef int MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT + + cpdef enum mlx5dv_devx_create_event_channel_flags: + MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA diff --git a/pyverbs/providers/mlx5/mlx5dv_event.pxd b/pyverbs/providers/mlx5/mlx5dv_event.pxd new file mode 100644 index 000000000..2c183f0c3 --- /dev/null +++ b/pyverbs/providers/mlx5/mlx5dv_event.pxd @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2025 Nvidia Inc. All rights reserved. See COPYING file + +#cython: language_level=3 + +cimport pyverbs.providers.mlx5.libmlx5 as dv +from pyverbs.base cimport PyverbsCM, PyverbsObject + + +cdef class EventChannel(PyverbsCM): + cdef dv.mlx5dv_devx_event_channel *ec + +cdef class EventHeader(PyverbsObject): + cdef object cookie + cdef object data + +cdef class EventFD(PyverbsObject): + cdef int fd diff --git a/pyverbs/providers/mlx5/mlx5dv_event.pyx b/pyverbs/providers/mlx5/mlx5dv_event.pyx new file mode 100644 index 000000000..e697c65e0 --- /dev/null +++ b/pyverbs/providers/mlx5/mlx5dv_event.pyx @@ -0,0 +1,188 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2025 Nvidia Inc. All rights reserved. See COPYING file + +#cython: language_level=3 + +from libc.stdint cimport uint8_t, uint16_t, uint64_t +from libc.stdlib cimport free, malloc +from cpython.bytes cimport PyBytes_FromStringAndSize +from pyverbs.providers.mlx5.mlx5dv cimport Mlx5DevxObj +cimport pyverbs.providers.mlx5.libmlx5 as dv +from pyverbs.pyverbs_error import PyverbsError +from pyverbs.base import PyverbsRDMAErrno +from pyverbs.device cimport Context + +cdef extern from 'unistd.h': + ssize_t read(int fd, void *buf, size_t count) +cdef extern from 'sys/eventfd.h': + int eventfd(unsigned int initval, int flags) + + +DEFAULT_EQE_SIZE = 1024 + + +cdef class EventChannel(PyverbsCM): + """ + EventChannel class represents Devx Event Channel + (mlx5dv_devx_event_channel rdma-core struct). 
+ """ + def __init__(self, Context context not None, flags=0): + """ + EventChannel class represents Devx Event Channel + (mlx5dv_devx_event_channel rdma-core struct). + :param context: Context to create the schedule resources on. + :param flags: Create flags of the event channel. + """ + super().__init__() + self.ec = dv.mlx5dv_devx_create_event_channel(context.context, flags) + if self.ec == NULL: + raise PyverbsRDMAErrno('Failed to create a devx event channel') + context.event_channels.add(self) + + def subscribe(self, events_num, cookie=None, Mlx5DevxObj obj=None, + EventFD fd=None): + """ + Subscribe to the event channel. + :param self: The event channel instance to subscribe to + :param events_num: An array that contains event type numbers to + subscribe. If fd is given, it can be either a number or an array with + one number (since subscription with fd supports only one event number) + :param cookie: A 64b number that can be used as an ID for the + subscription + :param obj: The object that's related to subscribed event (None in case + of an unaffiliated event) + :param fd: EventFD. 
If given, the subscription will be done on the + eventfd using the event_fd subscription API + """ + if fd: + if cookie: + self.logger.warning('Cookie is ignored in fd subscription') + if hasattr(events_num, '__len__') and len(events_num) > 1: + raise PyverbsError('Only one event must be provided in ' + 'events_num in fd subscription') + event_num = events_num[0] if hasattr(events_num, '__len__') else \ + events_num + self._subscribe_fd(event_num, fd, obj) + else: + if cookie is None: + raise PyverbsError('Cookie must be provided with subscription') + self._subscribe(events_num, cookie, obj) + + def _subscribe(self, events_num, cookie, Mlx5DevxObj obj=None): + cdef dv.mlx5dv_devx_obj *devx_obj = obj.obj if obj else NULL + # Size of each event number is 16b which is 2 Bytes + size = len(events_num) * 2 + cdef uint16_t *events_n = malloc(size) + if not events_n: + raise PyverbsRDMAErrno('Couldn\'t allocate array for events_num') + for i in range(len(events_num)): + events_n[i] = events_num[i] + if dv.mlx5dv_devx_subscribe_devx_event(self.ec, devx_obj, size, + events_n, cookie): + raise PyverbsRDMAErrno('Failed to subscribe to devx event channel') + free(events_n) + + def _subscribe_fd(self, event_num, EventFD fd, Mlx5DevxObj obj=None): + cdef dv.mlx5dv_devx_obj *devx_obj = obj.obj if obj else NULL + if dv.mlx5dv_devx_subscribe_devx_event_fd(self.ec, (fd).fd, + devx_obj, event_num): + raise PyverbsRDMAErrno('Failed to subscribe to devx event channel fd') + + def get_event(self, event_resp_len=DEFAULT_EQE_SIZE, EventFD fd=None): + """ + Gets an event (if any) that the user subscribed to this channel. + If no events were generated this function will block until the arrival + of an event. + :param event_resp_len: The size in bytes of the allocated event + response. (1024B by default, which satisfies all current event types) + :param fd: EventFD. 
If given, will use read function directly on the + eventfd to read any gotten events + :return: EventHeader or the read value from the eventfd in case of fd + """ + if fd: + return self._get_event_fd(fd) + return self._get_event(event_resp_len) + + def _get_event(self, event_resp_len=DEFAULT_EQE_SIZE): + cdef dv.mlx5dv_devx_async_event_hdr *event_data = \ + malloc(event_resp_len) + if not event_data: + raise PyverbsRDMAErrno('Couldn\'t allocate array for events_data') + bytes_read = dv.mlx5dv_devx_get_event(self.ec, event_data, + event_resp_len) + if bytes_read < 0: + free(event_data) + raise PyverbsRDMAErrno('Failed to get devx event') + data_bytes = PyBytes_FromStringAndSize(event_data.out_data, + bytes_read) + event_header = EventHeader(event_data.cookie, data_bytes) + free(event_data) + return event_header + + def _get_event_fd(self, fd): + cdef uint64_t buff + rc = read(fd.fd, &buff, sizeof(buff)) + if rc < 0: + raise PyverbsRDMAErrno('Failed to get event from FD') + return buff + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.logger: + self.logger.debug('Closing EventChannel') + if self.ec != NULL: + dv.mlx5dv_devx_destroy_event_channel(self.ec) + self.ec = NULL + + @property + def fd(self): + return self.ec.fd + + +cdef class EventHeader(PyverbsObject): + """ + Event header that contains the event cookie and the event data. + An instance of this class is returned to the Python user by + EventChannel.get_event() function to represent the returned event in a + friendly manner. + """ + def __init__(self, cookie, data): + """ + Create EventHeader object + :param cookie: Cookie retrieved from the event (used upon subscription) + :param data: Event data + """ + super().__init__() + self.cookie = cookie + self.data = data + + @property + def cookie(self): + return self.cookie + + @property + def data(self): + return self.data + + +cdef class EventFD(PyverbsObject): + """ + Represent eventfd, a file descriptor for event notification. 
+ """ + def __init__(self, initval=0, flags=0): + """ + Create an EventFD. + :param initval: The eventfd counter initial value + :param flags: The eventfd flags + """ + super().__init__() + event_fd = eventfd(initval, flags) + if event_fd < 0: + raise PyverbsRDMAErrno('Failed to create a event fd') + self.fd = event_fd + + @property + def fd(self): + return self.fd From e304dfe687f96bae09669f596f75999c8e22cc5b Mon Sep 17 00:00:00 2001 From: Maxim Chicherin Date: Mon, 1 Apr 2024 16:15:36 +0300 Subject: [PATCH 11/66] tests: Add tests for DevX events Add tests for DevX events with cq error. Signed-off-by: Maxim Chicherin Signed-off-by: Edward Srouji --- tests/CMakeLists.txt | 1 + tests/test_mlx5_devx_async_events.py | 90 ++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 tests/test_mlx5_devx_async_events.py diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d50308d00..0570b8d2b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -28,6 +28,7 @@ rdma_python_test(tests test_mlx5_cuda_umem.py test_mlx5_dc.py test_mlx5_devx.py + test_mlx5_devx_async_events.py test_mlx5_dm_ops.py test_mlx5_dma_memcpy.py test_mlx5_dmabuf.py diff --git a/tests/test_mlx5_devx_async_events.py b/tests/test_mlx5_devx_async_events.py new file mode 100644 index 000000000..99114e54e --- /dev/null +++ b/tests/test_mlx5_devx_async_events.py @@ -0,0 +1,90 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2025 Nvidia Inc. All rights reserved. See COPYING file + +""" +Test module for mlx5 DevX Async Events. 
+""" + +from pyverbs.providers.mlx5.mlx5_enums import mlx5dv_devx_create_event_channel_flags +from pyverbs.providers.mlx5.mlx5dv_event import EventChannel, EventFD +from pyverbs.pyverbs_error import PyverbsError +from tests.mlx5_base import Mlx5DevxRcResources, Mlx5DevxTrafficBase +import tests.utils as u + +import time +import sys + + +EVENT_COOKIE = 100 +OMIT_EV_DATA = mlx5dv_devx_create_event_channel_flags.MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA +POLL_EVENT_TIMEOUT = 10 +CQ_OVERRUN = 0x1 + + +def get_event(event_c, fd=None, count=1): + """ + Polls given event channel 'count' number of times + :param event_c: Event channel + :param fd: File descriptor + :param count: Number of expected events + :return: List of event responses + """ + events_resp = [] + start_poll_t = time.perf_counter() + + while count > 0 and (time.perf_counter() - start_poll_t < POLL_EVENT_TIMEOUT): + try: + event = event_c.get_event(fd=fd) + events_resp.append(event) + except PyverbsError as err: + raise err + count -= 1 + + if count > 0: + raise PyverbsError(f'Got timeout on polling ({count} EQEs remaining)') + + return events_resp + + +class Mlx5EventsTrafficTest(Mlx5DevxTrafficBase): + """ + Test various functionality of mlx5 events + """ + def cq_error_event(self, with_fd=True): + """ + Creates DEVX resources and executes traffic without polling the CQ, + which generates CQ error + """ + from tests.mlx5_prm_structs import EventType + self.create_players(Mlx5DevxRcResources) + fd = EventFD() if with_fd else None + cookie = EVENT_COOKIE if not with_fd else None + flags = OMIT_EV_DATA if with_fd else 0 + ec = EventChannel(self.server.ctx, flags=flags) + ec.subscribe([EventType.CQ_ERROR], cookie=cookie, obj=self.server.cq, fd=fd) + for _ in range(self.client.num_msgs): + # Post send/recv without polling the CQ in order to exceed its size for CQ error event + self.server.post_recv() + self.client.post_send() + return get_event(ec, fd) + + def test_cq_error_event_fd(self): + """ + 
CQ error event test with FD + """ + comp_events = self.cq_error_event() + self.assertEqual(len(comp_events), 1) + self.assertEqual(comp_events[0], 1) + + def test_cq_error_event_cookie(self): + """ + CQ error event test with cookie + """ + from tests.mlx5_prm_structs import SwEqe, CreateCqOut + comp_events = self.cq_error_event(with_fd=False) + self.assertEqual(len(comp_events), 1) + data = SwEqe(comp_events[0].data).event_data + cqn = CreateCqOut(self.server.cq.out_view).cqn + self.assertEqual(comp_events[0].cookie, EVENT_COOKIE) + self.assertEqual(data.cqn, cqn) + self.assertEqual(data.syndrome, CQ_OVERRUN) From ebc482281bf9a0f6de6794dc287436256b387ccc Mon Sep 17 00:00:00 2001 From: Maxim Chicherin Date: Mon, 6 Jan 2025 01:56:48 +0200 Subject: [PATCH 12/66] pyverbs: Extend mlx5dv_flow Extend Mlx5FlowActionAttr to support counters. Adjust QP action setter in Mlx5FlowActionAttr. Adjust type attribute to support more actions. Add Mlx5ModifyFlowAction action. Extend Mlx5Flow to support counters. 
Signed-off-by: Maxim Chicherin Signed-off-by: Edward Srouji --- pyverbs/providers/mlx5/libmlx5.pxd | 3 ++ pyverbs/providers/mlx5/mlx5dv_flow.pxd | 6 ++- pyverbs/providers/mlx5/mlx5dv_flow.pyx | 61 +++++++++++++++++++++++--- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/pyverbs/providers/mlx5/libmlx5.pxd b/pyverbs/providers/mlx5/libmlx5.pxd index bef34bfc4..149244e8d 100644 --- a/pyverbs/providers/mlx5/libmlx5.pxd +++ b/pyverbs/providers/mlx5/libmlx5.pxd @@ -488,6 +488,9 @@ cdef extern from 'infiniband/mlx5dv.h': void *data, unsigned char reformat_type, unsigned char ft_type) + v.ibv_flow_action *mlx5dv_create_flow_action_modify_header(v.ibv_context *ctx, + size_t actions_sz, void *actions, + unsigned char ft_type) v.ibv_mr *mlx5dv_reg_dmabuf_mr(v.ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) int mlx5dv_get_data_direct_sysfs_path(v.ibv_context *context, char *buf, size_t buf_len) diff --git a/pyverbs/providers/mlx5/mlx5dv_flow.pxd b/pyverbs/providers/mlx5/mlx5dv_flow.pxd index a0b70b9f8..09d437f6e 100644 --- a/pyverbs/providers/mlx5/mlx5dv_flow.pxd +++ b/pyverbs/providers/mlx5/mlx5dv_flow.pxd @@ -25,9 +25,13 @@ cdef class Mlx5FlowActionAttr(PyverbsObject): cdef dv.mlx5dv_flow_action_attr attr cdef object qp cdef object action + cdef object devx_obj cdef class Mlx5Flow(Flow): - pass + cdef object devx_obj cdef class Mlx5PacketReformatFlowAction(FlowAction): pass + +cdef class Mlx5ModifyFlowAction(FlowAction): + pass diff --git a/pyverbs/providers/mlx5/mlx5dv_flow.pyx b/pyverbs/providers/mlx5/mlx5dv_flow.pyx index 726948db1..c509621e5 100644 --- a/pyverbs/providers/mlx5/mlx5dv_flow.pyx +++ b/pyverbs/providers/mlx5/mlx5dv_flow.pyx @@ -2,6 +2,7 @@ # Copyright (c) 2020 Nvidia, Inc. All rights reserved. 
See COPYING file from libc.stdlib cimport calloc, free +from libc.stdint cimport uint64_t from libc.string cimport memcpy from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsError, \ @@ -13,6 +14,11 @@ from pyverbs.device cimport Context cimport pyverbs.libibverbs as v from pyverbs.qp cimport QP import weakref +import struct + + +be64toh = lambda num: struct.unpack('Q'.encode(), struct.pack('!8s'.encode(), num))[0] +ACTION_SIZE = 8 cdef class Mlx5FlowMatchParameters(PyverbsObject): @@ -166,6 +172,30 @@ cdef class Mlx5PacketReformatFlowAction(FlowAction): raise PyverbsRDMAErrno('Failed to create flow action packet reformat') +cdef class Mlx5ModifyFlowAction(FlowAction): + def __init__(self, Context ctx, ft_type=dv.MLX5DV_FLOW_TABLE_TYPE_NIC_RX, actions=list()): + """ + Initialize a Mlx5ModifyFlowAction object derived from FlowAction class and represents modify + flow steering action that allows to modify packet headers. + :param ctx: Context object + :param ft_type: Flow table type + :param actions: List of modify actions of types AddActionIn, SetActionIn, CopyActionIn + defined in prm_structs + """ + super().__init__() + action_buf_size = len(actions) * ACTION_SIZE + cdef uint64_t *buf = calloc(1, action_buf_size) + if buf == NULL: + raise MemoryError('Failed to allocate memory') + for i in range(len(actions)): + buf[i] = be64toh(bytes(actions[i])) + self.action = dv.mlx5dv_create_flow_action_modify_header(ctx.context, action_buf_size, + buf, ft_type) + free(buf) + if self.action == NULL: + raise PyverbsRDMAErrno('Failed to create flow action modify') + + cdef class Mlx5FlowActionAttr(PyverbsObject): def __init__(self, action_type=None, QP qp=None, FlowAction flow_action=None, Mlx5DevxObj obj=None): @@ -187,8 +217,9 @@ cdef class Mlx5FlowActionAttr(PyverbsObject): elif action_type == dv.MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION: self.attr.action = flow_action.action self.action = flow_action - elif action_type == dv.MLX5DV_FLOW_ACTION_DEST_DEVX: + elif action_type 
in [dv.MLX5DV_FLOW_ACTION_DEST_DEVX, dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]: self.attr.obj = obj.obj + self.devx_obj = obj elif action_type: raise PyverbsUserError(f'Unsupported action type: {action_type}.') @@ -198,7 +229,9 @@ cdef class Mlx5FlowActionAttr(PyverbsObject): @type.setter def type(self, action_type): - if self.attr.type != dv.MLX5DV_FLOW_ACTION_DEST_IBV_QP: + if not (self.attr.type in [dv.MLX5DV_FLOW_ACTION_DEST_IBV_QP, + dv.MLX5DV_FLOW_ACTION_DEST_DEVX, + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]): raise PyverbsUserError(f'Unsupported action type: {action_type}.') self.attr.type = action_type @@ -213,6 +246,22 @@ cdef class Mlx5FlowActionAttr(PyverbsObject): if self.attr.type != dv.MLX5DV_FLOW_ACTION_DEST_IBV_QP: raise PyverbsUserError(f'Action attr of type {self.attr.type} doesn\'t have a qp') self.qp = qp + self.attr.qp = qp.qp + + @property + def devx_obj(self): + if not (self.attr.type in [dv.MLX5DV_FLOW_ACTION_DEST_DEVX, + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]): + raise PyverbsUserError(f'Action attr of type {self.attr.type} doesn\'t have a devx_obj') + return self.devx_obj + + @devx_obj.setter + def devx_obj(self, Mlx5DevxObj devx_obj): + if not (self.attr.type in [dv.MLX5DV_FLOW_ACTION_DEST_DEVX, + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]): + raise PyverbsUserError(f'Action attr of type {self.attr.type} doesn\'t have a devx_obj') + self.devx_obj = devx_obj + self.attr.obj = devx_obj.obj @property def action(self): @@ -255,10 +304,12 @@ cdef class Mlx5Flow(Flow): if (attr).attr.type == dv.MLX5DV_FLOW_ACTION_DEST_IBV_QP: ((attr.qp)).add_ref(self) self.qp = (attr).qp - elif (attr).attr.type not in \ - [dv.MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION, dv.MLX5DV_FLOW_ACTION_DEST_DEVX]: + elif (attr).attr.type in [dv.MLX5DV_FLOW_ACTION_DEST_DEVX, + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]: + self.devx_obj = (attr).devx_obj + elif (attr).attr.type not in [dv.MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION]: raise PyverbsUserError(f'Unsupported action type: ' - f'{attr).attr.type}.') + 
f'{(attr).attr.type}.') memcpy(tmp_addr, &(attr).attr, sizeof(dv.mlx5dv_flow_action_attr)) tmp_addr += sizeof(dv.mlx5dv_flow_action_attr) From 348a32ee8430d08d0c035681a2cc20afa8a39387 Mon Sep 17 00:00:00 2001 From: Maxim Chicherin Date: Wed, 19 Feb 2025 15:02:31 +0200 Subject: [PATCH 13/66] tests: Add flow counter test Add counter test for FW steering in mlx5_flow tests. Signed-off-by: Maxim Chicherin Signed-off-by: Edward Srouji --- tests/test_mlx5_flow.py | 57 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/test_mlx5_flow.py b/tests/test_mlx5_flow.py index 48386feef..cdc922c9d 100644 --- a/tests/test_mlx5_flow.py +++ b/tests/test_mlx5_flow.py @@ -120,6 +120,43 @@ def create_qps(self): super().create_qps() +class Mlx5CounterFlowResources(Mlx5FlowResources): + + def create_context(self): + mlx5dv_attr = Mlx5DVContextAttr() + try: + self.ctx = Mlx5Context(mlx5dv_attr, name=self.dev_name) + except PyverbsUserError as ex: + raise unittest.SkipTest(f'Could not open mlx5 context ({ex})') + except PyverbsRDMAError: + raise unittest.SkipTest('Opening mlx5 context is not supported') + + @staticmethod + def create_counter(ctx): + """ + Create flow counter. + :param ctx: The player context to create the counter on. + :return: The counter object and the flow counter ID . + """ + from tests.mlx5_prm_structs import AllocFlowCounterIn, AllocFlowCounterOut + counter = Mlx5DevxObj(ctx, AllocFlowCounterIn(), len(AllocFlowCounterOut())) + flow_counter_id = AllocFlowCounterOut(counter.out_view).flow_counter_id + return counter, flow_counter_id + + @staticmethod + def query_counter_packets(counter, flow_counter_id): + """ + Query flow counter packets count. + :param counter: The counter for the query. + :param flow_counter_id: The flow counter ID for the query. + :return: Number of packets on this counter. 
+ """ + from tests.mlx5_prm_structs import QueryFlowCounterIn, QueryFlowCounterOut + query_in = QueryFlowCounterIn(flow_counter_id=flow_counter_id) + counter_out = QueryFlowCounterOut(counter.query(query_in, len(QueryFlowCounterOut()))) + return counter_out.flow_statistics.packets + + class Mlx5RCFlowResources(Mlx5RcResources): def __init__(self, dev_name, ib_port, gid_index, is_privileged_ctx=False, **kwargs): """ @@ -356,6 +393,26 @@ def test_smac_matcher_to_qp_flow(self): self.server.flow = Mlx5Flow(matcher, value_param, [action_qp], 1) u.raw_traffic(self.client, self.server, self.iters) + @u.skip_unsupported + def test_counter_qp_flow(self): + """ + Creates a matcher to match on outer source mac and a flow that forwards + packets to QP when matching on source mac. + """ + self.create_players(Mlx5CounterFlowResources) + empty_bytes_arr = bytes(MAX_MATCH_PARAM_SIZE) + empty_value_param = Mlx5FlowMatchParameters(len(empty_bytes_arr), empty_bytes_arr) + matcher = self.server.create_matcher(empty_bytes_arr, u.MatchCriteriaEnable.NONE) + qp_attr = Mlx5FlowActionAttr(action_type=mlx5dv_flow_action_type.MLX5DV_FLOW_ACTION_DEST_IBV_QP, + qp=self.server.qp) + counter, flow_counter_id = self.server.create_counter(self.server.ctx) + ctr_attr = Mlx5FlowActionAttr(action_type=mlx5dv_flow_action_type.MLX5DV_FLOW_ACTION_COUNTERS_DEVX, + obj=counter) + self.server.flow = Mlx5Flow(matcher, empty_value_param, [ctr_attr, qp_attr], 2) + u.raw_traffic(self.client, self.server, self.iters) + sent_packets = self.server.query_counter_packets(counter, flow_counter_id) + self.assertEqual(sent_packets, self.iters, 'Counter of metadata missed some sent packets') + @requires_reformat_support @u.requires_encap_disabled_if_eswitch_on def test_tx_packet_reformat(self): From 1dc3e8d9866e9328fcf5c0b727226161a2729c4d Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Tue, 8 Jul 2025 17:23:09 +0300 Subject: [PATCH 14/66] mlx5: Add support for bulk flow counters in mlx5dv_create_flow Extend the 
mlx5dv_create_flow API to support bulk counter operations by introducing a new action type MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET. This allows users to specify an offset within DEVX counter objects for more granular bulk counter object management. The implementation removes the previous auxiliary array approach (_mlx5dv_create_flow with actions_attr_aux parameter) in favor of a cleaner design that embeds offset information directly within the flow action structure. The mlx5dv_flow_action_attr union is extended with a bulk_obj member containing both the DEVX object and an offset, allowing also external rdma-core applications to use DEVX bulk counter via the offset. Existing applications using MLX5DV_FLOW_ACTION_COUNTERS_DEVX continue to work unchanged, while new applications can use the enhanced MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET for bulk counter scenarios. Note that no kernel changes needed, since DEVX bulk counter object with offset is already supported. Signed-off-by: Maher Sanalla Signed-off-by: Alex Vesker Signed-off-by: Yishai Hadas --- providers/mlx5/dr_action.c | 7 +++--- providers/mlx5/dr_rule.c | 21 ++++------------ providers/mlx5/man/mlx5dv_create_flow.3.md | 13 +++++++++- providers/mlx5/mlx5.h | 13 +++++----- providers/mlx5/mlx5dv.h | 5 ++++ providers/mlx5/mlx5dv_dr.h | 3 +-- providers/mlx5/verbs.c | 29 +++++++++++++--------- 7 files changed, 49 insertions(+), 42 deletions(-) diff --git a/providers/mlx5/dr_action.c b/providers/mlx5/dr_action.c index 4377bf8ba..8fe966eef 100644 --- a/providers/mlx5/dr_action.c +++ b/providers/mlx5/dr_action.c @@ -975,8 +975,7 @@ int dr_actions_build_ste_arr(struct mlx5dv_dr_matcher *matcher, int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, struct mlx5dv_dr_action *actions[], size_t num_actions, - struct mlx5dv_flow_action_attr *attr, - struct mlx5_flow_action_attr_aux *attr_aux) + struct mlx5dv_flow_action_attr *attr) { struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; int i; @@ -1026,8 +1025,8 
@@ int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, attr[i].obj = actions[i]->ctr.devx_obj; if (actions[i]->ctr.offset) { - attr_aux[i].type = MLX5_FLOW_ACTION_COUNTER_OFFSET; - attr_aux[i].offset = actions[i]->ctr.offset; + attr[i].type = MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET; + attr[i].bulk_obj.offset = actions[i]->ctr.offset; } break; case DR_ACTION_TYP_TAG: diff --git a/providers/mlx5/dr_rule.c b/providers/mlx5/dr_rule.c index 61bc1e469..e324e90b5 100644 --- a/providers/mlx5/dr_rule.c +++ b/providers/mlx5/dr_rule.c @@ -1552,7 +1552,6 @@ dr_rule_create_rule_root(struct mlx5dv_dr_matcher *matcher, struct mlx5dv_dr_action *actions[]) { struct mlx5dv_flow_action_attr *attr; - struct mlx5_flow_action_attr_aux *attr_aux; struct mlx5dv_dr_rule *rule; int ret; @@ -1570,37 +1569,27 @@ dr_rule_create_rule_root(struct mlx5dv_dr_matcher *matcher, goto free_rule; } - attr_aux = calloc(num_actions, sizeof(*attr_aux)); - if (!attr_aux) { - errno = ENOMEM; - goto free_attr; - } - - ret = dr_actions_build_attr(matcher, actions, num_actions, attr, attr_aux); + ret = dr_actions_build_attr(matcher, actions, num_actions, attr); if (ret) - goto free_attr_aux; + goto free_attr; ret = dr_rule_add_action_members(rule, num_actions, actions); if (ret) - goto free_attr_aux; + goto free_attr; - rule->flow = _mlx5dv_create_flow(matcher->dv_matcher, + rule->flow = mlx5dv_create_flow(matcher->dv_matcher, value, num_actions, - attr, - attr_aux); + attr); if (!rule->flow) goto remove_action_members; free(attr); - free(attr_aux); return rule; remove_action_members: dr_rule_remove_action_members(rule); -free_attr_aux: - free(attr_aux); free_attr: free(attr); free_rule: diff --git a/providers/mlx5/man/mlx5dv_create_flow.3.md b/providers/mlx5/man/mlx5dv_create_flow.3.md index 1b7622a95..7334bafc9 100644 --- a/providers/mlx5/man/mlx5dv_create_flow.3.md +++ b/providers/mlx5/man/mlx5dv_create_flow.3.md @@ -46,6 +46,10 @@ struct mlx5dv_flow_action_attr { struct ibv_flow_action 
*action; uint32_t tag_value; struct mlx5dv_devx_obj *obj; + struct { + struct mlx5dv_devx_obj *obj; + uint32_t offset; + } bulk_obj; }; }; ``` @@ -65,6 +69,8 @@ struct mlx5dv_flow_action_attr { Steer the packet to the default miss destination. MLX5DV_FLOW_ACTION_DROP Action is dropping the matched packet. + MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET + The DEVX bulk counter object and its counter offset for the matched packets. *qp* : QP passed, to be used with *type* *MLX5DV_FLOW_ACTION_DEST_IBV_QP*. @@ -78,7 +84,12 @@ struct mlx5dv_flow_action_attr { *MLX5DV_FLOW_ACTION_TAG* see *ibv_create_cq_ex(3)*. *obj* -: DEVX object, to be used with *type* *MLX5DV_FLOW_ACTION_DEST_DEVX* or by *MLX5DV_FLOW_ACTION_COUNTERS_DEVX*. +: DEVX object, to be used with *type* *MLX5DV_FLOW_ACTION_DEST_DEVX* or by *MLX5DV_FLOW_ACTION_COUNTERS_DEVX* + or by *MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET*. + +*offset* +: offset to the target counter within a bulk DEVX object, to be used with *type* + *MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET* # RETURN VALUE diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index d98db9aa1..34c18083b 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -962,8 +962,7 @@ struct ibv_flow * _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, struct mlx5dv_flow_match_parameters *match_value, size_t num_actions, - struct mlx5dv_flow_action_attr actions_attr[], - struct mlx5_flow_action_attr_aux actions_attr_aux[]); + struct mlx5dv_flow_action_attr actions_attr[]); extern int mlx5_stall_num_loop; extern int mlx5_stall_cq_poll_min; @@ -1586,11 +1585,11 @@ struct mlx5_dv_context_ops { struct mlx5dv_flow_matcher *(*create_flow_matcher)(struct ibv_context *context, struct mlx5dv_flow_matcher_attr *attr); int (*destroy_flow_matcher)(struct mlx5dv_flow_matcher *flow_matcher); - struct ibv_flow *(*create_flow)(struct mlx5dv_flow_matcher *flow_matcher, - struct mlx5dv_flow_match_parameters *match_value, - size_t num_actions, - struct 
mlx5dv_flow_action_attr actions_attr[], - struct mlx5_flow_action_attr_aux actions_attr_aux[]); + struct ibv_flow *(*create_flow)( + struct mlx5dv_flow_matcher *flow_matcher, + struct mlx5dv_flow_match_parameters *match_value, + size_t num_actions, + struct mlx5dv_flow_action_attr actions_attr[]); struct mlx5dv_steering_anchor *(*create_steering_anchor)(struct ibv_context *conterxt, struct mlx5dv_steering_anchor_attr *attr); diff --git a/providers/mlx5/mlx5dv.h b/providers/mlx5/mlx5dv.h index 4e8d461e7..a24940196 100644 --- a/providers/mlx5/mlx5dv.h +++ b/providers/mlx5/mlx5dv.h @@ -783,6 +783,7 @@ enum mlx5dv_flow_action_type { MLX5DV_FLOW_ACTION_DEST_DEVX, MLX5DV_FLOW_ACTION_COUNTERS_DEVX, MLX5DV_FLOW_ACTION_DEFAULT_MISS, + MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET, }; struct mlx5dv_flow_action_attr { @@ -793,6 +794,10 @@ struct mlx5dv_flow_action_attr { struct ibv_flow_action *action; uint32_t tag_value; struct mlx5dv_devx_obj *obj; + struct { + struct mlx5dv_devx_obj *obj; + uint32_t offset; + } bulk_obj; }; }; diff --git a/providers/mlx5/mlx5dv_dr.h b/providers/mlx5/mlx5dv_dr.h index 36fb67e28..8639b37c5 100644 --- a/providers/mlx5/mlx5dv_dr.h +++ b/providers/mlx5/mlx5dv_dr.h @@ -691,8 +691,7 @@ int dr_actions_build_ste_arr(struct mlx5dv_dr_matcher *matcher, int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, struct mlx5dv_dr_action *actions[], size_t num_actions, - struct mlx5dv_flow_action_attr *attr, - struct mlx5_flow_action_attr_aux *attr_aux); + struct mlx5dv_flow_action_attr *attr); uint32_t dr_actions_reformat_get_id(struct mlx5dv_dr_action *action); diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 49894ddc9..5694194e7 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -5615,8 +5615,7 @@ struct ibv_flow * _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, struct mlx5dv_flow_match_parameters *match_value, size_t num_actions, - struct mlx5dv_flow_action_attr actions_attr[], - struct 
mlx5_flow_action_attr_aux actions_attr_aux[]) + struct mlx5dv_flow_action_attr actions_attr[]) { uint32_t flow_actions[CREATE_FLOW_MAX_FLOW_ACTIONS_SUPPORTED]; struct verbs_flow_action *vaction; @@ -5626,6 +5625,7 @@ _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, bool have_dest_devx = false; bool have_flow_tag = false; bool have_counter = false; + bool have_bulk_counter = false; bool have_default = false; bool have_drop = false; int ret; @@ -5695,20 +5695,13 @@ _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, have_flow_tag = true; break; case MLX5DV_FLOW_ACTION_COUNTERS_DEVX: - if (have_counter) { + if (have_counter || have_bulk_counter) { errno = EOPNOTSUPP; goto err; } fill_attr_in_objs_arr(cmd, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &actions_attr[i].obj->handle, 1); - - if (actions_attr_aux && - actions_attr_aux[i].type == MLX5_FLOW_ACTION_COUNTER_OFFSET) - fill_attr_in_ptr_array(cmd, - MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, - &actions_attr_aux[i].offset, 1); - have_counter = true; break; case MLX5DV_FLOW_ACTION_DEFAULT_MISS: @@ -5733,6 +5726,19 @@ _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP); have_drop = true; break; + case MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET: + if (have_counter || have_bulk_counter) { + errno = EOPNOTSUPP; + goto err; + } + fill_attr_in_objs_arr(cmd, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, + &actions_attr[i].bulk_obj.obj->handle, 1); + fill_attr_in_ptr_array(cmd, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, + &actions_attr[i].bulk_obj.offset, 1); + have_bulk_counter = true; + break; default: errno = EOPNOTSUPP; goto err; @@ -5772,8 +5778,7 @@ mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, return dvops->create_flow(flow_matcher, match_value, num_actions, - actions_attr, - NULL); + actions_attr); } static struct mlx5dv_steering_anchor * From ec3d7f5ec6a44601e4e8f2fda1750ae3dbd76c48 Mon Sep 17 00:00:00 2001 From: 
Maher Sanalla Date: Mon, 28 Jul 2025 14:10:29 +0300 Subject: [PATCH 15/66] mlx5: Implement UAR fallback for td allocation In mlx5_alloc_td(), check if blueflame is supported by examining ctx->bf_reg_size before attempting UAR allocation. When blueflame is not supported (bf_reg_size == 0), fall back to using the shared nc (non-cached) UAR instead of trying to allocate a dedicated UAR. This prevents unnecessary dedicated UAR allocation attempts on devices that don't support blueflame, while ensuring td allocation succeeds by using the available non-cached singleton UAR. In mlx5_dealloc_td(), only detach dedicated UARs by checking the singleton flag to avoid incorrectly freeing the shared nc_uar. Signed-off-by: Maher Sanalla Signed-off-by: Yishai Hadas --- providers/mlx5/verbs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 5694194e7..06da22907 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -448,6 +448,7 @@ static void mlx5_detach_dedicated_uar(struct ibv_context *context, struct mlx5_b struct ibv_td *mlx5_alloc_td(struct ibv_context *context, struct ibv_td_init_attr *init_attr) { + struct mlx5_context *ctx = to_mctx(context); struct mlx5_td *td; if (init_attr->comp_mask) { @@ -461,7 +462,12 @@ struct ibv_td *mlx5_alloc_td(struct ibv_context *context, struct ibv_td_init_att return NULL; } - td->bf = mlx5_attach_dedicated_uar(context, 0); + /* Check whether BlueFlame is supported on the device */ + if (ctx->bf_reg_size) + td->bf = mlx5_attach_dedicated_uar(context, 0); + else + td->bf = ctx->nc_uar; + if (!td->bf) { free(td); return NULL; @@ -481,7 +487,8 @@ int mlx5_dealloc_td(struct ibv_td *ib_td) if (atomic_load(&td->refcount) > 1) return EBUSY; - mlx5_detach_dedicated_uar(ib_td->context, td->bf); + if (!td->bf->singleton) + mlx5_detach_dedicated_uar(ib_td->context, td->bf); free(td); return 0; From 55ee455bb60a5f0be2a06d8aebb3d51112b670e3 Mon Sep 17 00:00:00 
2001 From: Elyashiv Cohen Date: Wed, 26 Feb 2025 11:16:26 +0200 Subject: [PATCH 16/66] tests: Add CX9 to MLX5_DEVS list Add CX9 to MLX5_DEVS list. Signed-off-by: Elyashiv Cohen Signed-off-by: Edward Srouji --- tests/mlx5_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/mlx5_base.py b/tests/mlx5_base.py index fcd14aec9..2e7d6f2c7 100644 --- a/tests/mlx5_base.py +++ b/tests/mlx5_base.py @@ -62,6 +62,7 @@ 0x101f, # ConnectX-6 LX 0x1021, # ConnectX-7 0x1023, # ConnectX-8 + 0x1025, # ConnectX-9 0xa2d2, # BlueField integrated ConnectX-5 network controller 0xa2d3, # BlueField integrated ConnectX-5 network controller VF 0xa2d6, # BlueField-2 integrated ConnectX-6 Dx network controller From 513106cd6af6ad14800b6b219fbefa9b71b8fffb Mon Sep 17 00:00:00 2001 From: Linoy Ganti Date: Mon, 5 May 2025 09:12:01 +0300 Subject: [PATCH 17/66] pyverbs: Add DevX async command completion support Add new pyverbs APIs to support DevX async command completion, including functions to create, get, and destroy async command completion objects. 
Signed-off-by: Linoy Ganti Signed-off-by: Edward Srouji --- pyverbs/providers/mlx5/libmlx5.pxd | 14 ++++++ pyverbs/providers/mlx5/mlx5dv.pxd | 5 ++ pyverbs/providers/mlx5/mlx5dv.pyx | 73 +++++++++++++++++++++++++++++- 3 files changed, 91 insertions(+), 1 deletion(-) diff --git a/pyverbs/providers/mlx5/libmlx5.pxd b/pyverbs/providers/mlx5/libmlx5.pxd index 149244e8d..09f32bb62 100644 --- a/pyverbs/providers/mlx5/libmlx5.pxd +++ b/pyverbs/providers/mlx5/libmlx5.pxd @@ -277,6 +277,13 @@ cdef extern from 'infiniband/mlx5dv.h': cdef struct mlx5dv_devx_umem: uint32_t umem_id; + cdef struct mlx5dv_devx_cmd_comp: + int fd + + cdef struct mlx5dv_devx_async_cmd_hdr: + uint64_t wr_id + uint8_t *out_data + cdef struct mlx5dv_devx_umem_in: void *addr size_t size @@ -568,6 +575,13 @@ cdef extern from 'infiniband/mlx5dv.h': size_t inlen, void *out, size_t outlen) int mlx5dv_devx_obj_query(mlx5dv_devx_obj *obj, const void *in_, size_t inlen, void *out, size_t outlen) + int mlx5dv_devx_obj_query_async(mlx5dv_devx_obj *obj, const void *in_, + size_t inlen, size_t outlen, uint64_t wr_id, + mlx5dv_devx_cmd_comp *cmd_comp) + mlx5dv_devx_cmd_comp *mlx5dv_devx_create_cmd_comp(v.ibv_context *context) + void mlx5dv_devx_destroy_cmd_comp(mlx5dv_devx_cmd_comp *cmd_comp) + int mlx5dv_devx_get_async_cmd_comp(mlx5dv_devx_cmd_comp *cmd_comp, + mlx5dv_devx_async_cmd_hdr *cmd_resp, size_t cmd_resp_len) int mlx5dv_devx_obj_modify(mlx5dv_devx_obj *obj, const void *in_, size_t inlen, void *out, size_t outlen) int mlx5dv_devx_obj_destroy(mlx5dv_devx_obj *obj) diff --git a/pyverbs/providers/mlx5/mlx5dv.pxd b/pyverbs/providers/mlx5/mlx5dv.pxd index bb1af813d..b5b5a27ed 100644 --- a/pyverbs/providers/mlx5/mlx5dv.pxd +++ b/pyverbs/providers/mlx5/mlx5dv.pxd @@ -15,6 +15,7 @@ cdef class Mlx5Context(Context): cdef object devx_umems cdef object devx_objs cdef object devx_eqs + cdef object cmd_comps cdef add_ref(self, obj) cpdef close(self) @@ -108,3 +109,7 @@ cdef class Mlx5DevxEq(PyverbsCM): cdef 
dv.mlx5dv_devx_eq *eq cdef Context context cdef object out_view + +cdef class Mlx5DevxCmdComp(PyverbsCM): + cdef dv.mlx5dv_devx_cmd_comp *cmd_comp + cdef Context context diff --git a/pyverbs/providers/mlx5/mlx5dv.pyx b/pyverbs/providers/mlx5/mlx5dv.pyx index 10ce4643e..ab54da76e 100644 --- a/pyverbs/providers/mlx5/mlx5dv.pyx +++ b/pyverbs/providers/mlx5/mlx5dv.pyx @@ -218,6 +218,27 @@ cdef class Mlx5DevxObj(PyverbsCM): free(out_mailbox) return out + def query_async(self, in_, outlen, wr_id, Mlx5DevxCmdComp cmd_comp): + """ + Queries asynchronously the DevX object. + :param in_: Bytes of the obj_query command's input data provided in a + device specification format. + (Stream of bytes or __bytes__ is implemented) + :param outlen: Expected output length in bytes + :param wr_id: Num of command that sent + :param cmd_comp: Mlx5DevxCmdComp object + :return: Bytes of the command's output + """ + in_bytes = bytes(in_) + cdef char *in_mailbox = _prepare_devx_inbox(in_bytes) + rc = dv.mlx5dv_devx_obj_query_async(self.obj, in_mailbox, len(in_bytes), + outlen, wr_id, cmd_comp.cmd_comp) + try: + if rc: + raise PyverbsRDMAError('Failed to query DevX object', rc) + finally: + free(in_mailbox) + def modify(self, in_, outlen): """ Modifies the DevX object. 
@@ -293,6 +314,7 @@ cdef class Mlx5Context(Context): self.devx_umems = weakref.WeakSet() self.devx_objs = weakref.WeakSet() self.devx_eqs = weakref.WeakSet() + self.cmd_comps = weakref.WeakSet() def query_mlx5_device(self, comp_mask=-1): """ @@ -471,6 +493,8 @@ cdef class Mlx5Context(Context): self.devx_objs.add(obj) elif isinstance(obj, Mlx5DevxEq): self.devx_eqs.add(obj) + elif isinstance(obj, Mlx5DevxCmdComp): + self.cmd_comps.add(obj) else: raise PyverbsError('Unrecognized object type') @@ -479,7 +503,8 @@ cdef class Mlx5Context(Context): cpdef close(self): if self.context != NULL: - close_weakrefs([self.pps, self.devx_objs, self.devx_umems, self.devx_eqs]) + close_weakrefs([self.pps, self.devx_objs, self.devx_umems, self.devx_eqs, + self.cmd_comps]) super(Mlx5Context, self).close() @@ -1926,3 +1951,49 @@ cdef class Mlx5DevxEq(PyverbsCM): raise PyverbsRDMAError('Failed to destroy a DevX EQ object', rc) self.eq = NULL self.context = None + + +cdef class Mlx5DevxCmdComp(PyverbsCM): + """ + Represents mlx5dv_devx_cmd_comp C struct. 
+ """ + def __init__(self, Context context): + super().__init__() + self.cmd_comp = dv.mlx5dv_devx_create_cmd_comp(context.context) + if self.cmd_comp == NULL: + raise PyverbsRDMAErrno('Failed to create DevX cmd comp.') + self.context = context + self.context.add_ref(self) + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('fd', hex(self.cmd_comp.fd)) + + def get_async_cmd_comp(self, cmd_resp_len=1024): + # mlx5dv_devx_async_cmd_hdr size is 1uint64_t of wr_id + cmd_resp_len of out_data + size = sizeof(uint64_t) + cmd_resp_len + cmd_resp = malloc(size) + if not cmd_resp: + raise MemoryError('Couldn\'t allocate array for cmd_resp') + rc = dv.mlx5dv_devx_get_async_cmd_comp(self.cmd_comp, cmd_resp, cmd_resp_len) + if rc != 0: + free(cmd_resp) + raise PyverbsError('Failed to get devx async event', rc) + wr_id = cmd_resp.wr_id + out_data = cmd_resp.out_data[:cmd_resp_len] + free(cmd_resp) + return wr_id, out_data + + @property + def fd(self): + return self.cmd_comp.fd + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.cmd_comp != NULL: + self.logger.debug('Closing Mlx5DevxCmdComp') + dv.mlx5dv_devx_destroy_cmd_comp(self.cmd_comp) + self.cmd_comp = NULL + self.context = None From ee8b3b35a1513721cff21db2cbb7dbd725b839c1 Mon Sep 17 00:00:00 2001 From: Linoy Ganti Date: Mon, 5 May 2025 11:21:10 +0300 Subject: [PATCH 18/66] tests: Add test for async command completion in DevX Add a new test to verify async command completion support in DevX. This test covers the creation of a DevX QP, issuing an asynchronous query, and validating the completion and results. 
Signed-off-by: Linoy Ganti Signed-off-by: Edward Srouji --- tests/mlx5_base.py | 4 ++-- tests/test_mlx5_devx.py | 45 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/tests/mlx5_base.py b/tests/mlx5_base.py index 2e7d6f2c7..f843940e8 100644 --- a/tests/mlx5_base.py +++ b/tests/mlx5_base.py @@ -752,14 +752,14 @@ def create_qp(self): qp_size = (qp_size + pg_size - 1) & ~(pg_size - 1) self.umems['qp'] = self.create_umem(size=qp_size) self.umems['qp_dbr'] = self.create_umem(size=8, alignment=8) - log_rq_size = int(math.log2(self.qattr.rq.wqe_num - 1)) + 1 + self.log_rq_size = int(math.log2(self.qattr.rq.wqe_num - 1)) + 1 # Size of a receive WQE is 16*pow(2, log_rq_stride) log_rq_stride = self.qattr.rq.wqe_shift - 4 log_sq_size = int(math.log2(self.qattr.sq.wqe_num - 1)) + 1 cqn = CreateCqOut(self.cq.out_view).cqn qpc = SwQpc(st=DevxOps.MLX5_QPC_ST_RC, pd=self.dv_pd.pdn, pm_state=DevxOps.MLX5_QPC_PM_STATE_MIGRATED, - log_rq_size=log_rq_size, log_sq_size=log_sq_size, ts_format=0x1, + log_rq_size=self.log_rq_size, log_sq_size=log_sq_size, ts_format=0x1, log_rq_stride=log_rq_stride, uar_page=self.uar['qp'].page_id, cqn_snd=cqn, cqn_rcv=cqn, dbr_umem_id=self.umems['qp_dbr'].umem_id, dbr_umem_valid=1, send_dbr_mode=self.send_dbr_mode) diff --git a/tests/test_mlx5_devx.py b/tests/test_mlx5_devx.py index 88b5f0f6e..7454d64a6 100644 --- a/tests/test_mlx5_devx.py +++ b/tests/test_mlx5_devx.py @@ -6,10 +6,14 @@ """ from tests.mlx5_base import Mlx5DevxRcResources, Mlx5DevxTrafficBase +from pyverbs.providers.mlx5.mlx5dv import Mlx5DevxCmdComp +from pyverbs.pyverbs_error import PyverbsRDMAError import pyverbs.mem_alloc as mem from pyverbs.mr import MR from pyverbs.libibverbs_enums import ibv_access_flags, ibv_odp_transport_cap_bits import tests.utils as u +import unittest +import errno class Mlx5DevxRcOdpRes(Mlx5DevxRcResources): @@ -57,3 +61,44 @@ def test_devx_rc_qp_odp_traffic(self): self.create_players(Mlx5DevxRcOdpRes) # 
Send traffic self.send_imm_traffic() + + +class Mlx5DevxApiTest(Mlx5DevxTrafficBase): + def setUp(self): + super().setUp() + self.devx_res = None + + def tearDown(self): + super().tearDown() + if self.devx_res: + self.devx_res.close_resources() + + def test_devx_async_query(self): + """ + Test DevX Async Query API. + Creating a DevX QP and query it using DevX async query. + """ + self.devx_res = Mlx5DevxRcResources(**self.dev_info) + self.cmd_comp = Mlx5DevxCmdComp(self.devx_res.ctx) + from tests.mlx5_prm_structs import QueryQpIn, QueryQpOut + query_qp_in = QueryQpIn(qpn=self.devx_res.qpn) + qp_wr_id = 100 + try: + self.devx_res.qp.query_async(query_qp_in, len(QueryQpOut()), wr_id=qp_wr_id, + cmd_comp=self.cmd_comp) + wr_id, out_data = self.cmd_comp.get_async_cmd_comp() + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Async command completion is not supported') + raise ex + + query_qp_out = QueryQpOut(out_data) + self.assertTrue(query_qp_out.status == 0, + 'Query Devx QP by Async Query API failed with non-zero status: ' + f'{query_qp_out.status}') + self.assertTrue(wr_id == qp_wr_id, + f'Mismatched work request ID. Expected: {qp_wr_id}, Actual: {wr_id}') + self.assertTrue(query_qp_out.sw_qpc.log_rq_size == self.devx_res.log_rq_size, + f'Mismatched RQ size. Expected: {self.devx_res.log_rq_size}, ' + f'Actual: {query_qp_out.sw_qpc.log_rq_size}') + From e65fabee8d0ae232c637d1e88118ed90f505dcd0 Mon Sep 17 00:00:00 2001 From: Daniel Hayon Date: Tue, 6 May 2025 16:47:21 +0300 Subject: [PATCH 19/66] tests: Cover different RDMA matcher priorities - Create a shared helper method `rdma_transport_domain_test` to handle common test setup. - Update both test methods to use the helper with their specific priority values. 
Signed-off-by: Daniel Hayon Signed-off-by: Edward Srouji --- tests/test_mlx5_flow.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/tests/test_mlx5_flow.py b/tests/test_mlx5_flow.py index cdc922c9d..5887b0391 100644 --- a/tests/test_mlx5_flow.py +++ b/tests/test_mlx5_flow.py @@ -184,16 +184,17 @@ def close_resources(self): obj.close() def create_matcher(self, mask, match_criteria_enable, flags=0, - ft_type=MLX5DV_FLOW_TABLE_TYPE_NIC_RX_, ib_port=1): + ft_type=MLX5DV_FLOW_TABLE_TYPE_NIC_RX_, ib_port=1, priority=0): """ Creates a matcher from a provided mask. :param mask: The mask to match on (in bytes) :param match_criteria_enable: Bitmask representing which of the headers and parameters in match_criteria are used - :param flags: Flow matcher flags - :param ft_type: Flow table type + :param flags: Flow matcher flags. + :param ft_type: Flow table type. :param ib_port: Specify its corresponding port. + :param priority: Priority value for the matcher. :return: Resulting matcher """ try: @@ -210,7 +211,7 @@ def create_matcher(self, mask, match_criteria_enable, flags=0, attr = Mlx5FlowMatcherAttr(match_mask=flow_match_param, match_criteria_enable=match_criteria_enable, flags=flags, ft_type=ft_type, comp_mask=comp_mask, - ib_port=ib_port) + ib_port=ib_port, priority=priority) matcher = Mlx5FlowMatcher(self.ctx, attr) except PyverbsRDMAError as ex: if ex.error_code in [errno.EOPNOTSUPP, errno.EPROTONOSUPPORT]: @@ -308,9 +309,9 @@ def test_create_smac_matcher(self): self.res.create_matcher(smac_mask, u.MatchCriteriaEnable.OUTER) def generic_test_mlx5_flow_table(self, flow_table_type_mapping, action, ib_port=1, - traffic=False): + traffic=False, priority=0): """ - This function performs the following steps: + This function performs the following steps on the server side: 1. Creates a DEVX flow table with the specified table type. 2. Creates a flow group in the DEVX flow table. 3. 
Inserts a flow table entry (FTE) into the flow table group. @@ -322,6 +323,7 @@ def generic_test_mlx5_flow_table(self, flow_table_type_mapping, action, ib_port= :param action: Specifies the action to be performed on the matched packets. :param ib_port: The port for flow matching. :param traffic: Boolean flag indicating whether RDMA traffic should run. + :param priority: Priority value for the matcher. """ empty_mask = bytes(MAX_MATCH_PARAM_SIZE) @@ -329,7 +331,8 @@ def generic_test_mlx5_flow_table(self, flow_table_type_mapping, action, ib_port= devx_table_obj, table_id = self.server.create_devx_flow_table(table_type, level=TABLE_LEVEL) matcher = self.server.create_matcher(empty_mask, u.MatchCriteriaEnable.NONE, - ft_type=ft_type, ib_port=ib_port) + ft_type=ft_type, ib_port=ib_port, + priority=priority) self.server.store_for_cleanup_stage(matcher) group_id = self.server.create_devx_flow_group(table_id, table_type) self.server.create_devx_flow_entry(table_id, group_id, action, table_type) @@ -365,6 +368,21 @@ def test_flow_rdma_transport_domain_traffic(self): Creates a devx table object with the RDMA transport domain, Verifies that the traffic passes successfully. """ + self.rdma_transport_domain_test(priority=0) + + @u.skip_unsupported + @requires_root() + def test_priority_rdma_transport_domain_traffic(self): + """ + Creates a devx table object with the RDMA transport domain with priority, + Verifies that the traffic passes successfully. + """ + self.rdma_transport_domain_test(priority=1) + + def rdma_transport_domain_test(self, priority): + """ + :param priority: Priority value for the flow table matcher. 
+ """ self.client = Mlx5RcResources(**self.dev_info) self.server = Mlx5RCFlowResources(is_privileged_ctx=True, **self.dev_info) self.pre_run() @@ -373,7 +391,7 @@ def test_flow_rdma_transport_domain_traffic(self): MLX5DV_FLOW_TABLE_TYPE_RDMA_TRANSPORT_RX_: RDMA_TRANSPORT_RX, MLX5DV_FLOW_TABLE_TYPE_RDMA_TRANSPORT_TX_: RDMA_TRANSPORT_TX} self.generic_test_mlx5_flow_table(flow_table_type_mapping, action=ALLOW_ACTION, ib_port=1, - traffic=True) + traffic=True, priority=priority) @u.skip_unsupported def test_smac_matcher_to_qp_flow(self): From 947b4a155b789ac89d91adf8001d33cdd613a277 Mon Sep 17 00:00:00 2001 From: Shlomo Assaf Date: Thu, 19 Jun 2025 09:58:37 +0300 Subject: [PATCH 20/66] tests: Refactor requires_root Since new behavior in the Linux driver now allows unprivileged users additional operations, requires_root was replaced with requires_cap for the relevant cases. Signed-off-by: Shlomo Assaf Signed-off-by: Edward Srouji --- tests/test_flow.py | 5 +++-- tests/test_mlx5_dr.py | 32 ++++++++++++++++++++++++++++---- tests/test_mlx5_flow.py | 9 +++++---- tests/test_mlx5_lag_affinity.py | 3 ++- tests/test_qp.py | 10 ++++++---- tests/test_rss_traffic.py | 5 +++-- tests/utils.py | 32 +++++++++++++++++++++++++++++--- 7 files changed, 76 insertions(+), 20 deletions(-) diff --git a/tests/test_flow.py b/tests/test_flow.py index d97a29cb2..02c424ac0 100644 --- a/tests/test_flow.py +++ b/tests/test_flow.py @@ -5,7 +5,7 @@ """ from tests.base import RDMATestCase, RawResources, PyverbsRDMAError from pyverbs.spec import EthSpec, Ipv4ExtSpec, Ipv6Spec, TcpUdpSpec -from tests.utils import requires_root_on_eth, PacketConsts +from tests.utils import requires_cap_net_raw, PacketConsts, requires_eth from pyverbs.flow import FlowAttr, Flow from pyverbs.libibverbs_enums import ibv_flow_spec_type import tests.utils as u @@ -25,7 +25,8 @@ def __init__(self, dev_name, ib_port, gid_index): super().__init__(dev_name=dev_name, ib_port=ib_port, gid_index=gid_index) - @requires_root_on_eth() + 
@requires_cap_net_raw() + @requires_eth() def create_qps(self): super().create_qps() diff --git a/tests/test_mlx5_dr.py b/tests/test_mlx5_dr.py index c303d0511..3bc078c51 100644 --- a/tests/test_mlx5_dr.py +++ b/tests/test_mlx5_dr.py @@ -17,8 +17,8 @@ DrActionDefMiss, DrActionVPort, DrActionIBPort, DrActionDestTir, DrActionPacketReformat,\ DrFlowSamplerAttr, DrActionFlowSample, DrFlowMeterAttr, DrActionFlowMeter from pyverbs.providers.mlx5.mlx5dv import Mlx5DevxObj, Mlx5Context, Mlx5DVContextAttr -from tests.utils import skip_unsupported, requires_root_on_eth, requires_eswitch_on, \ - PacketConsts +from tests.utils import skip_unsupported, requires_eswitch_on, PacketConsts, requires_cap_net_raw,\ + requires_eth, requires_root from tests.mlx5_base import Mlx5RDMATestCase, PyverbsAPITestCase, MELLANOX_VENDOR_ID from pyverbs.providers.mlx5.mlx5dv_flow import Mlx5FlowMatchParameters from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsUserError @@ -128,7 +128,8 @@ def __init__(self, dev_name, ib_port, gid_index=0, wc_flags=0, msg_size=1024, qp super().__init__(dev_name=dev_name, ib_port=ib_port, gid_index=gid_index, msg_size=msg_size, qp_count=qp_count) - @requires_root_on_eth() + @requires_cap_net_raw() + @requires_eth() def create_qps(self): super().create_qps() @@ -173,7 +174,8 @@ def __init__(self, dev_name, ib_port, gid_index=0, wc_flags=0, msg_size=1024, def create_cq(self): self.cq = CQ(self.ctx, cqe=self.num_msgs) - @requires_root_on_eth() + @requires_cap_net_raw() + @requires_eth() def create_qps(self): if not self.server: super().create_qps() @@ -445,6 +447,7 @@ def add_qp_rule_and_send_pkts(self, root_only=False): self.create_rx_recv_rules(smac_value, [self.qp_action], root_only=root_only) u.raw_traffic(self.client, self.server, self.iters) + @requires_root() def test_tbl_qp_rule(self): """ Creates RX domain, SW table with matcher on source mac. 
Creates QP action @@ -494,6 +497,7 @@ def modify_tx_smac_and_send_pkts(self, root_only=False): u.raw_traffic(self.client, self.server, self.iters, expected_packet=exp_packet) @skip_unsupported + @requires_root() def test_tbl_modify_header_rule(self): """ Creates TX domain, SW table with matcher on source mac and modify the smac. @@ -512,6 +516,7 @@ def test_root_tbl_modify_header_rule(self): self.modify_tx_smac_and_send_pkts(root_only=True) @skip_unsupported + @requires_root() def test_metadata_modify_action_set_copy_match(self): """ Verify modify header with set and copy actions. @@ -611,6 +616,7 @@ def test_root_tbl_counter_action(self): self.add_counter_action_and_send_pkts(root_only=True) @skip_unsupported + @requires_root() def test_tbl_counter_action(self): """ Create flow counter object, on non-root table attach it to a rule using counter action @@ -621,6 +627,7 @@ def test_tbl_counter_action(self): @skip_unsupported + @requires_root() def test_prevent_duplicate_rule(self): """ Creates RX domain, sets duplicate rule to be not allowed on that domain, @@ -692,6 +699,7 @@ def test_root_tbl_drop_action(self): self._drop_action(root_only=True) @skip_unsupported + @requires_root() def test_tbl_drop_action(self): """ Create non-root drop actions on TX and RX. Verify using counter on the server RX that @@ -720,6 +728,7 @@ def add_qp_tag_rule_and_send_pkts(self, root_only=False): self.assertEqual(self.server.cq.read_flow_tag(), tag, 'Wrong tag value') @skip_unsupported + @requires_root() def test_tbl_qp_tag_rule(self): """ Creates RX domain, non-root table with matcher on source mac. Creates QP action @@ -738,6 +747,7 @@ def test_root_tbl_qp_tag_rule(self): self.add_qp_tag_rule_and_send_pkts(root_only=True) @skip_unsupported + @requires_root() def test_set_matcher_layout(self): """ Creates a non root matcher and sets its size. 
Creates a rule on that @@ -754,6 +764,7 @@ def test_set_matcher_layout(self): u.raw_traffic(self.client, self.server, self.iters) @skip_unsupported + @requires_root() def test_push_vlan(self): """ Creates RX domain, root table with matcher on source mac. Create a rule to forward @@ -779,6 +790,7 @@ def test_push_vlan(self): u.raw_traffic(self.client, self.server, self.iters, expected_packet=exp_packet) @skip_unsupported + @requires_root() def test_pop_vlan(self): """ Creates RX domain, root table with matcher on source mac. Create a rule to forward @@ -833,6 +845,7 @@ def dest_array(self, root_only=False): u.raw_traffic(self.client, self.server, self.iters) @skip_unsupported + @requires_root() def test_root_dest_array(self): """ Creates RX domain, root table with matcher on source mac.on root table @@ -843,6 +856,7 @@ def test_root_dest_array(self): self.dest_array(root_only=True) @skip_unsupported + @requires_root() def test_dest_array(self): """ Creates RX domain, non-root table with matcher on source mac. 
Create a rule @@ -854,6 +868,7 @@ def test_dest_array(self): self.dest_array() @skip_unsupported + @requires_root() def test_tx_def_miss_action(self): """ Create TX root table and forward all traffic to next SW steering table, @@ -892,6 +907,7 @@ def add_dest_tir_action_send_pkts(self, root_only=False): u.raw_traffic(self.client, self.server, self.iters) @skip_unsupported + @requires_root() def test_dest_tir(self): self.add_dest_tir_action_send_pkts() @@ -953,6 +969,7 @@ def packet_reformat_actions(self, outer, root_only=False, l2_ref_type=True): u.raw_traffic(self.client, self.server, self.iters) @skip_unsupported + @requires_root() def test_flow_sampler(self): """ Flow sampler has a default table (all the packets are forwarded to it) @@ -1044,6 +1061,7 @@ def test_root_geneve_match_rx(self): self.geneve_match_rx(root_only=True) @requires_geneve_fields_rx_support + @requires_root() def test_geneve_match_rx(self): """ Creates matcher on RX non-root table to match on Geneve related @@ -1159,6 +1177,7 @@ def test_roce_bth_match_tx(self): self.roce_bth_match(domain_flag=mlx5dv_dr_domain_type.MLX5DV_DR_DOMAIN_TYPE_NIC_TX) @skip_unsupported + @requires_root() def test_packet_reformat_l2_gre(self): """ Creates GRE packet with non-root l2 to l2 reformat actions on TX (encap) @@ -1181,6 +1200,7 @@ def test_packet_reformat_root_l2_gre(self): self.packet_reformat_actions(outer=encap_header, root_only=True) @skip_unsupported + @requires_root() def test_packet_reformat_l3_gre(self): """ Creates GRE packet with non-root l2 to l3 reformat actions on TX (encap) @@ -1203,6 +1223,7 @@ def test_packet_reformat_root_l3_gre(self): self.packet_reformat_actions(outer=encap_header, root_only=True, l2_ref_type=False) @skip_unsupported + @requires_root() def test_packet_reformat_l2_geneve(self): """ Creates Geneve packet with non-root l2 to l2 reformat actions on TX @@ -1225,6 +1246,7 @@ def test_packet_reformat_root_l2_geneve(self): self.packet_reformat_actions(outer=encap_header, 
root_only=True) @skip_unsupported + @requires_root() def test_packet_reformat_l3_geneve(self): """ Creates Geneve packet with non-root l2 to l3 tunnel reformat actions on @@ -1247,6 +1269,7 @@ def test_packet_reformat_root_l3_geneve(self): self.packet_reformat_actions(outer=encap_header, root_only=True, l2_ref_type=False) @skip_unsupported + @requires_root() def test_flow_meter(self): """ Create flow meter actions on TX and RX non-root tables. Add green and @@ -1393,6 +1416,7 @@ def test_root_reuse_action_and_matcher(self): self.reuse_action_and_matcher(root_only=True) @skip_unsupported + @requires_root() def test_reuse_action_and_matcher(self): """ Create non-root rules on TX and RX that use the same matcher and actions diff --git a/tests/test_mlx5_flow.py b/tests/test_mlx5_flow.py index 5887b0391..5d553f77c 100644 --- a/tests/test_mlx5_flow.py +++ b/tests/test_mlx5_flow.py @@ -18,8 +18,8 @@ MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL_ from pyverbs.libibverbs_enums import ibv_flow_flags from tests.mlx5_base import Mlx5RDMATestCase, create_privileged_context, Mlx5RcResources -from tests.utils import requires_root_on_eth, PacketConsts, is_eth, requires_root, \ - requires_no_sriov +from tests.utils import PacketConsts, is_eth, requires_no_sriov, requires_eth,\ + requires_cap_net_raw, requires_root from tests.base import RawResources import tests.utils as u import struct @@ -115,7 +115,8 @@ def create_matcher(self, mask, match_criteria_enable, flags=0, raise ex return matcher - @requires_root_on_eth() + @requires_cap_net_raw() + @requires_eth() def create_qps(self): super().create_qps() @@ -346,8 +347,8 @@ def generic_test_mlx5_flow_table(self, flow_table_type_mapping, action, ib_port= gid_idx=self.gid_index, port=self.ib_port, is_cq_ex=True) @u.skip_unsupported - @requires_root() @requires_no_sriov() + @requires_cap_net_raw() def test_flow_table_drop(self): """ Creates rules with DevX objects for RDMA RX and TX tables. 
diff --git a/tests/test_mlx5_lag_affinity.py b/tests/test_mlx5_lag_affinity.py index 3110e989a..547e81cf9 100644 --- a/tests/test_mlx5_lag_affinity.py +++ b/tests/test_mlx5_lag_affinity.py @@ -21,7 +21,8 @@ def __init__(self, dev_name, ib_port): def create_cq(self): return CQ(self.ctx, 100) - @u.requires_root_on_eth() + @u.requires_cap_net_raw() + @u.requires_eth() def create_qp(self): qia = QPInitAttr(ibv_qp_type.IBV_QPT_RAW_PACKET, rcq=self.cq, scq=self.cq, cap=QPCap()) diff --git a/tests/test_qp.py b/tests/test_qp.py index 6af3790c7..8bd20404e 100644 --- a/tests/test_qp.py +++ b/tests/test_qp.py @@ -49,8 +49,9 @@ def create_qp_common_test(self, qp_type, qp_state, is_ex, with_attr, qp_attr_edi with PD(self.ctx) as pd: with CQ(self.ctx, 100, None, None, 0) as cq: if qp_type == ibv_qp_type.IBV_QPT_RAW_PACKET: - if not (u.is_eth(self.ctx, self.ib_port) and u.is_root()): - raise unittest.SkipTest('To Create RAW QP must be done by root on Ethernet link layer') + if not (u.is_eth(self.ctx, self.ib_port) and u.has_cap_net_raw()): + raise unittest.SkipTest('To Create RAW QP must be done by user with '\ + 'CAP_NET_RAW on Ethernet link layer') if is_ex: qia = get_qp_init_attr_ex(cq, pd, self.attr, self.attr_ex, qp_type) @@ -253,8 +254,9 @@ def query_qp_common_test(self, qp_type): with PD(self.ctx) as pd: with CQ(self.ctx, 100, None, None, 0) as cq: if qp_type == ibv_qp_type.IBV_QPT_RAW_PACKET: - if not (u.is_eth(self.ctx, self.ib_port) and u.is_root()): - raise unittest.SkipTest('To Create RAW QP must be done by root on Ethernet link layer') + if not (u.is_eth(self.ctx, self.ib_port) and u.has_cap_net_raw()): + raise unittest.SkipTest('To Create RAW QP must be done by user with '\ + 'CAP_NET_RAW on Ethernet link layer') # Legacy QP qia = u.get_qp_init_attr(cq, self.attr) diff --git a/tests/test_rss_traffic.py b/tests/test_rss_traffic.py index 7aa69f893..d9c289090 100644 --- a/tests/test_rss_traffic.py +++ b/tests/test_rss_traffic.py @@ -3,7 +3,7 @@ import errno from 
pyverbs.wq import WQInitAttr, WQAttr, WQ, RwqIndTableInitAttr, RwqIndTable, RxHashConf -from tests.utils import requires_root_on_eth, PacketConsts +from tests.utils import requires_cap_net_raw, PacketConsts, requires_eth from tests.base import RDMATestCase, PyverbsRDMAError, MLNX_VENDOR_ID, \ CX3_MLNX_PART_ID, CX3Pro_MLNX_PART_ID from pyverbs.qp import QPInitAttrEx, QPEx @@ -55,7 +55,8 @@ def __init__(self, dev_name, ib_port, gid_index, log_ind_tbl_size=3): def create_cq(self): self.cqs = [CQ(self.ctx, WRS_PER_ROUND) for _ in range(CQS_NUM)] - @requires_root_on_eth() + @requires_cap_net_raw() + @requires_eth() def create_qps(self): """ Initializes self.qps with RSS QPs. diff --git a/tests/utils.py b/tests/utils.py index 4c54e643f..7e27cf781 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1504,11 +1504,11 @@ def inner(instance): return outer -def requires_root_on_eth(port_num=1): +def requires_eth(port_num=1): def outer(func): def inner(instance): - if not (is_eth(instance.ctx, port_num) and is_root()): - raise unittest.SkipTest('Must be run by root on Ethernet link layer') + if not is_eth(instance.ctx, port_num): + raise unittest.SkipTest('Must be run on Ethernet link layer') return func(instance) return inner return outer @@ -1744,6 +1744,32 @@ def is_root(): return os.geteuid() == 0 +def has_cap_net_raw(): + """ + Check if the current process has CAP_NET_RAW capability. 
+ """ + try: + with open('/proc/self/status', 'r') as f: + for line in f: + if line.startswith('CapEff:'): + # Parse the effective capabilities (hex format) + cap_eff = int(line.split()[1], 16) + return bool(cap_eff & 1 << 13) + except (IOError, ValueError, IndexError): + pass + return False + + +def requires_cap_net_raw(): + def outer(func): + def inner(instance): + if not has_cap_net_raw(): + raise unittest.SkipTest('The process must have CAP_NET_RAW set') + return func(instance) + return inner + return outer + + def post_rq_state_bad_flow(test_obj): """ Check post_recive on rq while qp is in invalid state. From fd734dd86c0e6abd3233a0eec7920b8f7902163d Mon Sep 17 00:00:00 2001 From: Shlomo Assaf Date: Wed, 25 Jun 2025 13:38:31 +0300 Subject: [PATCH 21/66] tests: Add test for privileged QKEY functionality Add new test test_create_ud_qp_with_privileged_qkey to cover privileged QKEY (0x80000000) functionality for UD QPs. The test verifies that the privileged QKEY can be set and queried correctly for both legacy QPs (created via ibv_create_qp) and extended QPs (created via ibv_create_qp_ex). The test ensures proper handling of IB_QP_PRIVILEGED_Q_KEY by: - Creating UD QPs using both legacy and extended creation methods - Setting the privileged QKEY value during QP initialization - Querying the QKEY attribute to verify it was set correctly - Validating that both creation paths handle privileged QKEYs properly This test requires CAP_NET_RAW capability as privileged QKEYs are restricted to privileged users. 
Signed-off-by: Shlomo Assaf Signed-off-by: Edward Srouji --- tests/test_qp.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/test_qp.py b/tests/test_qp.py index 8bd20404e..6384ec40b 100644 --- a/tests/test_qp.py +++ b/tests/test_qp.py @@ -370,6 +370,40 @@ def test_modify_ud_qp(self): qp.modify(qa, ibv_qp_attr_mask.IBV_QP_STATE) assert qp.qp_state == ibv_qp_state.IBV_QPS_RESET, 'Extended QP, QP state is not as expected' + @u.skip_unsupported + @u.requires_cap_net_raw() + def test_create_ud_qp_with_privileged_qkey(self): + """ + Test UD QP creation and modification with privileged QKEY 0x80000000. + Verifies that the privileged QKEY (IB_QP_PRIVILEGED_Q_KEY) can be set + and queried correctly for both legacy and extended QPs. + """ + privileged_qkey = 0x80000000 # IB_QP_PRIVILEGED_Q_KEY + + with PD(self.ctx) as pd: + with CQ(self.ctx, 100, None, None, 0) as cq: + # Test Legacy QP with privileged QKEY + qia = u.get_qp_init_attr(cq, self.attr) + qia.qp_type = ibv_qp_type.IBV_QPT_UD + qp = self.create_qp(pd, qia, False, False, self.ib_port) + qa = QPAttr() + qa.qkey = privileged_qkey + qp.to_init(qa) + qp_attr, _ = qp.query(ibv_qp_attr_mask.IBV_QP_QKEY) + self.assertEqual(qp_attr.qkey, qa.qkey, + f'Legacy QP, Privileged QKey is not as '\ + f'expected. Got: 0x{qp_attr.qkey:08x}, Expected: 0x{qa.qkey:08x}') + + # Test Extended QP with privileged QKEY + qia = get_qp_init_attr_ex(cq, pd, self.attr, self.attr_ex, ibv_qp_type.IBV_QPT_UD) + qp = self.create_qp(self.ctx, qia, True, False, self.ib_port) + qa = QPAttr() + qa.qkey = privileged_qkey + qp.to_init(qa) + qp_attr, _ = qp.query(ibv_qp_attr_mask.IBV_QP_QKEY) + self.assertEqual(qp_attr.qkey, qa.qkey, + f'Extended QP, Privileged QKey is not as '\ + f'expected. 
Got: 0x{qp_attr.qkey:08x}, Expected: 0x{qa.qkey:08x}') class RCQPTest(RDMATestCase): """ From 3a288e40bc647b783520614ab0136327defd19a4 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Tue, 8 Jul 2025 17:23:09 +0300 Subject: [PATCH 22/66] mlx5: Add support for bulk flow counters in mlx5dv_create_flow Extend the mlx5dv_create_flow API to support bulk counter operations by introducing a new action type MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET. This allows users to specify an offset within DEVX counter objects for more granular bulk counter object management. The implementation removes the previous auxiliary array approach (_mlx5dv_create_flow with actions_attr_aux parameter) in favor of a cleaner design that embeds offset information directly within the flow action structure. The mlx5dv_flow_action_attr union is extended with a bulk_obj member containing both the DEVX object and an offset, allowing also external rdma-core applications to use DEVX bulk counter via the offset. Existing applications using MLX5DV_FLOW_ACTION_COUNTERS_DEVX continue to work unchanged, while new applications can use the enhanced MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET for bulk counter scenarios. Note that no kernel changes needed, since DEVX bulk counter object with offset is already supported. 
Signed-off-by: Maher Sanalla Signed-off-by: Edward Srouji --- providers/mlx5/dr_action.c | 7 +++--- providers/mlx5/dr_rule.c | 21 ++++------------ providers/mlx5/man/mlx5dv_create_flow.3.md | 13 +++++++++- providers/mlx5/mlx5.h | 13 +++++----- providers/mlx5/mlx5dv.h | 5 ++++ providers/mlx5/mlx5dv_dr.h | 3 +-- providers/mlx5/verbs.c | 29 +++++++++++++--------- 7 files changed, 49 insertions(+), 42 deletions(-) diff --git a/providers/mlx5/dr_action.c b/providers/mlx5/dr_action.c index 4377bf8ba..8fe966eef 100644 --- a/providers/mlx5/dr_action.c +++ b/providers/mlx5/dr_action.c @@ -975,8 +975,7 @@ int dr_actions_build_ste_arr(struct mlx5dv_dr_matcher *matcher, int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, struct mlx5dv_dr_action *actions[], size_t num_actions, - struct mlx5dv_flow_action_attr *attr, - struct mlx5_flow_action_attr_aux *attr_aux) + struct mlx5dv_flow_action_attr *attr) { struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; int i; @@ -1026,8 +1025,8 @@ int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, attr[i].obj = actions[i]->ctr.devx_obj; if (actions[i]->ctr.offset) { - attr_aux[i].type = MLX5_FLOW_ACTION_COUNTER_OFFSET; - attr_aux[i].offset = actions[i]->ctr.offset; + attr[i].type = MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET; + attr[i].bulk_obj.offset = actions[i]->ctr.offset; } break; case DR_ACTION_TYP_TAG: diff --git a/providers/mlx5/dr_rule.c b/providers/mlx5/dr_rule.c index 61bc1e469..e324e90b5 100644 --- a/providers/mlx5/dr_rule.c +++ b/providers/mlx5/dr_rule.c @@ -1552,7 +1552,6 @@ dr_rule_create_rule_root(struct mlx5dv_dr_matcher *matcher, struct mlx5dv_dr_action *actions[]) { struct mlx5dv_flow_action_attr *attr; - struct mlx5_flow_action_attr_aux *attr_aux; struct mlx5dv_dr_rule *rule; int ret; @@ -1570,37 +1569,27 @@ dr_rule_create_rule_root(struct mlx5dv_dr_matcher *matcher, goto free_rule; } - attr_aux = calloc(num_actions, sizeof(*attr_aux)); - if (!attr_aux) { - errno = ENOMEM; - goto free_attr; - } - - ret = 
dr_actions_build_attr(matcher, actions, num_actions, attr, attr_aux); + ret = dr_actions_build_attr(matcher, actions, num_actions, attr); if (ret) - goto free_attr_aux; + goto free_attr; ret = dr_rule_add_action_members(rule, num_actions, actions); if (ret) - goto free_attr_aux; + goto free_attr; - rule->flow = _mlx5dv_create_flow(matcher->dv_matcher, + rule->flow = mlx5dv_create_flow(matcher->dv_matcher, value, num_actions, - attr, - attr_aux); + attr); if (!rule->flow) goto remove_action_members; free(attr); - free(attr_aux); return rule; remove_action_members: dr_rule_remove_action_members(rule); -free_attr_aux: - free(attr_aux); free_attr: free(attr); free_rule: diff --git a/providers/mlx5/man/mlx5dv_create_flow.3.md b/providers/mlx5/man/mlx5dv_create_flow.3.md index 1b7622a95..7334bafc9 100644 --- a/providers/mlx5/man/mlx5dv_create_flow.3.md +++ b/providers/mlx5/man/mlx5dv_create_flow.3.md @@ -46,6 +46,10 @@ struct mlx5dv_flow_action_attr { struct ibv_flow_action *action; uint32_t tag_value; struct mlx5dv_devx_obj *obj; + struct { + struct mlx5dv_devx_obj *obj; + uint32_t offset; + } bulk_obj; }; }; ``` @@ -65,6 +69,8 @@ struct mlx5dv_flow_action_attr { Steer the packet to the default miss destination. MLX5DV_FLOW_ACTION_DROP Action is dropping the matched packet. + MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET + The DEVX bulk counter object and its counter offset for the matched packets. *qp* : QP passed, to be used with *type* *MLX5DV_FLOW_ACTION_DEST_IBV_QP*. @@ -78,7 +84,12 @@ struct mlx5dv_flow_action_attr { *MLX5DV_FLOW_ACTION_TAG* see *ibv_create_cq_ex(3)*. *obj* -: DEVX object, to be used with *type* *MLX5DV_FLOW_ACTION_DEST_DEVX* or by *MLX5DV_FLOW_ACTION_COUNTERS_DEVX*. +: DEVX object, to be used with *type* *MLX5DV_FLOW_ACTION_DEST_DEVX* or by *MLX5DV_FLOW_ACTION_COUNTERS_DEVX* + or by *MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET*. 
+ +*offset* +: offset to the target counter within a bulk DEVX object, to be used with *type* + *MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET* # RETURN VALUE diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index d98db9aa1..34c18083b 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -962,8 +962,7 @@ struct ibv_flow * _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, struct mlx5dv_flow_match_parameters *match_value, size_t num_actions, - struct mlx5dv_flow_action_attr actions_attr[], - struct mlx5_flow_action_attr_aux actions_attr_aux[]); + struct mlx5dv_flow_action_attr actions_attr[]); extern int mlx5_stall_num_loop; extern int mlx5_stall_cq_poll_min; @@ -1586,11 +1585,11 @@ struct mlx5_dv_context_ops { struct mlx5dv_flow_matcher *(*create_flow_matcher)(struct ibv_context *context, struct mlx5dv_flow_matcher_attr *attr); int (*destroy_flow_matcher)(struct mlx5dv_flow_matcher *flow_matcher); - struct ibv_flow *(*create_flow)(struct mlx5dv_flow_matcher *flow_matcher, - struct mlx5dv_flow_match_parameters *match_value, - size_t num_actions, - struct mlx5dv_flow_action_attr actions_attr[], - struct mlx5_flow_action_attr_aux actions_attr_aux[]); + struct ibv_flow *(*create_flow)( + struct mlx5dv_flow_matcher *flow_matcher, + struct mlx5dv_flow_match_parameters *match_value, + size_t num_actions, + struct mlx5dv_flow_action_attr actions_attr[]); struct mlx5dv_steering_anchor *(*create_steering_anchor)(struct ibv_context *conterxt, struct mlx5dv_steering_anchor_attr *attr); diff --git a/providers/mlx5/mlx5dv.h b/providers/mlx5/mlx5dv.h index 4e8d461e7..a24940196 100644 --- a/providers/mlx5/mlx5dv.h +++ b/providers/mlx5/mlx5dv.h @@ -783,6 +783,7 @@ enum mlx5dv_flow_action_type { MLX5DV_FLOW_ACTION_DEST_DEVX, MLX5DV_FLOW_ACTION_COUNTERS_DEVX, MLX5DV_FLOW_ACTION_DEFAULT_MISS, + MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET, }; struct mlx5dv_flow_action_attr { @@ -793,6 +794,10 @@ struct mlx5dv_flow_action_attr { struct ibv_flow_action 
*action; uint32_t tag_value; struct mlx5dv_devx_obj *obj; + struct { + struct mlx5dv_devx_obj *obj; + uint32_t offset; + } bulk_obj; }; }; diff --git a/providers/mlx5/mlx5dv_dr.h b/providers/mlx5/mlx5dv_dr.h index 36fb67e28..8639b37c5 100644 --- a/providers/mlx5/mlx5dv_dr.h +++ b/providers/mlx5/mlx5dv_dr.h @@ -691,8 +691,7 @@ int dr_actions_build_ste_arr(struct mlx5dv_dr_matcher *matcher, int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, struct mlx5dv_dr_action *actions[], size_t num_actions, - struct mlx5dv_flow_action_attr *attr, - struct mlx5_flow_action_attr_aux *attr_aux); + struct mlx5dv_flow_action_attr *attr); uint32_t dr_actions_reformat_get_id(struct mlx5dv_dr_action *action); diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 49894ddc9..5694194e7 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -5615,8 +5615,7 @@ struct ibv_flow * _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, struct mlx5dv_flow_match_parameters *match_value, size_t num_actions, - struct mlx5dv_flow_action_attr actions_attr[], - struct mlx5_flow_action_attr_aux actions_attr_aux[]) + struct mlx5dv_flow_action_attr actions_attr[]) { uint32_t flow_actions[CREATE_FLOW_MAX_FLOW_ACTIONS_SUPPORTED]; struct verbs_flow_action *vaction; @@ -5626,6 +5625,7 @@ _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, bool have_dest_devx = false; bool have_flow_tag = false; bool have_counter = false; + bool have_bulk_counter = false; bool have_default = false; bool have_drop = false; int ret; @@ -5695,20 +5695,13 @@ _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, have_flow_tag = true; break; case MLX5DV_FLOW_ACTION_COUNTERS_DEVX: - if (have_counter) { + if (have_counter || have_bulk_counter) { errno = EOPNOTSUPP; goto err; } fill_attr_in_objs_arr(cmd, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &actions_attr[i].obj->handle, 1); - - if (actions_attr_aux && - actions_attr_aux[i].type == MLX5_FLOW_ACTION_COUNTER_OFFSET) - 
fill_attr_in_ptr_array(cmd, - MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, - &actions_attr_aux[i].offset, 1); - have_counter = true; break; case MLX5DV_FLOW_ACTION_DEFAULT_MISS: @@ -5733,6 +5726,19 @@ _mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP); have_drop = true; break; + case MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET: + if (have_counter || have_bulk_counter) { + errno = EOPNOTSUPP; + goto err; + } + fill_attr_in_objs_arr(cmd, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, + &actions_attr[i].bulk_obj.obj->handle, 1); + fill_attr_in_ptr_array(cmd, + MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, + &actions_attr[i].bulk_obj.offset, 1); + have_bulk_counter = true; + break; default: errno = EOPNOTSUPP; goto err; @@ -5772,8 +5778,7 @@ mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, return dvops->create_flow(flow_matcher, match_value, num_actions, - actions_attr, - NULL); + actions_attr); } static struct mlx5dv_steering_anchor * From f291e45a477210e647d3b39b8db868e45924db4c Mon Sep 17 00:00:00 2001 From: Maxim Chicherin Date: Wed, 16 Jul 2025 04:24:23 +0300 Subject: [PATCH 23/66] pyverbs: Add support to flow counters with offset Add new action to flow API to support flow counters with offset. 
Signed-off-by: Maxim Chicherin Reviewed-by: Shachar Kagan Signed-off-by: Edward Srouji --- pyverbs/providers/mlx5/libmlx5.pxd | 5 ++++ pyverbs/providers/mlx5/mlx5_enums.pxd | 1 + pyverbs/providers/mlx5/mlx5dv_flow.pyx | 36 +++++++++++++++++++++----- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/pyverbs/providers/mlx5/libmlx5.pxd b/pyverbs/providers/mlx5/libmlx5.pxd index 09f32bb62..ac415de6a 100644 --- a/pyverbs/providers/mlx5/libmlx5.pxd +++ b/pyverbs/providers/mlx5/libmlx5.pxd @@ -155,12 +155,17 @@ cdef extern from 'infiniband/mlx5dv.h': cdef struct mlx5dv_devx_obj + cdef struct bulk_obj: + mlx5dv_devx_obj *obj + int offset + cdef struct mlx5dv_flow_action_attr: mlx5dv_flow_action_type type v.ibv_qp *qp v.ibv_flow_action *action unsigned int tag_value mlx5dv_devx_obj *obj + bulk_obj bulk_obj cdef struct mlx5dv_dr_domain diff --git a/pyverbs/providers/mlx5/mlx5_enums.pxd b/pyverbs/providers/mlx5/mlx5_enums.pxd index ee1b7e7e2..30c6ca14e 100644 --- a/pyverbs/providers/mlx5/mlx5_enums.pxd +++ b/pyverbs/providers/mlx5/mlx5_enums.pxd @@ -185,6 +185,7 @@ cdef extern from 'infiniband/mlx5dv.h': MLX5DV_FLOW_ACTION_DEST_DEVX MLX5DV_FLOW_ACTION_COUNTERS_DEVX MLX5DV_FLOW_ACTION_DEFAULT_MISS + MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET cpdef enum mlx5dv_dr_domain_type: MLX5DV_DR_DOMAIN_TYPE_NIC_RX diff --git a/pyverbs/providers/mlx5/mlx5dv_flow.pyx b/pyverbs/providers/mlx5/mlx5dv_flow.pyx index c509621e5..0f26edfcc 100644 --- a/pyverbs/providers/mlx5/mlx5dv_flow.pyx +++ b/pyverbs/providers/mlx5/mlx5dv_flow.pyx @@ -198,7 +198,7 @@ cdef class Mlx5ModifyFlowAction(FlowAction): cdef class Mlx5FlowActionAttr(PyverbsObject): def __init__(self, action_type=None, QP qp=None, - FlowAction flow_action=None, Mlx5DevxObj obj=None): + FlowAction flow_action=None, Mlx5DevxObj obj=None, offset=0): """ Initialize a Mlx5FlowActionAttr object over an underlying mlx5dv_flow_action_attr C object that defines actions attributes for @@ -207,6 +207,7 @@ cdef class 
Mlx5FlowActionAttr(PyverbsObject): :param qp: A QP target for go to QP action :param flow_action: An action to perform for the flow :param obj: DEVX object + :param offset: Offset for the counters devx offset action """ super().__init__() if action_type: @@ -220,6 +221,10 @@ cdef class Mlx5FlowActionAttr(PyverbsObject): elif action_type in [dv.MLX5DV_FLOW_ACTION_DEST_DEVX, dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]: self.attr.obj = obj.obj self.devx_obj = obj + elif action_type == dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET: + self.attr.bulk_obj.obj = obj.obj + self.attr.bulk_obj.offset = offset + self.devx_obj = obj elif action_type: raise PyverbsUserError(f'Unsupported action type: {action_type}.') @@ -231,7 +236,8 @@ cdef class Mlx5FlowActionAttr(PyverbsObject): def type(self, action_type): if not (self.attr.type in [dv.MLX5DV_FLOW_ACTION_DEST_IBV_QP, dv.MLX5DV_FLOW_ACTION_DEST_DEVX, - dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]): + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX, + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET]): raise PyverbsUserError(f'Unsupported action type: {action_type}.') self.attr.type = action_type @@ -251,17 +257,22 @@ cdef class Mlx5FlowActionAttr(PyverbsObject): @property def devx_obj(self): if not (self.attr.type in [dv.MLX5DV_FLOW_ACTION_DEST_DEVX, - dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]): + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX, + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET]): raise PyverbsUserError(f'Action attr of type {self.attr.type} doesn\'t have a devx_obj') return self.devx_obj @devx_obj.setter def devx_obj(self, Mlx5DevxObj devx_obj): if not (self.attr.type in [dv.MLX5DV_FLOW_ACTION_DEST_DEVX, - dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]): + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX, + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET]): raise PyverbsUserError(f'Action attr of type {self.attr.type} doesn\'t have a devx_obj') self.devx_obj = devx_obj - self.attr.obj = devx_obj.obj + if self.attr.type == dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET: + 
self.attr.bulk_obj.obj = devx_obj.obj + else: + self.attr.obj = devx_obj.obj @property def action(self): @@ -276,6 +287,18 @@ cdef class Mlx5FlowActionAttr(PyverbsObject): self.action = action self.attr.action = action.action + @property + def offset(self): + if self.attr.type != dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET: + raise PyverbsUserError(f'Action attr of type {self.attr.type} doesn\'t have an offset') + return self.attr.bulk_obj.offset + + @offset.setter + def offset(self, int offset): + if self.attr.type != dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET: + raise PyverbsUserError(f'Action attr of type {self.attr.type} doesn\'t have an offset') + self.attr.bulk_obj.offset = offset + cdef class Mlx5Flow(Flow): def __init__(self, Mlx5FlowMatcher matcher, @@ -305,7 +328,8 @@ cdef class Mlx5Flow(Flow): ((attr.qp)).add_ref(self) self.qp = (attr).qp elif (attr).attr.type in [dv.MLX5DV_FLOW_ACTION_DEST_DEVX, - dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX]: + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX, + dv.MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET]: self.devx_obj = (attr).devx_obj elif (attr).attr.type not in [dv.MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION]: raise PyverbsUserError(f'Unsupported action type: ' From 5bb2523fcfc904954c5880622ae8eeb4334010c6 Mon Sep 17 00:00:00 2001 From: Maxim Chicherin Date: Sun, 20 Jul 2025 15:30:27 +0300 Subject: [PATCH 24/66] tests: Add test for flow counter action with offset Add test_counters_bulk_flow to test flow action counter with offset. 
Signed-off-by: Maxim Chicherin Reviewed-by: Shachar Kagan Signed-off-by: Edward Srouji --- tests/mlx5_prm_structs.py | 20 ++++++- tests/test_mlx5_flow.py | 106 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 1 deletion(-) diff --git a/tests/mlx5_prm_structs.py b/tests/mlx5_prm_structs.py index 16eaada16..055e8e0a4 100644 --- a/tests/mlx5_prm_structs.py +++ b/tests/mlx5_prm_structs.py @@ -1576,7 +1576,8 @@ class AllocFlowCounterIn(PRMPacket): ShortField('reserved1', 0), ShortField('op_mod', 0), IntField('flow_counter_id', 0), - BitField('reserved2', 0, 24), + BitField('reserved2', 0, 19), + BitField('flow_counter_bulk_log_size', 0, 5), ByteField('flow_counter_bulk', 0), ] @@ -1644,6 +1645,23 @@ class QueryFlowCounterOut(PRMPacket): ] +class QueryBulkFlowCounterOut(PRMPacket): + fields_desc = [ + ByteField('status', 0), + BitField('reserved1', 0, 24), + IntField('syndrome', 0), + StrFixedLenField('reserved2', None, length=4), + IntField('num_statistics', 0), + PacketListField('flow_statistics', [], TrafficCounter), + ] + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.num_statistics > 0: + self.flow_statistics = [TrafficCounter() for _ in range(self.num_statistics)] + + class RxHashFieldSelect(PRMPacket): fields_desc = [ BitField('l3_prot_type', 0, 1), diff --git a/tests/test_mlx5_flow.py b/tests/test_mlx5_flow.py index 5d553f77c..cd4eca2e9 100644 --- a/tests/test_mlx5_flow.py +++ b/tests/test_mlx5_flow.py @@ -132,6 +132,31 @@ def create_context(self): except PyverbsRDMAError: raise unittest.SkipTest('Opening mlx5 context is not supported') + def query_bulk_counter_cap(self): + """ + Query the device for the maximum allowed bulk allocation size for flow counters. + This method queries the HCA capabilities to determine the maximum log2 bulk size + that can be allocated for flow counters. 
It first checks for general HCA capabilities, + then queries HCA_CAP_2 for the specific flow_counter_bulk_log_max_alloc field. + :return: The maximum log2 bulk allocation size for flow counters. + """ + from tests.mlx5_prm_structs import QueryCmdHcaCap2Out, \ + QueryHcaCapIn, QueryCmdHcaCapOut, QueryHcaCapOp, QueryHcaCapMod + query_cap_in = QueryHcaCapIn(op_mod=0x1) + query_cap_out = QueryCmdHcaCapOut(self.ctx.devx_general_cmd( + query_cap_in, len(QueryCmdHcaCapOut()))) + if query_cap_out.status: + raise PyverbsRDMAError('Failed to query general HCA CAPs with syndrome ' + f'({query_cap_out.syndrome}') + + if not query_cap_out.capability.hca_cap_2: + raise unittest.SkipTest('The device doesn\'t support general HCA CAPs 2') + query_cap2_in = QueryHcaCapIn(op_mod=(QueryHcaCapOp.HCA_CAP_2 << 0x1) | \ + QueryHcaCapMod.CURRENT) + query_cap2_out = QueryCmdHcaCap2Out(self.ctx.devx_general_cmd( + query_cap2_in, len(QueryCmdHcaCap2Out()))) + return query_cap2_out.capability.flow_counter_bulk_log_max_alloc + @staticmethod def create_counter(ctx): """ @@ -144,6 +169,26 @@ def create_counter(ctx): flow_counter_id = AllocFlowCounterOut(counter.out_view).flow_counter_id return counter, flow_counter_id + def create_bulk_counter(self, ctx, bulk_size=0): + """ + Create flow bulk counter. + :param ctx: The player context to create the counter on. + :param bulk_size: The bulk size to use for the counter. + :return: The counter object and the flow counter ID . + """ + from tests.mlx5_prm_structs import AllocFlowCounterIn, AllocFlowCounterOut + bulk_log_max_alloc = self.query_bulk_counter_cap() + if bulk_log_max_alloc > 0: + # Bulk size is log2 of the number of counters eg. 
if bulk_size is 9, then there + # are 512 counters + cmd_in = AllocFlowCounterIn(flow_counter_bulk_log_size=bulk_size) + else: + # If bulk_log_max_alloc=0 then use old bitmask field, 0b100 means 512 counters + cmd_in = AllocFlowCounterIn(flow_counter_bulk=0b100) + counter = Mlx5DevxObj(ctx, cmd_in, len(AllocFlowCounterOut())) + flow_counter_id = AllocFlowCounterOut(counter.out_view).flow_counter_id + return counter, flow_counter_id + @staticmethod def query_counter_packets(counter, flow_counter_id): """ @@ -157,6 +202,23 @@ def query_counter_packets(counter, flow_counter_id): counter_out = QueryFlowCounterOut(counter.query(query_in, len(QueryFlowCounterOut()))) return counter_out.flow_statistics.packets + @staticmethod + def query_bulk_counter(counter, flow_counter_id, num_of_counters=1): + """ + Query flow counter packets count for bulk counters. + :param counter: The counter for the query. + :param flow_counter_id: The flow counter ID for the query. + :param num_of_counters: Number of counters to query. + :return: Counter statistics. + """ + from tests.mlx5_prm_structs import QueryFlowCounterIn, QueryBulkFlowCounterOut + query_in = QueryFlowCounterIn(flow_counter_id=flow_counter_id, + num_of_counters=num_of_counters) + counter_out = QueryBulkFlowCounterOut(counter.query(query_in, + len(QueryBulkFlowCounterOut( + num_statistics=num_of_counters)))) + return counter_out.flow_statistics + class Mlx5RCFlowResources(Mlx5RcResources): def __init__(self, dev_name, ib_port, gid_index, is_privileged_ctx=False, **kwargs): @@ -432,6 +494,50 @@ def test_counter_qp_flow(self): sent_packets = self.server.query_counter_packets(counter, flow_counter_id) self.assertEqual(sent_packets, self.iters, 'Counter of metadata missed some sent packets') + @u.skip_unsupported + def test_counters_bulk_flow(self): + """ + Flow action counters with bulk size. + Creates 4 flows with different dst ip addresses. + Sends 10 packets to each flow. 
+ Verifies each flow get a hit and goes to QP final destination and counter is incremented + accordingly. + """ + from tests.mlx5_prm_structs import FlowTableEntryMatchSetLyr24, FlowTableEntryMatchParam + + self.create_players(Mlx5CounterFlowResources) + outer_header = FlowTableEntryMatchSetLyr24(ethertype=0xffff, ip_version=0xf, + src_ip_mask=0xffffffff) + mask = FlowTableEntryMatchParam(outer_headers=outer_header) + matcher = self.server.create_matcher(mask, u.MatchCriteriaEnable.OUTER) + action_qp_attr = Mlx5FlowActionAttr( + action_type=mlx5dv_flow_action_type.MLX5DV_FLOW_ACTION_DEST_IBV_QP, + qp=self.server.qp) + # Create a counter with bulk_size=2 means there are 4 counters in the bulk. + counter, flow_counter_id = self.server.create_bulk_counter(self.server.ctx, bulk_size=2) + action_type = mlx5dv_flow_action_type.MLX5DV_FLOW_ACTION_COUNTERS_DEVX_WITH_OFFSET + flows = [] + flows_num = 4 + + for flow_idx in range(flows_num): + src_ip = f'{flow_idx + 2}.{flow_idx + 2}.{flow_idx + 2}.{flow_idx + 2}' + packet = u.gen_packet(self.client.msg_size, ip_version=4, src_ipv4=src_ip) + # Each offset is different counter + action_counter_attr = Mlx5FlowActionAttr(action_type=action_type, obj=counter, + offset=flow_idx) + outer_header = FlowTableEntryMatchSetLyr24(ethertype=PacketConsts.ETHER_TYPE_IPV4, + ip_version=4, src_ip4=src_ip) + value_param = Mlx5FlowMatchParameters(len(outer_header), outer_header) + flows.append(Mlx5Flow(matcher, value_param, [action_counter_attr, action_qp_attr], 2)) + u.raw_traffic(self.client, self.server, self.iters, packet_to_send=packet) + ctr_stats = self.server.query_bulk_counter(counter, flow_counter_id, + num_of_counters=flows_num) + # Verify counters + for j in range(flows_num): + exp_num_pkts = self.iters if j <= flow_idx else 0 + self.assertEqual(ctr_stats[j].packets, exp_num_pkts, + f'Counter {j} in metadata missed some sent packets') + @requires_reformat_support @u.requires_encap_disabled_if_eswitch_on def 
test_tx_packet_reformat(self): From a15caa013c388b5c17541f7dddca36091bf62d8a Mon Sep 17 00:00:00 2001 From: Maxim Chicherin Date: Mon, 9 Jun 2025 02:27:23 +0300 Subject: [PATCH 25/66] pyverbs: Add support to MREX and DMA Handle 1. Add support to MR extended. 2. Add DMA Handle class. Signed-off-by: Maxim Chicherin Reviewed-by: Shachar Kagan Signed-off-by: Edward Srouji --- pyverbs/device.pxd | 1 + pyverbs/device.pyx | 7 +- pyverbs/libibverbs.pxd | 22 +++ pyverbs/libibverbs_enums.pxd | 17 +++ pyverbs/mr.pxd | 16 ++- pyverbs/mr.pyx | 257 +++++++++++++++++++++++++++++++---- 6 files changed, 294 insertions(+), 26 deletions(-) diff --git a/pyverbs/device.pxd b/pyverbs/device.pxd index 4f9c7ca94..a13dd063f 100644 --- a/pyverbs/device.pxd +++ b/pyverbs/device.pxd @@ -28,6 +28,7 @@ cdef class Context(PyverbsCM): cdef object rwq_ind_tbls cdef object crypto_logins cdef object event_channels + cdef object dmahs cdef class DeviceAttr(PyverbsObject): cdef v.ibv_device_attr dev_attr diff --git a/pyverbs/device.pyx b/pyverbs/device.pyx index aecfb9357..1eaf3cfd2 100644 --- a/pyverbs/device.pyx +++ b/pyverbs/device.pyx @@ -20,7 +20,7 @@ cimport pyverbs.librdmacm as cm from pyverbs.cmid cimport CMID from pyverbs.xrcd cimport XRCD from pyverbs.addr cimport GID -from pyverbs.mr import DMMR +from pyverbs.mr import DMMR, DMAHandle from pyverbs.pd cimport PD from pyverbs.qp cimport QP from libc.stdlib cimport free, malloc @@ -123,6 +123,7 @@ cdef class Context(PyverbsCM): self.rwq_ind_tbls = weakref.WeakSet() self.crypto_logins = weakref.WeakSet() self.event_channels = weakref.WeakSet() + self.dmahs = weakref.WeakSet() self.name = kwargs.get('name') provider_attr = kwargs.get('attr') @@ -179,7 +180,7 @@ cdef class Context(PyverbsCM): self.logger.debug('Closing Context') close_weakrefs([self.qps, self.crypto_logins, self.rwq_ind_tbls, self.wqs, self.ccs, self.cqs, self.dms, self.pds, self.xrcds, self.vars, self.sched_leafs, - self.sched_nodes, self.dr_domains, self.event_channels]) + 
self.sched_nodes, self.dr_domains, self.event_channels, self.dmahs]) rc = v.ibv_close_device(self.context) if rc != 0: raise PyverbsRDMAErrno(f'Failed to close device {self.name}') @@ -346,6 +347,8 @@ cdef class Context(PyverbsCM): self.wqs.add(obj) elif isinstance(obj, RwqIndTable): self.rwq_ind_tbls.add(obj) + elif isinstance(obj, DMAHandle): + self.dmahs.add(obj) else: raise PyverbsError('Unrecognized object type') diff --git a/pyverbs/libibverbs.pxd b/pyverbs/libibverbs.pxd index e3b40aa31..18889f561 100644 --- a/pyverbs/libibverbs.pxd +++ b/pyverbs/libibverbs.pxd @@ -645,6 +645,25 @@ cdef extern from 'infiniband/verbs.h': ibv_wq **ind_tbl uint32_t comp_mask + cdef struct ibv_dmah_init_attr: + uint32_t comp_mask + uint32_t cpu_id + uint8_t ph + uint8_t tph_mem_type + + cdef struct ibv_dmah: + ibv_context *context + + cdef struct ibv_mr_init_attr: + size_t length + int access + uint64_t comp_mask + uint64_t iova + void *addr + int fd + uint64_t fd_offset + ibv_dmah *dmah + ibv_device **ibv_get_device_list(int *n) int ibv_get_device_index(ibv_device *device); void ibv_free_device_list(ibv_device **list) @@ -671,6 +690,9 @@ cdef extern from 'infiniband/verbs.h': int ibv_dereg_mr(ibv_mr *mr) int ibv_advise_mr(ibv_pd *pd, uint32_t advice, uint32_t flags, ibv_sge *sg_list, uint32_t num_sge) + ibv_dmah *ibv_alloc_dmah(ibv_context *context, ibv_dmah_init_attr *attr) + int ibv_dealloc_dmah(ibv_dmah *dmah) + ibv_mr *ibv_reg_mr_ex(ibv_pd *pd, ibv_mr_init_attr *mr_init_attr) ibv_mw *ibv_alloc_mw(ibv_pd *pd, ibv_mw_type type) int ibv_dealloc_mw(ibv_mw *mw) ibv_dm *ibv_alloc_dm(ibv_context *context, ibv_alloc_dm_attr *attr) diff --git a/pyverbs/libibverbs_enums.pxd b/pyverbs/libibverbs_enums.pxd index 4514d808e..aae55d1c3 100644 --- a/pyverbs/libibverbs_enums.pxd +++ b/pyverbs/libibverbs_enums.pxd @@ -504,6 +504,23 @@ cdef extern from '': IBV_FLUSH_MR IBV_FLUSH_RANGE + cpdef enum ibv_tph_mem_type: + IBV_TPH_MEM_TYPE_VM + IBV_TPH_MEM_TYPE_PM + + cpdef enum 
ibv_dmah_init_attr_mask: + IBV_DMAH_INIT_ATTR_MASK_CPU_ID + IBV_DMAH_INIT_ATTR_MASK_PH + IBV_DMAH_INIT_ATTR_MASK_TPH_MEM_TYPE + + cpdef enum ibv_mr_init_attr_mask: + IBV_REG_MR_MASK_IOVA + IBV_REG_MR_MASK_ADDR + IBV_REG_MR_MASK_FD + IBV_REG_MR_MASK_FD_OFFSET + IBV_REG_MR_MASK_DMAH + + cdef extern from "": cdef unsigned long long IBV_ADVISE_MR_ADVICE_PREFETCH cdef unsigned long long IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE diff --git a/pyverbs/mr.pxd b/pyverbs/mr.pxd index d9a79ff55..94ed2c251 100644 --- a/pyverbs/mr.pxd +++ b/pyverbs/mr.pxd @@ -4,7 +4,7 @@ #cython: language_level=3 -from pyverbs.base cimport PyverbsCM +from pyverbs.base cimport PyverbsCM, PyverbsObject cimport pyverbs.librdmacm as cm from . cimport libibverbs as v @@ -18,8 +18,13 @@ cdef class MR(PyverbsCM): cdef object is_user_addr cdef void *buf cdef object _is_imported + cdef void _allocate_buffer(self, size_t length, bint is_huge, int *mmap_length) + cdef void _free_buffer(self, bint is_huge, int mmap_length) cpdef read(self, length, offset) +cdef class MREx(MR): + cdef object dmah + cdef class MWBindInfo(PyverbsCM): cdef v.ibv_mw_bind_info info cdef object mr @@ -39,3 +44,12 @@ cdef class DmaBufMR(MR): cdef object dmabuf cdef unsigned long offset cdef object is_dmabuf_internal + +cdef class DmaHandleInitAttr(PyverbsObject): + cdef v.ibv_dmah_init_attr init_attr + +cdef class DMAHandle(PyverbsCM): + cdef v.ibv_dmah *dmah + cdef object mrs + cdef object ctx + cdef add_ref(self, obj) diff --git a/pyverbs/mr.pyx b/pyverbs/mr.pyx index 602a4fe99..706653b7c 100644 --- a/pyverbs/mr.pyx +++ b/pyverbs/mr.pyx @@ -4,6 +4,7 @@ import resource import logging +import weakref from posix.mman cimport mmap, munmap, MAP_PRIVATE, PROT_READ, PROT_WRITE, \ MAP_ANONYMOUS, MAP_HUGETLB, MAP_SHARED @@ -17,9 +18,11 @@ from libc.string cimport memcpy, memset cimport pyverbs.libibverbs_enums as e from pyverbs.device cimport DM from libc.stdlib cimport free, malloc +from pyverbs.device cimport Context from .cmid cimport CMID 
from .pd cimport PD from .dmabuf cimport DmaBuf +from pyverbs.base cimport close_weakrefs cdef extern from 'sys/mman.h': cdef void* MAP_FAILED @@ -55,6 +58,7 @@ cdef class MR(PyverbsCM): :return: The newly created MR on success """ super().__init__() + cdef int mmap_len = 0 if self.mr != NULL: return self.is_huge = True if access & e.IBV_ACCESS_HUGETLB else False @@ -78,21 +82,11 @@ cdef class MR(PyverbsCM): # Allocate a buffer if not address and length > 0: - if self.is_huge: - # Rounding up to multiple of HUGE_PAGE_SIZE - self.mmap_length = length + (HUGE_PAGE_SIZE - length % HUGE_PAGE_SIZE) \ - if length % HUGE_PAGE_SIZE else length - self.buf = mmap(NULL, self.mmap_length, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0) - if self.buf == MAP_FAILED: - raise PyverbsError('Failed to allocate MR buffer of size {l}'. - format(l=length)) - else: - rc = posix_memalign(&self.buf, resource.getpagesize(), length) - if rc: - raise PyverbsError('Failed to allocate MR buffer of size {l}'. - format(l=length)) - memset(self.buf, 0, length) + self._allocate_buffer(length, self.is_huge, &mmap_len) + if self.buf == NULL: + raise PyverbsError('Failed to allocate MR buffer of size {l}'. 
+ format(l=length)) + self.mmap_length = mmap_len if isinstance(creator, PD): pd = creator if implicit: @@ -142,10 +136,7 @@ cdef class MR(PyverbsCM): if rc != 0: raise PyverbsRDMAError('Failed to dereg MR', rc) if not self.is_user_addr: - if self.is_huge: - munmap(self.buf, self.mmap_length) - else: - free(self.buf) + self._free_buffer(self.is_huge, self.mmap_length) self.mr = NULL self.pd = None self.buf = NULL @@ -206,10 +197,7 @@ cdef class MR(PyverbsCM): if flags & e.IBV_REREG_MR_CHANGE_TRANSLATION: if not self.is_user_addr: - if self.is_huge: - munmap(self.buf, self.mmap_length) - else: - free(self.buf) + self._free_buffer(self.is_huge, self.mmap_length) self.buf = addr self.is_user_addr = True @@ -218,6 +206,38 @@ cdef class MR(PyverbsCM): self.pd = pd pd.add_ref(self) + cdef void _allocate_buffer(self, size_t length, bint is_huge, int *mmap_length): + cdef void *buf = NULL + cdef size_t rounded + cdef int rc + if length == 0: + mmap_length[0] = 0 + return + if is_huge: + rounded = length + (HUGE_PAGE_SIZE - length % HUGE_PAGE_SIZE) if \ + length % HUGE_PAGE_SIZE else length + buf = mmap(NULL, rounded, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0) + if buf == MAP_FAILED: + return + mmap_length[0] = rounded + else: + rc = posix_memalign(&buf, resource.getpagesize(), length) + if rc: + return + mmap_length[0] = 0 + + memset(buf, 0, length) + self.buf = buf + + cdef void _free_buffer(self, bint is_huge, int mmap_length): + if self.buf == NULL: + return + if is_huge: + munmap(self.buf, mmap_length) + else: + free(self.buf) + @property def buf(self): return self.buf @@ -490,9 +510,200 @@ cdef class DmaBufMR(MR): return res +cdef class DmaHandleInitAttr(PyverbsObject): + def __init__(self, comp_mask=0, cpu_id=0, ph=0, tph_mem_type=0): + """ + Initialize a DmaHandleInitAttr object with the specified configuration. 
+ :param comp_mask: Bitmask of initialization attributes + :param cpu_id: CPU identifier for DMA operations + :param ph: Processing hints for DMA operations + :param tph_mem_type: Type of target memory + """ + super().__init__() + self.init_attr.comp_mask = comp_mask + self.init_attr.cpu_id = cpu_id + self.init_attr.ph = ph + self.init_attr.tph_mem_type = tph_mem_type + + @property + def comp_mask(self): + return self.init_attr.comp_mask + + @comp_mask.setter + def comp_mask(self, value): + self.init_attr.comp_mask = value + + @property + def cpu_id(self): + return self.init_attr.cpu_id + + @cpu_id.setter + def cpu_id(self, value): + self.init_attr.cpu_id = value + + @property + def ph(self): + return self.init_attr.ph + + @ph.setter + def ph(self, value): + self.init_attr.ph = value + + @property + def tph_mem_type(self): + return self.init_attr.tph_mem_type + + @tph_mem_type.setter + def tph_mem_type(self, value): + self.init_attr.tph_mem_type = value + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return 'DmaHandleInitAttr:\n' + \ + print_format.format('comp_mask', self.comp_mask) + \ + print_format.format('cpu_id', self.cpu_id) + \ + print_format.format('ph', self.ph) + \ + print_format.format('tph_mem_type', self.tph_mem_type) + + +cdef class DMAHandle(PyverbsCM): + def __init__(self, Context ctx not None, DmaHandleInitAttr init_attr not None): + """ + Initialize a DMAHandle object with the specified configuration. 
+ :param ctx: A Context object representing the RDMA device context + :param init_attr: A DmaHandleInitAttr object containing initialization attributes + :raises PyverbsError: If DMA handle allocation fails + """ + super().__init__() + + self.dmah = v.ibv_alloc_dmah(ctx.context, &init_attr.init_attr) + if self.dmah == NULL: + raise PyverbsRDMAErrno('Failed to create DMA Handle') + + self.mrs = weakref.WeakSet() + self.ctx = ctx + ctx.add_ref(self) + + def __dealloc__(self): + self.close() + + cpdef close(self): + """ + Close and deallocate the DMA handle. + :raises PyverbsRDMAError: If deallocation fails + """ + if self.dmah != NULL: + if self.logger: + self.logger.debug('Closing DMA Handle') + close_weakrefs([self.mrs]) + rc = v.ibv_dealloc_dmah(self.dmah) + if rc != 0: + raise PyverbsRDMAError('Failed to dealloc DMA Handle', rc) + self.dmah = NULL + self.ctx = None + + cdef add_ref(self, obj): + if isinstance(obj, MREx): + self.mrs.add(obj) + else: + raise PyverbsError('Unrecognized object type') + + def mwtype2str(mw_type): mw_types = {1:'IBV_MW_TYPE_1', 2:'IBV_MW_TYPE_2'} try: return mw_types[mw_type] except KeyError: return 'Unknown MW type ({t})'.format(t=mw_type) + + +cdef class MREx(MR): + """ + MREx class represents a memory region registered using the extended API ibv_reg_mr_ex. + This class provides more flexibility in memory registration compared to the basic MR class. + """ + def __init__(self, PD pd not None, length=0, access=0, address=None, + iova=None, fd=None, fd_offset=0, dmah=None, implicit=False, **kwargs): + """ + Register a memory region using the extended API ibv_reg_mr_ex. 
+ :param pd: A PD object + :param length: Length (in bytes) of MR's buffer + :param access: Access flags, see ibv_access_flags enum + :param address: Memory address to register (Optional) + :param iova: IOVA address to register (Optional) + :param fd: File descriptor for dma-buf based registration (Optional) + :param fd_offset: Offset in the dma-buf (Optional) + :param dmah: DMA handle for registration (Optional) + :param implicit: If True, register implicit MR + :param kwargs: Additional arguments + :return: The newly created MREx on success + """ + PyverbsCM.__init__(self) + cdef int mmap_len = 0 + cdef v.ibv_mr_init_attr in_ + self.is_huge = True if access & e.IBV_ACCESS_HUGETLB else False + self.is_user_addr = False + self.mmap_length = 0 + + # Handle memory allocation if no address is provided + if not address and length > 0 and fd is None: + self._allocate_buffer(length, self.is_huge, &mmap_len) + if self.buf == NULL: + raise PyverbsError(f'Failed to allocate MR buffer of size {length}') + self.mmap_length = mmap_len + elif address: + self.is_user_addr = True + self.buf = address + + memset(&in_, 0, sizeof(in_)) + if implicit: + in_.length = SIZE_MAX + in_.comp_mask |= e.IBV_REG_MR_MASK_ADDR + in_.addr = NULL + else: + in_.length = length + if self.buf != NULL: + in_.comp_mask |= e.IBV_REG_MR_MASK_ADDR + in_.addr = self.buf + + in_.access = access + if iova is not None: + in_.comp_mask |= e.IBV_REG_MR_MASK_IOVA + in_.iova = iova + if fd is not None: + in_.comp_mask |= e.IBV_REG_MR_MASK_FD | e.IBV_REG_MR_MASK_FD_OFFSET + in_.fd = fd + in_.fd_offset = fd_offset + if dmah is not None: + in_.comp_mask |= e.IBV_REG_MR_MASK_DMAH + in_.dmah = (dmah).dmah + + self.mr = v.ibv_reg_mr_ex(pd.pd, &in_) + if self.mr == NULL: + # Clean up allocated memory if registration fails + if not self.is_user_addr and self.buf != NULL: + self._free_buffer(self.is_huge, self.mmap_length) + raise PyverbsRDMAErrno('Failed to register MR using extended API') + + self.pd = pd + 
pd.add_ref(self) + if dmah is not None: + (dmah).add_ref(self) + self.dmah = dmah + self.logger.debug(f'Registered MR using extended API. Length: {length}, ' + f'access flags {access}') + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return 'MREx:\n' + \ + print_format.format('lkey', self.lkey) + \ + print_format.format('rkey', self.rkey) + \ + print_format.format('length', self.length) + \ + print_format.format('buf', self.buf) + \ + print_format.format('handle', self.handle) + + cpdef close(self): + """Close MREx and release its association with DMAHandle.""" + if self.mr != NULL: + super(MREx, self).close() + self.dmah = None From 5e4b222e6dd668f23cac7e7b5c102db7014633c0 Mon Sep 17 00:00:00 2001 From: Maxim Chicherin Date: Mon, 9 Jun 2025 02:28:04 +0300 Subject: [PATCH 26/66] tests: Add tests for MREX and DMAHandle Add new tests to cover MREX and DMAHandle which represents new verbs API ibv_reg_mr_ex and ibv_alloc_dmah. Signed-off-by: Maxim Chicherin Reviewed-by: Shachar Kagan Signed-off-by: Edward Srouji --- tests/CMakeLists.txt | 1 + tests/test_cuda_dmabuf.py | 64 +++++++++++++-- tests/test_dmah.py | 137 +++++++++++++++++++++++++++++++++ tests/test_mr.py | 128 ++++++++++++++++++++++-------- tests/test_odp.py | 95 ++++++++++++++++++++++- tests/test_relaxed_ordering.py | 12 ++- tests/utils.py | 30 +++++++- 7 files changed, 419 insertions(+), 48 deletions(-) create mode 100644 tests/test_dmah.py diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0570b8d2b..3ad9bfc5b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -19,6 +19,7 @@ rdma_python_test(tests test_cqex.py test_cuda_dmabuf.py test_device.py + test_dmah.py test_efa_srd.py test_efadv.py test_flow.py diff --git a/tests/test_cuda_dmabuf.py b/tests/test_cuda_dmabuf.py index b08af9285..5b5813806 100644 --- a/tests/test_cuda_dmabuf.py +++ b/tests/test_cuda_dmabuf.py @@ -6,10 +6,10 @@ from pyverbs.pyverbs_error import PyverbsRDMAError from tests.base import RCResources, 
RDMATestCase -from pyverbs.mr import DmaBufMR +from pyverbs.mr import DmaBufMR, MREx, DMAHandle from pyverbs.qp import QPAttr import tests.cuda_utils as cu -from pyverbs.libibverbs_enums import ibv_access_flags, ibv_wr_opcode +from pyverbs.libibverbs_enums import ibv_access_flags, ibv_wr_opcode, ibv_mr_init_attr_mask import tests.utils as u try: @@ -37,7 +37,7 @@ def __init__(self, dev_name, ib_port, gid_index, self.cuda_addr = None super().__init__(dev_name=dev_name, ib_port=ib_port, gid_index=gid_index) - def create_mr(self): + def dmabuf_cuda_init(self): self.cuda_addr = cu.check_cuda_errors(cuda.cuMemAlloc(GPU_PAGE_SIZE)) attr_flag = 1 @@ -47,15 +47,17 @@ def create_mr(self): int(self.cuda_addr))) dmabuf_fd = cu.check_cuda_errors( - cuda.cuMemGetHandleForAddressRange(self.cuda_addr, - GPU_PAGE_SIZE, - cuda.CUmemRangeHandleType.CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, - 0)) + cuda.cuMemGetHandleForAddressRange(self.cuda_addr, GPU_PAGE_SIZE, + cuda.CUmemRangeHandleType.CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)) + return dmabuf_fd + + def create_mr(self): + dmabuf_fd = self.dmabuf_cuda_init() try: self.mr = DmaBufMR(self.pd, self.msg_size, self.mr_access, dmabuf_fd) except PyverbsRDMAError as ex: if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest(f'Registering DMABUF MR is not supported') + raise unittest.SkipTest('Registering DMABUF with DmaBufMR is not supported') raise ex def create_qp_attr(self): @@ -66,6 +68,37 @@ def create_qp_attr(self): return qp_attr +@cu.set_mem_io_cuda_methods +class MRExDmabufCudaRes(DmabufCudaRes): + """Resource class that registers an MREx with dma-buf FD.""" + + @u.skip_unsupported + def create_mr(self): + dmabuf_fd = self.dmabuf_cuda_init() + self.mr = MREx(self.pd, self.msg_size, self.mr_access, + comp_mask=ibv_mr_init_attr_mask.IBV_REG_MR_MASK_FD | \ + ibv_mr_init_attr_mask.IBV_REG_MR_MASK_IOVA, fd=dmabuf_fd, iova=0) + + +@cu.set_mem_io_cuda_methods +class MRExDmabufDmaHCudaRes(DmabufCudaRes): + """Resource class that 
registers an MREx with dma-buf FD and DMAHandle.""" + + @u.skip_unsupported + def create_mr(self): + dmabuf_fd = self.dmabuf_cuda_init() + try: + attr = u.create_dmah_init_attr() + dmah = DMAHandle(self.ctx, attr) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('DMAHandle is not supported') + raise ex + + self.mr = MREx(self.pd, self.msg_size, self.mr_access, + comp_mask=ibv_mr_init_attr_mask.IBV_REG_MR_MASK_FD | \ + ibv_mr_init_attr_mask.IBV_REG_MR_MASK_IOVA, fd=dmabuf_fd, iova=0, dmah=dmah) + @cu.set_init_cuda_methods class DmabufCudaTest(RDMATestCase): """ @@ -80,3 +113,18 @@ def test_cuda_dmabuf_rdma_write_traffic(self): access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | ibv_access_flags.IBV_ACCESS_REMOTE_WRITE self.create_players(DmabufCudaRes, mr_access=access) u.rdma_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) + + def test_mrex_cuda_dmabuf_rdma_write_traffic(self): + """ + Runs RDMA Write traffic over CUDA allocated memory using DMA BUF and + RC QPs with MREx. + """ + access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | ibv_access_flags.IBV_ACCESS_REMOTE_WRITE + self.create_players(MRExDmabufCudaRes, mr_access=access) + u.rdma_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) + + def test_mrex_dmah_cuda_dmabuf_rdma_write_traffic(self): + """Runs RDMA Write traffic using MREx + DMAHandle over CUDA dma-buf.""" + access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | ibv_access_flags.IBV_ACCESS_REMOTE_WRITE + self.create_players(MRExDmabufDmaHCudaRes, mr_access=access) + u.rdma_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) diff --git a/tests/test_dmah.py b/tests/test_dmah.py new file mode 100644 index 000000000..885855f92 --- /dev/null +++ b/tests/test_dmah.py @@ -0,0 +1,137 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2025 NVIDIA Corporation . All rights reserved. 
See COPYING file + +from pyverbs.libibverbs_enums import ibv_access_flags, ibv_wr_opcode, ibv_odp_transport_cap_bits, \ + ibv_tph_mem_type +from tests.base import PyverbsAPITestCase, RCResources, RDMATestCase +from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError +import pyverbs.device as d +from pyverbs.pd import PD +from pyverbs.mr import MREx, DMAHandle +from pyverbs.qp import QPAttr +import tests.utils as u + + +class DMAHandleTest(PyverbsAPITestCase): + @u.skip_unsupported + def test_dmah_with_mrex(self): + """Verify DMAHandle can be used during MREx registration.""" + with d.Context(name=self.dev_name) as ctx: + with PD(ctx) as pd: + attr = u.create_dmah_init_attr() + with DMAHandle(ctx, attr) as dmah: + length = u.get_mr_length() + access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE + with MREx(pd, length, access, dmah=dmah) as mr: + pass + + @u.skip_unsupported + def test_dmah_invalid_ph(self): + """Verify DMAHandle with invalid PH value, max ph value is 3.""" + with d.Context(name=self.dev_name) as ctx: + with PD(ctx) as pd: + attr = u.create_dmah_init_attr(ph=4) + with self.assertRaises(PyverbsError): + DMAHandle(ctx, attr) + + @u.skip_unsupported + def test_dmah_persistent_memory(self): + """Attempt to create DMAHandle targeting persistent memory.""" + with d.Context(name=self.dev_name) as ctx: + attr = u.create_dmah_init_attr(tph_mem_type=ibv_tph_mem_type.IBV_TPH_MEM_TYPE_PM) + with DMAHandle(ctx, attr): + pass + + @u.skip_unsupported + def test_dmah_invalid_mem_type(self): + """Pass an unsupported TPH memory-type and verify provider rejects it.""" + with d.Context(name=self.dev_name) as ctx: + attr = u.create_dmah_init_attr(tph_mem_type=0xFE) + with self.assertRaises(PyverbsError): + DMAHandle(ctx, attr) + + @u.skip_unsupported + def test_dmah_inval_cpu_id(self): + """Attempt to create DMAHandle with invalid CPU ID (0xffff). 
Expect failure.""" + with d.Context(name=self.dev_name) as ctx: + attr = u.create_dmah_init_attr(cpu_id=0xffff, ph=3) + with self.assertRaises(PyverbsError): + DMAHandle(ctx, attr) + + @u.skip_unsupported + def test_dmah_mrex_odp_bad_flow(self): + """Attempt to register ODP-capable MREx with DMAHandle. + Expect failure since ODP isn't supported with DMAHandle.""" + with d.Context(name=self.dev_name) as ctx: + # Check ODP support; skip if not available + odp_cap = (ibv_odp_transport_cap_bits.IBV_ODP_SUPPORT_SEND | + ibv_odp_transport_cap_bits.IBV_ODP_SUPPORT_RECV) + u.odp_supported(ctx, 'rc', odp_cap) + with PD(ctx) as pd: + attr = u.create_dmah_init_attr() + dmah = DMAHandle(ctx, attr) + with self.assertRaises(PyverbsError): + length = u.get_mr_length() + access = (ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | + ibv_access_flags.IBV_ACCESS_ON_DEMAND) + MREx(pd, length, access, dmah=dmah) + + +class DmaHandleMRExRC(RCResources): + """RC resource class that registers an MREx with a DMAHandle.""" + + def __init__(self, dev_name, ib_port, gid_index, + mr_access=ibv_access_flags.IBV_ACCESS_LOCAL_WRITE, msg_size=1024): + self.dmah = None + self.mr_access = mr_access + super().__init__(dev_name=dev_name, ib_port=ib_port, gid_index=gid_index, + msg_size=msg_size) + + def create_dmah(self): + """Allocate a DMAHandle using the existing device Context.""" + attr = u.create_dmah_init_attr() + self.dmah = DMAHandle(self.ctx, attr) + + @u.skip_unsupported + def create_mr(self): + self.create_dmah() + self.mr = MREx(self.pd, self.msg_size, self.mr_access, dmah=self.dmah) + + def create_qp_attr(self): + qp_attr = QPAttr(port_num=self.ib_port) + qp_access = (ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | + ibv_access_flags.IBV_ACCESS_REMOTE_WRITE | + ibv_access_flags.IBV_ACCESS_REMOTE_ATOMIC) + qp_attr.qp_access_flags = qp_access + return qp_attr + + +class DmaHandleTrafficTest(RDMATestCase): + """Traffic tests for MREx + DMAHandle combinations.""" + + def setUp(self): + super().setUp() + 
self.iters = 10 + self.server = None + self.client = None + self.traffic_args = None + + def test_dmah_mrex_rc_send(self): + """Checks basic RC send/recv traffic with DMAHandle-registered MREx.""" + access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE + self.create_players(DmaHandleMRExRC, mr_access=access, msg_size=1024) + u.traffic(**self.traffic_args) + + def test_dmah_mrex_rc_rdma_write(self): + """Validates RC RDMA Write traffic with DMAHandle & MREx.""" + access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | ibv_access_flags.IBV_ACCESS_REMOTE_WRITE + self.create_players(DmaHandleMRExRC, mr_access=access, msg_size=1024) + u.rdma_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) + + def test_dmah_mrex_rc_atomic(self): + """Tests RC atomic fetch&add using a DMAHandle-backed MREx.""" + access = (ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | + ibv_access_flags.IBV_ACCESS_REMOTE_ATOMIC | + ibv_access_flags.IBV_ACCESS_REMOTE_WRITE) + self.create_players(DmaHandleMRExRC, mr_access=access, msg_size=8) + u.atomic_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_ATOMIC_FETCH_AND_ADD) diff --git a/tests/test_mr.py b/tests/test_mr.py index ce6a47602..fdde078e9 100644 --- a/tests/test_mr.py +++ b/tests/test_mr.py @@ -10,7 +10,7 @@ from tests.base import PyverbsAPITestCase, RCResources, RDMATestCase from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsError -from pyverbs.mr import MR, MW, DMMR, DmaBufMR, MWBindInfo, MWBind +from pyverbs.mr import MR, MW, DMMR, DmaBufMR, MWBindInfo, MWBind, MREx from pyverbs.mem_alloc import posix_memalign, free from pyverbs.dmabuf import DmaBuf from pyverbs.qp import QPAttr @@ -62,6 +62,12 @@ def rereg_mr(self, flags, pd=None, addr=0, length=0, access=0): raise ex +class MRExRes(MRRes): + @u.skip_unsupported + def create_mr(self): + self.mr = MREx(self.pd, self.msg_size, self.mr_access) + + class MRTest(RDMATestCase): """ Test various functionalities of the MR class. 
@@ -75,6 +81,16 @@ def setUp(self): self.client_qp_attr = None self.traffic_args = None + def query_qp_attrs(self): + """Query and store QP attributes for both server and client.""" + self.server_qp_attr, _ = self.server.qp.query(0x1ffffff) + self.client_qp_attr, _ = self.client.qp.query(0x1ffffff) + + def create_players_and_query_qp(self, res_cls, **kwargs): + """Create players of a given resource class and query their QPs.""" + self.create_players(res_cls, **kwargs) + self.query_qp_attrs() + def restate_qps(self): """ Restate the resources QPs from ERR back to RTS state. @@ -84,6 +100,34 @@ def restate_qps(self): self.client.qp.modify(QPAttr(qp_state=ibv_qp_state.IBV_QPS_RESET), ibv_qp_attr_mask.IBV_QP_STATE) self.client.qp.to_rts(self.client_qp_attr) + def validate_rereg_addr(self, res_cls): + """Common logic for address translation re-registration tests.""" + self.create_players_and_query_qp(res_cls) + # Post a receive WR to ensure old address is in use and will break. + s_recv_wr = u.get_recv_wr(self.server) + self.server.qp.post_recv(s_recv_wr) + server_addr = posix_memalign(self.server.msg_size) + try: + self.server.rereg_mr(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_TRANSLATION, + addr=server_addr, + length=self.server.msg_size) + with self.assertRaisesRegex(PyverbsRDMAError, 'Remote operation error'): + # The server QP receive queue has WR with the old MR address, + # therefore traffic should fail. 
+ u.traffic(**self.traffic_args) + self.restate_qps() + u.traffic(**self.traffic_args) + finally: + free(server_addr) + + def test_mr_rereg_addr(self): + """Test MR reregistration with different address""" + self.validate_rereg_addr(MRRes) + + def test_mrex_rereg_addr(self): + """Test MREx reregistration with different address""" + self.validate_rereg_addr(MRExRes) + def test_mr_rereg_atomic(self): """ Test the rereg of MR's atomic access with the following flow: @@ -92,9 +136,7 @@ def test_mr_rereg_atomic(self): Rereg the MRs back to atomic access and verify that traffic now succeeds. """ atomic_mr_access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | ibv_access_flags.IBV_ACCESS_REMOTE_ATOMIC - self.create_players(MRRes, mr_access=atomic_mr_access) - self.server_qp_attr, _ = self.server.qp.query(0x1ffffff) - self.client_qp_attr, _ = self.client.qp.query(0x1ffffff) + self.create_players_and_query_qp(MRRes, mr_access=atomic_mr_access) access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE self.server.rereg_mr(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_ACCESS, access=access) self.client.rereg_mr(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_ACCESS, access=access) @@ -105,13 +147,20 @@ def test_mr_rereg_atomic(self): self.client.rereg_mr(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_ACCESS, access=atomic_mr_access) u.atomic_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_ATOMIC_FETCH_AND_ADD) - def test_mr_rereg_access(self): - self.create_players(MRRes) + def validate_rereg_access(self, res_cls): + """Common logic for reregistration of access flags and RDMA traffic.""" + self.create_players(res_cls) access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | ibv_access_flags.IBV_ACCESS_REMOTE_WRITE self.server.rereg_mr(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_ACCESS, access=access) self.client.rereg_mr(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_ACCESS, access=access) u.rdma_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) + def 
test_mr_rereg_access(self): + self.validate_rereg_access(MRRes) + + def test_mrex_rereg_access(self): + self.validate_rereg_access(MRExRes) + def test_mr_rereg_access_bad_flow(self): """ Test that cover rereg MR's access with this flow: @@ -147,40 +196,37 @@ def test_mr_rereg_pd(self): self.restate_qps() self.server.rereg_mr(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_PD, pd=self.server.pd) u.traffic(**self.traffic_args) - # Rereg the MR again with the new PD to cover - # destroying a PD with a re-registered MR. self.server.rereg_mr(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_PD, pd=server_new_pd) - def test_mr_rereg_addr(self): - self.create_players(MRRes) - self.server_qp_attr, _ = self.server.qp.query(0x1ffffff) - self.client_qp_attr, _ = self.client.qp.query(0x1ffffff) - s_recv_wr = u.get_recv_wr(self.server) - self.server.qp.post_recv(s_recv_wr) - server_addr = posix_memalign(self.server.msg_size) - self.server.rereg_mr(flags=ibv_rereg_mr_flags.IBV_REREG_MR_CHANGE_TRANSLATION, - addr=server_addr, - length=self.server.msg_size) - with self.assertRaisesRegex(PyverbsRDMAError, 'Remote operation error'): - # The server QP receive queue has WR with the old MR address, - # therefore traffic should fail. 
- u.traffic(**self.traffic_args) - self.restate_qps() - u.traffic(**self.traffic_args) - free(server_addr) + def validate_bad_flags_registration(self, mr_cls, expected_regex): + """Validate that registering a given MR class with illegal flags fails.""" + bad_flags = [ibv_access_flags.IBV_ACCESS_REMOTE_WRITE, + ibv_access_flags.IBV_ACCESS_REMOTE_ATOMIC] + with d.Context(name=self.dev_name) as ctx: + with PD(ctx) as pd: + for flag in bad_flags: + with self.assertRaisesRegex(PyverbsRDMAError, expected_regex): + mr_cls(pd, u.get_mr_length(), flag) def test_reg_mr_bad_flags(self): """ - Verify that illegal flags combination fails as expected + Verify that illegal flags combination fails as expected for classic MR + """ + self.validate_bad_flags_registration(MR, 'Failed to register a MR') + + def test_reg_mrex_bad_flags(self): """ + Verify that illegal flags combination fails as expected for MREx + """ + self.validate_bad_flags_registration(MREx, 'Failed to register MR using extended API') + + def test_reg_mrex_inval_params(self): + """Verify MREx registration fails when ADDR & FD flags set.""" with d.Context(name=self.dev_name) as ctx: with PD(ctx) as pd: - with self.assertRaisesRegex(PyverbsRDMAError, - 'Failed to register a MR'): - MR(pd, u.get_mr_length(), ibv_access_flags.IBV_ACCESS_REMOTE_WRITE) - with self.assertRaisesRegex(PyverbsRDMAError, - 'Failed to register a MR'): - MR(pd, u.get_mr_length(), ibv_access_flags.IBV_ACCESS_REMOTE_ATOMIC) + with self.assertRaises(PyverbsRDMAError): + MREx(pd, u.get_mr_length(), ibv_access_flags.IBV_ACCESS_LOCAL_WRITE, + implicit=True, fd=0) class MWRC(RCResources): @@ -223,6 +269,16 @@ def create_qp_attr(self): return qp_attr +class MWExRC(MWRC): + """Resource class that uses MREx instead of MR for MW tests.""" + + @u.skip_unsupported + def create_mr(self): + # Register an MREx that can be bound to a memory window + access = ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | ibv_access_flags.IBV_ACCESS_MW_BIND + self.mr = MREx(self.pd, 
self.msg_size, access) + + class MWTest(RDMATestCase): """ Test various functionalities of the MW class. @@ -320,6 +376,14 @@ def test_mw_type1(self): self.bind_mw_type_1() u.rdma_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) + def test_mw_ex_type1(self): + """ + Test Memory Window type 1 with MREx + """ + self.create_players(MWExRC, mw_type=ibv_mw_type.IBV_MW_TYPE_1) + self.bind_mw_type_1() + u.rdma_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) + def test_invalidate_mw_type1(self): self.test_mw_type1() self.invalidate_mw_type1() diff --git a/tests/test_odp.py b/tests/test_odp.py index 42f2f5213..d75ab84cb 100644 --- a/tests/test_odp.py +++ b/tests/test_odp.py @@ -1,16 +1,19 @@ from pyverbs.mem_alloc import mmap, munmap, madvise, MAP_ANONYMOUS_, MAP_PRIVATE_, \ MAP_HUGETLB_ +from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError from tests.base import RCResources, UDResources, XRCResources from pyverbs.qp import QPCap, QPAttr, QPInitAttr from pyverbs.wr import SGE, SendWR, RecvWR from tests.base import RDMATestCase -from pyverbs.mr import MR +from pyverbs.mr import MR, MREx from pyverbs.libibverbs_enums import ibv_odp_transport_cap_bits, ibv_access_flags, ibv_odp_transport_cap_bits, \ ibv_qp_type, ibv_placement_type, ibv_selectivity_level, ibv_qp_create_send_ops_flags, ibv_wr_opcode, \ ibv_wc_status, _IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE, _IBV_ADVISE_MR_ADVICE_PREFETCH, \ _IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT import tests.utils as u import unittest +import errno + HUGE_PAGE_SIZE = 0x200000 @@ -174,6 +177,53 @@ def create_mr(self): else: raise unittest.SkipTest('There is no qpex test for the specified ODP caps.') + +class OdpQpExMRExRC(OdpQpExRC): + """ + MREx version of OdpQpExRC that uses MREx instead of MR for memory registration. 
+ """ + def create_mr(self): + u.odp_supported(self.ctx, 'rc', self.odp_caps) + if self.odp_caps & ibv_odp_transport_cap_bits.IBV_ODP_SUPPORT_FLUSH: + try: + self.mr = u.create_custom_mr(self, ibv_access_flags.IBV_ACCESS_FLUSH_GLOBAL | + ibv_access_flags.IBV_ACCESS_REMOTE_WRITE | + ibv_access_flags.IBV_ACCESS_ON_DEMAND, + mr_ex=True) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EINVAL: + raise unittest.SkipTest('Create mr with IBV_ACCESS_FLUSH_GLOBAL access flag is' + ' not supported in kernel') + raise ex + else: + raise unittest.SkipTest('There is no qpex test for the specified ODP caps.') + + +class MRExOdpRC(OdpRC): + """ + MREx version of OdpRC that uses MREx instead of MR for memory registration. + """ + @u.requires_odp('rc', ibv_odp_transport_cap_bits.IBV_ODP_SUPPORT_SEND | + ibv_odp_transport_cap_bits.IBV_ODP_SUPPORT_RECV) + @u.skip_unsupported + def create_mr(self): + u.odp_supported(self.ctx, 'rc', self.odp_caps) + if self.request_user_addr: + mmap_flags = MAP_ANONYMOUS_| MAP_PRIVATE_ + length = self.msg_size + if self.is_huge: + mmap_flags |= MAP_HUGETLB_ + length = HUGE_PAGE_SIZE + self.user_addr = mmap(length=length, flags=mmap_flags) + access = self.access + if self.is_huge: + access |= ibv_access_flags.IBV_ACCESS_HUGETLB + self.mr = MREx(self.pd, self.msg_size, access, address=self.user_addr, + implicit=self.is_implicit) + if self.use_mixed_mr: + self.non_odp_mr = MREx(self.pd, self.msg_size, ibv_access_flags.IBV_ACCESS_LOCAL_WRITE) + + class OdpTestCase(RDMATestCase): def setUp(self): super(OdpTestCase, self).setUp() @@ -225,13 +275,13 @@ def test_odp_qp_ex_rc_flush(self): wcs = u.flush_traffic(**self.traffic_args, new_send=True, send_op=ibv_wr_opcode.IBV_WR_FLUSH) if wcs[0].status != ibv_wc_status.IBV_WC_SUCCESS: - raise PyverbsError(f'Unexpected {wc_status_to_str(wcs[0].status)}') + raise PyverbsError(f'Unexpected {u.wc_status_to_str(wcs[0].status)}') self.client.level = ibv_selectivity_level.IBV_FLUSH_MR wcs = 
u.flush_traffic(**self.traffic_args, new_send=True, send_op=ibv_wr_opcode.IBV_WR_FLUSH) if wcs[0].status != ibv_wc_status.IBV_WC_SUCCESS: - raise PyverbsError(f'Unexpected {wc_status_to_str(wcs[0].status)}') + raise PyverbsError(f'Unexpected {u.wc_status_to_str(wcs[0].status)}') def test_odp_rc_atomic_cmp_and_swp(self): self.create_players(OdpRC, request_user_addr=self.force_page_faults, @@ -334,3 +384,42 @@ def test_odp_prefetch_async_no_page_fault_rc_traffic(self): self.create_players(OdpRC, request_user_addr=self.force_page_faults, use_mr_prefetch='async', prefetch_advice=prefetch_advice) u.traffic(**self.traffic_args) + + def test_mrex_odp_rc_traffic(self): + """ + Test ODP traffic with MREx + """ + self.create_players(MRExOdpRC, request_user_addr=self.force_page_faults) + u.traffic(**self.traffic_args) + + @u.requires_huge_pages() + def test_mrex_odp_rc_huge_traffic(self): + self.force_page_faults = False + self.create_players(MRExOdpRC, request_user_addr=self.force_page_faults, + is_huge=True) + u.traffic(**self.traffic_args) + + def test_mrex_odp_implicit_rc_traffic(self): + """ + Test ODP implicit traffic with MREx + """ + self.create_players(MRExOdpRC, request_user_addr=self.force_page_faults, + is_implicit=True) + u.traffic(**self.traffic_args) + + def test_mrex_odp_qp_ex_rc_flush(self): + """ + Test ODP QP extended flush with MREx + """ + super().create_players(OdpQpExMRExRC, request_user_addr=self.force_page_faults, + odp_caps=ibv_odp_transport_cap_bits.IBV_ODP_SUPPORT_FLUSH) + wcs = u.flush_traffic(**self.traffic_args, new_send=True, + send_op=ibv_wr_opcode.IBV_WR_FLUSH) + if wcs[0].status != ibv_wc_status.IBV_WC_SUCCESS: + raise PyverbsError(f'Unexpected {u.wc_status_to_str(wcs[0].status)}') + + self.client.level = ibv_selectivity_level.IBV_FLUSH_MR + wcs = u.flush_traffic(**self.traffic_args, new_send=True, + send_op=ibv_wr_opcode.IBV_WR_FLUSH) + if wcs[0].status != ibv_wc_status.IBV_WC_SUCCESS: + raise PyverbsError(f'Unexpected 
{u.wc_status_to_str(wcs[0].status)}') diff --git a/tests/test_relaxed_ordering.py b/tests/test_relaxed_ordering.py index 1b27ca03f..f9256a6d6 100644 --- a/tests/test_relaxed_ordering.py +++ b/tests/test_relaxed_ordering.py @@ -1,8 +1,9 @@ from tests.base import RCResources, UDResources, XRCResources -from tests.utils import traffic, xrc_traffic +from tests.test_mr import MRExRes +from tests.utils import traffic, xrc_traffic, rdma_traffic from tests.base import RDMATestCase from pyverbs.mr import MR -from pyverbs.libibverbs_enums import ibv_access_flags +from pyverbs.libibverbs_enums import ibv_access_flags, ibv_wr_opcode class RoUD(UDResources): def create_mr(self): @@ -38,3 +39,10 @@ def test_ro_ud_traffic(self): def test_ro_xrc_traffic(self): self.create_players(RoXRC) xrc_traffic(self.client, self.server) + + def test_ro_mr_ex_traffic(self): + access = (ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | + ibv_access_flags.IBV_ACCESS_REMOTE_WRITE | + ibv_access_flags.IBV_ACCESS_RELAXED_ORDERING) + self.create_players(MRExRes, mr_access=access) + rdma_traffic(**self.traffic_args, send_op=ibv_wr_opcode.IBV_WR_RDMA_WRITE) diff --git a/tests/utils.py b/tests/utils.py index 7e27cf781..333fef7c8 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -35,8 +35,9 @@ ibv_wr_opcode, ibv_send_flags, ibv_mw_type, ibv_wc_status, ibv_wc_flags, ibv_wc_opcode, \ ibv_selectivity_level, ibv_odp_general_caps, ibv_qp_state, ibv_qp_attr_mask, ibv_qp_create_send_ops_flags, \ IBV_LINK_LAYER_ETHERNET, _IBV_DEVICE_RAW_SCATTER_FCS, _IBV_DEVICE_PCI_WRITE_END_PADDING, \ - _IBV_ADVISE_MR_FLAG_FLUSH, _IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE -from pyverbs.mr import MR + _IBV_ADVISE_MR_FLAG_FLUSH, _IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE, ibv_tph_mem_type, \ + ibv_dmah_init_attr_mask +from pyverbs.mr import MR, MREx, DmaHandleInitAttr MAX_MR_SIZE = 4194304 @@ -121,6 +122,24 @@ class PacketConsts: ROCE_PORT = 4791 +def create_dmah_init_attr(cpu_id=0, tph_mem_type=ibv_tph_mem_type.IBV_TPH_MEM_TYPE_VM, ph=0): + """ + 
Helper function to create DmaHandleInitAttr with standard parameters. + + :param cpu_id: CPU ID (default: 0) + :param tph_mem_type: TPH memory type (default: IBV_TPH_MEM_TYPE_VM) + :param ph: Processing hint (default: 0) + :return: DmaHandleInitAttr object + """ + return DmaHandleInitAttr( + comp_mask=(ibv_dmah_init_attr_mask.IBV_DMAH_INIT_ATTR_MASK_CPU_ID | + ibv_dmah_init_attr_mask.IBV_DMAH_INIT_ATTR_MASK_TPH_MEM_TYPE | + ibv_dmah_init_attr_mask.IBV_DMAH_INIT_ATTR_MASK_PH), + cpu_id=cpu_id, + tph_mem_type=tph_mem_type, + ph=ph) + + def get_mr_length(): """ Provide a random value for MR length. We avoid large buffers as these @@ -418,7 +437,7 @@ def wc_status_to_str(status): return 'Unknown WC status ({s})'.format(s=status) -def create_custom_mr(agr_obj, additional_access_flags=0, size=None, user_addr=None): +def create_custom_mr(agr_obj, additional_access_flags=0, size=None, user_addr=None, mr_ex=False): """ Creates a memory region using the aggregation object's PD. If size is None, the agr_obj's message size is used to set the MR's size. @@ -427,9 +446,14 @@ def create_custom_mr(agr_obj, additional_access_flags=0, size=None, user_addr=No :param additional_access_flags: Addition access flags to set in the MR :param size: MR's length. If None, agr_obj.msg_size is used. :param user_addr: The MR's buffer address. If None, the buffer will be allocated by pyverbs. 
+ :param mr_ex: Whether to create an MREx MR instead of a regular MR """ mr_length = size if size else agr_obj.msg_size try: + if mr_ex: + return MREx(agr_obj.pd, mr_length, + ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | additional_access_flags, + address=user_addr) return MR(agr_obj.pd, mr_length, ibv_access_flags.IBV_ACCESS_LOCAL_WRITE | additional_access_flags, address=user_addr) except PyverbsRDMAError as ex: From bc5b0688aef031c8d1ba1021495a39b3afcb5a32 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 18 Aug 2025 10:55:05 -0700 Subject: [PATCH 27/66] Update kernel headers To commit: 810f874eda8e ("RDMA/ucma: Support query resolved service records"). Signed-off-by: Sean Hefty --- kernel-headers/rdma/ib_user_sa.h | 14 ++++++++++++++ kernel-headers/rdma/rdma_user_cm.h | 28 ++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/kernel-headers/rdma/ib_user_sa.h b/kernel-headers/rdma/ib_user_sa.h index 435155d6e..acfa20816 100644 --- a/kernel-headers/rdma/ib_user_sa.h +++ b/kernel-headers/rdma/ib_user_sa.h @@ -74,4 +74,18 @@ struct ib_user_path_rec { __u8 preference; }; +struct ib_user_service_rec { + __be64 id; + __u8 gid[16]; + __be16 pkey; + __u8 reserved[2]; + __be32 lease; + __u8 key[16]; + __u8 name[64]; + __u8 data_8[16]; + __be16 data_16[8]; + __be32 data_32[4]; + __be64 data_64[2]; +}; + #endif /* IB_USER_SA_H */ diff --git a/kernel-headers/rdma/rdma_user_cm.h b/kernel-headers/rdma/rdma_user_cm.h index 7cea03581..00501da05 100644 --- a/kernel-headers/rdma/rdma_user_cm.h +++ b/kernel-headers/rdma/rdma_user_cm.h @@ -67,7 +67,8 @@ enum { RDMA_USER_CM_CMD_QUERY, RDMA_USER_CM_CMD_BIND, RDMA_USER_CM_CMD_RESOLVE_ADDR, - RDMA_USER_CM_CMD_JOIN_MCAST + RDMA_USER_CM_CMD_JOIN_MCAST, + RDMA_USER_CM_CMD_RESOLVE_IB_SERVICE }; /* See IBTA Annex A11, servies ID bytes 4 & 5 */ @@ -147,7 +148,8 @@ struct rdma_ucm_resolve_route { enum { RDMA_USER_CM_QUERY_ADDR, RDMA_USER_CM_QUERY_PATH, - RDMA_USER_CM_QUERY_GID + RDMA_USER_CM_QUERY_GID, + 
RDMA_USER_CM_QUERY_IB_SERVICE }; struct rdma_ucm_query { @@ -187,6 +189,11 @@ struct rdma_ucm_query_path_resp { struct ib_path_rec_data path_data[]; }; +struct rdma_ucm_query_ib_service_resp { + __u32 num_service_recs; + struct ib_user_service_rec recs[]; +}; + struct rdma_ucm_conn_param { __u32 qp_num; __u32 qkey; @@ -338,4 +345,21 @@ struct rdma_ucm_migrate_resp { __u32 events_reported; }; +enum { + RDMA_USER_CM_IB_SERVICE_FLAG_ID = 1 << 0, + RDMA_USER_CM_IB_SERVICE_FLAG_NAME = 1 << 1, +}; + +#define RDMA_USER_CM_IB_SERVICE_NAME_SIZE 64 +struct rdma_ucm_ib_service { + __u64 service_id; + __u8 service_name[RDMA_USER_CM_IB_SERVICE_NAME_SIZE]; + __u32 flags; + __u32 reserved; +}; + +struct rdma_ucm_resolve_ib_service { + __u32 id; + struct rdma_ucm_ib_service ibs; +}; #endif /* RDMA_USER_CM_H */ From 41817007b1f0b2ce185c1e8defdc46d9ab83d2f7 Mon Sep 17 00:00:00 2001 From: daiyanlong Date: Fri, 22 Aug 2025 11:03:13 +0800 Subject: [PATCH 28/66] libibverbs: Fix the issue of ibv_ud_pingpong failing in RDMA communication environments across three layers of networks (different subnets) Fixed the issue where, in an RDMA communication environment across three layers of networks (different subnets), when using a global routing header (GRH) to establish an address handle (AH), the data packet could not be transmitted across the router due to the hard coded setting of ah_attr.grh.hop_limit to 1, resulting in RDMA communication failure across network nodes. Now adjust ah_attr.grh.hop_limit
to 64 Co-authored-by: kanyong Co-authored-by: ningjin Co-authored-by: liangchangwei Co-authored-by: daiyanlong Signed-off-by: daiyanlong --- libibverbs/examples/ud_pingpong.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libibverbs/examples/ud_pingpong.c b/libibverbs/examples/ud_pingpong.c index 4b0e8aff1..bd7db31c5 100644 --- a/libibverbs/examples/ud_pingpong.c +++ b/libibverbs/examples/ud_pingpong.c @@ -109,7 +109,7 @@ static int pp_connect_ctx(struct pingpong_context *ctx, int port, int my_psn, if (dest->gid.global.interface_id) { ah_attr.is_global = 1; - ah_attr.grh.hop_limit = 1; + ah_attr.grh.hop_limit = 64; ah_attr.grh.dgid = dest->gid; ah_attr.grh.sgid_index = sgid_idx; } From 5efc18155e10e83cd7440da7bed59dad85ca636c Mon Sep 17 00:00:00 2001 From: Saravanan Vajravel Date: Mon, 18 Aug 2025 14:05:27 +0530 Subject: [PATCH 29/66] bnxt_re/lib: Add support for flow create/destroy To support RawEth QP: - Added flow verbs create_flow/destroy_flow to roce lib These flow verbs are used to manage flows on RawEth QP. - Added CQE processing for completion type BNXT_RE_WC_TYPE_RECV_RAW. 
Signed-off-by: Saravanan Vajravel Reviewed-by: Kashyap Desai Reviewed-by: Anantha Prabhu Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier --- providers/bnxt_re/main.c | 2 + providers/bnxt_re/verbs.c | 80 +++++++++++++++++++++++++++------------ providers/bnxt_re/verbs.h | 3 ++ 3 files changed, 61 insertions(+), 24 deletions(-) diff --git a/providers/bnxt_re/main.c b/providers/bnxt_re/main.c index 2c4445e12..a99f3afc1 100644 --- a/providers/bnxt_re/main.c +++ b/providers/bnxt_re/main.c @@ -121,6 +121,8 @@ static const struct verbs_context_ops bnxt_re_cntx_ops = { .free_context = bnxt_re_free_context, .create_qp_ex = bnxt_re_create_qp_ex, + .create_flow = bnxt_re_create_flow, + .destroy_flow = bnxt_re_destroy_flow, }; static inline bool bnxt_re_is_chip_gen_p7(struct bnxt_re_chip_ctx *cctx) diff --git a/providers/bnxt_re/verbs.c b/providers/bnxt_re/verbs.c index 28fed512d..50f70f912 100644 --- a/providers/bnxt_re/verbs.c +++ b/providers/bnxt_re/verbs.c @@ -719,6 +719,7 @@ static void bnxt_re_poll_success_rcqe(struct bnxt_re_qp *qp, { uint8_t flags, is_imm, is_rdma; struct bnxt_re_rc_cqe *rcqe; + uint8_t qp_type = qp->qptyp; struct bnxt_re_wrid *swque; struct bnxt_re_queue *rq; uint32_t rcqe_len; @@ -747,31 +748,34 @@ static void bnxt_re_poll_success_rcqe(struct bnxt_re_qp *qp, ibvwc->status = IBV_WC_SUCCESS; ibvwc->qp_num = qp->qpid; rcqe_len = le32toh(rcqe->length); - ibvwc->byte_len = (qp->qptyp == IBV_QPT_UD) ? - rcqe_len & BNXT_RE_UD_CQE_LEN_MASK: rcqe_len ; + ibvwc->byte_len = (qp_type == IBV_QPT_UD || + qp_type == IBV_QPT_RAW_PACKET) ? 
+ rcqe_len & BNXT_RE_UD_CQE_LEN_MASK : rcqe_len; ibvwc->opcode = IBV_WC_RECV; - flags = (le32toh(hdr->flg_st_typ_ph) >> BNXT_RE_BCQE_FLAGS_SHIFT) & - BNXT_RE_BCQE_FLAGS_MASK; - is_imm = (flags & BNXT_RE_RC_FLAGS_IMM_MASK) >> - BNXT_RE_RC_FLAGS_IMM_SHIFT; - is_rdma = (flags & BNXT_RE_RC_FLAGS_RDMA_MASK) >> - BNXT_RE_RC_FLAGS_RDMA_SHIFT; - ibvwc->wc_flags = 0; - if (is_imm) { - ibvwc->wc_flags |= IBV_WC_WITH_IMM; - /* Completion reports the raw-data in LE format, While - * user expects it in BE format. Thus, swapping on outgoing - * data is needed. On a BE platform le32toh will do the swap - * while on LE platform htobe32 will do the job. - */ - ibvwc->imm_data = htobe32(le32toh(rcqe->imm_key)); - if (is_rdma) - ibvwc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; - } + if (qp_type == IBV_QPT_UD || qp_type == IBV_QPT_RC) { + flags = (le32toh(hdr->flg_st_typ_ph) >> BNXT_RE_BCQE_FLAGS_SHIFT) & + BNXT_RE_BCQE_FLAGS_MASK; + is_imm = (flags & BNXT_RE_RC_FLAGS_IMM_MASK) >> + BNXT_RE_RC_FLAGS_IMM_SHIFT; + is_rdma = (flags & BNXT_RE_RC_FLAGS_RDMA_MASK) >> + BNXT_RE_RC_FLAGS_RDMA_SHIFT; + ibvwc->wc_flags = 0; + if (is_imm) { + ibvwc->wc_flags |= IBV_WC_WITH_IMM; + /* Completion reports the raw-data in LE format, While + * user expects it in BE format. Thus, swapping on outgoing + * data is needed. On a BE platform le32toh will do the swap + * while on LE platform htobe32 will do the job. 
+ */ + ibvwc->imm_data = htobe32(le32toh(rcqe->imm_key)); + if (is_rdma) + ibvwc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + } - if (qp->qptyp == IBV_QPT_UD) - bnxt_re_fill_ud_cqe(ibvwc, hdr, cqe, flags); + if (qp_type == IBV_QPT_UD) + bnxt_re_fill_ud_cqe(ibvwc, hdr, cqe, flags); + } if (!qp->srq) bnxt_re_jqq_mod_last(qp->jrqq, head); @@ -872,6 +876,7 @@ static int bnxt_re_poll_one(struct bnxt_re_cq *cq, int nwc, struct ibv_wc *wc, break; case BNXT_RE_WC_TYPE_RECV_RC: case BNXT_RE_WC_TYPE_RECV_UD: + case BNXT_RE_WC_TYPE_RECV_RAW: rcqe = cqe; qp_handle = (uint64_t *)&rcqe->qp_handle; qp = (struct bnxt_re_qp *) @@ -880,8 +885,6 @@ static int bnxt_re_poll_one(struct bnxt_re_cq *cq, int nwc, struct ibv_wc *wc, break; /*stale cqe. should be rung.*/ pcqe = bnxt_re_poll_rcqe(qp, wc, cqe, &cnt); break; - case BNXT_RE_WC_TYPE_RECV_RAW: - break; case BNXT_RE_WC_TYPE_TERM: scqe = cqe; qp_handle = (uint64_t *)&scqe->qp_handle; @@ -2956,3 +2959,32 @@ int bnxt_re_destroy_ah(struct ibv_ah *ibvah) return 0; } + +struct ibv_flow *bnxt_re_create_flow(struct ibv_qp *qp, + struct ibv_flow_attr *attr) +{ + struct ibv_flow *flow; + int ret; + + flow = calloc(1, sizeof(*flow)); + if (!flow) + return NULL; + + ret = ibv_cmd_create_flow(qp, flow, attr, NULL, 0); + if (ret) { + free(flow); + flow = NULL; + } + + return flow; +} + +int bnxt_re_destroy_flow(struct ibv_flow *flow) +{ + int ret; + + ret = ibv_cmd_destroy_flow(flow); + + free(flow); + return ret; +} diff --git a/providers/bnxt_re/verbs.h b/providers/bnxt_re/verbs.h index e46b5a275..7927755c0 100644 --- a/providers/bnxt_re/verbs.h +++ b/providers/bnxt_re/verbs.h @@ -120,6 +120,9 @@ int bnxt_re_post_srq_recv(struct ibv_srq *ibvsrq, struct ibv_recv_wr *wr, struct ibv_ah *bnxt_re_create_ah(struct ibv_pd *ibvpd, struct ibv_ah_attr *attr); int bnxt_re_destroy_ah(struct ibv_ah *ibvah); +struct ibv_flow *bnxt_re_create_flow(struct ibv_qp *qp, + struct ibv_flow_attr *flow); +int bnxt_re_destroy_flow(struct ibv_flow *flow); void 
bnxt_re_async_event(struct ibv_context *context, struct ibv_async_event *event); From 0f09871ae6abe72d97db4b83de70d2ec0043fb02 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Mon, 18 Aug 2025 14:14:29 +0530 Subject: [PATCH 30/66] bnxt_re/lib: Dont allow unsupported qp type creation libbnxt_re should not pass down the commands to create the unsupported QP types. Reviewed-by: Selvin Xavier Signed-off-by: Kalesh AP --- providers/bnxt_re/verbs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/providers/bnxt_re/verbs.c b/providers/bnxt_re/verbs.c index 50f70f912..051c80b7f 100644 --- a/providers/bnxt_re/verbs.c +++ b/providers/bnxt_re/verbs.c @@ -1138,6 +1138,11 @@ static int bnxt_re_check_qp_limits(struct bnxt_re_context *cntx, rdev = cntx->rdev; devattr = &rdev->devattr; + if (attr->qp_type != IBV_QPT_RC && + attr->qp_type != IBV_QPT_UD && + attr->qp_type != IBV_QPT_RAW_PACKET) + return EINVAL; + if (attr->cap.max_send_sge > devattr->max_sge) return EINVAL; if (attr->cap.max_recv_sge > devattr->max_sge) From 7b1a686bc3c7107d774ba18f9f177977c0859090 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 25 Aug 2025 09:13:24 -0700 Subject: [PATCH 31/66] librdmacm: Provide interfaces to resolve IB services Add the following APIs to resolve IB services and report addrinfo structures: * rdma_resolve_addrinfo() - To resolve an IB service; * rdma_query_addrinfo() - To get the resolved information. A new flag "RAI_SA" is added, which can be used as a hint to indicate resolve SA, as this API is going to support DNS also. Two new CM events are added: * RDMA_CM_EVENT_ADDRINFO_RESOLVED: resolution completed successfully. * RDMA_CM_EVENT_ADDRINFO_ERROR: resolution failed. On a successful resolve, one or more addrinfo is returned and saved internally, the first one will be used by default. 
Signed-off-by: Mark Zhang Signed-off-by: Sean Hefty --- debian/librdmacm1.symbols | 3 + librdmacm/CMakeLists.txt | 2 +- librdmacm/cma.c | 233 ++++++++++++++++++++++++++++++++++++++ librdmacm/librdmacm.map | 6 + librdmacm/rdma_cma.h | 23 +++- librdmacm/rdma_cma_abi.h | 46 +++++++- 6 files changed, 309 insertions(+), 4 deletions(-) diff --git a/debian/librdmacm1.symbols b/debian/librdmacm1.symbols index a2a2e8256..4637147d6 100644 --- a/debian/librdmacm1.symbols +++ b/debian/librdmacm1.symbols @@ -4,6 +4,7 @@ librdmacm.so.1 librdmacm1 #MINVER# RDMACM_1.1@RDMACM_1.1 16 RDMACM_1.2@RDMACM_1.2 23 RDMACM_1.3@RDMACM_1.3 31 + RDMACM_1.4@RDMACM_1.4 60 raccept@RDMACM_1.0 1.0.16 rbind@RDMACM_1.0 1.0.16 rclose@RDMACM_1.0 1.0.16 @@ -43,9 +44,11 @@ librdmacm.so.1 librdmacm1 #MINVER# rdma_listen@RDMACM_1.0 1.0.15 rdma_migrate_id@RDMACM_1.0 1.0.15 rdma_notify@RDMACM_1.0 1.0.15 + rdma_query_addrinfo@RDMACM_1.4 60 rdma_reject@RDMACM_1.0 1.0.15 rdma_reject_ece@RDMACM_1.3 31 rdma_resolve_addr@RDMACM_1.0 1.0.15 + rdma_resolve_addrinfo@RDMACM_1.4 60 rdma_resolve_route@RDMACM_1.0 1.0.15 rdma_set_local_ece@RDMACM_1.3 31 rdma_set_option@RDMACM_1.0 1.0.15 diff --git a/librdmacm/CMakeLists.txt b/librdmacm/CMakeLists.txt index b01fef4fb..ea1d1550f 100644 --- a/librdmacm/CMakeLists.txt +++ b/librdmacm/CMakeLists.txt @@ -11,7 +11,7 @@ publish_headers(infiniband rdma_library(rdmacm librdmacm.map # See Documentation/versioning.md - 1 1.3.${PACKAGE_VERSION} + 1 1.4.${PACKAGE_VERSION} acm.c addrinfo.c cma.c diff --git a/librdmacm/cma.c b/librdmacm/cma.c index 93c7fd683..270781f0a 100644 --- a/librdmacm/cma.c +++ b/librdmacm/cma.c @@ -115,6 +115,9 @@ struct cma_id_private { uint8_t responder_resources; struct ibv_ece local_ece; struct ibv_ece remote_ece; + bool resolving_ai; + struct rdma_addrinfo *resolved_ai; + }; struct cma_multicast { @@ -712,6 +715,8 @@ static void ucma_free_id(struct cma_id_private *id_priv) rdma_destroy_event_channel(id_priv->id.channel); if (id_priv->connect_len) 
free(id_priv->connect); + if (id_priv->resolved_ai) + rdma_freeaddrinfo(id_priv->resolved_ai); free(id_priv); } @@ -2483,6 +2488,122 @@ static void ucma_copy_ud_event(struct cma_event *event, dst->qkey = src->qkey; } +static struct rdma_addrinfo * +ucma_convert_service_recs(struct ucma_user_service_rec *recs, int num_recs) +{ + struct rdma_addrinfo *head = NULL, *prev = NULL, *ai; + struct sockaddr_ib *sib; + int i; + + for (i = 0; i < num_recs; i++) { + ai = calloc(1, sizeof(*ai)); + if (!ai) + goto fail; + if (!head) + head = ai; + + sib = calloc(1, sizeof(*sib)); + if (!sib) + goto fail; + ai->ai_dst_addr = (struct sockaddr *) sib; + ai->ai_dst_len = sizeof(*sib); + + ai->ai_flags = RAI_SA; + ai->ai_family = AF_IB; + ai->ai_port_space = RDMA_PS_IB; + + sib->sib_family = AF_IB; + sib->sib_pkey = recs[i].pkey; + memcpy(&sib->sib_addr, &recs[i].gid, sizeof(sib->sib_addr)); + sib->sib_sid = recs[i].id; + sib->sib_sid_mask = htobe64(~0ULL); + + if (prev) + prev->ai_next = ai; + + prev = ai; + } + + return head; + +fail: + if (head) + rdma_freeaddrinfo(head); + + return NULL; +} + +static struct rdma_addrinfo *ucma_query_ib_service(struct cma_id_private *id_priv) +{ + struct ucma_abi_query_ib_service_resp *resp; + struct rdma_addrinfo *rai = NULL; + struct ucma_abi_query cmd; + int ret, size, num = 6; + +retry: + size = sizeof(*resp) + sizeof(struct ucma_user_service_rec) * num; + resp = calloc(1, size); + if (!resp) + return NULL; + + CMA_INIT_CMD_RESP(&cmd, sizeof(cmd), QUERY, resp, size); + cmd.id = id_priv->handle; + cmd.option = UCMA_QUERY_IB_SERVICE; + + ret = write(id_priv->id.channel->fd, &cmd, sizeof(cmd)); + if (ret != sizeof(cmd)) + goto out; + + VALGRIND_MAKE_MEM_DEFINED(resp, size); + + if (!resp->num_service_recs) { + errno = ENOENT; + goto out; + } + + if (resp->num_service_recs > num) { + num = resp->num_service_recs; + free(resp); + goto retry; + } + + ret = ucma_query_addr(&id_priv->id); + if (ret) + goto out; + + rai = 
ucma_convert_service_recs(resp->recs, resp->num_service_recs); + if (!rai) + goto out; + +out: + free(resp); + return rai; +} + +static void clear_resolving_ai_flag(struct cma_id_private *id_priv) +{ + pthread_mutex_lock(&id_priv->mut); + id_priv->resolving_ai = false; + pthread_mutex_unlock(&id_priv->mut); +} + +static void ucma_process_addrinfo_resolved(struct cma_event *evt) +{ + struct rdma_addrinfo *rai; + + rai = ucma_query_ib_service(evt->id_priv); + if (!rai) { + evt->event.event = RDMA_CM_EVENT_ADDRINFO_ERROR; + evt->event.status = errno; + clear_resolving_ai_flag(evt->id_priv); + return; + } + + pthread_mutex_lock(&evt->id_priv->mut); + evt->id_priv->resolved_ai = rai; + pthread_mutex_unlock(&evt->id_priv->mut); +} + int rdma_establish(struct rdma_cm_id *id) { if (id->qp) @@ -2625,6 +2746,12 @@ int rdma_get_cm_event(struct rdma_event_channel *channel, evt->event.id = &evt->id_priv->id; evt->event.param.ud.private_data = evt->mc->context; break; + case RDMA_CM_EVENT_ADDRINFO_RESOLVED: + ucma_process_addrinfo_resolved(evt); + break; + case RDMA_CM_EVENT_ADDRINFO_ERROR: + clear_resolving_ai_flag(evt->id_priv); + break; default: evt->id_priv = (void *) (uintptr_t) resp.uid; evt->event.id = &evt->id_priv->id; @@ -2675,6 +2802,10 @@ const char *rdma_event_str(enum rdma_cm_event_type event) return "RDMA_CM_EVENT_ADDR_CHANGE"; case RDMA_CM_EVENT_TIMEWAIT_EXIT: return "RDMA_CM_EVENT_TIMEWAIT_EXIT"; + case RDMA_CM_EVENT_ADDRINFO_RESOLVED: + return "RDMA_CM_EVENT_ADDRINFO_RESOLVED"; + case RDMA_CM_EVENT_ADDRINFO_ERROR: + return "RDMA_CM_EVENT_ADDRINFO_ERROR"; default: return "UNKNOWN EVENT"; } @@ -2940,3 +3071,105 @@ int rdma_get_remote_ece(struct rdma_cm_id *id, struct ibv_ece *ece) return 0; } + +static void resolve_ai_set_cmd_service(const char *service, + struct ucma_abi_ib_service *ibs) +{ + char *endptr = NULL; + + errno = 0; + ibs->service_id = strtoull(service, &endptr, 0); + if ((errno != 0) || (endptr == service) || *endptr) { + ibs->service_id = 0; + 
ibs->flags |= UCMA_IB_SERVICE_FLAG_NAME; + strncpy((char *)ibs->service_name, service, + sizeof(ibs->service_name) - 1); + } else { + ibs->flags |= UCMA_IB_SERVICE_FLAG_ID; + } +} + +static int resolve_ai_sa(struct cma_id_private *id_priv, const char *service) +{ + struct ucma_abi_resolve_ib_service cmd; + int ret; + + if (!service) + return ERR(EINVAL); + + CMA_INIT_CMD(&cmd, sizeof(cmd), RESOLVE_IB_SERVICE); + cmd.id = id_priv->handle; + resolve_ai_set_cmd_service(service, &cmd.ibs); + + ret = write(id_priv->id.channel->fd, &cmd, sizeof(cmd)); + if (ret != sizeof(cmd)) + return (ret >= 0) ? ERR(ENODATA) : -1; + + return ucma_complete(&id_priv->id); +} + +static int __rdma_resolve_addrinfo(struct cma_id_private *id_priv, + const char *node, const char *service, + const struct rdma_addrinfo *hints) +{ + if (hints->ai_flags & RAI_SA) { + if (node) + return ENOTSUP; + return resolve_ai_sa(id_priv, service); + } + + return EINVAL; +} + +int rdma_resolve_addrinfo(struct rdma_cm_id *id, const char *node, + const char *service, + const struct rdma_addrinfo *hints) +{ + struct cma_id_private *id_priv; + int ret = 0; + + if (!id || !hints) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + + pthread_mutex_lock(&id_priv->mut); + if (id_priv->resolving_ai) { + pthread_mutex_unlock(&id_priv->mut); + return ERR(EBUSY); + } + + ret = __rdma_resolve_addrinfo(id_priv, node, service, hints); + if (ret) + goto fail; + + id_priv->resolving_ai = true; + pthread_mutex_unlock(&id_priv->mut); + return 0; + +fail: + pthread_mutex_unlock(&id_priv->mut); + return ERR(ret); +} + +int rdma_query_addrinfo(struct rdma_cm_id *id, struct rdma_addrinfo **info) +{ + struct cma_id_private *id_priv; + + if (!id || !info) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + pthread_mutex_lock(&id_priv->mut); + if (!id_priv->resolving_ai || !id_priv->resolved_ai) { + pthread_mutex_unlock(&id_priv->mut); + return ERR(ENOENT); + } + + *info = 
id_priv->resolved_ai; + id_priv->resolved_ai = NULL; + id_priv->resolving_ai = false; + + pthread_mutex_unlock(&id_priv->mut); + return 0; +} diff --git a/librdmacm/librdmacm.map b/librdmacm/librdmacm.map index 84b36a94a..2b6349fa9 100644 --- a/librdmacm/librdmacm.map +++ b/librdmacm/librdmacm.map @@ -87,3 +87,9 @@ RDMACM_1.3 { rdma_reject_ece; rdma_set_local_ece; } RDMACM_1.2; + +RDMACM_1.4 { + global: + rdma_query_addrinfo; + rdma_resolve_addrinfo; +} RDMACM_1.3; diff --git a/librdmacm/rdma_cma.h b/librdmacm/rdma_cma.h index e1f4e2364..1f57bbdf8 100644 --- a/librdmacm/rdma_cma.h +++ b/librdmacm/rdma_cma.h @@ -63,7 +63,9 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_MULTICAST_JOIN, RDMA_CM_EVENT_MULTICAST_ERROR, RDMA_CM_EVENT_ADDR_CHANGE, - RDMA_CM_EVENT_TIMEWAIT_EXIT + RDMA_CM_EVENT_TIMEWAIT_EXIT, + RDMA_CM_EVENT_ADDRINFO_RESOLVED, + RDMA_CM_EVENT_ADDRINFO_ERROR, }; enum rdma_port_space { @@ -178,6 +180,7 @@ struct rdma_cm_event { #define RAI_NUMERICHOST 0x00000002 #define RAI_NOROUTE 0x00000004 #define RAI_FAMILY 0x00000008 +#define RAI_SA 0x00000010 struct rdma_addrinfo { int ai_flags; @@ -777,6 +780,24 @@ int rdma_set_local_ece(struct rdma_cm_id *id, struct ibv_ece *ece); * @ece: ECE parameters */ int rdma_get_remote_ece(struct rdma_cm_id *id, struct ibv_ece *ece); + +/** + * rdma_resolve_addrinfo - Resolve address information asynchronously + * @id: Communication identifier + * @node: The host to resolve + * @service: The service to resolve which supports IB + * @hints: An addrinfo structure that specifies criteria for selecting the + * returned addresses + */ +int rdma_resolve_addrinfo(struct rdma_cm_id *id, const char *node, + const char *service, + const struct rdma_addrinfo *hints); + +/** + * rdma_query_addrinfo - Query the resolved address information + */ +int rdma_query_addrinfo(struct rdma_cm_id *id, struct rdma_addrinfo **info); + #ifdef __cplusplus } #endif diff --git a/librdmacm/rdma_cma_abi.h b/librdmacm/rdma_cma_abi.h index 9177282e0..80f0ac44e 
100644 --- a/librdmacm/rdma_cma_abi.h +++ b/librdmacm/rdma_cma_abi.h @@ -70,7 +70,8 @@ enum { UCMA_CMD_QUERY, UCMA_CMD_BIND, UCMA_CMD_RESOLVE_ADDR, - UCMA_CMD_JOIN_MCAST + UCMA_CMD_JOIN_MCAST, + UCMA_CMD_RESOLVE_IB_SERVICE }; struct ucma_abi_cmd_hdr { @@ -160,7 +161,8 @@ struct ucma_abi_resolve_route { enum { UCMA_QUERY_ADDR, UCMA_QUERY_PATH, - UCMA_QUERY_GID + UCMA_QUERY_GID, + UCMA_QUERY_IB_SERVICE, }; struct ucma_abi_query { @@ -361,4 +363,44 @@ struct ucma_abi_migrate_resp { __u32 events_reported; }; +enum { + UCMA_IB_SERVICE_FLAG_ID = 1 << 0, + UCMA_IB_SERVICE_FLAG_NAME = 1 << 1, +}; + +#define UCMA_IB_SERVICE_NAME_SIZE 64 +struct ucma_abi_ib_service { + __u64 service_id; + __u8 service_name[UCMA_IB_SERVICE_NAME_SIZE]; + __u32 flags; + __u32 reserved; +}; + +struct ucma_abi_resolve_ib_service { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + struct ucma_abi_ib_service ibs; +}; + +struct ucma_user_service_rec { + __be64 id; + __u8 gid[16]; + __be16 pkey; + __u8 reserved[2]; + __be32 lease; + __u8 key[16]; + __u8 name[64]; + __u8 data_8[16]; + __be16 data_16[8]; + __be32 data_32[4]; + __be64 data_64[2]; +}; + +struct ucma_abi_query_ib_service_resp { + __u32 num_service_recs; + struct ucma_user_service_rec recs[]; +}; + #endif /* RDMA_CMA_ABI_H */ From 179b646016bfd5dedec9b76906c608b9ffd238ab Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Tue, 2 Sep 2025 16:02:11 -0700 Subject: [PATCH 32/66] librdmacm/cmtime: Drop unused 's' option Fixes: 67879d9f22b7 ("librdmacm/cmtime: Support mesh based connection testing") Reviewed-by: Sean Hefty Signed-off-by: Vlad Dumitrescu --- librdmacm/examples/cmtime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/librdmacm/examples/cmtime.c b/librdmacm/examples/cmtime.c index f353c76d5..7525eb0c2 100644 --- a/librdmacm/examples/cmtime.c +++ b/librdmacm/examples/cmtime.c @@ -1135,7 +1135,7 @@ int main(int argc, char **argv) bool socktest = false; int op, ret; - while ((op = getopt(argc, argv, 
"B:b:C:c:Lm:n:P:p:q:r:Ss:t:")) != -1) { + while ((op = getopt(argc, argv, "B:b:C:c:Lm:n:P:p:q:r:St:")) != -1) { switch (op) { case 'B': if (src_addr) From 6d71f1cbd1073e319222b23eb0736198ee9cef20 Mon Sep 17 00:00:00 2001 From: Vlad Dumitrescu Date: Tue, 2 Sep 2025 16:37:45 -0700 Subject: [PATCH 33/66] librdmacm/cmtime: Update man page Recent patches significantly changed the usage of cmtime example application. Update the man page to reflect these changes. Fixes: 18fce1e54f0e ("librdmacm/cmtime: Determine root or leaf based on bind result") Fixes: 93bf54a43de2 ("librdmacm/cmtime: Bind to named interface") Fixes: 67879d9f22b7 ("librdmacm/cmtime: Support mesh based connection testing") Fixes: 0892dd7700f4 ("librdmacm/cmtime: Accept connections from multiple clients") Reported-by: Mark Haywood Closes: https://lore.kernel.org/linux-rdma/1e5a6494-91fd-40a3-abaf-a614bc3f0e2a@oracle.com/ Reviewed-by: Sean Hefty Signed-off-by: Vlad Dumitrescu --- librdmacm/man/cmtime.1 | 85 ++++++++++++++++++++++++++++++------------ 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/librdmacm/man/cmtime.1 b/librdmacm/man/cmtime.1 index 28e2beb8c..6af8fcc6a 100644 --- a/librdmacm/man/cmtime.1 +++ b/librdmacm/man/cmtime.1 @@ -1,19 +1,32 @@ .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md -.TH "CMTIME" 1 "2017-04-28" "librdmacm" "librdmacm" librdmacm +.TH "CMTIME" 1 "2025-09-03" "librdmacm" "librdmacm" librdmacm .SH NAME cmtime \- RDMA CM connection steps timing test.
.SH SYNOPSIS .sp .nf -\fIcmtime\fR [-s server_address] [-b bind_address] - [-c connections] [-p port_number] - [-q base_qpn] - [-r retries] [-t timeout_ms] +\fIcmtime\fR -C controller_address + (-B bind_interface | -b bind_address) + [-L] + [-c connections_per_pair] [-P num_peers] + [-p controller_port] + [-q base_qpn] [-n num_threads] + [-m mimic_qp_delay_us] [-r retries] + [-t timeout_ms] + [-S] .fi .SH "DESCRIPTION" Determines min, max, and average times for various "steps" in RDMA CM -connection setup and teardown between a client and server -application. +connection setup and teardown between clients and servers. + +To use, start one or more servers (-L), plus one or more clients. Each +client will establish -c number of connections to each server. By default, +the test runs with 1 client and 1 server. + +One process will act as a controller process. The controller coordinates +the clients and servers -- uses out-of-band sockets to collect and +distribute addresses and keeps clients and servers in sync as they advance +through each "step". "Steps" that are timed are: create ID, bind address, resolve address, resolve route, create QP, modify QP to INIT, modify QP to RTR, @@ -34,24 +47,40 @@ roughly the same. For asynchronous steps, the total may be significantly lower than the sum, as multiple connections will be in progress simultanesously. The avg/iter is the total time divided by the number of connections. -In many cases, times may not be available or only available on the client. -Is such situations, the output will show 0. +In many cases, times may not be available or only available on the clients. +Is such situations, the output will show 0. Stats printed by each instance +pertain to connections handled by it -- on runs with multiple clients and/or +servers (-P > 2) further external aggregation might be required. .SH "OPTIONS" .TP -\-s server_address -The network name or IP address of the server system listening for -connections. 
The used name or address must route over an RDMA device. -This option must be specified by the client. +\-C controller_address +The network name or IP address of the instance which will provide step +synchronization. This option must be specified by all instances. The +first instance to start which discovers this address as local will act as +controller and will synchronize all other instances via an out-of-band +channel as benchmark steps progress. Any type of instance (client or +server) can act as controller. +.TP +\-B bind_interface +The local RDMA interface to bind to. Only one of -B or -b is accepted. .TP \-b bind_address -The local network address to bind to. +The local network address to bind to. Only one of -B or -b is accepted. +.TP .TP -\-c connections -The number of connections to establish between the client and -server. (default 100) +\-L +Whether this instance is an RDMA CM server (set) or client (unset). +(default client/unset) +\-c connections_per_pair +The number of connections to establish between each client-server +pair. (default 100) .TP -\-p port_number -The server's port number. +\-P num_peers +Total number of peers (clients, servers) which are going to be +started. (default 2) +.TP +\-p controller_port +The controller's port number. .TP \-q base_qpn The first QP number to use when creating connections without allocating @@ -74,15 +103,25 @@ Number of retries when resolving address or route. (default 2) \-S Run connection rate test using sockets. This provides a baseline comparison of RDMA connections versus TCP connections. Sockets are -set to blocking mode. +set to blocking mode. Only supported in 1 client 1 server mode (-P 2). .TP \-t timeout_ms Timeout in millseconds (ms) when resolving address or route. 
(default 2000 - 2 seconds) +.SH "EXAMPLES" +One client connecting to one server: + + server$ cmtime -B enp1s0f0np0 -C 192.0.2.1 -L + client$ cmtime -B enp1s0f0np0 -C 192.0.2.1 + +Two clients connecting to three servers, 6k connections in total: + + client1$ cmtime -B enp1s0f0np0 -P 5 -C 192.0.2.1 -c 1000 + client2$ cmtime -B enp1s0f0np0 -P 5 -C 192.0.2.1 -c 1000 + server1$ cmtime -B enp1s0f0np0 -P 5 -C 192.0.2.1 -c 1000 -L + server2$ cmtime -B enp1s0f0np0 -P 5 -C 192.0.2.1 -c 1000 -L + server3$ cmtime -B enp1s0f0np0 -P 5 -C 192.0.2.1 -c 1000 -L .SH "NOTES" -Basic usage is to start cmtime on a server system, then run -cmtime -s server_name on a client system. -.P Because this test maps RDMA resources to userspace, users must ensure that they have available system resources and permissions. See the libibverbs README file for additional details. From 8efa9755ede337d28b4b98b80bda9b9afb17ab8f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 9 Sep 2025 15:40:07 -0300 Subject: [PATCH 34/66] efa: Make base_ops static Nothing uses it outside this file and sparse complains. 
Fixes: a4f3f441e66f ("efa: Add single sub CQ poll variant") Signed-off-by: Jason Gunthorpe --- providers/efa/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 708199355..5d9bcbe97 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -1035,7 +1035,7 @@ enum cq_pfns_attr { .end_poll = efa_end_poll_name(single_thread), \ } -struct cq_base_ops { +static struct cq_base_ops { int (*start_poll)(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_attr *attr); int (*next_poll)(struct ibv_cq_ex *ibcq); void (*end_poll)(struct ibv_cq_ex *ibcq); From f75db8c867400926720b33b63ff2af979cddb506 Mon Sep 17 00:00:00 2001 From: Asaf Mazor Date: Thu, 4 Sep 2025 11:18:41 +0300 Subject: [PATCH 35/66] infiniband-diags: Fix sa_get_handle to use smi/gsi API - Use mad_rpc_open_port2 inside sa_get_handle - Update handle members according to gsi port from mad_rpc_open_port2 - Replace umad_close_port with mad_rpc_close_port2 inside sa_free_handle This fixes behavior where we can take smi handle by mistake. mad_rpc_open_port2 internally calls umad_open_port and umad_register, so we just need to assign the relevant values to the handle. Fixes: aaf9f699c10f ("ibqueryerrors: Use API supporting SMI/GSI seperation") Signed-off-by: Asaf Mazor --- infiniband-diags/ibdiag_sa.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/infiniband-diags/ibdiag_sa.c b/infiniband-diags/ibdiag_sa.c index 5c4db68da..4414589e1 100644 --- a/infiniband-diags/ibdiag_sa.c +++ b/infiniband-diags/ibdiag_sa.c @@ -48,6 +48,8 @@ * the saquery tool and provides it to other utilities. */ +static struct ibmad_ports_pair *srcports; + struct sa_handle *sa_get_handle(char *ca_name) { struct sa_handle *handle; @@ -57,7 +59,13 @@ struct sa_handle *sa_get_handle(char *ca_name) char *name = ca_name ?
ca_name : ibd_ca; - resolve_sm_portid(name, ibd_ca_port, &handle->dport); + int mgmt_classes[1] = { IB_SA_CLASS }; + + srcports = mad_rpc_open_port2(name, ibd_ca_port, mgmt_classes, 1, 0); + if (!srcports) + IBEXIT("Failed to open '%s' port '%d'", name, ibd_ca_port); + + resolve_sm_portid(srcports->gsi.ca_name, ibd_ca_port, &handle->dport); if (!handle->dport.lid) { IBWARN("No SM/SA found on port %s:%d", name ? "" : name, @@ -69,20 +77,8 @@ struct sa_handle *sa_get_handle(char *ca_name) if (!handle->dport.qkey) handle->dport.qkey = IB_DEFAULT_QP1_QKEY; - handle->fd = umad_open_port(name, ibd_ca_port); - if (handle->fd < 0) { - IBWARN("umad_open_port on port %s:%d failed", - name ? "" : name, - ibd_ca_port); - goto err; - } - if ((handle->agent = umad_register(handle->fd, IB_SA_CLASS, 2, 1, NULL)) < 0) { - umad_close_port(handle->fd); - IBWARN("umad_register for SA class failed on port %s:%d", - name ? "" : name, - ibd_ca_port); - goto err; - } + handle->fd = mad_rpc_portid(srcports->gsi.port); + handle->agent = mad_rpc_class_agent(srcports->gsi.port, IB_SA_CLASS); return handle; @@ -94,7 +90,7 @@ struct sa_handle *sa_get_handle(char *ca_name) void sa_free_handle(struct sa_handle * h) { umad_unregister(h->fd, h->agent); - umad_close_port(h->fd); + mad_rpc_close_port2(srcports); free(h); } From 26480ca93ec108fbdefb23604dec6df0aa831f84 Mon Sep 17 00:00:00 2001 From: Asaf Mazor Date: Thu, 4 Sep 2025 11:34:24 +0300 Subject: [PATCH 36/66] ibqueryerrors: Fix SMP call to use correct port - Change to correct port on smp_query_via call Fixes: aaf9f699c10f ("ibqueryerrors: Use API supporting SMI/GSI seperation") Signed-off-by: Asaf Mazor --- infiniband-diags/ibqueryerrors.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/infiniband-diags/ibqueryerrors.c b/infiniband-diags/ibqueryerrors.c index 16d9e88b2..5c30b93b1 100644 --- a/infiniband-diags/ibqueryerrors.c +++ b/infiniband-diags/ibqueryerrors.c @@ -1124,10 +1124,12 @@ int main(int argc, char 
**argv) ibd_ca, ibd_ca_port); } - smp_mkey_set(ibmad_port, ibd_mkey); + smp_mkey_set(ibmad_ports->smi.port, ibd_mkey); - if (ibd_timeout) - mad_rpc_set_timeout(ibmad_port, ibd_timeout); + if (ibd_timeout) { + mad_rpc_set_timeout(ibmad_ports->smi.port, ibd_timeout); + mad_rpc_set_timeout(ibmad_ports->gsi.port, ibd_timeout); + } if (port_guid_str) { ibnd_port_t *ndport = ibnd_find_port_guid(fabric, port_guid); @@ -1141,7 +1143,7 @@ int main(int argc, char **argv) uint8_t ni[IB_SMP_DATA_SIZE] = { 0 }; if (!smp_query_via(ni, &portid, IB_ATTR_NODE_INFO, 0, - ibd_timeout, ibmad_port)) { + ibd_timeout, ibmad_ports->smi.port)) { fprintf(stderr, "Failed to query local Node Info\n"); goto close_port; } From 7d7ca522f4d04f9f2f399dc201f4985cc61a4415 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Thu, 23 Jan 2025 09:50:48 +0200 Subject: [PATCH 37/66] librdmacm: Provide an interface to write an event into a CM Support writing an event to a CM channel and add a new user API "rdma_write_cm_event()". Two new events are added and supported to be written: - RDMA_CM_EVENT_USER: User-defined, event details are specified by the user and not interpreted by the librdmacm. This event is useful for a multi-threaded application to signal a thread which may be waiting on the RDMA CM event channel. This is the only event that users can write; - RDMA_CM_EVENT_INTERNAL: Used and consumed internally by the librdmacm. Users should not write this event. A new event parameter "arg" is supported, which will be passed from sender to receiver along with the event. 
Signed-off-by: Mark Zhang --- debian/librdmacm1.symbols | 1 + librdmacm/cma.c | 36 ++++++++++++++++++++++++++++++++++++ librdmacm/librdmacm.map | 1 + librdmacm/rdma_cma.h | 9 +++++++++ librdmacm/rdma_cma_abi.h | 19 ++++++++++++++++++- 5 files changed, 65 insertions(+), 1 deletion(-) diff --git a/debian/librdmacm1.symbols b/debian/librdmacm1.symbols index 4637147d6..8580fb2dd 100644 --- a/debian/librdmacm1.symbols +++ b/debian/librdmacm1.symbols @@ -52,6 +52,7 @@ librdmacm.so.1 librdmacm1 #MINVER# rdma_resolve_route@RDMACM_1.0 1.0.15 rdma_set_local_ece@RDMACM_1.3 31 rdma_set_option@RDMACM_1.0 1.0.15 + rdma_write_cm_event@RDMACM_1.4 60 rfcntl@RDMACM_1.0 1.0.16 rgetpeername@RDMACM_1.0 1.0.16 rgetsockname@RDMACM_1.0 1.0.16 diff --git a/librdmacm/cma.c b/librdmacm/cma.c index 270781f0a..4f98a856b 100644 --- a/librdmacm/cma.c +++ b/librdmacm/cma.c @@ -2752,6 +2752,12 @@ int rdma_get_cm_event(struct rdma_event_channel *channel, case RDMA_CM_EVENT_ADDRINFO_ERROR: clear_resolving_ai_flag(evt->id_priv); break; + case RDMA_CM_EVENT_USER: + memcpy(&evt->event.param.arg, resp.param.arg32, + sizeof(evt->event.param.arg)); + break; + case RDMA_CM_EVENT_INTERNAL: + break; default: evt->id_priv = (void *) (uintptr_t) resp.uid; evt->event.id = &evt->id_priv->id; @@ -3089,6 +3095,36 @@ static void resolve_ai_set_cmd_service(const char *service, } } +static int __rdma_write_cm_event(struct rdma_cm_id *id, enum rdma_cm_event_type event, + int status, uint64_t arg) +{ + struct ucma_abi_write_cm_event cmd; + struct cma_id_private *id_priv; + int ret; + + CMA_INIT_CMD(&cmd, sizeof(cmd), WRITE_CM_EVENT); + + id_priv = container_of(id, struct cma_id_private, id); + cmd.id = id_priv->handle; + cmd.event = event; + cmd.status = status; + cmd.param.arg = arg; + ret = write(id->channel->fd, &cmd, sizeof(cmd)); + if (ret != sizeof(cmd)) + return (ret >= 0) ? 
ERR(ENODATA) : -1; + + return 0; +} + +int rdma_write_cm_event(struct rdma_cm_id *id, enum rdma_cm_event_type event, + int status, uint64_t arg) +{ + if (event != RDMA_CM_EVENT_USER) + return ERR(EINVAL); + + return __rdma_write_cm_event(id, event, status, arg); +} + static int resolve_ai_sa(struct cma_id_private *id_priv, const char *service) { struct ucma_abi_resolve_ib_service cmd; diff --git a/librdmacm/librdmacm.map b/librdmacm/librdmacm.map index 2b6349fa9..76853f2f7 100644 --- a/librdmacm/librdmacm.map +++ b/librdmacm/librdmacm.map @@ -92,4 +92,5 @@ RDMACM_1.4 { global: rdma_query_addrinfo; rdma_resolve_addrinfo; + rdma_write_cm_event; } RDMACM_1.3; diff --git a/librdmacm/rdma_cma.h b/librdmacm/rdma_cma.h index 1f57bbdf8..5f87314d2 100644 --- a/librdmacm/rdma_cma.h +++ b/librdmacm/rdma_cma.h @@ -66,6 +66,8 @@ enum rdma_cm_event_type { RDMA_CM_EVENT_TIMEWAIT_EXIT, RDMA_CM_EVENT_ADDRINFO_RESOLVED, RDMA_CM_EVENT_ADDRINFO_ERROR, + RDMA_CM_EVENT_USER, + RDMA_CM_EVENT_INTERNAL, }; enum rdma_port_space { @@ -173,6 +175,7 @@ struct rdma_cm_event { union { struct rdma_conn_param conn; struct rdma_ud_param ud; + uint64_t arg; } param; }; @@ -798,6 +801,12 @@ int rdma_resolve_addrinfo(struct rdma_cm_id *id, const char *node, */ int rdma_query_addrinfo(struct rdma_cm_id *id, struct rdma_addrinfo **info); +/** + * rdma_write_cm_event - Write an event into a cm channel. 
+ */ +int rdma_write_cm_event(struct rdma_cm_id *id, enum rdma_cm_event_type event, + int status, uint64_t arg); + #ifdef __cplusplus } #endif diff --git a/librdmacm/rdma_cma_abi.h b/librdmacm/rdma_cma_abi.h index 80f0ac44e..c8d4e831c 100644 --- a/librdmacm/rdma_cma_abi.h +++ b/librdmacm/rdma_cma_abi.h @@ -71,7 +71,8 @@ enum { UCMA_CMD_BIND, UCMA_CMD_RESOLVE_ADDR, UCMA_CMD_JOIN_MCAST, - UCMA_CMD_RESOLVE_IB_SERVICE + UCMA_CMD_RESOLVE_IB_SERVICE, + UCMA_CMD_WRITE_CM_EVENT, }; struct ucma_abi_cmd_hdr { @@ -335,6 +336,7 @@ struct ucma_abi_event_resp { union { struct ucma_abi_conn_param conn; struct ucma_abi_ud_param ud; + __u32 arg32[2]; } param; struct ucma_abi_ece ece; }; @@ -403,4 +405,19 @@ struct ucma_abi_query_ib_service_resp { struct ucma_user_service_rec recs[]; }; +struct ucma_abi_write_cm_event { + __u32 cmd; + __u16 in; + __u16 out; + __u32 id; + __u32 reserved; + __u32 event; + __u32 status; + union { + struct ucma_abi_conn_param conn; + struct ucma_abi_ud_param ud; + __u64 arg; + } param; +}; + #endif /* RDMA_CMA_ABI_H */ From 30828be882b3159cd9398cc562ae10cd41d2fee1 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 27 Jan 2025 08:09:29 +0200 Subject: [PATCH 38/66] librdmacm: Support DNS in resolve and query addrinfo Allow rdma_resolve_addrinfo() and rdma_query_addrinfo() to support DNS resolve and query, respectively. A new flags "RAI_DNS" is supported to indicate this is a DNS resolve request. This flag is mutual exclusive with RAI_SA. If both are not set then DNS resolve is performed by default. With this patch a user may resolve both DNS and SA, and it's possible that one thread is querying the resolved addrinfo while another thread is resolving. So protect the route.resolved_ai with id_priv->mut lock. 
Signed-off-by: Mark Zhang --- librdmacm/cma.c | 48 +++++++++++++++++++++++++++++++++++++++++--- librdmacm/rdma_cma.h | 1 + 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/librdmacm/cma.c b/librdmacm/cma.c index 4f98a856b..c2fb070b2 100644 --- a/librdmacm/cma.c +++ b/librdmacm/cma.c @@ -2591,6 +2591,9 @@ static void ucma_process_addrinfo_resolved(struct cma_event *evt) { struct rdma_addrinfo *rai; + if (evt->id_priv->resolved_ai) /* DNS */ + return; + rai = ucma_query_ib_service(evt->id_priv); if (!rai) { evt->event.event = RDMA_CM_EVENT_ADDRINFO_ERROR; @@ -2756,8 +2759,19 @@ int rdma_get_cm_event(struct rdma_event_channel *channel, memcpy(&evt->event.param.arg, resp.param.arg32, sizeof(evt->event.param.arg)); break; - case RDMA_CM_EVENT_INTERNAL: + case RDMA_CM_EVENT_INTERNAL: { + uint64_t resp_arg; + + memcpy(&resp_arg, resp.param.arg32, sizeof(resp_arg)); + if (resp_arg == RDMA_CM_EVENT_ADDRINFO_RESOLVED) { + evt->event.event = RDMA_CM_EVENT_ADDRINFO_RESOLVED; + ucma_process_addrinfo_resolved(evt); + } else { + return ERR(EBADE); + } + break; + } default: evt->id_priv = (void *) (uintptr_t) resp.uid; evt->event.id = &evt->id_priv->id; @@ -3144,17 +3158,45 @@ static int resolve_ai_sa(struct cma_id_private *id_priv, const char *service) return ucma_complete(&id_priv->id); } +static int resolve_ai_dns(struct cma_id_private *id_priv, const char *node, + const char *service, + const struct rdma_addrinfo *hints) +{ + int ret; + + ret = rdma_getaddrinfo(node, service, hints, &id_priv->resolved_ai); + if (ret) + return ret; + + ret = __rdma_write_cm_event(&id_priv->id, RDMA_CM_EVENT_INTERNAL, 0, + (uint64_t)RDMA_CM_EVENT_ADDRINFO_RESOLVED); + if (ret) { + rdma_freeaddrinfo(id_priv->resolved_ai); + id_priv->resolved_ai = NULL; + return (ret >= 0) ? 
ERR(ENODATA) : -1; + } + + return ucma_complete(&id_priv->id); +} + static int __rdma_resolve_addrinfo(struct cma_id_private *id_priv, const char *node, const char *service, const struct rdma_addrinfo *hints) { + if (!hints) + goto resolve_dns; + + if ((hints->ai_flags & RAI_SA) && (hints->ai_flags & RAI_DNS)) + return ENOTSUP; + if (hints->ai_flags & RAI_SA) { if (node) return ENOTSUP; return resolve_ai_sa(id_priv, service); } - return EINVAL; +resolve_dns: + return resolve_ai_dns(id_priv, node, service, hints); } int rdma_resolve_addrinfo(struct rdma_cm_id *id, const char *node, @@ -3164,7 +3206,7 @@ int rdma_resolve_addrinfo(struct rdma_cm_id *id, const char *node, struct cma_id_private *id_priv; int ret = 0; - if (!id || !hints) + if (!id) return ERR(EINVAL); id_priv = container_of(id, struct cma_id_private, id); diff --git a/librdmacm/rdma_cma.h b/librdmacm/rdma_cma.h index 5f87314d2..39883d99f 100644 --- a/librdmacm/rdma_cma.h +++ b/librdmacm/rdma_cma.h @@ -184,6 +184,7 @@ struct rdma_cm_event { #define RAI_NOROUTE 0x00000004 #define RAI_FAMILY 0x00000008 #define RAI_SA 0x00000010 +#define RAI_DNS 0x00000020 /* Mutual-exclusive with RAI_SA */ struct rdma_addrinfo { int ai_flags; From dfc0afdeaa328bb387131adf6dd1de32f1a2510a Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 3 Feb 2025 09:48:03 +0200 Subject: [PATCH 39/66] librdmacm: Support IB SA resolve in rdma_getaddrinfo() Allow rdma_getaddrinfo() to support IB SA resolve when RAI_SA flag is set in the hints. It resolves the requested IB service on all IB ports one by one, and returns on the first successful resolve. 
Signed-off-by: Mark Zhang --- librdmacm/addrinfo.c | 7 +++ librdmacm/cma.c | 119 +++++++++++++++++++++++++++++++++++++++++++ librdmacm/cma.h | 1 + 3 files changed, 127 insertions(+) diff --git a/librdmacm/addrinfo.c b/librdmacm/addrinfo.c index 7e6606592..a72481d02 100644 --- a/librdmacm/addrinfo.c +++ b/librdmacm/addrinfo.c @@ -248,6 +248,13 @@ int rdma_getaddrinfo(const char *node, const char *service, if (ret) return ret; + if (hints && hints->ai_flags & RAI_SA) { + if (node || (hints->ai_flags & RAI_DNS)) + return ERR(EOPNOTSUPP); + + return ucma_getaddrinfo_sa(service, res); + } + rai = calloc(1, sizeof(*rai)); if (!rai) return ERR(ENOMEM); diff --git a/librdmacm/cma.c b/librdmacm/cma.c index c2fb070b2..c9800afc7 100644 --- a/librdmacm/cma.c +++ b/librdmacm/cma.c @@ -3251,3 +3251,122 @@ int rdma_query_addrinfo(struct rdma_cm_id *id, struct rdma_addrinfo **info) pthread_mutex_unlock(&id_priv->mut); return 0; } + +static int resolve_sa_on_gid(union ibv_gid *gid, struct rdma_event_channel *ech, + const char *service, struct rdma_addrinfo **res) +{ + struct sockaddr_ib sib = {}, *psib; + struct cma_id_private *id_priv; + struct rdma_cm_id *cm_id; + struct rdma_addrinfo *ai; + struct rdma_cm_event *e; + int ret; + + ret = rdma_create_id(ech, &cm_id, NULL, RDMA_PS_IB); + if (ret) + return ret; + + sib.sib_family = AF_IB; + memcpy(&sib.sib_addr, gid, sizeof(sib.sib_addr)); + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sib); + if (ret) + goto out; + + id_priv = container_of(cm_id, struct cma_id_private, id); + ret = resolve_ai_sa(id_priv, service); + if (ret) + goto out; + + ret = rdma_get_cm_event(ech, &e); + if (ret) { + rdma_ack_cm_event(e); + goto out; + } + if (e->event != RDMA_CM_EVENT_ADDRINFO_RESOLVED) { + rdma_ack_cm_event(e); + ret = ENOENT; + goto out; + } + + rdma_ack_cm_event(e); + + ai = id_priv->resolved_ai; + while (ai) { + psib = calloc(1, sizeof(*psib)); + if (!psib) { + ret = errno; + goto out; + } + + psib->sib_family = AF_IB; + 
memcpy(&psib->sib_addr, gid, sizeof(psib->sib_addr)); + ucma_set_sid(RDMA_PS_IB, NULL, psib); + psib->sib_pkey = + ((struct sockaddr_ib *)ai->ai_dst_addr)->sib_pkey; + + ai->ai_src_addr = (struct sockaddr *)psib; + ai->ai_src_len = sizeof(*psib); + ai = ai->ai_next; + } + + *res = id_priv->resolved_ai; + id_priv->resolved_ai = NULL; +out: + rdma_destroy_id(cm_id); + return ret; +} + +int ucma_getaddrinfo_sa(const char *service, struct rdma_addrinfo **res) +{ + struct ibv_device_attr dev_attr = {}; + struct ibv_port_attr port_attr = {}; + struct rdma_event_channel *ech; + struct ibv_context *ibctx; + int i, j, ret, found = 0; + union ibv_gid gid; + + if (!service || !res) + return ERR(EINVAL); + + ech = rdma_create_event_channel(); + if (!ech) + return -1; + + for (i = 0; dev_list[i] != NULL; i++) { + ibctx = ibv_open_device(dev_list[i]); + if (!ibctx) + continue; + + ret = ibv_query_device(ibctx, &dev_attr); + if (ret) + goto next; + + for (j = 0; j < dev_attr.phys_port_cnt; j++) { + ret = ibv_query_port(ibctx, j + 1, &port_attr); + if (ret) + continue; + + if ((port_attr.link_layer != IBV_LINK_LAYER_INFINIBAND) || + (port_attr.state < IBV_PORT_ACTIVE)) + continue; + + ret = ibv_query_gid(ibctx, j + 1, 0, &gid); + if (ret) + continue; + + ret = resolve_sa_on_gid(&gid, ech, service, res); + if (!ret) { + found = 1; + break; + } + } + +next: + ibv_close_device(ibctx); + if (found) + break; + } + + rdma_destroy_event_channel(ech); + return found ? 
0 : ERR(ENOENT); +} diff --git a/librdmacm/cma.h b/librdmacm/cma.h index 62821055a..e14c51778 100644 --- a/librdmacm/cma.h +++ b/librdmacm/cma.h @@ -114,4 +114,5 @@ struct ib_connect_hdr { #define cma_dst_ip6 dst_addr[0] }; +int ucma_getaddrinfo_sa(const char *service, struct rdma_addrinfo **res); #endif /* CMA_H */ From 4b83401574efd4ff93326bb155adc5137aa1d676 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 10 Feb 2025 03:55:32 +0200 Subject: [PATCH 40/66] librdmacm: Document new address resolution APIs Add manual pages for new APIs rdma_resolve_addrinfo(), rdma_query_addrinfo(), rdma_write_cm_event(), and RAI_DNS, RAI_SA flags. Signed-off-by: Mark Zhang --- librdmacm/man/CMakeLists.txt | 3 + librdmacm/man/rdma_cm.7 | 3 + librdmacm/man/rdma_get_cm_event.3 | 11 +++- librdmacm/man/rdma_getaddrinfo.3 | 11 +++- librdmacm/man/rdma_query_addrinfo.3.in.rst | 53 +++++++++++++++ librdmacm/man/rdma_resolve_addrinfo.3.in.rst | 68 ++++++++++++++++++++ librdmacm/man/rdma_write_cm_event.3.in.rst | 57 ++++++++++++++++ 7 files changed, 204 insertions(+), 2 deletions(-) create mode 100644 librdmacm/man/rdma_query_addrinfo.3.in.rst create mode 100644 librdmacm/man/rdma_resolve_addrinfo.3.in.rst create mode 100644 librdmacm/man/rdma_write_cm_event.3.in.rst diff --git a/librdmacm/man/CMakeLists.txt b/librdmacm/man/CMakeLists.txt index 9fcad4f28..173393926 100644 --- a/librdmacm/man/CMakeLists.txt +++ b/librdmacm/man/CMakeLists.txt @@ -51,15 +51,18 @@ rdma_man_pages( rdma_post_ud_send.3 rdma_post_write.3 rdma_post_writev.3 + rdma_query_addrinfo.3.in.rst rdma_reg_msgs.3 rdma_reg_read.3 rdma_reg_write.3 rdma_reject.3 rdma_resolve_addr.3 + rdma_resolve_addrinfo.3.in.rst rdma_resolve_route.3 rdma_server.1 rdma_set_local_ece.3.md rdma_set_option.3 + rdma_write_cm_event.3.in.rst rdma_xclient.1 rdma_xserver.1 riostream.1 diff --git a/librdmacm/man/rdma_cm.7 b/librdmacm/man/rdma_cm.7 index da4b11d0f..84599ee34 100644 --- a/librdmacm/man/rdma_cm.7 +++ b/librdmacm/man/rdma_cm.7 @@ -229,14 
+229,17 @@ rdma_post_sendv(3), rdma_post_ud_send(3), rdma_post_write(3), rdma_post_writev(3), +rdma_query_addrinfo(3), rdma_reg_msgs(3), rdma_reg_read(3), rdma_reg_write(3), rdma_reject(3), rdma_resolve_addr(3), +rdma_resolve_addrinfo(3), rdma_resolve_route(3), rdma_get_remote_ece(3), rdma_set_option(3), +rdma_write_cm_event(3), mckey(1), rdma_client(1), rdma_server(1), diff --git a/librdmacm/man/rdma_get_cm_event.3 b/librdmacm/man/rdma_get_cm_event.3 index 624004414..c8fe2f6a7 100644 --- a/librdmacm/man/rdma_get_cm_event.3 +++ b/librdmacm/man/rdma_get_cm_event.3 @@ -162,7 +162,16 @@ The QP associated with a connection has exited its timewait state and is now ready to be re-used. After a QP has been disconnected, it is maintained in a timewait state to allow any in flight packets to exit the network. After the timewait state has completed, the rdma_cm will report this event. +.IP RDMA_CM_EVENT_ADDRINFO_RESOLVED +Address information resolution (rdma_resolve_addrinfo) completed successfully. +.IP RDMA_CM_EVENT_ADDRINFO_ERROR +Address information resolution (rdma_resolve_addrinfo) failed. +.IP RDMA_CM_EVENT_USER +This allows a user-defined event to be written to the RDMA CM event channel. +Event details are specified by the user and not interpreted by the librdmacm. +This event is useful for a multi-threaded application to signal a thread which +may be waiting on the RDMA CM event channel. .SH "SEE ALSO" rdma_ack_cm_event(3), rdma_create_event_channel(3), rdma_resolve_addr(3), rdma_resolve_route(3), rdma_connect(3), rdma_listen(3), rdma_join_multicast(3), -rdma_destroy_id(3), rdma_event_str(3) +rdma_destroy_id(3), rdma_event_str(3),rdma_write_cm_event(3) diff --git a/librdmacm/man/rdma_getaddrinfo.3 b/librdmacm/man/rdma_getaddrinfo.3 index b53ca8417..85fab2603 100644 --- a/librdmacm/man/rdma_getaddrinfo.3 +++ b/librdmacm/man/rdma_getaddrinfo.3 @@ -81,6 +81,14 @@ If set, this flag suppresses any lengthy route resolution. 
.IP "RAI_FAMILY" 12 If set, the ai_family setting should be used as an input hint for interpretting the node parameter. +.IP "RAI_DNS" 12 +Indicates that address resolution should use DNS to map the node and service +names to addresses. Flag is mutually exclusive with RAI_SA. DNS resolution +is the default option. +.IP "RAI_SA" 12 +Indicates that address resolution should query the Infiniband SA to map node +and service names to addresses. Flag is mutually exclusive with RAI_DNS. SA +resolution only applies to Infiniband ports. Non-Infiniband ports will be skipped. .IP "ai_family" 12 Address family for the source and destination address. Supported families are: AF_INET, AF_INET6, and AF_IB. @@ -131,4 +139,5 @@ Pointer to the next rdma_addrinfo structure in the list. Will be NULL if no more structures exist. .SH "SEE ALSO" rdma_create_id(3), rdma_resolve_route(3), rdma_connect(3), rdma_create_qp(3), -rdma_bind_addr(3), rdma_create_ep(3), rdma_freeaddrinfo(3) +rdma_bind_addr(3), rdma_create_ep(3), rdma_freeaddrinfo(3), +rdma_resolve_addrinfo(3),rdma_query_addrinfo(3) diff --git a/librdmacm/man/rdma_query_addrinfo.3.in.rst b/librdmacm/man/rdma_query_addrinfo.3.in.rst new file mode 100644 index 000000000..3c4009666 --- /dev/null +++ b/librdmacm/man/rdma_query_addrinfo.3.in.rst @@ -0,0 +1,53 @@ +=================== +RDMA_QUERY_ADDRINFO +=================== + +--------------------------------------- +Query the resolved address information. +--------------------------------------- + +:Date: 2025-02-06 +:Manual section: 3 +:Manual group: Librdmacm Programmer's Manual + + +SYNOPSIS +======== + +#include + +int rdma_query_addrinfo(struct rdma_cm_id \*id, struct rdma_addrinfo \*\*info); + +ARGUMENTS +========= + +id RDMA identifier. + +info A pointer to a linked list of rdma_addrinfo structures containing resolved information. + +DESCRIPTION +=========== + +This function retrieves the resulting rdma_addrinfo structures from a successful rdma_resolve_addrinfo() operation. 
+ +RETURN VALUE +============ + +On success 0 is returned, info contains a resolved address information +On error -1 is returned, errno will be set to indicate the failure reason. + +NOTES +===== + +The info must be released with rdma_freeaddrinfo(3) + + +SEE ALSO +======== + +rdma_getaddrinfo(3), rdma_freeaddrinfo(3), rdma_resolve_addrinfo(3) + +AUTHOR +====== + +Mark Zhang diff --git a/librdmacm/man/rdma_resolve_addrinfo.3.in.rst b/librdmacm/man/rdma_resolve_addrinfo.3.in.rst new file mode 100644 index 000000000..307c67b01 --- /dev/null +++ b/librdmacm/man/rdma_resolve_addrinfo.3.in.rst @@ -0,0 +1,68 @@ +===================== +RDMA_RESOLVE_ADDRINFO +===================== + +--------------------------------------------------------- +Resolve RDMA addresses which supports both DNS and IB SA. +--------------------------------------------------------- + +:Date: 2025-02-06 +:Manual section: 3 +:Manual group: Librdmacm Programmer's Manual + + +SYNOPSIS +======== + +#include + +int rdma_resolve_addrinfo(struct rdma_cm_id \*id, const char \*node, const char \*service, const struct rdma_addrinfo \*hints); + +ARGUMENTS +========= + +id RDMA identifier. + +node Optional, name, dotted-decimal IPv4, or IPv6 hex address to resolve. + +service The service name or port number of address. + +hints Reference to an rdma_addrinfo structure containing hints about the type of service the caller supports. + +DESCRIPTION +=========== + +This call submits an asynchronous address resolution request. The behavior is similar to rdma_getaddrinfo(), +except that the operation is asynchronous, generating an event on the RDMA CM event channel that is +associated with the specified rdma_cm_id when complete. The %node, %service, and %hints parameters are defined +similarly to rdma_getaddrinfo(). + +RETURN VALUE +============ + +Returns 0 on success. Success indicates that asynchronous address resolution was initiated. 
The result of +the resolution, whether successful or failed, will be reported as an event on the related event channel. + +Returns -1 on error, errno will be set to indicate the failure reason. The address resolution was not +started, and no event will be generated on the event channel. + +NOTES +===== + +This call supports both DNS and IB SA resolution, depends on the hints.ai_flags: + - RAI_DNS: Performs address resolution using DNS. + - RAI_SA: Performs address resolution using the Infiniband SA. The rdma_cm_id associated with the call must be bound to an Infiniband port, or an error will occur. The %node parameter must be null (not supported). %Service should be an IB service name or ID. + +These 2 flags are mutual-exclusive; If none of them is set then DNS is the default. + +The cm event RDMA_CM_EVENT_ADDRINFO_RESOLVED (on success) or RDMA_CM_EVENT_ADDRINFO_ERROR (on failure) is generated. + +SEE ALSO +======== + +rdma_getaddrinfo(3), rdma_query_addrinfo(3) + +AUTHOR +====== + +Mark Zhang diff --git a/librdmacm/man/rdma_write_cm_event.3.in.rst b/librdmacm/man/rdma_write_cm_event.3.in.rst new file mode 100644 index 000000000..432e4fc16 --- /dev/null +++ b/librdmacm/man/rdma_write_cm_event.3.in.rst @@ -0,0 +1,57 @@ +=================== +RDMA_WRITE_CM_EVENT +=================== + +------------------------- +Write an event into a CM. +------------------------- + +:Date: 2025-02-06 +:Manual section: 3 +:Manual group: Librdmacm Programmer's Manual + + +SYNOPSIS +======== + +#include + +int rdma_write_cm_event(struct rdma_cm_id \*id, enum rdma_cm_event_type event, int status, uint64_t arg); + +ARGUMENTS +========= + +id The RDMA identifier associated with the reported rdma_cm_event. + +event The communication event value to report. This should be set to RDMA_CM_EVENT_USER. + +status The status value reported in the rdma_cm_event. + +arg A user-specified value reported in the rdma_cm_event. 
+ +DESCRIPTION +=========== + +Write an event into a CM, with a status and an argument. + +RETURN VALUE +============ + +On success 0 is returned, on error -1 is returned, errno will be set to indicate the failure reason. + +NOTES +===== + +This call allows an application to write a user-defined event to the event channel associated with the +specified rdma_cm_id. Valid user events are: RDMA_CM_EVENT_USER. Applications may use this for internal +signaling purposes, such as waking a thread blocked on the event channel. + +SEE ALSO +======== + +rdma_get_cm_event(3) + +AUTHOR +====== + +Mark Zhang From 93e224fc48a1d10a90e8f839388659aa653c1c09 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Mon, 15 Sep 2025 08:50:42 +0000 Subject: [PATCH 41/66] efa: Extend DV query CQ to return doorbell Return the CQ reverse doorbell to give external data path implementations (GPU, etc.) an option to notify the device of polled entries. Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin --- providers/efa/efadv.h | 1 + providers/efa/man/efadv_query_cq.3.md | 5 +++++ providers/efa/verbs.c | 3 +++ 3 files changed, 9 insertions(+) diff --git a/providers/efa/efadv.h b/providers/efa/efadv.h index c1a53b569..7c034f881 100644 --- a/providers/efa/efadv.h +++ b/providers/efa/efadv.h @@ -127,6 +127,7 @@ struct efadv_cq_attr { uint8_t *buffer; uint32_t entry_size; uint32_t num_entries; + uint32_t *doorbell; }; int efadv_query_cq(struct ibv_cq *ibvcq, struct efadv_cq_attr *attr, uint32_t inlen); diff --git a/providers/efa/man/efadv_query_cq.3.md b/providers/efa/man/efadv_query_cq.3.md index ac9b0cf49..287221720 100644 --- a/providers/efa/man/efadv_query_cq.3.md +++ b/providers/efa/man/efadv_query_cq.3.md @@ -33,6 +33,7 @@ struct efadv_cq_attr { uint8_t *buffer; uint32_t entry_size; uint32_t num_entries; + uint32_t *doorbell; }; ``` @@ -51,6 +52,10 @@ struct efadv_cq_attr { *num_entries* : Maximal number of entries in the completion queue. 
+*doorbell* +: Reverse doorbell used to update the device of polled entries and to + request notifications. NULL when not in use for this Completion Queue. + # RETURN VALUE **efadv_query_cq()** returns 0 on success, or the value of errno on failure diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 708199355..681d90897 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -1326,6 +1326,9 @@ int efadv_query_cq(struct ibv_cq *ibvcq, struct efadv_cq_attr *attr, uint32_t in attr->entry_size = cq->cqe_size; attr->num_entries = ibvcq->cqe; + if (vext_field_avail(typeof(*attr), doorbell, inlen)) + attr->doorbell = cq->db; + return 0; } From 49628442a749bac398e6638ab64e02877ce8d22f Mon Sep 17 00:00:00 2001 From: Jo Zzsi Date: Mon, 8 Sep 2025 21:03:02 -0400 Subject: [PATCH 42/66] dracut: unify and improve dracut rdma module The intention of this PR is eliminate the need to have a separate module in the redhat and suse directories. Additional improvements beyond the original goal - moves the dracut module ordering to 50, which is the recommended ordering for out of tree dracut modules - fixes dracut dependency and shellcheck issues - fixes "failed to add dependency" with dracut v108+ Signed-off-by: Jo Zzsi --- kernel-boot/dracut/50rdma/module-setup.sh | 40 +++++++++++++++++++++++ redhat/rdma-core.spec | 8 ++--- redhat/rdma.modules-setup.sh | 40 ----------------------- suse/module-setup.sh | 38 --------------------- suse/rdma-core.spec | 10 +++--- 5 files changed, 49 insertions(+), 87 deletions(-) create mode 100644 kernel-boot/dracut/50rdma/module-setup.sh delete mode 100644 redhat/rdma.modules-setup.sh delete mode 100644 suse/module-setup.sh diff --git a/kernel-boot/dracut/50rdma/module-setup.sh b/kernel-boot/dracut/50rdma/module-setup.sh new file mode 100644 index 000000000..091620620 --- /dev/null +++ b/kernel-boot/dracut/50rdma/module-setup.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +check() { + [ -n "$hostonly" ] && [ -e 
/sys/class/infiniband_verbs/uverbs0 ] && return 0 + [ -n "$hostonly" ] && return 255 + return 0 +} + +depends() { + echo systemd + return 0 +} + +install() { + inst /etc/rdma/mlx4.conf + inst /etc/rdma/modules/infiniband.conf + inst /etc/rdma/modules/iwarp.conf + inst /etc/rdma/modules/opa.conf + inst /etc/rdma/modules/rdma.conf + inst /etc/rdma/modules/roce.conf + inst /usr/libexec/mlx4-setup.sh + inst_multiple lspci setpci awk sleep + inst_rules 60-rdma-persistent-naming.rules 70-persistent-ipoib.rules 75-rdma-description.rules 90-rdma-hw-modules.rules 90-rdma-ulp-modules.rules 90-rdma-umad.rules + inst_multiple -o \ + inst /usr/lib/modprobe.d/libmlx4.conf \ + "$systemdsystemunitdir"/rdma-hw.target \ + "$systemdsystemunitdir"/rdma-load-modules@.service + + for i in \ + rdma-load-modules@rdma.service \ + rdma-load-modules@roce.service \ + rdma-load-modules@infiniband.service; do + $SYSTEMCTL -q --root "$initdir" add-wants initrd.target "$i" + done +} + +installkernel() { + hostonly='' instmods "=drivers/infiniband" "=drivers/net/ethernet/mellanox" "=drivers/net/ethernet/chelsio" "=drivers/net/ethernet/cisco" "=drivers/net/ethernet/emulex" "=drivers/target" + hostonly='' instmods crc-t10dif crct10dif_common xprtrdma svcrdma +} diff --git a/redhat/rdma-core.spec b/redhat/rdma-core.spec index a57743d12..83ba45cc7 100644 --- a/redhat/rdma-core.spec +++ b/redhat/rdma-core.spec @@ -339,10 +339,10 @@ mkdir -p %{buildroot}/%{_sysconfdir}/rdma %global sysmodprobedir %{_prefix}/lib/modprobe.d mkdir -p %{buildroot}%{_libexecdir} mkdir -p %{buildroot}%{_udevrulesdir} -mkdir -p %{buildroot}%{dracutlibdir}/modules.d/05rdma +mkdir -p %{buildroot}%{dracutlibdir}/modules.d/50rdma mkdir -p %{buildroot}%{sysmodprobedir} install -D -m0644 redhat/rdma.mlx4.conf %{buildroot}/%{_sysconfdir}/rdma/mlx4.conf -install -D -m0755 redhat/rdma.modules-setup.sh %{buildroot}%{dracutlibdir}/modules.d/05rdma/module-setup.sh +install -D -m0755 kernel-boot/dracut/50rdma/module-setup.sh 
%{buildroot}%{dracutlibdir}/modules.d/50rdma/module-setup.sh install -D -m0644 redhat/rdma.mlx4.sys.modprobe %{buildroot}%{sysmodprobedir}/libmlx4.conf install -D -m0755 redhat/rdma.mlx4-setup.sh %{buildroot}%{_libexecdir}/mlx4-setup.sh rm -f %{buildroot}%{_sysconfdir}/rdma/modules/rdma.conf @@ -430,8 +430,8 @@ fi %{_unitdir}/rdma-load-modules@.service %dir %{dracutlibdir} %dir %{dracutlibdir}/modules.d -%dir %{dracutlibdir}/modules.d/05rdma -%{dracutlibdir}/modules.d/05rdma/module-setup.sh +%dir %{dracutlibdir}/modules.d/50rdma +%{dracutlibdir}/modules.d/50rdma/module-setup.sh %dir %{_udevrulesdir} %{_udevrulesdir}/../rdma_rename %{_udevrulesdir}/60-rdma-ndd.rules diff --git a/redhat/rdma.modules-setup.sh b/redhat/rdma.modules-setup.sh deleted file mode 100644 index 19bb06d7d..000000000 --- a/redhat/rdma.modules-setup.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -check() { - [ -n "$hostonly" -a -d /sys/class/infiniband_verbs/uverbs0 ] && return 0 - [ -n "$hostonly" ] && return 255 - return 0 -} - -depends() { - return 0 -} - -install() { - inst /etc/rdma/mlx4.conf - inst /etc/rdma/modules/infiniband.conf - inst /etc/rdma/modules/iwarp.conf - inst /etc/rdma/modules/opa.conf - inst /etc/rdma/modules/rdma.conf - inst /etc/rdma/modules/roce.conf - inst /usr/libexec/mlx4-setup.sh - inst /usr/lib/modprobe.d/libmlx4.conf - inst_multiple lspci setpci awk sleep - inst_multiple -o /etc/modprobe.d/mlx4.conf - inst_rules 60-rdma-persistent-naming.rules 70-persistent-ipoib.rules 75-rdma-description.rules 90-rdma-hw-modules.rules 90-rdma-ulp-modules.rules 90-rdma-umad.rules - inst_multiple -o \ - $systemdsystemunitdir/rdma-hw.target \ - $systemdsystemunitdir/rdma-load-modules@.service - - for i in \ - rdma-load-modules@rdma.service \ - rdma-load-modules@roce.service \ - rdma-load-modules@infiniband.service; do - $SYSTEMCTL -q --root "$initdir" add-wants initrd.target "$i" - done -} - -installkernel() { - hostonly='' instmods =drivers/infiniband 
=drivers/net/ethernet/mellanox =drivers/net/ethernet/chelsio =drivers/net/ethernet/cisco =drivers/net/ethernet/emulex =drivers/target - hostonly='' instmods crc-t10dif crct10dif_common xprtrdma svcrdma -} diff --git a/suse/module-setup.sh b/suse/module-setup.sh deleted file mode 100644 index cdb89c643..000000000 --- a/suse/module-setup.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -check() { - [ -n "$hostonly" -a -c /sys/class/infiniband_verbs/uverbs0 ] && return 0 - [ -n "$hostonly" ] && return 255 - return 0 -} - -depends() { - return 0 -} - -install() { - inst /etc/rdma/mlx4.conf - inst /etc/rdma/modules/infiniband.conf - inst /etc/rdma/modules/iwarp.conf - inst /etc/rdma/modules/opa.conf - inst /etc/rdma/modules/rdma.conf - inst /etc/rdma/modules/roce.conf - inst /usr/libexec/mlx4-setup.sh - inst_multiple lspci setpci awk sleep - inst_rules 60-rdma-persistent-naming.rules 70-persistent-ipoib.rules 75-rdma-description.rules 90-rdma-hw-modules.rules 90-rdma-ulp-modules.rules - inst_multiple -o \ - $systemdsystemunitdir/rdma-hw.target \ - $systemdsystemunitdir/rdma-load-modules@.service - - for i in \ - rdma-load-modules@rdma.service \ - rdma-load-modules@roce.service \ - rdma-load-modules@infiniband.service; do - $SYSTEMCTL -q --root "$initdir" add-wants initrd.target "$i" - done -} - -installkernel() { - hostonly='' instmods =drivers/infiniband =drivers/net/ethernet/mellanox =drivers/net/ethernet/chelsio =drivers/net/ethernet/cisco =drivers/net/ethernet/emulex =drivers/target - hostonly='' instmods crc-t10dif crct10dif_common xprtrdma svcrdma -} diff --git a/suse/rdma-core.spec b/suse/rdma-core.spec index 869fa2580..99e9b3796 100644 --- a/suse/rdma-core.spec +++ b/suse/rdma-core.spec @@ -483,7 +483,7 @@ mkdir -p %{buildroot}/%{_sysconfdir}/rdma %global dracutlibdir %%{_prefix}/lib/dracut/ mkdir -p %{buildroot}%{_udevrulesdir} -mkdir -p %{buildroot}%{dracutlibdir}/modules.d/05rdma +mkdir -p %{buildroot}%{dracutlibdir}/modules.d/50rdma mkdir -p 
%{buildroot}%{_modprobedir} mkdir -p %{buildroot}%{_unitdir} @@ -496,11 +496,11 @@ chmod 0644 %{buildroot}%{_modprobedir}/mlx4.conf install -D -m0755 redhat/rdma.mlx4-setup.sh %{buildroot}%{_libexecdir}/mlx4-setup.sh # Dracut file for IB support during boot -install -D -m0644 suse/module-setup.sh %{buildroot}%{dracutlibdir}/modules.d/05rdma/module-setup.sh +install -D -m0644 kernel-boot/dracut/50rdma/module-setup.sh %{buildroot}%{dracutlibdir}/modules.d/50rdma/module-setup.sh %if "%{_libexecdir}" != "/usr/libexec" sed 's-/usr/libexec-%{_libexecdir}-g' -i %{buildroot}%{_modprobedir}/50-libmlx4.conf -sed 's-/usr/libexec-%{_libexecdir}-g' -i %{buildroot}%{dracutlibdir}/modules.d/05rdma/module-setup.sh +sed 's-/usr/libexec-%{_libexecdir}-g' -i %{buildroot}%{dracutlibdir}/modules.d/50rdma/module-setup.sh %endif # ibacm @@ -652,8 +652,8 @@ done %{_unitdir}/rdma-load-modules@.service %dir %{dracutlibdir} %dir %{dracutlibdir}/modules.d -%dir %{dracutlibdir}/modules.d/05rdma -%{dracutlibdir}/modules.d/05rdma/module-setup.sh +%dir %{dracutlibdir}/modules.d/50rdma +%{dracutlibdir}/modules.d/50rdma/module-setup.sh %{_udevrulesdir}/../rdma_rename %{_udevrulesdir}/60-rdma-persistent-naming.rules %{_udevrulesdir}/75-rdma-description.rules From 11d1baaa0a440b7dc3383d361b47b4c28f7ff0e7 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 17 Sep 2025 14:08:48 +0800 Subject: [PATCH 43/66] providers/erdma: Add SEND_WITH_INV support Opcode IBV_WR_SEND_WITH_INV may be used by SPDK rdma, so we add support for it. 
Signed-off-by: Cheng Xu --- providers/erdma/erdma_hw.h | 5 ++++- providers/erdma/erdma_verbs.c | 10 ++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/providers/erdma/erdma_hw.h b/providers/erdma/erdma_hw.h index 1eecbcec2..45ca1a7ff 100644 --- a/providers/erdma/erdma_hw.h +++ b/providers/erdma/erdma_hw.h @@ -185,7 +185,10 @@ struct erdma_write_sqe { struct erdma_send_sqe { __le64 hdr; - __be32 imm_data; + union { + __be32 imm_data; + __le32 invalid_stag; + }; __le32 length; struct erdma_sge sgl[]; }; diff --git a/providers/erdma/erdma_verbs.c b/providers/erdma/erdma_verbs.c index 2d33d455d..0ac24b700 100644 --- a/providers/erdma/erdma_verbs.c +++ b/providers/erdma/erdma_verbs.c @@ -583,13 +583,19 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, struct ibv_send_wr *wr, break; case IBV_WR_SEND: case IBV_WR_SEND_WITH_IMM: + case IBV_WR_SEND_WITH_INV: if (wr->opcode == IBV_WR_SEND) opcode = ERDMA_OP_SEND; - else + else if (wr->opcode == IBV_WR_SEND_WITH_IMM) opcode = ERDMA_OP_SEND_WITH_IMM; + else + opcode = ERDMA_OP_SEND_WITH_INV; sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, opcode); send_sqe = sqe; - send_sqe->imm_data = wr->imm_data; + if (wr->opcode == IBV_WR_SEND_WITH_INV) + send_sqe->invalid_stag = htole32(wr->invalidate_rkey); + else + send_sqe->imm_data = wr->imm_data; length_field = &send_sqe->length; /* sgl is in the half of current wqebb (offset 16Byte) */ From 949e4a3ae8998afd51dc418986f2a5241f703786 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 17 Sep 2025 14:11:23 +0800 Subject: [PATCH 44/66] providers/erdma: Fix typo Atomic FAA is short for atomic fetch and add, not FAD. 
Signed-off-by: Cheng Xu --- providers/erdma/erdma_hw.h | 2 +- providers/erdma/erdma_verbs.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/providers/erdma/erdma_hw.h b/providers/erdma/erdma_hw.h index 45ca1a7ff..09f7d63d1 100644 --- a/providers/erdma/erdma_hw.h +++ b/providers/erdma/erdma_hw.h @@ -61,7 +61,7 @@ enum erdma_opcode { ERDMA_OP_LOCAL_INV = 15, ERDMA_OP_READ_WITH_INV = 16, ERDMA_OP_ATOMIC_CAS = 17, - ERDMA_OP_ATOMIC_FAD = 18, + ERDMA_OP_ATOMIC_FAA = 18, ERDMA_NUM_OPCODES = 19, ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 }; diff --git a/providers/erdma/erdma_verbs.c b/providers/erdma/erdma_verbs.c index 0ac24b700..fe001d60f 100644 --- a/providers/erdma/erdma_verbs.c +++ b/providers/erdma/erdma_verbs.c @@ -637,7 +637,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, struct ibv_send_wr *wr, htole64(wr->wr.atomic.compare_add); } else { sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, - ERDMA_OP_ATOMIC_FAD); + ERDMA_OP_ATOMIC_FAA); atomic_sqe->fetchadd_swap_data = htole64(wr->wr.atomic.compare_add); } @@ -879,7 +879,7 @@ static const enum ibv_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { [ERDMA_OP_SEND_WITH_INV] = IBV_WC_SEND, [ERDMA_OP_READ_WITH_INV] = IBV_WC_RDMA_READ, [ERDMA_OP_ATOMIC_CAS] = IBV_WC_COMP_SWAP, - [ERDMA_OP_ATOMIC_FAD] = IBV_WC_FETCH_ADD, + [ERDMA_OP_ATOMIC_FAA] = IBV_WC_FETCH_ADD, }; static const struct { From 359bc2f6e52d8dcfe4f41bc7fc389e4d87954d80 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 17 Sep 2025 14:18:46 +0800 Subject: [PATCH 45/66] proviers/erdma: Fix wrong length passed to ibv_dofork_range The length passed to ibv_dofork_range may be shorter than the actual length of the CQ queue buffer in erdma_destroy_cq, so correct it. 
Signed-off-by: Cheng Xu --- providers/erdma/erdma_verbs.c | 3 ++- providers/erdma/erdma_verbs.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/providers/erdma/erdma_verbs.c b/providers/erdma/erdma_verbs.c index fe001d60f..28fc67c31 100644 --- a/providers/erdma/erdma_verbs.c +++ b/providers/erdma/erdma_verbs.c @@ -193,6 +193,7 @@ struct ibv_cq *erdma_create_cq(struct ibv_context *ctx, int num_cqe, } memset(cq->queue, 0, cq_size); + cq->qbuf_size = cq_size; db_records = erdma_alloc_dbrecords(ectx); if (!db_records) { @@ -262,7 +263,7 @@ int erdma_destroy_cq(struct ibv_cq *base_cq) erdma_dealloc_dbrecords(ctx, cq->db_record); if (cq->queue) { - ibv_dofork_range(cq->queue, cq->depth << CQE_SHIFT); + ibv_dofork_range(cq->queue, cq->qbuf_size); free(cq->queue); } diff --git a/providers/erdma/erdma_verbs.h b/providers/erdma/erdma_verbs.h index ce9a12349..1ba75afa8 100644 --- a/providers/erdma/erdma_verbs.h +++ b/providers/erdma/erdma_verbs.h @@ -63,6 +63,7 @@ struct erdma_cq { uint32_t depth; uint32_t ci; struct erdma_cqe *queue; + size_t qbuf_size; void *db; uint16_t db_offset; From 4010460cbe103c766a8dd514643f9cf5dbac8e36 Mon Sep 17 00:00:00 2001 From: Timon Kruiper Date: Thu, 18 Sep 2025 11:35:00 +0200 Subject: [PATCH 46/66] pyverbs: Release Python GIL when calling blocking CMID functions This releases the Python GIL when calling the following functions: - rdma_get_request (CMID.get_request) - rdma_get_send_comp (CMID.get_send_comp) - rdma_get_recv_comp (CMID.get_recv_comp) This allows the user to create a functional multithreaded Python application using the PyVerbs API. Signed-off-by: Timon Kruiper --- pyverbs/cmid.pyx | 9 ++++++--- pyverbs/librdmacm.pxd | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/pyverbs/cmid.pyx b/pyverbs/cmid.pyx index 371ee2ee3..1b38ce6eb 100644 --- a/pyverbs/cmid.pyx +++ b/pyverbs/cmid.pyx @@ -446,7 +446,8 @@ cdef class CMID(PyverbsCM): :return: New CMID representing the connection request. 
""" to_conn = CMID() - ret = cm.rdma_get_request(self.id, &to_conn.id) + with nogil: + ret = cm.rdma_get_request(self.id, &to_conn.id) if ret != 0: raise PyverbsRDMAErrno('Failed to get request, no connection established') self.ctx = Context(cmid=to_conn) @@ -776,7 +777,8 @@ cdef class CMID(PyverbsCM): :return: The retrieved WC or None if there is no completions """ cdef v.ibv_wc wc - ret = cm.rdma_get_recv_comp(self.id, &wc) + with nogil: + ret = cm.rdma_get_recv_comp(self.id, &wc) if ret < 0: raise PyverbsRDMAErrno('Failed to retrieve receive completion') elif ret == 0: @@ -794,7 +796,8 @@ cdef class CMID(PyverbsCM): :return: The retrieved WC or None if there is no completions """ cdef v.ibv_wc wc - ret = cm.rdma_get_send_comp(self.id, &wc) + with nogil: + ret = cm.rdma_get_send_comp(self.id, &wc) if ret < 0: raise PyverbsRDMAErrno('Failed to retrieve send completion') elif ret == 0: diff --git a/pyverbs/librdmacm.pxd b/pyverbs/librdmacm.pxd index 0d6fa912f..b901f4d7e 100644 --- a/pyverbs/librdmacm.pxd +++ b/pyverbs/librdmacm.pxd @@ -109,7 +109,7 @@ cdef extern from '': int rdma_destroy_id(rdma_cm_id *id) int rdma_get_remote_ece(rdma_cm_id *id, ibv_ece *ece) int rdma_set_local_ece(rdma_cm_id *id, ibv_ece *ece) - int rdma_get_request(rdma_cm_id *listen, rdma_cm_id **id) + int rdma_get_request(rdma_cm_id *listen, rdma_cm_id **id) nogil int rdma_bind_addr(rdma_cm_id *id, sockaddr *addr) int rdma_resolve_addr(rdma_cm_id *id, sockaddr *src_addr, sockaddr *dst_addr, int timeout_ms) @@ -149,8 +149,8 @@ cdef extern from '': int rdma_post_write(rdma_cm_id *id, void *context, void *addr, size_t length, ibv_mr *mr, int flags, uint64_t remote_addr, uint32_t rkey) - int rdma_get_send_comp(rdma_cm_id *id, ibv_wc *wc) - int rdma_get_recv_comp(rdma_cm_id *id, ibv_wc *wc) + int rdma_get_send_comp(rdma_cm_id *id, ibv_wc *wc) nogil + int rdma_get_recv_comp(rdma_cm_id *id, ibv_wc *wc) nogil ibv_mr *rdma_reg_msgs(rdma_cm_id *id, void *addr, size_t length) ibv_mr 
*rdma_reg_read(rdma_cm_id *id, void *addr, size_t length) ibv_mr *rdma_reg_write(rdma_cm_id *id, void *addr, size_t length) From e23cb0a72e454d0f985316679758fe2ff3320988 Mon Sep 17 00:00:00 2001 From: Dennis van der Staay Date: Sat, 11 Oct 2025 08:44:55 -0700 Subject: [PATCH 47/66] mlx5: Fix byte_count type in umr_sg_list_create Change byte_count variable from int to uint64_t to match the reglen parameter type and prevent potential overflow issues when working with large memory registrations in UMR (User-Mode Memory Registration) operations. The function signature uses uint64_t *reglen, but byte_count was declared as int, which could cause type mismatch issues and potential overflow when accumulating large transfer sizes across multiple SGEs. Signed-off-by: Dennis van der Staay --- providers/mlx5/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/providers/mlx5/qp.c b/providers/mlx5/qp.c index 76e1a428b..60f0e2c75 100644 --- a/providers/mlx5/qp.c +++ b/providers/mlx5/qp.c @@ -2106,7 +2106,7 @@ static int umr_sg_list_create(struct mlx5_qp *qp, uint64_t *reglen) { struct mlx5_wqe_data_seg *dseg; - int byte_count = 0; + uint64_t byte_count = 0; int i; size_t tmp; From 15adbcf23df2821e8e5a211a7f527fd2a649fb3e Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 22 Oct 2025 14:14:23 +0800 Subject: [PATCH 48/66] libhns: Fix wrong WQE data when QP wraps around DirectWQE fields are not assigned or cleared explicitly when DirectWQE not used. When QP wraps around, data in these fields from the previous use at the same position still remains and are issued to HW by mistake. Clear these fields before issuing doorbell to HW. 
Fixes: 159933c37450 ("libhns: Add support for direct wqe") Signed-off-by: Junxian Huang --- providers/hns/hns_roce_u_hw_v2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c index 784841f43..f4b7a1747 100644 --- a/providers/hns/hns_roce_u_hw_v2.c +++ b/providers/hns/hns_roce_u_hw_v2.c @@ -1370,6 +1370,9 @@ int hns_roce_u_v2_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr, wqe = get_send_wqe(qp, wqe_idx); qp->sq.wrid[wqe_idx] = wr->wr_id; + /* RC and UD share the same DirectWQE field layout */ + ((struct hns_roce_rc_sq_wqe *)wqe)->byte_4 = 0; + switch (ibvqp->qp_type) { case IBV_QPT_XRC_SEND: hr_reg_write(wqe, RCWQE_XRC_SRQN, From 19bd0c8680f906acc08c4fad13869077be03f699 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Wed, 22 Oct 2025 14:14:24 +0800 Subject: [PATCH 49/66] libhns: Clean up an extra blank line Clean up an extra blank line. Fixes: b47932358e64 ("libhns: Fix the sge num problem of atomic op") Signed-off-by: Junxian Huang --- providers/hns/hns_roce_u_hw_v2.c | 1 - 1 file changed, 1 deletion(-) diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c index f4b7a1747..fd266b10c 100644 --- a/providers/hns/hns_roce_u_hw_v2.c +++ b/providers/hns/hns_roce_u_hw_v2.c @@ -2199,7 +2199,6 @@ static void wr_set_sge_list_rc(struct ibv_qp_ex *ibv_qp, size_t num_sge, return; } - hr_reg_write(wqe, RCWQE_MSG_START_SGE_IDX, qp->sge_info.start_idx & (qp->ex_sge.sge_cnt - 1)); From 2671a1402c615a0e4756b19de0f13be543d39187 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 26 Oct 2025 02:54:32 -0400 Subject: [PATCH 50/66] Update library version to be 61.0 Signed-off-by: Leon Romanovsky --- CMakeLists.txt | 2 +- debian/changelog | 2 +- redhat/rdma-core.spec | 2 +- suse/rdma-core.spec | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0606e29c3..77962804e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-79,7 +79,7 @@ endif() set(PACKAGE_NAME "RDMA") # See Documentation/versioning.md -set(PACKAGE_VERSION "60.0") +set(PACKAGE_VERSION "61.0") # When this is changed the values in these files need changing too: # debian/control # debian/libibverbs1.symbols diff --git a/debian/changelog b/debian/changelog index 70207d365..347fec556 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -rdma-core (60.0-1) unstable; urgency=medium +rdma-core (61.0-1) unstable; urgency=medium * New upstream release. diff --git a/redhat/rdma-core.spec b/redhat/rdma-core.spec index 83ba45cc7..36c411890 100644 --- a/redhat/rdma-core.spec +++ b/redhat/rdma-core.spec @@ -1,5 +1,5 @@ Name: rdma-core -Version: 60.0 +Version: 61.0 Release: 1%{?dist} Summary: RDMA core userspace libraries and daemons diff --git a/suse/rdma-core.spec b/suse/rdma-core.spec index 99e9b3796..9c7e4d68e 100644 --- a/suse/rdma-core.spec +++ b/suse/rdma-core.spec @@ -28,7 +28,7 @@ %define git_ver %{nil} Name: rdma-core -Version: 60.0 +Version: 61.0 Release: 0 Summary: RDMA core userspace libraries and daemons License: BSD-2-Clause OR GPL-2.0-only From 55841caff236f58d9680d9f9b6ada24256b34b85 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Sat, 11 Oct 2025 20:20:07 -0700 Subject: [PATCH 51/66] libibverbs: Document verbs semantic model A user of libibverbs must rely heavily on external documentation, specifically the IBTA vol. 1 specification, to understand how the API is used. However, the API itself has evolved beyond support for only Infiniband. This leaves both users and potential vendors trying to plug into the API struggling, as the names used by the library reflect Infiniband naming, but the concepts have broader use. To provide better guidance on what the current verbs semantic model describes, provide documentation on how major verbs constructs are used. This includes referencing the historical meaning of verbs objects, as well as their evolved use. 
The proposed descriptions are directly intended to help new transports, such as Ultra Ethernet, understand how to adopt verbs for best results and where potential changes may be needed. Signed-off-by: Sean Hefty --- Documentation/libibverbs.md | 285 +++++++++++++++++++++++++++++++++++- 1 file changed, 280 insertions(+), 5 deletions(-) diff --git a/Documentation/libibverbs.md b/Documentation/libibverbs.md index 980f354a3..0f7984382 100644 --- a/Documentation/libibverbs.md +++ b/Documentation/libibverbs.md @@ -1,10 +1,9 @@ # Introduction -libibverbs is a library that allows programs to use RDMA "verbs" for -direct access to RDMA (currently InfiniBand and iWARP) hardware from -userspace. For more information on RDMA verbs, see the InfiniBand -Architecture Specification vol. 1, especially chapter 11, and the RDMA -Consortium's RDMA Protocol Verbs Specification. +libibverbs is a library that allows userspace programs direct +access to high-performance network hardware. See the Verbs +Semantics section at the end of this document for details +on RDMA and verbs constructs. # Using libibverbs @@ -74,3 +73,279 @@ The following table describes the expected behavior when VERBS_LOG_LEVEL is set: |-----------------|---------------------------------|------------------------------------------------| | Regular prints | Output to VERBS_LOG_FILE if set | Output to VERBS_LOG_FILE, or stderr if not set | | Datapath prints | Compiled out, no output | Output to VERBS_LOG_FILE, or stderr if not set | + + +# Verbs Semantics + +Verbs is defined by the InfiniBand Architecture Specification +(vol. 1, chapter 11) as an abstract definition of the functionality +provided by an Infiniband NIC. libibverbs was designed as a formal +software API aligned with that abstraction. As a result, API names, +including the library name, are closely aligned with those defined +for Infiniband. + +However, the library and API have evolved to support additional +high-performance transports and NICs. 
libibverbs constructs have +expanded beyond their traditional roles and definitions, except that +the original Infiniband naming has been kept for backwards +compatibility purposes. + +Today, verbs can be viewed as defining software primitives for +network hardware supporting one or more of the following: + +- Network queues are directly accessible from user space. +- Network hardware can directly access application memory buffers. +- The transport supports RDMA operations. + +The following sections describe select libibverbs constructs in terms +of their current semantics and, where appropriate, historical context. +Items are ordered conceptually. + +*RDMA* +: RDMA takes on several different meanings based on context, + which are further described below. RDMA stands for remote direct memory + access. Historically, RDMA referred to network operations which could + directly read or write application data buffers at the target. + The use of the term RDMA has since evolved to encompass not just + network operations, but also the key features of such devices: + + - Zero-copy: no intermediate buffering + - Low CPU utilization: transport offload + - High bandwidth and low latency + +*RDMA Verbs* +: RDMA verbs is the more generic name given to the libibverbs API, + as it implies support for other transports beyond Infiniband. + A device which supports RDMA verbs is accessible through this library. + + A common, but restricted, industry use of the term RDMA verbs frequently + implies the subset of libibverbs APIs and semantics focused on reliable- + connected communication. This document will use the term RDMA verbs as + a synonym for the libibverbs API as a whole. + +*RDMA-Core* +: The rdma-core is a set of libraries for interfacing with the Linux + kernel RDMA subsystem. Two key rdma-core libraries are this one, + libibverbs, and the librdmacm, which is used to establish connections. + + The rdma-core is considered an essential component of Linux RDMA. 
+ It is used to ensure that the kernel ABI is stable and implements the + user space portion of the kernel RDMA IOCTL API. + +*RDMA Device / Verbs Device / NIC* +: An RDMA or verbs device is one which is accessible through the Linux + RDMA subsystem, and as a result, plugs into the libibverbs and rdma-core + framework. NICs plug into the RDMA subsystem to expose hardware + primitives supported by verbs (described above) or RDMA-like features. + + NICs do not necessarily need to support RDMA operations or transports + in order to leverage the rdma-core infrastructure. It is sufficient for + a NIC to expose similar features found in RDMA devices. + +*RDMA Operation* +: RDMA operations refer to network transport functions that read or write + data buffers at the target without host CPU intervention. RDMA reads + copy data from a remote memory region to the network and return the data + to the initiator of the request. RDMA writes copy data from a local + memory region to the network and place it directly into a memory region + at the target. + +*RDMA Transport* +: An RDMA transport can be considered any transport that supports RDMA + operations. Common RDMA transports include Infiniband, + RoCE (RDMA over Converged Ethernet), RoCE version 2, and iWarp. RoCE + and RoCEv2 are Infiniband transports over the Ethernet link layer, with + differences only in their lower-level addressing. + However, the term Infiniband usually refers to the Infiniband transport + over the Infiniband link layer. RoCE is used when explicitly + referring to Ethernet based solutions. RoCE version 2 is often included + or implied by references to RoCE. + +*Device Node* +: The original intent of device node type was to identify if an Infiniband + device was a NIC, switch, or router. Infiniband NICs were labeled as + channel adapters (CA). Node type was extended to identify the transport + being manipulated by verb primitives. 
Devices which implemented other + transports were assigned new node types. As a result, applications which + targeted a specific transport, such as Infiniband or RoCE, relied on node + type to indirectly identify the transport. + +*Protection Domain (PD)* +: A protection domain provides process-level isolation of resources and is + considered a fundamental security construct for Linux RDMA devices. + A PD defines a boundary between memory regions and queue pairs. A + network data transfer is associated with a single queue pair. That queue + pair may only access a memory region that shares the same protection + domain as itself. This prevents a user space process from accessing + memory buffers outside of its address space. + + Protection domains provide security for regions accessed + by both local and remote operations. Local access includes work requests + posted to HW command queues which reference memory regions. Remote + access includes RDMA operations which read or write memory regions. + + A queue pair is associated with a single PD. The PD verifies that hardware + access to a given lkey or rkey is valid for the specified QP and the + initiating or targeted process has permission to the lkey or rkey. Vendors + may implement a PD using a variety of mechanisms, but are required to meet + the defined security isolation. + +*Memory Region (MR)* +: A memory region identifies a virtual address range known to the NIC. + MRs are registered address ranges accessible by the NIC for local and + remote operations. The process of creating a MR associates the given + virtual address range with a protection domain, in order to ensure + process-level isolation. + + Once allocated, data transfers reference the MR using a key value (lkey + and/or rkey). When accessing a MR as part of a data transfer, an offset + into the memory region is specified. 
The offset is relative to the start + of the region and may either be 0-based or based on the region’s starting + virtual address. + +*lkey* +: The lkey is designed as a hardware identifier for a locally accessed data + buffer. Because work requests are formatted by user space software and + may be written directly to hardware queues, hardware must validate + that the memory buffers being referenced are accessible to the application. + + NIC hardware may not have access to the operating system's + virtual address translation table. Instead, hardware can use the lkey to + identify the registered memory region, which in turn identifies a protection + domain, which finally identifies the calling process. The protection domain + the processing queue pair must match that of the accessed memory region. + This prevents an application from sending data from buffers outside of its + virtual address space. + +*rkey* +: The rkey is designed as a transport identifier for remotely accessed data + buffers. It's conceptually like an lkey, but the value is + shared across the network. An rkey is associated with transport + permissions. + +*Completion Queue (CQ)* +: A completion queue is designed to represent a hardware queue where the + status of asynchronous operations is reported. Each asynchronous + operation (i.e. data transfer) is expected to write a single entry + into the completion queue. + +*Queue Pair (QP)* +: A queue pair was originally defined as a transport addressable set of + hardware queues, with a QP consisting of send and receive queues (defined + below). The evolved definition of a QP refers only to the transport + addressability of an endpoint. A QP's address is identified as a + queue pair number (QPN), which is conceptually like a transport + port number. In networking stack models, a QP is considered a transport + layer object. + + The internal structure of the QP is not constrained to a pair of queues. 
+ The number of hardware queues and their purpose may vary based on how + the QP is configured. A QP may have 0 or more command queues used for + posting data transfer requests (send queues) and 0 or more command queues + for posting data buffers used to receive incoming messages (receive queues). + +*Receive Queue (RQ)* +: Receive queues are command queues belonging to queue pairs. Receive + commands post application buffers to receive incoming data. + + Receive queues are configured as part of queue pair setup. A RQ is + accessed indirectly through the QP when submitting receive work requests. + +*Shared Receive Queue (SRQ)* +: A shared receive queue is a single hardware command queue for posting + buffers to receive incoming data. This command queue may be shared + among multiple QPs, such that data that arrives on any associated QP + may retrieve a previously posted buffer from the SRQ. QPs that share + the same SRQ coordinate their access to posted buffers such that a + single posted operation is matched with a single incoming message. + + Unlike receive queues, SRQs are accessed directly by applications to + submit receive work requests. + +*Send Queue (SQ)* +: More generically, a send queue is a transmit queue. It + represents a command queue for operations that initiate a network operation. + A send queue may also be used to submit commands that update hardware + resources, such as updating memory regions. Network operations submitted + through the send queue include message sends, RDMA reads, RDMA writes, and + atomic operations, among others. + + Send queues are configured as part of queue pair setup. A SQ is + accessed indirectly through the QP when submitting send work requests. + +*Send Message* +: A send message refers to a specific type of transport data transfer. + A send message operation copies data from a local buffer to the network + and transfers the data as a single transport unit. 
The receiving NIC + copies the data from the network into a user posted receive message + buffer(s). + + Like the term RDMA, the meaning of send is context dependent. Send could + refer to the transmit command queue, any operation posted to the transmit + (send) queue, or a send message operation. + +*Work Request (WR)* +: A work request is a command submitted to a queue pair, work queue, or + shared receive queue. Work requests define the type of network operation + to perform, including references to any memory regions the operation will + access. + + A send work request is a transmit operation that is directed to the send + queue of a queue pair. A receive work request is an operation posted + to either a shared receive queue or a QP's receive queue. + +*Address Handle (AH)* +: An address handle identifies the link and/or network layer addressing to + a network port or multicast group. + + With legacy Infiniband, an address handle is a link layer object. For other + transports, including RoCE, the address handle is a network layer object. + +*Global Identifier (GID)* +: Infiniband defines a GID as an optional network-layer or multicast address. + Because GIDs are large enough to store an IPv6 address, their use has evolved + to support other transports. A GID identifies a network port, with the most + well-known GIDs being IPv4 and IPv6 addresses. + +*GID Type* +: The GID type determines the specific type of GID address being referenced. + Additionally, it identifies the set of addressing headers underneath the + transport header. + + An RDMA transport protocol may be layered over different networking stacks. + An RDMA transport may layer directly over a link layer (like Infiniband or + Ethernet), over the network layer (such as IP), or another transport + layer (such as TCP or UDP). The GID type conveys how the RDMA transport + stack is constructed, as well as how the GID address is interpreted. 
+ +*GID Index* +: RDMA addresses are securely managed to ensure that unprivileged + applications do not inject arbitrary source addresses into the network. + Transport addresses are injected by the queue pair. Network addresses + are selected from a set of addresses stored in a source addressing table. + + The source addressing table is referred to as a GID table. The GID index + identifies an entry into that table. The GID table exposed to a user + space process contains only those addresses usable by that process. + Queue pairs are frequently assigned a specific GID index to use for their + source network address when initially configured. + +*Device Context* +: Identifies an instance of an opened RDMA device. + +*command fd - cmd_fd* +: File descriptor used to communicate with the kernel device driver. + Associated with the device context and opened by the library. + The cmd_fd communicates with the kernel via ioctl’s and is used + to allocate, configure, and release device resources. + + Applications interact with the cmd_fd indirectly by calling libibverbs + function calls. + +*async_fd* +: File descriptor used to report asynchronous events. + Associated with the device context and opened by the library. + + Applications may interact directly with the async_fd, such as waiting + on the fd via select/poll, to receive notifications when an async event + has been reported. From a8a8b6659b16ddd3b8eac259709c6b14accabebe Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Mon, 9 Dec 2024 16:52:29 -0800 Subject: [PATCH 52/66] libibverbs: Introduce ultra ethernet transport support Ultra ethernet is a new connectionless transport that targets HPC and AI applications running at extreme scale. Introduce new node and transport types for devices that only support the new ultra ethernet transport. UET may be layered over UDP/IP using a well-known UDP port (similar to RoCEv2), or may be layered directly over IP. 
Define new GID types to allow users to
select UET plus the underlying protocol layering (similar to
how RoCEv1 and RoCEv2 are handled).

Signed-off-by: Sean Hefty
---
 libibverbs/verbs.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h
index 821341242..a15403976 100644
--- a/libibverbs/verbs.h
+++ b/libibverbs/verbs.h
@@ -74,6 +74,8 @@ enum ibv_gid_type {
 	IBV_GID_TYPE_IB,
 	IBV_GID_TYPE_ROCE_V1,
 	IBV_GID_TYPE_ROCE_V2,
+	IBV_GID_TYPE_UET_UDP,
+	IBV_GID_TYPE_UET_IP,
 };
 
 struct ibv_gid_entry {

From f03837f26b2349fdd20450108c1d6f33b05e8442 Mon Sep 17 00:00:00 2001
From: Sean Hefty
Date: Mon, 9 Dec 2024 18:22:23 -0800
Subject: [PATCH 53/66] libibverbs: Add support for UET QPs

UET is designed around connectionless communication. To
expose UET through verbs, we introduce a new reliable-
unconnected QP type (named to align with existing QP types).

Infiniband defines several states that a QP may be in. Many
of the states are unsuitable for unconnected QPs in general
and may be irrelevant depending on HW implementations.
For UET, we define only 2 states for a UET QP: RTS and
error. A UET QP is created in the ready-to-send state.

To create a UET QP directly into the RTS state, the full
set of QP attributes are needed at creation time. Struct
ibv_qp_init_attr_ex is extended to include struct
ibv_qp_attr for this purpose.
Signed-off-by: Sean Hefty
---
 libibverbs/verbs.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h
index a15403976..c5d1e3003 100644
--- a/libibverbs/verbs.h
+++ b/libibverbs/verbs.h
@@ -932,6 +932,7 @@ enum ibv_qp_type {
 	IBV_QPT_RAW_PACKET = 8,
 	IBV_QPT_XRC_SEND = 9,
 	IBV_QPT_XRC_RECV,
+	IBV_QPT_RU,
 	IBV_QPT_DRIVER = 0xff,
 };
 
@@ -961,6 +962,7 @@ enum ibv_qp_init_attr_mask {
 	IBV_QP_INIT_ATTR_IND_TABLE = 1 << 4,
 	IBV_QP_INIT_ATTR_RX_HASH = 1 << 5,
 	IBV_QP_INIT_ATTR_SEND_OPS_FLAGS = 1 << 6,
+	IBV_QP_INIT_ATTR_QP_ATTR = 1 << 7,
 };
 
 enum ibv_qp_create_flags {
@@ -1015,6 +1017,9 @@ struct ibv_qp_init_attr_ex {
 	uint32_t source_qpn;
 	/* See enum ibv_qp_create_send_ops_flags */
 	uint64_t send_ops_flags;
+
+	struct ibv_qp_attr *qp_attr;
+	int qp_attr_mask;
 };
 
 enum ibv_qp_open_attr_mask {

From d8f141adae5e690d673eb9a22dedc8b9d12409d7 Mon Sep 17 00:00:00 2001
From: Sean Hefty
Date: Mon, 14 Apr 2025 12:32:35 -0700
Subject: [PATCH 54/66] libibverbs: Add job id support

Job IDs are used to identify a distributed application.
The concept is widely used in HPC and AI applications, to
identify a set of distributed processes as belonging to a
single application.

Job IDs are integral to ultra ethernet. A job ID is carried
in every transport message and is part of a UET QP address.

UEC defines that job IDs must be managed by a privileged
entity. The association of a job ID to a specific QP is a
protected operation. A simple view of the job security
model is shown as this object model:

    device <--- job ID
     ^  ^
     |  |
     PD <--- job key
     ^ ^ ^
     | \___
     |     (optional)
     QP --- MR

This patch focuses on the job ID. Job keys are discussed
in a following patch.

We define new verb calls to allocate a job object. Each job
object is assigned a unique ID. The assignment of ID values
to job objects is outside the scope of the API, and would
usually be handled through a job launcher or process
manager.

The ibv_alloc_job() call is used to create and configure a
job object.
It is expected that the kernel will enforce that callers have the proper privileges to create job objects on devices. (Similar to opening QP 0 or 1). Once a job object has been created, it may be shared with local processes using a shared fd mechanism. The creating process obtains a sharable fd using ibv_export_job() and exchanges the fd with the processes of the job (e.g. via sockets). On receiving the fd, the processes use ibv_import_job() to setup local job resources. A job is associated with addressing information, which includes protocol stack data, as well as an ID. The number of bytes of the ID which are valid is dependent on the associated protocol. For UET, it is 3-bytes. A job object performs an additional function beyond linking a QP with a job ID. It defines a mechanism by which local processes can share addressing information of peers. This can reduce the amount of memory used to store addresses locally and enables future optimizations, such as applying job level encryption. The feature will also map well to HPC and AI applications that identify peers using a rank. Conceptually, a virtual address array may be stored with a job object. Addresses are inserted or removed from the array at a given index location. The intent is that the index can map directly to the process' rank. When sending to a peer, the peer can be identified by the job plus the index. Note that the implementation for the job's addressing array is not defined. A vendor may implement this in a variety of ways. Addresses may be pre-inserted by the job launcher, and the transport addresses may be generated using an algorithm. 
Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index c5d1e3003..d418bd4fb 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -363,6 +363,8 @@ struct ibv_device_attr_ex { struct ibv_pci_atomic_caps pci_atomic_caps; uint32_t xrc_odp_caps; uint32_t phys_port_cnt_ex; + uint32_t max_job_ids; + uint32_t max_addr_entries; }; enum ibv_mtu { @@ -2000,6 +2002,40 @@ struct ibv_flow_action_esp_attr { uint32_t esn; }; +struct ibv_job { + struct ibv_context *context; + void *user_context; + uint32_t handle; +}; + +struct ibv_job_attr { + uint32_t comp_mask; + unsigned int flags; + uint64_t id; + uint32_t max_addr_entries; + enum ibv_qp_type qp_type; + struct ibv_ah_attr ah_attr; +}; + +struct ibv_job * +ibv_alloc_job(struct ibv_context *context, struct ibv_job_attr *attr, + void *user_context); +int ibv_close_job(struct ibv_job *job); + +int ibv_insert_addr(struct ibv_job *job, uint32_t qpn, + struct ibv_ah_attr ah_attr, + unsigned int addr_idx, unsigned int flags); +int ibv_remove_addr(struct ibv_job *job, unsigned int addr_idx, + unsigned int flags); +int ibv_query_addr(struct ibv_job *job, unsigned int addr_idx, + uint32_t *qpn, struct ibv_ah_attr *ah_attr, + unsigned int flags); + +int ibv_export_job(struct ibv_job *job, int *fd); +int ibv_import_job(struct ibv_context *context, int fd, struct ibv_job **job); + +int ibv_query_job(struct ibv_job *job, struct ibv_job_attr *attr); + struct ibv_device; struct ibv_context; From 8cea5c15babfddeae4a2adb38234551d0637134a Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Tue, 15 Apr 2025 10:20:42 -0700 Subject: [PATCH 55/66] libibverbs: Add job key support The job object model can be viewed as: device <--- job ID ^ ^ | | PD <--- job key ^ ^ ^ | \___ | (optional) QP --- MR This patch introduces the job key object. The relationship between a job key and a job ID is similar to an lkey to a MR. 
A job object maps to a job ID value. Job objects are device level objects. A job key associates the job ID with a protection domain to provide process level protections. Job keys are associated with a 32-bit jkey value. The jkey will be used when posting a WR to associate a transfer with a specific job. That is, the jkey is what mirrors the lkey concept. The NIC converts the jkey to the job ID when transmitting packets on the wire, applying appropriate checks that the QP has access to the target job ID. E.g. the job key and QP belong to the same PD. UET allows a registered MR to optionally be accessible only to members of a specific job. The job key will also be used as an optional attribute when creating a MR. Details on associating a MR with a job key are defined in a later patch. Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index d418bd4fb..fe88f9948 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -365,6 +365,7 @@ struct ibv_device_attr_ex { uint32_t phys_port_cnt_ex; uint32_t max_job_ids; uint32_t max_addr_entries; + uint32_t max_jkeys_per_pd; }; enum ibv_mtu { @@ -2036,6 +2037,17 @@ int ibv_import_job(struct ibv_context *context, int fd, struct ibv_job **job); int ibv_query_job(struct ibv_job *job, struct ibv_job_attr *attr); +struct ibv_job_key { + struct ibv_pd *pd; + uint32_t handle; + uint32_t jkey; +}; + +struct ibv_job_key * +ibv_create_jkey(struct ibv_pd *pd, struct ibv_job *job, unsigned int flags); +int ibv_destroy_jkey(struct ibv_job_key *job_key); + + struct ibv_device; struct ibv_context; From 7a77e43d8334628f1c6b380e4705f0251f3b6359 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 11 Dec 2024 10:50:29 -0800 Subject: [PATCH 56/66] libibverbs: Allow posting WRs for RU QPs Add new extended QP functions to set necessary input fields related to supporting RU QPs and UE transport. 
The UE transport supports 64-bits of immediate data and
64-bit rkeys. Provide expanded APIs to support both.

Also include APIs to set full UET destination address data.
UET QPs have an additional address component beyond the QP
or endpoint address. They have a concept defined as a
resource index. A resource index can be viewed as
additional receive queues attached to the QP, which are
directly addressable by a sender.

One intended use of resource indices is to allow a single
UET QP to separate traffic from different services. For
example, HPC traffic may use one subset of indices, AI
traffic a different subset, and storage a third. The number
of resource indices supported by a QP is vendor specific,
and how they are used by applications is outside the scope
of the verbs API. The resource index concept reuses the
verbs work queue concept.

A new send WR flag is also added, delivery complete. When
requested and supported by the provider, this flag
indicates that a completion for the send operation
indicates that the data is globally observable at the
target. This is an optional feature of the UE transport.
Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 58 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index fe88f9948..97e6ba498 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -1160,7 +1160,8 @@ enum ibv_send_flags { IBV_SEND_SIGNALED = 1 << 1, IBV_SEND_SOLICITED = 1 << 2, IBV_SEND_INLINE = 1 << 3, - IBV_SEND_IP_CSUM = 1 << 4 + IBV_SEND_IP_CSUM = 1 << 4, + IBV_SEND_DELIVERY_COMPLETE = 1 << 5, }; enum ibv_placement_type { @@ -1390,6 +1391,19 @@ struct ibv_qp_ex { void (*wr_flush)(struct ibv_qp_ex *qp, uint32_t rkey, uint64_t remote_addr, size_t len, uint8_t type, uint8_t level); + + void (*wr_send_imm64)(struct ibv_qp_ex *qp, __be64 imm_data); + void (*wr_rdma_read64)(struct ibv_qp_ex *qp, uint64_t rkey, + uint64_t remote_addr); + void (*wr_rdma_write64)(struct ibv_qp_ex *qp, uint64_t rkey, + uint64_t remote_addr); + void (*wr_rdma_write64_imm)(struct ibv_qp_ex *qp, uint64_t rkey, + uint64_t remote_addr, __be64 imm_data); + void (*wr_set_ru_addr)(struct ibv_qp_ex *qp, struct ibv_ah *ah, + uint32_t remote_qpn, uint32_t jkey); + void (*wr_set_job_addr)(struct ibv_qp_ex *qp, unsigned int addr_idx, + uint32_t jkey); + void (*wr_set_wq_num)(struct ibv_qp_ex *qp, uint32_t wq_num); }; struct ibv_qp_ex *ibv_qp_to_qp_ex(struct ibv_qp *qp); @@ -1426,12 +1440,24 @@ static inline void ibv_wr_rdma_read(struct ibv_qp_ex *qp, uint32_t rkey, qp->wr_rdma_read(qp, rkey, remote_addr); } +static inline void ibv_wr_rdma_read64(struct ibv_qp_ex *qp, uint64_t rkey, + uint64_t remote_addr) +{ + qp->wr_rdma_read64(qp, rkey, remote_addr); +} + static inline void ibv_wr_rdma_write(struct ibv_qp_ex *qp, uint32_t rkey, uint64_t remote_addr) { qp->wr_rdma_write(qp, rkey, remote_addr); } +static inline void ibv_wr_rdma_write64(struct ibv_qp_ex *qp, uint64_t rkey, + uint64_t remote_addr) +{ + qp->wr_rdma_write64(qp, rkey, remote_addr); +} + static inline void ibv_wr_flush(struct 
ibv_qp_ex *qp, uint32_t rkey, uint64_t remote_addr, size_t len, uint8_t type, uint8_t level) @@ -1445,6 +1471,12 @@ static inline void ibv_wr_rdma_write_imm(struct ibv_qp_ex *qp, uint32_t rkey, qp->wr_rdma_write_imm(qp, rkey, remote_addr, imm_data); } +static inline void ibv_wr_rdma_write64_imm(struct ibv_qp_ex *qp, uint64_t rkey, + uint64_t remote_addr, __be64 imm_data) +{ + qp->wr_rdma_write64_imm(qp, rkey, remote_addr, imm_data); +} + static inline void ibv_wr_send(struct ibv_qp_ex *qp) { qp->wr_send(qp); @@ -1455,6 +1487,11 @@ static inline void ibv_wr_send_imm(struct ibv_qp_ex *qp, __be32 imm_data) qp->wr_send_imm(qp, imm_data); } +static inline void ibv_wr_send_imm64(struct ibv_qp_ex *qp, __be64 imm_data) +{ + qp->wr_send_imm64(qp, imm_data); +} + static inline void ibv_wr_send_inv(struct ibv_qp_ex *qp, uint32_t invalidate_rkey) { @@ -1473,6 +1510,25 @@ static inline void ibv_wr_set_ud_addr(struct ibv_qp_ex *qp, struct ibv_ah *ah, qp->wr_set_ud_addr(qp, ah, remote_qpn, remote_qkey); } +static inline void ibv_wr_set_ru_addr(struct ibv_qp_ex *qp, struct ibv_ah *ah, + uint32_t remote_qpn, uint32_t jkey) +{ + qp->wr_set_ru_addr(qp, ah, remote_qpn, jkey); +} + +static inline void ibv_wr_set_job_addr(struct ibv_qp_ex *qp, + unsigned int addr_idx, + uint32_t jkey) +{ + qp->wr_set_job_addr(qp, addr_idx, jkey); +} + +static inline void ibv_wr_set_wq_num(struct ibv_qp_ex *qp, + uint32_t wq_num) +{ + qp->wr_set_wq_num(qp, wq_num); +} + static inline void ibv_wr_set_xrc_srqn(struct ibv_qp_ex *qp, uint32_t remote_srqn) { From 387ed8e16b52a88b30f9f2658204f84986a190c1 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 11 Dec 2024 11:24:12 -0800 Subject: [PATCH 57/66] libibverbs: Report UET transport details in completions Allow UET specific information to be reported as part of work completions. This includes the larger immediate data size, the job ID carried in the transport header, and a peer ID, also carried in the transport header. 
Included with completion data is a UET transport field, called the initiator in UEC terminology. This is a user configurable value intended to map to the rank number for a parallel application. The initiator field only has meaning within a specific job ID. As a result, when the value is valid in a completion, so is the job ID. (For UET, the initiator value is part of the UET address.) The verbs naming of this field is the slightly more generic term, src_id, to align with src_qpn (in ibv_wc). Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 97e6ba498..a5762e6ba 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -562,6 +562,8 @@ enum ibv_create_cq_wc_flags { IBV_WC_EX_WITH_FLOW_TAG = 1 << 9, IBV_WC_EX_WITH_TM_INFO = 1 << 10, IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK = 1 << 11, + IBV_WC_EX_WITH_IMM64 = 1 << 12, + IBV_WC_EX_WITH_SRC_ID = 1 << 13, /* implies job id */ }; enum { @@ -1659,6 +1661,9 @@ struct ibv_cq_ex { void (*read_tm_info)(struct ibv_cq_ex *current, struct ibv_wc_tm_info *tm_info); uint64_t (*read_completion_wallclock_ns)(struct ibv_cq_ex *current); + __be64 (*read_imm64_data)(struct ibv_cq_ex *current); + uint64_t (*read_job_id)(struct ibv_cq_ex *current); + uint32_t (*read_src_id)(struct ibv_cq_ex *current); }; static inline struct ibv_cq *ibv_cq_ex_to_cq(struct ibv_cq_ex *cq) @@ -1717,6 +1722,11 @@ static inline __be32 ibv_wc_read_imm_data(struct ibv_cq_ex *cq) return cq->read_imm_data(cq); } +static inline __be64 ibv_wc_read_imm64_data(struct ibv_cq_ex *cq) +{ + return cq->read_imm64_data(cq); +} + static inline uint32_t ibv_wc_read_invalidated_rkey(struct ibv_cq_ex *cq) { #ifdef __CHECKER__ @@ -1736,6 +1746,16 @@ static inline uint32_t ibv_wc_read_src_qp(struct ibv_cq_ex *cq) return cq->read_src_qp(cq); } +static inline uint64_t ibv_wc_read_job_id(struct ibv_cq_ex *cq) +{ + return cq->read_job_id(cq); +} + +static inline 
uint32_t ibv_wc_read_src_id(struct ibv_cq_ex *cq) +{ + return cq->read_src_id(cq); +} + static inline unsigned int ibv_wc_read_wc_flags(struct ibv_cq_ex *cq) { return cq->read_wc_flags(cq); From 7279497b7410a2b222d110f83404e7d0048c0243 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 15 Oct 2025 11:00:58 -0700 Subject: [PATCH 58/66] libibverbs: Support memory registrations for UET The UET protocol and devices support advanced features for memory regions. From the viewpoint of the protocol, an rkey is 64-bits, with specific meaning applied to several of the bits. Struct ibv_mr is extended to report a 64-bit rkey. Providers are expected to set the 32-bit rkey and/or rkey64 field in struct ibv_mr correctly based on the transports supported by the device. A second protocol feature is that a MR may be restricted to being accessible by a specific job. Since a UET QP may be used to communicate with multiple jobs simultaneously, the memory registration call is expanded to allow associating a job key with a MR. 
Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index a5762e6ba..3fd45a9a1 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -686,6 +686,7 @@ struct ibv_mr { uint32_t handle; uint32_t lkey; uint32_t rkey; + uint64_t rkey64; }; enum ibv_mr_init_attr_mask { @@ -694,6 +695,7 @@ enum ibv_mr_init_attr_mask { IBV_REG_MR_MASK_FD = 1 << 2, IBV_REG_MR_MASK_FD_OFFSET = 1 << 3, IBV_REG_MR_MASK_DMAH = 1 << 4, + IBV_REG_MR_MASK_JKEY = 1 << 5, }; struct ibv_mr_init_attr { @@ -705,6 +707,7 @@ struct ibv_mr_init_attr { int fd; uint64_t fd_offset; struct ibv_dmah *dmah; + struct ibv_job_key *jkey; }; enum ibv_mw_type { From 7794cae7963198fd2a5e355d313ea248e85e5949 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 30 Jul 2025 14:24:01 -0700 Subject: [PATCH 59/66] libibverbs: Support adjustable QP msg and data semantics UET defines multiple packet delivery modes: ROD - reliable, ordered delivery RUD - reliable, unordered delivery RUDI - reliable, unordered delivery for idempotent transfers UUD - unreliable, unordered delivery The packet delivery modes impact how out of order packets are handled at the receiver, retry mechanisms, multi-pathing support, and congestion control algorithms, among other behavior. A single UET QP may use multiple packet delivery modes simultaneously based on the application data transfer being performed. Even traditional RDMA protocols are evolving to allow greater flexibility in how message and data ordering are delivered at the receiver. This patch introduces a new QP attribute structure called QP semantics. This structure defines the message and data ordering requirements that a QP must implement. If a QP cannot meet the requested semantics, QP creation should fail, but a vendor can always provide stronger guarantees than those requested by the user. 
QP semantics indicate if the QP must provide message and
data ordering guarantees, such as write-after-write, read-
after-write, send-after-write, etc. Traditionally, these
ordering guarantees were defined by the relevant RDMA
specifications, and users of the libibverbs API needed to
know to reference those specs in order to use a QP
correctly (such as when to fence data transfers). As an
alternative, a new device level query call is added, which
can return the supported ordering guarantees for a given QP
type over a specific transport.

The QP semantics may optionally be passed into the create
QP operation. After querying for supported semantics,
applications can remove unneeded ordering guarantees in
order to leverage available network features (such as
multipath support). This allows vendors to adjust transport
behavior accordingly. For example, UET can leverage ROD
when sending messages, but use RUD or RUDI for RDMA
transfers.

Data ordering between messages is further defined to
indicate the maximum size transfer that ordering holds.
For example, RDMA write-after-read ordering may be
restricted to single MTU transfers.

Finally, as a 'fix' to MTU sizes forced to being a power
of 2, a max_pdu is introduced. The max PDU reports the
maximum size of *user* data that can be carried in a single
transport packet. The max PDU is relative to the port MTU,
minus protocol headers.
Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 3fd45a9a1..e4cff1cdd 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -114,6 +114,39 @@ enum ibv_transport_type { IBV_TRANSPORT_UNSPECIFIED, }; +enum ibv_qp_msg_order { + /* Atomic-Atomic Rd/Wr ordering */ + IBV_ORDER_ATOMIC_RAR = (1 << 0), + IBV_ORDER_ATOMIC_RAW = (1 << 1), + IBV_ORDER_ATOMIC_WAR = (1 << 2), + IBV_ORDER_ATOMIC_WAW = (1 << 3), + /* RDMA-RDMA Rd/Wr ordering */ + IBV_ORDER_RDMA_RAR = (1 << 4), + IBV_ORDER_RDMA_RAW = (1 << 5), + IBV_ORDER_RDMA_WAR = (1 << 6), + IBV_ORDER_RDMA_WAW = (1 << 7), + /* Send ordering wrt Atomic and RDMA Rd/Wr */ + IBV_ORDER_RAS = (1 << 8), + IBV_ORDER_SAR = (1 << 9), + IBV_ORDER_SAS = (1 << 10), + IBV_ORDER_SAW = (1 << 11), + IBV_ORDER_WAS = (1 << 12), + /* Atomic and RDMA Rd/Wr ordering */ + IBV_ORDER_RAR = (1 << 13), + IBV_ORDER_RAW = (1 << 14), + IBV_ORDER_WAR = (1 << 15), + IBV_ORDER_WAW = (1 << 16), +}; + +struct ibv_qp_semantics { + uint32_t comp_mask; + uint32_t msg_order; + uint32_t max_rdma_raw_size; + uint32_t max_rdma_war_size; + uint32_t max_rdma_waw_size; + uint32_t max_pdu; +}; + enum ibv_device_cap_flags { IBV_DEVICE_RESIZE_MAX_WR = 1, IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, @@ -971,6 +1004,7 @@ enum ibv_qp_init_attr_mask { IBV_QP_INIT_ATTR_RX_HASH = 1 << 5, IBV_QP_INIT_ATTR_SEND_OPS_FLAGS = 1 << 6, IBV_QP_INIT_ATTR_QP_ATTR = 1 << 7, + IBV_QP_INIT_ATTR_QP_SEMANTICS = 1 << 8, }; enum ibv_qp_create_flags { @@ -1028,6 +1062,7 @@ struct ibv_qp_init_attr_ex { struct ibv_qp_attr *qp_attr; int qp_attr_mask; + struct ibv_qp_semantics *qp_semantics; }; enum ibv_qp_open_attr_mask { @@ -2316,6 +2351,11 @@ struct ibv_values_ex { struct verbs_context { /* "grows up" - new fields go here */ + int (*query_qp_semantics)(struct ibv_context *context, + enum ibv_qp_type qp_type, + struct ibv_ah_attr *ah_attr, + struct 
ibv_qp_semantics *qp_semantics, + size_t qp_semantic_len); struct ibv_mr *(*reg_mr_ex)(struct ibv_pd *pd, struct ibv_mr_init_attr *mr_init_attr); int (*dealloc_dmah)(struct ibv_dmah *dmah); @@ -2658,6 +2698,21 @@ int ibv_query_pkey(struct ibv_context *context, uint8_t port_num, int ibv_get_pkey_index(struct ibv_context *context, uint8_t port_num, __be16 pkey); +static inline int ibv_query_qp_semantics(struct ibv_context *context, + enum ibv_qp_type qp_type, + struct ibv_ah_attr *ah_attr, + struct ibv_qp_semantics *qp_semantics, + size_t qp_semantic_len) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, query_qp_semantics); + + if (!vctx) + return EOPNOTSUPP; + + return vctx->query_qp_semantics(context, qp_type, ah_attr, + qp_semantics, qp_semantic_len); +} + /** * ibv_alloc_pd - Allocate a protection domain */ From 388704eda586b925332f60e0da31a565f41ea23a Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Tue, 29 Jul 2025 14:49:24 -0700 Subject: [PATCH 60/66] libibverbs: Allow provider to describe immediate data limits Legacy RDMA transports are restricted to 32-bits of immediate data, while UET supports 64-bits. Additionally, UET does not require that RDMA writes with immediate consume a posted receive buffer at the target. The spec even goes so far as to mandate that RDMA traffic be treated separately at the target than send operations; however, such a mandate is not visible in the transport and places restrictions on the NIC implementation. NICs that support multiple protocols, including UET, may be optimized for legacy RDMA support. For example, CQ entries may only be able to store 32-bits of immediate data. To handle different implementations and transports, we extend the QP semantic structure to report the immediate data size, as well as implementation constraints, such as the need to consume a posted receive buffer. 
This change has an added advantage that it is now possible for a user to indicate that immediate data will not be used by setting the size to 0 when creating the QP. For devices which support a smaller immediate data size than that carried by the transport, truncated immediate data is extended with 0s when writing to the wire, and completions report the lowest valid bits. The QP semantics are extended with a new use_flags. These flags will allow providers to direct applications on constraints on using the HW, allowing greater flexibility in implementations. When set, IBV_QP_USAGE_IMM_DATA_RQ indicates that RDMA writes with immediate data will consume a posted receive buffer on the QP. This is standard behavior for legacy RDMA transports, but not for UET. By setting this flag, a provider can indicate this as their default requirement even when using UET QPs. Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index e4cff1cdd..5d852ac85 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -138,6 +138,10 @@ enum ibv_qp_msg_order { IBV_ORDER_WAW = (1 << 16), }; +enum ibv_qp_use_flags { + IBV_QP_USAGE_IMM_DATA_RQ = (1 << 0), +}; + struct ibv_qp_semantics { uint32_t comp_mask; uint32_t msg_order; @@ -145,6 +149,8 @@ struct ibv_qp_semantics { uint32_t max_rdma_war_size; uint32_t max_rdma_waw_size; uint32_t max_pdu; + uint8_t imm_data_size; + unsigned int usage_flags; }; enum ibv_device_cap_flags { From a3223c2dd674f365ad3f870d52b3174f3d425e29 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 15 Oct 2025 16:03:14 -0700 Subject: [PATCH 61/66] libibverbs: Define attaching a MR to a QP Legacy RDMA devices immediately expose a new MR as soon as the memory registration process completes. That is, even before reg_mr() returns to the caller, the region is accessible to any QP sharing the same PD. UET allows for greater control over access to a MR. 
Even once a MR has been created, exposure to the MR is treated as a separate operation. This further allows access to a MR to be invoked without it being destroyed, which enables a MR to be used-once. E.g. The MR may be the target of a single RDMA operation, with access controlled by the owner of the MR. This behavior differs from the remote invalidate operation. To support this additional level of control, we introduce new QP operations: attach MR and detach MR. A provider indicates that MRs must be explicitly attached to a QP through a new QP usage flag, as this behavior may be specific to a given transport protocol + QP type. E.g. UET + RU QPs may support MR attachment, but UET + UD QPs may not (since the feature is not required). Support and the need to attach a MR to a QP is indicated by the IBV_QP_USAGE_ATTACH_MR usage flag. Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 5d852ac85..692494abe 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -140,6 +140,7 @@ enum ibv_qp_msg_order { enum ibv_qp_use_flags { IBV_QP_USAGE_IMM_DATA_RQ = (1 << 0), + IBV_QP_USAGE_ATTACH_MR = (1 << 1), }; struct ibv_qp_semantics { @@ -2357,6 +2358,8 @@ struct ibv_values_ex { struct verbs_context { /* "grows up" - new fields go here */ + int (*attach_mr)(struct ibv_qp *qp, struct ibv_mr *mr); + int (*detach_mr)(struct ibv_qp *qp, struct ibv_mr *mr); int (*query_qp_semantics)(struct ibv_context *context, enum ibv_qp_type qp_type, struct ibv_ah_attr *ah_attr, @@ -2925,6 +2928,22 @@ static inline int ibv_dealloc_mw(struct ibv_mw *mw) return mw->context->ops.dealloc_mw(mw); } +static inline int ibv_attach_mr(struct ibv_qp *qp, struct ibv_mr *mr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(qp->context, attach_mr); + + if (!vctx) + return EOPNOTSUPP; + + return vctx->attach_mr(qp, mr); +} + +static inline int ibv_detach_mr(struct ibv_qp *qp, struct ibv_mr 
*mr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(qp->context, detach_mr); + return vctx ? vctx->detach_mr(qp, mr) : EOPNOTSUPP; + } + /** * ibv_inc_rkey - Increase the 8 lsb in the given rkey */ From 0451f913c1b55da3a1073f1204de1c92835df348 Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 15 Oct 2025 14:41:39 -0700 Subject: [PATCH 62/66] libibverbs: Add support for user to select the rkey UET allows for user selected rkey values to improve scalability. Expose support via a device capability flag and update memory registration accordingly. Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 692494abe..7fe8d7945 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -193,6 +193,7 @@ enum ibv_fork_status { */ #define IBV_DEVICE_RAW_SCATTER_FCS (1ULL << 34) #define IBV_DEVICE_PCI_WRITE_END_PADDING (1ULL << 36) +#define IBV_DEVICE_USER_RKEY (1ULL << 37) enum ibv_atomic_cap { IBV_ATOMIC_NONE, @@ -736,6 +737,7 @@ enum ibv_mr_init_attr_mask { IBV_REG_MR_MASK_FD_OFFSET = 1 << 3, IBV_REG_MR_MASK_DMAH = 1 << 4, IBV_REG_MR_MASK_JKEY = 1 << 5, + IBV_REG_MR_MASK_RKEY = 1 << 6, }; struct ibv_mr_init_attr { @@ -748,6 +750,7 @@ struct ibv_mr_init_attr { uint64_t fd_offset; struct ibv_dmah *dmah; struct ibv_job_key *jkey; + uint64_t rkey; }; enum ibv_mw_type { From 2d3fca9842e1e7814af2e3256bae652a806ec98c Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Wed, 15 Oct 2025 17:49:51 -0700 Subject: [PATCH 63/66] libibverbs: Add support for 'derived' MRs Introduce a concept called derived memory regions. Derived MRs are similar to legacy RDMA memory windows, but set up through the memory registration API, rather than post send. Derived MRs are new MRs that are wholly contained within an existing MR (to share page mappings, for example), but have different access rights or other attributes.
For UET, a derived MR allows a MR to be associated with different jobs, with the access for each job to be different, while still being able to share the underlying HW page mappings. Applications must assume that a derived MR holds a reference on the original MR. The original MR may not be destroyed until all derived MRs have been closed. When a MR is created, a derive_cnt field may be provided to indicate the number of expected derived MRs that an application intends to create. This field is considered an optimization and may be ignored by the provider. Providers that do not support derived MRs may simply create a new MR without sharing resources with the original MR. A derived MR is subject to reported provider restrictions, such as IBV_QP_USAGE_ATTACH_MR. Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 7fe8d7945..ac6a32a80 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -738,6 +738,8 @@ enum ibv_mr_init_attr_mask { IBV_REG_MR_MASK_DMAH = 1 << 4, IBV_REG_MR_MASK_JKEY = 1 << 5, IBV_REG_MR_MASK_RKEY = 1 << 6, + IBV_REG_MR_MASK_CUR_MR = 1 << 7, + IBV_REG_MR_MASK_DERIVE_CNT = 1 << 8, }; struct ibv_mr_init_attr { @@ -751,6 +753,8 @@ struct ibv_mr_init_attr { struct ibv_dmah *dmah; struct ibv_job_key *jkey; uint64_t rkey; + struct ibv_mr *cur_mr; + uint32_t derive_cnt; }; enum ibv_mw_type { From bba2936255d2b1e15f48f17e6e7b4778d2d40ecc Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 16 Oct 2025 12:54:39 -0700 Subject: [PATCH 64/66] libibverbs: Add UET initiator setting The UET initiator is equivalent to an MPI rank or CCL communicator ID. It is a user settable value used for tag matching purposes. UET carries the initiator field directly in the transport header. Extend the initiator QP attributes to allow user to set the value. We use the more generic term, src_id, instead of the UET specific term. The naming is aligned with src_qpn in ibv_wc. 
Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index ac6a32a80..4d870e17a 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -1019,6 +1019,7 @@ enum ibv_qp_init_attr_mask { IBV_QP_INIT_ATTR_SEND_OPS_FLAGS = 1 << 6, IBV_QP_INIT_ATTR_QP_ATTR = 1 << 7, IBV_QP_INIT_ATTR_QP_SEMANTICS = 1 << 8, + IBV_QP_INIT_ATTR_SRC_ID = 1 << 9, }; enum ibv_qp_create_flags { @@ -1077,6 +1078,7 @@ struct ibv_qp_init_attr_ex { struct ibv_qp_attr *qp_attr; int qp_attr_mask; struct ibv_qp_semantics *qp_semantics; + uint32_t src_id; }; enum ibv_qp_open_attr_mask { From 85cc0e79eba4725f4551fffe218e93ed913fe29b Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Thu, 16 Oct 2025 14:50:53 -0700 Subject: [PATCH 65/66] libibverbs: Extend ibv_wq to support UET resource index UET associates multiple receive queues with a single queue pair. In UET terms, a QP maps to a PIDonFEP, and the receive queues are known as resource indices. Resource indices allow for receive side resources to be separated, such that they may be dedicated to separate services (e.g. MPI, CCL, storage). To support separate resources, we reuse the verbs work queue objects (ibv_wq). The API is extended slightly for UET. First, we add an extended device attribute, max_rwq_per_qp, to limit the number of WQs which may be associated with a QP. Secondly, we extend the WQ attributes to allow the user to select the wq_num (i.e. UET resource index) associated with a WQ. It is the responsibility of higher-level SW to allocate, configure, and associate WQs with QPs, so that the QP is assigned the correct number of WQs with the necessary addresses.
Signed-off-by: Sean Hefty --- libibverbs/verbs.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 4d870e17a..bcf0f3ab7 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -407,6 +407,7 @@ struct ibv_device_attr_ex { uint32_t max_job_ids; uint32_t max_addr_entries; uint32_t max_jkeys_per_pd; + uint16_t max_rwq_per_qp; }; enum ibv_mtu { @@ -906,6 +907,7 @@ enum ibv_wq_type { enum ibv_wq_init_attr_mask { IBV_WQ_INIT_ATTR_FLAGS = 1 << 0, IBV_WQ_INIT_ATTR_RESERVED = 1 << 1, + IBV_WQ_INIT_ATTR_WQ_NUM = 1 << 2, }; enum ibv_wq_flags { @@ -925,6 +927,7 @@ struct ibv_wq_init_attr { struct ibv_cq *cq; uint32_t comp_mask; /* Use ibv_wq_init_attr_mask */ uint32_t create_flags; /* use ibv_wq_flags */ + uint32_t wq_num; }; enum ibv_wq_state { From 942bee0f60468350876f1a14444f5cbc513b01fd Mon Sep 17 00:00:00 2001 From: Sean Hefty Date: Fri, 24 Oct 2025 14:33:41 -0700 Subject: [PATCH 66/66] libibverbs: Update API documentation with UET job concepts Include descriptions of new objects introduced for UET: job, jkey, and address table, with verbs semantic constructs definitions. Signed-off-by: Sean Hefty --- Documentation/libibverbs.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/Documentation/libibverbs.md b/Documentation/libibverbs.md index 0f7984382..902b2d45d 100644 --- a/Documentation/libibverbs.md +++ b/Documentation/libibverbs.md @@ -349,3 +349,29 @@ Items are ordered conceptually. Applications may interact directly with the async_fd, such as waiting on the fd via select/poll, to receive notifications when an async event has been reported. + +*Job ID* +: A job ID identifies a single distributed application. The job object + is a device-level object that maps to a job ID and may be shared between + processes. The configuration of a job object, such as assigning its + job ID value, is considered a privileged operation. 
+ + Multiple job objects, each assigned the same job ID value, may be needed + to represent a single, higher-level logical job running on the network. + This may be necessary for jobs that span multiple RDMA devices, for + example, where each job object may be configured for different source + addressing. + +*Job Key* +: A job key associates a job object with a specific protection domain. This + provides secure access to the actual job ID value stored with the job + object, while restricting which memory regions data transfers to / from + that job may access. + +*Address Table* +: An address table is a virtual address array associated with a job object. + The address table allows local processes that belong to the same job to + share addressing and scalable encryption information for peer QPs. + + The address table is an optional but integrated component of a job + object.