From 3865ef270198ae236303587bb750355e427fe45d Mon Sep 17 00:00:00 2001 From: Tristan Rice Date: Fri, 9 May 2025 13:16:19 -0700 Subject: [PATCH] gloo/ibverbs: enable gpudirect with new nvidia drivers (#438) Summary: This is needed so we can detect whether the Gloo backend can communicate directly with the GPU or whether we need to transfer the data to the CPU before doing accelerated collectives. Reviewed By: fduwjj Differential Revision: D74495060 --- gloo/transport/ibverbs/device.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/gloo/transport/ibverbs/device.cc b/gloo/transport/ibverbs/device.cc index 9c18a7b82..ef0d5a123 100644 --- a/gloo/transport/ibverbs/device.cc +++ b/gloo/transport/ibverbs/device.cc @@ -24,6 +24,15 @@ namespace gloo { namespace transport { namespace ibverbs { +namespace { +bool hasNvPeerMem() { + const auto& modules = kernelModules(); + return modules.count("nv_peer_mem") > 0 || + // Newer nvidia drivers use a different module name + modules.count("nvidia_peermem") > 0; +} +} // namespace + // Scope guard for ibverbs device list. class IbvDevices { public: @@ -112,7 +121,7 @@ std::shared_ptr<::gloo::transport::Device> CreateDevice( Device::Device(const struct attr& attr, ibv_context* context) : attr_(attr), pciBusID_(infinibandToBusID(attr.name)), - hasNvPeerMem_(kernelModules().count("nv_peer_mem") > 0), + hasNvPeerMem_(hasNvPeerMem()), context_(context) { int rv;