Make CUDA GDR auto-detect the mapping from GPUs to InfiniBand NICs (#258)

Summary:
Pull Request resolved: #258

GPU machines with InfiniBand often have several NICs, and GPUs need to be carefully mapped to them to avoid bottlenecks and congestion on the PCI links. Until now, for simplicity, we asked the user to provide such a mapping when constructing the context, but this of course isn't practical. In this diff I introduce auto-detection for that mapping, by associating each GPU with the NIC that shares the longest prefix in the PCI "path".

I'm keeping the option to override this auto-detection and still manually specify a mapping, as a sort of "killswitch" should this logic not work for everyone.
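To illustrate the matching rule, here is a minimal self-contained sketch; the sysfs paths are hypothetical examples (not taken from this commit), and the helper mirrors the commonPrefixLength function added in the diff below:

#include <algorithm>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Longest common prefix of two strings, i.e. the index of the first character
// on which they differ.
size_t commonPrefixLength(const std::string& a, const std::string& b) {
  size_t maxLength = std::min(a.size(), b.size());
  for (size_t idx = 0; idx < maxLength; idx++) {
    if (a[idx] != b[idx]) {
      return idx;
    }
  }
  return maxLength;
}

int main() {
  // Hypothetical PCI paths: the GPU and mlx5_0 sit behind the same switch
  // (0000:00:02.0), while mlx5_1 hangs off a different root port.
  std::string gpuPciPath = "/sys/devices/pci0000:00/0000:00:02.0/0000:05:00.0";
  std::vector<std::pair<std::string, std::string>> nics = {
      {"mlx5_0", "/sys/devices/pci0000:00/0000:00:02.0/0000:06:00.0"},
      {"mlx5_1", "/sys/devices/pci0000:00/0000:00:03.0/0000:0a:00.0"},
  };

  // Pick the NIC whose path shares the longest prefix with the GPU's path.
  const std::string* bestName = nullptr;
  size_t bestLength = 0;
  for (const auto& nic : nics) {
    size_t length = commonPrefixLength(gpuPciPath, nic.second);
    if (bestName == nullptr || length > bestLength) {
      bestName = &nic.first;
      bestLength = length;
    }
  }
  std::cout << "GPU matched to " << *bestName << std::endl;  // prints mlx5_0
}

A longer shared prefix corresponds to a deeper common ancestor in the PCI tree, so this picks the NIC that is reachable from the GPU through the fewest shared switches.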

Reviewed By: beauby

Differential Revision: D25824239

fbshipit-source-id: 190254c342783e6f5facca7ca1717b863b08168d
lw authored and facebook-github-bot committed Jan 11, 2021
1 parent cd1c598 commit ea17890
Showing 5 changed files with 153 additions and 11 deletions.
2 changes: 1 addition & 1 deletion tensorpipe/channel/cuda_gdr/context.cc
@@ -20,7 +20,7 @@ namespace tensorpipe {
namespace channel {
namespace cuda_gdr {

-Context::Context(std::vector<std::string> gpuIdxToNicName)
+Context::Context(optional<std::vector<std::string>> gpuIdxToNicName)
     : impl_(std::make_shared<ContextImpl>(std::move(gpuIdxToNicName))) {}

// Explicitly define all methods of the context, which just forward to the impl.
4 changes: 3 additions & 1 deletion tensorpipe/channel/cuda_gdr/context.h
@@ -13,6 +13,7 @@
#include <vector>

#include <tensorpipe/channel/cuda_context.h>
+#include <tensorpipe/common/optional.h>
#include <tensorpipe/transport/context.h>

namespace tensorpipe {
@@ -23,7 +24,8 @@

class Context : public CudaContext {
 public:
-  explicit Context(std::vector<std::string> gpuIdxToNicName);
+  explicit Context(
+      optional<std::vector<std::string>> gpuIdxToNicName = nullopt);

  Context(const Context&) = delete;
  Context(Context&&) = delete;
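As a usage sketch for the new signature (the NIC names here are hypothetical): omitting the argument enables auto-detection, while passing an explicit vector keeps the old manual behavior as the override mentioned in the summary.

#include <memory>
#include <string>
#include <utility>
#include <vector>

#include <tensorpipe/channel/cuda_gdr/context.h>

int main() {
  // New default: auto-detect the GPU-to-NIC mapping from the PCI topology.
  auto autoContext = std::make_shared<tensorpipe::channel::cuda_gdr::Context>();

  // Manual override ("killswitch"): one NIC name per GPU index.
  std::vector<std::string> gpuIdxToNicName = {"mlx5_0", "mlx5_1"};
  auto manualContext = std::make_shared<tensorpipe::channel::cuda_gdr::Context>(
      std::move(gpuIdxToNicName));
}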
150 changes: 145 additions & 5 deletions tensorpipe/channel/cuda_gdr/context_impl.cc
@@ -9,6 +9,8 @@
#include <tensorpipe/channel/cuda_gdr/context_impl.h>

#include <array>
+#include <climits>
+#include <cstdlib>
#include <functional>
#include <string>
#include <tuple>
@@ -19,6 +21,7 @@
#include <vector>

#include <cuda.h>
+#include <cuda_runtime.h>

#include <tensorpipe/channel/cuda_gdr/channel_impl.h>
#include <tensorpipe/channel/cuda_gdr/error.h>
@@ -52,6 +55,117 @@ auto applyFunc(IbvNic& subject, TMethod&& method, TArgsTuple&& args) {
std::tuple_size<std::remove_reference_t<TArgsTuple>>::value>{});
}

+// The PCI topology is a tree, with the root being the host bridge, the leaves
+// being the devices, and the other nodes being switches. We want to match each
+// GPU to the InfiniBand NIC with which it shares the longest "prefix" in this
+// tree, as that will route the data transfer away from the most "central"
+// switches and from the host bridge. We extract the "path" of a device in the
+// PCI tree by obtaining its "canonical" path in Linux's sysfs, which contains
+// one component for each other device that is traversed. The format of such a
+// path is /sys/devices/pci0123:45(/0123:45:67.8)+
+// See https://www.kernel.org/doc/ols/2005/ols2005v1-pages-321-334.pdf for more
+// info on sysfs.
+
+const std::string kPciPathPrefix = "/sys/devices/pci";
+
+std::string getPciPathForIbvNic(const std::string& nicName) {
+  std::array<char, PATH_MAX> pciPath;
+  char* rv = ::realpath(
+      ("/sys/class/infiniband/" + nicName + "/device").c_str(), pciPath.data());
+  TP_THROW_SYSTEM_IF(rv == nullptr, errno);
+  TP_DCHECK(rv == pciPath.data());
+
+  std::string res(pciPath.data());
+  TP_DCHECK(res.substr(0, kPciPathPrefix.size()) == kPciPathPrefix)
+      << "Bad PCI path for InfiniBand NIC " << nicName << ": " << res;
+  return res;
+}
+
+std::string getPciPathForGpu(int gpuIdx) {
+  // The CUDA documentation says the ID will consist of a domain (16 bits), a
+  // bus (8 bits), a device (5 bits) and a function (3 bits). When represented
+  // as hex, including the separators and the null terminator, this takes up 13
+  // bytes. However NCCL seems to suggest that sometimes the domain takes twice
+  // that size, and hence 17 bytes are necessary.
+  // https://github.com/NVIDIA/nccl/blob/c6dbdb00849027b4e2c277653cbef53729f7213d/src/misc/utils.cc#L49-L53
+  std::array<char, 17> pciDeviceId;
+  TP_CUDA_CHECK(
+      cudaDeviceGetPCIBusId(pciDeviceId.data(), pciDeviceId.size(), gpuIdx));
+
+  // Fun fact: CUDA seems to format hex letters as uppercase, but Linux's sysfs
+  // expects them as lowercase.
+  for (char& c : pciDeviceId) {
+    if ('A' <= c && c <= 'F') {
+      c = c - 'A' + 'a';
+    }
+  }
+
+  std::array<char, PATH_MAX> pciPath;
+  char* rv = ::realpath(
+      ("/sys/bus/pci/devices/" + std::string(pciDeviceId.data())).c_str(),
+      pciPath.data());
+  TP_THROW_SYSTEM_IF(rv == nullptr, errno);
+  TP_DCHECK(rv == pciPath.data());
+
+  std::string res(pciPath.data());
+  TP_DCHECK(res.substr(0, kPciPathPrefix.size()) == kPciPathPrefix)
+      << "Bad PCI path for GPU #" << gpuIdx << ": " << res;
+  return res;
+}
+
+size_t commonPrefixLength(const std::string& a, const std::string& b) {
+  // The length of the longest common prefix is the index of the first char on
+  // which the two strings differ.
+  size_t maxLength = std::min(a.size(), b.size());
+  for (size_t idx = 0; idx < maxLength; idx++) {
+    if (a[idx] != b[idx]) {
+      return idx;
+    }
+  }
+  return maxLength;
+}
+
+std::vector<std::string> matchGpusToIbvNics(
+    IbvLib& ibvLib,
+    IbvDeviceList& deviceList) {
+  struct NicInfo {
+    std::string name;
+    std::string pciPath;
+  };
+  std::vector<NicInfo> nicInfos;
+  for (size_t deviceIdx = 0; deviceIdx < deviceList.size(); deviceIdx++) {
+    IbvLib::device& device = deviceList[deviceIdx];
+    std::string deviceName(TP_CHECK_IBV_PTR(ibvLib.get_device_name(&device)));
+    std::string pciPath = getPciPathForIbvNic(deviceName);
+    TP_VLOG(5) << "Resolved InfiniBand NIC " << deviceName << " to PCI path "
+               << pciPath;
+    nicInfos.push_back(NicInfo{std::move(deviceName), std::move(pciPath)});
+  }
+
+  int numGpus;
+  TP_CUDA_CHECK(cudaGetDeviceCount(&numGpus));
+
+  std::vector<std::string> gpuIdxToIbvNicName;
+  for (int gpuIdx = 0; gpuIdx < numGpus; gpuIdx++) {
+    std::string gpuPciPath = getPciPathForGpu(gpuIdx);
+    TP_VLOG(5) << "Resolved GPU #" << gpuIdx << " to PCI path " << gpuPciPath;
+    ssize_t bestMatchLength = -1;
+    const std::string* bestMatchName = nullptr;
+    for (const auto& nicInfo : nicInfos) {
+      ssize_t matchLength = commonPrefixLength(gpuPciPath, nicInfo.pciPath);
+      if (matchLength > bestMatchLength) {
+        bestMatchLength = matchLength;
+        bestMatchName = &nicInfo.name;
+      }
+    }
+    TP_DCHECK_GE(bestMatchLength, 0);
+    TP_DCHECK(bestMatchName != nullptr);
+    gpuIdxToIbvNicName.push_back(*bestMatchName);
+  }
+
+  return gpuIdxToIbvNicName;
+}
+
} // namespace

IbvNic::IbvNic(
@@ -220,7 +334,7 @@ void IbvNic::setId(std::string id) {
  id_ = std::move(id);
}

-ContextImpl::ContextImpl(std::vector<std::string> gpuIdxToNicName)
+ContextImpl::ContextImpl(optional<std::vector<std::string>> gpuIdxToNicName)
    : ContextImplBoilerplate<CudaBuffer, ContextImpl, ChannelImpl>("*") {
  Error error;

@@ -249,12 +363,38 @@ ContextImpl::ContextImpl(std::vector<std::string> gpuIdxToNicName)
  // TODO Check whether the NVIDIA memory peering kernel module is available.
  // And maybe even allocate and register some CUDA memory to ensure it works.

+  IbvDeviceList deviceList(getIbvLib());
+  if (deviceList.size() == 0) {
+    TP_VLOG(5) << "Channel context " << id_
+               << " is not viable because it couldn't find any InfiniBand NICs";
+    viable_ = false;
+    return;
+  }
+
+  std::vector<std::string> actualGpuIdxToNicName;
+  if (gpuIdxToNicName.has_value()) {
+    int numGpus;
+    TP_CUDA_CHECK(cudaGetDeviceCount(&numGpus));
+    TP_THROW_ASSERT_IF(numGpus != gpuIdxToNicName->size())
+        << "The mapping from GPUs to InfiniBand NICs contains an unexpected "
+        << "number of items: found " << gpuIdxToNicName->size() << ", expected "
+        << numGpus;
+
+    actualGpuIdxToNicName = std::move(gpuIdxToNicName.value());
+  } else {
+    actualGpuIdxToNicName = matchGpusToIbvNics(ibvLib_, deviceList);
+  }
+
+  for (int gpuIdx = 0; gpuIdx < actualGpuIdxToNicName.size(); gpuIdx++) {
+    TP_VLOG(5) << "Channel context " << id_ << " mapped GPU #" << gpuIdx
+               << " to InfiniBand NIC " << actualGpuIdxToNicName[gpuIdx];
+  }
+
  std::unordered_set<std::string> nicNames;
-  for (const auto& nicName : gpuIdxToNicName) {
+  for (const auto& nicName : actualGpuIdxToNicName) {
    nicNames.insert(nicName);
  }

-  IbvDeviceList deviceList(getIbvLib());
  std::unordered_map<std::string, size_t> nicNameToNicIdx;
  // The device index is among all available devices, the NIC index is among the
  // ones we will use.
@@ -275,8 +415,8 @@ ContextImpl::ContextImpl(std::vector<std::string> gpuIdxToNicName)
  TP_THROW_ASSERT_IF(!nicNames.empty())
      << "Couldn't find all the devices I was supposed to use";

-  for (size_t gpuIdx = 0; gpuIdx < gpuIdxToNicName.size(); gpuIdx++) {
-    gpuToNic_.push_back(nicNameToNicIdx[gpuIdxToNicName[gpuIdx]]);
+  for (size_t gpuIdx = 0; gpuIdx < actualGpuIdxToNicName.size(); gpuIdx++) {
+    gpuToNic_.push_back(nicNameToNicIdx[actualGpuIdxToNicName[gpuIdx]]);
  }

  startThread("TP_CUDA_GDR_loop");
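To make the sysfs step concrete, the following Linux-only sketch performs the same resolution as getPciPathForIbvNic above; the NIC name is a hypothetical example, and plain error handling stands in for the TP_* macros:

#include <array>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

int main() {
  // Hypothetical NIC name; real ones are listed under /sys/class/infiniband/.
  std::string nicName = "mlx5_0";
  std::array<char, PATH_MAX> pciPath;
  // realpath(3) canonicalizes the symlink into the full PCI "path", e.g.
  // /sys/devices/pci0000:00/0000:00:02.0/0000:06:00.0, with one component per
  // PCI device traversed from the host bridge down to the NIC.
  char* rv = ::realpath(
      ("/sys/class/infiniband/" + nicName + "/device").c_str(), pciPath.data());
  if (rv == nullptr) {
    std::perror("realpath");
    return 1;
  }
  std::cout << pciPath.data() << std::endl;
}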
4 changes: 3 additions & 1 deletion tensorpipe/channel/cuda_gdr/context_impl.h
@@ -27,6 +27,7 @@
#include <tensorpipe/common/cuda_lib.h>
#include <tensorpipe/common/error.h>
#include <tensorpipe/common/ibv.h>
+#include <tensorpipe/common/optional.h>
#include <tensorpipe/transport/context.h>

namespace tensorpipe {
@@ -123,7 +124,8 @@ class ContextImpl final
    : public BusyPollingLoop,
      public ContextImplBoilerplate<CudaBuffer, ContextImpl, ChannelImpl> {
 public:
-  explicit ContextImpl(std::vector<std::string> gpuIdxToNicName);
+  explicit ContextImpl(
+      optional<std::vector<std::string>> gpuIdxToNicName = nullopt);

  std::shared_ptr<CudaChannel> createChannel(
      std::shared_ptr<transport::Connection> connection,
4 changes: 1 addition & 3 deletions tensorpipe/test/channel/cuda_gdr/cuda_gdr_test.cc
@@ -18,9 +18,7 @@ class CudaGdrChannelTestHelper
 public:
  std::shared_ptr<tensorpipe::channel::CudaContext> makeContext(
      std::string id) override {
-    std::vector<std::string> gpuToNicName = {"mlx5_0", "mlx5_0"};
-    auto context =
-        std::make_shared<tensorpipe::channel::cuda_gdr::Context>(gpuToNicName);
+    auto context = std::make_shared<tensorpipe::channel::cuda_gdr::Context>();
    context->setId(std::move(id));
    return context;
  }
