Skip to content

Commit

Permalink
[NCCL][Test Only] test send/recv on OSS
Browse files Browse the repository at this point in the history
Pull Request resolved: #45140

Test only
ghstack-source-id: 112964211

Differential Revision: [D23844388](https://our.internmc.facebook.com/intern/diff/D23844388/)
  • Loading branch information
mingzhe0908 committed Sep 26, 2020
1 parent 91c876e commit 43c9873
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 3 deletions.
4 changes: 4 additions & 0 deletions torch/csrc/cuda/nccl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,14 @@ struct AutoNcclGroup {
(c10::cuda::CUDACachingAllocator::getFreeMutex())->lock();
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
NCCL_CHECK(from_nccl_result(ncclGroupStart()));
std::cout << "AutoNcclGroup start nccl.cpp" << std::endl;
#endif
std::cout << "group guard?" << std::endl;
}
// Closes the NCCL group opened in the constructor (ncclGroupStart above),
// then releases the CUDA caching allocator's free mutex that the
// constructor locked. Lock/group lifetimes are tied to this guard's scope.
~AutoNcclGroup() {
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
// Submit the batched NCCL calls; NCCL_CHECK surfaces a non-success
// ncclResult_t (presumably by throwing — TODO confirm macro definition).
NCCL_CHECK(from_nccl_result(ncclGroupEnd()));
// NOTE(review): test-only debug print from this "[Test Only]" commit;
// std::endl also forces a flush — remove before landing for real.
std::cout << "AutoNcclGroup end nccl.cpp" << std::endl;
#endif
// Matches the lock() taken in the constructor.
(c10::cuda::CUDACachingAllocator::getFreeMutex())->unlock();
}
Expand Down Expand Up @@ -420,6 +423,7 @@ void broadcast(
count_max,
")");
ncclComm_t comm = comms[i];
std::cout << "issue input: " << i << std::endl;
NCCL_CHECK(from_nccl_result(ncclBcast(
tensors[i].data_ptr(), numel, data_type, 0, *(to_nccl_comm(&comm)), stream)));
}
Expand Down
3 changes: 0 additions & 3 deletions torch/lib/c10d/NCCLUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
#define ENABLE_NCCL_ERROR_CHECKING
#endif

// Fix build issues with NCCL P2P - until then disable NCCL send/recv.
#if defined(ENABLE_NCCL_A2A) && (ENABLE_NCCL_A2A == 1)
// P2P is enabled only for NCCL versions 2.7+ since ncclSend()
// and ncclRecv() are not supported in earlier versions.
#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \
Expand All @@ -27,7 +25,6 @@
#elif defined(NCCL_MAJOR) && (NCCL_MAJOR >= 3)
#define ENABLE_NCCL_P2P_SUPPORT
#endif
#endif

// Macro to throw on a non-successful NCCL return value.
#define C10D_NCCL_CHECK(cmd) \
Expand Down
4 changes: 4 additions & 0 deletions torch/lib/c10d/ProcessGroupNCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@ struct AutoNcclGroup {
(c10::cuda::CUDACachingAllocator::getFreeMutex())->lock();
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
C10D_NCCL_CHECK(ncclGroupStart());
std::cout << "AutoNcclGroup start" << std::endl;
#endif
}
// Ends the NCCL group begun in the constructor and releases the CUDA
// caching allocator's free mutex locked there. Declared noexcept(false)
// because C10D_NCCL_CHECK throws on a non-successful NCCL return value
// (see the macro's comment in NCCLUtils.hpp) and destructors default to
// noexcept in C++11+.
~AutoNcclGroup() noexcept(false) {
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
// Submit all NCCL calls batched since ncclGroupStart().
C10D_NCCL_CHECK(ncclGroupEnd());
// NOTE(review): test-only debug print from this "[Test Only]" commit;
// std::endl also flushes — remove before landing for real.
std::cout << "AutoNcclGroup end" << std::endl;
#endif
// Matches the lock() taken in the constructor.
(c10::cuda::CUDACachingAllocator::getFreeMutex())->unlock();
}
Expand Down Expand Up @@ -173,6 +175,7 @@ ncclResult_t ncclAlltoall(
ncclDataType_t type,
ncclComm_t comm,
cudaStream_t stream) {
std::cout << "ncclAlltoall?" << std::endl;
int numranks;
size_t rankdiff = count * size;
C10D_NCCL_CHECK(ncclCommCount(comm, &numranks));
Expand Down Expand Up @@ -1027,6 +1030,7 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::collective(
for (size_t i = 0; i < inputs.size(); ++i) {
gpuGuard.set_index(devices[i].index());
at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i];
std::cout << "schedule input" << i << std::endl;
C10D_NCCL_CHECK(
fn(inputs[i], outputs[i], ncclComms[i]->getNcclComm(), ncclStream));
}
Expand Down

0 comments on commit 43c9873

Please sign in to comment.