Skip to content

Commit

Permalink
[NCCL][Test Only] test send/recv on OSS
Browse files Browse the repository at this point in the history
Pull Request resolved: #45140

Test only
ghstack-source-id: 112964211

Differential Revision: [D23844388](https://our.internmc.facebook.com/intern/diff/D23844388/)
  • Loading branch information
mingzhe0908 committed Sep 26, 2020
1 parent 91c876e commit 43c9873
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 3 deletions.
4 changes: 4 additions & 0 deletions torch/csrc/cuda/nccl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,14 @@ struct AutoNcclGroup {
(c10::cuda::CUDACachingAllocator::getFreeMutex())->lock();
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
NCCL_CHECK(from_nccl_result(ncclGroupStart()));
std::cout << "AutoNcclGroup start nccl.cpp" << std::endl;
#endif
std::cout << "group guard?" << std::endl;
}
// Closes the NCCL group opened in the constructor (ncclGroupStart above),
// then releases the CUDA caching allocator's free mutex that the
// constructor locked. Lock/group lifetimes are tied to this guard's scope.
~AutoNcclGroup() {
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
// Submit the batched NCCL calls; NCCL_CHECK surfaces a non-success
// ncclResult_t (presumably by throwing — TODO confirm macro definition).
NCCL_CHECK(from_nccl_result(ncclGroupEnd()));
// NOTE(review): test-only debug print from this "[Test Only]" commit;
// std::endl also forces a flush — remove before landing for real.
std::cout << "AutoNcclGroup end nccl.cpp" << std::endl;
#endif
// Matches the lock() taken in the constructor.
(c10::cuda::CUDACachingAllocator::getFreeMutex())->unlock();
}
Expand Down Expand Up @@ -420,6 +423,7 @@ void broadcast(
count_max,
")");
ncclComm_t comm = comms[i];
std::cout << "issue input: " << i << std::endl;
NCCL_CHECK(from_nccl_result(ncclBcast(
tensors[i].data_ptr(), numel, data_type, 0, *(to_nccl_comm(&comm)), stream)));
}
Expand Down
3 changes: 0 additions & 3 deletions torch/lib/c10d/NCCLUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
#define ENABLE_NCCL_ERROR_CHECKING
#endif

// Fix build issues with NCCL P2P - until then disable NCCL send/recv.
#if defined(ENABLE_NCCL_A2A) && (ENABLE_NCCL_A2A == 1)
// P2P is enabled only for NCCL versions 2.7+ since ncclSend()
// and ncclRecv() are not supported in earlier versions.
#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \
Expand All @@ -27,7 +25,6 @@
#elif defined(NCCL_MAJOR) && (NCCL_MAJOR >= 3)
#define ENABLE_NCCL_P2P_SUPPORT
#endif
#endif

// Macro to throw on a non-successful NCCL return value.
#define C10D_NCCL_CHECK(cmd) \
Expand Down
4 changes: 4 additions & 0 deletions torch/lib/c10d/ProcessGroupNCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@ struct AutoNcclGroup {
(c10::cuda::CUDACachingAllocator::getFreeMutex())->lock();
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
C10D_NCCL_CHECK(ncclGroupStart());
std::cout << "AutoNcclGroup start" << std::endl;
#endif
}
// Ends the NCCL group begun in the constructor and releases the CUDA
// caching allocator's free mutex locked there. Declared noexcept(false)
// because C10D_NCCL_CHECK throws on a non-successful NCCL return value
// (see the macro's comment in NCCLUtils.hpp) and destructors default to
// noexcept in C++11+.
~AutoNcclGroup() noexcept(false) {
#if defined(NCCL_MAJOR) && (NCCL_MAJOR >= 2)
// Submit all NCCL calls batched since ncclGroupStart().
C10D_NCCL_CHECK(ncclGroupEnd());
// NOTE(review): test-only debug print from this "[Test Only]" commit;
// std::endl also flushes — remove before landing for real.
std::cout << "AutoNcclGroup end" << std::endl;
#endif
// Matches the lock() taken in the constructor.
(c10::cuda::CUDACachingAllocator::getFreeMutex())->unlock();
}
Expand Down Expand Up @@ -173,6 +175,7 @@ ncclResult_t ncclAlltoall(
ncclDataType_t type,
ncclComm_t comm,
cudaStream_t stream) {
std::cout << "ncclAlltoall?" << std::endl;
int numranks;
size_t rankdiff = count * size;
C10D_NCCL_CHECK(ncclCommCount(comm, &numranks));
Expand Down Expand Up @@ -1027,6 +1030,7 @@ std::shared_ptr<ProcessGroup::Work> ProcessGroupNCCL::collective(
for (size_t i = 0; i < inputs.size(); ++i) {
gpuGuard.set_index(devices[i].index());
at::cuda::CUDAStream& ncclStream = ncclStreams_[key][i];
std::cout << "schedule input" << i << std::endl;
C10D_NCCL_CHECK(
fn(inputs[i], outputs[i], ncclComms[i]->getNcclComm(), ncclStream));
}
Expand Down

0 comments on commit 43c9873

Please sign in to comment.