pytorch · gchanan · Mar 28, 2019 · Mar 28, 2019
diff --git a/torch/csrc/distributed/c10d/ddp.cpp b/torch/csrc/distributed/c10d/ddp.cpp
@@ -3,7 +3,9 @@
 #include <torch/csrc/cuda/comm.h>
 #include <torch/csrc/utils/tensor_flatten.h>
 
+#ifdef USE_C10D_NCCL
 #include <torch/csrc/cuda/nccl.h>
+#endif
 
 #include <c10d/ProcessGroup.hpp>
 
@@ -129,6 +131,11 @@ std::tuple<std::shared_ptr<ProcessGroup::Work>, at::Tensor> queueReduction(
     ProcessGroup& processGroup,
     std::vector<std::vector<at::Tensor>>& gradsBatch,
     const std::vector<int64_t>& devices) {
+#ifndef USE_C10D_NCCL
+  if (devices.size() > 1) {
+    AT_ERROR("queueReduction with more than 1 device not suppported without NCCL");
+  }
+#endif
   AT_ASSERT(!gradsBatch.empty());
   AT_ASSERT(!devices.empty());
 
@@ -160,7 +167,11 @@ std::tuple<std::shared_ptr<ProcessGroup::Work>, at::Tensor> queueReduction(
   }
 
   if (devices.size() > 1) {
+#ifdef USE_C10D_NCCL
     torch::cuda::nccl::reduce(gradsBatchCoalesced, 0);
+#else
+    AT_ERROR("shouldn't have gotten here -- queueReduction not suppported without NCCL");
+#endif
   }
 
   gradsBatchCoalesced[0] /= processGroup.getSize();