
Remove NCCL dependency from PythonFutureWrapper #48495

Closed
wants to merge 8 commits
34 changes: 28 additions & 6 deletions aten/src/ATen/core/ivalue_inl.h
@@ -417,12 +417,34 @@ struct C10_EXPORT ivalue::Future : c10::intrusive_ptr_target {
return fut;
}

// Since this file cannot import CUDA dependency, the type of the second arg
// in the callback is c10::Stream instead of at::cuda::CUDAStream, and
// CUDAStream is constructed on the fly. The default implementation
// is a no-op, since it does not deal with any CUDA streams.
virtual void setRecordStreamCallback(
std::function<void(const at::IValue&, const c10::Stream&)> record_stream_cb) {}
// Some subclasses deal with CUDA tensors and must inform the CUDA caching
// allocator of which CUDA streams each DataPtr is used in. If the value held
// by the future is a Python object, we need to acquire the GIL when extracting
// these DataPtrs. Since this file cannot depend on Python, we allow users to
// provide a "custom" extractor. See, for example, the PythonFutureWrapper.
using DataPtrExtractor =
std::function<std::vector<std::reference_wrapper<const at::DataPtr>>(
const at::IValue&)>;
virtual void setDataPtrExtractor(DataPtrExtractor data_ptr_extractor) {}
Contributor:
I assume this is an intermediate state, as setDataPtrExtractor exists in the base class, but dataPtrExtractor_ only lives in subclasses? If this will be the long-term solution, do we need to rename this function? Otherwise setDataPtrExtractor doesn't do what its name suggests in non-FutureNCCL classes.

Contributor (Author):
To be honest, I was thinking of this as a long-term solution. Well, actually, I didn't think about it too much, because this is basically how it was already done (the setRecordStreamCallback was a no-op in ivalue::Future and was only implemented by FutureNCCL). I was fine with such a solution as I read the semantics of this method basically as "setDataPtrExtractorIfNeeded".

Also, later on I'll do something similar to this in order to merge some FutureNCCL logic into ivalue::Future: I'll define (protected) virtual methods that are left unimplemented in ivalue::Future and only do something when overridden by the FutureNCCL subclass. Admittedly that's not exactly the same, as those hooks are not part of the public interface.

I recognize that these solutions are not the nicest ones, but the hook one was the safest one I could find (minimum code duplication and protection from later updates to ivalue::Future). I'm not as attached to the DataPtrExtractor though, and I'd be happy to hear alternative proposals.

I've also only just realized that DataPtrExtractor will run into another issue once we support multi-GPU (in #48500), since it will then be used in two places (by the "parent" future, inside then, and by the "child" future, inside markCompleted). Thus we'll probably need the parent future to propagate its DataPtrExtractor to the child future, so that if the child future completes immediately (before it's wrapped in a PythonFutureWrapper) it already has the right DataPtrExtractor. This will be a bit tricky to get right, especially if multiple threads are at play and we need to protect against race conditions.
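A minimal sketch of the hook pattern described above, with illustrative names rather than the real ivalue::Future/FutureNCCL types: the base class exposes a virtual setter that is a no-op, and only the CUDA-aware subclass stores the extractor and forwards it to child futures it creates.

#include <functional>
#include <memory>
#include <utility>
#include <vector>

// Stand-ins for at::IValue and at::DataPtr; this is a sketch, not PyTorch code.
struct IValue {};
struct DataPtr {};

struct BaseFuture {
  using DataPtrExtractor =
      std::function<std::vector<std::reference_wrapper<const DataPtr>>(
          const IValue&)>;

  // No-op in the base class: a CPU-only future has no streams to record,
  // so it simply ignores the extractor ("setDataPtrExtractorIfNeeded").
  virtual void setDataPtrExtractor(DataPtrExtractor /*extractor*/) {}
  virtual ~BaseFuture() = default;
};

struct CudaAwareFuture : BaseFuture {
  void setDataPtrExtractor(DataPtrExtractor extractor) override {
    dataPtrExtractor_ = std::move(extractor);
  }

  // A then()-like helper would also need to forward the extractor to the
  // child future, so that a child completing before it is wrapped in a
  // PythonFutureWrapper already knows how to extract DataPtrs.
  std::shared_ptr<CudaAwareFuture> createChild() {
    auto child = std::make_shared<CudaAwareFuture>();
    child->setDataPtrExtractor(dataPtrExtractor_);
    return child;
  }

 private:
  DataPtrExtractor dataPtrExtractor_;
};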

Contributor:
> And thus we'll probably need the parent future to propagate its DataPtrExtractor to the child future, so that if the child future completes immediately (before it's wrapped in a PythonFutureWrapper) it already has the right DataPtrExtractor.

I assume this means the child Future created by .then() would always be the same type (CPU/CUDA) as the parent Future?

Contributor (Author):
> I assume this means the child Future created by .then() would always be the same type (CPU/CUDA) as the parent Future?

That's indeed the case. I didn't give it much thought; do you think it could present a problem? Since the CUDAFuture is a "generalization" of ivalue::Future (and, in fact, it behaves exactly the same when the vector of CUDAEvents is empty), it should be perfectly fine to attach a CPU-only callback to a CUDAFuture. Issues would start to arise if one wants to attach a CUDA callback to a CPU-only ivalue::Future. I'm not sure how we would tackle that...

Contributor:
> Issues would start to arise if one wants to attach a CUDA callback to a CPU-only ivalue::Future.

The current version LGTM. If users hit this, we can fix it later.


// Expose the default implementation so that external ones can defer to it.
static std::vector<std::reference_wrapper<const at::DataPtr>>
defaultDataPtrExtractor(const at::IValue& value) {
// FIXME Should we support more types than just tensors and tensor lists?
Contributor:
Yes, if we are going to use this as a general CUDAFuture. But it can come in follow-up PRs.

Contributor (Author):
Indeed, I'm doing this in #48502

TORCH_INTERNAL_ASSERT(
value.isTensorList() || value.isTensor(),
"the future value must be either a tensor list or a tensor.");
at::Tensor tensor;
if (value.isTensorList()) {
const auto tensors = value.toTensorVector();
TORCH_INTERNAL_ASSERT(tensors.size() == 1, "expected exactly 1 tensor");
tensor = tensors[0];
} else {
tensor = value.toTensor();
}

return {tensor.storage().data_ptr()};
};

// Tries to retrieve the error message from std::exception_ptr.
std::string tryRetrieveErrorMessage() {
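As a side note on the FIXME thread above: a hypothetical generalization of defaultDataPtrExtractor might accept a single tensor or a tensor list of any length, rather than asserting exactly one element. The name generalizedDataPtrExtractor and the code below are only a sketch of what such a follow-up could look like, not the actual code of #48502.

// Hypothetical sketch only: walk every tensor in the value instead of
// requiring exactly one.
static std::vector<std::reference_wrapper<const at::DataPtr>>
generalizedDataPtrExtractor(const at::IValue& value) {
  std::vector<std::reference_wrapper<const at::DataPtr>> data_ptrs;
  if (value.isTensor()) {
    data_ptrs.emplace_back(value.toTensor().storage().data_ptr());
  } else if (value.isTensorList()) {
    for (const at::Tensor& tensor : value.toTensorVector()) {
      data_ptrs.emplace_back(tensor.storage().data_ptr());
    }
  } else {
    TORCH_INTERNAL_ASSERT(
        false, "the future value must be a tensor or a list of tensors");
  }
  return data_ptrs;
}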
45 changes: 19 additions & 26 deletions torch/csrc/jit/python/pybind_utils.h
@@ -119,32 +119,7 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper
// vector, but Future does not acquire GIL on destruction.
auto pf = std::make_shared<PythonFunctionGuard>(std::move(cb));

#ifdef USE_C10D_NCCL
// This callback is only used by NCCL backend, so skip this code on other
// backends and avoid importing cuda dependency.
// By default, assume that the input value is or can be casted into a tensor
// vector that has exactly one tensor.
auto record_stream_cb = [](const at::IValue& value,
const c10::Stream& stream) {
if (value.isTensorList() || value.isPyObject()) {
std::vector<at::Tensor> tensors;
if (value.isTensorList()) {
tensors = value.toTensorVector();
} else {
pybind11::gil_scoped_acquire gil;
py::object obj = torch::jit::toPyObject(value);
tensors = torch::jit::toIValue(
obj, c10::ListType::create(c10::TensorType::get()))
.toTensorVector();
}
TORCH_INTERNAL_ASSERT(tensors.size() == 1, "expected exactly 1 tensor");
at::cuda::CUDAStream cuda_stream(stream);
c10::cuda::CUDACachingAllocator::recordStream(
tensors[0].storage().data_ptr(), cuda_stream);
}
};
fut->setRecordStreamCallback(record_stream_cb);
#endif
fut->setDataPtrExtractor(&PythonFutureWrapper::dataPtrExtractor);

return std::make_shared<jit::PythonFutureWrapper>(fut->then(
// Capture a copy of the ivalue::Future instead of the `this` pointer
@@ -241,6 +216,24 @@ struct VISIBILITY_HIDDEN PythonFutureWrapper
std::shared_ptr<PythonFutureWrapper> getPtr() {
return shared_from_this();
}

// This callback is only used by subclasses of Future that deal with CUDA,
// in order to register the pointers on the right streams with the caching
// allocator.
// By default, assume that the input value is or can be cast into a tensor
// vector that has exactly one tensor.
static std::vector<std::reference_wrapper<const at::DataPtr>> dataPtrExtractor(
const at::IValue& value) {
if (value.isPyObject()) {
pybind11::gil_scoped_acquire gil;
py::object obj = torch::jit::toPyObject(value);
// FIXME Should we support more types than just tensor lists?
auto new_value = torch::jit::toIValue(
obj, c10::ListType::create(c10::TensorType::get()));
return at::ivalue::Future::defaultDataPtrExtractor(new_value);
}
return at::ivalue::Future::defaultDataPtrExtractor(value);
};
};

// error reporting: when reporting user-caused errors, these functions should
51 changes: 24 additions & 27 deletions torch/lib/c10d/ProcessGroupNCCL.hpp
@@ -314,28 +314,9 @@ class ProcessGroupNCCL : public ProcessGroup {

// Do not free the underlying data storage of value_ before its
// usage on futureNCCLCallbackStream_ finish.
if (record_stream_cb_ != nullptr) {
// If a Python communication hook is used, record_stream_cb_ will be
// set in torch/csrc/jit/python/pybind_utils.h, which allows Python
// dependency to be imported.
record_stream_cb_(value_, futureNCCLCallbackStream_->unwrap());
} else {
// If a C++ communication hook is used, create and set a record stream
// callback.
TORCH_INTERNAL_ASSERT(
value_.isTensorList() || value_.isTensor(),
"the future value must be either a tensor list or a tensor.");
at::Tensor tensor;
if (value_.isTensorList()) {
const auto tensors = value_.toTensorVector();
TORCH_INTERNAL_ASSERT(
tensors.size() == 1, "expected exactly 1 tensor");
tensor = tensors[0];
} else {
tensor = value_.toTensor();
}
for (const at::DataPtr& data_ptr : extractDataPtrs(value_)) {
c10::cuda::CUDACachingAllocator::recordStream(
Contributor:

Curious about the implications for RPC use cases. Does RPC also need to call recordStream? If yes, when? Is it that, when the tensors are retrieved from the Future (through result or wait), we should call recordStream on the current stream?

Contributor (Author):
I'm still figuring out the correct usage of the caching allocator, but I think this should work in the same way for RPC. The model I have in mind for RPC is the one we discussed in #44084 (comment). In that case, on the receiver (bottom-right quadrant of the diagram), I think we need to record streams with the caching allocator whenever we "transfer" the result to streams other than the ones we used to receive it. This would happen both when using .wait()/.value() and in callbacks (basically the points in the diagram where we say "record events"). And this is exactly what we're doing here. Does this make sense?

Contributor:
Yep, makes sense to me.

tensor.storage().data_ptr(), *futureNCCLCallbackStream_);
data_ptr, *futureNCCLCallbackStream_);
}

// Use the dedicated callback stream to run callback.
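For the recordStream discussion above, a rough sketch of what a consumer (for example an RPC callback or a wait() caller) might do when it uses the future's value on the current stream. This is illustrative only and not part of this PR; the helper name recordUseOnCurrentStream is made up.

#include <ATen/ATen.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/cuda/CUDAStream.h>

// Illustrative sketch: tell the caching allocator that `tensor` is used on the
// current stream, so its memory is not reused until that stream's work is done.
void recordUseOnCurrentStream(const at::Tensor& tensor) {
  c10::cuda::CUDAStream current_stream = c10::cuda::getCurrentCUDAStream();
  c10::cuda::CUDACachingAllocator::recordStream(
      tensor.storage().data_ptr(), current_stream);
  // ... then launch kernels that read `tensor` on current_stream ...
}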
@@ -372,20 +353,36 @@
return !value_.isNone();
}

void setRecordStreamCallback(
std::function<void(const at::IValue&, const c10::Stream&)>
record_stream_cb) override {
record_stream_cb_ = std::move(record_stream_cb);
void setDataPtrExtractor(DataPtrExtractor dataPtrExtractor) override {
std::unique_lock<std::mutex> lock(dataPtrExtractorMutex_);
dataPtrExtractor_ = std::move(dataPtrExtractor);
}

private:
at::IValue value_;
c10::DeviceIndex deviceIndex_;
std::shared_ptr<std::vector<at::cuda::CUDAEvent>> cudaEvents_;
std::shared_ptr<at::cuda::CUDAStream> futureNCCLCallbackStream_;
std::function<void(const at::IValue&, const c10::Stream&)>
record_stream_cb_;
DataPtrExtractor dataPtrExtractor_;
std::mutex dataPtrExtractorMutex_;
c10::optional<FutureError> error_;

std::vector<std::reference_wrapper<const at::DataPtr>> extractDataPtrs(
const at::IValue& value) {
std::unique_lock<std::mutex> lock(dataPtrExtractorMutex_);
std::vector<std::reference_wrapper<const at::DataPtr>> data_ptrs;
if (dataPtrExtractor_ != nullptr) {
// If a Python communication hook is used, dataPtrExtractor_ will be
// set in torch/csrc/jit/python/pybind_utils.h, which is allowed to
// depend on Python.
data_ptrs = dataPtrExtractor_(value);
} else {
// If a C++ communication hook is used, use the default extractor.
data_ptrs = at::ivalue::Future::defaultDataPtrExtractor(value);
}
TORCH_INTERNAL_ASSERT(data_ptrs.size() == 1, "expected exactly 1 tensor");
return data_ptrs;
}
};

// If you wish to create multiple process groups, each with a potentially