Implementation of torch::cuda::synchronize (#50072)
Summary:
Adding `torch::cuda::synchronize()` to libtorch. Note that the implementation here adds a new method to the `CUDAHooksInterface`. An alternative that was suggested to me is to add a method to the `DeviceGuard` interface.
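For illustration, a minimal usage sketch of the new API from libtorch C++ code might look like the following (this example is not part of the diff; it assumes a CUDA-enabled build):

```cpp
#include <torch/torch.h>

int main() {
  if (torch::cuda::is_available()) {
    // Kernel launches are asynchronous with respect to the host.
    auto x = torch::randn({1024, 1024}, torch::kCUDA);
    auto y = x.matmul(x);

    // Block the host until all streams on the current device finish
    // (device_index defaults to -1, i.e. the current device).
    torch::cuda::synchronize();

    // Or synchronize a specific device by index.
    torch::cuda::synchronize(0);
  }
  return 0;
}
```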

Fixes #47722

Pull Request resolved: #50072

Reviewed By: H-Huang

Differential Revision: D25804342

Pulled By: jbschlosser

fbshipit-source-id: 45aa61d7c6fbfd3178caf2eb5ec053d6c01b5a43
jbschlosser authored and facebook-github-bot committed Jan 6, 2021
1 parent e606e60 commit 7d9eb6c
Showing 5 changed files with 22 additions and 0 deletions.
5 changes: 5 additions & 0 deletions aten/src/ATen/cuda/detail/CUDAHooks.cpp
@@ -369,6 +369,11 @@ int CUDAHooks::getNumGPUs() const {
  return at::cuda::device_count();
}

void CUDAHooks::deviceSynchronize(int64_t device_index) const {
  at::DeviceGuard device_guard(at::Device(at::DeviceType::CUDA, device_index));
  c10::cuda::device_synchronize();
}

// Sigh, the registry doesn't support namespaces :(
using at::CUDAHooksRegistry;
using at::RegistererCUDAHooksRegistry;
1 change: 1 addition & 0 deletions aten/src/ATen/cuda/detail/CUDAHooks.h
@@ -38,6 +38,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
  int64_t cuFFTGetPlanCacheSize(int64_t device_index) const override;
  void cuFFTClearPlanCache(int64_t device_index) const override;
  int getNumGPUs() const override;
  void deviceSynchronize(int64_t device_index) const override;
};

}}} // at::cuda::detail
4 changes: 4 additions & 0 deletions aten/src/ATen/detail/CUDAHooksInterface.h
@@ -181,6 +181,10 @@ struct TORCH_API CUDAHooksInterface {
  virtual int getNumGPUs() const {
    return 0;
  }

  virtual void deviceSynchronize(int64_t device_index) const {
    TORCH_CHECK(false, "Cannot synchronize CUDA device without ATen_cuda library. ", CUDA_HELP);
  }
};

// NB: dummy argument to suppress "ISO C++11 requires at least one argument
3 changes: 3 additions & 0 deletions torch/csrc/api/include/torch/cuda.h
@@ -23,5 +23,8 @@ void TORCH_API manual_seed(uint64_t seed);
/// Sets the seed for all available GPUs.
void TORCH_API manual_seed_all(uint64_t seed);

/// Waits for all kernels in all streams on a CUDA device to complete.
void TORCH_API synchronize(int64_t device_index = -1);

} // namespace cuda
} // namespace torch
9 changes: 9 additions & 0 deletions torch/csrc/api/src/cuda.cpp
@@ -1,6 +1,7 @@
#include <torch/cuda.h>

#include <ATen/Context.h>
#include <c10/core/DeviceGuard.h>

#include <cstddef>

@@ -49,5 +50,13 @@ void manual_seed_all(uint64_t seed) {
  }
}

void synchronize(int64_t device_index) {
  TORCH_CHECK(is_available(), "No CUDA GPUs are available");
  int64_t num_gpus = cuda::device_count();
  TORCH_CHECK(device_index == -1 || device_index < num_gpus,
      "Device index out of range: ", device_index);
  at::detail::getCUDAHooks().deviceSynchronize(device_index);
}

} // namespace cuda
} // namespace torch
