
Make CUDAFuture remember and restore current device in callback #48789

Status: Closed — wants to merge 4 commits
8 changes: 8 additions & 0 deletions aten/src/ATen/cuda/CUDAFuture.h
@@ -31,6 +31,8 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future {
}

void postMarkCompletedHook(const at::IValue& value) override {
currentDevice_ = c10::cuda::current_device();
Contributor:
Hmm, why are we recording the device when the future is marked as completed, instead of remembering the device that was current when the callback was inserted (through then/addCallback)?

Contributor (author):
I guess those are two different approaches to this. The "philosophy" I was following is this: then() and addCallback() are used to perform a computation after another one is complete. If we were dealing with sync operations, one would do this:

do_sth_sync()
do_sth_later()

but with async ops this needs to change and become

fut = do_sth_async()
fut.then(do_sth_later)

In the sync scenario, do_sth_later() runs in the same "environment"/"context" as do_sth_sync() (same current device, same current streams, ...). In this diff I was trying to recreate this in the async case, by "recording" the environment's state at the end of the async operation, and then "recreating" it in the callback.

Note that the approach we have for streams is somewhat similar. We do not run the callback in the streams that were current when the callback was inserted. (To be honest, we also do not run it in the streams that were current when the async op finished, as that would mean running computations in the I/O streams, but we do synchronize the "fresh" streams with those streams.)

Note also that I am not opposed to changing this behavior and replicating the "environment" that was current when the user inserted the callback. That has its own set of advantages. For streams, for example, it allows the user to very precisely control what streams the callback will use.

Contributor:
> For streams, for example, it allows the user to very precisely control what streams the callback will use.

Yep, same for device as well, especially when the callback fn is an imported function and users cannot easily change it. The current behavior should be sufficient to unblock RPC use cases. So I am OK to land this PR and modify it later if necessary.


// Extract them once and cache them for later uses.
dataPtrs_ = extractDataPtrs(value);

@@ -85,6 +87,8 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future {
}
}

c10::cuda::CUDAGuard deviceGuard(currentDevice_);

callback();
};
}
@@ -109,6 +113,10 @@ struct TORCH_CUDA_API CUDAFuture : at::ivalue::Future {
// Once WorkNCCL is gone (as part of the Future and Work merge) this should be
// fixed.
protected:
// The device that was current when markCompleted was called, which we'll
// restore when invoking callbacks.
c10::DeviceIndex currentDevice_;

// The events that correspond to the completion of the async I/O kernels. They
// are recorded on the appropriate streams when the future is marked completed
// and can then be queried/waited/blocked on. There is one event for each
1 change: 1 addition & 0 deletions torch/lib/c10d/ProcessGroupNCCL.hpp
@@ -234,6 +234,7 @@ class ProcessGroupNCCL : public ProcessGroup {
return ev.device_index() == data_ptr.device().index();
}) != cudaEvents->end());
}
currentDevice_ = c10::cuda::current_device();
cudaEvents_ = std::move(cudaEvents);
dataPtrs_ = std::move(dataPtrs);
markCompleted(std::move(value));