pytorch · osalpekar · Aug 18, 2020 · Aug 19, 2020 · Aug 20, 2020 · Aug 20, 2020
diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp
@@ -306,6 +306,19 @@ void ProcessGroupNCCL::WorkNCCL::checkAndThrowException() {
   }
 }
 
+bool ProcessGroupNCCL::WorkNCCL::isCompletedAndThrowException() {
+  checkAndSetException();  // Runs before mutex_ is taken below.
+  std::lock_guard<std::mutex> guard(mutex_);
+  if (!exception_ && !finishedGPUExecutionInternal()) {
+    return false;  // No stored exception and GPU execution not finished.
+  }
+  completed_ = true;
+  if (exception_) {
+    std::rethrow_exception(exception_);  // Surface the stored failure.
+  }
+  return true;
+}
+
 void ProcessGroupNCCL::WorkNCCL::handleNCCLGuard() {
   std::lock_guard<std::mutex> lock(mutex_);
   completed_ = true;
@@ -632,10 +645,9 @@ void ProcessGroupNCCL::workCleanupLoop() {
     for (auto it = workList_.begin(); it != workList_.end();
          /* no increment*/) {
       auto& work = *it;
-      if (work->isCompleted()) {
-        // Handle Exceptions on failed GPU operations and remove completed
-        // workNCCL objects from work vector.
-        work->handleNCCLGuard();
+      // Handle Exceptions on failed GPU operations and remove completed
+      // workNCCL objects from work vector.
+      if (work->isCompletedAndThrowException()) {
         it = workList_.erase(it);
       } else {
         // Increment the iterator if the current WorkNCCL object is not

diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp
@@ -95,6 +95,10 @@ class ProcessGroupNCCL : public ProcessGroup {
     // It actually returns a FutureNCCL object which is a sub class Future.
     c10::intrusive_ptr<c10::ivalue::Future> getFuture() override;
 
+    // Returns true once the work has an exception recorded or its GPU
+    // execution has finished; rethrows the recorded exception if there is one.
+    bool isCompletedAndThrowException();
+
     // Helper function that sets an exception_ptr on the WorkNCCL object.
     void setException(std::exception_ptr exception_ptr);