From ada164bba861ffdaca04eabdd5f0e652d18555c5 Mon Sep 17 00:00:00 2001
From: Luca Wehrstedt
Date: Thu, 26 Nov 2020 10:56:58 -0800
Subject: [PATCH] Fix FutureNCCL's completed() disagreeing with wait()

This commit is part of a stack that reworks FutureNCCL in order to extract a
generic CUDA-aware Future subclass. The stack deliberately breaks up this
transition into elementary changes, to make it easier to verify that the
behavior is preserved (or to highlight how it gets changed).

---

My impression is that one property of the upstream Future class is that once
.wait() returns, or once a callback is invoked, .completed() should return
True. This was not the case for FutureNCCL: .wait() would return immediately,
and callbacks would be invoked inline, but .completed() could still return
False if the CUDA async operations hadn't completed yet. That was odd and
confusing. Since there are other ways for users to check the status of CUDA
operations (if they really need to, which I don't think is that common), it
seems best to avoid checking the status of CUDA events in .completed().

Differential Revision: [D25180531](https://our.internmc.facebook.com/intern/diff/D25180531/)

[ghstack-poisoned]
---
 torch/lib/c10d/ProcessGroupNCCL.hpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp
index dbaf9543d632..99cd0e8a96be 100644
--- a/torch/lib/c10d/ProcessGroupNCCL.hpp
+++ b/torch/lib/c10d/ProcessGroupNCCL.hpp
@@ -333,15 +333,8 @@ class ProcessGroupNCCL : public ProcessGroup {
     return fut;
   }
 
-  // Checks cudaEventQuery with cudaEvents. Returns true if a FutureError was
-  // recorded or the entire operation is completed on the GPU.
   bool completed() const override {
-    if (error_) {
-      return true;
-    }
-    // Checking the work's corresponding CUDA events' status
-    auto ret = cudaEventQuery((*cudaEvents_)[0]);
-    return ret != cudaErrorNotReady || ret == cudaSuccess;
+    return true;
   }
 
   bool hasValue() const override {