From 675f47a8783b4b2fbf4213fbdc8ef7a4c32d3c7e Mon Sep 17 00:00:00 2001
From: Omkar Salpekar
Date: Fri, 13 Nov 2020 10:34:18 -0800
Subject: [PATCH] Use Blocking Wait if both Blocking Wait and Async Error
 Handling Are Set

Given that we're soon enabling async error handling in PET, we should make
the behavior explicit when users have set NCCL_BLOCKING_WAIT in their own
code while also using PET. This PR essentially gives blocking wait
precedence (for now). This way the blast radius of the PET change is
smaller, while we continue working with blocking wait users and discussing
whether moving to async error handling may be a good fit.

Differential Revision: [D24928149](https://our.internmc.facebook.com/intern/diff/D24928149/)

[ghstack-poisoned]
---
 torch/lib/c10d/ProcessGroupNCCL.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp
index c3a245fb13da..81c8dc50242b 100644
--- a/torch/lib/c10d/ProcessGroupNCCL.cpp
+++ b/torch/lib/c10d/ProcessGroupNCCL.cpp
@@ -453,6 +453,14 @@ ProcessGroupNCCL::ProcessGroupNCCL(
   blockingWait_ = parseEnvVarFlag(NCCL_BLOCKING_WAIT);
   asyncErrorHandling_ = parseEnvVarFlag(NCCL_ASYNC_ERROR_HANDLING);
 
+  if (blockingWait_ && asyncErrorHandling_) {
+    LOG(INFO) << "[Rank " << rank_
+              << "] NCCL_BLOCKING_WAIT and NCCL_ASYNC_ERROR_HANDLING "
+              << "should not both be enabled. "
+              << "Only NCCL_BLOCKING_WAIT is being used in this process.";
+    asyncErrorHandling_ = false;
+  }
+
 #ifdef ENABLE_NCCL_ERROR_CHECKING
   ncclCommWatchdogThread_ =
       std::thread(&ProcessGroupNCCL::ncclCommWatchdog, this);