Skip to content

Commit

Permalink
Back out "Providing more information while crashing process in async error handling" (#47185)
Browse files Browse the repository at this point in the history

Summary:
Pull Request resolved: #47185

Original commit changeset: 02d48f13352a

Test Plan: CI

Reviewed By: mruberry

Differential Revision: D24682055

fbshipit-source-id: 060efa29eb2f322971848ead447021f6972cb3f3
  • Loading branch information
albanD authored and facebook-github-bot committed Nov 2, 2020
1 parent 85e5b76 commit c10aa44
Showing 1 changed file with 9 additions and 21 deletions.
30 changes: 9 additions & 21 deletions torch/lib/c10d/ProcessGroupNCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,16 +392,10 @@ void ProcessGroupNCCL::WorkNCCL::synchronizeInternal(
LOG(INFO) << "[Rank " << rank_
<< "] Wrote aborted communicator id to store: " << storeKey;
}
auto currentTimepoint = std::chrono::steady_clock::now();
auto timeElapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
currentTimepoint - workStartTime_);
std::string exceptionMsg = c10::str("[Rank ", rank_, "] ",
"Caught collective operation timeout: ",
(*this),
" ran for ",
timeElapsed.count(),
" milliseconds before timing out.");
throw std::runtime_error(exceptionMsg);
LOG(INFO) << "[Rank " << rank_
<< "] Caught collective operation timeout for work: "
<< (*this);
throw std::runtime_error("Operation timed out!");
}
// Check for errors and throw appropriate exception.
checkAndThrowException();
Expand Down Expand Up @@ -510,18 +504,12 @@ void ProcessGroupNCCL::abortTimedOutCollectives(std::unordered_set<std::string>&
// Check for Timeouts in the WorkNCCL Operations, and abort all
// communicators accordingly.
if (work.timedOut()) {
auto currentTimepoint = std::chrono::steady_clock::now();
auto timeElapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
currentTimepoint - work.workStartTime_);
std::string exceptionMsg = c10::str("[Rank ", rank_, "] ",
"Watchdog caught collective operation timeout: ",
work,
" ran for ",
timeElapsed.count(),
" milliseconds before timing out.");
LOG(INFO) << exceptionMsg;
LOG(INFO)
<< "[Rank " << rank_
<< "] Watchdog caught collective operation timeout for work: "
<< work;
std::exception_ptr exception_ptr = std::make_exception_ptr(
std::runtime_error(exceptionMsg));
std::runtime_error("NCCL Operation Timed Out"));
work.setException(exception_ptr);
for (const auto& ncclComm : work.ncclComms_) {
ncclComm->ncclCommAbort();
Expand Down

0 comments on commit c10aa44

Please sign in to comment.