[C10D] Make all PGNCCL LOG usages use logPrefix()
ghstack-source-id: 3aff3612af6a7ed05c7d3b4525ec93fb2039b84a
Pull Request resolved: #116060
wconstab committed Dec 18, 2023
1 parent 74080fd commit 92520ca
Showing 1 changed file with 24 additions and 20 deletions.
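
The change itself is mechanical: every LOG(...) statement in ProcessGroupNCCL.cpp now streams logPrefix() ahead of its message, so each log line carries the process group and rank it came from. The sketch below is a minimal standalone illustration of that pattern, not the actual implementation: the "[PG x Rank y]" format and the free function taking (pgId, rank) are assumptions made for this example; the real logPrefix() is a ProcessGroupNCCL member defined elsewhere in the file.

#include <iostream>
#include <sstream>
#include <string>

// Illustrative stand-in for ProcessGroupNCCL::logPrefix(): build a short tag
// identifying the process group and rank, streamed before every log message.
std::string logPrefix(int pgId, int rank) {
  std::ostringstream oss;
  oss << "[PG " << pgId << " Rank " << rank << "] ";  // format assumed for illustration
  return oss.str();
}

int main() {
  int pgId = 0, rank = 3;
  // Before: LOG(ERROR) << exceptionMsg;
  // After:  LOG(ERROR) << logPrefix() << exceptionMsg;
  std::cerr << logPrefix(pgId, rank)
            << "Some NCCL operations have failed or timed out." << std::endl;
  return 0;
}

Prefixing every line this way makes interleaved output from many ranks attributable to the rank and process group that produced it.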
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (24 additions, 20 deletions)
@@ -553,13 +553,13 @@ void ProcessGroupNCCL::WorkNCCL::handleException(
"Some NCCL operations have failed or timed out. Due to the ",
"asynchronous nature of CUDA kernels, subsequent GPU operations ",
"might run on corrupted/incomplete data.");
- LOG(ERROR) << exceptionMsg;
+ LOG(ERROR) << logPrefix() << exceptionMsg;
C10_LOG_API_USAGE_ONCE("ProcessGroupNCCL.WorkNCCL.handleException");

if (SHOULD_TEAR_DOWN(errorHandling)) {
auto tearDownMsg = c10::str(
"To avoid data inconsistency, we are taking the entire process down.");
- LOG(ERROR) << tearDownMsg;
+ LOG(ERROR) << logPrefix() << tearDownMsg;
std::rethrow_exception(exception_);
}
}
@@ -873,7 +873,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
void ProcessGroupNCCL::eagerConnectSingleDevice(at::Device device) {
std::vector<at::Device> rankDevices = {device};
const auto key = getKeyFromDevices(rankDevices);
LOG(INFO) << "Eagerly connecting nccl backend with device " << device;
LOG(INFO) << logPrefix() << "Eagerly connecting nccl backend with device "
<< device;
getNCCLComm(key, rankDevices, OpType::ALLREDUCE);
}

@@ -884,8 +885,8 @@ void ProcessGroupNCCL::performNocolorSplit(at::Device device) {
#ifdef NCCL_HAS_COMM_SPLIT
std::vector<at::Device> rankDevices = {device};
const auto key = getKeyFromDevices(rankDevices);
LOG(INFO) << "Performing nocolor split on backend device " << device
<< ", key " << key << ", i am " << this;
LOG(INFO) << logPrefix() << "Performing nocolor split on backend device "
<< device << ", key " << key << ", i am " << this;
auto comm = getNCCLComm(key, rankDevices, OpType::ALLREDUCE);
TORCH_CHECK_WITH(
DistBackendError,
@@ -1183,7 +1184,7 @@ bool ProcessGroupNCCL::dumpDebuggingInfo() {
// output file from an earlier call before a later call overwrites it.
static std::mutex writeDebugInfoMutex;
std::lock_guard<std::mutex> lock(writeDebugInfoMutex);
LOG(ERROR) << "ProcessGroupNCCL preparing to dump debug info.";
LOG(ERROR) << logPrefix() << "ProcessGroupNCCL preparing to dump debug info.";
if (ncclTraceBufferSize_ > 0) {
// We dump nccl trace into local disk by default and users can register
// their customized writer by inheriting `DebugInfoWriter` via
@@ -1204,7 +1205,7 @@ bool ProcessGroupNCCL::dumpDebuggingInfo() {
void ProcessGroupNCCL::terminateProcess(std::string errMsg) {
// Logging with `FATAL`, after errMsg printed, it calls `std::abort()`
// to terminate the program execution.
- LOG(FATAL) << errMsg;
+ LOG(FATAL) << logPrefix() << errMsg;
}

void ProcessGroupNCCL::heartbeatMonitor() {
@@ -1383,7 +1384,7 @@ struct DumpPipe {
"Error creating named pipe ",
filename);
fd_ = open(filename.c_str(), O_RDONLY | O_NONBLOCK);
LOG(INFO) << "Pipe file " << filename
LOG(INFO) << logPrefix() << "Pipe file " << filename
<< " has been opened, write to it to trigger NCCL Debug Dump.";
TORCH_CHECK(fd_ != -1, "Error opening named pipe ", filename);
}
@@ -1509,7 +1510,7 @@ void ProcessGroupNCCL::watchdogHandler() {

if (desyncDebug_) {
auto desyncMsg = getNCCLWatchdogDebugInfo();
- LOG(ERROR) << desyncMsg;
+ LOG(ERROR) << logPrefix() << desyncMsg;
}

if (dumpOnTimeout_) {
@@ -1518,10 +1519,12 @@
}

} catch (const std::exception& e) {
LOG(ERROR) << "Failed to retrieve TORCH_NCCL_DESYNC_DEBUG report. "
LOG(ERROR) << logPrefix()
<< "Failed to retrieve TORCH_NCCL_DESYNC_DEBUG report. "
<< " Please file an issue. Error: " << e.what();
} catch (...) {
LOG(ERROR)
+   << logPrefix()
<< "Failed to rerieve TORCH_NCCL_DESYNC_DEBUG report with unknown error."
<< " Please file an issue.";
}
@@ -1787,7 +1790,7 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
if (bound_device_id_) {
for (const auto& device : devices) {
if (*bound_device_id_ != device) {
LOG(ERROR) << "Tensor found on device " << device
LOG(ERROR) << logPrefix() << "Tensor found on device " << device
<< " but backend constrained to " << *bound_device_id_;
C10_THROW_ERROR(
DistBackendError,
@@ -1940,7 +1943,8 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
// At this point NCCL should have been initialized, hence we can accurately
// get the env value even if NCCL sets it by reading from nccl.conf file
if (getRank() == 0) {
LOG(INFO) << "NCCL_DEBUG: " << getCvarString({"NCCL_DEBUG"}, "N/A");
LOG(INFO) << logPrefix()
<< "NCCL_DEBUG: " << getCvarString({"NCCL_DEBUG"}, "N/A");
}

// See [Group Start/End Note]
@@ -3539,14 +3543,14 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
// ensure that each process is on a different GPU
auto numGPUs = at::cuda::getNumGPUs();
int16_t deviceIdx = static_cast<int16_t>(rank_ % numGPUs);
- LOG(INFO) << c10::str(
-     "Rank ",
-     this->getRank(),
-     " using GPU ",
-     deviceIdx,
-     " to perform barrier as devices used by this process are currently unknown. ",
-     "This can potentially cause a hang if this rank to GPU mapping is incorrect.",
-     "Specify device_ids in barrier() to force use of a particular device.");
+ LOG(INFO)
+     << logPrefix()
+     << c10::str(
+            " using GPU ",
+            deviceIdx,
+            " to perform barrier as devices used by this process are currently unknown. ",
+            "This can potentially cause a hang if this rank to GPU mapping is incorrect.",
+            "Specify device_ids in barrier() to force use of a particular device.");
devices.emplace_back(guessDeviceForRank());
} else {
for (auto usedDeviceIdx : usedDeviceIdxs_) {
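The barrier() hunk above is also where the rank-to-GPU guess mentioned in the warning lives: when the devices used by this process are still unknown, the rank is mapped to a local GPU index by a simple modulo. Below is a standalone sketch of that heuristic under the assumption that it reduces to rank % numGPUs; the function name here is hypothetical, and the real code uses at::cuda::getNumGPUs() and guessDeviceForRank().

#include <cstdint>
#include <iostream>

// Hypothetical illustration of the fallback barrier() uses when the devices
// used by this process are unknown: rank modulo the visible GPU count.
int16_t guessDeviceIndexForRank(int rank, int numGPUs) {
  return static_cast<int16_t>(rank % numGPUs);
}

int main() {
  // e.g. 8 ranks on a 4-GPU node: ranks 0-3 map to GPUs 0-3, ranks 4-7 wrap around.
  for (int rank = 0; rank < 8; ++rank) {
    std::cout << "rank " << rank << " -> GPU "
              << guessDeviceIndexForRank(rank, 4) << "\n";
  }
  return 0;
}

As the warning notes, an incorrect rank-to-GPU mapping can hang the barrier, which is why callers can pass device_ids to barrier() to force a particular device.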
