[C10D] Make all PGNCCL LOG usages use logPrefix() #116060

Closed
wants to merge 6 commits
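The change applied throughout this diff is mechanical: every `LOG(...)` statement in `ProcessGroupNCCL` now streams `logPrefix()` first, so interleaved lines from different ranks and process groups can be attributed. A minimal sketch of the pattern (the class and prefix format below are illustrative, not PyTorch's exact implementation):

```cpp
#include <iostream>
#include <sstream>
#include <string>

// Illustrative stand-in for ProcessGroupNCCL; only the logging pattern matters.
class ProcessGroupLike {
 public:
  ProcessGroupLike(int pgId, int rank) {
    std::ostringstream oss;
    oss << "[PG " << pgId << " Rank " << rank << "] "; // assumed format
    logPrefix_ = oss.str();
  }
  // Built once and cached, so prepending it to every log line is cheap.
  const std::string& logPrefix() const {
    return logPrefix_;
  }

 private:
  std::string logPrefix_;
};

int main() {
  ProcessGroupLike pg(/*pgId=*/0, /*rank=*/3);
  // Before: std::cerr << "Some NCCL operations have timed out.\n";
  // After (this PR's pattern):
  std::cerr << pg.logPrefix() << "Some NCCL operations have timed out.\n";
}
```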
torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (44 changes: 24 additions & 20 deletions)
@@ -553,13 +553,13 @@ void ProcessGroupNCCL::WorkNCCL::handleException(
"Some NCCL operations have failed or timed out. Due to the ",
"asynchronous nature of CUDA kernels, subsequent GPU operations ",
"might run on corrupted/incomplete data.");
-  LOG(ERROR) << exceptionMsg;
+  LOG(ERROR) << logPrefix() << exceptionMsg;
C10_LOG_API_USAGE_ONCE("ProcessGroupNCCL.WorkNCCL.handleException");

if (SHOULD_TEAR_DOWN(errorHandling)) {
auto tearDownMsg = c10::str(
"To avoid data inconsistency, we are taking the entire process down.");
-  LOG(ERROR) << tearDownMsg;
+  LOG(ERROR) << logPrefix() << tearDownMsg;
std::rethrow_exception(exception_);
}
}
@@ -872,7 +872,8 @@ ProcessGroupNCCL::ProcessGroupNCCL(
void ProcessGroupNCCL::eagerConnectSingleDevice(at::Device device) {
std::vector<at::Device> rankDevices = {device};
const auto key = getKeyFromDevices(rankDevices);
LOG(INFO) << "Eagerly connecting nccl backend with device " << device;
LOG(INFO) << logPrefix() << "Eagerly connecting nccl backend with device "
<< device;
getNCCLComm(key, rankDevices, OpType::ALLREDUCE);
}

@@ -883,8 +884,8 @@ void ProcessGroupNCCL::performNocolorSplit(at::Device device) {
#ifdef NCCL_HAS_COMM_SPLIT
std::vector<at::Device> rankDevices = {device};
const auto key = getKeyFromDevices(rankDevices);
LOG(INFO) << "Performing nocolor split on backend device " << device
<< ", key " << key << ", i am " << this;
LOG(INFO) << logPrefix() << "Performing nocolor split on backend device "
<< device << ", key " << key << ", i am " << this;
auto comm = getNCCLComm(key, rankDevices, OpType::ALLREDUCE);
TORCH_CHECK_WITH(
DistBackendError,
@@ -1175,7 +1176,7 @@ bool ProcessGroupNCCL::dumpDebuggingInfo() {
// output file from an earlier call before a later call overwrites it.
static std::mutex writeDebugInfoMutex;
std::lock_guard<std::mutex> lock(writeDebugInfoMutex);
LOG(ERROR) << "ProcessGroupNCCL preparing to dump debug info.";
LOG(ERROR) << logPrefix() << "ProcessGroupNCCL preparing to dump debug info.";
if (ncclTraceBufferSize_ > 0) {
// We dump nccl trace into local disk by default and users can register
// their customized writer by inheriting `DebugInfoWriter` via
@@ -1196,7 +1197,7 @@ void ProcessGroupNCCL::terminateProcess(std::string errMsg) {
void ProcessGroupNCCL::terminateProcess(std::string errMsg) {
// Logging with `FATAL`, after errMsg printed, it calls `std::abort()`
// to terminate the program execution.
-  LOG(FATAL) << errMsg;
+  LOG(FATAL) << logPrefix() << errMsg;
}

void ProcessGroupNCCL::heartbeatMonitor() {
@@ -1375,7 +1376,7 @@ struct DumpPipe {
"Error creating named pipe ",
filename);
fd_ = open(filename.c_str(), O_RDONLY | O_NONBLOCK);
LOG(INFO) << "Pipe file " << filename
LOG(INFO) << logPrefix() << "Pipe file " << filename
<< " has been opened, write to it to trigger NCCL Debug Dump.";
TORCH_CHECK(fd_ != -1, "Error opening named pipe ", filename);
}
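For readers unfamiliar with `DumpPipe`: it watches a named pipe and triggers a debug dump once anything is written to it. A minimal standalone sketch of the same mechanism (the path and polling loop here are illustrative, not the PyTorch implementation, which folds the check into its watchdog):

```cpp
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cstdio>

int main() {
  const char* filename = "/tmp/example_dump_pipe"; // illustrative path
  mkfifo(filename, 0666);                          // ok if it already exists
  // Non-blocking read end: open() succeeds even with no writer attached.
  int fd = open(filename, O_RDONLY | O_NONBLOCK);
  if (fd == -1) return 1;
  char buf[128];
  for (;;) {
    // read() returns > 0 once another process writes to the pipe,
    // e.g. `echo 1 > /tmp/example_dump_pipe` from a shell.
    if (read(fd, buf, sizeof(buf)) > 0) {
      std::puts("pipe written to: would trigger debug dump here");
      break;
    }
    sleep(1); // simple poll for the sketch
  }
  close(fd);
  return 0;
}
```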
@@ -1501,7 +1502,7 @@ void ProcessGroupNCCL::watchdogHandler() {

if (desyncDebug_) {
auto desyncMsg = getNCCLWatchdogDebugInfo();
-  LOG(ERROR) << desyncMsg;
+  LOG(ERROR) << logPrefix() << desyncMsg;
}

if (dumpOnTimeout_) {
@@ -1510,10 +1511,12 @@
}

} catch (const std::exception& e) {
LOG(ERROR) << "Failed to retrieve TORCH_NCCL_DESYNC_DEBUG report. "
LOG(ERROR) << logPrefix()
<< "Failed to retrieve TORCH_NCCL_DESYNC_DEBUG report. "
<< " Please file an issue. Error: " << e.what();
} catch (...) {
LOG(ERROR)
+      << logPrefix()
<< "Failed to rerieve TORCH_NCCL_DESYNC_DEBUG report with unknown error."
<< " Please file an issue.";
}
@@ -1779,7 +1782,7 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
if (bound_device_id_) {
for (const auto& device : devices) {
if (*bound_device_id_ != device) {
LOG(ERROR) << "Tensor found on device " << device
LOG(ERROR) << logPrefix() << "Tensor found on device " << device
<< " but backend constrained to " << *bound_device_id_;
C10_THROW_ERROR(
DistBackendError,
@@ -1932,7 +1935,8 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
// At this point NCCL should have been initialized, hence we can accurately
// get the env value even if NCCL sets it by reading from nccl.conf file
if (getRank() == 0) {
LOG(INFO) << "NCCL_DEBUG: " << getCvarString({"NCCL_DEBUG"}, "N/A");
LOG(INFO) << logPrefix()
<< "NCCL_DEBUG: " << getCvarString({"NCCL_DEBUG"}, "N/A");
}

// See [Group Start/End Note]
@@ -3521,14 +3525,14 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::barrier(const BarrierOptions& opts) {
// ensure that each process is on a different GPU
auto numGPUs = at::cuda::getNumGPUs();
int16_t deviceIdx = static_cast<int16_t>(rank_ % numGPUs);
-  LOG(INFO) << c10::str(
-      "Rank ",
-      this->getRank(),
-      " using GPU ",
-      deviceIdx,
-      " to perform barrier as devices used by this process are currently unknown. ",
-      "This can potentially cause a hang if this rank to GPU mapping is incorrect.",
-      "Specify device_ids in barrier() to force use of a particular device.");
+  LOG(INFO)
+      << logPrefix()
+      << c10::str(
+             " using GPU ",
+             deviceIdx,
+             " to perform barrier as devices used by this process are currently unknown. ",
+             "This can potentially cause a hang if this rank to GPU mapping is incorrect.",
+             "Specify device_ids in barrier() to force use of a particular device.");
devices.emplace_back(guessDeviceForRank());
} else {
for (auto usedDeviceIdx : usedDeviceIdxs_) {
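For context on the final hunk: when no devices have been used by the process yet, the barrier guesses a device as `rank % numGPUs`, which is exactly the mapping the log line warns about. A minimal sketch of that heuristic (function and names are illustrative, not PyTorch's):

```cpp
#include <cstdint>
#include <iostream>

// Round-robin guess used when the devices for this process are unknown.
// If the launcher assigned GPUs differently, two ranks may pick the same
// device and the barrier's collective can hang.
int16_t guessDeviceIndex(int rank, int numGPUs) {
  return static_cast<int16_t>(rank % numGPUs);
}

int main() {
  for (int rank = 0; rank < 6; ++rank) {
    std::cout << "rank " << rank << " -> GPU "
              << guessDeviceIndex(rank, /*numGPUs=*/4) << "\n";
  }
}
```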