Skip to content

Commit

Permalink
[PG NCCL] Add TDD, NCCL_DEBUG log (#97692)
Browse files Browse the repository at this point in the history
Prints these env var settings during setup for easier debugging.

Differential Revision: [D44430875](https://our.internmc.facebook.com/intern/diff/D44430875/)
Pull Request resolved: #97692
Approved by: https://github.com/kumpera
  • Loading branch information
rohan-varma authored and ZainRizvi committed Apr 19, 2023
1 parent 48e624f commit 358be4b
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 1 deletion.
9 changes: 8 additions & 1 deletion torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -659,14 +659,21 @@ ProcessGroupNCCL::ProcessGroupNCCL(
#endif

init();
const std::string OFF = "OFF";
const char* torch_distributed_debug =
parseEnvVarString("TORCH_DISTRIBUTED_DEBUG", OFF.c_str());
const char* nccl_debug = parseEnvVarString("NCCL_DEBUG", OFF.c_str());
LOG(INFO) << "[Rank " << rank_
<< "] ProcessGroupNCCL initialized with following options:"
<< "\nNCCL_ASYNC_ERROR_HANDLING: " << asyncErrorHandling_
<< "\nNCCL_DESYNC_DEBUG: " << desyncDebug_
<< "\nNCCL_BLOCKING_WAIT: " << blockingWait_
<< "\nTIMEOUT(ms): " << options_->timeout.count()
<< "\nUSE_HIGH_PRIORITY_STREAM: "
<< options_->is_high_priority_stream;
<< options_->is_high_priority_stream
<< "\n TORCH_DISTRIBUTED_DEBUG: "
<< std::string(torch_distributed_debug)
<< "\n NCCL_DEBUG: " << std::string(nccl_debug);

RECORD_PARAM_COMMS(
0, // seq
Expand Down
8 changes: 8 additions & 0 deletions torch/csrc/distributed/c10d/Utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,14 @@ inline int parseEnvVarInt(const char* envVarName) {
return C10D_ENV_NOT_SET;
}

// Looks up the environment variable `envVarName` and returns its value as a
// C string, or `default_val` when the variable is not present in the
// environment.
//
// Only the "not set" case (std::getenv returning nullptr) falls back to the
// default; a variable that is set to an empty string yields "".
// The returned pointer is not owned by the caller: it refers either to the
// storage handed back by std::getenv or to `default_val` itself, so callers
// that need to keep the value should copy it (e.g. into a std::string).
inline const char* parseEnvVarString(const char* envVarName, const char* default_val) {
  const char* const fromEnv = std::getenv(envVarName);
  return fromEnv != nullptr ? fromEnv : default_val;
}

inline int parseEnvVarIntDefault(const char* envVarName, int defaultVal) {
int val = parseEnvVarInt(envVarName);
if (val == C10D_ENV_NOT_SET)
Expand Down

0 comments on commit 358be4b

Please sign in to comment.