From 7284591dd0f1f5d6b05edc73606c0b0a16f5b452 Mon Sep 17 00:00:00 2001 From: Prajesh Praveen Anchalia <54730469+ppanchalia@users.noreply.github.com> Date: Thu, 6 Mar 2025 19:01:54 -0800 Subject: [PATCH] TCP Debug Logging (#415) Summary: Pull Request resolved: https://github.com/facebookincubator/gloo/pull/415 Adds a debug_logger to capture TCP failure debug data Meta: Aides debugging for S489966 This multipex a debug_logger for internal, where the data is logged to Scuba: gloo_tcp_debug using LoggerConfig: D70646098 JK: https://www.internalfb.com/intern/justknobs/?name=ai_infra%2Fpytorch_distributed#torch_gloo_disable_tcp_debug_scuba_logging JK Diff: D70684609 Reviewed By: d4l3k Differential Revision: D70682977 --- gloo/test/tcp_test.cc | 2 +- gloo/transport/tcp/CMakeLists.txt | 3 +++ gloo/transport/tcp/debug_data.h | 23 +++++++++++++++++++++++ gloo/transport/tcp/debug_logger.cc | 30 ++++++++++++++++++++++++++++++ gloo/transport/tcp/debug_logger.h | 20 ++++++++++++++++++++ gloo/transport/tcp/device.cc | 8 +++++++- gloo/transport/tcp/device.h | 4 ++++ gloo/transport/tcp/helpers.cc | 4 +++- gloo/transport/tcp/helpers.h | 27 ++++++++++++++++++++------- gloo/transport/tcp/pair.cc | 2 ++ 10 files changed, 113 insertions(+), 10 deletions(-) create mode 100644 gloo/transport/tcp/debug_data.h create mode 100644 gloo/transport/tcp/debug_logger.cc create mode 100644 gloo/transport/tcp/debug_logger.h diff --git a/gloo/test/tcp_test.cc b/gloo/test/tcp_test.cc index 33e34f0b6..3d5f525b0 100644 --- a/gloo/test/tcp_test.cc +++ b/gloo/test/tcp_test.cc @@ -25,7 +25,7 @@ TEST(TcpTest, ConnectTimeout) { EXPECT_TRUE(e); EXPECT_TRUE(dynamic_cast(&e)); }; - connectLoop(loop, remote, timeout, std::move(fn)); + connectLoop(loop, remote, 0, 5, timeout, std::move(fn)); std::unique_lock lock(m); cv.wait(lock, [&] { return done; }); diff --git a/gloo/transport/tcp/CMakeLists.txt b/gloo/transport/tcp/CMakeLists.txt index 206fd3e09..acaf906b7 100644 --- a/gloo/transport/tcp/CMakeLists.txt +++ b/gloo/transport/tcp/CMakeLists.txt @@ -5,6 +5,7 @@ else() "${CMAKE_CURRENT_SOURCE_DIR}/address.cc" "${CMAKE_CURRENT_SOURCE_DIR}/buffer.cc" "${CMAKE_CURRENT_SOURCE_DIR}/context.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/debug_logger.cc" "${CMAKE_CURRENT_SOURCE_DIR}/device.cc" "${CMAKE_CURRENT_SOURCE_DIR}/error.cc" "${CMAKE_CURRENT_SOURCE_DIR}/helpers.cc" @@ -19,6 +20,8 @@ else() "${CMAKE_CURRENT_SOURCE_DIR}/attr.h" "${CMAKE_CURRENT_SOURCE_DIR}/buffer.h" "${CMAKE_CURRENT_SOURCE_DIR}/context.h" + "${CMAKE_CURRENT_SOURCE_DIR}/debug_data.h" + "${CMAKE_CURRENT_SOURCE_DIR}/debug_logger.h" "${CMAKE_CURRENT_SOURCE_DIR}/device.h" "${CMAKE_CURRENT_SOURCE_DIR}/error.h" "${CMAKE_CURRENT_SOURCE_DIR}/helpers.h" diff --git a/gloo/transport/tcp/debug_data.h b/gloo/transport/tcp/debug_data.h new file mode 100644 index 000000000..71863067e --- /dev/null +++ b/gloo/transport/tcp/debug_data.h @@ -0,0 +1,23 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#pragma once + +namespace gloo { +namespace transport { +namespace tcp { + +struct ConnectDebugData { + const int retryCount; + const int retryLimit; + const bool willRetry; + const int glooRank; + const int glooSize; + const std::string error; + const std::string remote; + const std::string local; +}; + +} // namespace tcp +} // namespace transport +} // namespace gloo diff --git a/gloo/transport/tcp/debug_logger.cc b/gloo/transport/tcp/debug_logger.cc new file mode 100644 index 000000000..b55209616 --- /dev/null +++ b/gloo/transport/tcp/debug_logger.cc @@ -0,0 +1,30 @@ +#include +#include + +namespace gloo { +namespace transport { +namespace tcp { + +void DebugLogger::log(const ConnectDebugData& data) { + GLOO_ERROR( + "failed to connect, willRetry=", + data.willRetry, + ", retry=", + data.retryCount, + ", retryLimit=", + data.retryLimit, + ", rank=", + data.glooRank, + ", size=", + data.glooSize, + ", local=", + data.local, + ", remote=", + data.remote, + ", error=", + data.error); +} + +} // namespace tcp +} // namespace transport +} // namespace gloo diff --git a/gloo/transport/tcp/debug_logger.h b/gloo/transport/tcp/debug_logger.h new file mode 100644 index 000000000..e58f9c6a1 --- /dev/null +++ b/gloo/transport/tcp/debug_logger.h @@ -0,0 +1,20 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#pragma once + +#include + +namespace gloo { +namespace transport { +namespace tcp { + +class DebugLogger { + public: + static void log(const ConnectDebugData& data); + + private: +}; + +} // namespace tcp +} // namespace transport +} // namespace gloo diff --git a/gloo/transport/tcp/device.cc b/gloo/transport/tcp/device.cc index f3f2b950f..aa356440f 100644 --- a/gloo/transport/tcp/device.cc +++ b/gloo/transport/tcp/device.cc @@ -299,12 +299,14 @@ bool Device::isInitiator(const Address& local, const Address& remote) const { void Device::connect( const Address& local, const Address& remote, + const int rank, + const int size, std::chrono::milliseconds timeout, connect_callback_t fn) { auto initiator = isInitiator(local, remote); if (initiator) { - connectAsInitiator(remote, timeout, std::move(fn)); + connectAsInitiator(remote, rank, size, timeout, std::move(fn)); return; } connectAsListener(local, timeout, std::move(fn)); @@ -335,6 +337,8 @@ void Device::connectAsListener( // void Device::connectAsInitiator( const Address& remote, + const int rank, + const int size, std::chrono::milliseconds timeout, connect_callback_t fn) { auto writeSeq = [loop = loop_, seq = remote.getSeq()]( @@ -357,6 +361,8 @@ void Device::connectAsInitiator( connectLoop( loop_, remote, + rank, + size, timeout, [loop = loop_, fn = std::move(fn), writeSeq = std::move(writeSeq)]( std::shared_ptr socket, const Error& error) { diff --git a/gloo/transport/tcp/device.h b/gloo/transport/tcp/device.h index 302f27d9f..86ecd6d22 100644 --- a/gloo/transport/tcp/device.h +++ b/gloo/transport/tcp/device.h @@ -96,6 +96,8 @@ class Device : public ::gloo::transport::Device, void connect( const Address& local, const Address& remote, + const int rank, + const int size, std::chrono::milliseconds timeout, connect_callback_t fn); @@ -106,6 +108,8 @@ class Device : public ::gloo::transport::Device, void connectAsInitiator( const Address& remote, + const int rank, + const int size, std::chrono::milliseconds timeout, connect_callback_t fn); diff --git a/gloo/transport/tcp/helpers.cc b/gloo/transport/tcp/helpers.cc index d30a813e5..b5be56c83 100644 --- a/gloo/transport/tcp/helpers.cc +++ b/gloo/transport/tcp/helpers.cc @@ -7,10 +7,12 @@ namespace tcp { void connectLoop( std::shared_ptr loop, const Address& remote, + const int rank, + const int size, std::chrono::milliseconds timeout, typename ConnectOperation::callback_t fn) { auto x = std::make_shared( - std::move(loop), remote, timeout, std::move(fn)); + std::move(loop), remote, rank, size, timeout, std::move(fn)); x->run(); } diff --git a/gloo/transport/tcp/helpers.h b/gloo/transport/tcp/helpers.h index af6da8d37..4d1baf0a1 100644 --- a/gloo/transport/tcp/helpers.h +++ b/gloo/transport/tcp/helpers.h @@ -12,9 +12,11 @@ #include #include +#include #include #include #include +#include "gloo/transport/tcp/debug_logger.h" // @manual=//gloo:debug_logger namespace gloo { namespace transport { @@ -180,9 +182,13 @@ class ConnectOperation final ConnectOperation( std::shared_ptr loop, const Address& remote, + const int rank, + const int size, std::chrono::milliseconds timeout, callback_t fn) : remote_(remote), + rank_(rank), + size_(size), deadline_(std::chrono::steady_clock::now() + timeout), loop_(std::move(loop)), fn_(std::move(fn)) {} @@ -230,15 +236,18 @@ class ConnectOperation final SystemError e("SO_ERROR", result, remote_); bool willRetry = std::chrono::steady_clock::now() < deadline_ && retry_++ < maxRetries_; - GLOO_ERROR( - "failed to connect, willRetry=", - willRetry, - ", retry=", + + auto debugData = ConnectDebugData{ retry_, - ", remote=", + maxRetries_, + willRetry, + rank_, + size_, + e.what(), remote_.str(), - ", error=", - e.what()); + socket_->sockName().str(), + }; + DebugLogger::log(debugData); // check deadline if (willRetry) { run(); @@ -253,6 +262,8 @@ class ConnectOperation final private: const Address remote_; + const int rank_; + const int size_; const std::chrono::time_point deadline_; const int maxRetries_{3}; @@ -269,6 +280,8 @@ class ConnectOperation final void connectLoop( std::shared_ptr loop, const Address& remote, + const int rank, + const int size, std::chrono::milliseconds timeout, typename ConnectOperation::callback_t fn); diff --git a/gloo/transport/tcp/pair.cc b/gloo/transport/tcp/pair.cc index f7462ca09..dd7de38f9 100644 --- a/gloo/transport/tcp/pair.cc +++ b/gloo/transport/tcp/pair.cc @@ -123,6 +123,8 @@ void Pair::connect(const std::vector& bytes) { device_->connect( self_, peer, + context_->rank, + context_->size, timeout_, std::bind( &Pair::connectCallback,