From 857e96e86c19919bb5367b34f65dd51021a4a4c3 Mon Sep 17 00:00:00 2001 From: Xilun Wu <12968408+XilunWu@users.noreply.github.com> Date: Tue, 4 Jun 2024 13:32:31 -0700 Subject: [PATCH 1/2] [BE][c10d] fix use of TORCH_ERROR in TCPStore libuv backend [ghstack-poisoned] --- .../distributed/c10d/TCPStoreLibUvBackend.cpp | 73 ++++++++++++++----- 1 file changed, 55 insertions(+), 18 deletions(-) diff --git a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp index 845803c5e17e..6b4339480e0a 100644 --- a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp +++ b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp @@ -185,12 +185,16 @@ class UvTcpServer : public UvTcpSocket { try { int uv_res = uv_tcp_open((uv_tcp_t*)res->unsafeGetStream(), socket); TORCH_CHECK( - uv_res == 0, - "Failed to open existing socket. socket:{} code:{} name:{} message:{}", - socket, - uv_res, - uv_err_name(uv_res), - uv_strerror(uv_res)); + uv_res == 0, + "Failed to open existing socket. ", + "socket: ", + socket, + ", code: ", + uv_res, + ", name: ", + uv_err_name(uv_res), + ", message: ", + uv_strerror(uv_res)); res->cacheSocketPort(); } catch (std::exception& ex) { @@ -219,32 +223,45 @@ class UvTcpServer : public UvTcpSocket { } else { uv_res = uv_ip4_addr("0.0.0.0", port, (struct sockaddr_in*)&addr); } + TORCH_CHECK( uv_res == 0, - "UV Store addr parsing failure. useIpv6:{} code:{} name:{} message:{}", + "UV Store addr parsing failure. ", + "useIpv6: ", useIpv6, + ", code: ", uv_res, + ", name: ", uv_err_name(uv_res), + ", message: ", uv_strerror(uv_res)); uv_res = uv_tcp_bind(res->unsafeGetSocket(), (const struct sockaddr*)&addr, 0); TORCH_CHECK( uv_res == 0, - "UV Store bind failed. useIpv6:{} code:{} name:{} message:{}", + "The server socket has failed to bind. 
", + "useIpv6: ", useIpv6, + ", code: ", uv_res, + ", name: ", uv_err_name(uv_res), + ", message: ", uv_strerror(uv_res)); uv_res = uv_listen(res->unsafeGetStream(), DEFAULT_BACKLOG, on_new_connection); TORCH_CHECK( uv_res == 0, - "UV Store listen failed. useIpv6:{} code:{} name:{} message:{}", + "The server socket has failed to listen on any local network address. ", + "useIpv6: ", useIpv6, + ", code: ", uv_res, + ", name: ", uv_err_name(uv_res), + ", message: ", uv_strerror(uv_res)); res->cacheSocketPort(); @@ -265,9 +282,12 @@ class UvTcpServer : public UvTcpSocket { uv_accept(unsafeGetStream(), (uv_stream_t*)socket->unsafeGetHandle()); TORCH_CHECK( res == 0, - "Failed to accept socket. code:{} name:{} desc:{}.", + "Failed to accept socket. ", + "code: ", res, + ", name: ", uv_err_name(res), + ", message: ", uv_strerror(res)); } @@ -458,9 +478,12 @@ class ChunkedStream { if (buff_idx >= buffers.size() && remaining > 0) { TORCH_CHECK( false, - "Trying to read past end of buffer buffer_idx:{} available:{} remaining:{}", + "Trying to read past end of buffer. ", + "buffer_idx: ", buff_idx, + ", available: ", buffers.size(), + ", remaining: ", remaining); } } @@ -498,8 +521,10 @@ class ChunkedStream { return false; TORCH_CHECK( size <= MAX_STRING_LEN, - "Invalid string size. size:{} max:{}", + "Invalid string size. ", + "size: ", size, + ", max: ", MAX_STRING_LEN); if (available() < size) @@ -515,8 +540,10 @@ class ChunkedStream { auto size_in_bytes = size * sizeof(uint8_t); TORCH_CHECK( size_in_bytes <= MAX_PAYLOAD_LEN, - "Invalid payload size. size: {} max:{}", + "Invalid payload size. ", + "size: ", size_in_bytes, + ", max: ", MAX_PAYLOAD_LEN); if (available() < size_in_bytes) @@ -782,8 +809,10 @@ class UvClient : public UvTcpSocket { return false; TORCH_CHECK( key_count <= MAX_KEY_COUNT, - "Too many keys being waited. keys:{} max:{}", + "Too many keys being waited. 
", + "keys: ", key_count, + ", max: ", MAX_KEY_COUNT); std::vector keys(key_count); @@ -810,8 +839,10 @@ class UvClient : public UvTcpSocket { } TORCH_CHECK( key_count <= MAX_KEY_COUNT, - "Too many keys being waited. keys:{} max:{}", + "Too many keys being waited. ", + "keys: ", key_count, + ", max: ", MAX_KEY_COUNT); std::vector keys(key_count); @@ -872,8 +903,10 @@ class UvClient : public UvTcpSocket { } TORCH_CHECK( key_count <= MAX_KEY_COUNT, - "Too many keys with multi_get. keys:{} max:{}", + "Too many keys with multi_get. ", + "keys: ", key_count, + ", max: ", MAX_KEY_COUNT); StreamWriter sw(iptr()); @@ -898,8 +931,10 @@ class UvClient : public UvTcpSocket { } TORCH_CHECK( key_count <= MAX_KEY_COUNT, - "Too many keys with multi_get. keys:{} max:{}", + "Too many keys with multi_get. ", + "keys: ", key_count, + ", max: ", MAX_KEY_COUNT); for (const auto _ : c10::irange(key_count)) { @@ -988,9 +1023,11 @@ void LibUVStoreDaemon::init(const TCPStoreOptions& opts) { port_ = tcpServer->port(); TORCH_CHECK( port_ == opts.port || opts.port == 0, // zero means use any port - "listen fd {} is bound to port {}, expected to be bound to port {}", + "listen fd ", *opts.masterListenFd, + " is bound to port ", port_, + ", expected to be bound to port ", opts.port); } From 59f8ba73867a92b0c45169ff7762108a0eb08bc8 Mon Sep 17 00:00:00 2001 From: Xilun Wu <12968408+XilunWu@users.noreply.github.com> Date: Tue, 4 Jun 2024 13:45:57 -0700 Subject: [PATCH 2/2] Update on "[BE][c10d] fix use of TORCH_ERROR in TCPStore libuv backend" **Summary** The use of TORCH_ERROR in the TCPStore libuv backend code needs to be updated: the affected TORCH_CHECK messages contained `{}` placeholders, which TORCH_CHECK prints literally instead of substituting, so each message is now passed as separate string fragments interleaved with the values to report. 
cc mrshenli pritamdamania87 zhaojuanmao satgera gqchen aazzolini osalpekar jiayisuse H-Huang kwen2501 awgu penguinwu fegin wanchaol fduwjj wz337 tianyu-l wconstab yf225 chauhang d4l3k [ghstack-poisoned] --- .../distributed/c10d/TCPStoreLibUvBackend.cpp | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp index 6b4339480e0a..c70b8e7c6e87 100644 --- a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp +++ b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp @@ -185,16 +185,16 @@ class UvTcpServer : public UvTcpSocket { try { int uv_res = uv_tcp_open((uv_tcp_t*)res->unsafeGetStream(), socket); TORCH_CHECK( - uv_res == 0, - "Failed to open existing socket. ", - "socket: ", - socket, - ", code: ", - uv_res, - ", name: ", - uv_err_name(uv_res), - ", message: ", - uv_strerror(uv_res)); + uv_res == 0, + "Failed to open existing socket. ", + "socket: ", + socket, + ", code: ", + uv_res, + ", name: ", + uv_err_name(uv_res), + ", message: ", + uv_strerror(uv_res)); res->cacheSocketPort(); } catch (std::exception& ex) { @@ -223,7 +223,6 @@ class UvTcpServer : public UvTcpSocket { } else { uv_res = uv_ip4_addr("0.0.0.0", port, (struct sockaddr_in*)&addr); } - TORCH_CHECK( uv_res == 0, "UV Store addr parsing failure. ",