From aa901f19b92e41e6e120d99cb90cc668a83fce6f Mon Sep 17 00:00:00 2001 From: Jiajun Yao Date: Mon, 27 Mar 2023 11:44:39 -0700 Subject: [PATCH] Retry REDIS_REPLY_ERROR for RedisClient::GetNextJobID (#33733) Encountered check failure `redis_client.cc:73: Check failed: reply->type == REDIS_REPLY_INTEGER Expected integer, found Redis type 6 for JobCounter`. This PR retries the command when the reply type is REDIS_REPLY_ERROR (Redis type 6) and also logs the error message. Signed-off-by: Jiajun Yao --- src/ray/gcs/redis_client.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ray/gcs/redis_client.cc b/src/ray/gcs/redis_client.cc index 7380b112dadadf..d4cf0c0702dbe1 100644 --- a/src/ray/gcs/redis_client.cc +++ b/src/ray/gcs/redis_client.cc @@ -66,7 +66,19 @@ static int DoGetNextJobID(redisContext *context) { redisReply *reply = nullptr; bool under_retry_limit = RunRedisCommandWithRetries( context, cmd.c_str(), &reply, [](const redisReply *reply) { - return reply != nullptr && reply->type != REDIS_REPLY_NIL; + if (reply == nullptr) { + RAY_LOG(WARNING) << "Didn't get reply for " << cmd; + return false; + } + if (reply->type == REDIS_REPLY_NIL) { + RAY_LOG(WARNING) << "Got nil reply for " << cmd; + return false; + } + if (reply->type == REDIS_REPLY_ERROR) { + RAY_LOG(WARNING) << "Got error reply for " << cmd << " Error is " << reply->str; + return false; + } + return true; }); RAY_CHECK(reply); RAY_CHECK(under_retry_limit) << "No entry found for JobCounter";