From d0d2c55362b0205ad766dd572ac6eb1dca57b4fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Tue, 3 Jun 2025 12:23:18 +0200 Subject: [PATCH] mirrored_supervisor: Rework error handling after a failed update [Why] The retry logic I added in 4621fe7730889168b133029a02a7da1a2b50aa6f was completely wrong. If Khepri reached its own timeout of 30 seconds (as of this writing), the mirrored supervisor would retry 50 times because it would not check the time spent. This means it would retry for 25 minutes. Nice. That retry would be terminated forcefully by the parent supervisor after 5 minutes if it was part of a shutdown. [How] This time, the code simply pass the error (timeout or something else) down to the following `case`. It will shut the mirrored supervisor down. This fixes very long RabbitMQ node termination (at least 5 minutes, sometimes more) in testsuites. An example to reproduce: gmake -C deps/rabbitmq_mqtt \ RABBITMQ_METADATA_STORE=khepri \ ct-v5 t=cluster_size_3:session_takeover_v3_v5 In this one, the third node of the cluster will take 5+ minutes to stop. (cherry picked from commit 376dd2ca60fb8c863b9df545f4f1200d5a298135) --- deps/rabbit/src/mirrored_supervisor.erl | 30 +++++++++---------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/deps/rabbit/src/mirrored_supervisor.erl b/deps/rabbit/src/mirrored_supervisor.erl index 661120360f11..201947072977 100644 --- a/deps/rabbit/src/mirrored_supervisor.erl +++ b/deps/rabbit/src/mirrored_supervisor.erl @@ -345,10 +345,16 @@ handle_info({'DOWN', _Ref, process, Pid, _Reason}, child_order = ChildOrder}) -> %% No guarantee pg will have received the DOWN before us. R = case lists:sort(pg:get_members(Group)) -- [Pid] of - [O | _] -> ChildSpecs = retry_update_all(O, Pid), - [start(Delegate, ChildSpec) - || ChildSpec <- restore_child_order(ChildSpecs, - ChildOrder)]; + [O | _] -> ChildSpecs = update_all(O, Pid), + case ChildSpecs of + _ when is_list(ChildSpecs) -> + [start(Delegate, ChildSpec) + || ChildSpec <- restore_child_order( + ChildSpecs, + ChildOrder)]; + {error, _} -> + [ChildSpecs] + end; _ -> [] end, case errors(R) of @@ -428,22 +434,6 @@ check_stop(Group, Delegate, Id) -> id({Id, _, _, _, _, _}) -> Id. -retry_update_all(O, Pid) -> - retry_update_all(O, Pid, 10000). - -retry_update_all(O, Pid, TimeLeft) when TimeLeft > 0 -> - case update_all(O, Pid) of - List when is_list(List) -> - List; - {error, timeout} -> - Sleep = 200, - TimeLeft1 = TimeLeft - Sleep, - timer:sleep(Sleep), - retry_update_all(O, Pid, TimeLeft1) - end; -retry_update_all(O, Pid, _TimeLeft) -> - update_all(O, Pid). - update_all(Overall, OldOverall) -> rabbit_db_msup:update_all(Overall, OldOverall).