From 3dd8a248d418bfc80754fd17ead7aee151439057 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Wed, 22 Jan 2025 17:18:38 +0100 Subject: [PATCH] rabbit_peer_discovery: Cache computed node start time [Why] The computation of a node's system-time-based start time is based on its monotonic-time-based start time plus the current time offset. However, since Erlang/OTP 26, that time offset is volatile by default. Therefore the computed system-time-based start time can change too. It happens in CI and this affects the seed node selection in the peer discovery code: because the sorting is unstable when the start time is the criteria, different instances of the peen discovery code could select a different node. [How] The same code is used to compute the start time. However, the computed value is cached in a persistent_term for future queries. This way we ensure that the value is computed once and thus all queries will return the same result. --- deps/rabbit/src/rabbit_peer_discovery.erl | 93 +++++++++++++++++++---- 1 file changed, 80 insertions(+), 13 deletions(-) diff --git a/deps/rabbit/src/rabbit_peer_discovery.erl b/deps/rabbit/src/rabbit_peer_discovery.erl index 56400d3569e5..aba37ee5e7d2 100644 --- a/deps/rabbit/src/rabbit_peer_discovery.erl +++ b/deps/rabbit/src/rabbit_peer_discovery.erl @@ -26,7 +26,8 @@ normalize/1, append_node_prefix/1, node_prefix/0]). --export([do_query_node_props/2]). +-export([do_query_node_props/2, + get_local_start_time/0]). -ifdef(TEST). -export([query_node_props/1, @@ -614,7 +615,7 @@ query_node_props2([{Node, Members} | Rest], NodesAndProps, FromNode) -> "queries properties from node '~ts'", [node(), Node]], FromNode), StartTime = get_node_start_time( - Node, microsecond, FromNode), + Node, FromNode), IsReady = is_node_db_ready(Node, FromNode), NodeAndProps = {Node, Members, StartTime, IsReady}, NodesAndProps1 = [NodeAndProps | NodesAndProps], @@ -642,9 +643,8 @@ query_node_props2([], NodesAndProps, _FromNode) -> ?assert(length(NodesAndProps1) =< length(nodes(hidden))), NodesAndProps1. --spec get_node_start_time(Node, Unit, FromNode) -> StartTime when +-spec get_node_start_time(Node, FromNode) -> StartTime when Node :: node(), - Unit :: erlang:time_unit(), FromNode :: node(), StartTime :: non_neg_integer(). %% @doc Returns the start time of the given `Node' in `Unit'. @@ -663,17 +663,84 @@ query_node_props2([], NodesAndProps, _FromNode) -> %% https://www.erlang.org/doc/man/erlang#time_offset-0 to get the full %% explanation of the computation. %% +%% We also cache the computed start time. The reason is that since Erlang 26, +%% the time offset its volatile by default. Therefore we may compute a +%% different value each time. Caching the computed value gives up stability +%% again. +%% %% @private -get_node_start_time(Node, Unit, FromNode) -> - NativeStartTime = erpc_call( - Node, erlang, system_info, [start_time], FromNode), - TimeOffset = erpc_call(Node, erlang, time_offset, [], FromNode), - SystemStartTime = NativeStartTime + TimeOffset, - StartTime = erpc_call( - Node, erlang, convert_time_unit, - [SystemStartTime, native, Unit], FromNode), - StartTime. +-define(START_TIME_KEY, rabbitmq_node_start_time). +-define(START_TIME_UNIT, microsecond). + +get_node_start_time(Node, FromNode) -> + try + erpc_call(Node, ?MODULE, get_local_start_time, [], FromNode) + catch + error:{exception, undef, [{?MODULE, _, _, _} | _]} -> + get_node_start_time__v1(Node, FromNode) + end. + +get_node_start_time__v1(Node, FromNode) -> + LockId = {?START_TIME_KEY, self()}, + true = global:set_lock(LockId, [Node]), + try + CachedStartTime = erpc_call( + Node, persistent_term, get, + [?START_TIME_KEY, undefined], FromNode), + case CachedStartTime of + undefined -> + NativeStartTime = erpc_call( + Node, erlang, system_info, + [start_time], FromNode), + TimeOffset = erpc_call( + Node, erlang, time_offset, [], FromNode), + SystemStartTime = NativeStartTime + TimeOffset, + StartTime = erpc_call( + Node, erlang, convert_time_unit, + [SystemStartTime, native, ?START_TIME_UNIT], + FromNode), + erpc_call( + Node, persistent_term, put, [?START_TIME_KEY, StartTime], + FromNode), + StartTime; + _ -> + CachedStartTime + end + after + global:del_lock(LockId, [Node]) + end. + +%% The function below is exactly the same code as above but with meant to run +%% locally. This allows this node to compute its start time and cache it in +%% advance. + +get_local_start_time() -> + Node = node(), + LockId = {?START_TIME_KEY, self()}, + true = global:set_lock(LockId, [Node]), + try + CachedStartTime = persistent_term:get(?START_TIME_KEY, undefined), + case CachedStartTime of + undefined -> + NativeStartTime = erlang:system_info(start_time), + TimeOffset = erlang:time_offset(), + SystemStartTime = NativeStartTime + TimeOffset, + StartTime = erlang:convert_time_unit( + SystemStartTime, native, ?START_TIME_UNIT), + + ?LOG_DEBUG( + "Peer discovery: cache node start time (~b)", + [StartTime], + #{domain => ?RMQLOG_DOMAIN_PEER_DISC}), + persistent_term:put(?START_TIME_KEY, StartTime), + StartTime; + _ -> + CachedStartTime + end + after + global:del_lock(LockId, [Node]) + end. -spec is_node_db_ready(Node, FromNode) -> IsReady when Node :: node(),