From 45cbf53e88b56339974f2e25d2554f5b98d6d5a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?=
 <jean-sebastien.pedron@dumbbell.fr>
Date: Tue, 11 Feb 2025 14:50:54 +0100
Subject: [PATCH 1/9] rabbit_stream_queue_SUITE: Swap uses of node 2 and 3 in
 `format`

[Why]
We hit some transient errors with the previous order when doing
mixed-version testing. Swapping the nodes seems to fix the problem.

(cherry picked from commit 5cbda4c838591373b254d091f9775f1cf6e6ba40)
---
 deps/rabbit/test/rabbit_stream_queue_SUITE.erl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deps/rabbit/test/rabbit_stream_queue_SUITE.erl b/deps/rabbit/test/rabbit_stream_queue_SUITE.erl
index 3ac5bd7b636f..d56e5c8b096f 100644
--- a/deps/rabbit/test/rabbit_stream_queue_SUITE.erl
+++ b/deps/rabbit/test/rabbit_stream_queue_SUITE.erl
@@ -1565,13 +1565,13 @@ format(Config) ->
     case length(Nodes) of
         3 ->
             [_, Server2, Server3] = Nodes,
-            ok = rabbit_control_helper:command(stop_app, Server2),
             ok = rabbit_control_helper:command(stop_app, Server3),
+            ok = rabbit_control_helper:command(stop_app, Server2),
 
             Fmt2 = rabbit_ct_broker_helpers:rpc(Config, Server, rabbit_stream_queue,
                                                ?FUNCTION_NAME, [QRecord, #{}]),
-            ok = rabbit_control_helper:command(start_app, Server2),
             ok = rabbit_control_helper:command(start_app, Server3),
+            ok = rabbit_control_helper:command(start_app, Server2),
             ?assertEqual(stream, proplists:get_value(type, Fmt2)),
             ?assertEqual(minority, proplists:get_value(state, Fmt2)),
             ?assertEqual(Server, proplists:get_value(leader, Fmt2)),

From 477fc473b7c89fbea336570d1f381fb1af9d29f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?=
 <jean-sebastien.pedron@dumbbell.fr>
Date: Wed, 12 Feb 2025 17:13:24 +0100
Subject: [PATCH 2/9] Skip peer discovery clustering tests if multiple Khepri
 machine versions

... are being used at the same time.

[Why]
Depending on which node clusters with which, a node running an older
version of the Khepri Ra machine may not be able to apply Ra commands
and could be stuck.

There is no real solution and this is clearly an unsupported scenario. An
old node won't always be able to join a newer cluster.

[How]
In the testsuites, we skip clustering tests if we detect that multiple
Khepri Ra machine versions are being used.

(cherry picked from commit 1f1a13521b5c26904673faac1384ad28199c2fdf)
---
 .../src/rabbit_ct_broker_helpers.erl          |  9 +++++-
 .../test/system_SUITE.erl                     | 24 ++++++++++++++--
 .../test/system_SUITE.erl                     | 28 ++++++++++++++++---
 3 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl b/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl
index 00eb0262ef72..6edff885905d 100644
--- a/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl
+++ b/deps/rabbitmq_ct_helpers/src/rabbit_ct_broker_helpers.erl
@@ -173,7 +173,8 @@
     user/1,
 
     configured_metadata_store/1,
-    await_metadata_store_consistent/2
+    await_metadata_store_consistent/2,
+    do_nodes_run_same_ra_machine_version/2
   ]).
 
 %% Internal functions exported to be used by rpc:call/4.
@@ -1174,6 +1175,12 @@ ra_last_applied(ServerId) ->
     #{last_applied := LastApplied} = ra:key_metrics(ServerId),
     LastApplied.
 
+do_nodes_run_same_ra_machine_version(Config, RaMachineMod) ->
+    [MacVer1 | MacVerN] = MacVers = rpc_all(Config, RaMachineMod, version, []),
+    ct:pal("Ra machine versions of ~s: ~0p", [RaMachineMod, MacVers]),
+    is_integer(MacVer1) andalso
+    lists:all(fun(MacVer) -> MacVer =:= MacVer1 end, MacVerN).
+
 rewrite_node_config_file(Config, Node) ->
     NodeConfig = get_node_config(Config, Node),
     I = if
diff --git a/deps/rabbitmq_peer_discovery_consul/test/system_SUITE.erl b/deps/rabbitmq_peer_discovery_consul/test/system_SUITE.erl
index a39e2bc7bf9e..417e3b773d04 100644
--- a/deps/rabbitmq_peer_discovery_consul/test/system_SUITE.erl
+++ b/deps/rabbitmq_peer_discovery_consul/test/system_SUITE.erl
@@ -83,9 +83,27 @@ init_per_testcase(Testcase, Config)
     case Config3 of
         _ when is_list(Config3) ->
             try
-                _ = rabbit_ct_broker_helpers:rpc_all(
-                      Config3, rabbit_peer_discovery_backend, api_version, []),
-                Config3
+                SameMacVer = (
+                  rabbit_ct_broker_helpers:
+                  do_nodes_run_same_ra_machine_version(
+                    Config3, khepri_machine)),
+                case SameMacVer of
+                    true ->
+                        _ = rabbit_ct_broker_helpers:rpc_all(
+                              Config3,
+                              rabbit_peer_discovery_backend, api_version, []),
+                        Config3;
+                    false ->
+                        Config5 = rabbit_ct_helpers:run_steps(
+                                    Config3,
+                                    rabbit_ct_client_helpers:teardown_steps()
+                                    ++
+                                    rabbit_ct_broker_helpers:teardown_steps()),
+                        rabbit_ct_helpers:testcase_finished(Config5, Testcase),
+                        {skip,
+                         "Nodes are using different Khepri Ra machine "
+                         "versions; clustering will likely fail"}
+                end
             catch
                 error:{exception, undef,
                        [{rabbit_peer_discovery_backend, api_version, _, _}
diff --git a/deps/rabbitmq_peer_discovery_etcd/test/system_SUITE.erl b/deps/rabbitmq_peer_discovery_etcd/test/system_SUITE.erl
index 7531f3bd92eb..1dfef24b0d06 100644
--- a/deps/rabbitmq_peer_discovery_etcd/test/system_SUITE.erl
+++ b/deps/rabbitmq_peer_discovery_etcd/test/system_SUITE.erl
@@ -92,9 +92,27 @@ init_per_testcase(Testcase, Config)
     case Config3 of
         _ when is_list(Config3) ->
             try
-                _ = rabbit_ct_broker_helpers:rpc_all(
-                      Config3, rabbit_peer_discovery_backend, api_version, []),
-                Config3
+                SameMacVer = (
+                  rabbit_ct_broker_helpers:
+                  do_nodes_run_same_ra_machine_version(
+                    Config3, khepri_machine)),
+                case SameMacVer of
+                    true ->
+                        _ = rabbit_ct_broker_helpers:rpc_all(
+                              Config3,
+                              rabbit_peer_discovery_backend, api_version, []),
+                        Config3;
+                    false ->
+                        Config5 = rabbit_ct_helpers:run_steps(
+                                    Config3,
+                                    rabbit_ct_client_helpers:teardown_steps()
+                                    ++
+                                    rabbit_ct_broker_helpers:teardown_steps()),
+                        rabbit_ct_helpers:testcase_finished(Config5, Testcase),
+                        {skip,
+                         "Nodes are using different Khepri Ra machine "
+                         "versions; clustering will likely fail"}
+                end
             catch
                 error:{exception, undef,
                        [{rabbit_peer_discovery_backend, api_version, _, _}
@@ -239,7 +257,9 @@ wait_for_etcd(EtcdEndpoints) ->
     Timeout = 60000,
     rabbit_ct_helpers:await_condition(
       fun() ->
-              case eetcd:open(test, EtcdEndpoints) of
+              Ret = eetcd:open(test, EtcdEndpoints),
+              ct:pal("Ret = ~p", [Ret]),
+              case Ret of
                   {ok, _Pid} -> true;
                   _          -> false
               end

From 44d32f12b7b8bf7db8d827fefc717a05279b656e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?=
 <jean-sebastien.pedron@dumbbell.fr>
Date: Thu, 13 Feb 2025 10:25:07 +0100
Subject: [PATCH 3/9] clustering_management_SUITE: Use old node as seed node

[Why]
During mixed-version testing, the old node might not be able to join or
rejoin a cluster if the other nodes run a newer Khepri machine version.

[How]
The old node is used as the cluster seed node and is never touched
otherwise. Other nodes are restarted or join the cluster later.

(cherry picked from commit e76233a222990ac7575d1a0217ef58e7e20efce8)
---
 .../test/clustering_management_SUITE.erl      | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/deps/rabbit/test/clustering_management_SUITE.erl b/deps/rabbit/test/clustering_management_SUITE.erl
index 7e18242ccaea..bfa8959c825a 100644
--- a/deps/rabbit/test/clustering_management_SUITE.erl
+++ b/deps/rabbit/test/clustering_management_SUITE.erl
@@ -337,7 +337,7 @@ restart_cluster_node(Config) ->
     assert_clustered([Rabbit, Hare]).
 
 join_and_part_cluster_in_khepri(Config) ->
-    [Rabbit, Hare, Bunny] = cluster_members(Config),
+    [Rabbit, Bunny, Hare] = cluster_members(Config),
     assert_not_clustered(Rabbit),
     assert_not_clustered(Hare),
     assert_not_clustered(Bunny),
@@ -447,38 +447,38 @@ join_to_start_interval(Config) ->
     assert_clustered([Rabbit, Hare]).
 
 join_cluster_in_minority(Config) ->
-    [Rabbit, Hare, Bunny] = cluster_members(Config),
+    [Rabbit, Bunny, Hare] = cluster_members(Config),
     assert_not_clustered(Rabbit),
     assert_not_clustered(Hare),
     assert_not_clustered(Bunny),
 
-    stop_join_start(Config, Bunny, Rabbit),
+    stop_join_start(Config, Rabbit, Bunny),
     assert_clustered([Rabbit, Bunny]),
-    ok = rabbit_ct_broker_helpers:stop_node(Config, Bunny),
+    ok = rabbit_ct_broker_helpers:stop_node(Config, Rabbit),
 
     ok = stop_app(Config, Hare),
-    ?assertEqual(ok, join_cluster(Config, Hare, Rabbit, false)),
+    ?assertEqual(ok, join_cluster(Config, Hare, Bunny, false)),
 
-    ok = rabbit_ct_broker_helpers:start_node(Config, Bunny),
+    ok = rabbit_ct_broker_helpers:start_node(Config, Rabbit),
     ?assertEqual(ok, join_cluster(Config, Hare, Rabbit, false)),
     ?assertEqual(ok, start_app(Config, Hare)),
 
     assert_clustered([Rabbit, Bunny, Hare]).
 
 join_cluster_with_rabbit_stopped(Config) ->
-    [Rabbit, Hare, Bunny] = cluster_members(Config),
+    [Rabbit, Bunny, Hare] = cluster_members(Config),
     assert_not_clustered(Rabbit),
     assert_not_clustered(Hare),
     assert_not_clustered(Bunny),
 
-    stop_join_start(Config, Bunny, Rabbit),
+    stop_join_start(Config, Rabbit, Bunny),
     assert_clustered([Rabbit, Bunny]),
-    ok = stop_app(Config, Bunny),
+    ok = stop_app(Config, Rabbit),
 
     ok = stop_app(Config, Hare),
-    ?assertEqual(ok, join_cluster(Config, Hare, Rabbit, false)),
+    ?assertEqual(ok, join_cluster(Config, Hare, Bunny, false)),
 
-    ok = start_app(Config, Bunny),
+    ok = start_app(Config, Rabbit),
     ?assertEqual(ok, join_cluster(Config, Hare, Rabbit, false)),
     ?assertEqual(ok, start_app(Config, Hare)),
 
@@ -1119,7 +1119,7 @@ await_running_count_in_khepri(Config) ->
                                               await_running_count, [5, 1000])).
 
 start_nodes_in_reverse_order(Config) ->
-    [Rabbit, Hare, Bunny] = cluster_members(Config),
+    [Rabbit, Bunny, Hare] = cluster_members(Config),
     assert_not_clustered(Rabbit),
     assert_not_clustered(Hare),
     assert_not_clustered(Bunny),
@@ -1142,7 +1142,7 @@ start_nodes_in_reverse_order(Config) ->
 
 %% Test booting nodes in the wrong order for Mnesia. Interesting...
 start_nodes_in_stop_order(Config) ->
-    [Rabbit, Hare, Bunny] = cluster_members(Config),
+    [Rabbit, Bunny, Hare] = cluster_members(Config),
     assert_not_clustered(Rabbit),
     assert_not_clustered(Hare),
     assert_not_clustered(Bunny),
@@ -1167,7 +1167,7 @@ start_nodes_in_stop_order(Config) ->
     end.
 
 start_nodes_in_stop_order_in_khepri(Config) ->
-    [Rabbit, Hare, Bunny] = cluster_members(Config),
+    [Rabbit, Bunny, Hare] = cluster_members(Config),
     assert_not_clustered(Rabbit),
     assert_not_clustered(Hare),
     assert_not_clustered(Bunny),
@@ -1190,7 +1190,7 @@ start_nodes_in_stop_order_in_khepri(Config) ->
 
 %% TODO test force_boot with Khepri involved
 start_nodes_in_stop_order_with_force_boot(Config) ->
-    [Rabbit, Hare, Bunny] = cluster_members(Config),
+    [Rabbit, Bunny, Hare] = cluster_members(Config),
     assert_not_clustered(Rabbit),
     assert_not_clustered(Hare),
     assert_not_clustered(Bunny),

From 7a5f1708dc893db1d93b61f191264f9cb52960cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?=
 <jean-sebastien.pedron@dumbbell.fr>
Date: Thu, 13 Feb 2025 10:39:54 +0100
Subject: [PATCH 4/9] clustering_management_SUITE: Skip
 `start_with_invalid_schema_in_path` with Khepri

[Why]
This test plays with the Mnesia database explicitly.

(cherry picked from commit f088c4f5444f123cdbd8e08fc73cd48390fe0765)
---
 deps/rabbit/test/clustering_management_SUITE.erl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deps/rabbit/test/clustering_management_SUITE.erl b/deps/rabbit/test/clustering_management_SUITE.erl
index bfa8959c825a..426f5e35e950 100644
--- a/deps/rabbit/test/clustering_management_SUITE.erl
+++ b/deps/rabbit/test/clustering_management_SUITE.erl
@@ -76,7 +76,6 @@ groups() ->
                                                  status_with_alarm,
                                                  pid_file_and_await_node_startup_in_khepri,
                                                  await_running_count_in_khepri,
-                                                 start_with_invalid_schema_in_path,
                                                  persistent_cluster_id,
                                                  stop_start_cluster_node,
                                                  restart_cluster_node,

From 6e7cc03d7892bf8db2cddacd5222845f11f6a3f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?=
 <jean-sebastien.pedron@dumbbell.fr>
Date: Thu, 13 Feb 2025 15:37:39 +0100
Subject: [PATCH 5/9] Increase the TCP ports range used by parallel-ct-set-*

[Why]
We see nodes trying to use busy ports in CI from time to time.

(cherry picked from commit e76c2271317075c28b0c8dfd97fe28b50c157001)
---
 deps/rabbit/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/deps/rabbit/Makefile b/deps/rabbit/Makefile
index 304dcdce0564..5aebf56a99f5 100644
--- a/deps/rabbit/Makefile
+++ b/deps/rabbit/Makefile
@@ -241,10 +241,10 @@ define ct_master.erl
 	peer:call(Pid2, net_kernel, set_net_ticktime, [5]),
 	peer:call(Pid3, net_kernel, set_net_ticktime, [5]),
 	peer:call(Pid4, net_kernel, set_net_ticktime, [5]),
-	peer:call(Pid1, persistent_term, put, [rabbit_ct_tcp_port_base, 23000]),
-	peer:call(Pid2, persistent_term, put, [rabbit_ct_tcp_port_base, 25000]),
-	peer:call(Pid3, persistent_term, put, [rabbit_ct_tcp_port_base, 27000]),
-	peer:call(Pid4, persistent_term, put, [rabbit_ct_tcp_port_base, 29000]),
+	peer:call(Pid1, persistent_term, put, [rabbit_ct_tcp_port_base, 16000]),
+	peer:call(Pid2, persistent_term, put, [rabbit_ct_tcp_port_base, 20000]),
+	peer:call(Pid3, persistent_term, put, [rabbit_ct_tcp_port_base, 24000]),
+	peer:call(Pid4, persistent_term, put, [rabbit_ct_tcp_port_base, 28000]),
 	[{[_], {ok, Results}}] = ct_master_fork:run("$1"),
 	peer:stop(Pid4),
 	peer:stop(Pid3),

From 13687d09d0b94ac4363e4c307f6a3c0d1363e10f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?=
 <jean-sebastien.pedron@dumbbell.fr>
Date: Fri, 14 Feb 2025 11:41:57 +0100
Subject: [PATCH 6/9] rabbit_stream_queue_SUITE: Fix recursion issue

... in retry_if_coordinator_unavailable().

(cherry picked from commit ee0b5b5f323abd23f1ec758aea5b5ab344b3c393)
---
 deps/rabbit/test/rabbit_stream_queue_SUITE.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deps/rabbit/test/rabbit_stream_queue_SUITE.erl b/deps/rabbit/test/rabbit_stream_queue_SUITE.erl
index d56e5c8b096f..d9ff47230b6c 100644
--- a/deps/rabbit/test/rabbit_stream_queue_SUITE.erl
+++ b/deps/rabbit/test/rabbit_stream_queue_SUITE.erl
@@ -2743,7 +2743,7 @@ retry_if_coordinator_unavailable(Config, Server, Cmd, Retry) ->
             case re:run(Msg, ".*coordinator_unavailable.*", [{capture, none}]) of
                 match ->
                     ct:pal("Attempt to execute command ~p failed, coordinator unavailable", [Cmd]),
-                    retry_if_coordinator_unavailable(Config, Ch, Cmd, Retry - 1);
+                    retry_if_coordinator_unavailable(Config, Server, Cmd, Retry - 1);
                 _ ->
                     exit(Error)
             end

From 37d5a0c7992686fc4180a20dacf074e3bb0cfeaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?=
 <jean-sebastien.pedron@dumbbell.fr>
Date: Fri, 14 Feb 2025 14:56:20 +0100
Subject: [PATCH 7/9] amqp_auth_SUITE: Handle error in init_per_group/2

(cherry picked from commit b7c9e648ea7f72d9ede3cfa2efec1d9f25f97c9e)
---
 deps/rabbit/test/amqp_auth_SUITE.erl | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/deps/rabbit/test/amqp_auth_SUITE.erl b/deps/rabbit/test/amqp_auth_SUITE.erl
index 5889cbdd5003..389a37b2d5c7 100644
--- a/deps/rabbit/test/amqp_auth_SUITE.erl
+++ b/deps/rabbit/test/amqp_auth_SUITE.erl
@@ -120,12 +120,17 @@ init_per_group(Group, Config0) ->
                Config1,
                rabbit_ct_broker_helpers:setup_steps() ++
                rabbit_ct_client_helpers:setup_steps()),
-    Vhost = <<"test vhost">>,
-    User = <<"test user">>,
-    ok = rabbit_ct_broker_helpers:add_vhost(Config, Vhost),
-    ok = rabbit_ct_broker_helpers:add_user(Config, User),
-    [{test_vhost, Vhost},
-     {test_user, User}] ++ Config.
+    case Config of
+        _ when is_list(Config) ->
+            Vhost = <<"test vhost">>,
+            User = <<"test user">>,
+            ok = rabbit_ct_broker_helpers:add_vhost(Config, Vhost),
+            ok = rabbit_ct_broker_helpers:add_user(Config, User),
+            [{test_vhost, Vhost},
+             {test_user, User}] ++ Config;
+        {skip, _} = Skip ->
+            Skip
+    end.
 
 end_per_group(_Group, Config) ->
     ok = rabbit_ct_broker_helpers:delete_user(Config, ?config(test_user, Config)),

From 4a20d59c2f08192f0a2eb3d9ea728f5d777dcbc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?=
 <jean-sebastien.pedron@dumbbell.fr>
Date: Fri, 14 Feb 2025 15:23:50 +0100
Subject: [PATCH 8/9] unit_credit_flow_SUITE: Greatly reduce time trap

(cherry picked from commit 64b68e5d9ceb85bf7b6fb3391c4ed0136b361b8d)
---
 deps/rabbit/test/unit_credit_flow_SUITE.erl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/deps/rabbit/test/unit_credit_flow_SUITE.erl b/deps/rabbit/test/unit_credit_flow_SUITE.erl
index 189d0287290d..bdc3a0679b85 100644
--- a/deps/rabbit/test/unit_credit_flow_SUITE.erl
+++ b/deps/rabbit/test/unit_credit_flow_SUITE.erl
@@ -11,6 +11,9 @@
 
 -compile(export_all).
 
+suite() ->
+    [{timetrap, {minutes, 3}}].
+
 all() ->
     [
       {group, sequential_tests}

From 6a4e0dc9fb2ef8546ef1517b40996cfb0ae8c5ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?=
 <jean-sebastien.pedron@dumbbell.fr>
Date: Fri, 14 Feb 2025 15:36:07 +0100
Subject: [PATCH 9/9] GitHub workflows: List open TCP ports

This may help debug nodes that try to open busy ports.

(cherry picked from commit a5f30ea02ea1576e432c4e6086e0093b80db4b6d)
---
 .github/workflows/test-make-target.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test-make-target.yaml b/.github/workflows/test-make-target.yaml
index 4d9e466dc362..690904c211f9 100644
--- a/.github/workflows/test-make-target.yaml
+++ b/.github/workflows/test-make-target.yaml
@@ -90,6 +90,7 @@ jobs:
     - name: RUN TESTS
       if: inputs.plugin != 'rabbitmq_cli'
       run: |
+        sudo netstat -ntp
         make -C deps/${{ inputs.plugin }} ${{ inputs.make_target }} RABBITMQ_METADATA_STORE=${{ inputs.metadata_store }}
 
     # rabbitmq_cli needs a correct broker version for two of its tests.