topology_coordinator: drop group0 guard while changing raft configuration

Changing config under the guard can cause a deadlock.

The guard holds _read_apply_mutex. The same lock is also taken by the
group0 apply() function, so no entry can be applied while the guard is
held; the raft apply fiber may even be sleeping, waiting for this lock
to be released. A configuration change, on the other hand, waits for
the config-change command to be committed before returning, but raft is
implemented so that commit notifications are triggered from the apply
fiber, which may be stuck. Deadlock.

Drop and re-take the guard around configuration changes.

Fixes #17186
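
The lock cycle is easier to see outside the diff. Below is a minimal,
self-contained sketch for illustration only: it is not Scylla/Seastar code,
plain std::thread/std::mutex stand in for fibers and _read_apply_mutex, and
every name in it is invented for the example. It models the deadlock the
message describes and the drop-then-retake ordering this commit adopts.

```cpp
// Sketch only: plain threads model the coordinator and the raft apply fiber.
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex read_apply_mutex;            // stand-in for _read_apply_mutex (held by the guard)
std::mutex cv_mutex;
std::condition_variable committed_cv;
bool config_change_committed = false;

// Stand-in for the raft apply fiber: it must take read_apply_mutex to apply
// entries, and commit notifications are only ever delivered from here.
void apply_fiber() {
    std::lock_guard<std::mutex> apply_lock(read_apply_mutex);
    {
        std::lock_guard<std::mutex> lk(cv_mutex);
        config_change_committed = true;             // "apply" the config-change entry
    }
    committed_cv.notify_all();                      // deliver the commit notification
}

// Stand-in for a configuration change: it returns only once the entry is
// committed, i.e. only after apply_fiber() has made progress.
void change_configuration_and_wait() {
    std::unique_lock<std::mutex> lk(cv_mutex);
    committed_cv.wait(lk, [] { return config_change_committed; });
}

int main() {
    std::thread applier(apply_fiber);

    // Old ordering: keep the guard (read_apply_mutex) while waiting for the
    // commit. Uncommenting the next line can reproduce the deadlock: the
    // applier cannot take read_apply_mutex, so the notification never comes.
    // std::unique_lock<std::mutex> guard(read_apply_mutex);

    // New ordering: drop the guard first, do the configuration change...
    change_configuration_and_wait();
    // ...then re-take it, analogous to `guard = co_await start_operation();`.
    std::unique_lock<std::mutex> guard(read_apply_mutex);

    applier.join();
    std::cout << "configuration change committed without deadlock\n";
}
```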
Gleb Natapov authored and kbr-scylla committed Mar 1, 2024
1 parent 4ef5709 commit 94cd235
Showing 1 changed file with 20 additions and 11 deletions.
31 changes: 20 additions & 11 deletions service/topology_coordinator.cc
@@ -130,7 +130,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {

struct term_changed_error {};

future<> cleanup_group0_config_if_needed() {
future<group0_guard> cleanup_group0_config_if_needed(group0_guard guard) {
auto& topo = _topo_sm._topology;
auto rconf = _group0.group0_server().get_configuration();
if (!rconf.is_joint()) {
@@ -141,6 +141,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
| boost::adaptors::filtered([&] (const raft::server_id& id) { return topo.left_nodes.contains(id); }));
if (!to_remove.empty()) {
// Remove from group 0 nodes that left. They may have failed to do so by themselves
release_guard(std::move(guard));
try {
rtlogger.debug("topology coordinator fiber removing {}"
" from raft since they are in `left` state", to_remove);
@@ -149,8 +150,10 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
rtlogger.warn("topology coordinator fiber got commit_status_unknown status"
" while removing {} from raft", to_remove);
}
guard = co_await start_operation();
}
}
co_return std::move(guard);
}

struct cancel_requests {
@@ -406,10 +409,12 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
co_return retake_node(std::move(guard), node.id);
};

future<> remove_from_group0(const raft::server_id& id) {
future<group0_guard> remove_from_group0(group0_guard guard, const raft::server_id& id) {
rtlogger.info("removing node {} from group 0 configuration...", id);
release_guard(std::move(guard));
co_await _group0.remove_from_raft_config(id);
rtlogger.info("node {} removed from group 0 configuration", id);
co_return co_await start_operation();
}

future<> step_down_as_nonvoter() {
@@ -1624,7 +1629,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
}
break;
case node_state::removing:
co_await remove_from_group0(node.id);
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
[[fallthrough]];
case node_state::decommissioning: {
topology_mutation_builder builder(node.guard.write_timestamp());
@@ -1651,7 +1656,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
break;
case node_state::replacing: {
auto replaced_node_id = parse_replaced_node(node.req_param);
co_await remove_from_group0(replaced_node_id);
node = retake_node(co_await remove_from_group0(std::move(node.guard), replaced_node_id), node.id);

topology_mutation_builder builder1(node.guard.write_timestamp());
// Move new node to 'normal'
@@ -1750,7 +1755,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {

// Remove the node from group0 here - in general, it won't be able to leave on its own
// because we'll ban it as soon as we tell it to shut down.
co_await remove_from_group0(node.id);
node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);

topology_mutation_builder builder(node.guard.write_timestamp());
cleanup_ignored_nodes_on_left(builder, node.id);
@@ -2036,12 +2041,13 @@ class topology_coordinator : public endpoint_lifecycle_subscriber {
auto id = node.id;

assert(!_topo_sm._topology.transition_nodes.empty());

release_node(std::move(node));

if (!_raft.get_configuration().contains(id)) {
co_await _raft.modify_config({raft::config_member({id, {}}, {})}, {}, &_as);
}

release_node(std::move(node));

auto responded = false;
try {
co_await respond_to_joining_node(id, join_node_response_params{
@@ -2476,10 +2482,14 @@ future<> topology_coordinator::rollback_current_topology_op(group0_guard&& guard
// add will be removed from the group0 by the transition handler. It will also be notified to shutdown.
transition_state = topology::transition_state::left_token_ring;
break;
case node_state::removing:
case node_state::removing: {
auto id = node.id;
// The node was removed already. We need to add it back. Let's do it as a non-voter.
// If it ever boots again it will make itself a voter.
co_await _group0.group0_server().modify_config({raft::config_member{{node.id, {}}, false}}, {}, &_as);
release_node(std::move(node));
co_await _group0.group0_server().modify_config({raft::config_member{{id, {}}, false}}, {}, &_as);
node = retake_node(co_await start_operation(), id);
}
[[fallthrough]];
case node_state::decommissioning:
// To rollback decommission or remove just move the topology to rollback_to_normal.
@@ -2575,8 +2585,7 @@ future<> topology_coordinator::run() {
try {
co_await utils::get_local_injector().inject_with_handler("topology_coordinator_pause_before_processing_backlog",
[] (auto& handler) { return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(1)); });
auto guard = co_await start_operation();
co_await cleanup_group0_config_if_needed();
auto guard = co_await cleanup_group0_config_if_needed(co_await start_operation());

if (_rollback) {
co_await rollback_current_topology_op(std::move(guard));
