From 02ce19e851f0cda92f0a88331810f181b9e2e6f6 Mon Sep 17 00:00:00 2001
From: Asias He
Date: Thu, 1 Aug 2019 20:40:50 +0800
Subject: [PATCH] storage_service: Replicate and advertise tokens early in the
 boot up process

When a node is restarted, there is a race between gossip starting
(other nodes will mark this node up again and send requests) and the
tokens being replicated to all the shards. Here is an example:

- n1, n2
- n2 is down, n1 thinks n2 is down
- n2 starts again; n2 starts the gossip service, n1 thinks n2 is up and
  sends reads/writes to n2, but n2 has not yet replicated the
  token_metadata to all the shards.
- n2 complains:

token_metadata - sorted_tokens is empty in first_token_index!
token_metadata - sorted_tokens is empty in first_token_index!
token_metadata - sorted_tokens is empty in first_token_index!
token_metadata - sorted_tokens is empty in first_token_index!
token_metadata - sorted_tokens is empty in first_token_index!
token_metadata - sorted_tokens is empty in first_token_index!
storage_proxy - Failed to apply mutation from $ip#4: std::runtime_error (sorted_tokens is empty in first_token_index!)

The code path looks like below:

0 storage_service::init_server
1   prepare_to_join()
2     add gossip application state of NET_VERSION, SCHEMA and so on
3     _gossiper.start_gossiping().get()
4   join_token_ring()
5     _token_metadata.update_normal_tokens(tokens, get_broadcast_address())
6     replicate_to_all_cores().get()
7     storage_service::set_gossip_tokens(), which adds the gossip
      application state of TOKENS and STATUS

The race described above is between line 3 and line 6: gossip is
already running while the tokens have not yet reached all the shards.

To fix this, we replicate the token_metadata early, right after it is
filled with the tokens read from the system table and before gossip
starts. That way, by the time other nodes see this restarting node as
up, the tokens are already replicated to all the shards.

In addition, this patch fixes the issue that other nodes might see a
node missing the TOKENS and STATUS application state in gossip if that
node failed in the middle of restarting, i.e., it was killed after
line 3 and before line 7. As a result, such a node could not be
replaced.

Tests: update_cluster_layout_tests.py

Fixes: #4709
Fixes: #4723

(cherry picked from commit 3b39a591357bc527bae597f8cffffb7a6c89160e)
---
 service/storage_service.cc | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/service/storage_service.cc b/service/storage_service.cc
index 3394f9240912..d416b9692318 100644
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -483,6 +483,14 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
         }
     }
 
+    // If this is a restarting node, we should update tokens before gossip starts
+    auto my_tokens = db::system_keyspace::get_saved_tokens().get0();
+    bool restarting_normal_node = db::system_keyspace::bootstrap_complete() && !db().local().is_replacing() && !my_tokens.empty();
+    if (restarting_normal_node) {
+        slogger.info("Restarting a node in NORMAL status");
+        _token_metadata.update_normal_tokens(my_tokens, get_broadcast_address());
+    }
+
     // have to start the gossip service before we can see any info on other nodes. this is necessary
     // for bootstrap to get the load info it needs.
     // (we won't be part of the storage ring though until we add a counterId to our state, below.)
@@ -493,6 +501,12 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
     }).get();
     auto features = get_config_supported_features();
     _token_metadata.update_host_id(local_host_id, get_broadcast_address());
+
+    // Replicate the tokens early because once gossip runs other nodes
+    // might send reads/writes to this node. Replicate it early to make
+    // sure the tokens are valid on all the shards.
+    replicate_to_all_cores().get();
+
     auto broadcast_rpc_address = utils::fb_utilities::get_broadcast_rpc_address();
     auto& proxy = service::get_storage_proxy();
     // Ensure we know our own actual Schema UUID in preparation for updates
@@ -507,6 +521,10 @@ void storage_service::prepare_to_join(std::vector<inet_address> loaded_endpoints
     app_states.emplace(gms::application_state::RPC_READY, value_factory.cql_ready(false));
     app_states.emplace(gms::application_state::VIEW_BACKLOG, versioned_value(""));
     app_states.emplace(gms::application_state::SCHEMA, value_factory.schema(schema_version));
+    if (restarting_normal_node) {
+        app_states.emplace(gms::application_state::TOKENS, value_factory.tokens(my_tokens));
+        app_states.emplace(gms::application_state::STATUS, value_factory.normal(my_tokens));
+    }
 
     slogger.info("Starting up server gossip");
     _gossiper.register_(this->shared_from_this());
@@ -815,6 +833,7 @@ void storage_service::bootstrap(std::unordered_set<token> tokens) {
     } else {
         // Dont set any state for the node which is bootstrapping the existing token...
         _token_metadata.update_normal_tokens(tokens, get_broadcast_address());
+        replicate_to_all_cores().get();
         auto replace_addr = db().local().get_replace_address();
         if (replace_addr) {
             slogger.debug("Removing replaced endpoint {} from system.peers", *replace_addr);
@@ -1585,6 +1604,7 @@ future<> storage_service::init_server(int delay, bind_messaging_port do_bind) {
         auto tokens = db::system_keyspace::get_saved_tokens().get0();
         if (!tokens.empty()) {
             _token_metadata.update_normal_tokens(tokens, get_broadcast_address());
+            replicate_to_all_cores().get();
             // order is important here, the gossiper can fire in between adding these two states. It's ok to send TOKENS without STATUS, but *not* vice versa.
             _gossiper.add_local_application_state({
                 { gms::application_state::TOKENS, value_factory.tokens(tokens) },
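
For context on the fix: replicate_to_all_cores() is the step that copies
shard 0's authoritative _token_metadata to every other shard, and the
patch moves that step (and adds extra calls to it) ahead of
_gossiper.start_gossiping(). Below is a minimal sketch of the idea,
assuming storage_service runs as a seastar::sharded service reachable
through a get_storage_service() accessor; the accessor name, the assert,
and the direct _token_metadata member access are illustrative
assumptions, not the exact Scylla implementation:

    // Illustrative sketch, not the real service/storage_service.cc code.
    // Shard 0 owns the authoritative token_metadata; copy it to every
    // other shard so that any shard can serve reads/writes as soon as
    // gossip marks this node up.
    future<> storage_service::replicate_to_all_cores() {
        assert(engine().cpu_id() == 0); // only shard 0 mutates the ring
        // invoke_on_all copies the lambda (and thus the token_metadata
        // snapshot) to each shard and runs it there.
        return get_storage_service().invoke_on_all([tm = _token_metadata] (storage_service& local_ss) {
            if (engine().cpu_id() != 0) {
                local_ss._token_metadata = tm; // overwrite this shard's replica
            }
        });
    }

Calling this before _gossiper.start_gossiping() closes the window in the
example above: by the time n1 can mark n2 up and route reads/writes to
it, sorted_tokens is already populated on every shard of n2.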