Merge 'storage_service: Enable Repair Based Node Operations (RBNO) by default for all node ops' from Asias He

Since commit 97bb2e4 (storage_service: Enable Repair Based Node Operations (RBNO) by default for replace), RBNO has been enabled by default for replace operations.

After more testing, we decided to enable repair based node operations by default for all node operations.

Closes #12173

* github.com:scylladb/scylladb:
  storage_service: Enable Repair Based Node Operations (RBNO) by default for all node ops
  test: Increase START_TIMEOUT
  test: Increase max-networking-io-control-blocks
  storage_service: Check node has left in node_ops_cmd::decommission_done
  repair: Use remote dc neighbors for everywhere strategy
avikivity committed Feb 6, 2023
2 parents 511c012 + e7d5e50 · commit 1e6cc9c
Showing 7 changed files with 32 additions and 9 deletions.
db/config.cc (2 changes: 1 addition & 1 deletion)
@@ -794,7 +794,7 @@ db::config::config(std::shared_ptr<db::extensions> exts)
     , ignore_dead_nodes_for_replace(this, "ignore_dead_nodes_for_replace", value_status::Used, "", "List dead nodes to ignore for replace operation using a comma-separated list of host IDs. E.g., scylla --ignore-dead-nodes-for-replace 8d5ed9f4-7764-4dbd-bad8-43fddce94b7c,125ed9f4-7777-1dbn-mac8-43fddce9123e")
     , override_decommission(this, "override_decommission", value_status::Used, false, "Set true to force a decommissioned node to join the cluster")
     , enable_repair_based_node_ops(this, "enable_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, true, "Set true to enable repair based node operations instead of streaming based")
-    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild")
+    , allowed_repair_based_node_ops(this, "allowed_repair_based_node_ops", liveness::LiveUpdate, value_status::Used, "replace,removenode,rebuild,bootstrap,decommission", "A comma separated list of node operations which are allowed to enable repair based node operations. The operations can be bootstrap, replace, removenode, decommission and rebuild")
     , ring_delay_ms(this, "ring_delay_ms", value_status::Used, 30 * 1000, "Time a node waits to hear from other nodes before joining the ring in milliseconds. Same as -Dcassandra.ring_delay_ms in cassandra.")
     , shadow_round_ms(this, "shadow_round_ms", value_status::Used, 300 * 1000, "The maximum gossip shadow round time. Can be used to reduce the gossip feature check time during node boot up.")
     , fd_max_interval_ms(this, "fd_max_interval_ms", value_status::Used, 2 * 1000, "The maximum failure_detector interval time in milliseconds. Interval larger than the maximum will be ignored. Larger clusters may need to increase the default.")
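Both options touched here are LiveUpdate, so the new default can still be narrowed per deployment. As a minimal sketch (not part of this commit), a test could pin RBNO back to the old replace-only behavior through the server_add() cmdline hook shown further down in test/pylib/manager_client.py; the hyphenated flag spelling and the ManagerClient fixture are assumptions, modeled on the --ignore-dead-nodes-for-replace example in the help text above:

```python
from test.pylib.manager_client import ManagerClient

# Hedged sketch, not part of this commit: start one test server with repair
# based node operations allowed only for replace, overriding the new default.
# The hyphenated flag names are assumed, mirroring the style of the
# --ignore-dead-nodes-for-replace example in db/config.cc's help text.
async def add_replace_only_server(manager: ManagerClient):
    return await manager.server_add(cmdline=[
        '--enable-repair-based-node-ops', 'true',
        '--allowed-repair-based-node-ops', 'replace',
    ])
```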
repair/repair.cc (4 changes: 3 additions & 1 deletion)
@@ -1445,7 +1445,9 @@ future<> repair_service::bootstrap_with_repair(locator::token_metadata_ptr tmptr
     auto old_endpoints_in_local_dc = get_old_endpoints_in_local_dc();
     auto rf_in_local_dc = get_rf_in_local_dc();
     if (everywhere_topology) {
-        neighbors = old_endpoints_in_local_dc;
+        neighbors = old_endpoints_in_local_dc.empty() ? old_endpoints : old_endpoints_in_local_dc;
+        rlogger.debug("bootstrap_with_repair: keyspace={}, range={}, old_endpoints={}, new_endpoints={}, old_endpoints_in_local_dc={}, neighbors={}",
+                keyspace_name, desired_range, old_endpoints, new_endpoints, old_endpoints_in_local_dc, neighbors);
     } else if (old_endpoints.size() == replication_factor) {
         // For example, with RF = 3 and 3 nodes n1, n2, n3
         // in the cluster, n4 is bootstrapped, old_replicas
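The ternary above is easy to misread in diff form. A hedged Python restatement of the new neighbor-selection rule for everywhere-topology keyspaces (the function and parameter names are illustrative, not the C++ API):

```python
# Prefer replicas in the bootstrapping node's own DC; fall back to the full
# replica set (i.e. remote-DC replicas) when the local DC has none yet, e.g.
# when the first node of a brand-new DC bootstraps. Previously the local-DC
# set was used unconditionally, leaving the neighbor set empty in that case.
def pick_neighbors(old_endpoints: set[str], old_endpoints_in_local_dc: set[str]) -> set[str]:
    return old_endpoints_in_local_dc or old_endpoints
```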
service/storage_service.cc (20 changes: 20 additions & 0 deletions)
@@ -2721,6 +2721,26 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_ad
         slogger.debug("decommission[{}]: Updated heartbeat from coordinator={}", req.ops_uuid, coordinator);
         node_ops_update_heartbeat(ops_uuid).get();
     } else if (req.cmd == node_ops_cmd::decommission_done) {
+        bool check_again = false;
+        auto start_time = std::chrono::steady_clock::now();
+        slogger.info("decommission[{}]: Started to check if nodes={} have left the cluster, coordinator={}", req.ops_uuid, req.leaving_nodes, coordinator);
+        do {
+            check_again = false;
+            for (auto& node : req.leaving_nodes) {
+                auto tmptr = get_token_metadata_ptr();
+                if (tmptr->is_normal_token_owner(node)) {
+                    check_again = true;
+                    if (std::chrono::steady_clock::now() > start_time + std::chrono::seconds(60)) {
+                        auto msg = format("decommission[{}]: Node {} is still in the cluster", req.ops_uuid, node);
+                        throw std::runtime_error(msg);
+                    }
+                    slogger.warn("decommission[{}]: Node {} is still in the cluster, sleep and check again", req.ops_uuid, node);
+                    sleep_abortable(std::chrono::milliseconds(500), _abort_source).get();
+                    break;
+                }
+            }
+        } while (check_again);
+        slogger.info("decommission[{}]: Finished to check if nodes={} have left the cluster, coordinator={}", req.ops_uuid, req.leaving_nodes, coordinator);
         slogger.info("decommission[{}]: Marked ops done from coordinator={}", req.ops_uuid, coordinator);
         slogger.debug("Triggering off-strategy compaction for all non-system tables on decommission completion");
         _db.invoke_on_all([](replica::database &db) {
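The new decommission_done branch is a bounded poll: re-check token ownership every 500 ms until every leaving node has dropped out of the ring, giving up after 60 seconds (the real code sleeps with sleep_abortable so shutdown can interrupt the wait). A minimal synchronous Python sketch of the same pattern, with illustrative names:

```python
import time
from typing import Callable, Iterable

def wait_until_left(leaving_nodes: Iterable[str],
                    is_normal_token_owner: Callable[[str], bool],
                    deadline_s: float = 60.0, poll_s: float = 0.5) -> None:
    # Poll until no leaving node still owns tokens, or the deadline passes.
    nodes = list(leaving_nodes)
    start = time.monotonic()
    while any(is_normal_token_owner(n) for n in nodes):
        if time.monotonic() - start > deadline_s:
            raise RuntimeError("node is still in the cluster after decommission")
        time.sleep(poll_s)
```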
test/cql-pytest/run.py (2 changes: 1 addition & 1 deletion)
@@ -202,7 +202,7 @@ def run_scylla_cmd(pid, dir):
     '--smp', '2',
     '-m', '1G',
     '--overprovisioned',
-    '--max-networking-io-control-blocks', '100',
+    '--max-networking-io-control-blocks', '1000',
     '--unsafe-bypass-fsync', '1',
     '--kernel-page-cache', '1',
     '--commitlog-use-o-dsync', '0',
test/pylib/manager_client.py (5 changes: 3 additions & 2 deletions)
@@ -15,7 +15,7 @@
 from test.pylib.rest_client import UnixRESTClient, ScyllaRESTAPIClient
 from test.pylib.util import wait_for
 from test.pylib.internal_types import ServerNum, IPAddress, HostID, ServerInfo
-from test.pylib.scylla_cluster import ReplaceConfig
+from test.pylib.scylla_cluster import ReplaceConfig, ScyllaServer
 from cassandra.cluster import Session as CassandraSession # type: ignore # pylint: disable=no-name-in-module
 from cassandra.cluster import Cluster as CassandraCluster # type: ignore # pylint: disable=no-name-in-module
 import aiohttp
@@ -156,7 +156,8 @@ async def server_add(self, replace_cfg: Optional[ReplaceConfig] = None, cmdline:
                 data['replace_cfg'] = replace_cfg._asdict()
             if cmdline:
                 data['cmdline'] = cmdline
-            server_info = await self.client.put_json("/cluster/addserver", data, response_type="json")
+            server_info = await self.client.put_json("/cluster/addserver", data, response_type="json",
+                                                     timeout=ScyllaServer.START_TIMEOUT)
         except Exception as exc:
             raise Exception("Failed to add server") from exc
         try:
test/pylib/rest_client.py (4 changes: 2 additions & 2 deletions)
@@ -100,9 +100,9 @@ async def post(self, resource_uri: str, host: Optional[str] = None,

     async def put_json(self, resource_uri: str, data: Mapping, host: Optional[str] = None,
                        port: Optional[int] = None, params: Optional[dict[str, str]] = None,
-                       response_type: Optional[str] = None) -> Any:
+                       response_type: Optional[str] = None, timeout: Optional[float] = None) -> Any:
         ret = await self._fetch("PUT", resource_uri, response_type = response_type, host = host,
-                                port = port, params = params, json = data)
+                                port = port, params = params, json = data, timeout = timeout)
         return ret

     async def delete(self, resource_uri: str, host: Optional[str] = None,
test/pylib/scylla_cluster.py (4 changes: 2 additions & 2 deletions)
@@ -115,7 +115,7 @@ def make_scylla_conf(workdir: pathlib.Path, host_addr: str, seed_addrs: List[str
     '-m', '1G',
     '--collectd', '0',
     '--overprovisioned',
-    '--max-networking-io-control-blocks', '100',
+    '--max-networking-io-control-blocks', '1000',
     '--unsafe-bypass-fsync', '1',
     '--kernel-page-cache', '1',
     '--commitlog-use-o-dsync', '0',
@@ -190,7 +190,7 @@ class ScyllaServer:
"""Starts and handles a single Scylla server, managing logs, checking if responsive,
and cleanup when finished."""
# pylint: disable=too-many-instance-attributes
START_TIMEOUT = 300 # seconds
START_TIMEOUT = 1000 # seconds
start_time: float
sleep_interval: float
log_file: BufferedWriter
