Skip to content

Commit

Permalink
fix(upgrade): upgrade with raft topology procedure
Browse files Browse the repository at this point in the history
After upgrading to the latest master (6.0), the raft topology feature, or the
tablets + raft topology features, will be enabled by default.
To switch a cluster from gossip-based topology to raft topology, the raft
topology upgrade procedure must be executed. It is described here:
https://github.com/scylladb/scylladb/blob/c5601a749e21fc710958a7c84316ecdf5943022c/docs/dev/topology-over-raft.md
section: Upgrade from legacy topology to raft-based topology
  • Loading branch information
aleksbykov committed Jun 9, 2024
1 parent f99d288 commit 8b75680
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 16 deletions.
4 changes: 3 additions & 1 deletion defaults/test_default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ scylla_version: ''
test_upgrade_from_installed_3_1_0: false
target_upgrade_version: ''
disable_raft: true
enable_tablets_on_upgrade: false
enable_tablets_on_upgrade: true

stress_cdclog_reader_cmd: "cdc-stressor -stream-query-round-duration 30s"

Expand Down Expand Up @@ -250,3 +250,5 @@ teardown_validators:

kafka_backend: null
kafka_connectors: []

enable_force_gossip_topology_changes: false
2 changes: 2 additions & 0 deletions sdcm/provision/scylla_yaml/scylla_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,8 @@ def set_authorizer(cls, authorizer: str):
audit_categories: str = None # None
audit_tables: str = None # None
audit_keyspaces: str = None # None
force_gossip_topology_changes: bool = None # False
enable_tablets: bool = None # False

compaction_collection_items_count_warning_threshold: int = None # None

Expand Down
34 changes: 34 additions & 0 deletions sdcm/rest/raft_upgrade_procedure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See LICENSE for more details.
#
# Copyright (c) 2022 ScyllaDB
import json
from sdcm.cluster import BaseNode
from sdcm.rest.remote_curl_client import RemoteCurlClient
from sdcm.wait import wait_for


class RaftUpgradeProcedure(RemoteCurlClient):
    """Client for the raft topology upgrade REST resource of a node.

    Requests are issued through the inherited ``run_remoter_curl`` against
    host ``localhost:10000`` under the ``storage_service`` endpoint.
    """

    # Starting the procedure (POST) and polling its status (GET) share one path.
    _UPGRADE_PATH = "raft_topology/upgrade"

    def __init__(self, node: BaseNode):
        super().__init__(host="localhost:10000", endpoint="storage_service", node=node)

    def start_upgrade_procedure(self) -> str:
        """Start the upgrade with a POST request.

        Returns:
            The stripped stdout of the curl call.
        """
        response = self.run_remoter_curl(
            method="POST", path=self._UPGRADE_PATH, params=None, timeout=30)
        return response.stdout.strip()

    def get_upgrade_procedure_status(self) -> str:
        """Fetch the current upgrade status with a GET request.

        The REST API answers with a JSON string; it is decoded before being
        returned.
        """
        response = self.run_remoter_curl(
            method="GET", path=self._UPGRADE_PATH, params=None, timeout=30)
        return json.loads(response.stdout.strip())

    def wait_upgrade_procedure_done(self, timeout: int = 60, step: int = 5):
        """Poll the status until it reads ``done`` (compared case-insensitively).

        Args:
            timeout: Overall wait budget passed to ``wait_for``; defaults to
                the previously hard-coded 60.
            step: Polling interval passed to ``wait_for``; defaults to the
                previously hard-coded 5.
        """
        wait_for(lambda: self.get_upgrade_procedure_status().lower() == "done",
                 step=step, text="Check raft upgrade procedure state", timeout=timeout)
4 changes: 4 additions & 0 deletions sdcm/sct_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1550,6 +1550,10 @@ class SCTConfiguration(dict):

dict(name="kafka_connectors", env="SCT_KAFKA_CONNECTORS", type=str_or_list_or_eval,
help="configuration for setup up kafka connectors"),

dict(name="enable_force_gossip_topology_changes", env="SCT_ENABLE_FORCE_GOSSIP_TOPOLOGY_CHANGES", type=boolean,
help="""Enable gossip topology changes (disable raft topology)"""),

]

required_params = ['cluster_backend', 'test_duration', 'n_db_nodes', 'n_loaders', 'use_preinstalled_scylla',
Expand Down
4 changes: 3 additions & 1 deletion unit_tests/test_scylla_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,9 @@ def test_scylla_yaml(self):
'virtual_dirty_soft_limit': None,
'volatile_system_keyspace_for_testing': None,
'workdir': None,
'write_request_timeout_in_ms': None
'write_request_timeout_in_ms': None,
'enable_tablets': None,
'force_gossip_topology_changes': None,
}
)

Expand Down
4 changes: 2 additions & 2 deletions unit_tests/test_scylla_yaml_builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def test_aws_multi_openldap(self):
'ldap_url_template': 'ldap://1.1.1.1:389/dc=scylla-qa,dc=com?cn?sub?'
'(uniqueMember=uid={USER},ou=Person,dc=scylla-qa,dc=com)',
'role_manager': 'com.scylladb.auth.LDAPRoleManager',
'saslauthd_socket_path': '/run/saslauthd/mux'
'saslauthd_socket_path': '/run/saslauthd/mux',
},
)

Expand Down Expand Up @@ -172,7 +172,7 @@ def test_gce_single_openldap(self):
'ldap_url_template': 'ldap://1.1.1.1:389/dc=scylla-qa,dc=com?cn?sub?'
'(uniqueMember=uid={USER},ou=Person,dc=scylla-qa,dc=com)',
'role_manager': 'com.scylladb.auth.LDAPRoleManager',
'saslauthd_socket_path': '/run/saslauthd/mux'
'saslauthd_socket_path': '/run/saslauthd/mux',
}
)

Expand Down
59 changes: 47 additions & 12 deletions upgrade_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@
from sdcm.sct_events.group_common_events import ignore_upgrade_schema_errors, ignore_ycsb_connection_refused, \
ignore_abort_requested_errors, decorate_with_context
from sdcm.utils import loader_utils
from sdcm.utils.features import TABLETS_FEATURE, CONSISTENT_TOPOLOGY_CHANGES_FEATURE, get_enabled_features
from sdcm.wait import wait_for
from sdcm.paths import SCYLLA_YAML_PATH
from sdcm.rest.raft_upgrade_procedure import RaftUpgradeProcedure
from test_lib.sla import create_sla_auth

NUMBER_OF_ROWS_FOR_TRUNCATE_TEST = 10
Expand Down Expand Up @@ -206,7 +209,10 @@ def _upgrade_node(self, node, upgrade_sstables=True, new_scylla_repo=None, new_v
scylla_yaml_updates.update({"consistent_cluster_management": True})

if self.params.get("enable_tablets_on_upgrade"):
scylla_yaml_updates.update({"experimental_features": ["tablets", "consistent-topology-changes"]})
scylla_yaml_updates.update({"enable_tablets": True})

if self.params.get("enable_force_gossip_topology_changes"):
    # The key must match the declared SCT option name exactly; a stray
    # backtick here made the lookup miss, so this branch never ran.
    scylla_yaml_updates.update({"force_gossip_topology_changes": True, "enable_tablets": False})

if self.params.get('test_sst3'):
scylla_yaml_updates.update({"enable_sstables_mc_format": True})
Expand Down Expand Up @@ -371,14 +377,13 @@ def _rollback_node(self, node, upgrade_sstables=True):
node.run_nodetool("snapshot")
node.stop_scylla_server(verify_down=False)

if self.params.get("enable_tablets_on_upgrade"):
with node.remote_scylla_yaml() as scylla_yml:
current_experimental_features = scylla_yml.experimental_features
current_experimental_features.remove("tablets")
current_experimental_features.remove("consistent-topology-changes")
if len(current_experimental_features) == 0:
current_experimental_features = None
scylla_yml.experimental_features = current_experimental_features
with node.remote_scylla_yaml() as scylla_yml:
if self.params.get("enable_tablets_on_upgrade"):
scylla_yml.enable_tablets = None

if self.params.get("enable_force_gossip_topology_changes"):
scylla_yml.enable_tablets = None
scylla_yml.force_gossip_topology_changes = None

if node.distro.is_rhel_like:
node.remoter.run('sudo cp ~/scylla.repo-backup /etc/yum.repos.d/scylla.repo')
Expand Down Expand Up @@ -738,10 +743,10 @@ def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-stat
step = 'Step4 - Verify data during mixed cluster mode '
InfoEvent(message=step).publish()
self.fill_and_verify_db_data('after rollback the second node')

InfoEvent(message='Repair the first upgraded Node').publish()
self.db_cluster.nodes[indexes[0]].run_nodetool(sub_cmd='repair')
self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade,
step=step)
self.db_cluster.nodes[indexes[0]].run_nodetool(sub_cmd='repair', timeout=7200, coredump_on_timeout=True)
self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade, step=step)

with ignore_upgrade_schema_errors():

Expand All @@ -756,6 +761,36 @@ def test_rolling_upgrade(self): # pylint: disable=too-many-locals,too-many-stat
self.fill_and_verify_db_data('after upgraded %s' % self.db_cluster.node_to_upgrade.name)
self.search_for_idx_token_error_after_upgrade(node=self.db_cluster.node_to_upgrade,
step=step)
if self.params.get("enable_tablets_on_upgrade") or not self.params.get("enable_force_gossip_topology_changes"):
    # Collect the cluster features that must be enabled on every node
    # before the raft topology upgrade procedure is started.
    features = set()
    if not self.params.get("enable_force_gossip_topology_changes"):
        features.add(CONSISTENT_TOPOLOGY_CHANGES_FEATURE)

    if self.params.get("enable_tablets_on_upgrade"):
        features.update([TABLETS_FEATURE, CONSISTENT_TOPOLOGY_CHANGES_FEATURE])

    # Events are only emitted once .publish() is called.
    InfoEvent(message='Step5.1 - run raft topology upgrade procedure').publish()

    def check_features_enabled(feature_list: set[str], node: BaseNode) -> bool:
        """Return True when every feature in feature_list is enabled on node."""
        with self.db_cluster.cql_connection_patient_exclusive(node) as session:
            enabled_features = get_enabled_features(session)
        return all(feature in enabled_features for feature in feature_list)

    # Wait until the features are enabled on all nodes after the upgrade.
    # The human-readable description goes in `text`; `step` is the polling
    # interval and must not receive a string.
    for node in self.db_cluster.nodes:
        wait_for(func=check_features_enabled, timeout=60,
                 text=f"Check feature enabled on node {node.name}",
                 feature_list=features, node=node)

    # The procedure is started from a single node; completion is then
    # checked on every node.
    raft_upgrade = RaftUpgradeProcedure(self.db_cluster.nodes[0])
    result = raft_upgrade.start_upgrade_procedure()
    InfoEvent(message=f'result {result}').publish()
    InfoEvent(message="Wait upgrade procedure done").publish()
    for node in self.db_cluster.nodes:
        RaftUpgradeProcedure(node).wait_upgrade_procedure_done()
    InfoEvent(message="Step5.1 - raft topology upgrade procedure done").publish()

InfoEvent(message='Step6 - Verify stress results after upgrade ').publish()
InfoEvent(message='Waiting for stress threads to complete after upgrade').publish()
Expand Down

0 comments on commit 8b75680

Please sign in to comment.