From 1f38a636ca1695c7bb0287dc9fd176dec7e192bf Mon Sep 17 00:00:00 2001 From: Julia Yakovlev Date: Sun, 31 Mar 2024 19:01:03 +0300 Subject: [PATCH] improvement(log info): add datacenter and rack info When the test runs in multi DC environment it is helpful to get information about the node: in which datacenter and rack this node is located. For this goal update node information to the log. So it is not needed to search for this through the log Task: https://github.com/scylladb/qa-tasks/issues/1180 --- sdcm/cluster.py | 108 +++++++++++++++++++++--- sdcm/cluster_aws.py | 4 +- sdcm/cluster_k8s/__init__.py | 11 +++ unit_tests/dummy_remote.py | 1 + unit_tests/test_cluster.py | 10 ++- unit_tests/test_scylla_yaml_builders.py | 2 + unit_tests/test_seed_selector.py | 12 ++- unit_tests/test_utils_common.py | 16 +++- 8 files changed, 142 insertions(+), 22 deletions(-) diff --git a/sdcm/cluster.py b/sdcm/cluster.py index 5d1cfeb597..0ca3a7d185 100644 --- a/sdcm/cluster.py +++ b/sdcm/cluster.py @@ -302,6 +302,54 @@ def __init__(self, name, parent_cluster, ssh_login_info=None, base_logdir=None, self._kernel_version = None self._uuid = None self.scylla_network_configuration = None + self._datacenter_name = None + self._node_rack = None + + def _is_node_ready_run_scylla_commands(self) -> bool: + """ + When node is just created and started to configure, during first node initializing, it is impossible to connect to the node yet and + `remoter` object is None. So we cannot connect to the node and run there needed commands. + + When `remoter` is ready but cluster is not configured yet, `nodetool status` (and other Scylla commands) cannot be run. + Wait when it became available. + """ + # During first cluster remoter is not available and may cause to failure + if not self.remoter: + self.log.warning("Remoter is not available") + return False + + # During first cluster initializing the status info is not available and may cause to failure + if not self.db_up(): + self.log.warning("Running Scylla commands are not available. Scylla cluster is not configured yet") + return False + + return True + + @property + def datacenter(self) -> str: + if not self._datacenter_name: + # Get datacenter name from `nodetool status` for DB nodes + # "db" node_type is in kubernetes cluster + if self.parent_cluster.node_type in ["scylla-db", "db"]: + if self._is_node_ready_run_scylla_commands(): + datacenter_name_per_region = self.parent_cluster.get_datacenter_name_per_region(db_nodes=[self]) + self._datacenter_name = datacenter_name_per_region[self.region] if datacenter_name_per_region else None + else: + self._datacenter_name = self.region + + return self._datacenter_name + + @property + def node_rack(self) -> str: + if not self._node_rack: + # Get rack value from `nodetool status` for DB nodes + if self.parent_cluster.node_type == "scylla-db": + if self._is_node_ready_run_scylla_commands(): + rack_names = self.parent_cluster.get_rack_names_per_datacenter_and_rack_idx(db_nodes=[self]) + self._node_rack = list(rack_names.values())[0] + else: + self._node_rack = str(self.rack) + return self._node_rack @property def network_interfaces(self): @@ -931,15 +979,43 @@ def start_alert_manager_thread(self): def silence_alert(self, alert_name, duration=None, start=None, end=None): return AlertSilencer(self._alert_manager, alert_name, duration, start, end) + def _dc_info_str(self): + dc_info = [] + + # Example: `ManagerPodCluser` - the `params` is not needed and may be not passed there. Manager always created in the first DC + if not hasattr(self.parent_cluster, "params"): + TestFrameworkEvent(source=self.__class__.__name__, + message="The parent cluster has not 'params' attribute", + trace=sys._getframe().f_back, # pylint: disable=protected-access + severity=Severity.ERROR).publish() + # We want to figure out all places where "params" attribute has type that not consistent + elif not isinstance(self.parent_cluster.params, SCTConfiguration): + TestFrameworkEvent(source=self.__class__.__name__, + message=f"The 'params' attribute expected to by 'SCTConfiguration`, " + f"but actually it is a `{type(self.parent_cluster.params)}`", + trace=sys._getframe().f_back, # pylint: disable=protected-access + severity=Severity.ERROR).publish() + + elif len(self.parent_cluster.params.region_names) > 1 and self.datacenter: + dc_info.append(f"dc name: {self.datacenter}") + + # Workaround for 'k8s-local-kind*' backend. + # "node.init()" is called in `sdcm.cluster_k8s.mini_k8s.LocalMinimalClusterBase.host_node` when Scylla cluster, that hold + # "racks_count" parameter, is not created yet + if hasattr(self.parent_cluster, "racks_count") and self.parent_cluster.racks_count > 1 and self.node_rack: + dc_info.append(f"rack: {self.node_rack}") + + return f' ({", ".join(dc_info)})' if dc_info else "" + def __str__(self): # TODO: when new network_configuration will be supported by all backends, copy this function from sdcm.cluster_aws.AWSNode.__str__ # to here - return 'Node %s [%s | %s%s] (seed: %s)' % ( + return 'Node %s [%s | %s%s]%s' % ( self.name, self.public_ip_address, self.private_ip_address, " | %s" % self.ipv6_ip_address if self.test_config.IP_SSH_CONNECTIONS == "ipv6" else "", - self.is_seed) + self._dc_info_str()) def restart(self): raise NotImplementedError('Derived classes must implement restart') @@ -1124,7 +1200,7 @@ def destroy(self): def wait_ssh_up(self, verbose=True, timeout=500): text = None if verbose: - text = '%s: Waiting for SSH to be up' % self + text = '%s: Waiting for SSH to be up' % self.name wait.wait_for(func=self.remoter.is_up, step=10, text=text, timeout=timeout, throw_exc=True) def is_port_used(self, port: int, service_name: str) -> bool: @@ -1227,13 +1303,13 @@ def keyspace_available(): def wait_jmx_up(self, verbose=True, timeout=None): text = None if verbose: - text = '%s: Waiting for JMX service to be up' % self + text = '%s: Waiting for JMX service to be up' % self.name wait.wait_for(func=self.jmx_up, step=60, text=text, timeout=timeout, throw_exc=True) def wait_jmx_down(self, verbose=True, timeout=None): text = None if verbose: - text = '%s: Waiting for JMX service to be down' % self + text = '%s: Waiting for JMX service to be down' % self.name wait.wait_for(func=lambda: not self.jmx_up(), step=60, text=text, timeout=timeout, throw_exc=True) @property @@ -1265,7 +1341,7 @@ def _report_housekeeping_uuid(self, verbose=False): def wait_db_up(self, verbose=True, timeout=3600): text = None if verbose: - text = '%s: Waiting for DB services to be up' % self + text = '%s: Waiting for DB services to be up' % self.name wait.wait_for(func=self.db_up, step=60, text=text, timeout=timeout, throw_exc=True, stop_event=self.stop_wait_db_up_event) @@ -1284,7 +1360,7 @@ def is_manager_agent_up(self, port=None): def wait_manager_agent_up(self, verbose=True, timeout=180): text = None if verbose: - text = '%s: Waiting for manager agent to be up' % self + text = '%s: Waiting for manager agent to be up' % self.name wait.wait_for(func=self.is_manager_agent_up, step=10, text=text, timeout=timeout, throw_exc=True) def is_manager_server_up(self, port=None): @@ -1301,7 +1377,7 @@ def is_manager_server_up(self, port=None): def wait_manager_server_up(self, verbose=True, timeout=300, port=None): text = None if verbose: - text = '%s: Waiting for manager server to be up' % self + text = '%s: Waiting for manager server to be up' % self.name try: wait.wait_for(func=self.is_manager_server_up, port=port, step=10, text=text, timeout=timeout, throw_exc=True) @@ -1337,20 +1413,20 @@ def apt_running(self): def wait_apt_not_running(self, verbose=True): text = None if verbose: - text = '%s: Waiting for apt to finish running in the background' % self + text = '%s: Waiting for apt to finish running in the background' % self.name wait.wait_for(func=lambda: not self.apt_running(), step=60, text=text, throw_exc=False) def wait_db_down(self, verbose=True, timeout=3600, check_interval=60): text = None if verbose: - text = '%s: Waiting for DB services to be down' % self + text = '%s: Waiting for DB services to be down' % self.name wait.wait_for(func=lambda: not self.db_up(), step=check_interval, text=text, timeout=timeout, throw_exc=True) def wait_cs_installed(self, verbose=True): text = None if verbose: - text = '%s: Waiting for cassandra-stress' % self + text = '%s: Waiting for cassandra-stress' % self.name wait.wait_for(func=self.cs_installed, step=60, text=text, throw_exc=False) @@ -3200,9 +3276,11 @@ def get_datacenter_name_per_region(self, db_nodes=None): datacenter_name_per_region = {} for region, nodes in self.nodes_by_region(nodes=db_nodes).items(): if status := nodes[0].get_nodes_status(): - datacenter_name_per_region[region] = status[nodes[0]]['dc'] + # If `nodetool status` failed to get status for the node + if dc_name := status.get(nodes[0], {}).get('dc'): + datacenter_name_per_region[region] = dc_name else: - LOGGER.error("Failed to get nodes status from node %s", nodes[0]) + LOGGER.error("Failed to get nodes status from node %s", nodes[0].name) return datacenter_name_per_region @@ -3216,7 +3294,9 @@ def get_rack_names_per_datacenter_and_rack_idx(self, db_nodes: list[BaseNode] | rack_names_mapping = {} for (region, rack), nodes in self.nodes_by_racks_idx_and_regions(nodes=actual_db_nodes).items(): - rack_names_mapping[(region, rack)] = status[nodes[0]]['rack'] + # If `nodetool status` failed to get status for the node + if rack_name := status.get(nodes[0], {}).get('rack'): + rack_names_mapping[(region, rack)] = rack_name return rack_names_mapping diff --git a/sdcm/cluster_aws.py b/sdcm/cluster_aws.py index 20c06d2bbc..7358389546 100644 --- a/sdcm/cluster_aws.py +++ b/sdcm/cluster_aws.py @@ -439,12 +439,12 @@ def __str__(self): else: node_private_ip = self.private_ip_address - return 'Node %s [%s | %s%s] (seed: %s)' % ( + return 'Node %s [%s | %s%s]%s' % ( self.name, self.public_ip_address, node_private_ip, " | %s" % self.ipv6_ip_address if self.test_config.IP_SSH_CONNECTIONS == "ipv6" else "", - self.is_seed) + self._dc_info_str()) @property def network_interfaces(self): diff --git a/sdcm/cluster_k8s/__init__.py b/sdcm/cluster_k8s/__init__.py index ab075e1af4..c60bf9dbe4 100644 --- a/sdcm/cluster_k8s/__init__.py +++ b/sdcm/cluster_k8s/__init__.py @@ -51,6 +51,7 @@ from sdcm import sct_abs_path, cluster from sdcm.cluster import DeadNode, ClusterNodesNotReady from sdcm.provision.scylla_yaml.scylla_yaml import ScyllaYaml +from sdcm.sct_config import SCTConfiguration, init_and_verify_sct_config from sdcm.test_config import TestConfig from sdcm.db_stats import PrometheusDBStats from sdcm.remote import LOCALRUNNER, NETWORK_EXCEPTIONS @@ -1522,6 +1523,7 @@ def scylla_manager_cluster(self) -> 'ManagerPodCluser': container='scylla-manager', cluster_prefix='mgr-', node_prefix='mgr-node-', + params=init_and_verify_sct_config(), n_nodes=1 ) @@ -1764,6 +1766,15 @@ def __init__(self, name: str, parent_cluster: PodCluster, node_prefix: str = "no dc_idx=dc_idx, rack=rack) self.k8s_cluster = self.parent_cluster.k8s_clusters[self.dc_idx] + self._rack_name = None + + @property + def node_rack(self) -> str: + if not self._rack_name: + if pod := self._pod: + self._rack_name = pod.metadata.labels.get("scylla/rack", "") + + return self._rack_name @cached_property def pod_replace_timeout(self) -> int: diff --git a/unit_tests/dummy_remote.py b/unit_tests/dummy_remote.py index 414a70f5e1..9bf684158b 100644 --- a/unit_tests/dummy_remote.py +++ b/unit_tests/dummy_remote.py @@ -24,6 +24,7 @@ class DummyOutput: def __init__(self, stdout): self.stdout = stdout + self.stderr = stdout class DummyRemote: diff --git a/unit_tests/test_cluster.py b/unit_tests/test_cluster.py index fdb03b2672..5bdda81e62 100644 --- a/unit_tests/test_cluster.py +++ b/unit_tests/test_cluster.py @@ -28,6 +28,7 @@ import pytest from invoke import Result +from sdcm import sct_config from sdcm.cluster import BaseNode, BaseCluster, BaseMonitorSet, BaseScyllaCluster from sdcm.db_log_reader import DbLogReader from sdcm.sct_events import Severity @@ -47,9 +48,12 @@ class DummyDbCluster(BaseCluster, BaseScyllaCluster): # pylint: disable=abstrac # pylint: disable=super-init-not-called def __init__(self, nodes, params=None): self.nodes = nodes - self.params = params + self.params = params or sct_config.SCTConfiguration() + self.params["region_name"] = "test_region" + self.racks_count = 0 self.added_password_suffix = False self.log = logging.getLogger(__name__) + self.node_type = "scylla-db" class DummyDbLogReader(DbLogReader): @@ -70,6 +74,7 @@ def node(self): base_logdir=self.temp_dir, ssh_login_info=dict(key_file='~/.ssh/scylla-test'), ) + dummy_node.parent_cluster = DummyDbCluster(nodes=[dummy_node]) dummy_node.init() dummy_node.remoter = DummyRemote() return dummy_node @@ -283,6 +288,7 @@ def setUp(self): parent_cluster=None, base_logdir=self.temp_dir, ssh_login_info=dict(key_file='~/.ssh/scylla-test')) + self.node.parent_cluster = DummyDbCluster([self.node]) def test_no_scylla_binary_rhel_like(self): self.node.remoter = VersionDummyRemote(self, ( @@ -655,6 +661,7 @@ def test_base_node_cpuset(cat_results, expected_core_number): base_logdir=tempfile.mkdtemp(), ssh_login_info=dict(key_file='~/.ssh/scylla-test'), ) + dummy_node.parent_cluster = DummyDbCluster([dummy_node]) dummy_node.init() cat_results_obj = type("FakeGrepResults", (), { "stdout": f'#\n# some comment\nCPUSET="--cpuset {cat_results} "' @@ -683,6 +690,7 @@ def test_base_node_cpuset_not_configured(cat_results): base_logdir=tempfile.mkdtemp(), ssh_login_info=dict(key_file='~/.ssh/scylla-test'), ) + dummy_node.parent_cluster = DummyDbCluster([dummy_node]) dummy_node.init() cat_results_obj = type("FakeCatResults", (), {"stdout": cat_results}) dummy_node.remoter = type("FakeRemoter", (), { diff --git a/unit_tests/test_scylla_yaml_builders.py b/unit_tests/test_scylla_yaml_builders.py index da8d8b9a47..766b04e579 100644 --- a/unit_tests/test_scylla_yaml_builders.py +++ b/unit_tests/test_scylla_yaml_builders.py @@ -409,6 +409,8 @@ def __init__(self, params): # pylint: disable=super-init-not-called self.nodes = [] self.params = params self.name = 'dummy_cluster' + self.node_type = "scylla-db" + self.racks_count = 0 @property def seed_nodes_addresses(self): diff --git a/unit_tests/test_seed_selector.py b/unit_tests/test_seed_selector.py index 51ed83c265..46c27533fa 100644 --- a/unit_tests/test_seed_selector.py +++ b/unit_tests/test_seed_selector.py @@ -5,6 +5,7 @@ import os.path import sdcm.cluster +from sdcm import sct_config from sdcm.test_config import TestConfig from unit_tests.dummy_remote import DummyRemote @@ -34,12 +35,15 @@ def is_nonroot_install(self): # pylint: disable=invalid-overridden-method class DummyCluster(sdcm.cluster.BaseScyllaCluster): def __init__(self, *args, **kwargs): - self.params = {} - self.nodes = [] super().__init__(*args, **kwargs) + self.params = sct_config.SCTConfiguration() + self.params["region_name"] = "test_region" + self.racks_count = 0 + self.nodes = [] + self.node_type = "scylla-db" def set_test_params(self, seeds_selector, seeds_num, db_type): - self.params = {'seeds_selector': seeds_selector, 'seeds_num': seeds_num, 'db_type': db_type} + self.params.update({'seeds_selector': seeds_selector, 'seeds_num': seeds_num, 'db_type': db_type}) logging.basicConfig(format="%(asctime)s - %(levelname)-8s - %(name)-10s: %(message)s", level=logging.DEBUG) @@ -61,7 +65,7 @@ def setup_cluster(self, nodes_number): self.cluster = DummyCluster() # Add 3 nodes for i in range(1, nodes_number+1): - self.cluster.nodes.append(DummyNode(name='node%d' % i, parent_cluster=None, + self.cluster.nodes.append(DummyNode(name='node%d' % i, parent_cluster=self.cluster, base_logdir=self.temp_dir, ssh_login_info=dict(key_file='~/.ssh/scylla-test'))) for node in self.cluster.nodes: diff --git a/unit_tests/test_utils_common.py b/unit_tests/test_utils_common.py index b6e53f7ea3..a058920bec 100644 --- a/unit_tests/test_utils_common.py +++ b/unit_tests/test_utils_common.py @@ -19,7 +19,8 @@ import unittest.mock from pathlib import Path -from sdcm.cluster import BaseNode +from sdcm import sct_config +from sdcm.cluster import BaseNode, BaseCluster, BaseScyllaCluster from sdcm.utils.distro import Distro from sdcm.utils.common import convert_metric_to_ms, download_dir_from_cloud from sdcm.utils.sstable import load_inventory @@ -99,6 +100,18 @@ def run(self, *args, **kwargs): # pylint: disable=unused-argument file.write(f"{line}\n") +class DummyDbCluster(BaseCluster, BaseScyllaCluster): # pylint: disable=abstract-method + # pylint: disable=super-init-not-called + def __init__(self, nodes): + self.nodes = nodes + self.params = sct_config.SCTConfiguration() + self.params["region_name"] = "test_region" + self.racks_count = 0 + self.added_password_suffix = False + self.log = logging.getLogger(__name__) + self.node_type = "scylla-db" + + class DummyNode(BaseNode): # pylint: disable=abstract-method _system_log = None is_enterprise = False @@ -157,6 +170,7 @@ class TestSstableLoadUtils(unittest.TestCase): def setUpClass(cls): cls.node = DummyNode(name='test_node', parent_cluster=None, base_logdir=cls.temp_dir, ssh_login_info=dict(key_file='~/.ssh/scylla-test')) + cls.node.parent_cluster = DummyDbCluster([cls.node]) cls.node.init() def setUp(self):