Skip to content

Commit

Permalink
improvement(log info): add datacenter and rack info
Browse files Browse the repository at this point in the history
When a test runs in a multi-DC environment, it is helpful to know where each
node is located: which datacenter and which rack it belongs to.

To that end, add the datacenter and rack to the node information printed in the
log, so that there is no need to search through the log for it.

Task: scylladb/qa-tasks#1180
  • Loading branch information
juliayakovlev authored and fruch committed Apr 14, 2024
1 parent ead2529 commit 1f38a63
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 22 deletions.
108 changes: 94 additions & 14 deletions sdcm/cluster.py
Expand Up @@ -302,6 +302,54 @@ def __init__(self, name, parent_cluster, ssh_login_info=None, base_logdir=None,
self._kernel_version = None
self._uuid = None
self.scylla_network_configuration = None
self._datacenter_name = None
self._node_rack = None

def _is_node_ready_run_scylla_commands(self) -> bool:
    """
    Tell whether Scylla commands can currently be run on this node.

    Two preconditions are checked, in this order:
      1. `self.remoter` is set. While a node has just been created and is still being configured,
         the remoter is None, so there is no way to connect to the node and run commands.
      2. `self.db_up()` is truthy. Until the cluster is configured, `nodetool status`
         (and other Scylla commands) cannot be run.

    A warning is logged for the first precondition that is not met.

    :return: True when both preconditions hold, False otherwise.
    """
    if self.remoter:
        if self.db_up():
            return True
        # During first cluster initializing the status info is not available and may cause to failure
        self.log.warning("Running Scylla commands are not available. Scylla cluster is not configured yet")
        return False

    # During first cluster initializing the remoter is not available and may cause to failure
    self.log.warning("Remoter is not available")
    return False

@property
def datacenter(self) -> str:
if not self._datacenter_name:
# Get datacenter name from `nodetool status` for DB nodes
# "db" node_type is in kubernetes cluster
if self.parent_cluster.node_type in ["scylla-db", "db"]:
if self._is_node_ready_run_scylla_commands():
datacenter_name_per_region = self.parent_cluster.get_datacenter_name_per_region(db_nodes=[self])
self._datacenter_name = datacenter_name_per_region[self.region] if datacenter_name_per_region else None
else:
self._datacenter_name = self.region

return self._datacenter_name

@property
def node_rack(self) -> str:
if not self._node_rack:
# Get rack value from `nodetool status` for DB nodes
if self.parent_cluster.node_type == "scylla-db":
if self._is_node_ready_run_scylla_commands():
rack_names = self.parent_cluster.get_rack_names_per_datacenter_and_rack_idx(db_nodes=[self])
self._node_rack = list(rack_names.values())[0]
else:
self._node_rack = str(self.rack)
return self._node_rack

@property
def network_interfaces(self):
Expand Down Expand Up @@ -931,15 +979,43 @@ def start_alert_manager_thread(self):
def silence_alert(self, alert_name, duration=None, start=None, end=None):
return AlertSilencer(self._alert_manager, alert_name, duration, start, end)

def _dc_info_str(self):
    """
    Build the optional location suffix of the node description, e.g. " (dc name: X, rack: Y)".

    - The datacenter name is included only when the parent cluster's `params` is an
      `SCTConfiguration` with more than one region name and `self.datacenter` is known.
    - The rack is included only when the parent cluster has `racks_count` greater than 1 and
      `self.node_rack` is known.

    When the parent cluster has no `params` attribute, or `params` is not an `SCTConfiguration`,
    an ERROR `TestFrameworkEvent` is published and no datacenter name is added.

    :return: the suffix with a leading space, or an empty string when there is nothing to report.
    """
    cluster = self.parent_cluster
    details = []

    # Example: `ManagerPodCluser` - the `params` is not needed and may be not passed there.
    # Manager always created in the first DC
    if not hasattr(cluster, "params"):
        TestFrameworkEvent(
            source=self.__class__.__name__,
            message="The parent cluster has not 'params' attribute",
            trace=sys._getframe().f_back,  # pylint: disable=protected-access
            severity=Severity.ERROR,
        ).publish()
    # We want to figure out all places where "params" attribute has type that not consistent
    elif not isinstance(cluster.params, SCTConfiguration):
        TestFrameworkEvent(
            source=self.__class__.__name__,
            message=f"The 'params' attribute expected to by 'SCTConfiguration`, "
                    f"but actually it is a `{type(cluster.params)}`",
            trace=sys._getframe().f_back,  # pylint: disable=protected-access
            severity=Severity.ERROR,
        ).publish()
    elif len(cluster.params.region_names) > 1 and self.datacenter:
        details.append(f"dc name: {self.datacenter}")

    # Workaround for 'k8s-local-kind*' backend.
    # "node.init()" is called in `sdcm.cluster_k8s.mini_k8s.LocalMinimalClusterBase.host_node` when
    # Scylla cluster, that hold "racks_count" parameter, is not created yet
    if hasattr(cluster, "racks_count") and cluster.racks_count > 1 and self.node_rack:
        details.append(f"rack: {self.node_rack}")

    if not details:
        return ""
    return " (" + ", ".join(details) + ")"

def __str__(self):
    """
    Describe the node as 'Node <name> [<public ip> | <private ip>[ | <ipv6>]]<location suffix>'.

    The IPv6 address is included only when `test_config.IP_SSH_CONNECTIONS` is "ipv6".
    The location suffix comes from `self._dc_info_str()` and may be empty.
    """
    # TODO: when new network_configuration will be supported by all backends, copy this function from
    # sdcm.cluster_aws.AWSNode.__str__ to here
    return 'Node %s [%s | %s%s]%s' % (
        self.name,
        self.public_ip_address,
        self.private_ip_address,
        " | %s" % self.ipv6_ip_address if self.test_config.IP_SSH_CONNECTIONS == "ipv6" else "",
        self._dc_info_str())

def restart(self):
raise NotImplementedError('Derived classes must implement restart')
Expand Down Expand Up @@ -1124,7 +1200,7 @@ def destroy(self):
def wait_ssh_up(self, verbose=True, timeout=500):
    """
    Wait, through `wait.wait_for`, on `self.remoter.is_up` (step=10, throw_exc=True).

    :param verbose: when True, a progress message naming this node is passed to `wait.wait_for`.
    :param timeout: passed through to `wait.wait_for`.
    """
    # Only the node name goes into the message: formatting the node object itself would invoke
    # its `__str__` just to label a wait.
    text = '%s: Waiting for SSH to be up' % self.name if verbose else None
    wait.wait_for(func=self.remoter.is_up, step=10, text=text, timeout=timeout, throw_exc=True)

def is_port_used(self, port: int, service_name: str) -> bool:
Expand Down Expand Up @@ -1227,13 +1303,13 @@ def keyspace_available():
def wait_jmx_up(self, verbose=True, timeout=None):
    """
    Wait, through `wait.wait_for`, on `self.jmx_up` (step=60, throw_exc=True).

    :param verbose: when True, a progress message naming this node is passed to `wait.wait_for`.
    :param timeout: passed through to `wait.wait_for`.
    """
    # Only the node name goes into the message: formatting the node object itself would invoke
    # its `__str__` just to label a wait.
    text = '%s: Waiting for JMX service to be up' % self.name if verbose else None
    wait.wait_for(func=self.jmx_up, step=60, text=text, timeout=timeout, throw_exc=True)

def wait_jmx_down(self, verbose=True, timeout=None):
    """
    Wait, through `wait.wait_for`, until `self.jmx_up()` is falsy (step=60, throw_exc=True).

    :param verbose: when True, a progress message naming this node is passed to `wait.wait_for`.
    :param timeout: passed through to `wait.wait_for`.
    """
    # Only the node name goes into the message: formatting the node object itself would invoke
    # its `__str__` just to label a wait.
    text = '%s: Waiting for JMX service to be down' % self.name if verbose else None
    wait.wait_for(func=lambda: not self.jmx_up(), step=60, text=text, timeout=timeout, throw_exc=True)

@property
Expand Down Expand Up @@ -1265,7 +1341,7 @@ def _report_housekeeping_uuid(self, verbose=False):
def wait_db_up(self, verbose=True, timeout=3600):
text = None
if verbose:
text = '%s: Waiting for DB services to be up' % self
text = '%s: Waiting for DB services to be up' % self.name

wait.wait_for(func=self.db_up, step=60, text=text, timeout=timeout,
throw_exc=True, stop_event=self.stop_wait_db_up_event)
Expand All @@ -1284,7 +1360,7 @@ def is_manager_agent_up(self, port=None):
def wait_manager_agent_up(self, verbose=True, timeout=180):
    """
    Wait, through `wait.wait_for`, on `self.is_manager_agent_up` (step=10, throw_exc=True).

    :param verbose: when True, a progress message naming this node is passed to `wait.wait_for`.
    :param timeout: passed through to `wait.wait_for`.
    """
    # Only the node name goes into the message: formatting the node object itself would invoke
    # its `__str__` just to label a wait.
    text = '%s: Waiting for manager agent to be up' % self.name if verbose else None
    wait.wait_for(func=self.is_manager_agent_up, step=10, text=text, timeout=timeout, throw_exc=True)

def is_manager_server_up(self, port=None):
Expand All @@ -1301,7 +1377,7 @@ def is_manager_server_up(self, port=None):
def wait_manager_server_up(self, verbose=True, timeout=300, port=None):
text = None
if verbose:
text = '%s: Waiting for manager server to be up' % self
text = '%s: Waiting for manager server to be up' % self.name
try:
wait.wait_for(func=self.is_manager_server_up, port=port,
step=10, text=text, timeout=timeout, throw_exc=True)
Expand Down Expand Up @@ -1337,20 +1413,20 @@ def apt_running(self):
def wait_apt_not_running(self, verbose=True):
    """
    Wait, through `wait.wait_for`, until `self.apt_running()` is falsy (step=60, throw_exc=False).

    :param verbose: when True, a progress message naming this node is passed to `wait.wait_for`.
    """
    # Only the node name goes into the message: formatting the node object itself would invoke
    # its `__str__` just to label a wait.
    text = '%s: Waiting for apt to finish running in the background' % self.name if verbose else None
    wait.wait_for(func=lambda: not self.apt_running(), step=60,
                  text=text, throw_exc=False)

def wait_db_down(self, verbose=True, timeout=3600, check_interval=60):
    """
    Wait, through `wait.wait_for`, until `self.db_up()` is falsy (throw_exc=True).

    :param verbose: when True, a progress message naming this node is passed to `wait.wait_for`.
    :param timeout: passed through to `wait.wait_for`.
    :param check_interval: passed to `wait.wait_for` as `step`.
    """
    # Only the node name goes into the message: formatting the node object itself would invoke
    # its `__str__` just to label a wait.
    text = '%s: Waiting for DB services to be down' % self.name if verbose else None
    wait.wait_for(func=lambda: not self.db_up(), step=check_interval, text=text, timeout=timeout, throw_exc=True)

def wait_cs_installed(self, verbose=True):
    """
    Wait, through `wait.wait_for`, on `self.cs_installed` (step=60, throw_exc=False).

    :param verbose: when True, a progress message naming this node is passed to `wait.wait_for`.
    """
    # Only the node name goes into the message: formatting the node object itself would invoke
    # its `__str__` just to label a wait.
    text = '%s: Waiting for cassandra-stress' % self.name if verbose else None
    wait.wait_for(func=self.cs_installed, step=60,
                  text=text, throw_exc=False)

Expand Down Expand Up @@ -3200,9 +3276,11 @@ def get_datacenter_name_per_region(self, db_nodes=None):
datacenter_name_per_region = {}
for region, nodes in self.nodes_by_region(nodes=db_nodes).items():
if status := nodes[0].get_nodes_status():
datacenter_name_per_region[region] = status[nodes[0]]['dc']
# If `nodetool status` failed to get status for the node
if dc_name := status.get(nodes[0], {}).get('dc'):
datacenter_name_per_region[region] = dc_name
else:
LOGGER.error("Failed to get nodes status from node %s", nodes[0])
LOGGER.error("Failed to get nodes status from node %s", nodes[0].name)

return datacenter_name_per_region

Expand All @@ -3216,7 +3294,9 @@ def get_rack_names_per_datacenter_and_rack_idx(self, db_nodes: list[BaseNode] |

rack_names_mapping = {}
for (region, rack), nodes in self.nodes_by_racks_idx_and_regions(nodes=actual_db_nodes).items():
rack_names_mapping[(region, rack)] = status[nodes[0]]['rack']
# If `nodetool status` failed to get status for the node
if rack_name := status.get(nodes[0], {}).get('rack'):
rack_names_mapping[(region, rack)] = rack_name

return rack_names_mapping

Expand Down
4 changes: 2 additions & 2 deletions sdcm/cluster_aws.py
Expand Up @@ -439,12 +439,12 @@ def __str__(self):
else:
node_private_ip = self.private_ip_address

return 'Node %s [%s | %s%s] (seed: %s)' % (
return 'Node %s [%s | %s%s]%s' % (
self.name,
self.public_ip_address,
node_private_ip,
" | %s" % self.ipv6_ip_address if self.test_config.IP_SSH_CONNECTIONS == "ipv6" else "",
self.is_seed)
self._dc_info_str())

@property
def network_interfaces(self):
Expand Down
11 changes: 11 additions & 0 deletions sdcm/cluster_k8s/__init__.py
Expand Up @@ -51,6 +51,7 @@
from sdcm import sct_abs_path, cluster
from sdcm.cluster import DeadNode, ClusterNodesNotReady
from sdcm.provision.scylla_yaml.scylla_yaml import ScyllaYaml
from sdcm.sct_config import SCTConfiguration, init_and_verify_sct_config
from sdcm.test_config import TestConfig
from sdcm.db_stats import PrometheusDBStats
from sdcm.remote import LOCALRUNNER, NETWORK_EXCEPTIONS
Expand Down Expand Up @@ -1522,6 +1523,7 @@ def scylla_manager_cluster(self) -> 'ManagerPodCluser':
container='scylla-manager',
cluster_prefix='mgr-',
node_prefix='mgr-node-',
params=init_and_verify_sct_config(),
n_nodes=1
)

Expand Down Expand Up @@ -1764,6 +1766,15 @@ def __init__(self, name: str, parent_cluster: PodCluster, node_prefix: str = "no
dc_idx=dc_idx,
rack=rack)
self.k8s_cluster = self.parent_cluster.k8s_clusters[self.dc_idx]
self._rack_name = None

@property
def node_rack(self) -> str:
    """
    Rack name of this node, read from the pod's "scylla/rack" label and cached in `self._rack_name`.

    The label is looked up only while no truthy value is cached and `self._pod` is available;
    a pod without the label yields an empty string. Whatever `self._rack_name` holds afterwards
    is returned, so the result stays unset while there is no pod.
    """
    if self._rack_name:
        return self._rack_name

    pod = self._pod
    if pod:
        self._rack_name = pod.metadata.labels.get("scylla/rack", "")

    return self._rack_name

@cached_property
def pod_replace_timeout(self) -> int:
Expand Down
1 change: 1 addition & 0 deletions unit_tests/dummy_remote.py
Expand Up @@ -24,6 +24,7 @@
class DummyOutput:
    """Minimal command-result stand-in that exposes the given text as both `stdout` and `stderr`."""

    def __init__(self, stdout):
        # Both streams carry the same text.
        self.stdout = self.stderr = stdout


class DummyRemote:
Expand Down
10 changes: 9 additions & 1 deletion unit_tests/test_cluster.py
Expand Up @@ -28,6 +28,7 @@
import pytest
from invoke import Result

from sdcm import sct_config
from sdcm.cluster import BaseNode, BaseCluster, BaseMonitorSet, BaseScyllaCluster
from sdcm.db_log_reader import DbLogReader
from sdcm.sct_events import Severity
Expand All @@ -47,9 +48,12 @@ class DummyDbCluster(BaseCluster, BaseScyllaCluster): # pylint: disable=abstrac
# pylint: disable=super-init-not-called
def __init__(self, nodes, params=None):
self.nodes = nodes
self.params = params
self.params = params or sct_config.SCTConfiguration()
self.params["region_name"] = "test_region"
self.racks_count = 0
self.added_password_suffix = False
self.log = logging.getLogger(__name__)
self.node_type = "scylla-db"


class DummyDbLogReader(DbLogReader):
Expand All @@ -70,6 +74,7 @@ def node(self):
base_logdir=self.temp_dir,
ssh_login_info=dict(key_file='~/.ssh/scylla-test'),
)
dummy_node.parent_cluster = DummyDbCluster(nodes=[dummy_node])
dummy_node.init()
dummy_node.remoter = DummyRemote()
return dummy_node
Expand Down Expand Up @@ -283,6 +288,7 @@ def setUp(self):
parent_cluster=None,
base_logdir=self.temp_dir,
ssh_login_info=dict(key_file='~/.ssh/scylla-test'))
self.node.parent_cluster = DummyDbCluster([self.node])

def test_no_scylla_binary_rhel_like(self):
self.node.remoter = VersionDummyRemote(self, (
Expand Down Expand Up @@ -655,6 +661,7 @@ def test_base_node_cpuset(cat_results, expected_core_number):
base_logdir=tempfile.mkdtemp(),
ssh_login_info=dict(key_file='~/.ssh/scylla-test'),
)
dummy_node.parent_cluster = DummyDbCluster([dummy_node])
dummy_node.init()
cat_results_obj = type("FakeGrepResults", (), {
"stdout": f'#\n# some comment\nCPUSET="--cpuset {cat_results} "'
Expand Down Expand Up @@ -683,6 +690,7 @@ def test_base_node_cpuset_not_configured(cat_results):
base_logdir=tempfile.mkdtemp(),
ssh_login_info=dict(key_file='~/.ssh/scylla-test'),
)
dummy_node.parent_cluster = DummyDbCluster([dummy_node])
dummy_node.init()
cat_results_obj = type("FakeCatResults", (), {"stdout": cat_results})
dummy_node.remoter = type("FakeRemoter", (), {
Expand Down
2 changes: 2 additions & 0 deletions unit_tests/test_scylla_yaml_builders.py
Expand Up @@ -409,6 +409,8 @@ def __init__(self, params): # pylint: disable=super-init-not-called
self.nodes = []
self.params = params
self.name = 'dummy_cluster'
self.node_type = "scylla-db"
self.racks_count = 0

@property
def seed_nodes_addresses(self):
Expand Down
12 changes: 8 additions & 4 deletions unit_tests/test_seed_selector.py
Expand Up @@ -5,6 +5,7 @@
import os.path

import sdcm.cluster
from sdcm import sct_config
from sdcm.test_config import TestConfig
from unit_tests.dummy_remote import DummyRemote

Expand Down Expand Up @@ -34,12 +35,15 @@ def is_nonroot_install(self): # pylint: disable=invalid-overridden-method

class DummyCluster(sdcm.cluster.BaseScyllaCluster):
    """Scylla cluster stub for the seed selector tests; `params` is a real `SCTConfiguration`."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Attributes are set once, after the base initializer, so nothing assigned here is
        # overwritten later in this method.
        self.params = sct_config.SCTConfiguration()
        self.params["region_name"] = "test_region"
        self.racks_count = 0
        self.nodes = []
        self.node_type = "scylla-db"

    def set_test_params(self, seeds_selector, seeds_num, db_type):
        """Set the seed-related test parameters on the existing configuration object."""
        # Update in place: rebinding `self.params` to a plain dict would drop "region_name" and
        # change the type of `params` away from `SCTConfiguration`.
        self.params.update({'seeds_selector': seeds_selector, 'seeds_num': seeds_num, 'db_type': db_type})


logging.basicConfig(format="%(asctime)s - %(levelname)-8s - %(name)-10s: %(message)s", level=logging.DEBUG)
Expand All @@ -61,7 +65,7 @@ def setup_cluster(self, nodes_number):
self.cluster = DummyCluster()
# Add 3 nodes
for i in range(1, nodes_number+1):
self.cluster.nodes.append(DummyNode(name='node%d' % i, parent_cluster=None,
self.cluster.nodes.append(DummyNode(name='node%d' % i, parent_cluster=self.cluster,
base_logdir=self.temp_dir,
ssh_login_info=dict(key_file='~/.ssh/scylla-test')))
for node in self.cluster.nodes:
Expand Down
16 changes: 15 additions & 1 deletion unit_tests/test_utils_common.py
Expand Up @@ -19,7 +19,8 @@
import unittest.mock
from pathlib import Path

from sdcm.cluster import BaseNode
from sdcm import sct_config
from sdcm.cluster import BaseNode, BaseCluster, BaseScyllaCluster
from sdcm.utils.distro import Distro
from sdcm.utils.common import convert_metric_to_ms, download_dir_from_cloud
from sdcm.utils.sstable import load_inventory
Expand Down Expand Up @@ -99,6 +100,18 @@ def run(self, *args, **kwargs): # pylint: disable=unused-argument
file.write(f"{line}\n")


class DummyDbCluster(BaseCluster, BaseScyllaCluster):  # pylint: disable=abstract-method
    """DB cluster stub: sets a fixed group of attributes without running the base initializers."""

    # pylint: disable=super-init-not-called
    def __init__(self, nodes):
        self.nodes = nodes

        # A real configuration object with a single, fixed region name.
        config = sct_config.SCTConfiguration()
        config["region_name"] = "test_region"
        self.params = config

        self.racks_count = 0
        self.added_password_suffix = False
        self.log = logging.getLogger(__name__)
        self.node_type = "scylla-db"


class DummyNode(BaseNode): # pylint: disable=abstract-method
_system_log = None
is_enterprise = False
Expand Down Expand Up @@ -157,6 +170,7 @@ class TestSstableLoadUtils(unittest.TestCase):
def setUpClass(cls):
cls.node = DummyNode(name='test_node', parent_cluster=None,
base_logdir=cls.temp_dir, ssh_login_info=dict(key_file='~/.ssh/scylla-test'))
cls.node.parent_cluster = DummyDbCluster([cls.node])
cls.node.init()

def setUp(self):
Expand Down

0 comments on commit 1f38a63

Please sign in to comment.