Skip to content

Commit

Permalink
fix(group_common_events): filter expected raft_topology error messages
Browse files Browse the repository at this point in the history
Add a common group_events context manager that changes the severity
of expected error log messages to warning when a topology operation
fails or is aborted by request

Fixes: #7426, #7425
  • Loading branch information
aleksbykov committed May 28, 2024
1 parent 40ebca9 commit 52f5e66
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 7 deletions.
10 changes: 6 additions & 4 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
ignore_no_space_errors,
ignore_scrub_invalid_errors,
ignore_stream_mutation_fragments_errors,
ignore_raft_topology_cmd_failing,
ignore_ycsb_connection_refused,
decorate_with_context,
ignore_reactor_stall_errors,
Expand Down Expand Up @@ -3853,7 +3854,7 @@ def decommission_post_action():
for expected_start_failed_context in self.target_node.raft.get_severity_change_filters_scylla_start_failed(
terminate_pattern.timeout):
stack.enter_context(expected_start_failed_context)
with ignore_stream_mutation_fragments_errors():
with ignore_stream_mutation_fragments_errors(), ignore_raft_topology_cmd_failing():
ParallelObject(objects=[trigger, watcher], timeout=full_operations_timeout).call_objects()
if new_node := decommission_post_action():
new_node.wait_node_fully_start()
Expand Down Expand Up @@ -3932,21 +3933,22 @@ def disrupt_decommission_streaming_err(self):
"This nemesis logic is not compatible with K8S approach "
"for handling Scylla member's decommissioning.")

with ignore_stream_mutation_fragments_errors():
with ignore_stream_mutation_fragments_errors(), ignore_raft_topology_cmd_failing():
self.start_and_interrupt_decommission_streaming()

def disrupt_rebuild_streaming_err(self):
"""
Stop rebuild in middle to trigger some streaming fails, then rebuild the data on the node.
"""
with ignore_stream_mutation_fragments_errors():
with ignore_stream_mutation_fragments_errors(), ignore_raft_topology_cmd_failing():
self.start_and_interrupt_rebuild_streaming()

def disrupt_repair_streaming_err(self):
"""
Stop repair in middle to trigger some streaming fails, then rebuild the data on the node.
"""
self.start_and_interrupt_repair_streaming()
with ignore_raft_topology_cmd_failing():
self.start_and_interrupt_repair_streaming()

def _corrupt_data_file(self):
"""Randomly corrupt data file by dd"""
Expand Down
36 changes: 36 additions & 0 deletions sdcm/sct_events/group_common_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,42 @@ def ignore_stream_mutation_fragments_errors():
yield


@contextmanager
def ignore_raft_topology_cmd_failing():
    """Lower the severity of expected raft_topology command failures to WARNING.

    While the context is active, one EventsSeverityChangerFilter per pattern
    below is entered for DatabaseLogEvent, each with new_severity set to
    Severity.WARNING and extra_time_to_expiration=30. All filters are
    released together when the enclosing ExitStack unwinds.

    The patterns cover:
      * raft_topology_cmd failing with an abort-requested / request-aborted error;
      * send_raft_topology_cmd(stream_ranges) failing while the node state is
        decommissioning, rebuilding or replacing.
    """
    # Order is kept so the filters are entered in the same sequence as before.
    expected_failure_patterns = (
        r".*raft_topology - raft_topology_cmd failed with: seastar::abort_requested_exception \(abort requested\)",
        r".*raft_topology - raft_topology_cmd failed with: raft::request_aborted \(Request is aborted by a caller\)",
        r".*raft_topology - send_raft_topology_cmd\(stream_ranges\) failed with exception \(node state is decommissioning\)",
        r".*raft_topology - send_raft_topology_cmd\(stream_ranges\) failed with exception \(node state is rebuilding\)",
        r".*raft_topology - send_raft_topology_cmd\(stream_ranges\) failed with exception \(node state is replacing\)",
    )
    with ExitStack() as active_filters:
        for pattern in expected_failure_patterns:
            severity_filter = EventsSeverityChangerFilter(
                new_severity=Severity.WARNING,
                event_class=DatabaseLogEvent,
                regex=pattern,
                extra_time_to_expiration=30,
            )
            active_filters.enter_context(severity_filter)
        yield


def decorate_with_context(context_list: list[Callable | ContextManager] | Callable | ContextManager):
"""
helper to decorate a function to run with a list of callables that return context managers
Expand Down
8 changes: 5 additions & 3 deletions sdcm/utils/raft/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sdcm.cluster import BaseNode, BaseScyllaCluster, BaseMonitorSet
from sdcm.wait import wait_for
from sdcm.sct_events.group_common_events import decorate_with_context, \
ignore_stream_mutation_fragments_errors, ignore_ycsb_connection_refused
ignore_stream_mutation_fragments_errors, ignore_ycsb_connection_refused, ignore_raft_topology_cmd_failing
from sdcm.utils.adaptive_timeouts import Operations, adaptive_timeout
from sdcm.utils.common import ParallelObject

Expand Down Expand Up @@ -117,7 +117,7 @@ def run_bootstrap_and_abort_with_action(self, terminate_pattern, abort_action: C

wait_operations_timeout = (self.SUCCESS_BOOTSTRAP_TIMEOUT + self.INSTANCE_START_TIMEOUT
+ terminate_pattern.timeout + abort_action_timeout)
with ignore_stream_mutation_fragments_errors(), contextlib.ExitStack() as stack:
with ignore_stream_mutation_fragments_errors(), ignore_raft_topology_cmd_failing(), contextlib.ExitStack() as stack:
for expected_start_failed_context in self.verification_node.raft.get_severity_change_filters_scylla_start_failed(
terminate_pattern.timeout):
stack.enter_context(expected_start_failed_context)
Expand All @@ -142,7 +142,9 @@ def clean_and_restart_bootstrap_after_abort(self):
self.verification_node.raft.clean_group0_garbage(raise_exception=True)
LOGGER.debug("Clean old scylla data and restart scylla service")
self.bootstrap_node.clean_scylla_data()
with adaptive_timeout(operation=Operations.NEW_NODE, node=self.verification_node, timeout=3600) as bootstrap_timeout:
with ignore_raft_topology_cmd_failing(), \
adaptive_timeout(operation=Operations.NEW_NODE, node=self.verification_node, timeout=3600) as bootstrap_timeout:

self.bootstrap_node.start_scylla_server(verify_up_timeout=bootstrap_timeout, verify_down=True)
self.bootstrap_node.start_scylla_jmx()
self.db_cluster.check_nodes_up_and_normal(
Expand Down

0 comments on commit 52f5e66

Please sign in to comment.