Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(group_common_events): filter expected raft_topology error messages #7452

Merged
merged 1 commit into from
May 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@
ignore_no_space_errors,
ignore_scrub_invalid_errors,
ignore_stream_mutation_fragments_errors,
ignore_raft_topology_cmd_failing,
ignore_ycsb_connection_refused,
decorate_with_context,
ignore_reactor_stall_errors,
Expand Down Expand Up @@ -3853,7 +3854,7 @@ def decommission_post_action():
for expected_start_failed_context in self.target_node.raft.get_severity_change_filters_scylla_start_failed(
terminate_pattern.timeout):
stack.enter_context(expected_start_failed_context)
with ignore_stream_mutation_fragments_errors():
with ignore_stream_mutation_fragments_errors(), ignore_raft_topology_cmd_failing():
ParallelObject(objects=[trigger, watcher], timeout=full_operations_timeout).call_objects()
if new_node := decommission_post_action():
new_node.wait_node_fully_start()
Expand Down Expand Up @@ -3932,21 +3933,22 @@ def disrupt_decommission_streaming_err(self):
"This nemesis logic is not compatible with K8S approach "
"for handling Scylla member's decommissioning.")

with ignore_stream_mutation_fragments_errors():
with ignore_stream_mutation_fragments_errors(), ignore_raft_topology_cmd_failing():
self.start_and_interrupt_decommission_streaming()

def disrupt_rebuild_streaming_err(self):
"""
Stop rebuild in middle to trigger some streaming fails, then rebuild the data on the node.
"""
with ignore_stream_mutation_fragments_errors():
with ignore_stream_mutation_fragments_errors(), ignore_raft_topology_cmd_failing():
self.start_and_interrupt_rebuild_streaming()

def disrupt_repair_streaming_err(self):
"""
Stop repair in middle to trigger some streaming fails, then rebuild the data on the node.
"""
self.start_and_interrupt_repair_streaming()
with ignore_raft_topology_cmd_failing():
self.start_and_interrupt_repair_streaming()

def _corrupt_data_file(self):
"""Randomly corrupt data file by dd"""
Expand Down
36 changes: 36 additions & 0 deletions sdcm/sct_events/group_common_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,42 @@ def ignore_stream_mutation_fragments_errors():
yield


@contextmanager
def ignore_raft_topology_cmd_failing():
    """
    Re-grade expected raft_topology command failures to WARNING severity.

    For the duration of the ``with`` body, one EventsSeverityChangerFilter is
    active per pattern listed below; each filter targets DatabaseLogEvent and
    sets ``new_severity=Severity.WARNING`` for lines matching its regex.
    """
    # One filter is registered per pattern, in the order listed here.
    expected_failure_patterns = (
        r".*raft_topology - raft_topology_cmd failed with: seastar::abort_requested_exception \(abort requested\)",
        r".*raft_topology - raft_topology_cmd failed with: raft::request_aborted \(Request is aborted by a caller\)",
        r".*raft_topology - send_raft_topology_cmd\(stream_ranges\) failed with exception \(node state is decommissioning\)",
        r".*raft_topology - send_raft_topology_cmd\(stream_ranges\) failed with exception \(node state is rebuilding\)",
        r".*raft_topology - send_raft_topology_cmd\(stream_ranges\) failed with exception \(node state is replacing\)",
    )
    with ExitStack() as active_filters:
        for pattern in expected_failure_patterns:
            severity_filter = EventsSeverityChangerFilter(
                new_severity=Severity.WARNING,
                event_class=DatabaseLogEvent,
                regex=pattern,
                # NOTE(review): presumably a grace period (seconds?) that keeps the
                # filter effective after the context exits - confirm against
                # EventsSeverityChangerFilter.
                extra_time_to_expiration=30,
            )
            active_filters.enter_context(severity_filter)
        # Hand control to the caller while every filter is still entered.
        yield


def decorate_with_context(context_list: list[Callable | ContextManager] | Callable | ContextManager):
"""
helper to decorate a function to run with a list of callables that return context managers
Expand Down
8 changes: 5 additions & 3 deletions sdcm/utils/raft/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sdcm.cluster import BaseNode, BaseScyllaCluster, BaseMonitorSet
from sdcm.wait import wait_for
from sdcm.sct_events.group_common_events import decorate_with_context, \
ignore_stream_mutation_fragments_errors, ignore_ycsb_connection_refused
ignore_stream_mutation_fragments_errors, ignore_ycsb_connection_refused, ignore_raft_topology_cmd_failing
from sdcm.utils.adaptive_timeouts import Operations, adaptive_timeout
from sdcm.utils.common import ParallelObject

Expand Down Expand Up @@ -117,7 +117,7 @@ def run_bootstrap_and_abort_with_action(self, terminate_pattern, abort_action: C

wait_operations_timeout = (self.SUCCESS_BOOTSTRAP_TIMEOUT + self.INSTANCE_START_TIMEOUT
+ terminate_pattern.timeout + abort_action_timeout)
with ignore_stream_mutation_fragments_errors(), contextlib.ExitStack() as stack:
with ignore_stream_mutation_fragments_errors(), ignore_raft_topology_cmd_failing(), contextlib.ExitStack() as stack:
for expected_start_failed_context in self.verification_node.raft.get_severity_change_filters_scylla_start_failed(
terminate_pattern.timeout):
stack.enter_context(expected_start_failed_context)
Expand All @@ -142,7 +142,9 @@ def clean_and_restart_bootstrap_after_abort(self):
self.verification_node.raft.clean_group0_garbage(raise_exception=True)
LOGGER.debug("Clean old scylla data and restart scylla service")
self.bootstrap_node.clean_scylla_data()
with adaptive_timeout(operation=Operations.NEW_NODE, node=self.verification_node, timeout=3600) as bootstrap_timeout:
with ignore_raft_topology_cmd_failing(), \
adaptive_timeout(operation=Operations.NEW_NODE, node=self.verification_node, timeout=3600) as bootstrap_timeout:

self.bootstrap_node.start_scylla_server(verify_up_timeout=bootstrap_timeout, verify_down=True)
self.bootstrap_node.start_scylla_jmx()
self.db_cluster.check_nodes_up_and_normal(
Expand Down