From 689cbf622d31a549dc915bcb3ac10a55e78cfc79 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Tue, 18 Aug 2015 14:42:18 -0400 Subject: [PATCH] Flush before kill on Windows. Add retry on deletion in cluster.remove() Reference CASSANDRA-10075: The retry is an attempt to remedy an intermittent failure in dtests where commitlogs cannot be deleted during cluster stop. Flush before kill is an attempt to address a class of problems that has shown up in various dtests while stabilizing Windows tests for 2.2. --- ccmlib/cluster.py | 21 ++++++++++++++++++--- ccmlib/node.py | 6 ++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/ccmlib/cluster.py b/ccmlib/cluster.py index 354af42d..48be942c 100644 --- a/ccmlib/cluster.py +++ b/ccmlib/cluster.py @@ -209,7 +209,7 @@ def balanced_tokens_across_dcs(self, dcs): def remove(self, node=None): if node is not None: - if not node.name in self.nodes: + if node.name not in self.nodes: return del self.nodes[node.name] @@ -217,10 +217,25 @@ def remove(self, node=None): self.seeds.remove(node) self._update_config() node.stop(gently=False) - common.rmdirs(node.get_path()) + self.remove_dir_with_retry(node.get_path()) else: self.stop(gently=False) - common.rmdirs(self.get_path()) + self.remove_dir_with_retry(self.get_path()) + + # We can race w/shutdown on Windows and get Access is denied attempting to delete node logs. + # see CASSANDRA-10075 + def remove_dir_with_retry(self, path): + tries = 0 + removed = False + while removed is False: + try: + common.rmdirs(path) + removed = True + except Exception as e: + tries = tries + 1 + time.sleep(.1) + if tries == 5: + raise e def clear(self): self.stop() diff --git a/ccmlib/node.py b/ccmlib/node.py index d7d96198..ac19c9e8 100644 --- a/ccmlib/node.py +++ b/ccmlib/node.py @@ -561,6 +561,12 @@ def stop(self, wait=True, wait_other_notice=False, gently=True): # We have recurring issues with nodes not stopping / releasing files in the CI # environment so it makes more sense just to murder it hard since there's # really little downside. + + # We want the node to flush its data before shutdown as some tests rely on small writes being present. + # The default Periodic sync at 10 ms may not have flushed data yet, causing tests to fail. + if gently is True: + self.flush() + os.system("taskkill /F /PID " + str(self.pid)) if self._find_pid_on_windows(): print_("WARN: Failed to terminate node: {0} with pid: {1}".format(self.name, self.pid))