Skip to content

Commit

Permalink
Flush before kill on Windows. Add retry on deletion in cluster.remove()
Browse files Browse the repository at this point in the history
Reference CASSANDRA-10075: The retry is an attempt to remedy an intermittent
failure in dtests where commitlogs cannot be deleted during cluster stop.
Flush before kill is an attempt to address a class of problems that has
shown up in various dtests while stabilizing Windows tests for 2.2.
  • Loading branch information
Josh McKenzie committed Aug 18, 2015
1 parent d1bec4e commit 689cbf6
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 3 deletions.
21 changes: 18 additions & 3 deletions ccmlib/cluster.py
Expand Up @@ -209,18 +209,33 @@ def balanced_tokens_across_dcs(self, dcs):

def remove(self, node=None):
if node is not None:
if not node.name in self.nodes:
if node.name not in self.nodes:
return

del self.nodes[node.name]
if node in self.seeds:
self.seeds.remove(node)
self._update_config()
node.stop(gently=False)
common.rmdirs(node.get_path())
self.remove_dir_with_retry(node.get_path())
else:
self.stop(gently=False)
common.rmdirs(self.get_path())
self.remove_dir_with_retry(self.get_path())

# We can race with shutdown on Windows and get "Access is denied" when attempting to delete node logs.
# see CASSANDRA-10075
def remove_dir_with_retry(self, path):
    """Delete the directory tree at ``path``, retrying on failure.

    ``common.rmdirs(path)`` is attempted up to 5 times with a 0.1 second
    pause between attempts. If the final attempt also fails, its exception
    is re-raised to the caller.
    """
    max_tries = 5
    for attempt in range(1, max_tries + 1):
        try:
            common.rmdirs(path)
            return
        except Exception:
            # Out of attempts: propagate immediately. A bare ``raise`` keeps
            # the original traceback, and there is no point sleeping again
            # when no further attempt will be made.
            if attempt == max_tries:
                raise
            time.sleep(.1)

def clear(self):
self.stop()
Expand Down
6 changes: 6 additions & 0 deletions ccmlib/node.py
Expand Up @@ -561,6 +561,12 @@ def stop(self, wait=True, wait_other_notice=False, gently=True):
# We have recurring issues with nodes not stopping / releasing files in the CI
# environment so it makes more sense just to murder it hard since there's
# really little downside.

# We want the node to flush its data before shutdown as some tests rely on small writes being present.
# The default Periodic sync at 10 ms may not have flushed data yet, causing tests to fail.
if gently is True:
self.flush()

os.system("taskkill /F /PID " + str(self.pid))
if self._find_pid_on_windows():
print_("WARN: Failed to terminate node: {0} with pid: {1}".format(self.name, self.pid))
Expand Down

0 comments on commit 689cbf6

Please sign in to comment.