From 5d1e6575b3848cefe1d095834d968e940d8b5678 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Mon, 26 Nov 2018 16:45:24 +0100 Subject: [PATCH 1/3] Check if 'UnavailableNodes' are down to cancel job * Use scontrol to retrieve the state of the nodes reported as unavailable and cancel the job if they are down. * Add methods `in_state` and `is_down` to `SlurmNode` class. * Add unittests for the above methods. --- reframe/core/schedulers/slurm.py | 28 ++++++++++++++++++++++++---- unittests/test_schedulers.py | 11 +++++++++++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index d70c19332e..7da6d76fdd 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -293,11 +293,24 @@ def _check_and_cancel(self, reason_descr): reason, reason_details = reason_descr, None if reason in self._cancel_reasons: - # Here we handle the case were the UnavailableNodes list is empty, - # which actually means that the job is pending if reason == 'ReqNodeNotAvail' and reason_details: - if re.match(r'UnavailableNodes:$', reason_details.strip()): - return + nodes_match = re.match( + r'UnavailableNodes:(?P\S+)?', + reason_details.strip()) + if nodes_match: + node_names = node_match.group('node_names') + if nodes_names: + # Here we retrieve the info of the unavailable nodes + # and check if they are indeed down + nodes = self._get_nodes_by_name(node_names) + down_nodes = [n for n in nodes if n.is_down()] + if not down_nodes: + return + else: + # Here we handle the case where the UnavailableNodes + # list is empty which actually means that the job + # is pending + return self.cancel() reason_msg = ('job cancelled because it was blocked due to ' @@ -421,6 +434,13 @@ def __hash__(self): def is_available(self): return self._state == 'IDLE' + def is_down(self): + return any( + self.in_state(s) for s in ['DOWN', 'DRAIN', 'MAINT', 'NO_RESPOND']) + + def in_state(self, state): + return state in self._state + @property def active_features(self): return self._active_features diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index 8793c26a29..54302b1d61 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -802,7 +802,18 @@ def test_attributes(self): def test_str(self): self.assertEqual('nid00001', str(self.allocated_node)) + def test_in_state(self): + self.assertTrue(self.allocated_node.in_state('ALLOCATED')) + self.assertFalse(self.idle_node.in_state('MAINT')) + self.assertTrue(self.idle_drained.in_state('IDLE')) + self.assertTrue(self.idle_drained.in_state('DRAIN')) + def test_is_available(self): self.assertFalse(self.allocated_node.is_available()) self.assertTrue(self.idle_node.is_available()) self.assertFalse(self.idle_drained.is_available()) + + def test_is_down(self): + self.assertFalse(self.allocated_node.is_down()) + self.assertFalse(self.idle_node.is_down()) + self.assertTrue(self.idle_drained.is_down()) From f0be29988d637f2d40616e4f857648f87bc77451 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Tue, 27 Nov 2018 09:07:52 +0100 Subject: [PATCH 2/3] Address PR comments --- reframe/core/schedulers/slurm.py | 24 +++++++++++------------- unittests/test_schedulers.py | 8 ++++---- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 7da6d76fdd..6a5c6bdf4d 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -298,17 +298,15 @@ def _check_and_cancel(self, reason_descr): r'UnavailableNodes:(?P\S+)?', reason_details.strip()) if nodes_match: - node_names = node_match.group('node_names') + node_names = node_match['node_names'] if nodes_names: - # Here we retrieve the info of the unavailable nodes + # Retrieve the info of the unavailable nodes # and check if they are indeed down nodes = self._get_nodes_by_name(node_names) - down_nodes = [n for n in nodes if n.is_down()] - if not down_nodes: + if not any(n.is_down() for n in nodes): return else: - # Here we handle the case where the UnavailableNodes - # list is empty which actually means that the job + # List of unavailable nodes is empty; assume job # is pending return @@ -420,7 +418,8 @@ def __init__(self, node_descr): 'Partitions', node_descr).split(',')) self._active_features = set(self._extract_attribute( 'ActiveFeatures', node_descr).split(',')) - self._state = self._extract_attribute('State', node_descr) + self._states = set( + self._extract_attribute('State', node_descr).split('+')) def __eq__(self, other): if not isinstance(other, type(self)): @@ -432,14 +431,13 @@ def __hash__(self): return hash(self.name) def is_available(self): - return self._state == 'IDLE' + return self._states == {'IDLE'} def is_down(self): - return any( - self.in_state(s) for s in ['DOWN', 'DRAIN', 'MAINT', 'NO_RESPOND']) + return bool({'DOWN', 'DRAIN', 'MAINT', 'NO_RESPOND'} & self._states) def in_state(self, state): - return state in self._state + return state in self._states @property def active_features(self): @@ -454,8 +452,8 @@ def partitions(self): return self._partitions @property - def state(self): - return self._state + def states(self): + return self._states def _extract_attribute(self, attr_name, node_descr): attr_match = re.search(r'%s=(\S+)' % attr_name, node_descr) diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index 54302b1d61..c1e9f625d2 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -779,10 +779,10 @@ def setUp(self): self.idle_node = SlurmNode(idle_node_description) self.idle_drained = SlurmNode(idle_drained_node_description) - def test_state(self): - self.assertEqual(self.allocated_node.state, 'ALLOCATED') - self.assertEqual(self.idle_node.state, 'IDLE') - self.assertEqual(self.idle_drained.state, 'IDLE+DRAIN') + def test_states(self): + self.assertEqual(self.allocated_node.states, {'ALLOCATED'}) + self.assertEqual(self.idle_node.states, {'IDLE'}) + self.assertEqual(self.idle_drained.states, {'IDLE', 'DRAIN'}) def test_equals(self): self.assertEqual(self.allocated_node, self.allocated_node_copy) From fc27b47027bb8923d62c21195e8b90dc4b374af3 Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Mon, 3 Dec 2018 08:53:36 +0100 Subject: [PATCH 3/3] Address PR comments (version 2) --- reframe/core/schedulers/slurm.py | 3 --- unittests/test_schedulers.py | 6 ------ 2 files changed, 9 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 6a5c6bdf4d..e45d0e6ead 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -436,9 +436,6 @@ def is_available(self): def is_down(self): return bool({'DOWN', 'DRAIN', 'MAINT', 'NO_RESPOND'} & self._states) - def in_state(self, state): - return state in self._states - @property def active_features(self): return self._active_features diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index c1e9f625d2..13f05270f5 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -802,12 +802,6 @@ def test_attributes(self): def test_str(self): self.assertEqual('nid00001', str(self.allocated_node)) - def test_in_state(self): - self.assertTrue(self.allocated_node.in_state('ALLOCATED')) - self.assertFalse(self.idle_node.in_state('MAINT')) - self.assertTrue(self.idle_drained.in_state('IDLE')) - self.assertTrue(self.idle_drained.in_state('DRAIN')) - def test_is_available(self): self.assertFalse(self.allocated_node.is_available()) self.assertTrue(self.idle_node.is_available())