diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index d70c19332e..e45d0e6ead 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -293,11 +293,22 @@ def _check_and_cancel(self, reason_descr): reason, reason_details = reason_descr, None if reason in self._cancel_reasons: - # Here we handle the case were the UnavailableNodes list is empty, - # which actually means that the job is pending if reason == 'ReqNodeNotAvail' and reason_details: - if re.match(r'UnavailableNodes:$', reason_details.strip()): - return + nodes_match = re.match( + r'UnavailableNodes:(?P<node_names>\S+)?', + reason_details.strip()) + if nodes_match: + node_names = nodes_match['node_names'] + if node_names: + # Retrieve the info of the unavailable nodes + # and check if they are indeed down + nodes = self._get_nodes_by_name(node_names) + if not any(n.is_down() for n in nodes): + return + else: + # List of unavailable nodes is empty; assume job + # is pending + return self.cancel() reason_msg = ('job cancelled because it was blocked due to ' @@ -407,7 +418,8 @@ def __init__(self, node_descr): 'Partitions', node_descr).split(',')) self._active_features = set(self._extract_attribute( 'ActiveFeatures', node_descr).split(',')) - self._state = self._extract_attribute('State', node_descr) + self._states = set( + self._extract_attribute('State', node_descr).split('+')) def __eq__(self, other): if not isinstance(other, type(self)): @@ -419,7 +431,10 @@ def __hash__(self): return hash(self.name) def is_available(self): - return self._state == 'IDLE' + return self._states == {'IDLE'} + + def is_down(self): + return bool({'DOWN', 'DRAIN', 'MAINT', 'NO_RESPOND'} & self._states) @property def active_features(self): @@ -434,8 +449,8 @@ def partitions(self): return self._partitions @property - def state(self): - return self._state + def states(self): + return self._states def _extract_attribute(self, attr_name, node_descr): attr_match = re.search(r'%s=(\S+)' 
% attr_name, node_descr) diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index 8793c26a29..13f05270f5 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -779,10 +779,10 @@ def setUp(self): self.idle_node = SlurmNode(idle_node_description) self.idle_drained = SlurmNode(idle_drained_node_description) - def test_state(self): - self.assertEqual(self.allocated_node.state, 'ALLOCATED') - self.assertEqual(self.idle_node.state, 'IDLE') - self.assertEqual(self.idle_drained.state, 'IDLE+DRAIN') + def test_states(self): + self.assertEqual(self.allocated_node.states, {'ALLOCATED'}) + self.assertEqual(self.idle_node.states, {'IDLE'}) + self.assertEqual(self.idle_drained.states, {'IDLE', 'DRAIN'}) def test_equals(self): self.assertEqual(self.allocated_node, self.allocated_node_copy) @@ -806,3 +806,8 @@ def test_is_available(self): self.assertFalse(self.allocated_node.is_available()) self.assertTrue(self.idle_node.is_available()) self.assertFalse(self.idle_drained.is_available()) + + def test_is_down(self): + self.assertFalse(self.allocated_node.is_down()) + self.assertFalse(self.idle_node.is_down()) + self.assertTrue(self.idle_drained.is_down())