diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index a4b5186199..ac051bce2d 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -20,7 +20,7 @@ JobBlockedError, JobError, JobSchedulerError) -from reframe.utility import nodelist_abbrev, seconds_to_hms +from reframe.utility import nodelist_abbrev, nodelist_expand, seconds_to_hms def slurm_state_completed(state): @@ -73,6 +73,18 @@ def __init__(self, *args, **kwargs): self._is_array = False self._is_cancelling = False + # The compacted nodelist as reported by Slurm. This must be updated in + # every poll as Slurm may be slow in reporting the exact nodelist + self._nodespec = None + + @property + def nodelist(self): + # Redefine nodelist so as to generate it from the nodespec + if self._nodelist is None and self._nodespec is not None: + self._nodelist = nodelist_expand(self._nodespec) + + return self._nodelist + @property def is_array(self): return self._is_array @@ -380,13 +392,6 @@ def _get_nodes_by_name(self, nodespec): node_descriptions = completed.stdout.splitlines() return _create_nodes(node_descriptions) - def _update_nodelist(self, job, nodespec): - if job.nodelist is not None: - return - - if nodespec and nodespec != 'None assigned': - job._nodelist = [n.name for n in self._get_nodes_by_name(nodespec)] - def _update_completion_time(self, job, timestamps): if job._completion_time is not None: return @@ -460,9 +465,7 @@ def poll(self, *jobs): ) # Use ',' to join nodes to be consistent with Slurm syntax - self._update_nodelist( - job, ','.join(m.group('nodespec') for m in jobarr_info) - ) + job._nodespec = ','.join(m.group('nodespec') for m in jobarr_info) self._update_completion_time( job, (m.group('end') for m in jobarr_info) ) diff --git a/reframe/utility/__init__.py b/reframe/utility/__init__.py index 2bcd792c1f..aa678bce4a 100644 --- a/reframe/utility/__init__.py +++ b/reframe/utility/__init__.py @@ -925,6 +925,43 @@ def 
def nodelist_expand(nodespec):
    '''Expand the nodes in ``nodespec`` to a list of nodes.

    A node specification is a comma-separated list of items. Each item is
    either a plain node name (e.g., ``nid001``), which is returned as is, or
    a prefix followed by a bracketed inclusive numeric range (e.g.,
    ``nid0[05-19]``). The numbers generated from a range are zero-padded to
    the width of the upper bound as it is written in the specification, so
    that ``nid[001-010]`` expands to ``nid001`` ... ``nid010``.

    .. note::
       Only a single ``low-high`` range per item is understood; since items
       are split on commas, comma-separated lists inside brackets are
       rejected. Any text following the closing bracket of the matched
       range is ignored.

    :arg nodespec: A node specification as the one returned by
        :func:`nodelist_abbrev`
    :returns: The list of nodes corresponding to the given node specification.
    :raises TypeError: if ``nodespec`` is not a string.
    :raises ValueError: if an item containing a bracket does not start with
        ``<prefix>[<low>-<high>]``.

    .. versionadded:: 4.0.0
    '''

    if not isinstance(nodespec, str):
        raise TypeError('nodespec argument must be a string')

    if nodespec == '':
        return []

    # The group names must agree with the `match.group()` lookups below
    node_patt = re.compile(r'(?P<prefix>.+)\[(?P<l>\d+)-(?P<u>\d+)\]')
    nodes = []
    for ns in nodespec.split(','):
        if '[' not in ns and ']' not in ns:
            # Plain node name; nothing to expand
            nodes.append(ns)
            continue

        match = node_patt.match(ns)
        if not match:
            raise ValueError(f'invalid nodespec: {nodespec}')

        prefix = match.group('prefix')
        upper_text = match.group('u')
        low, upper = int(match.group('l')), int(upper_text)

        # Pad to the textual width of the upper bound, not to the number of
        # digits of its integer value, so that zero-padded upper bounds
        # (e.g., `[001-010]`) keep their leading zeros
        width = len(upper_text)
        nodes.extend(
            f'{prefix}{nid:0{width}}' for nid in range(low, upper + 1)
        )

    return nodes
diff --git a/unittests/test_utility.py b/unittests/test_utility.py index f105a9ce32..4e9f8fe728 100644 --- a/unittests/test_utility.py +++ b/unittests/test_utility.py @@ -1761,7 +1761,7 @@ def bar(x, y): util.is_trivially_callable(1) -def test_nodelist_abbrev(): +def test_nodelist_utilities(): nid_nodes = [f'nid{n:03}' for n in range(5, 20)] cid_nodes = [f'cid{n:03}' for n in range(20)] @@ -1776,12 +1776,20 @@ def test_nodelist_abbrev(): random.shuffle(all_nodes) nodelist = util.nodelist_abbrev + expand = util.nodelist_expand assert nodelist(nid_nodes) == 'nid00[1-2],nid0[05-19],nid125' assert nodelist(cid_nodes) == 'cid0[00-19],cid05[5-6]' assert nodelist(all_nodes) == ( 'cid0[00-19],cid05[5-6],nid00[1-2],nid0[05-19],nid125' ) + # Test the reverse operation + assert expand('nid00[1-2],nid0[05-19],nid125') == sorted(nid_nodes) + assert expand('cid0[00-19],cid05[5-6]') == sorted(cid_nodes) + assert expand( + 'cid0[00-19],cid05[5-6],nid00[1-2],nid0[05-19],nid125' + ) == sorted(all_nodes) + # Test non-contiguous nodes nid_nodes = [] for i in range(3): @@ -1793,12 +1801,22 @@ def test_nodelist_abbrev(): assert nodelist([]) == '' assert nodelist(['nid001']) == 'nid001' + # Test the reverse operation + assert expand('nid00[0-4],nid01[0-4],nid02[0-4]') == sorted(nid_nodes) + assert expand('nid01,nid10,nid20') == ['nid01', 'nid10', 'nid20'] + assert expand('') == [] + assert expand('nid001') == ['nid001'] + # Test host names with numbers in their basename (see GH #2357) nodes = [f'c2-01-{n:02}' for n in range(100)] assert nodelist(nodes) == 'c2-01-[00-99]' + # Test the reverse operation + assert expand('c2-01-[00-99]') == nodes + # Test node duplicates assert nodelist(['nid001', 'nid001', 'nid002']) == 'nid001,nid00[1-2]' + assert expand('nid001,nid00[1-2]') == ['nid001', 'nid001', 'nid002'] with pytest.raises(TypeError, match='nodes argument must be a Sequence'): nodelist(1) @@ -1806,6 +1824,12 @@ def test_nodelist_abbrev(): with pytest.raises(TypeError, match='nodes 
argument cannot be a string'): nodelist('foo') + with pytest.raises(TypeError, match='nodespec argument must be a string'): + expand(10) + + with pytest.raises(ValueError, match='invalid nodespec'): + expand('nid00[1-3],nid3[3-43') + def test_cached_return_value():