From 7a1dbb15f00f0c8dfdba651c30466a9ec777a803 Mon Sep 17 00:00:00 2001 From: jgp Date: Tue, 6 Apr 2021 13:12:05 +0200 Subject: [PATCH 1/6] Abbreviate nodelist using hostlist format --- reframe/core/schedulers/__init__.py | 13 +++++++++++++ reframe/core/schedulers/slurm.py | 4 +++- reframe/frontend/cli.py | 7 +++++++ reframe/frontend/statistics.py | 2 ++ reframe/schemas/config.json | 2 ++ 5 files changed, 27 insertions(+), 1 deletion(-) diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index 287eb35d3b..b786cc62c1 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -201,6 +201,7 @@ def __init__(self, self._exitcode = None self._state = None self._nodelist = None + self._hostlist = None self._submit_time = None self._completion_time = None @@ -326,6 +327,18 @@ def nodelist(self): ''' return self._nodelist + @property + def hostlist(self): + '''The hostlist of this job. + + The value of this field is scheduler-specific. + + .. versionadded:: 3.6 + + :type: :class`str` or :class:`None` + ''' + return self._hostlist + @property def submit_time(self): return self._submit_time diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index bd019c9df3..9ad0255abd 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -368,7 +368,9 @@ def _update_nodelist(self, job, nodespec): if nodespec and nodespec != 'None assigned': job._nodelist = [n.name for n in self._get_nodes_by_name(nodespec)] - + if rt.runtime().get_option('general/0/report_hostlist'): + job._hostlist = nodespec + def _update_completion_time(self, job, timestamps): if job._completion_time is not None: return diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index c6a14459bd..93e031aca3 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -450,6 +450,13 @@ def main(): action='store_true', help='Use a login shell for job scripts' ) + argparser.add_argument( + dest='report_hostlist', + envvar='RFM_REPORT_HOSTLIST', + configvar='general/report_hostlist', + action='store_true', + help='Use hostlist format to report nodelist' + ) # Parse command line options = argparser.parse_args() diff --git a/reframe/frontend/statistics.py b/reframe/frontend/statistics.py index 16db93d2d5..95e8a26a6d 100644 --- a/reframe/frontend/statistics.py +++ b/reframe/frontend/statistics.py @@ -112,6 +112,7 @@ def json(self, force=False): 'maintainers': check.maintainers, 'name': check.name, 'nodelist': [], + 'hostlist': None, 'outputdir': None, 'perfvars': None, 'result': None, @@ -140,6 +141,7 @@ def json(self, force=False): entry['job_stderr'] = check.stderr.evaluate() entry['job_stdout'] = check.stdout.evaluate() entry['nodelist'] = check.job.nodelist or [] + entry['hostlist'] = check.job.hostlist or '' if check.build_job: entry['build_stderr'] = check.build_stderr.evaluate() diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 1d5b143b1c..11c86989d7 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -448,6 +448,7 @@ "non_default_craype": {"type": "boolean"}, "purge_environment": {"type": "boolean"}, "report_file": {"type": "string"}, + "report_hostlist": {"type": "boolean"}, "save_log_files": {"type": "boolean"}, "target_systems": {"$ref": "#/defs/system_ref"}, "timestamp_dirs": {"type": "string"}, @@ -486,6 +487,7 @@ "general/non_default_craype": false, "general/purge_environment": false, "general/report_file": "${HOME}/.reframe/reports/run-report.json", + "general/report_hostlist": false, "general/save_log_files": false, "general/target_systems": ["*"], "general/timestamp_dirs": "", From 3934a7d71f4483575af9c0b59dc5b2f474604cb7 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 13 Apr 2021 17:59:06 +0200 Subject: [PATCH 2/6] Add utility function for abbreviating node lists --- reframe/utility/__init__.py | 232 ++++++++++++++++++++++++++++++++++++ unittests/test_utility.py | 39 ++++++ 2 files changed, 271 insertions(+) diff --git a/reframe/utility/__init__.py b/reframe/utility/__init__.py index 0d6f7d615e..05d7cb3990 100644 --- a/reframe/utility/__init__.py +++ b/reframe/utility/__init__.py @@ -643,6 +643,238 @@ def _is_valid_for_env(m, e): yield (p.fullname, e.name, m) +def _delta_encode(seq): + '''Delta-encode sequence. + + The input list must be at least of size 1. + + :returns: the encoded list. The first element of the encoded sequence is + the first element of the original sequence. + + ''' + + assert len(seq) >= 1 + + ret = [seq[0]] + for i in range(1, len(seq)): + ret.append(seq[i] - seq[i-1]) + + return ret + + +def _rl_encode(seq): + '''Run-length encode a delta-encoded sequence. + + The input list must be at least of size 1. + + :returns: the encoded list. Each element of the list is a three-tuple + containing the first element of the unit, the delta value of the unit + and its length. + + ''' + assert len(seq) >= 1 + + encoded = [] + curr_unit = [seq[0], 1, 1] # current RLE unit + for delta in seq[1:]: + uelem, udelta, ulen = curr_unit + if udelta is None: + curr_unit[1] = delta + curr_unit[2] += 1 + elif udelta != delta: + # New unit; we don't set the delta of the new unit here, because + # `delta` is just the jump for the previous unit. The length of + # the unit is initialized to one, because the last processed + # element *is* part of the new unit. + encoded.append(tuple(curr_unit)) + curr_unit = [uelem + udelta*(ulen-1) + delta, None, 1] + else: + # Increase unit + curr_unit[2] += 1 + + # Fix last unit and add it to the encoded list + if curr_unit[1] is None: + # Conveniently set delta to 1 + curr_unit[1] = 1 + + encoded.append(tuple(curr_unit)) + return encoded + + +class _NodeElem: + def __init__(self, nodename): + m = re.search('(^\D+)(\d+)', nodename) + if m is None: + self._basename = nodename + self._nodeid = -1 + else: + self._basename = m.group(1) + self._nodeid = int(m.group(2).lstrip('0')) + + @property + def basename(self): + return self._basename + + @property + def nodeid(self): + return self._nodeid + + def __repr__(self): + return f'{type(self)}({self.basename}, {self.nodeid})' + + +def _parse_node(nodename): + m = re.search('(^\D+)(\d+)', nodename) + if m is None: + basename = nodename + width = 0 + nodeid = None + else: + basename = m.group(1) + _id = m.group(2).lstrip('0') + if _id == '': + # This is to cover nodes with id=0, e.g., x000 + _id = '0' + + nodeid = int(_id) + width = len(m.group(2)) + + return basename, width, nodeid + + +def _count_digits(n): + '''Count digits of a decimal number.''' + + num_digits = 1 + while n > 10: + n /= 10 + num_digits += 1 + + return num_digits + + +def _common_prefix(s1, s2): + pos = 0 + for i in range(min(len(s1), len(s2))): + if s1[i] != s2[i]: + break + + pos += 1 + + return s1[:pos], s1[pos:], s2[pos:] + + +class _NodeGroup: + def __init__(self, name, width): + self.__name = name + self.__width = width + self.__nodes = [] + + @property + def name(self): + return self.__name + + @property + def width(self): + return self.__width + + @property + def nodes(self): + return self.__nodes + + def add(self, nid): + self.__nodes.append(nid) + + def __str__(self): + abbrev = [] + encoded = _rl_encode(_delta_encode(self.nodes)) + for unit in encoded: + start, delta, size = unit + if size == 1: + s_start = str(start).zfill(self.width) + abbrev.append(f'{self.name}{s_start}') + elif delta != 1: + # We simply unpack node lists with delta != 1 + for i in range(size): + s_start = str(start + i*delta).zfill(self.width) + abbrev.append(f'{self.name}{s_start}') + else: + last = start + delta*(size-1) + digits_last = _count_digits(last) + pad = self.width - digits_last + nd_range = self.name + if pad > 0: + for _ in range(pad): + nd_range += '0' + + s_first = str(start).zfill(digits_last) + s_last = str(last) + prefix, s_first, s_last = _common_prefix(s_first, s_last) + nd_range += f'{prefix}[{s_first}-{s_last}]' + abbrev.append(nd_range) + + return ','.join(abbrev) + + def __hash__(self): + return hash(self.name) ^ hash(self.width) + + def __eq__(self, other): + if not isinstance(other, _NodeGroup): + return NotImplemented + + return self.name == other.name and self.width == other.width + + +def nodelist_abbrev(nodes): + '''Create an abbreviated string representation of the node list. + + For example, the node list + + .. code-block:: python + + ['nid001', 'nid002', 'nid010', 'nid011', 'nid012', 'nid510', 'nid511'] + + will be abbreviated as follows: + + .. code-block:: none + + nid00[1-2],nid0[10-12],nid51[0-1] + + + .. versionadded:: 3.5.3 + + :arg nodes: The node list to abbreviate. + :returns: The abbreviated list representation. + + ''' + + # The algorithm used for abbreviating the list is a standard index + # compression algorithm, the run-length encoding. We first delta encode + # the nodes based on their id, which we retrieve from their name, and then + # run-length encode the list of deltas. The resulting run-length-encoded + # units are then used to generate the abbreviated representation using + # some formatting sugar. The abbreviation is handled in the `__str__()` + # function of the `_NodeGroup`. The purpose of the `_NodeGroup` is to + # group nodes in the list that belong to the same family, namely have the + # same prefix. We then apply the run-length encoding to each group + # independently. + + if isinstance(nodes, str): + raise TypeError('nodes argument cannot be a string') + + if not isinstance(nodes, collections.abc.Sequence): + raise TypeError('nodes argument must be a Sequence') + + node_groups = {} + for n in sorted(nodes): + basename, width, nid = _parse_node(n) + ng = _NodeGroup(basename, width) + node_groups.setdefault(ng, ng) + node_groups[ng].add(nid) + + return ','.join(str(ng) for ng in node_groups) + + class ScopedDict(UserDict): '''This is a special dictionary that imposes scopes on its keys. diff --git a/unittests/test_utility.py b/unittests/test_utility.py index f322707d15..17613e7a2a 100644 --- a/unittests/test_utility.py +++ b/unittests/test_utility.py @@ -1723,3 +1723,42 @@ def foo(): assert util.is_copyable(len) assert util.is_copyable(int) assert not util.is_copyable(foo()) + + +def test_nodelist_abbrev(): + nid_nodes = [f'nid{n:03}' for n in range(5, 20)] + cid_nodes = [f'cid{n:03}' for n in range(20)] + + random.shuffle(nid_nodes) + random.shuffle(cid_nodes) + nid_nodes.insert(0, 'nid002') + nid_nodes.insert(0, 'nid001') + nid_nodes.append('nid125') + cid_nodes += ['cid055', 'cid056'] + + all_nodes = nid_nodes + cid_nodes + random.shuffle(all_nodes) + + nodelist = util.nodelist_abbrev + assert nodelist(nid_nodes) == 'nid00[1-2],nid0[05-19],nid125' + assert nodelist(cid_nodes) == 'cid0[00-19],cid05[5-6]' + assert nodelist(all_nodes) == ( + 'cid0[00-19],cid05[5-6],nid00[1-2],nid0[05-19],nid125' + ) + + # Test non-contiguous nodes + nid_nodes = [] + for i in range(3): + nid_nodes += [f'nid{n:03}' for n in range(10*i, 10*i+5)] + + random.shuffle(nid_nodes) + assert nodelist(nid_nodes) == 'nid00[0-4],nid01[0-4],nid02[0-4]' + assert nodelist(['nid01', 'nid10', 'nid20']) == 'nid01,nid10,nid20' + assert nodelist([]) == '' + assert nodelist(['nid001']) == 'nid001' + + with pytest.raises(TypeError, match='nodes argument must be a Sequence'): + nodelist(1) + + with pytest.raises(TypeError, match='nodes argument cannot be a string'): + nodelist('foo') From 146582d059103eca867ad6e9c4a3501e95db60b2 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 15 Apr 2021 13:41:55 +0200 Subject: [PATCH 3/6] Remove dead code and fix regex string --- reframe/utility/__init__.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/reframe/utility/__init__.py b/reframe/utility/__init__.py index 05d7cb3990..9b6d5dd5ce 100644 --- a/reframe/utility/__init__.py +++ b/reframe/utility/__init__.py @@ -701,28 +701,6 @@ def _rl_encode(seq): return encoded -class _NodeElem: - def __init__(self, nodename): - m = re.search('(^\D+)(\d+)', nodename) - if m is None: - self._basename = nodename - self._nodeid = -1 - else: - self._basename = m.group(1) - self._nodeid = int(m.group(2).lstrip('0')) - - @property - def basename(self): - return self._basename - - @property - def nodeid(self): - return self._nodeid - - def __repr__(self): - return f'{type(self)}({self.basename}, {self.nodeid})' - - def _parse_node(nodename): m = re.search('(^\D+)(\d+)', nodename) if m is None: From 21a6db9e321d88fa4b27dabefc90391a3e765841 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 15 Apr 2021 13:49:21 +0200 Subject: [PATCH 4/6] More comments and PEP8 fixes --- reframe/utility/__init__.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/reframe/utility/__init__.py b/reframe/utility/__init__.py index 9b6d5dd5ce..e2e88668dc 100644 --- a/reframe/utility/__init__.py +++ b/reframe/utility/__init__.py @@ -648,6 +648,18 @@ def _delta_encode(seq): The input list must be at least of size 1. + Example of delta encoding: + + - Input list: + 1 2 5 6 7 8 9 125 + + - Output list: + 1 1 3 1 1 1 1 106 + ^ + | + First element + of the original list. + :returns: the encoded list. The first element of the encoded sequence is the first element of the original sequence. @@ -667,6 +679,21 @@ def _rl_encode(seq): The input list must be at least of size 1. + Example of run-length encoding: + + - Original list: + 1 2 5 6 7 8 9 125 + + - Delta-encoded list: + 1 1 3 1 1 1 1 106 + + - Run-length-encoded list: + + (1,1,2), (5,1,5), (125,1,1) + + For convenience, in each RLE unit we use the first element of the original + unit and not the delta value from the previous unit. + :returns: the encoded list. Each element of the list is a three-tuple containing the first element of the unit, the delta value of the unit and its length. @@ -702,7 +729,7 @@ def _rl_encode(seq): def _parse_node(nodename): - m = re.search('(^\D+)(\d+)', nodename) + m = re.search(r'(^\D+)(\d+)', nodename) if m is None: basename = nodename width = 0 From f13e166d1d76ef1d5be9abe90c5abea4362db329 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 16 Apr 2021 15:26:39 +0200 Subject: [PATCH 5/6] Add unit test for node duplicates --- unittests/test_utility.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unittests/test_utility.py b/unittests/test_utility.py index caabc76220..11aa8c7bf6 100644 --- a/unittests/test_utility.py +++ b/unittests/test_utility.py @@ -1746,6 +1746,9 @@ def test_nodelist_abbrev(): assert nodelist([]) == '' assert nodelist(['nid001']) == 'nid001' + # Test node duplicates + assert nodelist(['nid001', 'nid001', 'nid002']) == 'nid001,nid00[1-2]' + with pytest.raises(TypeError, match='nodes argument must be a Sequence'): nodelist(1) From 5b69e4530dd008d46e968ed266cc90ea878fd5d9 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 16 Apr 2021 16:33:06 +0200 Subject: [PATCH 6/6] Revert changes related to configuration - Node lists are always abbreviated in the `FAILURE INFO` output but not in the JSON report. --- reframe/core/schedulers/__init__.py | 13 ------------- reframe/core/schedulers/slurm.py | 4 +--- reframe/frontend/cli.py | 7 ------- reframe/frontend/statistics.py | 8 ++++---- reframe/schemas/config.json | 2 -- 5 files changed, 5 insertions(+), 29 deletions(-) diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index b786cc62c1..287eb35d3b 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -201,7 +201,6 @@ def __init__(self, self._exitcode = None self._state = None self._nodelist = None - self._hostlist = None self._submit_time = None self._completion_time = None @@ -327,18 +326,6 @@ def nodelist(self): ''' return self._nodelist - @property - def hostlist(self): - '''The hostlist of this job. - - The value of this field is scheduler-specific. - - .. versionadded:: 3.6 - - :type: :class`str` or :class:`None` - ''' - return self._hostlist - @property def submit_time(self): return self._submit_time diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 9ad0255abd..bd019c9df3 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -368,9 +368,7 @@ def _update_nodelist(self, job, nodespec): if nodespec and nodespec != 'None assigned': job._nodelist = [n.name for n in self._get_nodes_by_name(nodespec)] - if rt.runtime().get_option('general/0/report_hostlist'): - job._hostlist = nodespec - + def _update_completion_time(self, job, timestamps): if job._completion_time is not None: return diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 580dca139c..e107ec5c67 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -450,13 +450,6 @@ def main(): action='store_true', help='Use a login shell for job scripts' ) - argparser.add_argument( - dest='report_hostlist', - envvar='RFM_REPORT_HOSTLIST', - configvar='general/report_hostlist', - action='store_true', - help='Use hostlist format to report nodelist' - ) # Parse command line options = argparser.parse_args() diff --git a/reframe/frontend/statistics.py b/reframe/frontend/statistics.py index 95e8a26a6d..963676d519 100644 --- a/reframe/frontend/statistics.py +++ b/reframe/frontend/statistics.py @@ -7,6 +7,7 @@ import traceback import reframe.core.runtime as rt import reframe.core.exceptions as errors +import reframe.utility as util class TestStats: @@ -112,7 +113,6 @@ def json(self, force=False): 'maintainers': check.maintainers, 'name': check.name, 'nodelist': [], - 'hostlist': None, 'outputdir': None, 'perfvars': None, 'result': None, @@ -141,7 +141,6 @@ def json(self, force=False): entry['job_stderr'] = check.stderr.evaluate() entry['job_stdout'] = check.stdout.evaluate() entry['nodelist'] = check.job.nodelist or [] - entry['hostlist'] = check.job.hostlist or '' if check.build_job: entry['build_stderr'] = check.build_stderr.evaluate() @@ -218,8 +217,9 @@ def print_failure_report(self, printer): printer.info(f" * System partition: {r['system']}") printer.info(f" * Environment: {r['environment']}") printer.info(f" * Stage directory: {r['stagedir']}") - nodelist = ','.join(r['nodelist']) if r['nodelist'] else None - printer.info(f" * Node list: {nodelist}") + printer.info( + f" * Node list: {util.nodelist_abbrev(r['nodelist'])}" + ) job_type = 'local' if r['scheduler'] == 'local' else 'batch job' jobid = r['jobid'] printer.info(f" * Job type: {job_type} (id={r['jobid']})") diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 11c86989d7..1d5b143b1c 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -448,7 +448,6 @@ "non_default_craype": {"type": "boolean"}, "purge_environment": {"type": "boolean"}, "report_file": {"type": "string"}, - "report_hostlist": {"type": "boolean"}, "save_log_files": {"type": "boolean"}, "target_systems": {"$ref": "#/defs/system_ref"}, "timestamp_dirs": {"type": "string"}, @@ -487,7 +486,6 @@ "general/non_default_craype": false, "general/purge_environment": false, "general/report_file": "${HOME}/.reframe/reports/run-report.json", - "general/report_hostlist": false, "general/save_log_files": false, "general/target_systems": ["*"], "general/timestamp_dirs": "",