From 1f24f72c6c11955623b7f53169f8212c25de8bd6 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Mon, 22 Mar 2021 15:31:19 +0100 Subject: [PATCH 01/30] Add utility for processor information --- reframe/utility/systeminfo.py | 243 ++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 reframe/utility/systeminfo.py diff --git a/reframe/utility/systeminfo.py b/reframe/utility/systeminfo.py new file mode 100644 index 0000000000..ced3ef3d9a --- /dev/null +++ b/reframe/utility/systeminfo.py @@ -0,0 +1,243 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +'''Managing system information. + +.. versionadded:: 3.6.0 + +''' +import archspec.cpu +import glob +import os +import re + +import reframe.utility.osext as osext +from reframe.core.exceptions import SpawnedProcessError + + +def bits_from_string(mask): + ret = [] + mask_int = int(mask, 0) + index = 0 + while mask_int: + if mask_int & 1: + ret.append(index) + + index += 1 + mask_int >>= 1 + + return ret + + +def string_from_bits(ids): + ret = 0 + for id in ids: + ret |= (1 << id) + + return hex(ret).upper() + + +def filesystem_info(): + cache_units = { + 'K': 1024, + 'M': 1048576, + 'G': 1073741824 + } + processor_info = { + 'topology': {} + } + cpu_dirs = glob.glob(r'/sys/devices/system/cpu/cpu[0-9]*') + nodes = glob.glob(r'/sys/devices/system/node/node[0-9]*') + + cores = set() + for cpu in cpu_dirs: + with open(os.path.join(cpu, 'topology/core_cpus')) as fp: + core_cpus = fp.read() + core_cpus = re.sub(r'[\s,]', '', core_cpus) + core_cpus = f'0x{core_cpus.upper()}' + cores.add(core_cpus) + + sockets = set() + for cpu in cpu_dirs: + with open(os.path.join(cpu, 'topology/package_cpus')) as fp: + package_cpus = fp.read() + package_cpus = re.sub(r'[\s,]', '', package_cpus) + package_cpus = f'0x{package_cpus.upper()}' + 
sockets.add(package_cpus) + + numa_nodes = [] + for node in nodes: + with open(os.path.join(node, 'cpumap')) as fp: + cpumap = fp.read() + cpumap = re.sub(r'[\s,]', '', cpumap) + cpumap = f'0x{cpumap.upper()}' + numa_nodes.append(cpumap) + + numa_nodes.sort() + + caches = {} + for cpu in cpu_dirs: + cache_dirs = glob.glob(cpu + r'/cache/index[0-9]*') + for cache in cache_dirs: + with open(os.path.join(cache, 'level')) as fp: + cache_level = int(fp.read()) + + # Skip L1 instruction cache + with open(os.path.join(cache, 'type')) as fp: + if cache_level == 1 and fp.read() == 'Instruction\n': + continue + + with open(os.path.join(cache, 'ways_of_associativity')) as fp: + cache_associativity = int(fp.read()) + + with open(os.path.join(cache, 'size')) as fp: + cache_size = fp.read() + m = re.match(r'(?P<val>\d+)(?P<unit>\S)', cache_size) + if m: + value = int(m.group('val')) + unit = cache_units.get(m.group('unit'), 1) + cache_size = value*unit + + with open(os.path.join(cache, 'coherency_line_size')) as fp: + cache_linesize = int(fp.read()) + + with open(os.path.join(cache, 'shared_cpu_map')) as fp: + cache_cpuset = fp.read() + cache_cpuset = re.sub(r'[\s,]', '', cache_cpuset) + cache_cpuset = f'0x{cache_cpuset.upper()}' + + num_cpus = len(bits_from_string(cache_cpuset)) + caches.setdefault((cache_level, cache_size, cache_linesize, + cache_associativity, num_cpus), set()) + caches[(cache_level, cache_size, cache_linesize, + cache_associativity, num_cpus)].add(cache_cpuset) + + num_cpus = len(cpu_dirs) + num_cores = len(cores) + num_sockets = len(sockets) + num_cpus_per_core = num_cpus // num_cores + num_cpus_per_socket = num_cpus // num_sockets + + processor_info['num_cpus'] = num_cpus + processor_info['num_cpus_per_core'] = num_cpus_per_core + processor_info['num_cpus_per_socket'] = num_cpus_per_socket + processor_info['num_sockets'] = num_sockets + processor_info['topology']['numa_nodes'] = numa_nodes + processor_info['topology']['sockets'] = sorted(list(sockets)) + 
processor_info['topology']['cores'] = sorted(list(cores)) + processor_info['topology']['caches'] = [] + for cache_type, cpusets in caches.items(): + (cache_level, cache_size, cache_linesize, cache_associativity, + num_cpus) = cache_type + c = { + 'type': f'L{cache_level}', + 'size': cache_size, + 'linesize': cache_linesize, + 'associativity': cache_associativity, + 'num_cpus': num_cpus, + 'cpusets': sorted(list(cpusets)) + } + processor_info['topology']['caches'].append(c) + + return processor_info + + +def sysctl_info(): + try: + exec_output = osext.run_command('sysctl hw machdep.cpu.cache', + check=True) + except (FileNotFoundError, SpawnedProcessError): + return {} + + processor_info = { + 'topology': {} + } + match = re.search(r'hw\.ncpu: (?P<num_cpus>\d+)', exec_output.stdout) + if match: + num_cpus = int(match.group('num_cpus')) + + match = re.search(r'hw\.physicalcpu: (?P<num_cores>\d+)', + exec_output.stdout) + if match: + num_cores = int(match.group('num_cores')) + + match = re.search(r'hw\.packages: (?P<num_sockets>\d+)', + exec_output.stdout) + if match: + num_sockets = int(match.group('num_sockets')) + processor_info['num_sockets'] = num_sockets + + match = re.search(r'hw\.cacheconfig:(?P<cacheconfig>(\s\d+)*)', + exec_output.stdout) + if match: + cacheconfig = list(map(int, match.group('cacheconfig').split())) + + match = re.search(r'hw\.cachesize:(?P<cachesize>(\s\d+)*)', + exec_output.stdout) + if match: + cachesize = list(map(int, match.group('cachesize').split())) + + match = re.search(r'hw\.cachelinesize: (?P<linesize>\d+)', + exec_output.stdout) + if match: + linesize = int(match.group('linesize')) + + cache_associativity = [0] + # index 0 is referring to memory + for i in range(1, len(cachesize)): + if cachesize[i] == 0: + break + + match = re.search(rf'machdep\.cpu\.cache\.L{i}_associativity: ' + rf'(?P<associativity>\d+)', + exec_output.stdout) + ca = int(match.group('associativity')) if match else 0 + cache_associativity.append(ca) + + num_cpus_per_socket = num_cpus // num_sockets + num_cpus_per_core = num_cpus // num_cores 
+ + processor_info['num_cpus'] = num_cpus + processor_info['num_cpus_per_socket'] = num_cpus_per_socket + processor_info['num_cpus_per_core'] = num_cpus_per_core + processor_info['topology']['numa_nodes'] = string_from_bits(range(num_cpus)) + processor_info['topology']['sockets'] = [ + string_from_bits(range(start, start+num_cpus_per_socket)) for start + in range(0, num_cpus, num_cpus_per_socket) + ] + processor_info['topology']['cores'] = [ + string_from_bits(range(start, start+num_cpus_per_core)) for start + in range(0, num_cpus, num_cpus_per_core) + ] + processor_info['topology']['caches'] = [] + for i in range(1, len(cache_associativity)): + t = { + 'type': f'L{i}', + 'size': cachesize[i], + 'linesize': linesize, + 'associativity': cache_associativity[i], + 'num_cpus': cacheconfig[i], + 'cpusets': [ + string_from_bits(range(start, start+cacheconfig[i])) + for start in range(0, num_cpus, cacheconfig[i]) + ] + } + processor_info['topology']['caches'].append(t) + + return processor_info + + +def get_proc_info(): + processor_info = { + 'arch': archspec.cpu.host().name + } + # Try first to get information from the filesystem + if glob.glob('/sys/'): + topology_information = filesystem_info() + + # Try the `sysctl` command + topology_information = sysctl_info() + processor_info.update(topology_information) + return processor_info From 0ce02f4dfe54daac52909f779056a266949dcec7 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Mon, 22 Mar 2021 15:38:25 +0100 Subject: [PATCH 02/30] Fix bug --- reframe/utility/systeminfo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/reframe/utility/systeminfo.py b/reframe/utility/systeminfo.py index ced3ef3d9a..665ddab524 100644 --- a/reframe/utility/systeminfo.py +++ b/reframe/utility/systeminfo.py @@ -236,8 +236,9 @@ def get_proc_info(): # Try first to get information from the filesystem if glob.glob('/sys/'): topology_information = filesystem_info() + else: + # Try the `sysctl` command + 
topology_information = sysctl_info() - # Try the `sysctl` command - topology_information = sysctl_info() processor_info.update(topology_information) return processor_info From 73e8af33c8ca7acc547e11b8389e777aa0f18c1d Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 23 Mar 2021 13:01:00 +0100 Subject: [PATCH 03/30] Fix crashing when files or modules are not available --- reframe/utility/systeminfo.py | 148 +++++++++++++++++++++++----------- 1 file changed, 102 insertions(+), 46 deletions(-) diff --git a/reframe/utility/systeminfo.py b/reframe/utility/systeminfo.py index 665ddab524..7574e8c213 100644 --- a/reframe/utility/systeminfo.py +++ b/reframe/utility/systeminfo.py @@ -9,6 +9,7 @@ ''' import archspec.cpu +import contextlib import glob import os import re @@ -53,27 +54,50 @@ def filesystem_info(): cores = set() for cpu in cpu_dirs: - with open(os.path.join(cpu, 'topology/core_cpus')) as fp: - core_cpus = fp.read() - core_cpus = re.sub(r'[\s,]', '', core_cpus) - core_cpus = f'0x{core_cpus.upper()}' - cores.add(core_cpus) + core_cpus_path = os.path.join(cpu, 'topology/core_cpus') + thread_siblings_path = os.path.join(cpu, 'topology/thread_siblings') + if glob.glob(core_cpus_path): + cores_path = core_cpus_path + elif glob.glob(thread_siblings_path): + cores_path = thread_siblings_path + else: + # Information cannot be retrieved + continue + + with contextlib.suppress(IOError): + with open(cores_path) as fp: + core_cpus = fp.read() + core_cpus = re.sub(r'[\s,]', '', core_cpus) + core_cpus = f'0x{core_cpus.upper()}' + cores.add(core_cpus) sockets = set() for cpu in cpu_dirs: - with open(os.path.join(cpu, 'topology/package_cpus')) as fp: - package_cpus = fp.read() - package_cpus = re.sub(r'[\s,]', '', package_cpus) - package_cpus = f'0x{package_cpus.upper()}' - sockets.add(package_cpus) + package_cpus_path = os.path.join(cpu, 'topology/package_cpus') + core_siblings_path = os.path.join(cpu, 'topology/core_siblings') + if glob.glob(package_cpus_path): + 
sockets_path = package_cpus_path + elif glob.glob(core_siblings_path): + sockets_path = core_siblings_path + else: + # Information cannot be retrieved + continue + + with contextlib.suppress(IOError): + with open(sockets_path) as fp: + package_cpus = fp.read() + package_cpus = re.sub(r'[\s,]', '', package_cpus) + package_cpus = f'0x{package_cpus.upper()}' + sockets.add(package_cpus) numa_nodes = [] for node in nodes: - with open(os.path.join(node, 'cpumap')) as fp: - cpumap = fp.read() - cpumap = re.sub(r'[\s,]', '', cpumap) - cpumap = f'0x{cpumap.upper()}' - numa_nodes.append(cpumap) + with contextlib.suppress(IOError): + with open(os.path.join(node, 'cpumap')) as fp: + cpumap = fp.read() + cpumap = re.sub(r'[\s,]', '', cpumap) + cpumap = f'0x{cpumap.upper()}' + numa_nodes.append(cpumap) numa_nodes.sort() @@ -81,32 +105,60 @@ def filesystem_info(): for cpu in cpu_dirs: cache_dirs = glob.glob(cpu + r'/cache/index[0-9]*') for cache in cache_dirs: - with open(os.path.join(cache, 'level')) as fp: - cache_level = int(fp.read()) - - # Skip L1 instruction cache - with open(os.path.join(cache, 'type')) as fp: - if cache_level == 1 and fp.read() == 'Instruction\n': - continue - - with open(os.path.join(cache, 'ways_of_associativity')) as fp: - cache_associativity = int(fp.read()) - - with open(os.path.join(cache, 'size')) as fp: - cache_size = fp.read() - m = re.match(r'(?P<val>\d+)(?P<unit>\S)', cache_size) - if m: - value = int(m.group('val')) - unit = cache_units.get(m.group('unit'), 1) - cache_size = value*unit - - with open(os.path.join(cache, 'coherency_line_size')) as fp: - cache_linesize = int(fp.read()) - - with open(os.path.join(cache, 'shared_cpu_map')) as fp: - cache_cpuset = fp.read() - cache_cpuset = re.sub(r'[\s,]', '', cache_cpuset) - cache_cpuset = f'0x{cache_cpuset.upper()}' + cache_level = 0 + cache_size = 0 + cache_linesize = 0 + cache_associativity = 0 + cache_cpuset = '' + + with contextlib.suppress(IOError): + with open(os.path.join(cache, 'level')) as fp: + 
cache_level = int(fp.read()) + + with contextlib.suppress(IOError): + # Skip L1 instruction cache + with open(os.path.join(cache, 'type')) as fp: + if cache_level == 1 and fp.read() == 'Instruction\n': + continue + + with contextlib.suppress(IOError): + with open(os.path.join(cache, 'size')) as fp: + cache_size = fp.read() + m = re.match(r'(?P<val>\d+)(?P<unit>\S)', cache_size) + if m: + value = int(m.group('val')) + unit = cache_units.get(m.group('unit'), 1) + cache_size = value*unit + + with contextlib.suppress(IOError): + with open(os.path.join(cache, 'coherency_line_size')) as fp: + cache_linesize = int(fp.read()) + + # Don't take the associativity directly from + # "ways_of_associativity" file because some archs (ia64, ppc) + # put 0 there when fully-associative, while others (x86) + # put something like -1 there. + with contextlib.suppress(IOError): + with open(os.path.join(cache, 'number_of_sets')) as fp: + cache_number_of_sets = int(fp.read()) + + with open(os.path.join(cache, + 'physical_line_partition')) as fp: + cache_physical_line_partition = int(fp.read()) + + if (cache_linesize and + cache_physical_line_partition and + cache_number_of_sets): + cache_associativity = (cache_size // + cache_linesize // + cache_physical_line_partition // + cache_number_of_sets) + + with contextlib.suppress(IOError): + with open(os.path.join(cache, 'shared_cpu_map')) as fp: + cache_cpuset = fp.read() + cache_cpuset = re.sub(r'[\s,]', '', cache_cpuset) + cache_cpuset = f'0x{cache_cpuset.upper()}' num_cpus = len(bits_from_string(cache_cpuset)) caches.setdefault((cache_level, cache_size, cache_linesize, @@ -117,8 +169,8 @@ def filesystem_info(): num_cpus = len(cpu_dirs) num_cores = len(cores) num_sockets = len(sockets) - num_cpus_per_core = num_cpus // num_cores - num_cpus_per_socket = num_cpus // num_sockets + num_cpus_per_core = num_cpus // num_cores if num_cores else 0 + num_cpus_per_socket = num_cpus // num_sockets if num_sockets else 0 processor_info['num_cpus'] = num_cpus 
processor_info['num_cpus_per_core'] = num_cpus_per_core @@ -230,9 +282,13 @@ def sysctl_info(): def get_proc_info(): - processor_info = { - 'arch': archspec.cpu.host().name - } + try: + processor_info = { + 'arch': archspec.cpu.host().name + } + except ModuleNotFoundError: + processor_info = {} + # Try first to get information from the filesystem if glob.glob('/sys/'): topology_information = filesystem_info() From 4a2a96330ef1ac29ea14711a6eb681760d3ce729 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 23 Mar 2021 13:09:51 +0100 Subject: [PATCH 04/30] Fix archspec import --- reframe/utility/systeminfo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reframe/utility/systeminfo.py b/reframe/utility/systeminfo.py index 7574e8c213..eaa1cb69b9 100644 --- a/reframe/utility/systeminfo.py +++ b/reframe/utility/systeminfo.py @@ -8,7 +8,6 @@ .. versionadded:: 3.6.0 ''' -import archspec.cpu import contextlib import glob import os @@ -283,6 +282,8 @@ def sysctl_info(): def get_proc_info(): try: + import archspec.cpu + processor_info = { 'arch': archspec.cpu.host().name } From e2529c10dce04e6bfa38f76f32543196687b8b11 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 23 Mar 2021 16:54:48 +0100 Subject: [PATCH 05/30] Add archspec in reframe requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 84b7473d47..ba261c1751 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +archspec==0.1.2 argcomplete==1.12.2 coverage==5.3 importlib_metadata==2.0.0 From 6d9c8c7f9d5bff0f695a849b09874bf9811babb0 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Sat, 22 May 2021 23:37:13 +0200 Subject: [PATCH 06/30] Add experimental CLI option for accessing now the auto-config --- reframe/frontend/cli.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index c6a14459bd..c8078cb671 100644 --- 
a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -279,6 +279,11 @@ def main(): 'for the selected tests and exit'), ) + # FIXME: Remove this + action_options.add_argument( + '--proc-config', action='store_true' + ) + # Run options run_options.add_argument( '-J', '--job-option', action='append', metavar='OPT', @@ -815,6 +820,14 @@ def _case_failed(t): ) sys.exit(0) + # FIXME: Remove this one + if options.proc_config: + import reframe.utility as util + from reframe.utility.systeminfo import get_proc_info + + printer.info(util.ppretty(get_proc_info())) + sys.exit(0) + if not options.run: printer.error("No action option specified. Available options:\n" " - `-l'/`-L' for listing\n" From 12f11b3d7f0b76f0bf9496a4922d13175942c2df Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 26 May 2021 23:29:40 +0200 Subject: [PATCH 07/30] Add the command-line option --- reframe/frontend/cli.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index f528367615..bab7f347d5 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -297,11 +297,6 @@ def main(): 'for the selected tests and exit'), ) - # FIXME: Remove this - action_options.add_argument( - '--proc-config', action='store_true' - ) - # Run options run_options.add_argument( '-J', '--job-option', action='append', metavar='OPT', @@ -429,6 +424,11 @@ def main(): metavar='PARAM', help='Print the value of configuration parameter PARAM and exit' ) + misc_options.add_argument( + '--detect-system-topology', action='store_true', + help=('Detect and store topology information ' + 'for the current system and exit') + ) misc_options.add_argument( '--system', action='store', help='Load configuration for SYSTEM', envvar='RFM_SYSTEM' @@ -625,6 +625,13 @@ def main(): sys.exit(0) + if options.detect_system_topology: + import reframe.utility as util + from reframe.utility.systeminfo import get_proc_info + + 
printer.info(util.ppretty(get_proc_info())) + sys.exit(0) + printer.debug(format_env(options.env_vars)) # Setup the check loader @@ -857,14 +864,6 @@ def _case_failed(t): ) sys.exit(0) - # FIXME: Remove this one - if options.proc_config: - import reframe.utility as util - from reframe.utility.systeminfo import get_proc_info - - printer.info(util.ppretty(get_proc_info())) - sys.exit(0) - if not options.run: printer.error("No action option specified. Available options:\n" " - `-l'/`-L' for listing\n" From 9bec4005af890212911a4ebf4eaf9ca09ab77983 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 26 May 2021 23:44:48 +0200 Subject: [PATCH 08/30] Auto-detect topology --- reframe/core/systems.py | 16 ++--- reframe/frontend/autodetect.py | 106 +++++++++++++++++++++++++++++++++ reframe/frontend/cli.py | 31 ++++++++-- reframe/utility/systeminfo.py | 7 ++- 4 files changed, 145 insertions(+), 15 deletions(-) create mode 100644 reframe/frontend/autodetect.py diff --git a/reframe/core/systems.py b/reframe/core/systems.py index 74d6521784..f7d92cf720 100644 --- a/reframe/core/systems.py +++ b/reframe/core/systems.py @@ -4,13 +4,15 @@ # SPDX-License-Identifier: BSD-3-Clause import json +import os -import reframe.utility as utility +import reframe.utility as util import reframe.utility.jsonext as jsonext +import reframe.utility.systeminfo as sysinfo from reframe.core.backends import (getlauncher, getscheduler) +from reframe.core.environments import (Environment, ProgEnvironment) from reframe.core.logging import getlogger from reframe.core.modules import ModulesSystem -from reframe.core.environments import (Environment, ProgEnvironment) class ProcessorType(jsonext.JSONSerializable): @@ -232,7 +234,7 @@ def access(self): :type: :class:`List[str]` ''' - return utility.SequenceView(self._access) + return util.SequenceView(self._access) @property def descr(self): @@ -249,7 +251,7 @@ def environs(self): :type: :class:`List[ProgEnvironment]` ''' - return 
utility.SequenceView(self._environs) + return util.SequenceView(self._environs) @property def container_environs(self): @@ -258,7 +260,7 @@ def container_environs(self): :type: :class:`Dict[str, Environment]` ''' - return utility.MappingView(self._container_environs) + return util.MappingView(self._container_environs) @property def fullname(self): @@ -315,7 +317,7 @@ def resources(self): ''' - return utility.MappingView(self._resources) + return util.MappingView(self._resources) @property def scheduler(self): @@ -661,7 +663,7 @@ def partitions(self): :type: :class:`List[SystemPartition]` ''' - return utility.SequenceView(self._partitions) + return util.SequenceView(self._partitions) def __eq__(self, other): if not isinstance(other, type(self)): diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py new file mode 100644 index 0000000000..dd446860d0 --- /dev/null +++ b/reframe/frontend/autodetect.py @@ -0,0 +1,106 @@ +import json +import os +import tempfile + +import reframe as rfm +import reframe.core.shell as shell +import reframe.utility.osext as osext +import reframe.utility.systeminfo as sysinfo +from reframe.core.logging import getlogger +from reframe.core.runtime import runtime +from reframe.core.schedulers import Job + + +# reframe --detect-local-topology +# +# RFM_DETECT_REMOTE_SYSTEM_TOPOLOGY=y (default=n) +# +# ReFrame will launch remote jobs executing `reframe --detect-local-topology` + +def _load_procinfo(filename): + try: + with open(filename) as fp: + return json.load(fp) + except OSError as e: + getlogger().warning( + f'could not load procinfo file: {filename!r}: {e}' + ) + return {} + + +def _save_procinfo(filename, procinfo): + if not procinfo: + return + + os.makedirs(os.path.dirname(filename), exist_ok=True) + try: + with open(filename, 'w') as fp: + json.dump(procinfo, fp, indent=2) + except OSError as e: + getlogger().warning( + f'could not save procinfo file: {filename!r}: {e}' + ) + + +def _is_part_local(part): + return 
(part.scheduler.registered_name == 'local' and + part.launcher_type.registered_name == 'local') + + +def _remote_detect(part): + rfm_exec = os.path.join(rfm.INSTALL_PREFIX, 'bin/reframe') + try: + with tempfile.TemporaryDirectory(dir='.') as dirname: + job = Job.create(part.scheduler, + part.launcher_type(), + name='rfm-detect-job', + sched_access=part.access) + with osext.change_dir(dirname): + job.prepare([f'{rfm_exec} --detect-local-topology=topo.json'], + trap_errors=True) + job.submit() + job.wait() + with open('topo.json') as fp: + procinfo = json.load(fp) + except Exception as e: + getlogger().warning(f'failed to retrieve remote processor info: {e}') + procinfo = {} + + return procinfo + + +def detect_procinfo(): + rt = runtime() + detect_remote_systems = rt.get_option( + 'general/0/detect_remote_system_topology' + ) + config_file = rt.site_config.filename + if config_file == '': + config_prefix = os.path.join( + os.getenv('HOME'), '.reframe/procinfo' + ) + else: + config_prefix = os.path.dirname(config_file) + config_prefix = os.path.join(config_prefix, '_meta') + + for part in rt.system.partitions: + if part.processor.info != {}: + # Processor info set up already in the configuration + continue + + procinfo_file = os.path.join( + config_prefix, f'{rt.system.name}-{part.name}', 'processor.json' + ) + + if os.path.exists(procinfo_file): + part.processor._info = _load_procinfo(procinfo_file) + continue + + # No procinfo found, try to auto-detect it + if _is_part_local(part): + # Unconditionally detect the system for fully local partitions + part.processor._info = sysinfo.get_proc_info() + _save_procinfo(procinfo_file, part.processor.info) + elif detect_remote_systems: + part.processor._info = _remote_detect(part) + _save_procinfo(procinfo_file, part.processor.info) diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index bab7f347d5..efac732a78 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -21,6 +21,7 @@ import 
reframe.core.runtime as runtime import reframe.core.warnings as warnings import reframe.frontend.argparse as argparse +import reframe.frontend.autodetect as autodetect import reframe.frontend.ci as ci import reframe.frontend.dependencies as dependencies import reframe.frontend.filters as filters @@ -425,9 +426,8 @@ def main(): help='Print the value of configuration parameter PARAM and exit' ) misc_options.add_argument( - '--detect-system-topology', action='store_true', - help=('Detect and store topology information ' - 'for the current system and exit') + '--detect-local-topology', action='store', nargs='?', const='-', + help='Detect the local system topology and exit' ) misc_options.add_argument( '--system', action='store', help='Load configuration for SYSTEM', @@ -486,6 +486,13 @@ def main(): configvar='logging/handlers_perflog/httpjson_url', help='URL of HTTP server accepting JSON logs' ) + argparser.add_argument( + dest='detect_remote_system_topology', + envvar='RFM_DETECT_REMOTE_SYSTEM_TOPOLOGY', + configvar='general/detect_remote_system_topology', + action='store_true', + help='Detect remote system topology' + ) # Parse command line options = argparser.parse_args() @@ -587,6 +594,7 @@ def main(): sys.exit(1) rt = runtime.runtime() + autodetect.detect_procinfo() try: if site_config.get('general/0/module_map_file'): rt.modules_system.load_mapping_from_file( @@ -625,11 +633,24 @@ def main(): sys.exit(0) - if options.detect_system_topology: + if options.detect_local_topology: import reframe.utility as util from reframe.utility.systeminfo import get_proc_info - printer.info(util.ppretty(get_proc_info())) + topofile = options.detect_local_topology + if topofile == '-': + json.dump(get_proc_info(), sys.stdout, indent=2) + sys.stdout.write('\n') + else: + try: + with open(topofile, 'w') as fp: + json.dump(get_proc_info(), fp, indent=2) + fp.write('\n') + except OSError as e: + getlogger().error( + f'could not write topology file: {topofile!r}') + sys.exit(1) + 
sys.exit(0) printer.debug(format_env(options.env_vars)) diff --git a/reframe/utility/systeminfo.py b/reframe/utility/systeminfo.py index eaa1cb69b9..3c77aa11a1 100644 --- a/reframe/utility/systeminfo.py +++ b/reframe/utility/systeminfo.py @@ -36,7 +36,7 @@ def string_from_bits(ids): for id in ids: ret |= (1 << id) - return hex(ret).upper() + return hex(ret).lower() def filesystem_info(): @@ -243,7 +243,7 @@ def sysctl_info(): match = re.search(rf'machdep\.cpu\.cache\.L{i}_associativity: ' rf'(?P<associativity>\d+)', - exec_output.stdout) + exec_output.stdout) ca = int(match.group('associativity')) if match else 0 cache_associativity.append(ca) @@ -253,7 +253,8 @@ def sysctl_info(): processor_info['num_cpus'] = num_cpus processor_info['num_cpus_per_socket'] = num_cpus_per_socket processor_info['num_cpus_per_core'] = num_cpus_per_core - processor_info['topology']['numa_nodes'] = string_from_bits(range(num_cpus)) + processor_info['topology']['numa_nodes'] = string_from_bits( + range(num_cpus)) processor_info['topology']['sockets'] = [ string_from_bits(range(start, start+num_cpus_per_socket)) for start in range(0, num_cpus, num_cpus_per_socket) From 055d2b7f2892749f7824fdf5edfd37e36792d252 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Sat, 29 May 2021 22:36:22 +0200 Subject: [PATCH 09/30] Add log calls --- reframe/frontend/autodetect.py | 61 +++++++++++++++++++++------------- reframe/frontend/cli.py | 2 +- reframe/utility/systeminfo.py | 17 +++------- 3 files changed, 42 insertions(+), 38 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index dd446860d0..e5aad083a7 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -11,34 +11,28 @@ from reframe.core.schedulers import Job -# reframe --detect-local-topology -# -# RFM_DETECT_REMOTE_SYSTEM_TOPOLOGY=y (default=n) -# -# ReFrame will launch remote jobs executing `reframe --detect-local-topology` - -def _load_procinfo(filename): +def 
_load_topology(filename): try: with open(filename) as fp: return json.load(fp) except OSError as e: getlogger().warning( - f'could not load procinfo file: {filename!r}: {e}' + f'could not load topology file: {filename!r}: {e}' ) return {} -def _save_procinfo(filename, procinfo): - if not procinfo: +def _save_topology(filename, topo_info): + if not topo_info: return os.makedirs(os.path.dirname(filename), exist_ok=True) try: with open(filename, 'w') as fp: - json.dump(procinfo, fp, indent=2) + json.dump(topo_info, fp, indent=2) except OSError as e: getlogger().warning( - f'could not save procinfo file: {filename!r}: {e}' + f'could not save topology file: {filename!r}: {e}' ) @@ -48,6 +42,9 @@ def _is_part_local(part): def _remote_detect(part): + getlogger().info( + f'Detecting topology of remote partition {part.fullname!r}' + ) rfm_exec = os.path.join(rfm.INSTALL_PREFIX, 'bin/reframe') try: with tempfile.TemporaryDirectory(dir='.') as dirname: @@ -58,18 +55,25 @@ def _remote_detect(part): with osext.change_dir(dirname): job.prepare([f'{rfm_exec} --detect-local-topology=topo.json'], trap_errors=True) + + with open(job.script_filename) as fp: + getlogger().debug( + f'submitting remote job script:\n{fp.read()}' + ) + job.submit() job.wait() with open('topo.json') as fp: - procinfo = json.load(fp) + topo_info = json.load(fp) + except Exception as e: getlogger().warning(f'failed to retrieve remote processor info: {e}') - procinfo = {} + topo_info = {} - return procinfo + return topo_info -def detect_procinfo(): +def detect_topology(): rt = runtime() detect_remote_systems = rt.get_option( 'general/0/detect_remote_system_topology' @@ -77,30 +81,39 @@ def detect_procinfo(): config_file = rt.site_config.filename if config_file == '': config_prefix = os.path.join( - os.getenv('HOME'), '.reframe/procinfo' + os.getenv('HOME'), '.reframe/topology' ) else: config_prefix = os.path.dirname(config_file) config_prefix = os.path.join(config_prefix, '_meta') for part in 
rt.system.partitions: + getlogger().debug(f'detecting topology info for {part.fullname}') if part.processor.info != {}: # Processor info set up already in the configuration + getlogger().debug( + f'> topology found in configuration file; skipping...' + ) continue - procinfo_file = os.path.join( + topo_file = os.path.join( config_prefix, f'{rt.system.name}-{part.name}', 'processor.json' ) - - if os.path.exists(procinfo_file): - part.processor._info = _load_procinfo(procinfo_file) + if os.path.exists(topo_file): + getlogger().debug( + f'> found topology file {topo_file!r}; loading...' + ) + part.processor._info = _load_topology(topo_file) continue - # No procinfo found, try to auto-detect it + # No topology found, try to auto-detect it + getlogger().debug(f'> no topology file found; auto-detecting...') if _is_part_local(part): # Unconditionally detect the system for fully local partitions part.processor._info = sysinfo.get_proc_info() - _save_procinfo(procinfo_file, part.processor.info) + _save_topology(topo_file, part.processor.info) elif detect_remote_systems: part.processor._info = _remote_detect(part) - _save_procinfo(procinfo_file, part.processor.info) + _save_topology(topo_file, part.processor.info) + + getlogger().debug(f'> saved topology in {topo_file!r}') diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index efac732a78..e24f059117 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -594,7 +594,7 @@ def main(): sys.exit(1) rt = runtime.runtime() - autodetect.detect_procinfo() + autodetect.detect_topology() try: if site_config.get('general/0/module_map_file'): rt.modules_system.load_mapping_from_file( diff --git a/reframe/utility/systeminfo.py b/reframe/utility/systeminfo.py index 3c77aa11a1..b7ebb58dce 100644 --- a/reframe/utility/systeminfo.py +++ b/reframe/utility/systeminfo.py @@ -3,11 +3,7 @@ # # SPDX-License-Identifier: BSD-3-Clause -'''Managing system information. - -.. 
versionadded:: 3.6.0 - -''' +import archspec.cpu import contextlib import glob import os @@ -282,14 +278,9 @@ def sysctl_info(): def get_proc_info(): - try: - import archspec.cpu - - processor_info = { - 'arch': archspec.cpu.host().name - } - except ModuleNotFoundError: - processor_info = {} + processor_info = { + 'arch': archspec.cpu.host().name + } # Try first to get information from the filesystem if glob.glob('/sys/'): From b33bfdd1e65046813d6ca809da31f931f0f2da1e Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Sat, 29 May 2021 23:12:48 +0200 Subject: [PATCH 10/30] Use parallel launcher --- reframe/frontend/autodetect.py | 6 +++--- reframe/frontend/cli.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index e5aad083a7..bb2e4ae2ec 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -3,7 +3,6 @@ import tempfile import reframe as rfm -import reframe.core.shell as shell import reframe.utility.osext as osext import reframe.utility.systeminfo as sysinfo from reframe.core.logging import getlogger @@ -53,9 +52,10 @@ def _remote_detect(part): name='rfm-detect-job', sched_access=part.access) with osext.change_dir(dirname): - job.prepare([f'{rfm_exec} --detect-local-topology=topo.json'], + launcher_cmd = job.launcher.run_command(job) + job.prepare([f'{launcher_cmd} {rfm_exec} ' + f'--detect-local-topology=topo.json'], trap_errors=True) - with open(job.script_filename) as fp: getlogger().debug( f'submitting remote job script:\n{fp.read()}' diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index e24f059117..745a066d6e 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -634,7 +634,6 @@ def main(): sys.exit(0) if options.detect_local_topology: - import reframe.utility as util from reframe.utility.systeminfo import get_proc_info topofile = options.detect_local_topology From b6c329f6607e91625cf11072920c575375a7c35b Mon Sep 
17 00:00:00 2001 From: Vasileios Karakasis Date: Sat, 29 May 2021 23:36:10 +0200 Subject: [PATCH 11/30] Fix doc build + wheel creation --- docs/requirements.txt | 1 + setup.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 879add60dd..bf9f80560f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ +archspec==0.1.2 jsonschema==3.2.0 semver==2.13.0 Sphinx==3.5.4 diff --git a/setup.py b/setup.py index b698fc23a3..a5307c4e35 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ ), package_data={'reframe': ['schemas/*']}, include_package_data=True, - install_requires=['argcomplete', 'jsonschema', 'lxml', 'PyYAML', 'semver'], + install_requires=['archspec', 'argcomplete', 'jsonschema', + 'lxml', 'PyYAML', 'semver'], python_requires='>=3.6', scripts=['bin/reframe'], classifiers=( From 72c6164e8fd31b40c672483ebc7ad6e7eedcd785 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Sun, 30 May 2021 00:06:02 +0200 Subject: [PATCH 12/30] Remove unused imports --- reframe/core/systems.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/reframe/core/systems.py b/reframe/core/systems.py index f7d92cf720..cf4fa4501d 100644 --- a/reframe/core/systems.py +++ b/reframe/core/systems.py @@ -4,11 +4,9 @@ # SPDX-License-Identifier: BSD-3-Clause import json -import os import reframe.utility as util import reframe.utility.jsonext as jsonext -import reframe.utility.systeminfo as sysinfo from reframe.core.backends import (getlauncher, getscheduler) from reframe.core.environments import (Environment, ProgEnvironment) from reframe.core.logging import getlogger From 45f86f4fe0d3926c32e770695e251853fc97841c Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 31 May 2021 00:35:01 +0200 Subject: [PATCH 13/30] Add unit tests --- reframe/frontend/autodetect.py | 2 +- reframe/frontend/cli.py | 11 ++++++----- unittests/test_cli.py | 27 +++++++++++++++++++++++++++ 3 files changed, 34 
insertions(+), 6 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index bb2e4ae2ec..379b0b2dcd 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -54,7 +54,7 @@ def _remote_detect(part): with osext.change_dir(dirname): launcher_cmd = job.launcher.run_command(job) job.prepare([f'{launcher_cmd} {rfm_exec} ' - f'--detect-local-topology=topo.json'], + f'--detect-host-topology=topo.json'], trap_errors=True) with open(job.script_filename) as fp: getlogger().debug( diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 745a066d6e..d03bb3f0bf 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -426,8 +426,8 @@ def main(): help='Print the value of configuration parameter PARAM and exit' ) misc_options.add_argument( - '--detect-local-topology', action='store', nargs='?', const='-', - help='Detect the local system topology and exit' + '--detect-host-topology', action='store', nargs='?', const='-', + help='Detect the local host topology and exit' ) misc_options.add_argument( '--system', action='store', help='Load configuration for SYSTEM', @@ -633,10 +633,10 @@ def main(): sys.exit(0) - if options.detect_local_topology: + if options.detect_host_topology: from reframe.utility.systeminfo import get_proc_info - topofile = options.detect_local_topology + topofile = options.detect_host_topology if topofile == '-': json.dump(get_proc_info(), sys.stdout, indent=2) sys.stdout.write('\n') @@ -647,7 +647,8 @@ def main(): fp.write('\n') except OSError as e: getlogger().error( - f'could not write topology file: {topofile!r}') + f'could not write topology file: {topofile!r}' + ) sys.exit(1) sys.exit(0) diff --git a/unittests/test_cli.py b/unittests/test_cli.py index b11605dead..37b8802a83 100644 --- a/unittests/test_cli.py +++ b/unittests/test_cli.py @@ -6,6 +6,7 @@ import contextlib import io import itertools +import json import os import pytest import re @@ -765,3 +766,29 @@ def 
test_maxfail_negative(run_reframe): assert 'Traceback' not in stderr assert "--maxfail should be a non-negative integer: '-2'" in stdout assert returncode == 1 + + +def test_detect_host_topology(run_reframe): + import reframe.utility.systeminfo as sysinfo + + returncode, stdout, stderr = run_reframe( + more_options=['--detect-host-topology'] + ) + assert 'Traceback' not in stdout + assert 'Traceback' not in stderr + assert returncode == 0 + assert stdout == json.dumps(sysinfo.get_proc_info(), indent=2) + '\n' + + +def test_detect_host_topology_file(run_reframe, tmp_path): + import reframe.utility.systeminfo as sysinfo + + topo_file = tmp_path / 'topo.json' + returncode, stdout, stderr = run_reframe( + more_options=[f'--detect-host-topology={topo_file}'] + ) + assert 'Traceback' not in stdout + assert 'Traceback' not in stderr + assert returncode == 0 + with open(topo_file) as fp: + assert json.load(fp) == sysinfo.get_proc_info() From 8f1500bcc82d9f36e91b540b022d4988f9dcf487 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 1 Jun 2021 00:40:37 +0200 Subject: [PATCH 14/30] Fine tune implementation --- reframe/frontend/autodetect.py | 9 +- reframe/frontend/cli.py | 6 +- reframe/utility/{systeminfo.py => cpuinfo.py} | 136 +++++++++--------- unittests/test_cli.py | 8 +- 4 files changed, 83 insertions(+), 76 deletions(-) rename reframe/utility/{systeminfo.py => cpuinfo.py} (75%) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index 379b0b2dcd..95f110b7f8 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -1,13 +1,18 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause + import json import os import tempfile import reframe as rfm import reframe.utility.osext as osext -import reframe.utility.systeminfo as sysinfo from reframe.core.logging import getlogger from reframe.core.runtime import runtime from reframe.core.schedulers import Job +from reframe.utility.cpuinfo import cpuinfo def _load_topology(filename): @@ -110,7 +115,7 @@ def detect_topology(): getlogger().debug(f'> no topology file found; auto-detecting...') if _is_part_local(part): # Unconditionally detect the system for fully local partitions - part.processor._info = sysinfo.get_proc_info() + part.processor._info = cpuinfo() _save_topology(topo_file, part.processor.info) elif detect_remote_systems: part.processor._info = _remote_detect(part) diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index d03bb3f0bf..1ea38e4314 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -634,16 +634,16 @@ def main(): sys.exit(0) if options.detect_host_topology: - from reframe.utility.systeminfo import get_proc_info + from reframe.utility.cpuinfo import cpuinfo topofile = options.detect_host_topology if topofile == '-': - json.dump(get_proc_info(), sys.stdout, indent=2) + json.dump(cpuinfo(), sys.stdout, indent=2) sys.stdout.write('\n') else: try: with open(topofile, 'w') as fp: - json.dump(get_proc_info(), fp, indent=2) + json.dump(cpuinfo(), fp, indent=2) fp.write('\n') except OSError as e: getlogger().error( diff --git a/reframe/utility/systeminfo.py b/reframe/utility/cpuinfo.py similarity index 75% rename from reframe/utility/systeminfo.py rename to reframe/utility/cpuinfo.py index b7ebb58dce..c5ca4203f1 100644 --- a/reframe/utility/systeminfo.py +++ b/reframe/utility/cpuinfo.py @@ -13,40 +13,43 @@ from reframe.core.exceptions import SpawnedProcessError -def bits_from_string(mask): - ret = [] - mask_int = int(mask, 0) - index = 0 - while mask_int: - if mask_int & 1: - ret.append(index) +def 
_bits_from_str(mask_s): + '''Return the set bits from a string representing a bit array.''' - index += 1 - mask_int >>= 1 + bits = [] + mask = int(mask_s, 0) + pos = 0 + while mask: + if mask & 1: + bits.append(pos) + + pos += 1 + mask >>= 1 + + return bits - return ret +def _str_from_bits(bits): + '''Return a string representation of a bit array with ``bits`` set.''' -def string_from_bits(ids): ret = 0 - for id in ids: - ret |= (1 << id) + for b in bits: + ret |= (1 << b) return hex(ret).lower() -def filesystem_info(): +def _sysfs_topo(): cache_units = { 'K': 1024, - 'M': 1048576, - 'G': 1073741824 + 'M': 1024*1024, + 'G': 1024*1024*1024 } - processor_info = { + cpuinfo = { 'topology': {} } cpu_dirs = glob.glob(r'/sys/devices/system/cpu/cpu[0-9]*') nodes = glob.glob(r'/sys/devices/system/node/node[0-9]*') - cores = set() for cpu in cpu_dirs: core_cpus_path = os.path.join(cpu, 'topology/core_cpus') @@ -63,7 +66,7 @@ def filesystem_info(): with open(cores_path) as fp: core_cpus = fp.read() core_cpus = re.sub(r'[\s,]', '', core_cpus) - core_cpus = f'0x{core_cpus.upper()}' + core_cpus = f'0x{core_cpus.lower()}' cores.add(core_cpus) sockets = set() @@ -82,7 +85,7 @@ def filesystem_info(): with open(sockets_path) as fp: package_cpus = fp.read() package_cpus = re.sub(r'[\s,]', '', package_cpus) - package_cpus = f'0x{package_cpus.upper()}' + package_cpus = f'0x{package_cpus.lower()}' sockets.add(package_cpus) numa_nodes = [] @@ -91,11 +94,10 @@ def filesystem_info(): with open(os.path.join(node, 'cpumap')) as fp: cpumap = fp.read() cpumap = re.sub(r'[\s,]', '', cpumap) - cpumap = f'0x{cpumap.upper()}' + cpumap = f'0x{cpumap.lower()}' numa_nodes.append(cpumap) numa_nodes.sort() - caches = {} for cpu in cpu_dirs: cache_dirs = glob.glob(cpu + r'/cache/index[0-9]*') @@ -105,7 +107,6 @@ def filesystem_info(): cache_linesize = 0 cache_associativity = 0 cache_cpuset = '' - with contextlib.suppress(IOError): with open(os.path.join(cache, 'level')) as fp: cache_level = 
int(fp.read()) @@ -132,7 +133,7 @@ def filesystem_info(): # Don't take the associativity directly from # "ways_of_associativity" file because some archs (ia64, ppc) # put 0 there when fully-associative, while others (x86) - # put something like -1 there. + # put something like -1. with contextlib.suppress(IOError): with open(os.path.join(cache, 'number_of_sets')) as fp: cache_number_of_sets = int(fp.read()) @@ -153,9 +154,9 @@ def filesystem_info(): with open(os.path.join(cache, 'shared_cpu_map')) as fp: cache_cpuset = fp.read() cache_cpuset = re.sub(r'[\s,]', '', cache_cpuset) - cache_cpuset = f'0x{cache_cpuset.upper()}' + cache_cpuset = f'0x{cache_cpuset.lower()}' - num_cpus = len(bits_from_string(cache_cpuset)) + num_cpus = len(_bits_from_str(cache_cpuset)) caches.setdefault((cache_level, cache_size, cache_linesize, cache_associativity, num_cpus), set()) caches[(cache_level, cache_size, cache_linesize, @@ -167,17 +168,18 @@ def filesystem_info(): num_cpus_per_core = num_cpus // num_cores if num_cores else 0 num_cpus_per_socket = num_cpus // num_sockets if num_sockets else 0 - processor_info['num_cpus'] = num_cpus - processor_info['num_cpus_per_core'] = num_cpus_per_core - processor_info['num_cpus_per_socket'] = num_cpus_per_socket - processor_info['num_sockets'] = num_sockets - processor_info['topology']['numa_nodes'] = numa_nodes - processor_info['topology']['sockets'] = sorted(list(sockets)) - processor_info['topology']['cores'] = sorted(list(cores)) - processor_info['topology']['caches'] = [] + # Fill in the cpuinfo + cpuinfo['num_cpus'] = num_cpus + cpuinfo['num_cpus_per_core'] = num_cpus_per_core + cpuinfo['num_cpus_per_socket'] = num_cpus_per_socket + cpuinfo['num_sockets'] = num_sockets + cpuinfo['topology']['numa_nodes'] = numa_nodes + cpuinfo['topology']['sockets'] = sorted(list(sockets)) + cpuinfo['topology']['cores'] = sorted(list(cores)) + cpuinfo['topology']['caches'] = [] for cache_type, cpusets in caches.items(): - (cache_level, cache_size, 
cache_linesize, cache_associativity, - num_cpus) = cache_type + (cache_level, cache_size, + cache_linesize, cache_associativity, num_cpus) = cache_type c = { 'type': f'L{cache_level}', 'size': cache_size, @@ -186,19 +188,19 @@ def filesystem_info(): 'num_cpus': num_cpus, 'cpusets': sorted(list(cpusets)) } - processor_info['topology']['caches'].append(c) + cpuinfo['topology']['caches'].append(c) - return processor_info + return cpuinfo -def sysctl_info(): +def _sysctl_topo(): try: exec_output = osext.run_command('sysctl hw machdep.cpu.cache', check=True) except (FileNotFoundError, SpawnedProcessError): return {} - processor_info = { + cpuinfo = { 'topology': {} } match = re.search(r'hw\.ncpu: (?P\d+)', exec_output.stdout) @@ -214,7 +216,7 @@ def sysctl_info(): exec_output.stdout) if match: num_sockets = int(match.group('num_sockets')) - processor_info['num_sockets'] = num_sockets + cpuinfo['num_sockets'] = num_sockets match = re.search(r'hw\.cacheconfig:(?P(\s\d+)*)', exec_output.stdout) @@ -231,8 +233,8 @@ def sysctl_info(): if match: linesize = int(match.group('linesize')) - cache_associativity = [0] # index 0 is referring to memory + cache_associativity = [0] for i in range(1, len(cachesize)): if cachesize[i] == 0: break @@ -240,26 +242,26 @@ def sysctl_info(): match = re.search(rf'machdep\.cpu\.cache\.L{i}_associativity: ' rf'(?P\d+)', exec_output.stdout) - ca = int(match.group('associativity')) if match else 0 - cache_associativity.append(ca) + assoc = int(match.group('associativity')) if match else 0 + cache_associativity.append(assoc) num_cpus_per_socket = num_cpus // num_sockets num_cpus_per_core = num_cpus // num_cores - processor_info['num_cpus'] = num_cpus - processor_info['num_cpus_per_socket'] = num_cpus_per_socket - processor_info['num_cpus_per_core'] = num_cpus_per_core - processor_info['topology']['numa_nodes'] = string_from_bits( - range(num_cpus)) - processor_info['topology']['sockets'] = [ - string_from_bits(range(start, 
start+num_cpus_per_socket)) for start - in range(0, num_cpus, num_cpus_per_socket) + # Fill in the cpuinfo + cpuinfo['num_cpus'] = num_cpus + cpuinfo['num_cpus_per_socket'] = num_cpus_per_socket + cpuinfo['num_cpus_per_core'] = num_cpus_per_core + cpuinfo['topology']['numa_nodes'] = _str_from_bits(range(num_cpus)) + cpuinfo['topology']['sockets'] = [ + _str_from_bits(range(start, start+num_cpus_per_socket)) + for start in range(0, num_cpus, num_cpus_per_socket) ] - processor_info['topology']['cores'] = [ - string_from_bits(range(start, start+num_cpus_per_core)) for start - in range(0, num_cpus, num_cpus_per_core) + cpuinfo['topology']['cores'] = [ + _str_from_bits(range(start, start+num_cpus_per_core)) + for start in range(0, num_cpus, num_cpus_per_core) ] - processor_info['topology']['caches'] = [] + cpuinfo['topology']['caches'] = [] for i in range(1, len(cache_associativity)): t = { 'type': f'L{i}', @@ -268,26 +270,26 @@ def sysctl_info(): 'associativity': cache_associativity[i], 'num_cpus': cacheconfig[i], 'cpusets': [ - string_from_bits(range(start, start+cacheconfig[i])) + _str_from_bits(range(start, start+cacheconfig[i])) for start in range(0, num_cpus, cacheconfig[i]) ] } - processor_info['topology']['caches'].append(t) + cpuinfo['topology']['caches'].append(t) - return processor_info + return cpuinfo -def get_proc_info(): - processor_info = { +def cpuinfo(): + ret = { 'arch': archspec.cpu.host().name } # Try first to get information from the filesystem - if glob.glob('/sys/'): - topology_information = filesystem_info() + if os.path.isdir('/sys'): + topology = _sysfs_topo() else: - # Try the `sysctl` command - topology_information = sysctl_info() + # Try with the `sysctl` command + topology = _sysctl_topo() - processor_info.update(topology_information) - return processor_info + ret.update(topology) + return ret diff --git a/unittests/test_cli.py b/unittests/test_cli.py index 37b8802a83..8b8deeba96 100644 --- a/unittests/test_cli.py +++ 
b/unittests/test_cli.py @@ -769,7 +769,7 @@ def test_maxfail_negative(run_reframe): def test_detect_host_topology(run_reframe): - import reframe.utility.systeminfo as sysinfo + from reframe.utility.cpuinfo import cpuinfo returncode, stdout, stderr = run_reframe( more_options=['--detect-host-topology'] @@ -777,11 +777,11 @@ def test_detect_host_topology(run_reframe): assert 'Traceback' not in stdout assert 'Traceback' not in stderr assert returncode == 0 - assert stdout == json.dumps(sysinfo.get_proc_info(), indent=2) + '\n' + assert stdout == json.dumps(cpuinfo(), indent=2) + '\n' def test_detect_host_topology_file(run_reframe, tmp_path): - import reframe.utility.systeminfo as sysinfo + from reframe.utility.cpuinfo import cpuinfo topo_file = tmp_path / 'topo.json' returncode, stdout, stderr = run_reframe( @@ -791,4 +791,4 @@ def test_detect_host_topology_file(run_reframe, tmp_path): assert 'Traceback' not in stderr assert returncode == 0 with open(topo_file) as fp: - assert json.load(fp) == sysinfo.get_proc_info() + assert json.load(fp) == cpuinfo() From 65228524172fa196a18edac16c8ac2a4edc69170 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 2 Jun 2021 00:12:41 +0200 Subject: [PATCH 15/30] Load device metadata files --- reframe/frontend/autodetect.py | 61 +++++++++++++++++++++++++--------- unittests/test_autodetect.py | 41 +++++++++++++++++++++++ unittests/test_cli.py | 9 ++++- 3 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 unittests/test_autodetect.py diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index 95f110b7f8..0b4703cb9a 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -15,18 +15,18 @@ from reframe.utility.cpuinfo import cpuinfo -def _load_topology(filename): +def _load_info(filename): try: with open(filename) as fp: return json.load(fp) except OSError as e: getlogger().warning( - f'could not load topology file: {filename!r}: {e}' + f'could not load file: 
{filename!r}: {e}' ) return {} -def _save_topology(filename, topo_info): +def _save_info(filename, topo_info): if not topo_info: return @@ -94,31 +94,60 @@ def detect_topology(): for part in rt.system.partitions: getlogger().debug(f'detecting topology info for {part.fullname}') + found_procinfo = False + found_devinfo = False if part.processor.info != {}: # Processor info set up already in the configuration getlogger().debug( f'> topology found in configuration file; skipping...' ) + found_procinfo = True + + if part.devices: + # Devices set up already in the configuration + getlogger().debug( + f'> devices found in configuration file; skipping...' + ) + found_devinfo = True + + if found_procinfo and found_devinfo: continue topo_file = os.path.join( config_prefix, f'{rt.system.name}-{part.name}', 'processor.json' ) - if os.path.exists(topo_file): + dev_file = os.path.join( + config_prefix, f'{rt.system.name}-{part.name}', 'devices.json' + ) + if not found_procinfo and os.path.exists(topo_file): getlogger().debug( f'> found topology file {topo_file!r}; loading...' ) - part.processor._info = _load_topology(topo_file) + part.processor._info = _load_info(topo_file) + found_procinfo = True + + if not found_devinfo and os.path.exists(dev_file): + getlogger().debug( + f'> found devices file {dev_file!r}; loading...' 
+ ) + part._devices = _load_info(dev_file) + found_devinfo = True + + if found_procinfo and found_devinfo: continue - # No topology found, try to auto-detect it - getlogger().debug(f'> no topology file found; auto-detecting...') - if _is_part_local(part): - # Unconditionally detect the system for fully local partitions - part.processor._info = cpuinfo() - _save_topology(topo_file, part.processor.info) - elif detect_remote_systems: - part.processor._info = _remote_detect(part) - _save_topology(topo_file, part.processor.info) - - getlogger().debug(f'> saved topology in {topo_file!r}') + if not found_procinfo: + # No topology found, try to auto-detect it + getlogger().debug(f'> no topology file found; auto-detecting...') + if _is_part_local(part): + # Unconditionally detect the system for fully local partitions + part.processor._info = cpuinfo() + _save_info(topo_file, part.processor.info) + elif detect_remote_systems: + part.processor._info = _remote_detect(part) + _save_info(topo_file, part.processor.info) + + getlogger().debug(f'> saved topology in {topo_file!r}') + + if not found_devinfo: + getlogger().debug(f'> device auto-detection is not supported') diff --git a/unittests/test_autodetect.py b/unittests/test_autodetect.py new file mode 100644 index 0000000000..c2ced9a7e4 --- /dev/null +++ b/unittests/test_autodetect.py @@ -0,0 +1,41 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause + +import json +import os +import pytest +import shutil + + +from reframe.core.runtime import runtime +from reframe.frontend.autodetect import detect_topology +from reframe.utility.cpuinfo import cpuinfo + + +@pytest.fixture +def exec_ctx(make_exec_ctx_g, tmp_path): + # Copy the default settings to the temp dir + shutil.copy('reframe/core/settings.py', tmp_path / 'settings.py') + + # Create a devices file manually, since it is not auto-generated + meta_prefix = tmp_path / '_meta' / 'generic-default' + os.makedirs(meta_prefix) + with open(meta_prefix / 'devices.json', 'w') as fp: + json.dump([ + { + 'type': 'gpu', + 'arch': 'a100', + 'num_devices': 8 + } + ], fp) + + yield from make_exec_ctx_g(tmp_path / 'settings.py') + + +def test_autotect(exec_ctx): + detect_topology() + part = runtime().system.partitions[0] + assert part.processor.info == cpuinfo() + assert part.devices == [{'type': 'gpu', 'arch': 'a100', 'num_devices': 8}] diff --git a/unittests/test_cli.py b/unittests/test_cli.py index 8b8deeba96..c64434dde9 100644 --- a/unittests/test_cli.py +++ b/unittests/test_cli.py @@ -16,6 +16,7 @@ import reframe.frontend.runreport as runreport import reframe.core.logging as logging import reframe.core.runtime as rt +import reframe.utility.osext as osext import unittests.utility as test_util @@ -53,7 +54,13 @@ def perflogdir(tmp_path): @pytest.fixture -def run_reframe(tmp_path, perflogdir): +def rm_config_meta(): + yield + osext.rmtree('unittests/resources/_meta') + + +@pytest.fixture +def run_reframe(tmp_path, perflogdir, rm_config_meta): def _run_reframe(system='generic:default', checkpath=['unittests/resources/checks/hellocheck.py'], environs=['builtin'], From c030e3afe9c06815caa4491d10f1436dd80aee33 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 3 Jun 2021 01:18:31 +0200 Subject: [PATCH 16/30] More unit tests for topology auto-detection --- unittests/test_autodetect.py | 5 +++-- unittests/test_cli.py | 3 ++- 
2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/unittests/test_autodetect.py b/unittests/test_autodetect.py index c2ced9a7e4..e4f1141266 100644 --- a/unittests/test_autodetect.py +++ b/unittests/test_autodetect.py @@ -17,7 +17,8 @@ @pytest.fixture def exec_ctx(make_exec_ctx_g, tmp_path): # Copy the default settings to the temp dir - shutil.copy('reframe/core/settings.py', tmp_path / 'settings.py') + config_file = tmp_path / 'conf.py' + shutil.copy('reframe/core/settings.py', config_file) # Create a devices file manually, since it is not auto-generated meta_prefix = tmp_path / '_meta' / 'generic-default' @@ -31,7 +32,7 @@ def exec_ctx(make_exec_ctx_g, tmp_path): } ], fp) - yield from make_exec_ctx_g(tmp_path / 'settings.py') + yield from make_exec_ctx_g(config_file) def test_autotect(exec_ctx): diff --git a/unittests/test_cli.py b/unittests/test_cli.py index c64434dde9..97fcb212bb 100644 --- a/unittests/test_cli.py +++ b/unittests/test_cli.py @@ -10,6 +10,7 @@ import os import pytest import re +import shutil import sys import reframe.core.environments as env @@ -56,7 +57,7 @@ def perflogdir(tmp_path): @pytest.fixture def rm_config_meta(): yield - osext.rmtree('unittests/resources/_meta') + shutil.rmtree('unittests/resources/_meta', ignore_errors=True) @pytest.fixture From 071e54546d10fc14eaa8fcdcb223af4f6b4210d7 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 3 Jun 2021 01:23:11 +0200 Subject: [PATCH 17/30] Remove unused imports --- unittests/test_cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unittests/test_cli.py b/unittests/test_cli.py index 97fcb212bb..4d51f888b4 100644 --- a/unittests/test_cli.py +++ b/unittests/test_cli.py @@ -17,7 +17,6 @@ import reframe.frontend.runreport as runreport import reframe.core.logging as logging import reframe.core.runtime as rt -import reframe.utility.osext as osext import unittests.utility as test_util From 7ea6115fc6a5c5d96df306200eb7c44f3f65e103 Mon Sep 17 00:00:00 2001 From: Vasileios 
Karakasis Date: Mon, 7 Jun 2021 00:07:06 +0200 Subject: [PATCH 18/30] Add documentation --- docs/config_reference.rst | 12 ++++++++++++ docs/configure.rst | 37 +++++++++++++++++++++++++++++++++++++ docs/manpage.rst | 23 +++++++++++++++++++++++ reframe/frontend/cli.py | 8 ++++---- 4 files changed, 76 insertions(+), 4 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index f31a62b124..26371445aa 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -1201,6 +1201,18 @@ General Configuration The command-line option sets the configuration option to ``false``. +.. js:attribute:: .general[].detect_remote_system_topology + + :required: No + :default: ``false`` + + Try to auto-detect processor information of remote partitions as well. + This may slow down the initialization of the framework, since it involves submitting auto-detection jobs to the remote partitions. + For more information on how ReFrame auto-detects processor information, you may refer to :ref:`proc-autodetection`. + + .. versionadded:: 3.6.2 + + .. js:attribute:: .general[].ignore_check_conflicts :required: No diff --git a/docs/configure.rst b/docs/configure.rst index c304bbee97..d0b4ad6704 100644 --- a/docs/configure.rst +++ b/docs/configure.rst @@ -397,3 +397,40 @@ Let's see some concrete examples: "CC" If you explicitly query a configuration value which is not defined in the configuration file, ReFrame will print its default value. + + +.. _proc-autodetection: + +------------------------------------ +Auto-detecting processor information +------------------------------------ + +.. versionadded:: 3.6.2 + +.. |devices| replace:: :attr:`devices` +.. _devices: config_reference.html#.systems[].partitions[].devices +.. |processor| replace:: :attr:`processor` +.. _processor: config_reference.html#.systems[].partitions[].processor +.. |detect_remote_system_topology| replace:: :attr:`processor` +.. 
_detect_remote_system_topology: config_reference.html#.general[].detect_remote_system_topology + +ReFrame is able to detect the processor topology of both local and remote partitions automatically. +The processor and device information are made available to the tests through the corresponding attributes of the :attr:`~reframe.core.pipeline.RegressionTest.current_partition` allowing a test to modify its behavior accordingly. +Currently, ReFrame supports auto-detection of the local or remote processor information only. +It does not support auto-detection of devices, in which cases users should explicitly specify this information using the |devices|_ configuration option. +The processor information auto-detection works as follows: + +#. If the |processor|_ configuration is option is defined, then no auto-detection is attempted. + +#. If the |processor|_ configuration option is not defined, ReFrame will look for a processor configuration metadata file in ``{configdir}/_meta/{system}-{part}/processor.json`` or in ``~/.reframe/topology/{system}-{part}/processor.json`` in case of the builtin configuration file. + If the file is found, the topology information is loaded from there. + These files are generated automatically by ReFrame from previous runs. + +#. If the corresponding metadata files are not found, the processor information will be auto-detected. + If the system partition is local (i.e., ``local`` scheduler + ``local`` launcher), the processor information is auto-detected unconditionally and stored in the corresponding metadata file for this partition. + If the partition is remote, ReFrame will not try to auto-detect it unless the :envvar:`RFM_DETECT_REMOTE_SYSTEM_TOPOLOGY` or the |detect_remote_system_topology|_ configuration option is set. + + For detecting remote processor information, ReFrame will generate a job script based on the partition information and launch itself on the remote system with ``{launcher} reframe --detect-host-topology=topo.json``. 
+ The :option:`--detect-host-topology` option causes ReFrame to detect the topology of the current host. + + In case of errors during auto-detection, ReFrame will simply issue a warning and continue. diff --git a/docs/manpage.rst b/docs/manpage.rst index 69a5d0123d..46ecd7b97c 100644 --- a/docs/manpage.rst +++ b/docs/manpage.rst @@ -576,6 +576,16 @@ Miscellaneous options This option can also be set using the :envvar:`RFM_SYSTEM` environment variable. +.. _--detect-host-topology: + +.. option:: --detect-host-topology[=FILE] + + Detect the local host processor topology, store it to ``FILE`` and exit. + If no ``FILE`` is specified, the standard output will be used. + + .. versionadded:: 3.6.2 + + .. option:: --failure-stats Print failure statistics at the end of the run. @@ -698,6 +708,19 @@ Here is an alphabetical list of the environment variables recognized by ReFrame: ================================== ================== +.. envvar:: RFM_DETECT_REMOTE_SYSTEM_TOPOLOGY + + Auto-detect processor information of remote partitions as well. + + .. table:: + :align: left + + ================================== ================== + Associated command line option n/a + Associated configuration parameter :js:attr:`detect_remote_system_topology` general configuration parameter + ================================== ================== + + .. envvar:: RFM_GRAYLOG_ADDRESS The address of the Graylog server to send performance logs. 
diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 1ea38e4314..25802d53a7 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -425,14 +425,14 @@ def main(): metavar='PARAM', help='Print the value of configuration parameter PARAM and exit' ) - misc_options.add_argument( - '--detect-host-topology', action='store', nargs='?', const='-', - help='Detect the local host topology and exit' - ) misc_options.add_argument( '--system', action='store', help='Load configuration for SYSTEM', envvar='RFM_SYSTEM' ) + misc_options.add_argument( + '--detect-host-topology', action='store', nargs='?', const='-', + help='Detect the local host topology and exit' + ) misc_options.add_argument( '--upgrade-config-file', action='store', metavar='OLD[:NEW]', help='Upgrade ReFrame 2.x configuration file to ReFrame 3.x syntax' From 9c10cbec7becda800debbe141a224492c46112fd Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 29 Jun 2021 21:55:52 +0200 Subject: [PATCH 19/30] Address PR comments --- docs/config_reference.rst | 2 +- docs/configure.rst | 4 +-- docs/manpage.rst | 6 ++-- reframe/core/config.py | 5 +++ reframe/frontend/autodetect.py | 64 ++++++++++++++++++++++++++++------ reframe/schemas/config.json | 9 ++--- reframe/utility/cpuinfo.py | 2 +- 7 files changed, 70 insertions(+), 22 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 26371445aa..8ff473571e 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -1210,7 +1210,7 @@ General Configuration This may slow down the initialization of the framework, since it involves submitting auto-detection jobs to the remote partitions. For more information on how ReFrame auto-detects processor information, you may refer to :ref:`proc-autodetection`. - .. versionadded:: 3.6.2 + .. versionadded:: 3.7.0 .. 
js:attribute:: .general[].ignore_check_conflicts diff --git a/docs/configure.rst b/docs/configure.rst index 5092266aa1..0ee0e52b40 100644 --- a/docs/configure.rst +++ b/docs/configure.rst @@ -405,13 +405,13 @@ Let's see some concrete examples: Auto-detecting processor information ------------------------------------ -.. versionadded:: 3.6.2 +.. versionadded:: 3.7.0 .. |devices| replace:: :attr:`devices` .. _devices: config_reference.html#.systems[].partitions[].devices .. |processor| replace:: :attr:`processor` .. _processor: config_reference.html#.systems[].partitions[].processor -.. |detect_remote_system_topology| replace:: :attr:`processor` +.. |detect_remote_system_topology| replace:: :attr:`detect_remote_system_topology` .. _detect_remote_system_topology: config_reference.html#.general[].detect_remote_system_topology ReFrame is able to detect the processor topology of both local and remote partitions automatically. diff --git a/docs/manpage.rst b/docs/manpage.rst index 1e16fbf2a2..4f0bf9bce1 100644 --- a/docs/manpage.rst +++ b/docs/manpage.rst @@ -583,7 +583,7 @@ Miscellaneous options Detect the local host processor topology, store it to ``FILE`` and exit. If no ``FILE`` is specified, the standard output will be used. - .. versionadded:: 3.6.2 + .. versionadded:: 3.7.0 .. 
option:: --failure-stats @@ -716,7 +716,7 @@ Here is an alphabetical list of the environment variables recognized by ReFrame: :align: left ================================== ================== - Associated command line option n/a + Associated command line option N/A Associated configuration parameter :js:attr:`detect_remote_system_topology` general configuration parameter ================================== ================== @@ -943,7 +943,7 @@ Here is an alphabetical list of the environment variables recognized by ReFrame: :align: left ================================== ================== - Associated command line option n/a + Associated command line option N/A Associated configuration parameter :js:attr:`resolve_module_conflicts` general configuration parameter ================================== ================== diff --git a/reframe/core/config.py b/reframe/core/config.py index 4e15eefe64..8442e9859c 100644 --- a/reframe/core/config.py +++ b/reframe/core/config.py @@ -101,6 +101,11 @@ def __getitem__(self, key): def __getattr__(self, attr): return getattr(self._pick_config(), attr) + @property + def schema(self): + '''Configuration schema''' + return self._schema + def add_sticky_option(self, option, value): self._sticky_options[option] = value diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index 0b4703cb9a..6934a2da58 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -4,26 +4,54 @@ # SPDX-License-Identifier: BSD-3-Clause import json +import jsonschema import os import tempfile import reframe as rfm import reframe.utility.osext as osext +from reframe.core.exceptions import ConfigError from reframe.core.logging import getlogger from reframe.core.runtime import runtime from reframe.core.schedulers import Job from reframe.utility.cpuinfo import cpuinfo -def _load_info(filename): +_REFRAME_GH_REPO = 'https://github.com/eth-cscs/reframe.git' + + +def _subschema(fragment): + '''Create a configuration 
subschema.''' + + full_schema = runtime().site_config.schema + return { + '$schema': full_schema['$schema'], + 'defs': full_schema['defs'], + '$ref': fragment + } + + +def _validate_info(info, schema): + if schema is None: + return info + + jsonschema.validate(info, schema) + return info + + +def _load_info(filename, schema=None): try: with open(filename) as fp: - return json.load(fp) + return _validate_info(json.load(fp), schema) except OSError as e: getlogger().warning( f'could not load file: {filename!r}: {e}' ) return {} + except jsonschema.ValidationError as e: + raise ConfigError( + f'could not validate meta-config file {filename!r}' + ) from e def _save_info(filename, topo_info): @@ -46,6 +74,24 @@ def _is_part_local(part): def _remote_detect(part): + def _emit_script(job, fresh=False): + if fresh: + commands = [ + f'_prefix=$(mktemp -d)', + f'cd $_prefix', + f'git clone {_REFRAME_GH_REPO} reframe', + f'cd reframe', + f'./bootstrap.sh' + ] + else: + commands = [] + + launcher_cmd = job.launcher.run_command(job) + commands += [ + f'{launcher_cmd} {rfm_exec} --detect-host-topology=topo.json' + ] + job.prepare(commands, trap_errors=True) + getlogger().info( f'Detecting topology of remote partition {part.fullname!r}' ) @@ -57,10 +103,7 @@ def _remote_detect(part): name='rfm-detect-job', sched_access=part.access) with osext.change_dir(dirname): - launcher_cmd = job.launcher.run_command(job) - job.prepare([f'{launcher_cmd} {rfm_exec} ' - f'--detect-host-topology=topo.json'], - trap_errors=True) + _emit_script(job, fresh) with open(job.script_filename) as fp: getlogger().debug( f'submitting remote job script:\n{fp.read()}' @@ -70,7 +113,6 @@ def _remote_detect(part): job.wait() with open('topo.json') as fp: topo_info = json.load(fp) - except Exception as e: getlogger().warning(f'failed to retrieve remote processor info: {e}') topo_info = {} @@ -89,8 +131,7 @@ def detect_topology(): os.getenv('HOME'), '.reframe/topology' ) else: - config_prefix = 
os.path.dirname(config_file) - config_prefix = os.path.join(config_prefix, '_meta') + config_prefix = os.path.join(os.path.dirname(config_file), '_meta') for part in rt.system.partitions: getlogger().debug(f'detecting topology info for {part.fullname}') @@ -123,14 +164,15 @@ def detect_topology(): getlogger().debug( f'> found topology file {topo_file!r}; loading...' ) - part.processor._info = _load_info(topo_file) + part.processor._info = _load_info(topo_file, + _subschema('#/defs/processor_info')) found_procinfo = True if not found_devinfo and os.path.exists(dev_file): getlogger().debug( f'> found devices file {dev_file!r}; loading...' ) - part._devices = _load_info(dev_file) + part._devices = _load_info(dev_file, _subschema('#/defs/devices')) found_devinfo = True if found_procinfo and found_devinfo: diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 45fdc149f4..6a9e1f5413 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -207,6 +207,10 @@ "arch": {"type": "string"}, "num_devices": {"type": "number"} } + }, + "devices": { + "type": "array", + "items": {"$ref": "#/defs/device_info"} } }, "type": "object", @@ -294,10 +298,7 @@ "items": {"type": "string"} }, "processor": {"$ref": "#/defs/processor_info"}, - "devices": { - "type": "array", - "items": {"$ref": "#/defs/device_info"} - }, + "devices": {"$ref": "#/defs/devices"}, "extras": {"type": "object"}, "resources": { "type": "array", diff --git a/reframe/utility/cpuinfo.py b/reframe/utility/cpuinfo.py index c5ca4203f1..a770a71c22 100644 --- a/reframe/utility/cpuinfo.py +++ b/reframe/utility/cpuinfo.py @@ -252,7 +252,7 @@ def _sysctl_topo(): cpuinfo['num_cpus'] = num_cpus cpuinfo['num_cpus_per_socket'] = num_cpus_per_socket cpuinfo['num_cpus_per_core'] = num_cpus_per_core - cpuinfo['topology']['numa_nodes'] = _str_from_bits(range(num_cpus)) + cpuinfo['topology']['numa_nodes'] = [_str_from_bits(range(num_cpus))] cpuinfo['topology']['sockets'] = [ 
_str_from_bits(range(start, start+num_cpus_per_socket)) for start in range(0, num_cpus, num_cpus_per_socket) From 0953adc6c0167b5685d6d9686dd2deca6b4de390 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 1 Jul 2021 23:54:25 +0200 Subject: [PATCH 20/30] WIP: Address PR comments --- reframe/frontend/autodetect.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index 6934a2da58..3d84d64186 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -88,7 +88,7 @@ def _emit_script(job, fresh=False): launcher_cmd = job.launcher.run_command(job) commands += [ - f'{launcher_cmd} {rfm_exec} --detect-host-topology=topo.json' + f'{launcher_cmd} {rfm_exec} --detect-host-topology' ] job.prepare(commands, trap_errors=True) @@ -103,20 +103,30 @@ def _emit_script(job, fresh=False): name='rfm-detect-job', sched_access=part.access) with osext.change_dir(dirname): - _emit_script(job, fresh) - with open(job.script_filename) as fp: - getlogger().debug( - f'submitting remote job script:\n{fp.read()}' - ) - - job.submit() - job.wait() - with open('topo.json') as fp: - topo_info = json.load(fp) + more_tries = [{'fresh': False}] + while more_tries: + args = more_tries.pop() + + _emit_script(job, **args) + with open(job.script_filename) as fp: + getlogger().debug( + f'submitting remote job script:\n{fp.read()}' + ) + + job.submit() + job.wait() + with open(job.stdout) as fp: + try: + topo_info = json.load(fp) + except json.JSONDecodeError: + getlogger().debug(f'not a JSON file:\n{fp.read()}') + more_tries.append({'fresh': True}) except Exception as e: - getlogger().warning(f'failed to retrieve remote processor info: {e}') topo_info = {} + if not topo_info: + getlogger().warning(f'failed to retrieve remote processor info: {e}') + return topo_info From 0f87547343bc95ec2a131fd2ef6d92d3a404bca2 Mon Sep 17 00:00:00 2001 From: Vasileios 
Karakasis Date: Fri, 2 Jul 2021 00:55:16 +0200 Subject: [PATCH 21/30] Fix UnboundLocalError --- reframe/frontend/autodetect.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index 3d84d64186..ae3bc20174 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -123,9 +123,10 @@ def _emit_script(job, fresh=False): more_tries.append({'fresh': True}) except Exception as e: topo_info = {} - - if not topo_info: getlogger().warning(f'failed to retrieve remote processor info: {e}') + else: + if not topo_info: + getlogger().warning('failed to retrieve remote processor info') return topo_info From 9617a4bbb90285a24c8679e867234ae3d4eaaeaa Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 2 Jul 2021 09:36:51 +0200 Subject: [PATCH 22/30] Fix PEP8 issues --- reframe/frontend/autodetect.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index ae3bc20174..a7e589fadf 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -175,8 +175,9 @@ def detect_topology(): getlogger().debug( f'> found topology file {topo_file!r}; loading...' 
) - part.processor._info = _load_info(topo_file, - _subschema('#/defs/processor_info')) + part.processor._info = _load_info( + topo_file, _subschema('#/defs/processor_info') + ) found_procinfo = True if not found_devinfo and os.path.exists(dev_file): From e22f28d6460a85deded61085c33c4c3dc3cbc5ad Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 2 Jul 2021 14:03:05 +0200 Subject: [PATCH 23/30] Fix remote detection --- reframe/frontend/autodetect.py | 54 +++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index a7e589fadf..161fb5303a 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -20,6 +20,20 @@ _REFRAME_GH_REPO = 'https://github.com/eth-cscs/reframe.git' +def _contents(filename): + '''Return the contents of a file.''' + + with open(filename) as fp: + return fp.read() + + +def _log_contents(filename): + filename = os.path.abspath(filename) + getlogger().debug(f'--- {filename} ---\n' + f'{_contents(filename)}\n' + f'--- {filename} ---') + + def _subschema(fragment): '''Create a configuration subschema.''' @@ -96,33 +110,30 @@ def _emit_script(job, fresh=False): f'Detecting topology of remote partition {part.fullname!r}' ) rfm_exec = os.path.join(rfm.INSTALL_PREFIX, 'bin/reframe') + topo_info = {} try: with tempfile.TemporaryDirectory(dir='.') as dirname: - job = Job.create(part.scheduler, - part.launcher_type(), - name='rfm-detect-job', - sched_access=part.access) with osext.change_dir(dirname): - more_tries = [{'fresh': False}] - while more_tries: - args = more_tries.pop() - + methods = [{'fresh': False}, {'fresh': True}] + for args in methods: + job = Job.create(part.scheduler, + part.launcher_type(), + name='rfm-detect-job', + sched_access=part.access) _emit_script(job, **args) - with open(job.script_filename) as fp: - getlogger().debug( - f'submitting remote job script:\n{fp.read()}' - ) - + 
getlogger().debug('submitting detection script') + _log_contents(job.script_filename) job.submit() job.wait() - with open(job.stdout) as fp: - try: - topo_info = json.load(fp) - except json.JSONDecodeError: - getlogger().debug(f'not a JSON file:\n{fp.read()}') - more_tries.append({'fresh': True}) + getlogger().debug('job finished') + _log_contents(job.stdout) + _log_contents(job.stderr) + try: + topo_info = json.loads(_contents(job.stdout)) + break + except json.JSONDecodeError: + getlogger().debug('stdout not a JSON file') except Exception as e: - topo_info = {} getlogger().warning(f'failed to retrieve remote processor info: {e}') else: if not topo_info: @@ -199,7 +210,8 @@ def detect_topology(): _save_info(topo_file, part.processor.info) elif detect_remote_systems: part.processor._info = _remote_detect(part) - _save_info(topo_file, part.processor.info) + if part.processor.info: + _save_info(topo_file, part.processor.info) getlogger().debug(f'> saved topology in {topo_file!r}') From ac53c8d95776cb960ebcf6990db9c8076ec158a5 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 2 Jul 2021 14:11:19 +0200 Subject: [PATCH 24/30] Temporarily change pull repo and branch --- reframe/frontend/autodetect.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index 161fb5303a..259c20133e 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -17,7 +17,8 @@ from reframe.utility.cpuinfo import cpuinfo -_REFRAME_GH_REPO = 'https://github.com/eth-cscs/reframe.git' +_GH_REPO = 'https://github.com/vkarak/reframe.git' +_GH_BRANCH = 'feat/cpu-autodetect' def _contents(filename): @@ -93,7 +94,7 @@ def _emit_script(job, fresh=False): commands = [ f'_prefix=$(mktemp -d)', f'cd $_prefix', - f'git clone {_REFRAME_GH_REPO} reframe', + f'git clone -b {_GH_BRANCH} {_GH_REPO} reframe', f'cd reframe', f'./bootstrap.sh' ] From 059ebe0e2671d71b6da0d5c1422bb8d0adf6b364 Mon Sep 
17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 2 Jul 2021 15:20:33 +0200 Subject: [PATCH 25/30] Fix reframe executable --- reframe/frontend/autodetect.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index 259c20133e..2b18a9b3e9 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -91,6 +91,7 @@ def _is_part_local(part): def _remote_detect(part): def _emit_script(job, fresh=False): if fresh: + rfm_exec = './bin/reframe' commands = [ f'_prefix=$(mktemp -d)', f'cd $_prefix', @@ -99,6 +100,7 @@ def _emit_script(job, fresh=False): f'./bootstrap.sh' ] else: + rfm_exec = os.path.join(rfm.INSTALL_PREFIX, 'bin/reframe') commands = [] launcher_cmd = job.launcher.run_command(job) @@ -110,7 +112,6 @@ def _emit_script(job, fresh=False): getlogger().info( f'Detecting topology of remote partition {part.fullname!r}' ) - rfm_exec = os.path.join(rfm.INSTALL_PREFIX, 'bin/reframe') topo_info = {} try: with tempfile.TemporaryDirectory(dir='.') as dirname: From 71c2b268a27cb4640852dda35299ffcef3704e1e Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 6 Jul 2021 00:14:58 +0200 Subject: [PATCH 26/30] Improve remote processor detection --- reframe/frontend/autodetect.py | 83 ++++++++++++++++------------------ reframe/frontend/cli.py | 13 ++++-- reframe/schemas/config.json | 4 ++ 3 files changed, 52 insertions(+), 48 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index 2b18a9b3e9..d059032795 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -17,10 +17,6 @@ from reframe.utility.cpuinfo import cpuinfo -_GH_REPO = 'https://github.com/vkarak/reframe.git' -_GH_BRANCH = 'feat/cpu-autodetect' - - def _contents(filename): '''Return the contents of a file.''' @@ -35,6 +31,24 @@ def _log_contents(filename): f'--- {filename} ---') +class _copy_reframe: + def __init__(self, prefix): + 
self._prefix = prefix + self._prefix = runtime().get_option('general/0/remote_workdir') + self._workdir = None + + def __enter__(self): + self._workdir = tempfile.mkdtemp(prefix='rfm.', dir=self._prefix) + paths = ['bin/', 'reframe/', 'bootstrap.sh', 'requirements.txt'] + for p in paths: + osext.copytree(os.path.join(rfm.INSTALL_PREFIX, p), self._workdir) + + return self._workdir + + def __exit__(self): + osext.rmtree(self._workdir) + + def _subschema(fragment): '''Create a configuration subschema.''' @@ -89,23 +103,11 @@ def _is_part_local(part): def _remote_detect(part): - def _emit_script(job, fresh=False): - if fresh: - rfm_exec = './bin/reframe' - commands = [ - f'_prefix=$(mktemp -d)', - f'cd $_prefix', - f'git clone -b {_GH_BRANCH} {_GH_REPO} reframe', - f'cd reframe', - f'./bootstrap.sh' - ] - else: - rfm_exec = os.path.join(rfm.INSTALL_PREFIX, 'bin/reframe') - commands = [] - + def _emit_script(job): launcher_cmd = job.launcher.run_command(job) commands += [ - f'{launcher_cmd} {rfm_exec} --detect-host-topology' + f'./bootstrap.sh' + f'{launcher_cmd} ./bin/reframe --detect-host-topology=topo.json' ] job.prepare(commands, trap_errors=True) @@ -114,41 +116,32 @@ def _emit_script(job, fresh=False): ) topo_info = {} try: - with tempfile.TemporaryDirectory(dir='.') as dirname: + dest = runtime().get_option('general/0/remote_workdir') + with _copy_reframe(dest) as dirname: with osext.change_dir(dirname): - methods = [{'fresh': False}, {'fresh': True}] - for args in methods: - job = Job.create(part.scheduler, - part.launcher_type(), - name='rfm-detect-job', - sched_access=part.access) - _emit_script(job, **args) - getlogger().debug('submitting detection script') - _log_contents(job.script_filename) - job.submit() - job.wait() - getlogger().debug('job finished') - _log_contents(job.stdout) - _log_contents(job.stderr) - try: - topo_info = json.loads(_contents(job.stdout)) - break - except json.JSONDecodeError: - getlogger().debug('stdout not a JSON file') + job = 
Job.create(part.scheduler, + part.launcher_type(), + name='rfm-detect-job', + sched_access=part.access) + _emit_script(job) + + getlogger().debug('submitting detection script') + _log_contents(job.script_filename) + job.submit() + job.wait() + getlogger().debug('job finished') + _log_contents(job.stdout) + _log_contents(job.stderr) + topo_info = json.loads(_contents('topo.json')) except Exception as e: getlogger().warning(f'failed to retrieve remote processor info: {e}') - else: - if not topo_info: - getlogger().warning('failed to retrieve remote processor info') return topo_info def detect_topology(): rt = runtime() - detect_remote_systems = rt.get_option( - 'general/0/detect_remote_system_topology' - ) + detect_remote_systems = rt.get_option('general/0/remote_detect') config_file = rt.site_config.filename if config_file == '': config_prefix = os.path.join( diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 25802d53a7..7e927abb8b 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -487,12 +487,19 @@ def main(): help='URL of HTTP server accepting JSON logs' ) argparser.add_argument( - dest='detect_remote_system_topology', - envvar='RFM_DETECT_REMOTE_SYSTEM_TOPOLOGY', - configvar='general/detect_remote_system_topology', + dest='remote_detect', + envvar='RFM_REMOTE_DETECT', + configvar='general/remote_detect', action='store_true', help='Detect remote system topology' ) + argparser.add_argument( + dest='remote_tmpdir', + envvar='RFM_REMOTE_WORKDIR', + configvar='general/remote_workdir', + action='store_true', + help='Working directory for launching ReFrame remotely' + ) # Parse command line options = argparser.parse_args() diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 6a9e1f5413..294fba0a32 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -461,6 +461,8 @@ }, "non_default_craype": {"type": "boolean"}, "purge_environment": {"type": "boolean"}, + "remote_detect": {"type": 
"boolean"}, + "remote_workdir": {"type": "string"}, "report_file": {"type": "string"}, "report_junit": {"type": ["string", "null"]}, "resolve_module_conflicts": {"type": "boolean"}, @@ -501,6 +503,8 @@ "general/module_mappings": [], "general/non_default_craype": false, "general/purge_environment": false, + "general/remote_detect": false, + "general/remote_workdir": ".", "general/report_file": "${HOME}/.reframe/reports/run-report.json", "general/report_junit": null, "general/resolve_module_conflicts": true, From d7b41337a7a1927517857ffd187c6d1e7b5654de Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 6 Jul 2021 15:57:45 +0200 Subject: [PATCH 27/30] Fix remote detection --- reframe/frontend/autodetect.py | 30 ++++++++++++++++++++---------- reframe/frontend/cli.py | 4 ++-- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index d059032795..b6efdf9542 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -6,6 +6,7 @@ import json import jsonschema import os +import shutil import tempfile import reframe as rfm @@ -38,15 +39,23 @@ def __init__(self, prefix): self._workdir = None def __enter__(self): - self._workdir = tempfile.mkdtemp(prefix='rfm.', dir=self._prefix) + self._workdir = os.path.abspath( + tempfile.mkdtemp(prefix='rfm.', dir=self._prefix) + ) paths = ['bin/', 'reframe/', 'bootstrap.sh', 'requirements.txt'] for p in paths: - osext.copytree(os.path.join(rfm.INSTALL_PREFIX, p), self._workdir) + src = os.path.join(rfm.INSTALL_PREFIX, p) + if os.path.isdir(src): + dst = os.path.join(self._workdir, p) + osext.copytree(src, dst, dirs_exist_ok=True) + else: + shutil.copy2(src, self._workdir) return self._workdir - def __exit__(self): - osext.rmtree(self._workdir) + def __exit__(self, exc_type, exc_val, exc_tb): + # osext.rmtree(self._workdir) + pass def _subschema(fragment): @@ -105,26 +114,27 @@ def _is_part_local(part): def _remote_detect(part): 
def _emit_script(job): launcher_cmd = job.launcher.run_command(job) - commands += [ - f'./bootstrap.sh' + commands = [ + f'./bootstrap.sh', f'{launcher_cmd} ./bin/reframe --detect-host-topology=topo.json' ] job.prepare(commands, trap_errors=True) getlogger().info( - f'Detecting topology of remote partition {part.fullname!r}' + f'Detecting topology of remote partition {part.fullname!r}: ' + f'this may take some time...' + ) topo_info = {} try: - dest = runtime().get_option('general/0/remote_workdir') - with _copy_reframe(dest) as dirname: + prefix = runtime().get_option('general/0/remote_workdir') + with _copy_reframe(prefix) as dirname: with osext.change_dir(dirname): job = Job.create(part.scheduler, part.launcher_type(), name='rfm-detect-job', sched_access=part.access) _emit_script(job) - getlogger().debug('submitting detection script') _log_contents(job.script_filename) job.submit() diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 7e927abb8b..bb2e112b95 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -494,10 +494,10 @@ def main(): help='Detect remote system topology' ) argparser.add_argument( - dest='remote_tmpdir', + dest='remote_workdir', envvar='RFM_REMOTE_WORKDIR', configvar='general/remote_workdir', - action='store_true', + action='store', help='Working directory for launching ReFrame remotely' ) From 267d9e0ae2fc1a52e1e2b09ccf0147f3d809075d Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 6 Jul 2021 16:04:31 +0200 Subject: [PATCH 28/30] Re-add the temp dir removal --- reframe/frontend/autodetect.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index b6efdf9542..df5c5c008d 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -54,8 +54,7 @@ def __enter__(self): return self._workdir def __exit__(self, exc_type, exc_val, exc_tb): - # osext.rmtree(self._workdir) - pass + 
osext.rmtree(self._workdir) def _subschema(fragment): From d942a1483756260d493cac7eb0ea9476a71bb373 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 8 Jul 2021 23:15:31 +0200 Subject: [PATCH 29/30] Update documentation --- docs/config_reference.rst | 16 +++++++++++++++- docs/configure.rst | 13 ++++++++----- docs/manpage.rst | 21 +++++++++++++++++++-- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 8ff473571e..9bdc4a3c28 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -341,9 +341,13 @@ System Partition Configuration :default: ``{}`` Processor information for this partition stored in a `processor info object <#processor-info>`__. + If not set, ReFrame will try to auto-detect this information (see :ref:`proc-autodetection` for more information). .. versionadded:: 3.5.0 + .. versionchanged:: 3.7.0 + ReFrame is now able to detect the processor information automatically. + .. js:attribute:: .systems[].partitions[].devices @@ -1201,7 +1205,7 @@ General Configuration The command-line option sets the configuration option to ``false``. -.. js:attribute:: .general[].detect_remote_system_topology +.. js:attribute:: .general[].remote_detect :required: No :default: ``false`` @@ -1213,6 +1217,16 @@ General Configuration .. versionadded:: 3.7.0 +.. js:attribute:: .general[].remote_workdir + + :required: No + :default: ``"."`` + + The temporary directory prefix that will be used to create a fresh ReFrame clone, in order to auto-detect the processor information of a remote partition. + + .. versionadded:: 3.7.0 + + .. js:attribute:: .general[].ignore_check_conflicts :required: No diff --git a/docs/configure.rst b/docs/configure.rst index 0ee0e52b40..58ccce68c1 100644 --- a/docs/configure.rst +++ b/docs/configure.rst @@ -401,7 +401,6 @@ Let's see some concrete examples: .. 
_proc-autodetection: ------------------------------------- Auto-detecting processor information ------------------------------------ @@ -428,9 +427,13 @@ The processor information auto-detection works as follows: #. If the corresponding metadata files are not found, the processor information will be auto-detected. If the system partition is local (i.e., ``local`` scheduler + ``local`` launcher), the processor information is auto-detected unconditionally and stored in the corresponding metadata file for this partition. - If the partition is remote, ReFrame will not try to auto-detect it unless the :envvar:`RFM_DETECT_REMOTE_SYSTEM_TOPOLOGY` or the |detect_remote_system_topology|_ configuration option is set. - - For detecting remote processor information, ReFrame will generate a job script based on the partition information and launch itself on the remote system with ``{launcher} reframe --detect-host-topology=topo.json``. - The :option:`--detect-host-topology` option causes ReFrame to detect the topology of the current host. + If the partition is remote, ReFrame will not try to auto-detect it unless the :envvar:`RFM_REMOTE_DETECT` or the |detect_remote_system_topology|_ configuration option is set. + In that case, the steps to auto-detect the remote processor information are the following: + + a. ReFrame creates a fresh clone of itself in a temporary directory created under ``.`` by default. + This temporary directory prefix can be changed by setting the :envvar:`RFM_REMOTE_WORKDIR` environment variable. + b. ReFrame changes to that directory and launches a job that will first bootstrap the fresh clone and then run that clone with ``{launcher} ./bin/reframe --detect-host-topology=topo.json``. + The :option:`--detect-host-topology` option causes ReFrame to detect the topology of the current host, + which in this case would be the remote compute nodes. In case of errors during auto-detection, ReFrame will simply issue a warning and continue. 
diff --git a/docs/manpage.rst b/docs/manpage.rst index 4f0bf9bce1..b04327b918 100644 --- a/docs/manpage.rst +++ b/docs/manpage.rst @@ -708,7 +708,7 @@ Here is an alphabetical list of the environment variables recognized by ReFrame: ================================== ================== -.. envvar:: RFM_DETECT_REMOTE_SYSTEM_TOPOLOGY +.. envvar:: RFM_REMOTE_DETECT Auto-detect processor information of remote partitions as well. @@ -717,9 +717,26 @@ Here is an alphabetical list of the environment variables recognized by ReFrame: ================================== ================== Associated command line option N/A - Associated configuration parameter :js:attr:`detect_remote_system_topology` general configuration parameter + Associated configuration parameter :js:attr:`remote_detect` general configuration parameter ================================== ================== +.. versionadded:: 3.7.0 + + +.. envvar:: RFM_REMOTE_WORKDIR + + The temporary directory prefix that will be used to create a fresh ReFrame clone, in order to auto-detect the processor information of a remote partition. + + .. table:: + :align: left + + ================================== ================== + Associated command line option N/A + Associated configuration parameter :js:attr:`remote_workdir` general configuration parameter + ================================== ================== + +.. versionadded:: 3.7.0 + .. 
envvar:: RFM_GRAYLOG_ADDRESS From 6e5a086a4853c1a031267253cfc5784393d3cbc9 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 9 Jul 2021 15:32:25 +0200 Subject: [PATCH 30/30] Lock meta-config processor info file for reading/writing --- reframe/frontend/autodetect.py | 7 +++++-- reframe/utility/osext.py | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index df5c5c008d..2019fbce02 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: BSD-3-Clause +import fcntl import json import jsonschema import os @@ -79,7 +80,8 @@ def _validate_info(info, schema): def _load_info(filename, schema=None): try: with open(filename) as fp: - return _validate_info(json.load(fp), schema) + with osext.lock_file(fp, fcntl.LOCK_SH): + return _validate_info(json.load(fp), schema) except OSError as e: getlogger().warning( f'could not load file: {filename!r}: {e}' @@ -98,7 +100,8 @@ def _save_info(filename, topo_info): os.makedirs(os.path.dirname(filename), exist_ok=True) try: with open(filename, 'w') as fp: - json.dump(topo_info, fp, indent=2) + with osext.lock_file(fp, fcntl.LOCK_EX): + json.dump(topo_info, fp, indent=2) except OSError as e: getlogger().warning( f'could not save topology file: {filename!r}: {e}' diff --git a/reframe/utility/osext.py b/reframe/utility/osext.py index 1ef9045bae..24ff73aa51 100644 --- a/reframe/utility/osext.py +++ b/reframe/utility/osext.py @@ -9,6 +9,7 @@ import collections.abc import errno +import fcntl import getpass import grp import os @@ -304,6 +305,28 @@ def rmtree(*args, max_retries=3, **kwargs): raise +# FIXME: Need a proper unit test for this +class lock_file: + def __init__(self, fp, mode): + '''Lock file pointed to by file pointer fp. + + This call is blocking. 
+ ''' + + self._mode = mode + if isinstance(fp, int): + # Treat fp as a file descriptor + self._fd = fp + else: + self._fd = fp.fileno() + + def __enter__(self): + fcntl.flock(self._fd, self._mode) + + def __exit__(self, exc_type, exc_val, exc_tb): + fcntl.flock(self._fd, fcntl.LOCK_UN) + + def inpath(entry, pathvar): '''Check if entry is in path.