diff --git a/docs/config_reference.rst b/docs/config_reference.rst index f31a62b124..9bdc4a3c28 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -341,9 +341,13 @@ System Partition Configuration :default: ``{}`` Processor information for this partition stored in a `processor info object <#processor-info>`__. + If not set, ReFrame will try to auto-detect this information (see :ref:`proc-autodetection` for more information). .. versionadded:: 3.5.0 + .. versionchanged:: 3.7.0 + ReFrame is now able to detect the processor information automatically. + .. js:attribute:: .systems[].partitions[].devices @@ -1201,6 +1205,28 @@ General Configuration The command-line option sets the configuration option to ``false``. +.. js:attribute:: .general[].remote_detect + + :required: No + :default: ``false`` + + Try to auto-detect processor information of remote partitions as well. + This may slow down the initialization of the framework, since it involves submitting auto-detection jobs to the remote partitions. + For more information on how ReFrame auto-detects processor information, you may refer to :ref:`proc-autodetection`. + + .. versionadded:: 3.7.0 + + +.. js:attribute:: .general[].remote_workdir + + :required: No + :default: ``"."`` + + The temporary directory prefix that will be used to create a fresh ReFrame clone, in order to auto-detect the processor information of a remote partition. + + .. versionadded:: 3.7.0 + + .. js:attribute:: .general[].ignore_check_conflicts :required: No diff --git a/docs/configure.rst b/docs/configure.rst index 1e58fbad12..58ccce68c1 100644 --- a/docs/configure.rst +++ b/docs/configure.rst @@ -397,3 +397,43 @@ Let's see some concrete examples: "CC" If you explicitly query a configuration value which is not defined in the configuration file, ReFrame will print its default value. + + +.. _proc-autodetection: + +Auto-detecting processor information +------------------------------------ + +.. versionadded:: 3.7.0 + +.. 
|devices| replace:: :attr:`devices` +.. _devices: config_reference.html#.systems[].partitions[].devices +.. |processor| replace:: :attr:`processor` +.. _processor: config_reference.html#.systems[].partitions[].processor +.. |detect_remote_system_topology| replace:: :attr:`remote_detect` +.. _detect_remote_system_topology: config_reference.html#.general[].remote_detect + +ReFrame is able to detect the processor topology of both local and remote partitions automatically. +The processor and device information are made available to the tests through the corresponding attributes of the :attr:`~reframe.core.pipeline.RegressionTest.current_partition` allowing a test to modify its behavior accordingly. +Currently, ReFrame supports auto-detection of the local or remote processor information only. +It does not support auto-detection of devices, in which case users should explicitly specify this information using the |devices|_ configuration option. +The processor information auto-detection works as follows: + +#. If the |processor|_ configuration option is defined, then no auto-detection is attempted. + +#. If the |processor|_ configuration option is not defined, ReFrame will look for a processor configuration metadata file in ``{configdir}/_meta/{system}-{part}/processor.json`` or in ``~/.reframe/topology/{system}-{part}/processor.json`` in case of the builtin configuration file. + If the file is found, the topology information is loaded from there. + These files are generated automatically by ReFrame from previous runs. + +#. If the corresponding metadata files are not found, the processor information will be auto-detected. + If the system partition is local (i.e., ``local`` scheduler + ``local`` launcher), the processor information is auto-detected unconditionally and stored in the corresponding metadata file for this partition. 
+ If the partition is remote, ReFrame will not try to auto-detect it unless the :envvar:`RFM_REMOTE_DETECT` or the |detect_remote_system_topology|_ configuration option is set. + In that case, the steps to auto-detect the remote processor information are the following: + + a. ReFrame creates a fresh clone of itself in a temporary directory created under ``.`` by default. + This temporary directory prefix can be changed by setting the :envvar:`RFM_REMOTE_WORKDIR` environment variable. + b. ReFrame changes to that directory and launches a job that will first bootstrap the fresh clone and then run that clone with ``{launcher} ./bin/reframe --detect-host-topology=topo.json``. + The :option:`--detect-host-topology` option causes ReFrame to detect the topology of the current host, + which in this case would be the remote compute nodes. + + In case of errors during auto-detection, ReFrame will simply issue a warning and continue. diff --git a/docs/manpage.rst b/docs/manpage.rst index a6c6cb0f38..b04327b918 100644 --- a/docs/manpage.rst +++ b/docs/manpage.rst @@ -576,6 +576,16 @@ Miscellaneous options This option can also be set using the :envvar:`RFM_SYSTEM` environment variable. +.. _--detect-host-topology: + +.. option:: --detect-host-topology[=FILE] + + Detect the local host processor topology, store it to ``FILE`` and exit. + If no ``FILE`` is specified, the standard output will be used. + + .. versionadded:: 3.7.0 + + .. option:: --failure-stats Print failure statistics at the end of the run. @@ -698,6 +708,36 @@ Here is an alphabetical list of the environment variables recognized by ReFrame: ================================== ================== +.. envvar:: RFM_REMOTE_DETECT + + Auto-detect processor information of remote partitions as well. + + .. 
table:: + :align: left + + ================================== ================== + Associated command line option N/A + Associated configuration parameter :js:attr:`remote_detect` general configuration parameter + ================================== ================== + +.. versionadded:: 3.7.0 + + +.. envvar:: RFM_REMOTE_WORKDIR + + The temporary directory prefix that will be used to create a fresh ReFrame clone, in order to auto-detect the processor information of a remote partition. + + .. table:: + :align: left + + ================================== ================== + Associated command line option N/A + Associated configuration parameter :js:attr:`remote_workdir` general configuration parameter + ================================== ================== + +.. versionadded:: 3.7.0 + + .. envvar:: RFM_GRAYLOG_ADDRESS The address of the Graylog server to send performance logs. @@ -920,7 +960,7 @@ Here is an alphabetical list of the environment variables recognized by ReFrame: :align: left ================================== ================== - Associated command line option n/a + Associated command line option N/A Associated configuration parameter :js:attr:`resolve_module_conflicts` general configuration parameter ================================== ================== diff --git a/docs/requirements.txt b/docs/requirements.txt index 857f3426f1..c641cbf389 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ +archspec==0.1.2 docutils==0.16 # https://github.com/sphinx-doc/sphinx/issues/9001 jsonschema==3.2.0 semver==2.13.0 diff --git a/reframe/core/config.py b/reframe/core/config.py index 4e15eefe64..8442e9859c 100644 --- a/reframe/core/config.py +++ b/reframe/core/config.py @@ -101,6 +101,11 @@ def __getitem__(self, key): def __getattr__(self, attr): return getattr(self._pick_config(), attr) + @property + def schema(self): + '''Configuration schema''' + return self._schema + def add_sticky_option(self, option, value): 
self._sticky_options[option] = value diff --git a/reframe/core/systems.py b/reframe/core/systems.py index 74d6521784..cf4fa4501d 100644 --- a/reframe/core/systems.py +++ b/reframe/core/systems.py @@ -5,12 +5,12 @@ import json -import reframe.utility as utility +import reframe.utility as util import reframe.utility.jsonext as jsonext from reframe.core.backends import (getlauncher, getscheduler) +from reframe.core.environments import (Environment, ProgEnvironment) from reframe.core.logging import getlogger from reframe.core.modules import ModulesSystem -from reframe.core.environments import (Environment, ProgEnvironment) class ProcessorType(jsonext.JSONSerializable): @@ -232,7 +232,7 @@ def access(self): :type: :class:`List[str]` ''' - return utility.SequenceView(self._access) + return util.SequenceView(self._access) @property def descr(self): @@ -249,7 +249,7 @@ def environs(self): :type: :class:`List[ProgEnvironment]` ''' - return utility.SequenceView(self._environs) + return util.SequenceView(self._environs) @property def container_environs(self): @@ -258,7 +258,7 @@ def container_environs(self): :type: :class:`Dict[str, Environment]` ''' - return utility.MappingView(self._container_environs) + return util.MappingView(self._container_environs) @property def fullname(self): @@ -315,7 +315,7 @@ def resources(self): ''' - return utility.MappingView(self._resources) + return util.MappingView(self._resources) @property def scheduler(self): @@ -661,7 +661,7 @@ def partitions(self): :type: :class:`List[SystemPartition]` ''' - return utility.SequenceView(self._partitions) + return util.SequenceView(self._partitions) def __eq__(self, other): if not isinstance(other, type(self)): diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py new file mode 100644 index 0000000000..2019fbce02 --- /dev/null +++ b/reframe/frontend/autodetect.py @@ -0,0 +1,226 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project 
Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import fcntl +import json +import jsonschema +import os +import shutil +import tempfile + +import reframe as rfm +import reframe.utility.osext as osext +from reframe.core.exceptions import ConfigError +from reframe.core.logging import getlogger +from reframe.core.runtime import runtime +from reframe.core.schedulers import Job +from reframe.utility.cpuinfo import cpuinfo + + +def _contents(filename): + '''Return the contents of a file.''' + + with open(filename) as fp: + return fp.read() + + +def _log_contents(filename): + filename = os.path.abspath(filename) + getlogger().debug(f'--- {filename} ---\n' + f'{_contents(filename)}\n' + f'--- {filename} ---') + + +class _copy_reframe: + def __init__(self, prefix): + self._prefix = prefix + self._prefix = runtime().get_option('general/0/remote_workdir') + self._workdir = None + + def __enter__(self): + self._workdir = os.path.abspath( + tempfile.mkdtemp(prefix='rfm.', dir=self._prefix) + ) + paths = ['bin/', 'reframe/', 'bootstrap.sh', 'requirements.txt'] + for p in paths: + src = os.path.join(rfm.INSTALL_PREFIX, p) + if os.path.isdir(src): + dst = os.path.join(self._workdir, p) + osext.copytree(src, dst, dirs_exist_ok=True) + else: + shutil.copy2(src, self._workdir) + + return self._workdir + + def __exit__(self, exc_type, exc_val, exc_tb): + osext.rmtree(self._workdir) + + +def _subschema(fragment): + '''Create a configuration subschema.''' + + full_schema = runtime().site_config.schema + return { + '$schema': full_schema['$schema'], + 'defs': full_schema['defs'], + '$ref': fragment + } + + +def _validate_info(info, schema): + if schema is None: + return info + + jsonschema.validate(info, schema) + return info + + +def _load_info(filename, schema=None): + try: + with open(filename) as fp: + with osext.lock_file(fp, fcntl.LOCK_SH): + return _validate_info(json.load(fp), schema) + except OSError as e: + 
getlogger().warning( + f'could not load file: {filename!r}: {e}' + ) + return {} + except jsonschema.ValidationError as e: + raise ConfigError( + f'could not validate meta-config file {filename!r}' + ) from e + + +def _save_info(filename, topo_info): + if not topo_info: + return + + os.makedirs(os.path.dirname(filename), exist_ok=True) + try: + with open(filename, 'w') as fp: + with osext.lock_file(fp, fcntl.LOCK_EX): + json.dump(topo_info, fp, indent=2) + except OSError as e: + getlogger().warning( + f'could not save topology file: {filename!r}: {e}' + ) + + +def _is_part_local(part): + return (part.scheduler.registered_name == 'local' and + part.launcher_type.registered_name == 'local') + + +def _remote_detect(part): + def _emit_script(job): + launcher_cmd = job.launcher.run_command(job) + commands = [ + f'./bootstrap.sh', + f'{launcher_cmd} ./bin/reframe --detect-host-topology=topo.json' + ] + job.prepare(commands, trap_errors=True) + + getlogger().info( + f'Detecting topology of remote partition {part.fullname!r}: ' + f'this may take some time...' 
+ + ) + topo_info = {} + try: + prefix = runtime().get_option('general/0/remote_workdir') + with _copy_reframe(prefix) as dirname: + with osext.change_dir(dirname): + job = Job.create(part.scheduler, + part.launcher_type(), + name='rfm-detect-job', + sched_access=part.access) + _emit_script(job) + getlogger().debug('submitting detection script') + _log_contents(job.script_filename) + job.submit() + job.wait() + getlogger().debug('job finished') + _log_contents(job.stdout) + _log_contents(job.stderr) + topo_info = json.loads(_contents('topo.json')) + except Exception as e: + getlogger().warning(f'failed to retrieve remote processor info: {e}') + + return topo_info + + +def detect_topology(): + rt = runtime() + detect_remote_systems = rt.get_option('general/0/remote_detect') + config_file = rt.site_config.filename + if config_file == '': + config_prefix = os.path.join( + os.getenv('HOME'), '.reframe/topology' + ) + else: + config_prefix = os.path.join(os.path.dirname(config_file), '_meta') + + for part in rt.system.partitions: + getlogger().debug(f'detecting topology info for {part.fullname}') + found_procinfo = False + found_devinfo = False + if part.processor.info != {}: + # Processor info set up already in the configuration + getlogger().debug( + f'> topology found in configuration file; skipping...' + ) + found_procinfo = True + + if part.devices: + # Devices set up already in the configuration + getlogger().debug( + f'> devices found in configuration file; skipping...' + ) + found_devinfo = True + + if found_procinfo and found_devinfo: + continue + + topo_file = os.path.join( + config_prefix, f'{rt.system.name}-{part.name}', 'processor.json' + ) + dev_file = os.path.join( + config_prefix, f'{rt.system.name}-{part.name}', 'devices.json' + ) + if not found_procinfo and os.path.exists(topo_file): + getlogger().debug( + f'> found topology file {topo_file!r}; loading...' 
+ ) + part.processor._info = _load_info( + topo_file, _subschema('#/defs/processor_info') + ) + found_procinfo = True + + if not found_devinfo and os.path.exists(dev_file): + getlogger().debug( + f'> found devices file {dev_file!r}; loading...' + ) + part._devices = _load_info(dev_file, _subschema('#/defs/devices')) + found_devinfo = True + + if found_procinfo and found_devinfo: + continue + + if not found_procinfo: + # No topology found, try to auto-detect it + getlogger().debug(f'> no topology file found; auto-detecting...') + if _is_part_local(part): + # Unconditionally detect the system for fully local partitions + part.processor._info = cpuinfo() + _save_info(topo_file, part.processor.info) + elif detect_remote_systems: + part.processor._info = _remote_detect(part) + if part.processor.info: + _save_info(topo_file, part.processor.info) + + getlogger().debug(f'> saved topology in {topo_file!r}') + + if not found_devinfo: + getlogger().debug(f'> device auto-detection is not supported') diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 07df20a1df..bb2e112b95 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -21,6 +21,7 @@ import reframe.core.runtime as runtime import reframe.core.warnings as warnings import reframe.frontend.argparse as argparse +import reframe.frontend.autodetect as autodetect import reframe.frontend.ci as ci import reframe.frontend.dependencies as dependencies import reframe.frontend.filters as filters @@ -428,6 +429,10 @@ def main(): '--system', action='store', help='Load configuration for SYSTEM', envvar='RFM_SYSTEM' ) + misc_options.add_argument( + '--detect-host-topology', action='store', nargs='?', const='-', + help='Detect the local host topology and exit' + ) misc_options.add_argument( '--upgrade-config-file', action='store', metavar='OLD[:NEW]', help='Upgrade ReFrame 2.x configuration file to ReFrame 3.x syntax' @@ -481,6 +486,20 @@ def main(): configvar='logging/handlers_perflog/httpjson_url', 
help='URL of HTTP server accepting JSON logs' ) + argparser.add_argument( + dest='remote_detect', + envvar='RFM_REMOTE_DETECT', + configvar='general/remote_detect', + action='store_true', + help='Detect remote system topology' + ) + argparser.add_argument( + dest='remote_workdir', + envvar='RFM_REMOTE_WORKDIR', + configvar='general/remote_workdir', + action='store', + help='Working directory for launching ReFrame remotely' + ) # Parse command line options = argparser.parse_args() @@ -582,6 +601,7 @@ def main(): sys.exit(1) rt = runtime.runtime() + autodetect.detect_topology() try: if site_config.get('general/0/module_map_file'): rt.modules_system.load_mapping_from_file( @@ -620,6 +640,26 @@ def main(): sys.exit(0) + if options.detect_host_topology: + from reframe.utility.cpuinfo import cpuinfo + + topofile = options.detect_host_topology + if topofile == '-': + json.dump(cpuinfo(), sys.stdout, indent=2) + sys.stdout.write('\n') + else: + try: + with open(topofile, 'w') as fp: + json.dump(cpuinfo(), fp, indent=2) + fp.write('\n') + except OSError as e: + getlogger().error( + f'could not write topology file: {topofile!r}' + ) + sys.exit(1) + + sys.exit(0) + printer.debug(format_env(options.env_vars)) # Setup the check loader diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 45fdc149f4..294fba0a32 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -207,6 +207,10 @@ "arch": {"type": "string"}, "num_devices": {"type": "number"} } + }, + "devices": { + "type": "array", + "items": {"$ref": "#/defs/device_info"} } }, "type": "object", @@ -294,10 +298,7 @@ "items": {"type": "string"} }, "processor": {"$ref": "#/defs/processor_info"}, - "devices": { - "type": "array", - "items": {"$ref": "#/defs/device_info"} - }, + "devices": {"$ref": "#/defs/devices"}, "extras": {"type": "object"}, "resources": { "type": "array", @@ -460,6 +461,8 @@ }, "non_default_craype": {"type": "boolean"}, "purge_environment": {"type": "boolean"}, + 
"remote_detect": {"type": "boolean"}, + "remote_workdir": {"type": "string"}, "report_file": {"type": "string"}, "report_junit": {"type": ["string", "null"]}, "resolve_module_conflicts": {"type": "boolean"}, @@ -500,6 +503,8 @@ "general/module_mappings": [], "general/non_default_craype": false, "general/purge_environment": false, + "general/remote_detect": false, + "general/remote_workdir": ".", "general/report_file": "${HOME}/.reframe/reports/run-report.json", "general/report_junit": null, "general/resolve_module_conflicts": true, diff --git a/reframe/utility/cpuinfo.py b/reframe/utility/cpuinfo.py new file mode 100644 index 0000000000..a770a71c22 --- /dev/null +++ b/reframe/utility/cpuinfo.py @@ -0,0 +1,295 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: BSD-3-Clause + +import archspec.cpu +import contextlib +import glob +import os +import re + +import reframe.utility.osext as osext +from reframe.core.exceptions import SpawnedProcessError + + +def _bits_from_str(mask_s): + '''Return the set bits from a string representing a bit array.''' + + bits = [] + mask = int(mask_s, 0) + pos = 0 + while mask: + if mask & 1: + bits.append(pos) + + pos += 1 + mask >>= 1 + + return bits + + +def _str_from_bits(bits): + '''Return a string representation of a bit array with ``bits`` set.''' + + ret = 0 + for b in bits: + ret |= (1 << b) + + return hex(ret).lower() + + +def _sysfs_topo(): + cache_units = { + 'K': 1024, + 'M': 1024*1024, + 'G': 1024*1024*1024 + } + cpuinfo = { + 'topology': {} + } + cpu_dirs = glob.glob(r'/sys/devices/system/cpu/cpu[0-9]*') + nodes = glob.glob(r'/sys/devices/system/node/node[0-9]*') + cores = set() + for cpu in cpu_dirs: + core_cpus_path = os.path.join(cpu, 'topology/core_cpus') + thread_siblings_path = os.path.join(cpu, 'topology/thread_siblings') + if glob.glob(core_cpus_path): + cores_path = core_cpus_path + 
elif glob.glob(thread_siblings_path): + cores_path = thread_siblings_path + else: + # Information cannot be retrieved + continue + + with contextlib.suppress(IOError): + with open(cores_path) as fp: + core_cpus = fp.read() + core_cpus = re.sub(r'[\s,]', '', core_cpus) + core_cpus = f'0x{core_cpus.lower()}' + cores.add(core_cpus) + + sockets = set() + for cpu in cpu_dirs: + package_cpus_path = os.path.join(cpu, 'topology/package_cpus') + core_siblings_path = os.path.join(cpu, 'topology/core_siblings') + if glob.glob(package_cpus_path): + sockets_path = package_cpus_path + elif glob.glob(core_siblings_path): + sockets_path = core_siblings_path + else: + # Information cannot be retrieved + continue + + with contextlib.suppress(IOError): + with open(sockets_path) as fp: + package_cpus = fp.read() + package_cpus = re.sub(r'[\s,]', '', package_cpus) + package_cpus = f'0x{package_cpus.lower()}' + sockets.add(package_cpus) + + numa_nodes = [] + for node in nodes: + with contextlib.suppress(IOError): + with open(os.path.join(node, 'cpumap')) as fp: + cpumap = fp.read() + cpumap = re.sub(r'[\s,]', '', cpumap) + cpumap = f'0x{cpumap.lower()}' + numa_nodes.append(cpumap) + + numa_nodes.sort() + caches = {} + for cpu in cpu_dirs: + cache_dirs = glob.glob(cpu + r'/cache/index[0-9]*') + for cache in cache_dirs: + cache_level = 0 + cache_size = 0 + cache_linesize = 0 + cache_associativity = 0 + cache_cpuset = '' + with contextlib.suppress(IOError): + with open(os.path.join(cache, 'level')) as fp: + cache_level = int(fp.read()) + + with contextlib.suppress(IOError): + # Skip L1 instruction cache + with open(os.path.join(cache, 'type')) as fp: + if cache_level == 1 and fp.read() == 'Instruction\n': + continue + + with contextlib.suppress(IOError): + with open(os.path.join(cache, 'size')) as fp: + cache_size = fp.read() + m = re.match(r'(?P<val>\d+)(?P<unit>\S)', cache_size) + if m: + value = int(m.group('val')) + unit = cache_units.get(m.group('unit'), 1) + cache_size = value*unit + + with 
contextlib.suppress(IOError): + with open(os.path.join(cache, 'coherency_line_size')) as fp: + cache_linesize = int(fp.read()) + + # Don't take the associativity directly from + # "ways_of_associativity" file because some archs (ia64, ppc) + # put 0 there when fully-associative, while others (x86) + # put something like -1. + with contextlib.suppress(IOError): + with open(os.path.join(cache, 'number_of_sets')) as fp: + cache_number_of_sets = int(fp.read()) + + with open(os.path.join(cache, + 'physical_line_partition')) as fp: + cache_physical_line_partition = int(fp.read()) + + if (cache_linesize and + cache_physical_line_partition and + cache_number_of_sets): + cache_associativity = (cache_size // + cache_linesize // + cache_physical_line_partition // + cache_number_of_sets) + + with contextlib.suppress(IOError): + with open(os.path.join(cache, 'shared_cpu_map')) as fp: + cache_cpuset = fp.read() + cache_cpuset = re.sub(r'[\s,]', '', cache_cpuset) + cache_cpuset = f'0x{cache_cpuset.lower()}' + + num_cpus = len(_bits_from_str(cache_cpuset)) + caches.setdefault((cache_level, cache_size, cache_linesize, + cache_associativity, num_cpus), set()) + caches[(cache_level, cache_size, cache_linesize, + cache_associativity, num_cpus)].add(cache_cpuset) + + num_cpus = len(cpu_dirs) + num_cores = len(cores) + num_sockets = len(sockets) + num_cpus_per_core = num_cpus // num_cores if num_cores else 0 + num_cpus_per_socket = num_cpus // num_sockets if num_sockets else 0 + + # Fill in the cpuinfo + cpuinfo['num_cpus'] = num_cpus + cpuinfo['num_cpus_per_core'] = num_cpus_per_core + cpuinfo['num_cpus_per_socket'] = num_cpus_per_socket + cpuinfo['num_sockets'] = num_sockets + cpuinfo['topology']['numa_nodes'] = numa_nodes + cpuinfo['topology']['sockets'] = sorted(list(sockets)) + cpuinfo['topology']['cores'] = sorted(list(cores)) + cpuinfo['topology']['caches'] = [] + for cache_type, cpusets in caches.items(): + (cache_level, cache_size, + cache_linesize, cache_associativity, 
num_cpus) = cache_type + c = { + 'type': f'L{cache_level}', + 'size': cache_size, + 'linesize': cache_linesize, + 'associativity': cache_associativity, + 'num_cpus': num_cpus, + 'cpusets': sorted(list(cpusets)) + } + cpuinfo['topology']['caches'].append(c) + + return cpuinfo + + +def _sysctl_topo(): + try: + exec_output = osext.run_command('sysctl hw machdep.cpu.cache', + check=True) + except (FileNotFoundError, SpawnedProcessError): + return {} + + cpuinfo = { + 'topology': {} + } + match = re.search(r'hw\.ncpu: (?P<num_cpus>\d+)', exec_output.stdout) + if match: + num_cpus = int(match.group('num_cpus')) + + match = re.search(r'hw\.physicalcpu: (?P<num_cores>\d+)', + exec_output.stdout) + if match: + num_cores = int(match.group('num_cores')) + + match = re.search(r'hw\.packages: (?P<num_sockets>\d+)', + exec_output.stdout) + if match: + num_sockets = int(match.group('num_sockets')) + cpuinfo['num_sockets'] = num_sockets + + match = re.search(r'hw\.cacheconfig:(?P<cacheconfig>(\s\d+)*)', + exec_output.stdout) + if match: + cacheconfig = list(map(int, match.group('cacheconfig').split())) + + match = re.search(r'hw\.cachesize:(?P<cachesize>(\s\d+)*)', + exec_output.stdout) + if match: + cachesize = list(map(int, match.group('cachesize').split())) + + match = re.search(r'hw\.cachelinesize: (?P<linesize>\d+)', + exec_output.stdout) + if match: + linesize = int(match.group('linesize')) + + # index 0 is referring to memory + cache_associativity = [0] + for i in range(1, len(cachesize)): + if cachesize[i] == 0: + break + + match = re.search(rf'machdep\.cpu\.cache\.L{i}_associativity: ' + rf'(?P<associativity>\d+)', + exec_output.stdout) + assoc = int(match.group('associativity')) if match else 0 + cache_associativity.append(assoc) + + num_cpus_per_socket = num_cpus // num_sockets + num_cpus_per_core = num_cpus // num_cores + + # Fill in the cpuinfo + cpuinfo['num_cpus'] = num_cpus + cpuinfo['num_cpus_per_socket'] = num_cpus_per_socket + cpuinfo['num_cpus_per_core'] = num_cpus_per_core + cpuinfo['topology']['numa_nodes'] = 
[_str_from_bits(range(num_cpus))] + cpuinfo['topology']['sockets'] = [ + _str_from_bits(range(start, start+num_cpus_per_socket)) + for start in range(0, num_cpus, num_cpus_per_socket) + ] + cpuinfo['topology']['cores'] = [ + _str_from_bits(range(start, start+num_cpus_per_core)) + for start in range(0, num_cpus, num_cpus_per_core) + ] + cpuinfo['topology']['caches'] = [] + for i in range(1, len(cache_associativity)): + t = { + 'type': f'L{i}', + 'size': cachesize[i], + 'linesize': linesize, + 'associativity': cache_associativity[i], + 'num_cpus': cacheconfig[i], + 'cpusets': [ + _str_from_bits(range(start, start+cacheconfig[i])) + for start in range(0, num_cpus, cacheconfig[i]) + ] + } + cpuinfo['topology']['caches'].append(t) + + return cpuinfo + + +def cpuinfo(): + ret = { + 'arch': archspec.cpu.host().name + } + + # Try first to get information from the filesystem + if os.path.isdir('/sys'): + topology = _sysfs_topo() + else: + # Try with the `sysctl` command + topology = _sysctl_topo() + + ret.update(topology) + return ret diff --git a/reframe/utility/osext.py b/reframe/utility/osext.py index 1ef9045bae..24ff73aa51 100644 --- a/reframe/utility/osext.py +++ b/reframe/utility/osext.py @@ -9,6 +9,7 @@ import collections.abc import errno +import fcntl import getpass import grp import os @@ -304,6 +305,28 @@ def rmtree(*args, max_retries=3, **kwargs): raise +# FIXME: Need a proper unit test for this +class lock_file: + def __init__(self, fp, mode): + '''Lock file pointed to by file pointer fp. + + This call is blocking. + ''' + + self._mode = mode + if isinstance(fp, int): + # Treat fp as a file descriptor + self._fd = fp + else: + self._fd = fp.fileno() + + def __enter__(self): + fcntl.flock(self._fd, self._mode) + + def __exit__(self, exc_type, exc_val, exc_tb): + fcntl.flock(self._fd, fcntl.LOCK_UN) + + def inpath(entry, pathvar): '''Check if entry is in path. 
diff --git a/requirements.txt b/requirements.txt index 5a03c53e0a..d8c11c4b59 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +archspec==0.1.2 argcomplete==1.12.3 coverage==5.5 importlib_metadata==4.0.1; python_version < '3.8' diff --git a/setup.py b/setup.py index b698fc23a3..a5307c4e35 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ ), package_data={'reframe': ['schemas/*']}, include_package_data=True, - install_requires=['argcomplete', 'jsonschema', 'lxml', 'PyYAML', 'semver'], + install_requires=['archspec', 'argcomplete', 'jsonschema', + 'lxml', 'PyYAML', 'semver'], python_requires='>=3.6', scripts=['bin/reframe'], classifiers=( diff --git a/unittests/test_autodetect.py b/unittests/test_autodetect.py new file mode 100644 index 0000000000..e4f1141266 --- /dev/null +++ b/unittests/test_autodetect.py @@ -0,0 +1,42 @@ +# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause + +import json +import os +import pytest +import shutil + + +from reframe.core.runtime import runtime +from reframe.frontend.autodetect import detect_topology +from reframe.utility.cpuinfo import cpuinfo + + +@pytest.fixture +def exec_ctx(make_exec_ctx_g, tmp_path): + # Copy the default settings to the temp dir + config_file = tmp_path / 'conf.py' + shutil.copy('reframe/core/settings.py', config_file) + + # Create a devices file manually, since it is not auto-generated + meta_prefix = tmp_path / '_meta' / 'generic-default' + os.makedirs(meta_prefix) + with open(meta_prefix / 'devices.json', 'w') as fp: + json.dump([ + { + 'type': 'gpu', + 'arch': 'a100', + 'num_devices': 8 + } + ], fp) + + yield from make_exec_ctx_g(config_file) + + +def test_autotect(exec_ctx): + detect_topology() + part = runtime().system.partitions[0] + assert part.processor.info == cpuinfo() + assert part.devices == [{'type': 'gpu', 'arch': 'a100', 'num_devices': 8}] diff --git a/unittests/test_cli.py b/unittests/test_cli.py index b11605dead..4d51f888b4 100644 --- a/unittests/test_cli.py +++ b/unittests/test_cli.py @@ -6,9 +6,11 @@ import contextlib import io import itertools +import json import os import pytest import re +import shutil import sys import reframe.core.environments as env @@ -52,7 +54,13 @@ def perflogdir(tmp_path): @pytest.fixture -def run_reframe(tmp_path, perflogdir): +def rm_config_meta(): + yield + shutil.rmtree('unittests/resources/_meta', ignore_errors=True) + + +@pytest.fixture +def run_reframe(tmp_path, perflogdir, rm_config_meta): def _run_reframe(system='generic:default', checkpath=['unittests/resources/checks/hellocheck.py'], environs=['builtin'], @@ -765,3 +773,29 @@ def test_maxfail_negative(run_reframe): assert 'Traceback' not in stderr assert "--maxfail should be a non-negative integer: '-2'" in stdout assert returncode == 1 + + +def test_detect_host_topology(run_reframe): + from reframe.utility.cpuinfo import cpuinfo + 
+ returncode, stdout, stderr = run_reframe( + more_options=['--detect-host-topology'] + ) + assert 'Traceback' not in stdout + assert 'Traceback' not in stderr + assert returncode == 0 + assert stdout == json.dumps(cpuinfo(), indent=2) + '\n' + + +def test_detect_host_topology_file(run_reframe, tmp_path): + from reframe.utility.cpuinfo import cpuinfo + + topo_file = tmp_path / 'topo.json' + returncode, stdout, stderr = run_reframe( + more_options=[f'--detect-host-topology={topo_file}'] + ) + assert 'Traceback' not in stdout + assert 'Traceback' not in stderr + assert returncode == 0 + with open(topo_file) as fp: + assert json.load(fp) == cpuinfo()