From d92521b678758bedd7ded8ab93fb10e0b59a045e Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Sun, 27 Nov 2022 21:22:15 +0100 Subject: [PATCH 1/4] Move scheduler options into the partition definition --- reframe/core/schedulers/__init__.py | 35 +++++++++++++++++++++- reframe/core/schedulers/flux.py | 4 +-- reframe/core/schedulers/lsf.py | 4 +-- reframe/core/schedulers/oar.py | 4 +-- reframe/core/schedulers/pbs.py | 4 +-- reframe/core/schedulers/sge.py | 4 +-- reframe/core/schedulers/slurm.py | 16 +++------- reframe/core/systems.py | 2 +- reframe/frontend/cli.py | 2 +- reframe/schemas/config.json | 46 +++++++++++------------------ unittests/test_schedulers.py | 2 +- 11 files changed, 63 insertions(+), 60 deletions(-) diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index 7423a68736..bdc9470301 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -25,12 +25,45 @@ class JobMeta(RegressionTestMeta, abc.ABCMeta): '''Job metaclass.''' -class JobScheduler(abc.ABC): +class JobSchedulerMeta(abc.ABCMeta): + '''Metaclass for JobSchedulers. + + The purpose of this metaclass is to intercept the constructor call and + consume the `part_name` argument for setting up the configuration prefix + without requiring the users to call `super().__init__()` in their + constructors. This allows the base class to have the look and feel of a + pure interface. + + :meta private: + + ''' + def __call__(cls, *args, **kwargs): + part_name = kwargs.pop('part_name', None) + obj = cls.__new__(cls, *args, **kwargs) + if part_name: + obj._config_prefix = ( + f'systems/0/partitions/@{part_name}/sched_options' + ) + else: + obj._config_prefix = 'systems/0/sched_options' + + obj.__init__(*args, **kwargs) + return obj + + +class JobScheduler(abc.ABC, metaclass=JobSchedulerMeta): '''Abstract base class for job scheduler backends. :meta private: ''' + def get_option(self, name): + '''Get scheduler-specific option. 
+ + :meta private: + ''' + return runtime.runtime().get_option(f'{self._config_prefix}/{name}') + @abc.abstractmethod def make_job(self, *args, **kwargs): '''Create a new job to be managed by this scheduler. diff --git a/reframe/core/schedulers/flux.py b/reframe/core/schedulers/flux.py index 91d987893b..99b1362fd7 100644 --- a/reframe/core/schedulers/flux.py +++ b/reframe/core/schedulers/flux.py @@ -65,9 +65,7 @@ def completed(self): class FluxJobScheduler(JobScheduler): def __init__(self): self._fexecutor = flux.job.FluxExecutor() - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def emit_preamble(self, job): # We don't need to submit with a file, so we don't need a preamble. diff --git a/reframe/core/schedulers/lsf.py b/reframe/core/schedulers/lsf.py index 10d42ff93b..bbe178cc91 100644 --- a/reframe/core/schedulers/lsf.py +++ b/reframe/core/schedulers/lsf.py @@ -27,9 +27,7 @@ class LsfJobScheduler(PbsJobScheduler): def __init__(self): self._prefix = '#BSUB' - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def _format_option(self, var, option): if var is not None: diff --git a/reframe/core/schedulers/oar.py b/reframe/core/schedulers/oar.py index 66e907041b..c10c117785 100644 --- a/reframe/core/schedulers/oar.py +++ b/reframe/core/schedulers/oar.py @@ -60,9 +60,7 @@ def oar_state_pending(state): class OarJobScheduler(PbsJobScheduler): def __init__(self): self._prefix = '#OAR' - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def emit_preamble(self, job): # host is de-facto nodes and core is number of cores requested per node diff --git a/reframe/core/schedulers/pbs.py 
b/reframe/core/schedulers/pbs.py index 9de089468e..5a677b7839 100644 --- a/reframe/core/schedulers/pbs.py +++ b/reframe/core/schedulers/pbs.py @@ -76,9 +76,7 @@ class PbsJobScheduler(sched.JobScheduler): def __init__(self): self._prefix = '#PBS' - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def _emit_lselect_option(self, job): num_tasks_per_node = job.num_tasks_per_node or 1 diff --git a/reframe/core/schedulers/sge.py b/reframe/core/schedulers/sge.py index f1b9c13e6a..f1175293a9 100644 --- a/reframe/core/schedulers/sge.py +++ b/reframe/core/schedulers/sge.py @@ -28,9 +28,7 @@ class SgeJobScheduler(PbsJobScheduler): def __init__(self): self._prefix = '#$' - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def emit_preamble(self, job): preamble = [ diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 4ecf2299b2..fbc9d670de 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -132,22 +132,14 @@ def __init__(self): 'QOSJobLimit', 'QOSResourceLimit', 'QOSUsageThreshold'] - ignore_reqnodenotavail = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/ignore_reqnodenotavail' - ) + ignore_reqnodenotavail = self.get_option('ignore_reqnodenotavail') if not ignore_reqnodenotavail: self._cancel_reasons.append('ReqNodeNotAvail') self._update_state_count = 0 - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) - self._use_nodes_opt = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/use_nodes_option' - ) - self._resubmit_on_errors = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/resubmit_on_errors' - ) + self._submit_timeout = 
self.get_option('job_submit_timeout') + self._use_nodes_opt = self.get_option('use_nodes_option') + self._resubmit_on_errors = self.get_option('resubmit_on_errors') def make_job(self, *args, **kwargs): return _SlurmJob(*args, **kwargs) diff --git a/reframe/core/systems.py b/reframe/core/systems.py index c07e04bca8..a45783e48e 100644 --- a/reframe/core/systems.py +++ b/reframe/core/systems.py @@ -314,7 +314,7 @@ def scheduler(self): ''' if self._scheduler is None: - self._scheduler = self._sched_type() + self._scheduler = self._sched_type(part_name=self.name) return self._scheduler diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 0aa6a63705..f20da637b9 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -593,7 +593,7 @@ def main(): argparser.add_argument( dest='ignore_reqnodenotavail', envvar='RFM_IGNORE_REQNODENOTAVAIL', - configvar='schedulers/ignore_reqnodenotavail', + configvar='systems*/sched_options/ignore_reqnodenotavail', action='store_true', help='Ignore ReqNodeNotAvail Slurm error' ) diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 08ef062f66..e1d44fed14 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -101,6 +101,18 @@ } ] }, + "sched_options": { + "type": "object", + "properties": { + "ignore_reqnodenotavail": {"type": "boolean"}, + "job_submit_timeout": {"type": "number"}, + "resubmit_on_errors": { + "type": "array", + "items": {"type": "string"} + }, + "use_nodes_option": {"type": "boolean"} + } + }, "stream_handler": { "allOf": [ {"$ref": "#/defs/handler_common"}, @@ -381,29 +393,6 @@ "additionalProperties": false } }, - "schedulers": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string", - "enum": ["flux", "local", "lsf", "oar", "pbs", - "sge", "slurm", "squeue", "torque"] - }, - "ignore_reqnodenotavail": {"type": "boolean"}, - "resubmit_on_errors": { - "type": "array", - "items": {"type": "string"} - }, - 
"job_submit_timeout": {"type": "number"}, - "target_systems": {"$ref": "#/defs/system_ref"}, - "use_nodes_option": {"type": "boolean"} - }, - "required": ["name"], - "additionalProperties": false - } - }, "logging": { "type": "array", "items": { @@ -578,11 +567,6 @@ "logging/handlers*/syslog_facility": "user", "modes/options": [], "modes/target_systems": ["*"], - "schedulers/ignore_reqnodenotavail": false, - "schedulers/resubmit_on_errors": [], - "schedulers/job_submit_timeout": 60, - "schedulers/target_systems": ["*"], - "schedulers/use_nodes_option": false, "systems/descr": "", "systems/max_local_jobs": 8, "systems/modules_system": "nomod", @@ -612,6 +596,10 @@ "systems/partitions/processor": {}, "systems/partitions/time_limit": null, "systems/partitions/devices": [], - "systems/partitions/extras": {} + "systems/partitions/extras": {}, + "systems*/sched_options/ignore_reqnodenotavail": false, + "systems*/sched_options/resubmit_on_errors": [], + "systems*/sched_options/job_submit_timeout": 60, + "systems*/sched_options/use_nodes_option": false } } diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index 1f0f76acd3..a0c2317ae2 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -378,7 +378,7 @@ def test_prepare_without_smt(fake_job, slurm_only): def test_prepare_nodes_option(make_exec_ctx, make_job, slurm_only): make_exec_ctx(test_util.TEST_CONFIG_FILE, 'generic', - {'schedulers/use_nodes_option': True}) + {'systems*/sched_options/use_nodes_option': True}) job = make_job() job.num_tasks = 16 job.num_tasks_per_node = 2 From 28564e8c7db11fccd33fc2caa876949ae0702066 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 29 Nov 2022 00:15:21 +0100 Subject: [PATCH 2/4] Update documentation --- docs/config_reference.rst | 161 +++++++++++++++++------------------- docs/configure.rst | 8 +- docs/pipeline.rst | 2 + reframe/schemas/config.json | 2 +- 4 files changed, 84 insertions(+), 89 deletions(-) diff --git 
a/docs/config_reference.rst b/docs/config_reference.rst index c3c9054b1c..fd51d41fac 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -57,13 +57,6 @@ It consists of the following properties: A list of `logging configuration objects <#logging-configuration>`__. -.. py:attribute:: .schedulers - - :required: No - - A list of `scheduler configuration objects <#scheduler-configuration>`__. - - .. py:attribute:: .modes :required: No @@ -77,6 +70,12 @@ It consists of the following properties: A list of `general configuration objects <#general-configuration>`__. +.. warning:: + .. versionchanged:: 4.0.0 + The ``schedulers`` section is removed. + Scheduler options should be set per partition using the ``sched_options`` attribute. + + System Configuration -------------------- @@ -205,6 +204,15 @@ System Configuration This list must have at least one element. +.. js:attribute:: .systems[].sched_options + + :required: No + :default: ``{}`` + + Scheduler options for the local scheduler that is associated with the ReFrame's execution context. + To understand the difference between the different execution contexts, please refer to ":ref:`execution-contexts`" + For the available scheduler options, see the :obj:`sched_options` in the partition configuration below. + ------------------------------ System Partition Configuration ------------------------------ @@ -294,6 +302,69 @@ System Partition Configuration the :py:attr:`extra_resources` will be simply ignored in this case and the scheduler backend will interpret the different test fields in the appropriate way. +.. js:attribute:: .systems[].partitions[].sched_options + + :required: No + :default: ``{}`` + + Scheduler-specific options for this partition. + See below for the available options. + + +.. js:attribute:: .systems[].partitions[].sched_options.ignore_reqnodenotavail + + :required: No + :default: ``false`` + + Ignore the ``ReqNodeNotAvail`` Slurm state. 
+ + If a job associated to a test is in pending state with the Slurm reason ``ReqNodeNotAvail`` and a list of unavailable nodes is also specified, ReFrame will check the status of the nodes and, if all of them are indeed down, it will cancel the job. + Sometimes, however, when Slurm's backfill algorithm takes too long to compute, Slurm will set the pending reason to ``ReqNodeNotAvail`` and mark all system nodes as unavailable, causing ReFrame to kill the job. + In such cases, you may set this parameter to ``true`` to avoid this. + + This option is relevant for the Slurm backends only. + +.. js:attribute:: .systems[].partitions[].sched_options.job_submit_timeout + + :required: No + :default: ``60`` + + Timeout in seconds for the job submission command. + + If timeout is reached, the test issuing that command will be marked as a failure. + + +.. js:attribute:: .systems[].partitions[].sched_options.resubmit_on_errors + + :required: No + :default: ``[]`` + + If any of the listed errors occur, try to resubmit the job after some seconds. + + As an example, you could have ReFrame trying to resubmit a job in case that the maximum submission limit per user is reached by setting this field to ``["QOSMaxSubmitJobPerUserLimit"]``. + You can ignore multiple errors at the same time if you add more error strings in the list. + + This option is relevant for the Slurm backends only. + + .. versionadded:: 3.4.1 + + .. warning:: + Job submission is a synchronous operation in ReFrame. + If this option is set, ReFrame's execution will block until the error conditions specified in this list are resolved. + No other test would be able to proceed. + + +.. js:attribute:: .systems[].partitions[].sched_options.use_nodes_option + + :required: No + :default: ``false`` + + Always emit the ``--nodes`` Slurm option in the preamble of the job script. + + This option is relevant for the Slurm backends only. + + + ..
js:attribute:: .systems[].partitions[].launcher :required: Yes @@ -1251,82 +1322,6 @@ An example configuration of this handler for performance logging is shown here: This handler transmits the whole log record, meaning that all the information will be available and indexable at the remote end. -Scheduler Configuration ------------------------ - -A scheduler configuration object contains configuration options specific to the scheduler's behavior. - - ------------------------- -Common scheduler options ------------------------- - - -.. js:attribute:: .schedulers[].name - - :required: Yes - - The name of the scheduler that these options refer to. - It can be any of the supported job scheduler `backends <#.systems[].partitions[].scheduler>`__. - - -.. js:attribute:: .schedulers[].job_submit_timeout - - :required: No - :default: 60 - - Timeout in seconds for the job submission command. - If timeout is reached, the regression test issuing that command will be marked as a failure. - - -.. js:attribute:: .schedulers[].target_systems - - :required: No - :default: ``["*"]`` - - A list of systems or system/partitions combinations that this scheduler configuration is valid for. - For a detailed description of this property, you may refer `here <#.environments[].target_systems>`__. - -.. js:attribute:: .schedulers[].use_nodes_option - - :required: No - :default: ``false`` - - Always emit the ``--nodes`` Slurm option in the preamble of the job script. - This option is relevant to Slurm backends only. - - -.. js:attribute:: .schedulers[].ignore_reqnodenotavail - - :required: No - :default: ``false`` - - This option is relevant to the Slurm backends only. - - If a job associated to a test is in pending state with the Slurm reason ``ReqNodeNotAvail`` and a list of unavailable nodes is also specified, ReFrame will check the status of the nodes and, if all of them are indeed down, it will cancel the job. 
- Sometimes, however, when Slurm's backfill algorithm takes too long to compute, Slurm will set the pending reason to ``ReqNodeNotAvail`` and mark all system nodes as unavailable, causing ReFrame to kill the job. - In such cases, you may set this parameter to ``true`` to avoid this. - - -.. js:attribute:: .schedulers[].resubmit_on_errors - - :required: No - :default: ``[]`` - - This option is relevant to the Slurm backends only. - - If any of the listed errors occur, ReFrame will try to resubmit the job after some seconds. - As an example, you could have ReFrame trying to resubmit a job in case that the maximum submission limit per user is reached by setting this field to ``["QOSMaxSubmitJobPerUserLimit"]``. - You can ignore multiple errors at the same time if you add more error strings in the list. - - .. versionadded:: 3.4.1 - - .. warning:: - Job submission is a synchronous operation in ReFrame. - If this option is set, ReFrame's execution will block until the error conditions specified in this list are resolved. - No other test would be able to proceed. - - Execution Mode Configuration ---------------------------- diff --git a/docs/configure.rst b/docs/configure.rst index 49239ebd66..20248ba10c 100644 --- a/docs/configure.rst +++ b/docs/configure.rst @@ -217,12 +217,10 @@ However, there are several options that can go into this section, but the reader Other configuration options --------------------------- -There are finally two more optional configuration sections that are not discussed here: - -1. The ``schedulers`` section holds configuration variables specific to the different scheduler backends and -2. the ``modes`` section defines different execution modes for the framework. - Execution modes are discussed in the :doc:`pipeline` page. +There is finally one additional optional configuration section that is not discussed here: +The ``modes`` section defines different execution modes for the framework. 
+Execution modes are discussed in the :doc:`pipeline` page. Building the Final Configuration diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 523073ecd2..d090f88aac 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -161,6 +161,8 @@ There are a number of things to notice in this diagram: The ``compile`` stage is now also executed asynchronously. +.. _execution-contexts: + -------------------------------------- Where each pipeline stage is executed? -------------------------------------- diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index e1d44fed14..9652b02ab7 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -598,8 +598,8 @@ "systems/partitions/devices": [], "systems/partitions/extras": {}, "systems*/sched_options/ignore_reqnodenotavail": false, - "systems*/sched_options/resubmit_on_errors": [], "systems*/sched_options/job_submit_timeout": 60, + "systems*/sched_options/resubmit_on_errors": [], "systems*/sched_options/use_nodes_option": false } } From bddc669b255cb310939e31dfa523df7ca90b5599 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 29 Nov 2022 00:19:14 +0100 Subject: [PATCH 3/4] Add missing version annotations --- docs/config_reference.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index fd51d41fac..bc6844a2d6 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -213,6 +213,9 @@ System Configuration To understand the difference between the different execution contexts, please refer to ":ref:`execution-contexts`" For the available scheduler options, see the :obj:`sched_options` in the partition configuration below. + .. versionadded:: 4.0.0 + + ------------------------------ System Partition Configuration ------------------------------ @@ -310,6 +313,8 @@ System Partition Configuration Scheduler-specific options for this partition. See below for the available options. + .. 
versionadded:: 4.0.0 + .. js:attribute:: .systems[].partitions[].sched_options.ignore_reqnodenotavail From bcfbce571a78ce2d841118d52de93394dbbf40d2 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 29 Nov 2022 21:12:09 +0100 Subject: [PATCH 4/4] Remove unused imports --- reframe/core/schedulers/flux.py | 1 - reframe/core/schedulers/lsf.py | 1 - reframe/core/schedulers/oar.py | 1 - reframe/core/schedulers/pbs.py | 1 - reframe/core/schedulers/sge.py | 1 - 5 files changed, 5 deletions(-) diff --git a/reframe/core/schedulers/flux.py b/reframe/core/schedulers/flux.py index 99b1362fd7..0b961fb465 100644 --- a/reframe/core/schedulers/flux.py +++ b/reframe/core/schedulers/flux.py @@ -14,7 +14,6 @@ import os import time -import reframe.core.runtime as rt from reframe.core.backends import register_scheduler from reframe.core.exceptions import JobError from reframe.core.schedulers import JobScheduler, Job diff --git a/reframe/core/schedulers/lsf.py b/reframe/core/schedulers/lsf.py index bbe178cc91..2205c48c49 100644 --- a/reframe/core/schedulers/lsf.py +++ b/reframe/core/schedulers/lsf.py @@ -14,7 +14,6 @@ import re import time -import reframe.core.runtime as rt import reframe.utility.osext as osext from reframe.core.backends import register_scheduler from reframe.core.exceptions import JobSchedulerError diff --git a/reframe/core/schedulers/oar.py b/reframe/core/schedulers/oar.py index c10c117785..31feceddb3 100644 --- a/reframe/core/schedulers/oar.py +++ b/reframe/core/schedulers/oar.py @@ -14,7 +14,6 @@ import re import time -import reframe.core.runtime as rt import reframe.utility.osext as osext from reframe.core.backends import register_scheduler from reframe.core.exceptions import JobError, JobSchedulerError diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py index 5a677b7839..2ec2de353f 100644 --- a/reframe/core/schedulers/pbs.py +++ b/reframe/core/schedulers/pbs.py @@ -15,7 +15,6 @@ import re import time -import reframe.core.runtime 
as rt import reframe.core.schedulers as sched import reframe.utility.osext as osext from reframe.core.backends import register_scheduler diff --git a/reframe/core/schedulers/sge.py b/reframe/core/schedulers/sge.py index f1175293a9..74ae6c5088 100644 --- a/reframe/core/schedulers/sge.py +++ b/reframe/core/schedulers/sge.py @@ -14,7 +14,6 @@ import time import xml.etree.ElementTree as ET -import reframe.core.runtime as rt import reframe.utility.osext as osext from reframe.core.backends import register_scheduler from reframe.core.exceptions import JobSchedulerError