diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 54ea1bcddf..cda52f4f4d 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -57,13 +57,6 @@ It consists of the following properties: A list of `logging configuration objects <#logging-configuration>`__. -.. py:attribute:: .schedulers - - :required: No - - A list of `scheduler configuration objects <#scheduler-configuration>`__. - - .. py:attribute:: .modes :required: No @@ -77,6 +70,12 @@ It consists of the following properties: A list of `general configuration objects <#general-configuration>`__. +.. warning:: + .. versionchanged:: 4.0.0 + The ``schedulers`` section is removed. + Scheduler options should be set per partition using the ``sched_options`` attribute. + + System Configuration -------------------- @@ -205,6 +204,18 @@ System Configuration This list must have at least one element. +.. js:attribute:: .systems[].sched_options + + :required: No + :default: ``{}`` + + Scheduler options for the local scheduler that is associated with ReFrame's execution context. + To understand the difference between the different execution contexts, please refer to ":ref:`execution-contexts`". + For the available scheduler options, see the :obj:`sched_options` in the partition configuration below. + + .. versionadded:: 4.0.0 + + ------------------------------ System Partition Configuration ------------------------------ @@ -294,6 +305,71 @@ System Partition Configuration the :py:attr:`extra_resources` will be simply ignored in this case and the scheduler backend will interpret the different test fields in the appropriate way. +.. js:attribute:: .systems[].partitions[].sched_options + + :required: No + :default: ``{}`` + + Scheduler-specific options for this partition. + See below for the available options. + + .. versionadded:: 4.0.0 + + +.. 
js:attribute:: .systems[].partitions[].sched_options.ignore_reqnodenotavail + + :required: No + :default: ``false`` + + Ignore the ``ReqNodeNotAvail`` Slurm state. + + If a job associated to a test is in pending state with the Slurm reason ``ReqNodeNotAvail`` and a list of unavailable nodes is also specified, ReFrame will check the status of the nodes and, if all of them are indeed down, it will cancel the job. + Sometimes, however, when Slurm's backfill algorithm takes too long to compute, Slurm will set the pending reason to ``ReqNodeNotAvail`` and mark all system nodes as unavailable, causing ReFrame to kill the job. + In such cases, you may set this parameter to ``true`` to avoid this. + + This option is relevant for the Slurm backends only. + +.. js:attribute:: .systems[].partitions[].sched_options.job_submit_timeout + + :required: No + :default: ``60`` + + Timeout in seconds for the job submission command. + + If timeout is reached, the test issuing that command will be marked as a failure. + + +.. js:attribute:: .systems[].partitions[].sched_options.resubmit_on_errors + + :required: No + :default: ``[]`` + + If any of the listed errors occur, try to resubmit the job after some seconds. + + As an example, you could have ReFrame trying to resubmit a job in case that the maximum submission limit per user is reached by setting this field to ``["QOSMaxSubmitJobPerUserLimit"]``. + You can ignore multiple errors at the same time if you add more error strings in the list. + + This option is relevant for the Slurm backends only. + + .. versionadded:: 3.4.1 + + .. warning:: + Job submission is a synchronous operation in ReFrame. + If this option is set, ReFrame's execution will block until the error conditions specified in this list are resolved. + No other test would be able to proceed. + + +.. 
js:attribute:: .systems[].partitions[].sched_options.use_nodes_option + + :required: No + :default: ``false`` + + Always emit the ``--nodes`` Slurm option in the preamble of the job script. + + This option is relevant for the Slurm backends only. + + + .. js:attribute:: .systems[].partitions[].launcher :required: Yes @@ -1287,82 +1363,6 @@ An example configuration of this handler for performance logging is shown here: This handler transmits the whole log record, meaning that all the information will be available and indexable at the remote end. -Scheduler Configuration ------------------------ - -A scheduler configuration object contains configuration options specific to the scheduler's behavior. - - ------------------------- -Common scheduler options ------------------------- - - -.. js:attribute:: .schedulers[].name - - :required: Yes - - The name of the scheduler that these options refer to. - It can be any of the supported job scheduler `backends <#.systems[].partitions[].scheduler>`__. - - -.. js:attribute:: .schedulers[].job_submit_timeout - - :required: No - :default: 60 - - Timeout in seconds for the job submission command. - If timeout is reached, the regression test issuing that command will be marked as a failure. - - -.. js:attribute:: .schedulers[].target_systems - - :required: No - :default: ``["*"]`` - - A list of systems or system/partitions combinations that this scheduler configuration is valid for. - For a detailed description of this property, you may refer `here <#.environments[].target_systems>`__. - -.. js:attribute:: .schedulers[].use_nodes_option - - :required: No - :default: ``false`` - - Always emit the ``--nodes`` Slurm option in the preamble of the job script. - This option is relevant to Slurm backends only. - - -.. js:attribute:: .schedulers[].ignore_reqnodenotavail - - :required: No - :default: ``false`` - - This option is relevant to the Slurm backends only. 
- - If a job associated to a test is in pending state with the Slurm reason ``ReqNodeNotAvail`` and a list of unavailable nodes is also specified, ReFrame will check the status of the nodes and, if all of them are indeed down, it will cancel the job. - Sometimes, however, when Slurm's backfill algorithm takes too long to compute, Slurm will set the pending reason to ``ReqNodeNotAvail`` and mark all system nodes as unavailable, causing ReFrame to kill the job. - In such cases, you may set this parameter to ``true`` to avoid this. - - -.. js:attribute:: .schedulers[].resubmit_on_errors - - :required: No - :default: ``[]`` - - This option is relevant to the Slurm backends only. - - If any of the listed errors occur, ReFrame will try to resubmit the job after some seconds. - As an example, you could have ReFrame trying to resubmit a job in case that the maximum submission limit per user is reached by setting this field to ``["QOSMaxSubmitJobPerUserLimit"]``. - You can ignore multiple errors at the same time if you add more error strings in the list. - - .. versionadded:: 3.4.1 - - .. warning:: - Job submission is a synchronous operation in ReFrame. - If this option is set, ReFrame's execution will block until the error conditions specified in this list are resolved. - No other test would be able to proceed. - - Execution Mode Configuration ---------------------------- diff --git a/docs/configure.rst b/docs/configure.rst index 674468ac53..26a0a7806d 100644 --- a/docs/configure.rst +++ b/docs/configure.rst @@ -216,12 +216,10 @@ However, there are several options that can go into this section, but the reader Other configuration options --------------------------- -There are finally two more optional configuration sections that are not discussed here: - -1. The ``schedulers`` section holds configuration variables specific to the different scheduler backends and -2. the ``modes`` section defines different execution modes for the framework. 
- Execution modes are discussed in the :doc:`pipeline` page. +There is finally one additional optional configuration section that is not discussed here: +The ``modes`` section defines different execution modes for the framework. +Execution modes are discussed in the :doc:`pipeline` page. Building the Final Configuration diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 523073ecd2..d090f88aac 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -161,6 +161,8 @@ There are a number of things to notice in this diagram: The ``compile`` stage is now also executed asynchronously. +.. _execution-contexts: + -------------------------------------- Where each pipeline stage is executed? -------------------------------------- diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index 7423a68736..bdc9470301 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -25,12 +25,45 @@ class JobMeta(RegressionTestMeta, abc.ABCMeta): '''Job metaclass.''' -class JobScheduler(abc.ABC): +class JobSchedulerMeta(abc.ABCMeta): + '''Metaclass for JobSchedulers. + + The purpose of this metaclass is to intercept the constructor call and + consume the `part_name` argument for setting up the configuration prefix + without requiring the users to call `super().__init__()` in their + constructors. This allows the base class to have the look and feel of a + pure interface. + + :meta private: + + ''' + def __call__(cls, *args, **kwargs): + part_name = kwargs.pop('part_name', None) + obj = cls.__new__(cls, *args, **kwargs) + if part_name: + obj._config_prefix = ( + f'systems/0/partitions/@{part_name}/sched_options' + ) + else: + obj._config_prefix = 'systems/0/sched_options' + + obj.__init__(*args, **kwargs) + return obj + + +class JobScheduler(abc.ABC, metaclass=JobSchedulerMeta): '''Abstract base class for job scheduler backends. :meta private: ''' + def get_option(self, name): + '''Get scheduler-specific option. 
+ + :meta private: + ''' + return runtime.runtime().get_option(f'{self._config_prefix}/{name}') + @abc.abstractmethod def make_job(self, *args, **kwargs): '''Create a new job to be managed by this scheduler. diff --git a/reframe/core/schedulers/flux.py b/reframe/core/schedulers/flux.py index 91d987893b..0b961fb465 100644 --- a/reframe/core/schedulers/flux.py +++ b/reframe/core/schedulers/flux.py @@ -14,7 +14,6 @@ import os import time -import reframe.core.runtime as rt from reframe.core.backends import register_scheduler from reframe.core.exceptions import JobError from reframe.core.schedulers import JobScheduler, Job @@ -65,9 +64,7 @@ def completed(self): class FluxJobScheduler(JobScheduler): def __init__(self): self._fexecutor = flux.job.FluxExecutor() - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def emit_preamble(self, job): # We don't need to submit with a file, so we don't need a preamble. 
diff --git a/reframe/core/schedulers/lsf.py b/reframe/core/schedulers/lsf.py index 10d42ff93b..2205c48c49 100644 --- a/reframe/core/schedulers/lsf.py +++ b/reframe/core/schedulers/lsf.py @@ -14,7 +14,6 @@ import re import time -import reframe.core.runtime as rt import reframe.utility.osext as osext from reframe.core.backends import register_scheduler from reframe.core.exceptions import JobSchedulerError @@ -27,9 +26,7 @@ class LsfJobScheduler(PbsJobScheduler): def __init__(self): self._prefix = '#BSUB' - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def _format_option(self, var, option): if var is not None: diff --git a/reframe/core/schedulers/oar.py b/reframe/core/schedulers/oar.py index 66e907041b..31feceddb3 100644 --- a/reframe/core/schedulers/oar.py +++ b/reframe/core/schedulers/oar.py @@ -14,7 +14,6 @@ import re import time -import reframe.core.runtime as rt import reframe.utility.osext as osext from reframe.core.backends import register_scheduler from reframe.core.exceptions import JobError, JobSchedulerError @@ -60,9 +59,7 @@ def oar_state_pending(state): class OarJobScheduler(PbsJobScheduler): def __init__(self): self._prefix = '#OAR' - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def emit_preamble(self, job): # host is de-facto nodes and core is number of cores requested per node diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py index 9de089468e..2ec2de353f 100644 --- a/reframe/core/schedulers/pbs.py +++ b/reframe/core/schedulers/pbs.py @@ -15,7 +15,6 @@ import re import time -import reframe.core.runtime as rt import reframe.core.schedulers as sched import reframe.utility.osext as osext from reframe.core.backends import register_scheduler @@ -76,9 +75,7 @@ class 
PbsJobScheduler(sched.JobScheduler): def __init__(self): self._prefix = '#PBS' - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def _emit_lselect_option(self, job): num_tasks_per_node = job.num_tasks_per_node or 1 diff --git a/reframe/core/schedulers/sge.py b/reframe/core/schedulers/sge.py index f1b9c13e6a..74ae6c5088 100644 --- a/reframe/core/schedulers/sge.py +++ b/reframe/core/schedulers/sge.py @@ -14,7 +14,6 @@ import time import xml.etree.ElementTree as ET -import reframe.core.runtime as rt import reframe.utility.osext as osext from reframe.core.backends import register_scheduler from reframe.core.exceptions import JobSchedulerError @@ -28,9 +27,7 @@ class SgeJobScheduler(PbsJobScheduler): def __init__(self): self._prefix = '#$' - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) + self._submit_timeout = self.get_option('job_submit_timeout') def emit_preamble(self, job): preamble = [ diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 7bc3a1a086..f2be99073c 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -132,22 +132,14 @@ def __init__(self): 'QOSJobLimit', 'QOSResourceLimit', 'QOSUsageThreshold'] - ignore_reqnodenotavail = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/ignore_reqnodenotavail' - ) + ignore_reqnodenotavail = self.get_option('ignore_reqnodenotavail') if not ignore_reqnodenotavail: self._cancel_reasons.append('ReqNodeNotAvail') self._update_state_count = 0 - self._submit_timeout = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/job_submit_timeout' - ) - self._use_nodes_opt = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/use_nodes_option' - ) - self._resubmit_on_errors = rt.runtime().get_option( - 
f'schedulers/@{self.registered_name}/resubmit_on_errors' - ) + self._submit_timeout = self.get_option('job_submit_timeout') + self._use_nodes_opt = self.get_option('use_nodes_option') + self._resubmit_on_errors = self.get_option('resubmit_on_errors') def make_job(self, *args, **kwargs): return _SlurmJob(*args, **kwargs) diff --git a/reframe/core/systems.py b/reframe/core/systems.py index c07e04bca8..a45783e48e 100644 --- a/reframe/core/systems.py +++ b/reframe/core/systems.py @@ -314,7 +314,7 @@ def scheduler(self): ''' if self._scheduler is None: - self._scheduler = self._sched_type() + self._scheduler = self._sched_type(part_name=self.name) return self._scheduler diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 75e7386539..fbb384e02d 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -586,7 +586,7 @@ def main(): argparser.add_argument( dest='ignore_reqnodenotavail', envvar='RFM_IGNORE_REQNODENOTAVAIL', - configvar='schedulers/ignore_reqnodenotavail', + configvar='systems*/sched_options/ignore_reqnodenotavail', action='store_true', help='Ignore ReqNodeNotAvail Slurm error' ) diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 59a8af87da..697004ebbb 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -102,6 +102,18 @@ } ] }, + "sched_options": { + "type": "object", + "properties": { + "ignore_reqnodenotavail": {"type": "boolean"}, + "job_submit_timeout": {"type": "number"}, + "resubmit_on_errors": { + "type": "array", + "items": {"type": "string"} + }, + "use_nodes_option": {"type": "boolean"} + } + }, "stream_handler": { "allOf": [ {"$ref": "#/defs/handler_common"}, @@ -382,29 +394,6 @@ "additionalProperties": false } }, - "schedulers": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name": { - "type": "string", - "enum": ["flux", "local", "lsf", "oar", "pbs", - "sge", "slurm", "squeue", "torque"] - }, - "ignore_reqnodenotavail": {"type": 
"boolean"}, - "resubmit_on_errors": { - "type": "array", - "items": {"type": "string"} - }, - "job_submit_timeout": {"type": "number"}, - "target_systems": {"$ref": "#/defs/system_ref"}, - "use_nodes_option": {"type": "boolean"} - }, - "required": ["name"], - "additionalProperties": false - } - }, "logging": { "type": "array", "items": { @@ -579,11 +568,6 @@ "logging/handlers*/syslog_facility": "user", "modes/options": [], "modes/target_systems": ["*"], - "schedulers/ignore_reqnodenotavail": false, - "schedulers/resubmit_on_errors": [], - "schedulers/job_submit_timeout": 60, - "schedulers/target_systems": ["*"], - "schedulers/use_nodes_option": false, "systems/descr": "", "systems/max_local_jobs": 8, "systems/modules_system": "nomod", @@ -613,6 +597,10 @@ "systems/partitions/processor": {}, "systems/partitions/time_limit": null, "systems/partitions/devices": [], - "systems/partitions/extras": {} + "systems/partitions/extras": {}, + "systems*/sched_options/ignore_reqnodenotavail": false, + "systems*/sched_options/job_submit_timeout": 60, + "systems*/sched_options/resubmit_on_errors": [], + "systems*/sched_options/use_nodes_option": false } } diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index 1f0f76acd3..a0c2317ae2 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -378,7 +378,7 @@ def test_prepare_without_smt(fake_job, slurm_only): def test_prepare_nodes_option(make_exec_ctx, make_job, slurm_only): make_exec_ctx(test_util.TEST_CONFIG_FILE, 'generic', - {'schedulers/use_nodes_option': True}) + {'systems*/sched_options/use_nodes_option': True}) job = make_job() job.num_tasks = 16 job.num_tasks_per_node = 2