diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 9fdf9aa7fe..7259cca0bb 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -961,6 +961,17 @@ Common scheduler options For a detailed description of this property, you may refer `here <#.environments[].target_systems>`__. +.. js:attribute:: .schedulers[].ignore_reqnodenotavail + + :required: No + :default: ``false`` + + This option is relevant to the Slurm backends only. + + If a job associated with a test is in pending state with the Slurm reason ``ReqNodeNotAvail`` and a list of unavailable nodes is also specified, ReFrame will check the status of the nodes and, if all of them are indeed down, it will cancel the job. + Sometimes, however, when Slurm's backfill algorithm takes too long to compute, Slurm will set the pending reason to ``ReqNodeNotAvail`` and mark all system nodes as unavailable, causing ReFrame to kill the job. + In such cases, you may set this parameter to ``true`` to prevent ReFrame from cancelling the job. + Execution Mode Configuration ---------------------------- diff --git a/docs/manpage.rst b/docs/manpage.rst index 9cdd059812..0aca1c086a 100644 --- a/docs/manpage.rst +++ b/docs/manpage.rst @@ -635,6 +635,19 @@ Here is an alphabetical list of the environment variables recognized by ReFrame: ================================== ================== +.. envvar:: RFM_IGNORE_REQNODENOTAVAIL + + Do not treat jobs in pending state with the reason ``ReqNodeNotAvail`` specially (Slurm only). + + .. table:: + :align: left + + ================================== ================== + Associated command line option N/A + Associated configuration parameter :js:attr:`ignore_reqnodenotavail` scheduler configuration parameter + ================================== ================== + + .. envvar:: RFM_KEEP_STAGE_FILES Keep test stage directories even for tests that finish successfully. 
diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 279774899b..44ec36e87b 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -97,8 +97,14 @@ def __init__(self): 'PartitionNodeLimit', 'QOSJobLimit', 'QOSResourceLimit', - 'ReqNodeNotAvail', 'QOSUsageThreshold'] + # Keep ReqNodeNotAvail as a cancel reason unless configured to ignore it + ignore_reqnodenotavail = rt.runtime().get_option( + f'schedulers/@{self.registered_name}/ignore_reqnodenotavail' + ) + if not ignore_reqnodenotavail: + self._cancel_reasons.append('ReqNodeNotAvail') + self._is_cancelling = False self._is_job_array = None self._update_state_count = 0 diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 1c48bec6bc..4701681662 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -351,6 +351,13 @@ def main(): configvar='logging/handlers_perflog/graylog_address', help='Graylog server address' ) + argparser.add_argument( + dest='ignore_reqnodenotavail', + envvar='RFM_IGNORE_REQNODENOTAVAIL', + configvar='schedulers/ignore_reqnodenotavail', + action='store_true', + help='Ignore ReqNodeNotAvail Slurm state' + ) argparser.add_argument( dest='use_login_shell', envvar='RFM_USE_LOGIN_SHELL', diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index dc7c05b442..da54649375 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -273,7 +273,11 @@ "items": { "type": "object", "properties": { - "name": {"type": "string"}, + "name": { + "type": "string", + "enum": ["local", "pbs", "slurm", "squeue", "torque"] + }, + "ignore_reqnodenotavail": {"type": "boolean"}, "job_submit_timeout": {"type": "number"}, "target_systems": {"$ref": "#/defs/system_ref"} }, @@ -417,6 +421,7 @@ "logging/handlers*/syslog_facility": "user", "modes/options": [], "modes/target_systems": ["*"], + "schedulers/ignore_reqnodenotavail": false, "schedulers/job_submit_timeout": 60, "schedulers/target_systems": ["*"], "systems/descr": "",