From 3dd643fc758efa5e51b51e7347a50256c0710123 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 27 May 2020 00:13:46 +0200 Subject: [PATCH 1/2] Add configuration option to disable the ReqNodeNotAvail check --- docs/config_reference.rst | 11 +++++++++++ docs/manpage.rst | 13 +++++++++++++ reframe/core/schedulers/slurm.py | 7 ++++++- reframe/frontend/cli.py | 6 ++++++ schemas/config.json | 7 ++++++- 5 files changed, 42 insertions(+), 2 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index c28faa346d..da9f7b3908 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -958,6 +958,17 @@ Common scheduler options For a detailed description of this property, you may refer `here <#.environments[].target_systems>`__. +.. js:attribute:: .schedulers[].ignore_reqnodenotavail + + :required: No + :default: ``false`` + + This option is relevant to the Slurm backends only. + + If a job associated with a test is in pending state with the Slurm reason ``ReqNodeNotAvail`` and a list of unavailable nodes is also specified, ReFrame will check the status of the nodes and, if all of them are indeed down, it will cancel the job. + Sometimes, however, when Slurm's backfill algorithm takes too long to compute, Slurm will set the pending reason to ``ReqNodeNotAvail`` and mark all system nodes as unavailable, causing ReFrame to kill the job. + In such cases, you may set this parameter to ``true`` to prevent ReFrame from cancelling the job. + Execution Mode Configuration ---------------------------- diff --git a/docs/manpage.rst b/docs/manpage.rst index aae6c41378..130faceb7c 100644 --- a/docs/manpage.rst +++ b/docs/manpage.rst @@ -610,6 +610,19 @@ Here is an alphabetical list of the environment variables recognized by ReFrame: ================================== ================== +.. envvar:: RFM_IGNORE_REQNODENOTAVAIL + + Do not apply any special treatment to jobs in pending state with the reason ``ReqNodeNotAvail`` (Slurm only). + + ..
table:: + :align: left + + ================================== ================== + Associated command line option N/A + Associated configuration parameter :js:attr:`ignore_reqnodenotavail` scheduler configuration parameter + ================================== ================== + + .. envvar:: RFM_KEEP_STAGE_FILES Keep test stage directories even for tests that finish successfully. diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 279774899b..44ec36e87b 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -97,8 +97,13 @@ def __init__(self): 'PartitionNodeLimit', 'QOSJobLimit', 'QOSResourceLimit', - 'ReqNodeNotAvail', 'QOSUsageThreshold'] + ignore_reqnodenotavail = rt.runtime().get_option( + f'schedulers/@{self.registered_name}/ignore_reqnodenotavail' + ) + if not ignore_reqnodenotavail: + self._cancel_reasons.append('ReqNodeNotAvail') + self._is_cancelling = False self._is_job_array = None self._update_state_count = 0 diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index db49136aa9..c66a8acd52 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -346,6 +346,12 @@ def main(): configvar='logging/handlers_perflog/graylog_address', help='Graylog server address' ) + argparser.add_argument( + dest='ignore_reqnodenotavail', + envvar='RFM_IGNORE_REQNODENOTAVAIL', + configvar='schedulers/ignore_reqnodenotavail', + help='Graylog server address' + ) if len(sys.argv) == 1: argparser.print_help() diff --git a/schemas/config.json b/schemas/config.json index ac7dfebaf8..9545c516c2 100644 --- a/schemas/config.json +++ b/schemas/config.json @@ -273,7 +273,11 @@ "items": { "type": "object", "properties": { - "name": {"type": "string"}, + "name": { + "type": "string", + "enum": ["local", "pbs", "slurm", "squeue", "torque"] + }, + "ignore_reqnodenotavail": {"type": "boolean"}, "job_submit_timeout": {"type": "number"}, "target_systems": {"$ref": "#/defs/system_ref"} }, @@ -415,6 
+419,7 @@ "logging/handlers*/syslog_facility": "user", "modes/options": [], "modes/target_systems": ["*"], + "schedulers/ignore_reqnodenotavail": false, "schedulers/job_submit_timeout": 60, "schedulers/target_systems": ["*"], "systems/descr": "", From b7319818d1171a8d0cdfda1b82986b55c3d55f5a Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 27 May 2020 23:19:00 +0200 Subject: [PATCH 2/2] Make RFM_IGNORE_REQNODENOTAVAIL boolean and fix its help text --- reframe/frontend/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index c66a8acd52..c10de90197 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -350,6 +350,7 @@ def main(): dest='ignore_reqnodenotavail', envvar='RFM_IGNORE_REQNODENOTAVAIL', configvar='schedulers/ignore_reqnodenotavail', - help='Graylog server address' + action='store_true', + help='Ignore the ReqNodeNotAvail pending reason of Slurm jobs' )