Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions docs/config_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,25 @@ Common scheduler options
In such cases, you may set this parameter to ``true`` to avoid this.


.. js:attribute:: .schedulers[].resubmit_on_errors

:required: No
:default: ``[]``

This option is relevant to the Slurm backends only.

If job submission fails with any of the listed errors, ReFrame will try to resubmit the job after a few seconds.
For example, you can have ReFrame resubmit a job when the maximum submission limit per user is reached by setting this field to ``["QOSMaxSubmitJobPerUserLimit"]``.
You can have ReFrame retry on multiple errors by adding more error strings to the list.

.. versionadded:: 3.5

.. warning::
Job submission is a synchronous operation in ReFrame.
If this option is set, ReFrame's execution will block until the error conditions specified in this list are resolved.
No other test will be able to proceed in the meantime.


Execution Mode Configuration
----------------------------

Expand Down
23 changes: 22 additions & 1 deletion reframe/core/schedulers/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ def __init__(self):
self._use_nodes_opt = rt.runtime().get_option(
f'schedulers/@{self.registered_name}/use_nodes_option'
)
self._resubmit_on_errors = rt.runtime().get_option(
f'schedulers/@{self.registered_name}/resubmit_on_errors'
)

def make_job(self, *args, **kwargs):
    '''Create the job object used by this scheduler backend.

    All positional and keyword arguments are forwarded unchanged to the
    ``_SlurmJob`` constructor.

    :returns: a new ``_SlurmJob`` instance.
    '''
    slurm_job = _SlurmJob(*args, **kwargs)
    return slurm_job
Expand Down Expand Up @@ -227,7 +230,25 @@ def emit_preamble(self, job):

def submit(self, job):
cmd = f'sbatch {job.script_filename}'
completed = _run_strict(cmd, timeout=self._submit_timeout)
intervals = itertools.cycle([1, 2, 3])
while True:
try:
completed = _run_strict(cmd, timeout=self._submit_timeout)
break
except SpawnedProcessError as e:
error_match = re.search(
rf'({"|".join(self._resubmit_on_errors)})', e.stderr
)
if not self._resubmit_on_errors or not error_match:
raise

t = next(intervals)
self.log(
f'encountered a job submission error: '
f'{error_match.group(1)}: will resubmit after {t}s'
)
time.sleep(t)

jobid_match = re.search(r'Submitted batch job (?P<jobid>\d+)',
completed.stdout)
if not jobid_match:
Expand Down
5 changes: 5 additions & 0 deletions reframe/schemas/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,10 @@
"enum": ["local", "pbs", "slurm", "squeue", "torque"]
},
"ignore_reqnodenotavail": {"type": "boolean"},
"resubmit_on_errors": {
"type": "array",
"items": {"type": "string"}
},
"job_submit_timeout": {"type": "number"},
"target_systems": {"$ref": "#/defs/system_ref"},
"use_nodes_option": {"type": "boolean"}
Expand Down Expand Up @@ -445,6 +449,7 @@
"modes/options": [],
"modes/target_systems": ["*"],
"schedulers/ignore_reqnodenotavail": false,
"schedulers/resubmit_on_errors": [],
"schedulers/job_submit_timeout": 60,
"schedulers/target_systems": ["*"],
"schedulers/use_nodes_option": false,
Expand Down