diff --git a/docs/config_reference.rst b/docs/config_reference.rst
index 0c7b27d50b..265567cd2d 100644
--- a/docs/config_reference.rst
+++ b/docs/config_reference.rst
@@ -210,19 +210,22 @@ System Partition Configuration
Supported schedulers are the following:
- ``local``: Jobs will be launched locally without using any job scheduler.
+ - ``oar``: Jobs will be launched using the `OAR <https://oar.imag.fr/>`__ scheduler.
 - ``pbs``: Jobs will be launched using the `PBS Pro <https://www.altair.com/pbs-works/>`__ scheduler.
- - ``torque``: Jobs will be launched using the `Torque <https://adaptivecomputing.com/cherry-services/torque-resource-manager/>`__ scheduler.
 - ``sge``: Jobs will be launched using the `Sun Grid Engine <https://arc.liv.ac.uk/trac/SGE>`__ scheduler.
 - ``slurm``: Jobs will be launched using the `Slurm <https://www.schedmd.com/>`__ scheduler.
 This backend requires job accounting to be enabled in the target system.
 If not, you should consider using the ``squeue`` backend below.
 - ``squeue``: Jobs will be launched using the `Slurm <https://www.schedmd.com/>`__ scheduler.
 This backend does not rely on job accounting to retrieve job statuses, but ReFrame does its best to query the job state as reliably as possible.
- - ``lsf``: Jobs will be launched using the `LSF <https://www.ibm.com/products/hpc-workload-management>`__ scheduler.
+ - ``torque``: Jobs will be launched using the `Torque <https://adaptivecomputing.com/cherry-services/torque-resource-manager/>`__ scheduler.
.. versionadded:: 3.7.2
Support for the SGE scheduler is added.
+ .. versionadded:: 3.8.2
+ Support for the OAR scheduler is added.
+
.. note::
The way that multiple node jobs are submitted using the SGE scheduler can be very site-specific.
diff --git a/docs/tutorial_tips_tricks.rst b/docs/tutorial_tips_tricks.rst
index 6f63faf191..8d47639dc5 100644
--- a/docs/tutorial_tips_tricks.rst
+++ b/docs/tutorial_tips_tricks.rst
@@ -552,8 +552,10 @@ The following is an example of ``.gitlab-ci.yml`` file that does exactly that:
It defines two stages.
The first one, called ``generate``, will call ReFrame to generate the pipeline specification for the desired tests.
 All the usual `test selection options <https://reframe-hpc.readthedocs.io/en/stable/manpage.html#test-filtering>`__ can be used to select specific tests.
-ReFrame will process them as usual, but instead of running the selected tests, it will generate the correct steps for running each test individually as a Gitlab job.
-We then pass the generated CI pipeline file to second phase as an artifact and we are done!
+ReFrame will process them as usual, but instead of running the selected tests, it will generate the correct steps
+for running each test individually as a Gitlab job. We then pass the generated CI pipeline file to the second phase
+as an artifact and we are done! If the ``image`` keyword is defined in ``.gitlab-ci.yml``, the emitted pipeline will
+use the same image as the one defined in the parent pipeline.
The following figure shows one part of the automatically generated pipeline for the test graph depicted `above <#fig-deps-complex>`__.
diff --git a/reframe/core/backends.py b/reframe/core/backends.py
index 4a8205b897..ff202be366 100644
--- a/reframe/core/backends.py
+++ b/reframe/core/backends.py
@@ -18,10 +18,11 @@
_launchers = {}
_scheduler_backend_modules = [
'reframe.core.schedulers.local',
- 'reframe.core.schedulers.slurm',
+ 'reframe.core.schedulers.lsf',
'reframe.core.schedulers.pbs',
+ 'reframe.core.schedulers.oar',
'reframe.core.schedulers.sge',
- 'reframe.core.schedulers.lsf'
+ 'reframe.core.schedulers.slurm'
]
_schedulers = {}
diff --git a/reframe/core/schedulers/oar.py b/reframe/core/schedulers/oar.py
new file mode 100644
index 0000000000..236a0f702e
--- /dev/null
+++ b/reframe/core/schedulers/oar.py
@@ -0,0 +1,190 @@
+# Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich)
+# ReFrame Project Developers. See the top-level LICENSE file for details.
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+#
+# OAR backend
+#
+# - Initial version submitted by Mahendra Paipuri, INRIA
+#
+
+import functools
+import os
+import re
+import time
+
+import reframe.core.runtime as rt
+import reframe.utility.osext as osext
+from reframe.core.backends import register_scheduler
+from reframe.core.exceptions import JobError, JobSchedulerError
+from reframe.core.schedulers.pbs import PbsJobScheduler
+from reframe.utility import seconds_to_hms
+
+
+# States can be found here:
+# https://github.com/oar-team/oar/blob/0fccc4fc3bb86ee935ce58effc5aec514a3e155d/sources/core/qfunctions/oarstat#L293
+def oar_state_completed(state):
+ completion_states = {
+ 'Error',
+ 'Terminated',
+ }
+ if state:
+ return all(s in completion_states for s in state.split(','))
+
+ return False
+
+
+def oar_state_pending(state):
+ pending_states = {
+ 'Waiting',
+ 'toLaunch',
+ 'Launching',
+ 'Hold',
+ 'Running',
+ 'toError',
+ 'Finishing',
+ 'Suspended',
+ 'Resuming',
+ }
+ if state:
+ return any(s in pending_states for s in state.split(','))
+
+ return False
+
+
+_run_strict = functools.partial(osext.run_command, check=True)
+
+
+@register_scheduler('oar')
+class OarJobScheduler(PbsJobScheduler):
+ # host is de-facto nodes and core is number of cores requested per node
+ # number of sockets can also be specified using cpu={num_sockets}
+ TASKS_OPT = '-l /host={num_nodes}/core={num_tasks_per_node}'
+
+ def __init__(self):
+ self._prefix = '#OAR'
+ self._submit_timeout = rt.runtime().get_option(
+ f'schedulers/@{self.registered_name}/job_submit_timeout'
+ )
+
+ def emit_preamble(self, job):
+ # Same reason as oarsub, we give full path to output and error files to
+ # avoid writing them in the working dir
+ preamble = [
+ self._format_option(f'-n "{job.name}"'),
+ self._format_option(f'-O {os.path.join(job.workdir, job.stdout)}'),
+ self._format_option(f'-E {os.path.join(job.workdir, job.stderr)}'),
+ ]
+
+ if job.time_limit is not None:
+ h, m, s = seconds_to_hms(job.time_limit)
+ self.TASKS_OPT += ',walltime=%d:%d:%d' % (h, m, s)
+
+ # Get number of nodes in the reservation
+ num_tasks_per_node = job.num_tasks_per_node or 1
+ num_nodes = job.num_tasks // num_tasks_per_node
+
+ # Emit main resource reservation option
+ options = [self.TASKS_OPT.format(
+ num_nodes=num_nodes, num_tasks_per_node=num_tasks_per_node,
+ )]
+
+ # Emit the rest of the options
+ options += job.sched_access + job.options + job.cli_options
+ for opt in options:
+ if opt.startswith('#'):
+ preamble.append(opt)
+ else:
+ preamble.append(self._format_option(opt))
+
+ # OAR starts the job in the home directory by default
+ preamble.append(f'cd {job.workdir}')
+ return preamble
+
+ def submit(self, job):
+ # For some reason OAR job manager says that job launching dir is
+ # working dir of the repo and not stage dir. A workaround is to give
+ # full path of script to oarsub
+ job_script_fullpath = os.path.join(job.workdir, job.script_filename)
+
+ # OAR needs -S to submit job in batch mode
+ cmd = f'oarsub -S {job_script_fullpath}'
+ completed = _run_strict(cmd, timeout=self._submit_timeout)
+ jobid_match = re.search(r'.*OAR_JOB_ID=(?P<jobid>\S+)',
+ completed.stdout)
+ if not jobid_match:
+ raise JobSchedulerError('could not retrieve the job id '
+ 'of the submitted job')
+
+ job._jobid = jobid_match.group('jobid')
+ job._submit_time = time.time()
+
+ def cancel(self, job):
+ _run_strict(f'oardel {job.jobid}', timeout=self._submit_timeout)
+ job._cancelled = True
+
+ def poll(self, *jobs):
+ if jobs:
+ # Filter out non-jobs
+ jobs = [job for job in jobs if job is not None]
+
+ if not jobs:
+ return
+
+ for job in jobs:
+ completed = _run_strict(
+ f'oarstat -fj {job.jobid}'
+ )
+
+ # Store information for each job separately
+ jobinfo = {}
+
+ # Typical oarstat -fj output:
+ # https://github.com/oar-team/oar/blob/0fccc4fc3bb86ee935ce58effc5aec514a3e155d/sources/core/qfunctions/oarstat#L310
+ job_raw_info = completed.stdout
+ jobid_match = re.search(
+ r'^Job_Id:\s*(?P<jobid>\S+)', completed.stdout, re.MULTILINE
+ )
+ if jobid_match:
+ jobid = jobid_match.group('jobid')
+ jobinfo[jobid] = job_raw_info
+
+ if job.jobid not in jobinfo:
+ self.log(f'Job {job.jobid} not known to scheduler, '
+ f'assuming job completed')
+ job._state = 'Terminated'
+ job._completed = True
+ continue
+
+ info = jobinfo[job.jobid]
+ state_match = re.search(
+ r'^\s*state = (?P<state>[A-Z]\S+)', info, re.MULTILINE
+ )
+ if not state_match:
+ self.log(f'Job state not found (job info follows):\n{info}')
+ continue
+
+ job._state = state_match.group('state')
+ if oar_state_completed(job.state):
+ exitcode_match = re.search(
+ r'^\s*exit_code = (?P<code>\d+)',
+ info, re.MULTILINE,
+ )
+
+ if exitcode_match:
+ job._exitcode = int(exitcode_match.group('code'))
+
+ # We report a job as finished only when its stdout/stderr are
+ # written back to the working directory
+ stdout = os.path.join(job.workdir, job.stdout)
+ stderr = os.path.join(job.workdir, job.stderr)
+ out_ready = os.path.exists(stdout) and os.path.exists(stderr)
+ done = job.cancelled or out_ready
+ if done:
+ job._completed = True
+ elif oar_state_pending(job.state) and job.max_pending_time:
+ if time.time() - job.submit_time >= job.max_pending_time:
+ self.cancel(job)
+ job._exception = JobError('maximum pending time exceeded',
+ job.jobid)
diff --git a/reframe/core/schedulers/registry.py b/reframe/core/schedulers/registry.py
index 8da99255e1..1fb5e775db 100644
--- a/reframe/core/schedulers/registry.py
+++ b/reframe/core/schedulers/registry.py
@@ -36,8 +36,8 @@ def getscheduler(name):
# Import the schedulers modules to trigger their registration
import reframe.core.schedulers.local # noqa: F401, F403
-import reframe.core.schedulers.slurm # noqa: F401, F403
+import reframe.core.schedulers.lsf # noqa: F401, F403
+import reframe.core.schedulers.oar # noqa: F401, F403
import reframe.core.schedulers.pbs # noqa: F401, F403
import reframe.core.schedulers.sge # noqa: F401, F403
-import reframe.core.schedulers.torque # noqa: F401, F403
-import reframe.core.schedulers.lsf # noqa: F401, F403
+import reframe.core.schedulers.slurm # noqa: F401, F403
diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json
index 70d82e8fc8..7c6f78979e 100644
--- a/reframe/schemas/config.json
+++ b/reframe/schemas/config.json
@@ -247,8 +247,8 @@
"scheduler": {
"type": "string",
"enum": [
- "local", "pbs", "slurm",
- "sge", "squeue", "torque", "lsf"
+ "local", "lsf", "oar", "pbs",
+ "sge", "slurm", "squeue", "torque"
]
},
"launcher": {
@@ -370,7 +370,8 @@
"properties": {
"name": {
"type": "string",
- "enum": ["local", "pbs", "sge", "slurm", "squeue", "torque", "lsf"]
+ "enum": ["local", "lsf", "oar", "pbs",
+ "sge", "slurm", "squeue", "torque"]
},
"ignore_reqnodenotavail": {"type": "boolean"},
"resubmit_on_errors": {
diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py
index 8f082e2548..db864afc90 100644
--- a/unittests/test_schedulers.py
+++ b/unittests/test_schedulers.py
@@ -26,8 +26,8 @@ def launcher():
return getlauncher('local')
-@pytest.fixture(params=['sge', 'slurm', 'squeue', 'local',
- 'pbs', 'torque', 'lsf'])
+@pytest.fixture(params=['local', 'lsf', 'oar', 'pbs',
+ 'sge', 'slurm', 'squeue', 'torque'])
def scheduler(request):
return getscheduler(request.param)
@@ -226,6 +226,21 @@ def _expected_torque_directives(job):
])
+def _expected_oar_directives(job):
+ num_nodes = job.num_tasks // job.num_tasks_per_node
+ num_tasks_per_node = job.num_tasks_per_node
+ return set([
+ f'#OAR -n "testjob"',
+ f'#OAR -O {job.stdout}',
+ f'#OAR -E {job.stderr}',
+ f'#OAR -l /host={num_nodes}/core={num_tasks_per_node},walltime=0:5:0',
+ f'#OAR --account=spam',
+ f'#OAR --gres=gpu:4',
+ f'#DW jobdw capacity=100GB',
+ f'#DW stage_in source=/foo'
+ ])
+
+
def _expected_local_directives(job):
return set()