Skip to content

Commit bfbc02d

Browse files
author
Theofilos Manitaras
committed
Add 'maint' mode for '--flex-alloc-nodes'
* Add unittests * Update documentation
1 parent c18cde8 commit bfbc02d

File tree

5 files changed

+134
-23
lines changed

5 files changed

+134
-23
lines changed

docs/manpage.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,10 @@ Options controlling job submission
341341
If ``key`` starts with ``-`` or ``#``, the option will be passed verbatim to the job script.
342342
Otherwise, ReFrame will add ``-`` or ``--`` as well as the directive corresponding to the current scheduler.
343343
This option will be emitted after any options specified in the :js:attr:`access` system partition configuration parameter.
344+
Especially for the Slurm scheduler, constraint options, i.e. ``-J constraint=value``, ``-J C=value``, ``-J --constraint=value``, ``-J -C=value``, are going to be combined with the corresponding ones specified in the :js:attr:`access` system partition configuration parameter.
345+
If multiple constraint options are specified with separate key-value pairs, only the last one is going to be taken into account.
346+
For multiple combined constraints use the ``-J constraint=value1,value2`` syntax.
347+
Note that the above is not valid if ``key`` starts with ``#``, in which case the option is going to be passed verbatim to the job script.
344348

345349

346350
------------------------
@@ -364,6 +368,8 @@ If no node can be selected, the test will be marked as a failure with an appropr
364368
It is therefore possible that the number of tasks assigned does not correspond to the actual idle nodes.
365369

366370
This is the default policy.
371+
372+
- ``maint``: Flexible tests will be assigned as many tasks as needed in order to span over the nodes of the node pool which are currently under maintenance.
367373
- Any positive integer: Flexible tests will be assigned as many tasks as needed in order to span over the specified number of nodes from the node pool.
368374

369375
---------------------------------------

reframe/core/schedulers/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,13 @@ def guess_num_tasks(self):
361361
'flex_alloc_nodes: selecting idle nodes: '
362362
'available nodes now: %s' % len(available_nodes)
363363
)
364+
elif self.sched_flex_alloc_nodes == 'maint':
365+
available_nodes = {n for n in available_nodes
366+
if n.under_maintenance()}
367+
getlogger().debug(
368+
'flex_alloc_nodes: selecting nodes under maintenance: '
369+
'available nodes now: %s' % len(available_nodes)
370+
)
364371

365372
return len(available_nodes) * num_tasks_per_node
366373

reframe/core/schedulers/slurm.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,11 +195,35 @@ def emit_preamble(self, job):
195195
hint = 'multithread' if job.use_smt else 'nomultithread'
196196

197197
for opt in job.sched_access:
198-
preamble.append('%s %s' % (self._prefix, opt))
198+
if not opt.strip().startswith(('-C', '--constraint')):
199+
preamble.append('%s %s' % (self._prefix, opt))
200+
201+
constraints = []
202+
constraint_parser = ArgumentParser()
203+
constraint_parser.add_argument('-C', '--constraint')
204+
parsed_options, _ = constraint_parser.parse_known_args(
205+
job.sched_access)
206+
if parsed_options.constraint:
207+
constraints.append(parsed_options.constraint.strip())
208+
209+
# NOTE: Here last of the passed --constraint job options is taken
210+
# into account in order to respect the behavior of slurm.
211+
parsed_options, _ = constraint_parser.parse_known_args(job.options)
212+
if parsed_options.constraint:
213+
constraints.append(parsed_options.constraint.strip())
214+
215+
if constraints:
216+
preamble.append(
217+
self._format_option(','.join(constraints), '--constraint={0}')
218+
)
199219

200220
preamble.append(self._format_option(hint, '--hint={0}'))
201221
prefix_patt = re.compile(r'(#\w+)')
202222
for opt in job.options:
223+
if opt.strip().startswith(('-C', '--constraint')):
224+
# Constraints are already processed
225+
continue
226+
203227
if not prefix_patt.match(opt):
204228
preamble.append('%s %s' % (self._prefix, opt))
205229
else:
@@ -591,6 +615,10 @@ def is_available(self):
591615
return all([self._states == {'IDLE'}, self._partitions,
592616
self._active_features, self._states])
593617

618+
def under_maintenance(self):
619+
return all([self._states == {'MAINT'}, self._partitions,
620+
self._active_features, self._states])
621+
594622
def is_down(self):
595623
return bool({'DOWN', 'DRAIN', 'MAINT', 'NO_RESPOND'} & self._states)
596624

reframe/frontend/cli.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ def main():
260260
)
261261
run_options.add_argument(
262262
'--flex-alloc-nodes', action='store',
263-
dest='flex_alloc_nodes', metavar='{all|idle|NUM}', default=None,
263+
dest='flex_alloc_nodes', metavar='{all|idle|maint|NUM}', default=None,
264264
help='Set strategy for the flexible node allocation (default: "idle").'
265265
)
266266
env_options.add_argument(
@@ -656,7 +656,8 @@ def print_infoline(param, value):
656656
if sched_flex_alloc_nodes <= 0:
657657
raise ConfigError(errmsg.format(options.flex_alloc_nodes))
658658
except ValueError:
659-
if not options.flex_alloc_nodes.casefold() in {'idle', 'all'}:
659+
if (not options.flex_alloc_nodes.casefold() in
660+
{'idle', 'all', 'maint'}):
660661
raise ConfigError(
661662
errmsg.format(options.flex_alloc_nodes)) from None
662663

unittests/test_schedulers.py

Lines changed: 89 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def exec_ctx(temp_runtime, scheduler):
6767
next(rt)
6868
if scheduler.registered_name == 'squeue':
6969
# slurm backend fulfills the functionality of the squeue backend, so
70-
# if squeue is not configured, use slurrm instead
70+
# if squeue is not configured, use slurm instead
7171
partition = (fixtures.partition_by_scheduler('squeue') or
7272
fixtures.partition_by_scheduler('slurm'))
7373
else:
@@ -370,6 +370,40 @@ def test_no_empty_lines_in_preamble(minimal_job):
370370
assert line != ''
371371

372372

373+
def test_combined_access_constraint(make_job, slurm_only):
374+
job = make_job(sched_access=['--constraint=c1'])
375+
job.options = ['-C c2,c3']
376+
prepare_job(job)
377+
with open(job.script_filename) as fp:
378+
script_content = fp.read()
379+
380+
assert re.search(r'(?m)--constraint=c1,c2,c3$', script_content)
381+
assert re.search(r'(?m)--constraint=(c1|c2,c3)$', script_content) is None
382+
383+
384+
def test_combined_access_multiple_constraints(make_job, slurm_only):
385+
job = make_job(sched_access=['--constraint=c1'])
386+
job.options = ['--constraint=c2', '-C c3']
387+
prepare_job(job)
388+
with open(job.script_filename) as fp:
389+
script_content = fp.read()
390+
391+
assert re.search(r'(?m)--constraint=c1,c3$', script_content)
392+
assert re.search(r'(?m)--constraint=(c1|c2|c3)$', script_content) is None
393+
394+
395+
def test_combined_access_verbatim_constraint(make_job, slurm_only):
396+
job = make_job(sched_access=['--constraint=c1'])
397+
job.options = ['#SBATCH --constraint=c2', '#SBATCH -C c3']
398+
prepare_job(job)
399+
with open(job.script_filename) as fp:
400+
script_content = fp.read()
401+
402+
assert re.search(r'(?m)--constraint=c1$', script_content)
403+
assert re.search(r'(?m)^#SBATCH --constraint=c2$', script_content)
404+
assert re.search(r'(?m)^#SBATCH -C c3$', script_content)
405+
406+
373407
def test_guess_num_tasks(minimal_job, scheduler):
374408
minimal_job.num_tasks = 0
375409
if scheduler.registered_name == 'local':
@@ -613,6 +647,24 @@ def slurm_nodes():
613647
'ExtSensorsTemp=n/s Reason=Foo/ '
614648
'failed [reframe_user@01 Jan 2018]',
615649

650+
'NodeName=nid00006 Arch=x86_64 CoresPerSocket=12 '
651+
'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 '
652+
'AvailableFeatures=f6 ActiveFeatures=f6 '
653+
'Gres=gpu_mem:16280,gpu:1 NodeAddr=nid00006'
654+
'NodeHostName=nid00006 Version=10.00 OS=Linux '
655+
'RealMemory=32220 AllocMem=0 FreeMem=10000 '
656+
'Sockets=1 Boards=1 State=MAINT '
657+
'ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A '
658+
'MCS_label=N/A Partitions=p4 '
659+
'BootTime=01 Jan 2018 '
660+
'SlurmdStartTime=01 Jan 2018 '
661+
'CfgTRES=cpu=24,mem=32220M '
662+
'AllocTRES= CapWatts=n/a CurrentWatts=100 '
663+
'LowestJoules=100000000 ConsumedJoules=0 '
664+
'ExtSensorsJoules=n/s ExtSensorsWatts=0 '
665+
'ExtSensorsTemp=n/s Reason=Foo/ '
666+
'failed [reframe_user@01 Jan 2018]',
667+
616668
'Node invalid_node2 not found']
617669

618670

@@ -827,6 +879,13 @@ def test_flex_alloc_not_enough_idle_nodes(make_flexible_job):
827879
prepare_job(job)
828880

829881

882+
def test_flex_alloc_maintenance_nodes(make_flexible_job):
883+
job = make_flexible_job('maint')
884+
job.options = ['--partition=p4']
885+
prepare_job(job)
886+
assert job.num_tasks == 4
887+
888+
830889
def test_flex_alloc_not_enough_nodes_constraint_partition(make_flexible_job):
831890
job = make_flexible_job('all')
832891
job.options = ['-C f1,f2', '--partition=p1,p2']
@@ -934,6 +993,29 @@ def slurm_node_nopart():
934993
)
935994

936995

996+
@pytest.fixture
997+
def slurm_node_maintenance():
998+
return _SlurmNode(
999+
'NodeName=nid00006 Arch=x86_64 CoresPerSocket=12 '
1000+
'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 '
1001+
'AvailableFeatures=f6 ActiveFeatures=f6 '
1002+
'Gres=gpu_mem:16280,gpu:1 NodeAddr=nid00006'
1003+
'NodeHostName=nid00006 Version=10.00 OS=Linux '
1004+
'RealMemory=32220 AllocMem=0 FreeMem=10000 '
1005+
'Sockets=1 Boards=1 State=MAINT '
1006+
'ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A '
1007+
'MCS_label=N/A Partitions=p4 '
1008+
'BootTime=01 Jan 2018 '
1009+
'SlurmdStartTime=01 Jan 2018 '
1010+
'CfgTRES=cpu=24,mem=32220M '
1011+
'AllocTRES= CapWatts=n/a CurrentWatts=100 '
1012+
'LowestJoules=100000000 ConsumedJoules=0 '
1013+
'ExtSensorsJoules=n/s ExtSensorsWatts=0 '
1014+
'ExtSensorsTemp=n/s Reason=Foo/ '
1015+
'failed [reframe_user@01 Jan 2018]'
1016+
)
1017+
1018+
9371019
def test_slurm_node_noname():
9381020
with pytest.raises(JobError):
9391021
_SlurmNode(
@@ -1006,22 +1088,9 @@ def test_slurm_node_is_down(slurm_node_allocated,
10061088
assert slurm_node_nopart.is_down()
10071089

10081090

1009-
class TestSlurmNode:
1010-
def setUp(self):
1011-
idle_node_description = (
1012-
)
1013-
1014-
idle_drained_node_description = (
1015-
)
1016-
1017-
no_partition_node_description = (
1018-
)
1019-
1020-
self.no_name_node_description = (
1021-
)
1022-
1023-
self.allocated_node = _SlurmNode(allocated_node_description)
1024-
self.allocated_node_copy = _SlurmNode(allocated_node_description)
1025-
self.idle_node = _SlurmNode(idle_node_description)
1026-
self.idle_drained = _SlurmNode(idle_drained_node_description)
1027-
self.no_partition_node = _SlurmNode(no_partition_node_description)
1091+
def test_slurm_node_under_maintenance(slurm_node_allocated,
1092+
slurm_node_idle,
1093+
slurm_node_maintenance):
1094+
assert not slurm_node_allocated.under_maintenance()
1095+
assert not slurm_node_idle.under_maintenance()
1096+
assert slurm_node_maintenance.under_maintenance()

0 commit comments

Comments
 (0)