Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions docs/manpage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -361,19 +361,23 @@ When allocating nodes automatically, ReFrame will take into account all node lim
Nodes from this pool are allocated according to different policies.
If no node can be selected, the test will be marked as a failure with an appropriate message.

.. option:: --flex-alloc-nodes[=POLICY]
.. option:: --flex-alloc-nodes=POLICY

Set the flexible node allocation policy.
Available values are the following:

- ``all``: Flexible tests will be assigned as many tasks as needed in order to span over *all* the nodes of the node pool.
- ``idle``: Flexible tests will be assigned as many tasks as needed in order to span over the *idle* nodes of the node pool.
- ``STATE``: Flexible tests will be assigned as many tasks as needed in order to span over the nodes that are currently in state ``STATE``.
Querying of the node state and submission of the test job are two separate steps not executed atomically.
It is therefore possible that the number of tasks assigned does not correspond to the actual idle nodes.
It is therefore possible that the number of tasks assigned does not correspond to the actual number of nodes in the given state.

This is the default policy.
If this option is not specified, the default allocation policy for flexible tests is 'idle'.
- Any positive integer: Flexible tests will be assigned as many tasks as needed in order to span over the specified number of nodes from the node pool.

.. versionchanged:: 3.1
It is now possible to pass an arbitrary node state as a flexible node allocation parameter.


---------------------------------------
Options controlling ReFrame environment
---------------------------------------
Expand Down
20 changes: 13 additions & 7 deletions reframe/core/schedulers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,17 +349,18 @@ def guess_num_tasks(self):
return self.sched_flex_alloc_nodes * num_tasks_per_node

available_nodes = self.scheduler.allnodes()
getlogger().debug('flex_alloc_nodes: total available nodes %s ' %
getlogger().debug('flex_alloc_nodes: total available nodes: %s ' %
len(available_nodes))

# Try to guess the number of tasks now
available_nodes = self.scheduler.filternodes(self, available_nodes)
if self.sched_flex_alloc_nodes == 'idle':
if self.sched_flex_alloc_nodes.casefold() != 'all':
available_nodes = {n for n in available_nodes
if n.is_available()}
if n.in_state(self.sched_flex_alloc_nodes)}
getlogger().debug(
'flex_alloc_nodes: selecting idle nodes: '
'available nodes now: %s' % len(available_nodes)
f'flex_alloc_nodes: selecting nodes in state '
f'{self.sched_flex_alloc_nodes!r}: '
f'available nodes now: {len(available_nodes)}'
)

return len(available_nodes) * num_tasks_per_node
Expand Down Expand Up @@ -398,5 +399,10 @@ class Node(abc.ABC):
'''

@abc.abstractmethod
def is_available(self):
'''Return ``True`` if this node is available, ``False`` otherwise.'''
def in_state(self, state):
    '''Returns whether the node is in the given state.

    :arg state: The node state.
    :returns: :class:`True` if the node's state matches the given one,
        :class:`False` otherwise.
    '''
4 changes: 2 additions & 2 deletions reframe/core/schedulers/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,5 +182,5 @@ class _LocalNode(sched.Node):
def __init__(self, name):
    # Pseudo-node of the local scheduler: it only carries a name and
    # has no real state beyond being permanently "idle" (see in_state).
    self._name = name
def is_available(self):
return True
def in_state(self, state):
    '''Return whether this node is in the given state.

    The single local pseudo-node is always idle, so only the ``idle``
    state (matched case-insensitively) is ever reported.
    '''
    return 'idle' == state.casefold()
6 changes: 3 additions & 3 deletions reframe/core/schedulers/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,9 +611,9 @@ def __eq__(self, other):
def __hash__(self):
    # Hash on the node name only — presumably consistent with __eq__
    # (defined just above, outside this view); TODO confirm.
    return hash(self.name)

def is_available(self):
return all([self._states == {'IDLE'}, self._partitions,
self._active_features, self._states])
def in_state(self, state):
    '''Return whether this node matches the given state specification.

    ``state`` may combine several Slurm state flags with ``+``
    (e.g. ``IDLE+DRAIN``, case-insensitive); the node matches when its
    current state set contains every requested flag.  Nodes with no
    partitions, no active features or no recorded state never match.
    '''
    wanted = set(state.upper().split('+'))
    usable = self._partitions and self._active_features and self._states
    return bool(usable) and self._states >= wanted

def is_down(self):
    '''Return whether the node is unusable: down, drained, in
    maintenance or not responding.'''
    bad_states = {'DOWN', 'DRAIN', 'MAINT', 'NO_RESPOND'}
    return not bad_states.isdisjoint(self._states)
Expand Down
6 changes: 1 addition & 5 deletions reframe/frontend/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def main():
)
run_options.add_argument(
'--flex-alloc-nodes', action='store',
dest='flex_alloc_nodes', metavar='{all|idle|NUM}', default=None,
dest='flex_alloc_nodes', metavar='{all|STATE|NUM}', default=None,
help='Set strategy for the flexible node allocation (default: "idle").'
)
env_options.add_argument(
Expand Down Expand Up @@ -656,10 +656,6 @@ def print_infoline(param, value):
if sched_flex_alloc_nodes <= 0:
raise ConfigError(errmsg.format(options.flex_alloc_nodes))
except ValueError:
if not options.flex_alloc_nodes.casefold() in {'idle', 'all'}:
raise ConfigError(
errmsg.format(options.flex_alloc_nodes)) from None

sched_flex_alloc_nodes = options.flex_alloc_nodes

exec_policy.sched_flex_alloc_nodes = sched_flex_alloc_nodes
Expand Down
88 changes: 59 additions & 29 deletions unittests/test_schedulers.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,24 @@ def slurm_nodes():
'ExtSensorsTemp=n/s Reason=Foo/ '
'failed [reframe_user@01 Jan 2018]',

'NodeName=nid00006 Arch=x86_64 CoresPerSocket=12 '
'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 '
'AvailableFeatures=f6 ActiveFeatures=f6 '
'Gres=gpu_mem:16280,gpu:1 NodeAddr=nid00006'
'NodeHostName=nid00006 Version=10.00 OS=Linux '
'RealMemory=32220 AllocMem=0 FreeMem=10000 '
'Sockets=1 Boards=1 State=MAINT '
'ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A '
'MCS_label=N/A Partitions=p4 '
'BootTime=01 Jan 2018 '
'SlurmdStartTime=01 Jan 2018 '
'CfgTRES=cpu=24,mem=32220M '
'AllocTRES= CapWatts=n/a CurrentWatts=100 '
'LowestJoules=100000000 ConsumedJoules=0 '
'ExtSensorsJoules=n/s ExtSensorsWatts=0 '
'ExtSensorsTemp=n/s Reason=Foo/ '
'failed [reframe_user@01 Jan 2018]',

'Node invalid_node2 not found']


Expand Down Expand Up @@ -861,6 +879,13 @@ def test_flex_alloc_not_enough_idle_nodes(make_flexible_job):
prepare_job(job)


def test_flex_alloc_maintenance_nodes(make_flexible_job):
    # A flexible job may target nodes in an arbitrary state ('maint'
    # here, matched case-insensitively against Slurm's MAINT); only the
    # MAINT node in partition p4 should be counted.
    job = make_flexible_job('maint')
    job.options = ['--partition=p4']
    prepare_job(job)
    # Presumably 4 = tasks needed to span the single 24-CPU MAINT node;
    # TODO confirm against the job's num_tasks_per_node setting.
    assert job.num_tasks == 4


def test_flex_alloc_not_enough_nodes_constraint_partition(make_flexible_job):
job = make_flexible_job('all')
job.options = ['-C f1,f2', '--partition=p1,p2']
Expand Down Expand Up @@ -968,6 +993,29 @@ def slurm_node_nopart():
)


@pytest.fixture
def slurm_node_maintenance():
    '''A Slurm node in MAINT state (partition p4), built from an
    ``scontrol show node``-style record; used by the in_state() tests.'''
    return _SlurmNode(
        'NodeName=nid00006 Arch=x86_64 CoresPerSocket=12 '
        'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 '
        'AvailableFeatures=f6 ActiveFeatures=f6 '
        # FIX: trailing space added below — the original fragments glued
        # 'NodeAddr=nid00006' directly onto 'NodeHostName=...', producing
        # the single corrupt field 'NodeAddr=nid00006NodeHostName=nid00006'.
        'Gres=gpu_mem:16280,gpu:1 NodeAddr=nid00006 '
        'NodeHostName=nid00006 Version=10.00 OS=Linux '
        'RealMemory=32220 AllocMem=0 FreeMem=10000 '
        'Sockets=1 Boards=1 State=MAINT '
        'ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A '
        'MCS_label=N/A Partitions=p4 '
        'BootTime=01 Jan 2018 '
        'SlurmdStartTime=01 Jan 2018 '
        'CfgTRES=cpu=24,mem=32220M '
        'AllocTRES= CapWatts=n/a CurrentWatts=100 '
        'LowestJoules=100000000 ConsumedJoules=0 '
        'ExtSensorsJoules=n/s ExtSensorsWatts=0 '
        'ExtSensorsTemp=n/s Reason=Foo/ '
        'failed [reframe_user@01 Jan 2018]'
    )


def test_slurm_node_noname():
with pytest.raises(JobError):
_SlurmNode(
Expand Down Expand Up @@ -1022,14 +1070,17 @@ def test_str(slurm_node_allocated):
assert 'nid00001' == str(slurm_node_allocated)


def test_slurm_node_is_available(slurm_node_allocated,
slurm_node_idle,
slurm_node_drained,
slurm_node_nopart):
assert not slurm_node_allocated.is_available()
assert slurm_node_idle.is_available()
assert not slurm_node_drained.is_available()
assert not slurm_node_nopart.is_available()
def test_slurm_node_in_state(slurm_node_allocated,
                             slurm_node_idle,
                             slurm_node_drained,
                             slurm_node_nopart):
    # State matching is case-insensitive and '+' combines flags: a node
    # in IDLE+DRAIN matches 'IDLE+Drain', 'IDLE', 'idle' and 'DRAIN'.
    assert slurm_node_allocated.in_state('allocated')
    assert slurm_node_idle.in_state('Idle')
    assert slurm_node_drained.in_state('IDLE+Drain')
    assert slurm_node_drained.in_state('IDLE')
    assert slurm_node_drained.in_state('idle')
    assert slurm_node_drained.in_state('DRAIN')
    # A node without partitions matches no state at all (in_state()
    # also requires the node to have partitions/features/state set).
    assert not slurm_node_nopart.in_state('IDLE')


def test_slurm_node_is_down(slurm_node_allocated,
Expand All @@ -1038,24 +1089,3 @@ def test_slurm_node_is_down(slurm_node_allocated,
assert not slurm_node_allocated.is_down()
assert not slurm_node_idle.is_down()
assert slurm_node_nopart.is_down()


class TestSlurmNode:
    # NOTE(review): this class looks like dead/leftover code: the
    # ``*_description`` assignments are empty tuples and
    # ``allocated_node_description`` is never defined, so setUp() would
    # raise NameError if ever run.  The pytest-style fixtures and test
    # functions earlier in this file cover the same ground — consider
    # deleting this class entirely.
    def setUp(self):
        idle_node_description = (
        )

        idle_drained_node_description = (
        )

        no_partition_node_description = (
        )

        self.no_name_node_description = (
        )

        self.allocated_node = _SlurmNode(allocated_node_description)
        self.allocated_node_copy = _SlurmNode(allocated_node_description)
        self.idle_node = _SlurmNode(idle_node_description)
        self.idle_drained = _SlurmNode(idle_drained_node_description)
        self.no_partition_node = _SlurmNode(no_partition_node_description)