diff --git a/docs/manpage.rst b/docs/manpage.rst index a5c76db856..d993f3b28f 100644 --- a/docs/manpage.rst +++ b/docs/manpage.rst @@ -361,19 +361,23 @@ When allocating nodes automatically, ReFrame will take into account all node lim Nodes from this pool are allocated according to different policies. If no node can be selected, the test will be marked as a failure with an appropriate message. -.. option:: --flex-alloc-nodes[=POLICY] +.. option:: --flex-alloc-nodes=POLICY Set the flexible node allocation policy. Available values are the following: - ``all``: Flexible tests will be assigned as many tasks as needed in order to span over *all* the nodes of the node pool. - - ``idle``: Flexible tests will be assigned as many tasks as needed in order to span over the *idle* nodes of the node pool. + - ``STATE``: Flexible tests will be assigned as many tasks as needed in order to span over the nodes that are currently in state ``STATE``. Querying of the node state and submission of the test job are two separate steps not executed atomically. - It is therefore possible that the number of tasks assigned does not correspond to the actual idle nodes. + It is therefore possible that the number of tasks assigned does not correspond to the actual nodes in the given state. - This is the default policy. + If this option is not specified, the default allocation policy for flexible tests is ``idle``. - Any positive integer: Flexible tests will be assigned as many tasks as needed in order to span over the specified number of nodes from the node pool. + .. versionchanged:: 3.1 + It is now possible to pass an arbitrary node state as a flexible node allocation parameter. 
+ + --------------------------------------- Options controlling ReFrame environment --------------------------------------- diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index 312d1f6267..763b960509 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -349,17 +349,18 @@ def guess_num_tasks(self): return self.sched_flex_alloc_nodes * num_tasks_per_node available_nodes = self.scheduler.allnodes() - getlogger().debug('flex_alloc_nodes: total available nodes %s ' % + getlogger().debug('flex_alloc_nodes: total available nodes: %s ' % len(available_nodes)) # Try to guess the number of tasks now available_nodes = self.scheduler.filternodes(self, available_nodes) - if self.sched_flex_alloc_nodes == 'idle': + if self.sched_flex_alloc_nodes.casefold() != 'all': available_nodes = {n for n in available_nodes - if n.is_available()} + if n.in_state(self.sched_flex_alloc_nodes)} getlogger().debug( - 'flex_alloc_nodes: selecting idle nodes: ' - 'available nodes now: %s' % len(available_nodes) + f'flex_alloc_nodes: selecting nodes in state ' + f'{self.sched_flex_alloc_nodes!r}: ' + f'available nodes now: {len(available_nodes)}' ) return len(available_nodes) * num_tasks_per_node @@ -398,5 +399,10 @@ class Node(abc.ABC): ''' @abc.abstractmethod - def is_available(self): - '''Return ``True`` if this node is available, ``False`` otherwise.''' + def in_state(self, state): + '''Returns whether the node is in the given state. + + :arg state: The node state. + :returns: :class:`True` if the node's state matches the given one, + :class:`False` otherwise. 
+ ''' diff --git a/reframe/core/schedulers/local.py b/reframe/core/schedulers/local.py index 2f708f756f..354755a6a3 100644 --- a/reframe/core/schedulers/local.py +++ b/reframe/core/schedulers/local.py @@ -182,5 +182,5 @@ class _LocalNode(sched.Node): def __init__(self, name): self._name = name - def is_available(self): - return True + def in_state(self, state): + return state.casefold() == 'idle' diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index a89002c8e8..2e6db6581a 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -611,9 +611,9 @@ def __eq__(self, other): def __hash__(self): return hash(self.name) - def is_available(self): - return all([self._states == {'IDLE'}, self._partitions, - self._active_features, self._states]) + def in_state(self, state): + return all([self._states >= set(state.upper().split('+')), + self._partitions, self._active_features, self._states]) def is_down(self): return bool({'DOWN', 'DRAIN', 'MAINT', 'NO_RESPOND'} & self._states) diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index f84615c93f..d3123f72b6 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -260,7 +260,7 @@ def main(): ) run_options.add_argument( '--flex-alloc-nodes', action='store', - dest='flex_alloc_nodes', metavar='{all|idle|NUM}', default=None, + dest='flex_alloc_nodes', metavar='{all|STATE|NUM}', default=None, help='Set strategy for the flexible node allocation (default: "idle").' 
) env_options.add_argument( @@ -656,10 +656,6 @@ def print_infoline(param, value): if sched_flex_alloc_nodes <= 0: raise ConfigError(errmsg.format(options.flex_alloc_nodes)) except ValueError: - if not options.flex_alloc_nodes.casefold() in {'idle', 'all'}: - raise ConfigError( - errmsg.format(options.flex_alloc_nodes)) from None - sched_flex_alloc_nodes = options.flex_alloc_nodes exec_policy.sched_flex_alloc_nodes = sched_flex_alloc_nodes diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index 37d8ca9e9a..a52352bfd6 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -647,6 +647,24 @@ def slurm_nodes(): 'ExtSensorsTemp=n/s Reason=Foo/ ' 'failed [reframe_user@01 Jan 2018]', + 'NodeName=nid00006 Arch=x86_64 CoresPerSocket=12 ' + 'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 ' + 'AvailableFeatures=f6 ActiveFeatures=f6 ' + 'Gres=gpu_mem:16280,gpu:1 NodeAddr=nid00006' + 'NodeHostName=nid00006 Version=10.00 OS=Linux ' + 'RealMemory=32220 AllocMem=0 FreeMem=10000 ' + 'Sockets=1 Boards=1 State=MAINT ' + 'ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A ' + 'MCS_label=N/A Partitions=p4 ' + 'BootTime=01 Jan 2018 ' + 'SlurmdStartTime=01 Jan 2018 ' + 'CfgTRES=cpu=24,mem=32220M ' + 'AllocTRES= CapWatts=n/a CurrentWatts=100 ' + 'LowestJoules=100000000 ConsumedJoules=0 ' + 'ExtSensorsJoules=n/s ExtSensorsWatts=0 ' + 'ExtSensorsTemp=n/s Reason=Foo/ ' + 'failed [reframe_user@01 Jan 2018]', + 'Node invalid_node2 not found'] @@ -861,6 +879,13 @@ def test_flex_alloc_not_enough_idle_nodes(make_flexible_job): prepare_job(job) +def test_flex_alloc_maintenance_nodes(make_flexible_job): + job = make_flexible_job('maint') + job.options = ['--partition=p4'] + prepare_job(job) + assert job.num_tasks == 4 + + def test_flex_alloc_not_enough_nodes_constraint_partition(make_flexible_job): job = make_flexible_job('all') job.options = ['-C f1,f2', '--partition=p1,p2'] @@ -968,6 +993,29 @@ def slurm_node_nopart(): ) +@pytest.fixture +def 
slurm_node_maintenance(): + return _SlurmNode( + 'NodeName=nid00006 Arch=x86_64 CoresPerSocket=12 ' + 'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 ' + 'AvailableFeatures=f6 ActiveFeatures=f6 ' + 'Gres=gpu_mem:16280,gpu:1 NodeAddr=nid00006' + 'NodeHostName=nid00006 Version=10.00 OS=Linux ' + 'RealMemory=32220 AllocMem=0 FreeMem=10000 ' + 'Sockets=1 Boards=1 State=MAINT ' + 'ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A ' + 'MCS_label=N/A Partitions=p4 ' + 'BootTime=01 Jan 2018 ' + 'SlurmdStartTime=01 Jan 2018 ' + 'CfgTRES=cpu=24,mem=32220M ' + 'AllocTRES= CapWatts=n/a CurrentWatts=100 ' + 'LowestJoules=100000000 ConsumedJoules=0 ' + 'ExtSensorsJoules=n/s ExtSensorsWatts=0 ' + 'ExtSensorsTemp=n/s Reason=Foo/ ' + 'failed [reframe_user@01 Jan 2018]' + ) + + def test_slurm_node_noname(): with pytest.raises(JobError): _SlurmNode( @@ -1022,14 +1070,17 @@ def test_str(slurm_node_allocated): assert 'nid00001' == str(slurm_node_allocated) -def test_slurm_node_is_available(slurm_node_allocated, - slurm_node_idle, - slurm_node_drained, - slurm_node_nopart): - assert not slurm_node_allocated.is_available() - assert slurm_node_idle.is_available() - assert not slurm_node_drained.is_available() - assert not slurm_node_nopart.is_available() +def test_slurm_node_in_state(slurm_node_allocated, + slurm_node_idle, + slurm_node_drained, + slurm_node_nopart): + assert slurm_node_allocated.in_state('allocated') + assert slurm_node_idle.in_state('Idle') + assert slurm_node_drained.in_state('IDLE+Drain') + assert slurm_node_drained.in_state('IDLE') + assert slurm_node_drained.in_state('idle') + assert slurm_node_drained.in_state('DRAIN') + assert not slurm_node_nopart.in_state('IDLE') def test_slurm_node_is_down(slurm_node_allocated, @@ -1038,24 +1089,3 @@ def test_slurm_node_is_down(slurm_node_allocated, assert not slurm_node_allocated.is_down() assert not slurm_node_idle.is_down() assert slurm_node_nopart.is_down() - - -class TestSlurmNode: - def setUp(self): - idle_node_description = ( - 
) - - idle_drained_node_description = ( - ) - - no_partition_node_description = ( - ) - - self.no_name_node_description = ( - ) - - self.allocated_node = _SlurmNode(allocated_node_description) - self.allocated_node_copy = _SlurmNode(allocated_node_description) - self.idle_node = _SlurmNode(idle_node_description) - self.idle_drained = _SlurmNode(idle_drained_node_description) - self.no_partition_node = _SlurmNode(no_partition_node_description)