From 51a03f57bf2db25f3f8b5e572fb1286415a3bd08 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 5 Dec 2019 19:18:04 +0100 Subject: [PATCH 1/2] Fix crash of local scheduler with flexible allocation This introduces an API for a Node, which the Job talks to for determining whether the node is available or not. Different backends implement that differently. Also made private the `SlurmNode` class and the `create_nodes()` function. --- reframe/core/schedulers/__init__.py | 6 ++++++ reframe/core/schedulers/local.py | 12 ++++++++++-- reframe/core/schedulers/slurm.py | 12 ++++++------ unittests/resources/checks/hellocheck.py | 1 + unittests/test_schedulers.py | 24 +++++++++++++++--------- 5 files changed, 38 insertions(+), 17 deletions(-) diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index 3f4bd4fa32..622822b642 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -334,3 +334,9 @@ def finished(self): raise JobNotStartedError('cannot poll an unstarted job') return self.scheduler.finished(self) + + +class Node(abc.ABC): + @abc.abstractmethod + def is_available(self): + '''Return ``True`` if this node is available, ``False`` otherwise.''' diff --git a/reframe/core/schedulers/local.py b/reframe/core/schedulers/local.py index 4052c7b6b5..561ebf2ca0 100644 --- a/reframe/core/schedulers/local.py +++ b/reframe/core/schedulers/local.py @@ -56,10 +56,10 @@ def emit_preamble(self, job): return [] def allnodes(self): - return [socket.gethostname()] + return [_LocalNode(socket.gethostname())] def filternodes(self, job, nodes): - return [socket.gethostname()] + return [_LocalNode(socket.gethostname())] def _kill_all(self, job): '''Send SIGKILL to all the processes of the spawned job.''' @@ -169,3 +169,11 @@ def finished(self, job): return False return True + + +class _LocalNode(sched.Node): + def __init__(self, name): + self._name = name + + def is_available(self): + return True diff --git 
a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 4d846bcc6f..37e729f6dd 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -166,7 +166,7 @@ def allnodes(self): raise JobError('could not retrieve node information') from e node_descriptions = completed.stdout.splitlines() - return create_nodes(node_descriptions) + return _create_nodes(node_descriptions) def _get_default_partition(self): completed = _run_strict('scontrol -a show -o partitions') @@ -272,13 +272,13 @@ def _get_reservation_nodes(self, reservation): completed = _run_strict('scontrol -a show -o %s' % reservation_nodes) node_descriptions = completed.stdout.splitlines() - return create_nodes(node_descriptions) + return _create_nodes(node_descriptions) def _get_nodes_by_name(self, nodespec): completed = os_ext.run_command('scontrol -a show -o node %s' % nodespec) node_descriptions = completed.stdout.splitlines() - return create_nodes(node_descriptions) + return _create_nodes(node_descriptions) def _set_nodelist(self, job, nodespec): if job.nodelist is not None: @@ -485,16 +485,16 @@ def cancel(self, job): self._cancelled = True -def create_nodes(descriptions): +def _create_nodes(descriptions): nodes = set() for descr in descriptions: with suppress(JobError): - nodes.add(SlurmNode(descr)) + nodes.add(_SlurmNode(descr)) return nodes -class SlurmNode: +class _SlurmNode(sched.Node): '''Class representing a Slurm node.''' def __init__(self, node_descr): diff --git a/unittests/resources/checks/hellocheck.py b/unittests/resources/checks/hellocheck.py index 62abd1d0f5..7956eb8264 100644 --- a/unittests/resources/checks/hellocheck.py +++ b/unittests/resources/checks/hellocheck.py @@ -9,6 +9,7 @@ def __init__(self): self.descr = 'C Hello World test' # All available systems are supported + self.num_tasks = 0 self.valid_systems = ['*'] self.valid_prog_environs = ['*'] self.sourcepath = 'hello.c' diff --git a/unittests/test_schedulers.py 
b/unittests/test_schedulers.py index a061f2d8ef..47b2508270 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -16,7 +16,7 @@ from reframe.core.launchers.registry import getlauncher from reframe.core.schedulers import Job from reframe.core.schedulers.registry import getscheduler -from reframe.core.schedulers.slurm import SlurmNode, create_nodes +from reframe.core.schedulers.slurm import _SlurmNode, _create_nodes class _TestJob(abc.ABC): @@ -304,8 +304,14 @@ def test_cancel_term_ignore(self): self.assertProcessDied(sleep_pid) def test_guess_num_tasks(self): + # We want to trigger bug #1087 (Github), that's why we set allocation + # policy to idle. self.testjob.num_tasks = 0 - assert self.testjob.guess_num_tasks() == 1 + self.testjob._sched_flex_alloc_nodes = 'idle' + self.prepare() + self.testjob.submit() + self.testjob.wait() + assert self.testjob.num_tasks == 1 class TestSlurmJob(_TestJob, unittest.TestCase): @@ -611,7 +617,7 @@ def create_dummy_nodes(obj): 'Node invalid_node2 not found'] - return create_nodes(node_descriptions) + return _create_nodes(node_descriptions) def create_reservation_nodes(self, res): return {n for n in self.testjob.scheduler.allnodes() @@ -906,15 +912,15 @@ def setUp(self): 'failed [reframe_user@01 Jan 2018]' ) - self.allocated_node = SlurmNode(allocated_node_description) - self.allocated_node_copy = SlurmNode(allocated_node_description) - self.idle_node = SlurmNode(idle_node_description) - self.idle_drained = SlurmNode(idle_drained_node_description) - self.no_partition_node = SlurmNode(no_partition_node_description) + self.allocated_node = _SlurmNode(allocated_node_description) + self.allocated_node_copy = _SlurmNode(allocated_node_description) + self.idle_node = _SlurmNode(idle_node_description) + self.idle_drained = _SlurmNode(idle_drained_node_description) + self.no_partition_node = _SlurmNode(no_partition_node_description) def test_no_node_name(self): with self.assertRaises(JobError): - 
SlurmNode(self.no_name_node_description) + _SlurmNode(self.no_name_node_description) def test_states(self): self.assertEqual(self.allocated_node.states, {'ALLOCATED'}) From 4f6044153665e55156e21f138d4bc2cdf796004f Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Thu, 5 Dec 2019 23:14:12 +0100 Subject: [PATCH 2/2] Fix unit tests --- unittests/resources/checks/hellocheck.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unittests/resources/checks/hellocheck.py b/unittests/resources/checks/hellocheck.py index 7956eb8264..62abd1d0f5 100644 --- a/unittests/resources/checks/hellocheck.py +++ b/unittests/resources/checks/hellocheck.py @@ -9,7 +9,6 @@ def __init__(self): self.descr = 'C Hello World test' # All available systems are supported - self.num_tasks = 0 self.valid_systems = ['*'] self.valid_prog_environs = ['*'] self.sourcepath = 'hello.c'