From 5b02f4aec5992428ec092a430c43394e70d4ae93 Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Tue, 26 Feb 2019 16:50:59 +0100
Subject: [PATCH 1/2] Fix bug ignoring a partition defined in 'access'

* `SlurmNode` attributes are set to empty sets if they cannot be
  retrieved from the corresponding description.
* Create abstract method `list_all_nodes` on the base `Job` class.
* Remove method `get_partition_nodes`.
* Create more unit tests for flexible allocation.
---
 reframe/core/schedulers/__init__.py | 13 +++++-----
 reframe/core/schedulers/local.py    |  2 +-
 reframe/core/schedulers/pbs.py      |  2 +-
 reframe/core/schedulers/slurm.py    | 17 +++++++------
 unittests/test_launchers.py         |  2 +-
 unittests/test_schedulers.py        | 37 +++++++++++++++++++++++++----
 6 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py
index 9bc08f36a4..4871ba3201 100644
--- a/reframe/core/schedulers/__init__.py
+++ b/reframe/core/schedulers/__init__.py
@@ -265,12 +265,13 @@ def guess_num_tasks(self):
 
             return self.sched_flex_alloc_tasks
 
-        available_nodes = self.get_partition_nodes()
-        getlogger().debug('flex_alloc_tasks: total available nodes in current '
-                          'virtual partition: %s' % len(available_nodes))
+        available_nodes = self.list_all_nodes()
+        getlogger().debug('flex_alloc_tasks: total available nodes %s '
+                          % len(available_nodes))
 
         # Try to guess the number of tasks now
-        available_nodes = self.filter_nodes(available_nodes, self.options)
+        available_nodes = self.filter_nodes(available_nodes,
+                                            self.sched_access + self.options)
         if self.sched_flex_alloc_tasks == 'idle':
             available_nodes = {n for n in available_nodes
@@ -284,8 +285,8 @@ def guess_num_tasks(self):
         return num_tasks
 
     @abc.abstractmethod
-    def get_partition_nodes(self):
-        # Get all nodes of the current virtual partition
+    def list_all_nodes(self):
+        # Lists all the available nodes
         pass
 
     @abc.abstractmethod
diff --git a/reframe/core/schedulers/local.py b/reframe/core/schedulers/local.py
index 47393195ff..ca3149ddc5 100644
--- a/reframe/core/schedulers/local.py
+++ b/reframe/core/schedulers/local.py
@@ -60,7 +60,7 @@ def submit(self):
     def emit_preamble(self):
         return []
 
-    def get_partition_nodes(self):
+    def list_all_nodes(self):
         raise NotImplementedError(
             'local scheduler does not support listing of available nodes')
 
diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py
index 7fafc7188e..8dbc40e678 100644
--- a/reframe/core/schedulers/pbs.py
+++ b/reframe/core/schedulers/pbs.py
@@ -88,7 +88,7 @@ def emit_preamble(self):
         preamble.append('cd %s' % self.workdir)
         return preamble
 
-    def get_partition_nodes(self):
+    def list_all_nodes(self):
         raise NotImplementedError('pbs backend does not support node listing')
 
     def filter_nodes(self, nodes, options):
diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py
index a0c9f5620a..46540ef768 100644
--- a/reframe/core/schedulers/slurm.py
+++ b/reframe/core/schedulers/slurm.py
@@ -152,7 +152,7 @@ def submit(self):
 
         self._jobid = int(jobid_match.group('jobid'))
 
-    def _get_all_nodes(self):
+    def list_all_nodes(self):
         try:
             completed = _run_strict('scontrol -a show -o nodes')
         except SpawnedProcessError as e:
@@ -170,10 +170,6 @@ def _get_default_partition(self):
 
         return None
 
-    def get_partition_nodes(self):
-        nodes = self._get_all_nodes()
-        return self.filter_nodes(nodes, self.sched_access)
-
     def filter_nodes(self, nodes, options):
         option_parser = ArgumentParser()
         option_parser.add_argument('--reservation')
@@ -199,6 +195,8 @@ def filter_nodes(self, nodes, options):
         else:
             default_partition = self._get_default_partition()
             partitions = {default_partition} if default_partition else set()
+            getlogger().debug('flex_alloc_tasks: default partition: %s' %
+                              default_partition)
 
         nodes = {n for n in nodes if n.partitions >= partitions}
         getlogger().debug(
@@ -217,7 +215,7 @@ def filter_nodes(self, nodes, options):
             nodes &= self._get_nodes_by_name(nodelist)
             getlogger().debug(
                 'flex_alloc_tasks: filtering nodes by nodelist: %s '
-                'availablenodes now: %s' % (nodelist, len(nodes)))
+                'available nodes now: %s' % (nodelist, len(nodes)))
 
         if exclude_nodes:
             exclude_nodes = exclude_nodes.strip()
@@ -431,10 +429,11 @@ def __init__(self, node_descr):
             raise JobError('could not extract NodeName from node description')
 
         self._partitions = self._extract_attribute(
-            'Partitions', node_descr, sep=',')
+            'Partitions', node_descr, sep=',') or set()
         self._active_features = self._extract_attribute(
-            'ActiveFeatures', node_descr, sep=',')
-        self._states = self._extract_attribute('State', node_descr, sep='+')
+            'ActiveFeatures', node_descr, sep=',') or set()
+        self._states = self._extract_attribute(
+            'State', node_descr, sep='+') or set()
 
     def __eq__(self, other):
         if not isinstance(other, type(self)):
diff --git a/unittests/test_launchers.py b/unittests/test_launchers.py
index a207ce294e..ecbd1e4d35 100644
--- a/unittests/test_launchers.py
+++ b/unittests/test_launchers.py
@@ -22,7 +22,7 @@ def cancel(self):
     def finished(self):
         pass
 
-    def get_partition_nodes(self):
+    def list_all_nodes(self):
         pass
 
     def filter_nodes(self, nodes):
diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py
index cbab7176e6..779a41b4be 100644
--- a/unittests/test_schedulers.py
+++ b/unittests/test_schedulers.py
@@ -393,9 +393,9 @@ def test_cancel(self):
     def test_guess_num_tasks(self):
         self.testjob._num_tasks = 0
         self.testjob._sched_flex_alloc_tasks = 'all'
-        # monkey patch `get_partition_nodes()` to simulate extraction of
+        # monkey patch `list_all_nodes()` to simulate extraction of
         # slurm nodes through the use of `scontrol show`
-        self.testjob.get_partition_nodes = lambda: set()
+        self.testjob.list_all_nodes = lambda: set()
         # monkey patch `_get_default_partition()` to simulate extraction
         # of the default partition through the use of `scontrol show`
         self.testjob._get_default_partition = lambda: 'pdef'
@@ -566,6 +566,23 @@ def create_dummy_nodes(obj):
             'AllocTRES= CapWatts=n/a CurrentWatts=100 '
             'LowestJoules=100000000 ConsumedJoules=0 '
             'ExtSensorsJoules=n/s ExtSensorsWatts=0 '
+            'ExtSensorsTemp=n/s Reason=Foo/ ',
+
+            'NodeName=nid00005 Arch=x86_64 CoresPerSocket=12 '
+            'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 '
+            'AvailableFeatures=f5 ActiveFeatures=f5 '
+            'Gres=gpu_mem:16280,gpu:1 NodeAddr=nid00003 '
+            'NodeHostName=nid00003 Version=10.00 OS=Linux '
+            'RealMemory=32220 AllocMem=0 FreeMem=10000 '
+            'Sockets=1 Boards=1 State=ALLOCATED '
+            'ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A '
+            'MCS_label=N/A Partitions=p1,p3 '
+            'BootTime=01 Jan 2018 '
+            'SlurmdStartTime=01 Jan 2018 '
+            'CfgTRES=cpu=24,mem=32220M '
+            'AllocTRES= CapWatts=n/a CurrentWatts=100 '
+            'LowestJoules=100000000 ConsumedJoules=0 '
+            'ExtSensorsJoules=n/s ExtSensorsWatts=0 '
             'ExtSensorsTemp=n/s Reason=Foo/ '
             'failed [reframe_user@01 Jan 2018]']
 
@@ -589,9 +606,9 @@ def setUp(self):
             stdout=os.path.join(self.workdir, 'testjob.out'),
             stderr=os.path.join(self.workdir, 'testjob.err')
         )
-        # monkey patch `_get_all_nodes` to simulate extraction of
+        # monkey patch `list_all_nodes` to simulate extraction of
         # slurm nodes through the use of `scontrol show`
-        self.testjob._get_all_nodes = self.create_dummy_nodes
+        self.testjob.list_all_nodes = self.create_dummy_nodes
         # monkey patch `_get_default_partition` to simulate extraction
         # of the default partition
         self.testjob._get_default_partition = lambda: 'pdef'
@@ -632,6 +649,16 @@ def test_sched_access_constraint_partition(self):
         self.prepare_job()
         self.assertEqual(self.testjob.num_tasks, 4)
 
+    def test_sched_access_partition(self):
+        self.testjob._sched_access = ['--partition=p1']
+        self.prepare_job()
+        self.assertEqual(self.testjob.num_tasks, 16)
+
+    def test_default_partition_all(self):
+        self.testjob._sched_flex_alloc_tasks = 'all'
+        self.prepare_job()
+        self.assertEqual(self.testjob.num_tasks, 16)
+
     def test_constraint_idle(self):
         self.testjob._sched_flex_alloc_tasks = 'idle'
         self.testjob.options = ['--constraint=f1']
@@ -876,7 +903,7 @@ def test_attributes(self):
         self.assertEqual(self.allocated_node.partitions, {'p1', 'p2'})
         self.assertEqual(self.allocated_node.active_features, {'f1', 'f2'})
         self.assertEqual(self.no_partition_node.name, 'nid00004')
-        self.assertEqual(self.no_partition_node.partitions, None)
+        self.assertEqual(self.no_partition_node.partitions, set())
         self.assertEqual(self.no_partition_node.active_features, {'f1', 'f2'})
 
     def test_str(self):

From a09ffb40d378e29d5dbe5345b5c75d69d373e12d Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Thu, 28 Feb 2019 15:30:15 +0100
Subject: [PATCH 2/2] Address PR comments

---
 reframe/core/schedulers/__init__.py | 10 +++++-----
 reframe/core/schedulers/local.py    |  2 +-
 reframe/core/schedulers/pbs.py      |  2 +-
 reframe/core/schedulers/slurm.py    |  2 +-
 unittests/test_launchers.py         |  2 +-
 unittests/test_schedulers.py        |  8 ++++----
 6 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py
index 4871ba3201..27323cdc3d 100644
--- a/reframe/core/schedulers/__init__.py
+++ b/reframe/core/schedulers/__init__.py
@@ -265,9 +265,9 @@ def guess_num_tasks(self):
 
             return self.sched_flex_alloc_tasks
 
-        available_nodes = self.list_all_nodes()
-        getlogger().debug('flex_alloc_tasks: total available nodes %s '
-                          % len(available_nodes))
+        available_nodes = self.get_all_nodes()
+        getlogger().debug('flex_alloc_tasks: total available nodes %s ' %
+                          len(available_nodes))
 
         # Try to guess the number of tasks now
         available_nodes = self.filter_nodes(available_nodes,
@@ -285,8 +285,8 @@ def guess_num_tasks(self):
         return num_tasks
 
     @abc.abstractmethod
-    def list_all_nodes(self):
-        # Lists all the available nodes
+    def get_all_nodes(self):
+        # Gets all the available nodes
         pass
 
     @abc.abstractmethod
diff --git a/reframe/core/schedulers/local.py b/reframe/core/schedulers/local.py
index ca3149ddc5..b2d4ac0612 100644
--- a/reframe/core/schedulers/local.py
+++ b/reframe/core/schedulers/local.py
@@ -60,7 +60,7 @@ def submit(self):
     def emit_preamble(self):
         return []
 
-    def list_all_nodes(self):
+    def get_all_nodes(self):
         raise NotImplementedError(
             'local scheduler does not support listing of available nodes')
 
diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py
index 8dbc40e678..632bcbc412 100644
--- a/reframe/core/schedulers/pbs.py
+++ b/reframe/core/schedulers/pbs.py
@@ -88,7 +88,7 @@ def emit_preamble(self):
         preamble.append('cd %s' % self.workdir)
         return preamble
 
-    def list_all_nodes(self):
+    def get_all_nodes(self):
         raise NotImplementedError('pbs backend does not support node listing')
 
     def filter_nodes(self, nodes, options):
diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py
index 46540ef768..4612f5a64f 100644
--- a/reframe/core/schedulers/slurm.py
+++ b/reframe/core/schedulers/slurm.py
@@ -152,7 +152,7 @@ def submit(self):
 
         self._jobid = int(jobid_match.group('jobid'))
 
-    def list_all_nodes(self):
+    def get_all_nodes(self):
         try:
             completed = _run_strict('scontrol -a show -o nodes')
         except SpawnedProcessError as e:
diff --git a/unittests/test_launchers.py b/unittests/test_launchers.py
index ecbd1e4d35..f4847e4711 100644
--- a/unittests/test_launchers.py
+++ b/unittests/test_launchers.py
@@ -22,7 +22,7 @@ def cancel(self):
     def finished(self):
         pass
 
-    def list_all_nodes(self):
+    def get_all_nodes(self):
         pass
 
     def filter_nodes(self, nodes):
diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py
index 779a41b4be..3242ff3324 100644
--- a/unittests/test_schedulers.py
+++ b/unittests/test_schedulers.py
@@ -393,9 +393,9 @@ def test_cancel(self):
     def test_guess_num_tasks(self):
         self.testjob._num_tasks = 0
         self.testjob._sched_flex_alloc_tasks = 'all'
-        # monkey patch `list_all_nodes()` to simulate extraction of
+        # monkey patch `get_all_nodes()` to simulate extraction of
         # slurm nodes through the use of `scontrol show`
-        self.testjob.list_all_nodes = lambda: set()
+        self.testjob.get_all_nodes = lambda: set()
         # monkey patch `_get_default_partition()` to simulate extraction
         # of the default partition through the use of `scontrol show`
         self.testjob._get_default_partition = lambda: 'pdef'
@@ -606,9 +606,9 @@ def setUp(self):
             stdout=os.path.join(self.workdir, 'testjob.out'),
             stderr=os.path.join(self.workdir, 'testjob.err')
         )
-        # monkey patch `list_all_nodes` to simulate extraction of
+        # monkey patch `get_all_nodes` to simulate extraction of
         # slurm nodes through the use of `scontrol show`
-        self.testjob.list_all_nodes = self.create_dummy_nodes
+        self.testjob.get_all_nodes = self.create_dummy_nodes
         # monkey patch `_get_default_partition` to simulate extraction
         # of the default partition
         self.testjob._get_default_partition = lambda: 'pdef'