From c323ba31063d782f2beaa4fb66a4943c19018936 Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Wed, 20 Nov 2019 13:24:21 +0100
Subject: [PATCH 1/3] Ignore invalid Slurm nodes in flexible alloc

* Separate the retrieval of Slurm node descriptions from the creation of
  `SlurmNode` instances when getting all nodes in flexible task
  allocation.
* Make the necessary unit test changes to test for invalid node
  descriptions.
---
 reframe/core/schedulers/slurm.py | 16 +++++++++++++---
 unittests/test_schedulers.py     | 22 ++++++++++------------
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py
index b739585937..0f81c9d358 100644
--- a/reframe/core/schedulers/slurm.py
+++ b/reframe/core/schedulers/slurm.py
@@ -4,6 +4,7 @@
 import re
 import time
 from argparse import ArgumentParser
+from contextlib import suppress
 from datetime import datetime
 
 import reframe.core.schedulers as sched
@@ -170,14 +171,23 @@ def submit(self):
 
         self._jobid = int(jobid_match.group('jobid'))
 
-    def get_all_nodes(self):
+    def get_all_node_descriptions(self):
         try:
             completed = _run_strict('scontrol -a show -o nodes')
         except SpawnedProcessError as e:
             raise JobError('could not retrieve node information') from e
 
-        node_descriptions = completed.stdout.splitlines()
-        return {SlurmNode(descr) for descr in node_descriptions}
+        return completed.stdout.splitlines()
+
+    def get_all_nodes(self):
+        node_descriptions = self.get_all_node_descriptions()
+        nodes = set()
+        for descr in node_descriptions:
+            with suppress(JobError):
+                slurm_node = SlurmNode(descr)
+                nodes.add(slurm_node)
+
+        return nodes
 
     def _get_default_partition(self):
         completed = _run_strict('scontrol -a show -o partitions')
diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py
index 68e247c392..f7958fe4e0 100644
--- a/unittests/test_schedulers.py
+++ b/unittests/test_schedulers.py
@@ -497,7 +497,7 @@ def test_submit_timelimit(self):
 
 
 class TestSlurmFlexibleNodeAllocation(unittest.TestCase):
-    def create_dummy_nodes(obj):
+    def create_dummy_node_descr(obj):
         node_descriptions = ['NodeName=nid00001 Arch=x86_64 CoresPerSocket=12 '
                              'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 '
                              'AvailableFeatures=f1,f2 ActiveFeatures=f1,f2 '
@@ -534,6 +534,8 @@ def create_dummy_nodes(obj):
                              'ExtSensorsTemp=n/s Reason=Foo/ '
                              'failed [reframe_user@01 Jan 2018]',
 
+                             'Node invalid_node1 not found',
+
                              'NodeName=nid00003 Arch=x86_64 CoresPerSocket=12 '
                              'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 '
                              'AvailableFeatures=f1,f3 ActiveFeatures=f1,f3 '
@@ -585,16 +587,14 @@ def create_dummy_nodes(obj):
                              'LowestJoules=100000000 ConsumedJoules=0 '
                              'ExtSensorsJoules=n/s ExtSensorsWatts=0 '
                              'ExtSensorsTemp=n/s Reason=Foo/ '
-                             'failed [reframe_user@01 Jan 2018]']
+                             'failed [reframe_user@01 Jan 2018]',
 
-        return {SlurmNode(desc) for desc in node_descriptions}
+                             'Node invalid_node2 not found']
 
-    def create_reservation_nodes(obj, res):
-        return {n for n in obj.create_dummy_nodes() if n.name != 'nid00001'}
+        return node_descriptions
 
-    def get_nodes_by_name(obj, node_names):
-        nodes = obj.create_dummy_nodes()
-        return {n for n in nodes if n.name in node_names}
+    def create_reservation_nodes(obj, res):
+        return {n for n in obj.testjob.get_all_nodes() if n.name != 'nid00001'}
 
     def setUp(self):
         self.workdir = tempfile.mkdtemp(dir='unittests')
@@ -607,9 +607,9 @@ def setUp(self):
             stdout=os.path.join(self.workdir, 'testjob.out'),
             stderr=os.path.join(self.workdir, 'testjob.err')
         )
-        # monkey patch `get_all_nodes` to simulate extraction of
+        # monkey patch `get_all_node_descriptions` to simulate extraction of
         # slurm nodes through the use of `scontrol show`
-        self.testjob.get_all_nodes = self.create_dummy_nodes
+        self.testjob.get_all_node_descriptions = self.create_dummy_node_descr
         # monkey patch `_get_default_partition` to simulate extraction
         # of the default partition
         self.testjob._get_default_partition = lambda: 'pdef'
@@ -736,14 +736,12 @@ def test_valid_reservation_option(self):
 
     def test_exclude_nodes_cmd(self):
         self.testjob._sched_access = ['--constraint=f1']
         self.testjob._sched_exclude_nodelist = 'nid00001'
-        self.testjob._get_nodes_by_name = self.get_nodes_by_name
         self.prepare_job()
         self.assertEqual(self.testjob.num_tasks, 8)
 
     def test_exclude_nodes_opt(self):
         self.testjob._sched_access = ['--constraint=f1']
         self.testjob.options = ['-x nid00001']
-        self.testjob._get_nodes_by_name = self.get_nodes_by_name
         self.prepare_job()
         self.assertEqual(self.testjob.num_tasks, 8)

From 639af9a9c1bf22a02fed70ea4286510ac4a02297 Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Wed, 20 Nov 2019 14:45:14 +0100
Subject: [PATCH 2/3] Get SlurmNode instances from descriptions via a
 staticmethod

---
 reframe/core/schedulers/slurm.py | 37 ++++++++++++++++---------------------
 unittests/test_schedulers.py     | 17 ++++++++++++-----
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py
index 0f81c9d358..374cfb5713 100644
--- a/reframe/core/schedulers/slurm.py
+++ b/reframe/core/schedulers/slurm.py
@@ -69,6 +69,17 @@ class SlurmJob(sched.Job):
     # standard job state polling using sacct.
     SACCT_SQUEUE_RATIO = 10
 
+    @staticmethod
+    def _get_nodes_from_description(descriptions):
+        nodes = set()
+        if descriptions:
+            for descr in descriptions:
+                with suppress(JobError):
+                    slurm_node = SlurmNode(descr)
+                    nodes.add(slurm_node)
+
+        return nodes
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._prefix = '#SBATCH'
@@ -171,23 +182,14 @@ def submit(self):
 
         self._jobid = int(jobid_match.group('jobid'))
 
-    def get_all_node_descriptions(self):
+    def get_all_nodes(self):
         try:
             completed = _run_strict('scontrol -a show -o nodes')
         except SpawnedProcessError as e:
             raise JobError('could not retrieve node information') from e
 
-        return completed.stdout.splitlines()
-
-    def get_all_nodes(self):
-        node_descriptions = self.get_all_node_descriptions()
-        nodes = set()
-        for descr in node_descriptions:
-            with suppress(JobError):
-                slurm_node = SlurmNode(descr)
-                nodes.add(slurm_node)
-
-        return nodes
+        node_descriptions = completed.stdout.splitlines()
+        return SlurmJob._get_nodes_from_description(node_descriptions)
 
     def _get_default_partition(self):
         completed = _run_strict('scontrol -a show -o partitions')
@@ -276,20 +278,13 @@ def _get_reservation_nodes(self, reservation):
 
         completed = _run_strict('scontrol -a show -o %s' % reservation_nodes)
         node_descriptions = completed.stdout.splitlines()
-        return {SlurmNode(descr) for descr in node_descriptions}
+        return SlurmJob._get_nodes_from_description(node_descriptions)
 
     def _get_nodes_by_name(self, nodespec):
         completed = os_ext.run_command('scontrol -a show -o node %s' %
                                        nodespec)
         node_descriptions = completed.stdout.splitlines()
-        nodes_avail = set()
-        for descr in node_descriptions:
-            try:
-                nodes_avail.add(SlurmNode(descr))
-            except JobError:
-                pass
-
-        return nodes_avail
+        return SlurmJob._get_nodes_from_description(node_descriptions)
 
     def _set_nodelist(self, nodespec):
         if self._nodelist is not None:
diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py
index f7958fe4e0..b099cdbb65 100644
--- a/unittests/test_schedulers.py
+++ b/unittests/test_schedulers.py
@@ -15,7 +15,7 @@
 from reframe.core.launchers.local import LocalLauncher
 from reframe.core.launchers.registry import getlauncher
 from reframe.core.schedulers.registry import getscheduler
-from reframe.core.schedulers.slurm import SlurmNode
+from reframe.core.schedulers.slurm import SlurmNode, SlurmJob
 
 
 class _TestJob(abc.ABC):
@@ -497,7 +497,7 @@ def test_submit_timelimit(self):
 
 
 class TestSlurmFlexibleNodeAllocation(unittest.TestCase):
-    def create_dummy_node_descr(obj):
+    def create_dummy_nodes(obj):
         node_descriptions = ['NodeName=nid00001 Arch=x86_64 CoresPerSocket=12 '
                              'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 '
                              'AvailableFeatures=f1,f2 ActiveFeatures=f1,f2 '
@@ -591,11 +591,14 @@ def create_dummy_node_descr(obj):
 
                              'Node invalid_node2 not found']
 
-        return node_descriptions
+        return SlurmJob._get_nodes_from_description(node_descriptions)
 
     def create_reservation_nodes(obj, res):
         return {n for n in obj.testjob.get_all_nodes() if n.name != 'nid00001'}
 
+    def create_dummy_nodes_by_name(obj, name):
+        return {n for n in obj.testjob.get_all_nodes() if n.name == name}
+
     def setUp(self):
         self.workdir = tempfile.mkdtemp(dir='unittests')
         slurm_scheduler = getscheduler('slurm')
@@ -607,9 +610,9 @@ def setUp(self):
             stdout=os.path.join(self.workdir, 'testjob.out'),
             stderr=os.path.join(self.workdir, 'testjob.err')
         )
-        # monkey patch `get_all_node_descriptions` to simulate extraction of
+        # monkey patch `get_all_nodes` to simulate extraction of
         # slurm nodes through the use of `scontrol show`
-        self.testjob.get_all_node_descriptions = self.create_dummy_node_descr
+        self.testjob.get_all_nodes = self.create_dummy_nodes
         # monkey patch `_get_default_partition` to simulate extraction
         # of the default partition
         self.testjob._get_default_partition = lambda: 'pdef'
@@ -736,12 +739,16 @@ def test_valid_reservation_option(self):
 
     def test_exclude_nodes_cmd(self):
         self.testjob._sched_access = ['--constraint=f1']
         self.testjob._sched_exclude_nodelist = 'nid00001'
+        # monkey patch `_get_nodes_by_name` to simulate extraction of
+        # slurm nodes by name through the use of `scontrol show`
+        self.testjob._get_nodes_by_name = self.create_dummy_nodes_by_name
         self.prepare_job()
         self.assertEqual(self.testjob.num_tasks, 8)
 
     def test_exclude_nodes_opt(self):
         self.testjob._sched_access = ['--constraint=f1']
         self.testjob.options = ['-x nid00001']
+        self.testjob._get_nodes_by_name = self.create_dummy_nodes_by_name
         self.prepare_job()
         self.assertEqual(self.testjob.num_tasks, 8)

From b935bd24a381dd7bde70b3f03358f15a47a6c0bd Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Thu, 21 Nov 2019 09:36:50 +0100
Subject: [PATCH 3/3] Address PR comments

---
 reframe/core/schedulers/slurm.py | 26 ++++++++++++--------------
 unittests/test_schedulers.py     |  4 ++--
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py
index 374cfb5713..f8b07de60c 100644
--- a/reframe/core/schedulers/slurm.py
+++ b/reframe/core/schedulers/slurm.py
@@ -69,17 +69,6 @@ class SlurmJob(sched.Job):
     # standard job state polling using sacct.
     SACCT_SQUEUE_RATIO = 10
 
-    @staticmethod
-    def _get_nodes_from_description(descriptions):
-        nodes = set()
-        if descriptions:
-            for descr in descriptions:
-                with suppress(JobError):
-                    slurm_node = SlurmNode(descr)
-                    nodes.add(slurm_node)
-
-        return nodes
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._prefix = '#SBATCH'
@@ -189,7 +178,7 @@ def get_all_nodes(self):
             raise JobError('could not retrieve node information') from e
 
         node_descriptions = completed.stdout.splitlines()
-        return SlurmJob._get_nodes_from_description(node_descriptions)
+        return create_nodes(node_descriptions)
 
     def _get_default_partition(self):
         completed = _run_strict('scontrol -a show -o partitions')
@@ -278,13 +267,13 @@ def _get_reservation_nodes(self, reservation):
 
         completed = _run_strict('scontrol -a show -o %s' % reservation_nodes)
         node_descriptions = completed.stdout.splitlines()
-        return SlurmJob._get_nodes_from_description(node_descriptions)
+        return create_nodes(node_descriptions)
 
     def _get_nodes_by_name(self, nodespec):
         completed = os_ext.run_command('scontrol -a show -o node %s' %
                                        nodespec)
         node_descriptions = completed.stdout.splitlines()
-        return SlurmJob._get_nodes_from_description(node_descriptions)
+        return create_nodes(node_descriptions)
 
     def _set_nodelist(self, nodespec):
         if self._nodelist is not None:
@@ -494,6 +483,15 @@ def cancel(self):
         self._cancelled = True
 
 
+def create_nodes(descriptions):
+    nodes = set()
+    for descr in descriptions:
+        with suppress(JobError):
+            nodes.add(SlurmNode(descr))
+
+    return nodes
+
+
 class SlurmNode:
     '''Class representing a Slurm node.'''
 
diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py
index b099cdbb65..7ab2ff9c27 100644
--- a/unittests/test_schedulers.py
+++ b/unittests/test_schedulers.py
@@ -15,7 +15,7 @@
 from reframe.core.launchers.local import LocalLauncher
 from reframe.core.launchers.registry import getlauncher
 from reframe.core.schedulers.registry import getscheduler
-from reframe.core.schedulers.slurm import SlurmNode, SlurmJob
+from reframe.core.schedulers.slurm import SlurmNode, create_nodes
 
 
 class _TestJob(abc.ABC):
@@ -591,7 +591,7 @@ def create_dummy_nodes(obj):
 
                              'Node invalid_node2 not found']
 
-        return SlurmJob._get_nodes_from_description(node_descriptions)
+        return create_nodes(node_descriptions)
 
     def create_reservation_nodes(obj, res):
         return {n for n in obj.testjob.get_all_nodes() if n.name != 'nid00001'}
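
A note on the pattern the series converges on: every `scontrol` output path is
funnelled through a single `contextlib.suppress(JobError)` filter, so error
lines such as 'Node invalid_node1 not found' are skipped instead of aborting
flexible node allocation. Below is a minimal, self-contained sketch of that
pattern that runs without ReFrame installed; `SchedulerError` and `FakeNode`
are hypothetical stand-ins for ReFrame's `JobError` and `SlurmNode`, and only
`create_nodes` mirrors a name that actually exists in the final patch:

    import re
    from contextlib import suppress


    class SchedulerError(Exception):
        '''Hypothetical stand-in for reframe's JobError.'''


    class FakeNode:
        '''Hypothetical stand-in for SlurmNode.

        Like the real class, it raises the scheduler's error type when a
        line of `scontrol -a show -o nodes` output is not a node record.
        '''

        def __init__(self, descr):
            name = re.search(r'NodeName=(\S+)', descr)
            if name is None:
                raise SchedulerError('could not parse node description')

            self.name = name.group(1)


    def create_nodes(descriptions):
        # Same shape as the helper introduced in PATCH 3/3: collect the
        # nodes that parse, silently skip the descriptions that do not.
        nodes = set()
        for descr in descriptions:
            with suppress(SchedulerError):
                nodes.add(FakeNode(descr))

        return nodes


    if __name__ == '__main__':
        descriptions = [
            'NodeName=nid00001 Arch=x86_64 State=IDLE',  # parses fine
            'Node invalid_node1 not found',              # error line, skipped
        ]
        print({n.name for n in create_nodes(descriptions)})  # {'nid00001'}

Hoisting the helper from a `SlurmJob` staticmethod (PATCH 2/3) to a
module-level function (PATCH 3/3) is also what lets the unit tests import
`create_nodes` directly, as the final test_schedulers.py hunk shows.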