diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index b739585937..f8b07de60c 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -4,6 +4,7 @@ import re import time from argparse import ArgumentParser +from contextlib import suppress from datetime import datetime import reframe.core.schedulers as sched @@ -177,7 +178,7 @@ def get_all_nodes(self): raise JobError('could not retrieve node information') from e node_descriptions = completed.stdout.splitlines() - return {SlurmNode(descr) for descr in node_descriptions} + return create_nodes(node_descriptions) def _get_default_partition(self): completed = _run_strict('scontrol -a show -o partitions') @@ -266,20 +267,13 @@ def _get_reservation_nodes(self, reservation): completed = _run_strict('scontrol -a show -o %s' % reservation_nodes) node_descriptions = completed.stdout.splitlines() - return {SlurmNode(descr) for descr in node_descriptions} + return create_nodes(node_descriptions) def _get_nodes_by_name(self, nodespec): completed = os_ext.run_command('scontrol -a show -o node %s' % nodespec) node_descriptions = completed.stdout.splitlines() - nodes_avail = set() - for descr in node_descriptions: - try: - nodes_avail.add(SlurmNode(descr)) - except JobError: - pass - - return nodes_avail + return create_nodes(node_descriptions) def _set_nodelist(self, nodespec): if self._nodelist is not None: @@ -489,6 +483,15 @@ def cancel(self): self._cancelled = True +def create_nodes(descriptions): + nodes = set() + for descr in descriptions: + with suppress(JobError): + nodes.add(SlurmNode(descr)) + + return nodes + + class SlurmNode: '''Class representing a Slurm node.''' diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index 68e247c392..7ab2ff9c27 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -15,7 +15,7 @@ from reframe.core.launchers.local import LocalLauncher from reframe.core.launchers.registry import getlauncher from reframe.core.schedulers.registry import getscheduler -from reframe.core.schedulers.slurm import SlurmNode +from reframe.core.schedulers.slurm import SlurmNode, create_nodes class _TestJob(abc.ABC): @@ -534,6 +534,8 @@ def create_dummy_nodes(obj): 'ExtSensorsTemp=n/s Reason=Foo/ ' 'failed [reframe_user@01 Jan 2018]', + 'Node invalid_node1 not found', + 'NodeName=nid00003 Arch=x86_64 CoresPerSocket=12 ' 'CPUAlloc=0 CPUErr=0 CPUTot=24 CPULoad=0.00 ' 'AvailableFeatures=f1,f3 ActiveFeatures=f1,f3 ' @@ -585,16 +587,17 @@ def create_dummy_nodes(obj): 'LowestJoules=100000000 ConsumedJoules=0 ' 'ExtSensorsJoules=n/s ExtSensorsWatts=0 ' 'ExtSensorsTemp=n/s Reason=Foo/ ' - 'failed [reframe_user@01 Jan 2018]'] + 'failed [reframe_user@01 Jan 2018]', + + 'Node invalid_node2 not found'] - return {SlurmNode(desc) for desc in node_descriptions} + return create_nodes(node_descriptions) def create_reservation_nodes(obj, res): - return {n for n in obj.create_dummy_nodes() if n.name != 'nid00001'} + return {n for n in obj.testjob.get_all_nodes() if n.name != 'nid00001'} - def get_nodes_by_name(obj, node_names): - nodes = obj.create_dummy_nodes() - return {n for n in nodes if n.name in node_names} + def create_dummy_nodes_by_name(obj, name): + return {n for n in obj.testjob.get_all_nodes() if n.name == name} def setUp(self): self.workdir = tempfile.mkdtemp(dir='unittests') @@ -736,14 +739,16 @@ def test_valid_reservation_option(self): def test_exclude_nodes_cmd(self): self.testjob._sched_access = ['--constraint=f1'] self.testjob._sched_exclude_nodelist = 'nid00001' - self.testjob._get_nodes_by_name = self.get_nodes_by_name + # monkey patch `_get_nodes_by_name` to simulate extraction of + # slurm nodes by name through the use of `scontrol show` + self.testjob._get_nodes_by_name = self.create_dummy_nodes_by_name self.prepare_job() self.assertEqual(self.testjob.num_tasks, 8) def test_exclude_nodes_opt(self): self.testjob._sched_access = ['--constraint=f1'] self.testjob.options = ['-x nid00001'] - self.testjob._get_nodes_by_name = self.get_nodes_by_name + self.testjob._get_nodes_by_name = self.create_dummy_nodes_by_name self.prepare_job() self.assertEqual(self.testjob.num_tasks, 8)