From f3268f3818d73cbe248c3decb0b604592d0e1c4f Mon Sep 17 00:00:00 2001
From: Vasileios Karakasis
Date: Sat, 15 Jun 2019 14:51:22 +0200
Subject: [PATCH 1/6] Fix ReFrame hanging when Slurm job gets OOM killed

Also:
- Remove the `JobState` abstraction, because it does not offer anything
  for the moment.
---
 reframe/core/schedulers/__init__.py | 19 +------
 reframe/core/schedulers/local.py    | 16 ++----
 reframe/core/schedulers/slurm.py    | 79 +++++++++++++++--------
 unittests/test_schedulers.py        | 16 ++----
 4 files changed, 50 insertions(+), 80 deletions(-)

diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py
index 27323cdc3d..087e16868f 100644
--- a/reframe/core/schedulers/__init__.py
+++ b/reframe/core/schedulers/__init__.py
@@ -14,23 +14,6 @@
 from reframe.core.logging import getlogger
 
 
-class JobState:
-    def __init__(self, state):
-        self._state = state
-
-    def __repr__(self):
-        return debug.repr(self)
-
-    def __eq__(self, other):
-        if not isinstance(other, type(self)):
-            return NotImplemented
-
-        return self._state == other._state
-
-    def __str__(self):
-        return self._state
-
-
 class Job(abc.ABC):
     """A job descriptor.
 
@@ -53,7 +36,7 @@ class Job(abc.ABC):
 
     _jobid = fields.TypedField('_jobid', int, type(None))
     _exitcode = fields.TypedField('_exitcode', int, type(None))
-    _state = fields.TypedField('_state', JobState, type(None))
+    _state = fields.TypedField('_state', str, type(None))
 
     # The sched_* arguments are exposed also to the frontend
     def __init__(self,
diff --git a/reframe/core/schedulers/local.py b/reframe/core/schedulers/local.py
index b2d4ac0612..673e151882 100644
--- a/reframe/core/schedulers/local.py
+++ b/reframe/core/schedulers/local.py
@@ -13,16 +13,6 @@
 from reframe.core.schedulers.registry import register_scheduler
 
 
-# Local job states
-class LocalJobState(sched.JobState):
-    pass
-
-
-LOCAL_JOB_SUCCESS = LocalJobState('SUCCESS')
-LOCAL_JOB_FAILURE = LocalJobState('FAILURE')
-LOCAL_JOB_TIMEOUT = LocalJobState('TIMEOUT')
-
-
 class _TimeoutExpired(ReframeError):
     pass
 
@@ -153,12 +143,12 @@ def wait(self):
             self._wait_all(timeout)
             self._exitcode = self._proc.returncode
             if self._exitcode != 0:
-                self._state = LOCAL_JOB_FAILURE
+                self._state = 'FAILURE'
             else:
-                self._state = LOCAL_JOB_SUCCESS
+                self._state = 'SUCCESS'
         except (_TimeoutExpired, subprocess.TimeoutExpired):
             getlogger().debug('job timed out')
-            self._state = LOCAL_JOB_TIMEOUT
+            self._state = 'TIMEOUT'
         finally:
             # Cleanup all the processes of this job
             self._kill_all()
diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py
index 110dd6b50c..906ec30904 100644
--- a/reframe/core/schedulers/slurm.py
+++ b/reframe/core/schedulers/slurm.py
@@ -14,24 +14,39 @@
 from reframe.core.schedulers.registry import register_scheduler
 
 
-class SlurmJobState(sched.JobState):
-    pass
-
-
-# Slurm Job states
-SLURM_JOB_BOOT_FAIL = SlurmJobState('BOOT_FAIL')
-SLURM_JOB_CANCELLED = SlurmJobState('CANCELLED')
-SLURM_JOB_COMPLETED = SlurmJobState('COMPLETED')
-SLURM_JOB_CONFIGURING = SlurmJobState('CONFIGURING')
-SLURM_JOB_COMPLETING = SlurmJobState('COMPLETING')
-SLURM_JOB_FAILED = SlurmJobState('FAILED')
-SLURM_JOB_NODE_FAILED = SlurmJobState('NODE_FAILED')
-SLURM_JOB_PENDING = SlurmJobState('PENDING')
-SLURM_JOB_PREEMPTED = SlurmJobState('PREEMPTED')
-SLURM_JOB_RESIZING = SlurmJobState('RESIZING')
-SLURM_JOB_RUNNING = SlurmJobState('RUNNING')
-SLURM_JOB_SUSPENDED = SlurmJobState('SUSPENDED')
-SLURM_JOB_TIMEOUT = SlurmJobState('TIMEOUT')
+def slurm_state_completed(state):
+    completion_states = {
+        'BOOT_FAIL',
+        'CANCELLED',
+        'COMPLETED',
+        'CONFIGURING',
+        'COMPLETING',
+        'DEADLINE',
+        'FAILED',
+        'NODE_FAIL',
+        'OUT_OF_MEMORY',
+        'PREEMPTED',
+        'TIMEOUT',
+    }
+    return state in completion_states
+
+
+def slurm_state_pending(state):
+    pending_states = {
+        'PENDING',
+        'RESV_DEL_HOLD',
+        'REQUEUE_FED',
+        'REQUEUE_HOLD',
+        'REQUEUED',
+        'RESIZING',
+        'REVOKED',
+        'SIGNALING',
+        'SPECIAL_EXIT',
+        'STAGE_OUT',
+        'STOPPED',
+        'SUSPENDED',
+    }
+    return state in pending_states
 
 
 _run_strict = functools.partial(os_ext.run_command, check=True)
@@ -49,15 +64,6 @@ class SlurmJob(sched.Job):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._prefix = '#SBATCH'
-        self._completion_states = [SLURM_JOB_BOOT_FAIL,
-                                   SLURM_JOB_CANCELLED,
-                                   SLURM_JOB_COMPLETED,
-                                   SLURM_JOB_FAILED,
-                                   SLURM_JOB_NODE_FAILED,
-                                   SLURM_JOB_PREEMPTED,
-                                   SLURM_JOB_TIMEOUT]
-        self._pending_states = [SLURM_JOB_CONFIGURING,
-                                SLURM_JOB_PENDING]
 
         # Reasons to cancel a pending job: if the job is expected to remain
        # pending for a much longer time then usual (mostly if a sysadmin
@@ -277,17 +283,17 @@ def _update_state(self):
                               completed.stdout)
             return
 
-        self._state = SlurmJobState(state_match.group('state'))
+        self._state = state_match.group('state')
         if not self._update_state_count % SlurmJob.SACCT_SQUEUE_RATIO:
             self._cancel_if_blocked()
 
-        if self._state in self._completion_states:
+        if slurm_state_completed(self._state):
             self._exitcode = int(state_match.group('exitcode'))
 
         self._set_nodelist(state_match.group('nodespec'))
 
     def _cancel_if_blocked(self):
-        if self._is_cancelling or self._state not in self._pending_states:
+        if self._is_cancelling or not slurm_state_pending(self._state):
             return
 
         completed = _run_strict('squeue -h -j %s -o %%r' % self._jobid)
@@ -340,12 +346,12 @@ def wait(self):
         super().wait()
 
         # Quickly return in case we have finished already
-        if self._state in self._completion_states:
+        if slurm_state_completed(self._state):
             return
 
         intervals = itertools.cycle(settings().job_poll_intervals)
         self._update_state()
-        while self._state not in self._completion_states:
+        while not slurm_state_completed(self._state):
             time.sleep(next(intervals))
             self._update_state()
 
@@ -369,7 +375,7 @@ def finished(self):
             getlogger().debug('ignoring error during polling: %s' % e)
             return False
         else:
-            return self._state in self._completion_states
+            return slurm_state_completed(self._state)
 
 
 @register_scheduler('squeue')
@@ -401,8 +407,7 @@ def _update_state(self):
                                 r'(?P<reason>.+)',
                                 completed.stdout)
         if state_match is None:
             # Assume that job has finished
-            self._state = (SLURM_JOB_CANCELLED if self._cancelled
-                           else SLURM_JOB_COMPLETED)
+            self._state = 'CANCELLED' if self._cancelled else 'COMPLETED'
             # Set exit code manually, if not set already by the polling
             if self._exitcode is None:
@@ -410,9 +415,9 @@ def _update_state(self):
 
             return
 
-        self._state = SlurmJobState(state_match.group('state'))
+        self._state = state_match.group('state')
         self._set_nodelist(state_match.group('nodespec'))
-        if not self._is_cancelling and self._state in self._pending_states:
+        if not self._is_cancelling and slurm_state_pending(self._state):
             self._check_and_cancel(state_match.group('reason'))
 
     def cancel(self):
diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py
index 3242ff3324..22ef8d0b2d 100644
--- a/unittests/test_schedulers.py
+++ b/unittests/test_schedulers.py
@@ -207,10 +207,8 @@ def test_submit(self):
         self.assertEqual([socket.gethostname()], self.testjob.nodelist)
 
     def test_submit_timelimit(self):
-        from reframe.core.schedulers.local import LOCAL_JOB_TIMEOUT
-
         super().test_submit_timelimit()
-        self.assertEqual(self.testjob.state, LOCAL_JOB_TIMEOUT)
+        self.assertEqual(self.testjob.state, 'TIMEOUT')
 
     def test_cancel_with_grace(self):
         # This test emulates a spawned process that ignores the SIGTERM signal
@@ -224,8 +222,6 @@ def test_cancel_with_grace(self):
         # killed immediately after the grace period of 2 seconds expires.
         #
         # We also check that the additional spawned process is also killed.
-        from reframe.core.schedulers.local import LOCAL_JOB_TIMEOUT
-
         self.parallel_cmd = 'sleep 5 &'
         self.pre_run = ['trap -- "" TERM']
         self.post_run = ['echo $!', 'wait']
@@ -250,7 +246,7 @@ def test_cancel_with_grace(self):
 
         self.assertGreaterEqual(t_grace.total_seconds(), 2)
         self.assertLess(t_grace.total_seconds(), 5)
-        self.assertEqual(LOCAL_JOB_TIMEOUT, self.testjob.state)
+        self.assertEqual(self.testjob.state, 'TIMEOUT')
 
         # Verify that the spawned sleep is killed, too
         self.assertProcessDied(sleep_pid)
@@ -267,8 +263,6 @@ def test_cancel_term_ignore(self):
         # spawned sleep will ignore it. We need to make sure that our
         # implementation grants the sleep process a grace period and then
         # kills it.
-        from reframe.core.schedulers.local import LOCAL_JOB_TIMEOUT
-
         self.pre_run = []
         self.post_run = []
         self.parallel_cmd = os.path.join(fixtures.TEST_RESOURCES_CHECKS,
@@ -291,7 +285,7 @@ def test_cancel_term_ignore(self):
             sleep_pid = int(f.read())
 
         self.assertGreaterEqual(t_grace.total_seconds(), 2)
-        self.assertEqual(LOCAL_JOB_TIMEOUT, self.testjob.state)
+        self.assertEqual(self.testjob.state, 'TIMEOUT')
 
         # Verify that the spawned sleep is killed, too
         self.assertProcessDied(sleep_pid)
@@ -385,10 +379,8 @@ def test_submit_timelimit(self):
         self.skipTest("SLURM's minimum time limit is 60s")
 
     def test_cancel(self):
-        from reframe.core.schedulers.slurm import SLURM_JOB_CANCELLED
-
         super().test_cancel()
-        self.assertEqual(self.testjob.state, SLURM_JOB_CANCELLED)
+        self.assertEqual(self.testjob.state, 'CANCELLED')
 
     def test_guess_num_tasks(self):
         self.testjob._num_tasks = 0

From 9e6a71965c728b999599fbdbaaa3846a122f1b4f Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Fri, 28 Jun 2019 08:16:37 +0200
Subject: [PATCH 2/6] Fix test_poll failure on dom for squeue

---
 reframe/core/schedulers/slurm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py
index 906ec30904..9800efb704 100644
--- a/reframe/core/schedulers/slurm.py
+++ b/reframe/core/schedulers/slurm.py
@@ -19,8 +19,6 @@ def slurm_state_completed(state):
         'BOOT_FAIL',
         'CANCELLED',
         'COMPLETED',
-        'CONFIGURING',
-        'COMPLETING',
         'DEADLINE',
         'FAILED',
         'NODE_FAIL',
@@ -33,6 +31,8 @@
 
 def slurm_state_pending(state):
     pending_states = {
+        'COMPLETING',
+        'CONFIGURING',
         'PENDING',
         'RESV_DEL_HOLD',
         'REQUEUE_FED',

From a2c945ea268958999e5d61e4e66ac4c22ad4646c Mon Sep 17 00:00:00 2001
From: Vasileios Karakasis
Date: Sat, 15 Jun 2019 15:11:39 +0200
Subject: [PATCH 3/6] Fix MemoryOverconsumptionCheck message check

---
 cscs-checks/system/slurm/slurm.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py
index c53d2c1b24..6c8450811f 100644
--- a/cscs-checks/system/slurm/slurm.py
+++ b/cscs-checks/system/slurm/slurm.py
@@ -156,8 +156,7 @@ def __init__(self):
         self.sourcepath = 'eatmemory.c'
         self.tags.add('mem')
         self.executable_opts = ['4000M']
-        self.sanity_patterns = sn.assert_found(r'exceeded memory limit',
-                                               self.stderr)
+        self.sanity_patterns = sn.assert_found(r'Out Of Memory', self.stderr)
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)

From 08dcb2cc9939e0b1385009ba9a8162eb925f4d36 Mon Sep 17 00:00:00 2001
From: Vasileios Karakasis
Date: Mon, 17 Jun 2019 14:22:37 +0200
Subject: [PATCH 4/6] Support also the "old" OOM message in Slurm check

- Nothing obvious seems to have been changed inside Slurm regarding the
  OOM handling. It could be that the different paths are triggered by
  some system change, so support both the old and the new message.
---
 cscs-checks/system/slurm/slurm.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py
index 6c8450811f..5896b5f23c 100644
--- a/cscs-checks/system/slurm/slurm.py
+++ b/cscs-checks/system/slurm/slurm.py
@@ -156,7 +156,9 @@ def __init__(self):
         self.sourcepath = 'eatmemory.c'
         self.tags.add('mem')
         self.executable_opts = ['4000M']
-        self.sanity_patterns = sn.assert_found(r'Out Of Memory', self.stderr)
+        self.sanity_patterns = sn.assert_found(
+            r'(exceeded memory limit)|(Out Of Memory)', self.stderr
+        )
 
     def setup(self, partition, environ, **job_opts):
         super().setup(partition, environ, **job_opts)

From 95f4d54ec2e00f04bfe570b3cf6b8fddb2bc0829 Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Mon, 17 Jun 2019 10:01:08 +0200
Subject: [PATCH 5/6] Fix DefaultRequestGPUSetsGRES for updated slurm output

---
 cscs-checks/system/slurm/slurm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py
index 5896b5f23c..08e99e5165 100644
--- a/cscs-checks/system/slurm/slurm.py
+++ b/cscs-checks/system/slurm/slurm.py
@@ -117,7 +117,7 @@ def __init__(self):
         super().__init__()
         self.valid_systems = ['daint:gpu', 'dom:gpu']
         self.executable = 'scontrol show job ${SLURM_JOB_ID}'
-        self.sanity_patterns = sn.assert_found(r'.*Gres=.*gpu:1.*',
+        self.sanity_patterns = sn.assert_found(r'.*TresPerNode=.*gpu:1.*',
                                                self.stdout)
 
 

From a43eabc7f2a0d1090f93b416c4952f9c7fe4271b Mon Sep 17 00:00:00 2001
From: Theofilos Manitaras
Date: Mon, 17 Jun 2019 14:29:54 +0200
Subject: [PATCH 6/6] Support also the old Slurm format

---
 cscs-checks/system/slurm/slurm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cscs-checks/system/slurm/slurm.py b/cscs-checks/system/slurm/slurm.py
index 08e99e5165..955baca829 100644
--- a/cscs-checks/system/slurm/slurm.py
+++ b/cscs-checks/system/slurm/slurm.py
@@ -117,8 +117,8 @@ def __init__(self):
         super().__init__()
         self.valid_systems = ['daint:gpu', 'dom:gpu']
         self.executable = 'scontrol show job ${SLURM_JOB_ID}'
-        self.sanity_patterns = sn.assert_found(r'.*TresPerNode=.*gpu:1.*',
-                                               self.stdout)
+        self.sanity_patterns = sn.assert_found(
+            r'.*(TresPerNode|Gres)=.*gpu:1.*', self.stdout)
 
 
 @rfm.simple_test
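
Note on the scheduler change in PATCH 1/6: job states are now plain strings classified by two predicates, so a job that Slurm reports as OUT_OF_MEMORY is recognized as completed and the polling loop in wait() terminates instead of hanging. The snippet below is only an illustrative, self-contained sketch of that classification and polling pattern; wait_for_job and poll_state are hypothetical stand-ins, not part of the ReFrame API.

import itertools
import time


def slurm_state_completed(state):
    # Completion states as they stand after PATCH 2/6 (CONFIGURING and
    # COMPLETING were moved to the pending set); OUT_OF_MEMORY is the one
    # whose absence from the old completion list caused the hang.
    return state in {
        'BOOT_FAIL', 'CANCELLED', 'COMPLETED', 'DEADLINE', 'FAILED',
        'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', 'TIMEOUT',
    }


def wait_for_job(poll_state, poll_intervals=(1, 2, 3)):
    # Mirrors the SlurmJob.wait() loop: keep polling until the reported
    # state is a completion state, sleeping between polls.
    intervals = itertools.cycle(poll_intervals)
    state = poll_state()
    while not slurm_state_completed(state):
        time.sleep(next(intervals))
        state = poll_state()
    return state


if __name__ == '__main__':
    # Simulate a job that gets OOM-killed on the third poll.
    states = iter(['PENDING', 'RUNNING', 'OUT_OF_MEMORY'])
    print(wait_for_job(lambda: next(states)))  # prints: OUT_OF_MEMORY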