From 6b0684cd2c81d44f381692ea387f24c427e74838 Mon Sep 17 00:00:00 2001 From: Steve Leak Date: Tue, 19 May 2020 18:29:09 -0700 Subject: [PATCH 1/4] handle build errors in async policy --- reframe/frontend/executors/policies.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 9feafbcf1b..8774666021 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -358,8 +358,15 @@ def _failall(self, cause): def _reschedule(self, task): getlogger().debug('scheduling test case for running') - task.compile() - task.compile_wait() + try: + task.compile() + task.compile_wait() + except (PipelineError, BuildError) as e: + getlogger().debug('build failed for %s' % task) + self.on_task_failure(task) + except Exception as e: + getlogger().debug(f'build for %s threw unhandled exception %s' % (task, e)) + raise task.run() def _reschedule_all(self): From 1aed649f48d8e34f68ec36e7a1d09051489cea30 Mon Sep 17 00:00:00 2001 From: Steve Leak Date: Tue, 19 May 2020 18:34:46 -0700 Subject: [PATCH 2/4] handle build errors in async policy --- reframe/frontend/executors/policies.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 8774666021..85422f61bb 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -11,7 +11,8 @@ from datetime import datetime -from reframe.core.exceptions import (TaskDependencyError, TaskExit) +from reframe.core.exceptions import (TaskDependencyError, TaskExit, + BuildError, PipelineError) from reframe.core.logging import getlogger from reframe.frontend.executors import (ExecutionPolicy, RegressionTask, TaskEventListener, ABORT_REASONS) From e4d45b6621023e5211f89637c71812c7031751c0 Mon Sep 17 00:00:00 2001 From: Steve Leak Date: Tue, 19 May 2020 22:08:00 -0700 Subject: [PATCH 3/4] handle build errors in async policy --- reframe/frontend/executors/policies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 85422f61bb..9042476643 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -362,7 +362,7 @@ def _reschedule(self, task): try: task.compile() task.compile_wait() - except (PipelineError, BuildError) as e: + except (PipelineError, BuildError, TaskExit) as e: getlogger().debug('build failed for %s' % task) self.on_task_failure(task) except Exception as e: From 70a5680c67d24c8467b0c1318aa542aaf9186736 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Sun, 24 May 2020 20:16:32 +0200 Subject: [PATCH 4/4] Fix crash when a task is being rescheduled As a result of another task's exit. --- reframe/frontend/executors/policies.py | 21 ++++------ unittests/resources/checks/frontend_checks.py | 10 +++++ unittests/test_policies.py | 40 +++++++++++++++++-- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 9042476643..deec7b9fe1 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -11,8 +11,7 @@ from datetime import datetime -from reframe.core.exceptions import (TaskDependencyError, TaskExit, - BuildError, PipelineError) +from reframe.core.exceptions import (TaskDependencyError, TaskExit) from reframe.core.logging import getlogger from reframe.frontend.executors import (ExecutionPolicy, RegressionTask, TaskEventListener, ABORT_REASONS) @@ -291,7 +290,9 @@ def runcase(self, case): self.printer.status('HOLD', task.check.info(), just='right') except TaskExit: if not task.failed: - self._reschedule(task) + with contextlib.suppress(TaskExit): + self._reschedule(task) + return except ABORT_REASONS as e: if not task.failed: @@ -359,15 +360,8 @@ def _failall(self, cause): def _reschedule(self, task): getlogger().debug('scheduling test case for running') - try: - task.compile() - task.compile_wait() - except (PipelineError, BuildError, TaskExit) as e: - getlogger().debug('build failed for %s' % task) - self.on_task_failure(task) - except Exception as e: - getlogger().debug(f'build for %s threw unhandled exception %s' % (task, e)) - raise + task.compile() + task.compile_wait() task.run() def _reschedule_all(self): @@ -417,7 +411,8 @@ def exit(self): time.sleep(t) except TaskExit: - self._reschedule_all() + with contextlib.suppress(TaskExit): + self._reschedule_all() except ABORT_REASONS as e: self._failall(e) raise diff --git a/unittests/resources/checks/frontend_checks.py b/unittests/resources/checks/frontend_checks.py index 1e7b77d2c2..3922dd0053 100644 --- a/unittests/resources/checks/frontend_checks.py +++ b/unittests/resources/checks/frontend_checks.py @@ -220,3 +220,13 @@ def run(self): super().run() time.sleep(0.5) os.kill(os.getpid(), signal.SIGTERM) + + +class CompileFailureCheck(rfm.RegressionTest): + def __init__(self): + self.valid_systems = ['*'] + self.valid_prog_environs = ['*'] + self.sanity_patterns = sn.assert_found(r'hello', self.stdout) + self.sourcesdir = None + self.sourcepath = 'x.c' + self.prebuild_cmd = ['echo foo > x.c'] diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 9093183a3f..770ca21016 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -21,6 +21,7 @@ from unittests.resources.checks.frontend_checks import ( BadSetupCheck, BadSetupCheckEarly, + CompileFailureCheck, KeyboardInterruptCheck, RetriesCheck, SelfKillCheck, @@ -557,8 +558,8 @@ def test_kbd_interrupt_in_setup_with_limited_concurrency( assert_interrupted_run(runner) -def test_poll_fails_in_main_loop(async_runner, make_cases, - make_async_exec_ctx): +def test_poll_fails_main_loop(async_runner, make_cases, + make_async_exec_ctx): ctx = make_async_exec_ctx(1) next(ctx) @@ -573,8 +574,8 @@ def test_poll_fails_in_main_loop(async_runner, make_cases, assert num_checks == len(stats.failures()) -def test_poll_fails_in_busy_loop(async_runner, make_cases, - make_async_exec_ctx): +def test_poll_fails_busy_loop(async_runner, make_cases, + make_async_exec_ctx): ctx = make_async_exec_ctx(1) next(ctx) @@ -587,3 +588,34 @@ def test_poll_fails_in_busy_loop(async_runner, make_cases, assert num_checks == stats.num_cases() assert_runall(runner) assert num_checks == len(stats.failures()) + + +def test_compile_fail_reschedule_main_loop(async_runner, make_cases, + make_async_exec_ctx): + ctx = make_async_exec_ctx(1) + next(ctx) + + runner, _ = async_runner + num_checks = 2 + runner.runall(make_cases([SleepCheckPollFail(.1), CompileFailureCheck()])) + + stats = runner.stats + assert num_checks == stats.num_cases() + assert_runall(runner) + assert num_checks == len(stats.failures()) + + +def test_compile_fail_reschedule_busy_loop(async_runner, make_cases, + make_async_exec_ctx): + ctx = make_async_exec_ctx(1) + next(ctx) + + runner, _ = async_runner + num_checks = 2 + runner.runall( + make_cases([SleepCheckPollFailLate(1.5), CompileFailureCheck()]) + ) + stats = runner.stats + assert num_checks == stats.num_cases() + assert_runall(runner) + assert num_checks == len(stats.failures())