Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions config/cscs-ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,6 @@
{
'name': 'gpu',
'scheduler': 'slurm',
'modules': [
'daint-gpu'
],
'access': [
'--constraint=gpu',
'--partition=cscsci',
Expand Down Expand Up @@ -61,9 +58,6 @@
{
'name': 'slurm',
'scheduler': 'slurm',
'modules': [
'daint-gpu'
],
'access': [
'--constraint=gpu',
f'--account={osext.osgroup()}'
Expand All @@ -86,9 +80,6 @@
{
'name': 'pbs',
'scheduler': 'pbs',
'modules': [
'daint-gpu'
],
'access': [
'proc=gpu',
f'-A {osext.osgroup()}'
Expand All @@ -103,9 +94,6 @@
{
'name': 'torque',
'scheduler': 'torque',
'modules': [
'daint-gpu'
],
'access': [
'-l proc=gpu',
f'-A {osext.osgroup()}'
Expand Down
7 changes: 7 additions & 0 deletions docs/manpage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,13 @@ Options controlling ReFrame execution
The test stage and output directories will receive a ``_retry<N>`` suffix every time the test is retried.


.. option:: --maxfail=NUM

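The maximum number of failing test cases before the execution is aborted.
Once ``NUM`` test cases have failed, all the remaining test cases are aborted.
``NUM`` must be a non-negative integer; by default, there is practically no limit on the number of failures.
The counter of the failed test cases is reset to 0 in every retry.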


.. option:: --disable-hook=HOOK

Disable the pipeline hook named ``HOOK`` from all the tests that will run.
Expand Down
17 changes: 14 additions & 3 deletions reframe/core/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ class TaskDependencyError(ReframeError):
'''


class FailureLimitError(ReframeError):
    '''Signals that the maximum allowed number of test failures was hit.

    :func:`is_exit_request` treats this error as a request to exit.
    '''


class AbortTaskError(ReframeError):
'''Raised by the runtime inside a regression task to denote that it has
been aborted due to an external reason (e.g., keyboard interrupt, fatal
Expand Down Expand Up @@ -131,7 +135,7 @@ class PipelineError(ReframeError):
'''


class ReframeForceExitError(ReframeError):
class ForceExitError(ReframeError):
'''Raised when ReFrame execution must be forcefully ended,
e.g., after a SIGTERM was received.
'''
Expand Down Expand Up @@ -280,8 +284,6 @@ def user_frame(exc_type, exc_value, tb):

:returns: A frame object or :class:`None` if no user frame was found.

:meta private:

'''
if not inspect.istraceback(tb):
return None
Expand All @@ -294,8 +296,17 @@ def user_frame(exc_type, exc_value, tb):
return None


def is_exit_request(exc_type, exc_value, tb):
    '''Tell whether an exception denotes a request to end the execution.

    Only ``exc_value`` is inspected; ``exc_type`` and ``tb`` are accepted so
    that the function can be called as ``is_exit_request(*sys.exc_info())``.

    :returns: :obj:`True` if ``exc_value`` is an instance of
        :class:`KeyboardInterrupt`, :class:`ForceExitError` or
        :class:`FailureLimitError`, :obj:`False` otherwise.
    '''

    exit_requests = (KeyboardInterrupt, ForceExitError, FailureLimitError)
    return isinstance(exc_value, exit_requests)


def is_severe(exc_type, exc_value, tb):
'''Check if exception is a severe one.'''

soft_errors = (ReframeError,
ConnectionError,
FileExistsError,
Expand Down
43 changes: 29 additions & 14 deletions reframe/frontend/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,10 @@ def main():
help='Set the maximum number of times a failed regression test '
'may be retried (default: 0)'
)
run_options.add_argument(
'--maxfail', metavar='NUM', action='store', default=sys.maxsize,
help='Exit after first NUM failures'
)
run_options.add_argument(
'--restore-session', action='store', nargs='?', const='',
metavar='REPORT',
Expand Down Expand Up @@ -738,11 +742,12 @@ def print_infoline(param, value):

def _case_failed(t):
rec = report.case(*t)
if rec and rec['result'] == 'failure':
return True
else:
if not rec:
return False

return (rec['result'] == 'failure' or
rec['result'] == 'aborted')

testcases = list(filter(_case_failed, testcases))
printer.verbose(
f'Filtering successful test case(s): '
Expand Down Expand Up @@ -918,7 +923,19 @@ def module_unuse(*paths):
f'--max-retries is not a valid integer: {max_retries}'
) from None

runner = Runner(exec_policy, printer, max_retries)
try:
max_failures = int(options.maxfail)
if max_failures < 0:
raise errors.ConfigError(
f'--maxfail should be a non-negative integer: '
f'{options.maxfail!r}'
)
except ValueError:
raise errors.ConfigError(
f'--maxfail is not a valid integer: {options.maxfail!r}'
) from None

runner = Runner(exec_policy, printer, max_retries, max_failures)
try:
time_start = time.time()
session_info['time_start'] = time.strftime(
Expand All @@ -933,12 +950,12 @@ def module_unuse(*paths):
session_info['time_elapsed'] = time_end - time_start

# Print a retry report if we did any retries
if runner.stats.failures(run=0):
if runner.stats.failed(run=0):
printer.info(runner.stats.retry_report())

# Print a failure report if we had failures in the last run
success = True
if runner.stats.failures():
if runner.stats.failed():
success = False
runner.stats.print_failure_report(printer)
if options.failure_stats:
Expand Down Expand Up @@ -984,16 +1001,14 @@ def module_unuse(*paths):
sys.exit(1)

sys.exit(0)
except KeyboardInterrupt:
sys.exit(1)
except errors.ReframeError as e:
printer.error(str(e))
sys.exit(1)
except (Exception, errors.ReframeFatalError):
except (Exception, KeyboardInterrupt, errors.ReframeFatalError):
exc_info = sys.exc_info()
tb = ''.join(traceback.format_exception(*exc_info))
printer.error(errors.what(*exc_info))
if errors.is_severe(*exc_info):
printer.error(f'run session stopped: {errors.what(*exc_info)}')
if errors.is_exit_request(*exc_info):
# Print stack traces for exit requests only at the ``debug2`` verbosity level
printer.debug2(tb)
elif errors.is_severe(*exc_info):
printer.error(tb)
else:
printer.verbose(tb)
Expand Down
57 changes: 42 additions & 15 deletions reframe/frontend/executors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@
import reframe.core.runtime as runtime
import reframe.frontend.dependencies as dependencies
import reframe.utility.jsonext as jsonext
from reframe.core.exceptions import (AbortTaskError, JobNotStartedError,
ReframeForceExitError, TaskExit)
from reframe.core.exceptions import (AbortTaskError,
JobNotStartedError,
FailureLimitError,
ForceExitError,
TaskExit)
from reframe.core.schedulers.local import LocalJobScheduler
from reframe.frontend.printer import PrettyPrinter
from reframe.frontend.statistics import TestStats


ABORT_REASONS = (KeyboardInterrupt, ReframeForceExitError, AssertionError)
ABORT_REASONS = (AssertionError, FailureLimitError,
KeyboardInterrupt, ForceExitError)


class TestCase:
Expand Down Expand Up @@ -136,6 +139,8 @@ def __init__(self, case, listeners=[]):
# Timestamps for the start and finish phases of the pipeline
self._timestamps = {}

self._aborted = False

def duration(self, phase):
# Treat pseudo-phases first
if phase == 'compile_complete':
Expand Down Expand Up @@ -204,7 +209,7 @@ def exc_info(self):

@property
def failed(self):
return self._failed_stage is not None
return self._failed_stage is not None and not self._aborted

@property
def failed_stage(self):
Expand All @@ -218,6 +223,10 @@ def succeeded(self):
def completed(self):
return self.failed or self.succeeded

@property
def aborted(self):
    '''Whether this task has been marked as aborted by :func:`abort`.'''
    return self._aborted

def _notify_listeners(self, callback_name):
for l in self._listeners:
callback = getattr(l, callback_name)
Expand Down Expand Up @@ -308,9 +317,13 @@ def fail(self, exc_info=None):
self._notify_listeners('on_task_failure')

def abort(self, cause=None):
if self.failed or self._aborted:
return

logging.getlogger().debug2('Aborting test case: {self.testcase!r}')
exc = AbortTaskError()
exc.__cause__ = cause
self._aborted = True
try:
# FIXME: we should perhaps extend the RegressionTest interface
# for supporting job cancelling
Expand Down Expand Up @@ -347,22 +360,28 @@ def on_task_success(self, task):


def _handle_sigterm(signum, frame):
raise ReframeForceExitError('received TERM signal')
raise ForceExitError('received TERM signal')


class Runner:
'''Responsible for executing a set of regression tests based on an
execution policy.'''

def __init__(self, policy, printer=None, max_retries=0):
def __init__(self, policy, printer=None, max_retries=0,
max_failures=sys.maxsize):
self._policy = policy
self._printer = printer or PrettyPrinter()
self._max_retries = max_retries
self._stats = TestStats()
self._policy.stats = self._stats
self._policy.printer = self._printer
self._policy.max_failures = max_failures
signal.signal(signal.SIGTERM, _handle_sigterm)

@property
def max_failures(self):
    '''The maximum number of failures allowed before the run is aborted.

    The constructor stores this limit on the execution policy only
    (``self._policy.max_failures``); it never sets a ``_max_failures``
    attribute on the runner. The value is therefore read back from the
    policy, instead of from a non-existent attribute, which would raise
    :class:`AttributeError`.
    '''
    return self._policy.max_failures

@property
def max_retries(self):
return self._max_retries
Expand All @@ -376,6 +395,7 @@ def stats(self):
return self._stats

def runall(self, testcases, restored_cases=None):
abort_reason = None
num_checks = len({tc.check.name for tc in testcases})
self._printer.separator('short double line',
'Running %d check(s)' % num_checks)
Expand All @@ -386,20 +406,27 @@ def runall(self, testcases, restored_cases=None):
if self._max_retries:
restored_cases = restored_cases or []
self._retry_failed(testcases + restored_cases)

finally:
# Print the summary line
num_failures = len(self._stats.failures())
num_failures = len(self._stats.failed())
num_completed = len(self._stats.completed())
if num_failures:
status = 'FAILED'
else:
status = 'PASSED'

self._printer.status(
'FAILED' if num_failures else 'PASSED',
'Ran %d test case(s) from %d check(s) (%d failure(s))' %
(len(testcases), num_checks, num_failures), just='center'
status,
f'Ran {num_completed}/{len(testcases)}'
f' test case(s) from {num_checks} check(s) '
f'({num_failures} failure(s))',
just='center'
)
self._printer.timestamp('Finished on', 'short double line')

def _retry_failed(self, cases):
rt = runtime.runtime()
failures = self._stats.failures()
failures = self._stats.failed()
while (failures and rt.current_run < self._max_retries):
num_failed_checks = len({tc.check.name for tc in failures})
rt.next_run()
Expand All @@ -415,7 +442,7 @@ def _retry_failed(self, cases):
cases_graph, _ = dependencies.build_deps(failed_cases, cases)
failed_cases = dependencies.toposort(cases_graph, is_subgraph=True)
self._runall(failed_cases)
failures = self._stats.failures()
failures = self._stats.failed()

def _runall(self, testcases):
def print_separator(check, prefix):
Expand Down Expand Up @@ -475,7 +502,7 @@ def __init__(self):
self.stats = None

def enter(self):
    '''Reset the counter of failed tasks to zero.'''
    self._num_failed_tasks = 0

def exit(self):
    '''Do nothing; this implementation is intentionally a no-op.'''
    return None
Expand Down
Loading