reframe-hpc · vkarak · Dec 7, 2020 · Oct 15, 2020 · Oct 20, 2020 · Oct 23, 2020
diff --git a/docs/manpage.rst b/docs/manpage.rst
@@ -62,7 +62,8 @@ After all tests in the search path have been loaded, they are first filtered by
 Any test that is not valid for the current system, it will be filtered out.
 The current system is either auto-selected or explicitly specified with the :option:`--system` option.
 Tests can be filtered by different attributes and there are specific command line options for achieving this.
-
+A common characteristic of all test filtering options is that if a test is selected, then all its dependencies will be selected, too, regardless of whether they match the filtering criteria.
+This happens recursively so that if test ``T1`` depends on ``T2`` and ``T2`` depends on ``T3``, then selecting ``T1`` would also select ``T2`` and ``T3``.
 
 .. option:: -t, --tag=TAG
 
@@ -116,6 +117,15 @@ Tests can be filtered by different attributes and there are specific command lin
    Tests may or may not make use of it.
 
 
+.. option:: --failed
+
+   Select only the failed test cases from a previous run.
+   This option can only be used in combination with the :option:`--restore-session` option.
+   To rerun the failed cases from the last run, you can use ``reframe --restore-session --failed -r``.
+
+   .. versionadded:: 3.4
+
+
 .. option:: --skip-system-check
 
    Do not filter tests against the selected system.
@@ -196,7 +206,7 @@ Options controlling ReFrame output
 
    This option can also be set using the :envvar:`RFM_STAGE_DIR` environment variable or the :js:attr:`stagedir` system configuration parameter.
 
-.. option:: --timestamp[=TIMEFMT]
+.. option:: --timestamp [TIMEFMT]
 
    Append a timestamp to the output and stage directory prefixes.
    ``TIMEFMT`` can be any valid :manpage:`strftime(3)` time format.
@@ -312,6 +322,25 @@ Options controlling ReFrame execution
    .. versionadded:: 3.2
 
 
+.. option:: --restore-session [REPORT]
+
+   Restore a testing session that has run previously.
+   ``REPORT`` is a run report file generated by ReFrame.
+   If ``REPORT`` is not given, ReFrame will pick the last report file found in the default location of report files (see the :option:`--report-file` option).
+   If passed alone, this option will simply rerun all the test cases that have run previously based on the report file data.
+   It is more useful to combine this option with any of the `test filtering <#test-filtering>`__ options, in which case only the selected test cases will be executed.
+   The difference in the test selection process when using this option is that the dependencies of the selected tests will not be selected for execution, as they normally would, but will be restored instead.
+   For example, if test ``T1`` depends on ``T2`` and ``T2`` depends on ``T3``, then running ``reframe -n T1 -r`` would cause both ``T2`` and ``T3`` to run.
+   However, by doing ``reframe -n T1 --restore-session -r``, only ``T1`` would run and its immediate dependency ``T2`` would be restored.
+   This is useful when you have deep test dependencies or some of the tests in the dependency chain are very time consuming.
+
+   .. note::
+      In order for a test case to be restored, its stage directory must be present.
+      This is not a problem when rerunning a failed case, since the stage directories of its dependencies are automatically kept, but if you want to rerun a successful test case, you should make sure to have run with the :option:`--keep-stage-files` option.
+
+   .. versionadded:: 3.4
+
+
 ----------------------------------
 Options controlling job submission
 ----------------------------------
@@ -463,7 +492,7 @@ Miscellaneous options
 
    This option can also be set using the :envvar:`RFM_CONFIG_FILE` environment variable.
 
-.. option:: --show-config[=PARAM]
+.. option:: --show-config [PARAM]
 
    Show the value of configuration parameter ``PARAM`` as this is defined for the currently selected system and exit.
    The parameter value is printed in JSON format.

diff --git a/reframe/core/config.py b/reframe/core/config.py
@@ -107,6 +107,9 @@ def add_sticky_option(self, option, value):
     def remove_sticky_option(self, option):
         self._sticky_options.pop(option, None)
 
+    def is_sticky_option(self, option):
+        return option in self._sticky_options
+
     @_normalize_syntax({'.*/.*modules$': normalize_module_list})
     def get(self, option, default=None):
         '''Retrieve value of option.

diff --git a/reframe/core/environments.py b/reframe/core/environments.py
@@ -8,6 +8,7 @@
 
 import reframe.core.fields as fields
 import reframe.utility as util
+import reframe.utility.jsonext as jsonext
 import reframe.utility.typecheck as typ
 
 
@@ -26,7 +27,7 @@ def normalize_module_list(modules):
     return ret
 
 
-class Environment:
+class Environment(jsonext.JSONSerializable):
     '''This class abstracts away an environment to run regression tests.
 
     It is simply a collection of modules to be loaded and environment variables

diff --git a/reframe/core/pipeline.py b/reframe/core/pipeline.py
@@ -26,6 +26,7 @@
 import reframe.core.logging as logging
 import reframe.core.runtime as rt
 import reframe.utility as util
+import reframe.utility.jsonext as jsonext
 import reframe.utility.osext as osext
 import reframe.utility.sanity as sn
 import reframe.utility.typecheck as typ
@@ -125,7 +126,7 @@ def _wrapped(*args, **kwargs):
     return _wrapped
 
 
-class RegressionTest(metaclass=RegressionTestMeta):
+class RegressionTest(jsonext.JSONSerializable, metaclass=RegressionTestMeta):
     '''Base class for regression tests.
 
     All regression tests must eventually inherit from this class.
@@ -1809,6 +1810,10 @@ def __eq__(self, other):
     def __hash__(self):
         return hash(self.name)
 
+    def __rfm_json_decode__(self, json):
+        # 'tags' are decoded as list, so we convert them to a set
+        self.tags = set(json['tags'])
+
 
 class RunOnlyRegressionTest(RegressionTest, special=True):
     '''Base class for run-only regression tests.

diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py
@@ -13,6 +13,7 @@
 import reframe.core.fields as fields
 import reframe.core.runtime as runtime
 import reframe.core.shell as shell
+import reframe.utility.jsonext as jsonext
 import reframe.utility.typecheck as typ
 from reframe.core.exceptions import JobError, JobNotStartedError
 from reframe.core.launchers import JobLauncher
@@ -111,7 +112,7 @@ def log(self, message, level=DEBUG2):
         getlogger().log(level, f'[S] {self.registered_name}: {message}')
 
 
-class Job:
+class Job(jsonext.JSONSerializable):
     '''A job descriptor.
 
     A job descriptor is created by the framework after the "setup" phase and

diff --git a/reframe/core/systems.py b/reframe/core/systems.py
@@ -4,16 +4,16 @@
 # SPDX-License-Identifier: BSD-3-Clause
 
 import json
-import re
 
 import reframe.utility as utility
+import reframe.utility.jsonext as jsonext
 from reframe.core.backends import (getlauncher, getscheduler)
 from reframe.core.logging import getlogger
 from reframe.core.modules import ModulesSystem
 from reframe.core.environments import (Environment, ProgEnvironment)
 
 
-class SystemPartition:
+class SystemPartition(jsonext.JSONSerializable):
     '''A representation of a system partition inside ReFrame.
 
     .. warning::
@@ -237,7 +237,7 @@ def __str__(self):
         return json.dumps(self.json(), indent=2)
 
 
-class System:
+class System(jsonext.JSONSerializable):
     '''A representation of a system inside ReFrame.
 
     .. warning::

diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py
@@ -23,13 +23,16 @@
 import reframe.frontend.argparse as argparse
 import reframe.frontend.dependencies as dependencies
 import reframe.frontend.filters as filters
+import reframe.frontend.runreport as runreport
 import reframe.utility.jsonext as jsonext
 import reframe.utility.osext as osext
-from reframe.frontend.executors import Runner, generate_testcases
+
+
+from reframe.frontend.printer import PrettyPrinter
+from reframe.frontend.loader import RegressionCheckLoader
 from reframe.frontend.executors.policies import (SerialExecutionPolicy,
                                                  AsynchronousExecutionPolicy)
-from reframe.frontend.loader import RegressionCheckLoader
-from reframe.frontend.printer import PrettyPrinter
+from reframe.frontend.executors import Runner, generate_testcases
 
 
 def format_check(check, check_deps, detailed=False):
@@ -119,23 +122,6 @@ def list_checks(testcases, printer, detailed=False):
     printer.info(f'Found {len(checks)} check(s)')
 
 
-def generate_report_filename(filepatt):
-    if '{sessionid}' not in filepatt:
-        return filepatt
-
-    search_patt = os.path.basename(filepatt).replace('{sessionid}', r'(\d+)')
-    new_id = -1
-    basedir = os.path.dirname(filepatt) or '.'
-    for filename in os.listdir(basedir):
-        match = re.match(search_patt, filename)
-        if match:
-            found_id = int(match.group(1))
-            new_id = max(found_id, new_id)
-
-    new_id += 1
-    return filepatt.format(sessionid=new_id)
-
-
 def logfiles_message():
     log_files = logging.log_files()
     msg = 'Log file(s) saved in: '
@@ -260,6 +246,10 @@ def main():
         help=('Select checks with at least one '
               'programming environment matching PATTERN')
     )
+    select_options.add_argument(
+        '--failed', action='store_true',
+        help="Select failed test cases (only when '--restore-session' is used)"
+    )
     select_options.add_argument(
         '--gpu-only', action='store_true',
         help='Select only GPU checks'
@@ -326,6 +316,11 @@ def main():
         help='Set the maximum number of times a failed regression test '
              'may be retried (default: 0)'
     )
+    run_options.add_argument(
+        '--restore-session', action='store', nargs='?', const='',
+        metavar='REPORT',
+        help='Restore a testing session from REPORT file'
+    )
     run_options.add_argument(
         '--flex-alloc-nodes', action='store',
         dest='flex_alloc_nodes', metavar='{all|STATE|NUM}', default=None,
@@ -586,10 +581,53 @@ def main():
     printer.debug(format_env(options.env_vars))
 
     # Setup the check loader
+    if options.restore_session is not None:
+        # We need to load the failed checks only from a report
+        if options.restore_session:
+            filename = options.restore_session
+        else:
+            filename = runreport.next_report_filename(
+                osext.expandvars(site_config.get('general/0/report_file')),
+                new=False
+            )
+
+        report = runreport.load_report(filename)
+        check_search_path = list(report.slice('filename', unique=True))
+        check_search_recursive = False
+
+        # If `-c` or `-R` are passed explicitly outside the configuration
+        # file, override the values set from the report file
+        if site_config.is_sticky_option('general/check_search_path'):
+            printer.warning(
+                'Ignoring check search path set in the report file: '
+                'search path set explicitly in the command-line or '
+                'the environment'
+            )
+            check_search_path = site_config.get(
+                'general/0/check_search_path'
+            )
+
+        if site_config.is_sticky_option('general/check_search_recursive'):
+            printer.warning(
+                'Ignoring check search recursive option from the report file: '
+                'option set explicitly in the command-line or the environment'
+            )
+            check_search_recursive = site_config.get(
+                'general/0/check_search_recursive'
+            )
+
+    else:
+        check_search_recursive = site_config.get(
+            'general/0/check_search_recursive'
+        )
+        check_search_path = site_config.get('general/0/check_search_path')
+
     loader = RegressionCheckLoader(
-        load_path=site_config.get('general/0/check_search_path'),
-        recurse=site_config.get('general/0/check_search_recursive'),
-        ignore_conflicts=site_config.get('general/0/ignore_check_conflicts')
+        load_path=check_search_path,
+        recurse=check_search_recursive,
+        ignore_conflicts=site_config.get(
+            'general/0/ignore_check_conflicts'
+        )
     )
 
     def print_infoline(param, value):
@@ -599,7 +637,7 @@ def print_infoline(param, value):
     session_info = {
         'cmdline': ' '.join(sys.argv),
         'config_file': rt.site_config.filename,
-        'data_version': '1.1',
+        'data_version': runreport.DATA_VERSION,
         'hostname': socket.gethostname(),
         'prefix_output': rt.output_prefix,
         'prefix_stage': rt.stage_prefix,
@@ -683,6 +721,34 @@ def print_infoline(param, value):
         elif options.cpu_only:
             testcases = filter(filters.have_cpu_only(), testcases)
 
+        testcases = list(testcases)
+        printer.verbose(
+            f'Filtering test cases(s) by other attributes: '
+            f'{len(testcases)} remaining'
+        )
+
+        # Filter in failed cases
+        if options.failed:
+            if options.restore_session is None:
+                printer.error(
+                    "the option '--failed' can only be used "
+                    "in combination with the '--restore-session' option"
+                )
+                sys.exit(1)
+
+            def _case_failed(t):
+                rec = report.case(*t)
+                if rec and rec['result'] == 'failure':
+                    return True
+                else:
+                    return False
+
+            testcases = list(filter(_case_failed, testcases))
+            printer.verbose(
+                f'Filtering successful test case(s): '
+                f'{len(testcases)} remaining'
+            )
+
         # Prepare for running
         printer.debug('Building and validating the full test DAG')
         testgraph, skipped_cases = dependencies.build_deps(testcases_all)
@@ -697,12 +763,22 @@ def print_infoline(param, value):
         dependencies.validate_deps(testgraph)
         printer.debug('Full test DAG:')
         printer.debug(dependencies.format_deps(testgraph))
+
+        restored_cases = []
         if len(testcases) != len(testcases_all):
-            testgraph = dependencies.prune_deps(testgraph, testcases)
+            testgraph = dependencies.prune_deps(
+                testgraph, testcases,
+                max_depth=1 if options.restore_session is not None else None
+            )
             printer.debug('Pruned test DAG')
             printer.debug(dependencies.format_deps(testgraph))
+            if options.restore_session is not None:
+                testgraph, restored_cases = report.restore_dangling(testgraph)
 
-        testcases = dependencies.toposort(testgraph)
+        testcases = dependencies.toposort(
+            testgraph,
+            is_subgraph=options.restore_session is not None
+        )
         printer.verbose(f'Final number of test cases: {len(testcases)}')
 
         # Disable hooks
@@ -848,7 +924,7 @@ def module_unuse(*paths):
             session_info['time_start'] = time.strftime(
                 '%FT%T%z', time.localtime(time_start),
             )
-            runner.runall(testcases)
+            runner.runall(testcases, restored_cases)
         finally:
             time_end = time.time()
             session_info['time_end'] = time.strftime(
@@ -887,9 +963,14 @@ def module_unuse(*paths):
             })
             json_report = {
                 'session_info': session_info,
-                'runs': run_stats
+                'runs': run_stats,
+                'restored_cases': []
             }
-            report_file = generate_report_filename(report_file)
+            if options.restore_session is not None:
+                for c in restored_cases:
+                    json_report['restored_cases'].append(report.case(*c))
+
+            report_file = runreport.next_report_filename(report_file)
             try:
                 with open(report_file, 'w') as fp:
                     jsonext.dump(json_report, fp, indent=2)