diff --git a/config/cscs-ci.py b/config/cscs-ci.py index de584da98d..f611aa28c7 100644 --- a/config/cscs-ci.py +++ b/config/cscs-ci.py @@ -209,11 +209,10 @@ ], 'logging': [ { - 'level': 'debug', 'handlers': [ { 'type': 'file', - 'level': 'debug', + 'level': 'debug2', 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 'append': False }, diff --git a/config/cscs.py b/config/cscs.py index 19406d5daa..71d06304b5 100644 --- a/config/cscs.py +++ b/config/cscs.py @@ -944,12 +944,11 @@ ], 'logging': [ { - 'level': 'debug', 'handlers': [ { 'type': 'file', 'name': 'reframe.log', - 'level': 'debug', + 'level': 'debug2', 'format': '[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s', # noqa: E501 'append': False }, diff --git a/docs/config_reference.rst b/docs/config_reference.rst index d91da8d8b4..ca8c667d4a 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -579,7 +579,7 @@ You may define different logger objects per system but *not* per partition. .. js:attribute:: .logging[].level :required: No - :default: ``"debug"`` + :default: ``"undefined"`` The level associated with this logger object. There are the following levels in decreasing severity order: @@ -590,10 +590,19 @@ You may define different logger objects per system but *not* per partition. - ``info``: Informational messages. - ``verbose``: More informational messages. - ``debug``: Debug messages. + - ``debug2``: Further debug messages. + - ``undefined``: This is the lowest level; do not filter any message. If a message is logged by the framework, its severity level will be checked by the logger and if it is higher from the logger's level, it will be passed down to its handlers. + .. versionadded:: 3.3 + The ``debug2`` and ``undefined`` levels are added. + + .. versionchanged:: 3.3 + The default level is now ``undefined``. + + .. js:attribute:: .logging[].handlers :required: Yes diff --git a/reframe/core/config.py b/reframe/core/config.py index 80cc4a7b73..ae894abd8b 100644 --- a/reframe/core/config.py +++ b/reframe/core/config.py @@ -229,17 +229,29 @@ def _create_from_json(cls, filename): return _SiteConfig(config, filename) def _detect_system(self): + getlogger().debug('Detecting system') if os.path.exists('/etc/xthostname'): # Get the cluster name on Cray systems + getlogger().debug( + "Found '/etc/xthostname': will use this to get the system name" + ) with open('/etc/xthostname') as fp: hostname = fp.read() else: hostname = socket.gethostname() + getlogger().debug( + f'Looking for a matching configuration entry ' + f'for system {hostname!r}' + ) for system in self._site_config['systems']: for patt in system['hostnames']: if re.match(patt, hostname): - return system['name'] + sysname = system['name'] + getlogger().debug( + f'Configuration found: picking system {sysname!r}' + ) + return sysname raise ConfigError(f"could not find a configuration entry " f"for the current system: '{hostname}'") @@ -278,6 +290,7 @@ def select_subconfig(self, system_fullname=None, return system_fullname = system_fullname or self._detect_system() + getlogger().debug(f'Selecting subconfig for {system_fullname!r}') try: system_name, part_name = system_fullname.split(':', maxsplit=1) except ValueError: @@ -554,13 +567,21 @@ def handler_list(handler_config, basedir=None): def _find_config_file(): # The order of elements is important, since it defines the priority - username = osext.osuser() - prefixes = [os.path.join(username, '.reframe')] if username else [] - prefixes += [reframe.INSTALL_PREFIX, '/etc/reframe.d'] + homedir = os.getenv('HOME') + prefixes = [os.path.join(homedir, '.reframe')] if homedir else [] + prefixes += [ + reframe.INSTALL_PREFIX, + '/etc/reframe.d' + ] valid_exts = ['py', 'json'] + getlogger().debug('Looking for a suitable configuration file') for d in prefixes: + if not d: + continue + for ext in valid_exts: filename = os.path.join(d, f'settings.{ext}') + getlogger().debug(f'Trying {filename!r}') if os.path.exists(filename): return filename @@ -572,8 +593,9 @@ def load_config(filename=None): filename = _find_config_file() if filename is None: # Return the generic configuration - getlogger().debug('no configuration found; ' + getlogger().debug('No configuration found; ' 'falling back to a generic one') return _SiteConfig(settings.site_configuration, '') + getlogger().debug(f'Loading configuration file: {filename!r}') return _SiteConfig.create(filename) diff --git a/reframe/core/exceptions.py b/reframe/core/exceptions.py index 40b8ea0478..b2106242ef 100644 --- a/reframe/core/exceptions.py +++ b/reframe/core/exceptions.py @@ -181,14 +181,16 @@ def __init__(self, args, stdout, stderr, exitcode): lines = [ f"command '{self.command}' failed with exit code {self.exitcode}:" ] - lines.append('=== STDOUT ===') + lines.append('--- stdout ---') if stdout: lines.append(stdout) - lines.append('=== STDERR ===') + lines.append('--- stdout ---') + lines.append('--- stderr ---') if stderr: lines.append(stderr) + lines.append('--- stderr ---') self._message = '\n'.join(lines) @property @@ -221,14 +223,16 @@ def __init__(self, args, stdout, stderr, timeout): # Format message lines = [f"command '{self.command}' timed out after {self.timeout}s:"] - lines.append('=== STDOUT ===') + lines.append('--- stdout ---') if self._stdout: lines.append(self._stdout) - lines.append('=== STDERR ===') + lines.append('--- stdout ---') + lines.append('--- stderr ---') if self._stderr: lines.append(self._stderr) + lines.append('--- stderr ---') self._message = '\n'.join(lines) @property diff --git a/reframe/core/logging.py b/reframe/core/logging.py index a00feabb17..fd9154e7e8 100644 --- a/reframe/core/logging.py +++ b/reframe/core/logging.py @@ -34,6 +34,7 @@ INFO = 20 VERBOSE = 19 DEBUG = 10 +DEBUG2 = 9 NOTSET = 0 @@ -44,6 +45,7 @@ INFO: 'info', VERBOSE: 'verbose', DEBUG: 'debug', + DEBUG2: 'debug2', NOTSET: 'undefined' } @@ -54,6 +56,7 @@ 'info': INFO, 'verbose': VERBOSE, 'debug': DEBUG, + 'debug2': DEBUG2, 'undefined': NOTSET, 'notset': NOTSET } @@ -376,6 +379,9 @@ def verbose(self, message, *args, **kwargs): def debug(self, message, *args, **kwargs): self.log(DEBUG, message, *args, **kwargs) + def debug2(self, message, *args, **kwargs): + self.log(DEBUG2, message, *args, **kwargs) + class LoggerAdapter(logging.LoggerAdapter): def __init__(self, logger=None, check=None): @@ -476,6 +482,9 @@ def log(self, level, msg, *args, **kwargs): if self.logger: super().log(level, msg, *args, **kwargs) + def debug2(self, message, *args, **kwargs): + self.log(DEBUG2, message, *args, **kwargs) + def verbose(self, message, *args, **kwargs): self.log(VERBOSE, message, *args, **kwargs) diff --git a/reframe/core/modules.py b/reframe/core/modules.py index 855d54f229..c8bc4bce6a 100644 --- a/reframe/core/modules.py +++ b/reframe/core/modules.py @@ -17,6 +17,7 @@ import reframe.utility.typecheck as types from reframe.core.exceptions import (ConfigError, EnvironError, SpawnedProcessError) +from reframe.core.logging import getlogger from reframe.utility import OrderedSet @@ -85,6 +86,7 @@ class ModulesSystem: @classmethod def create(cls, modules_kind=None): + getlogger().debug(f'Initializing modules system {modules_kind!r}') if modules_kind is None or modules_kind == 'nomod': return ModulesSystem(NoModImpl()) elif modules_kind == 'tmod31': diff --git a/reframe/core/pipeline.py b/reframe/core/pipeline.py index f794a99e98..b9d757787f 100644 --- a/reframe/core/pipeline.py +++ b/reframe/core/pipeline.py @@ -1023,7 +1023,7 @@ def is_local(self): def _setup_paths(self): '''Setup the check's dynamic paths.''' - self.logger.debug('setting up paths') + self.logger.debug('Setting up test paths') try: runtime = rt.runtime() self._stagedir = runtime.make_stagedir( @@ -1040,13 +1040,6 @@ def _setup_paths(self): def _setup_job(self, **job_opts): '''Setup the job related to this check.''' - self.logger.debug('setting up the job descriptor') - - msg = 'job scheduler backend: {0}' - self.logger.debug( - msg.format('local' if self.is_local else - self._current_partition.scheduler.registered_name)) - if self.local: scheduler = getscheduler('local')() launcher = getlauncher('local')() @@ -1054,6 +1047,11 @@ def _setup_job(self, **job_opts): scheduler = self._current_partition.scheduler launcher = self._current_partition.launcher_type() + self.logger.debug( + f'Setting up run job descriptor ' + f'(scheduler: {scheduler.registered_name!r}, ' + f'launcher: {launcher.registered_name!r})' + ) self._job = Job.create(scheduler, launcher, name='rfm_%s_job' % self.name, @@ -1064,7 +1062,6 @@ def _setup_job(self, **job_opts): **job_opts) def _setup_perf_logging(self): - self.logger.debug('setting up performance logging') self._perf_logger = logging.getperflogger(self) @_run_hooks() @@ -1094,9 +1091,8 @@ def setup(self, partition, environ, **job_opts): self._setup_job(**job_opts) def _copy_to_stagedir(self, path): - self.logger.debug('copying %s to stage directory (%s)' % - (path, self._stagedir)) - self.logger.debug('symlinking files: %s' % self.readonly_files) + self.logger.debug(f'Copying {path} to stage directory') + self.logger.debug(f'Symlinking files: {self.readonly_files}') try: osext.copytree_virtual( path, self._stagedir, self.readonly_files, dirs_exist_ok=True @@ -1105,8 +1101,7 @@ def _copy_to_stagedir(self, path): raise PipelineError('copying of files failed') from e def _clone_to_stagedir(self, url): - self.logger.debug('cloning URL %s to stage directory (%s)' % - (url, self._stagedir)) + self.logger.debug(f'Cloning URL {url} into stage directory') osext.git_clone(self.sourcesdir, self._stagedir) @_run_hooks('pre_compile') @@ -1138,9 +1133,10 @@ def compile(self): if commonpath: self.logger.warn( - "sourcepath `%s' seems to be a subdirectory of " - "sourcesdir `%s', but it will be interpreted " - "as relative to it." % (self.sourcepath, self.sourcesdir)) + f'sourcepath {self.sourcepath!r} is a subdirectory of ' + f'sourcesdir {self.sourcesdir!r}, but it will be ' + f'interpreted as relative to it' + ) if osext.is_url(self.sourcesdir): self._clone_to_stagedir(self.sourcesdir) @@ -1158,7 +1154,6 @@ def compile(self): ) staged_sourcepath = os.path.join(self._stagedir, self.sourcepath) - self.logger.debug('Staged sourcepath: %s' % staged_sourcepath) if os.path.isdir(staged_sourcepath): if not self.build_system: # Try to guess the build system @@ -1227,7 +1222,6 @@ def compile_wait(self): ''' self._build_job.wait() - self.logger.debug('compilation finished') # FIXME: this check is not reliable for certain scheduler backends if self._build_job.exitcode != 0: @@ -1317,6 +1311,7 @@ def run(self): self._job.options = resources_opts + self._job.options with osext.change_dir(self._stagedir): try: + self.logger.debug('Generating the run script') self._job.prepare( commands, environs, login=rt.runtime().get_option('general/0/use_login_shell'), @@ -1329,9 +1324,7 @@ def run(self): self._job.submit() - msg = ('spawned job (%s=%s)' % - ('pid' if self.is_local() else 'jobid', self._job.jobid)) - self.logger.debug(msg) + self.logger.debug(f'Spawned run job (id={self.job.jobid})') # Update num_tasks if test is flexible if self.job.sched_flex_alloc_nodes: @@ -1386,7 +1379,6 @@ def run_wait(self): ''' self._job.wait() - self.logger.debug('spawned job finished') @final def wait(self): @@ -1540,7 +1532,7 @@ def _copy_job_files(self, job, dst): def _copy_to_outputdir(self): '''Copy check's interesting files to the output directory.''' - self.logger.debug('copying interesting files to output directory') + self.logger.debug('Copying test files to output directory') self._copy_job_files(self._job, self.outputdir) self._copy_job_files(self._build_job, self.outputdir) @@ -1580,13 +1572,14 @@ def cleanup(self, remove_files=False): ''' aliased = os.path.samefile(self._stagedir, self._outputdir) if aliased: - self.logger.debug('skipping copy to output dir ' - 'since they alias each other') + self.logger.debug( + f'outputdir and stagedir are the same; copying skipped' + ) else: self._copy_to_outputdir() if remove_files: - self.logger.debug('removing stage directory') + self.logger.debug('Removing stage directory') osext.rmtree(self._stagedir) # Dependency API diff --git a/reframe/core/runtime.py b/reframe/core/runtime.py index a5792677b0..9e7386213b 100644 --- a/reframe/core/runtime.py +++ b/reframe/core/runtime.py @@ -16,6 +16,7 @@ import reframe.utility.osext as osext from reframe.core.environments import (Environment, snapshot) from reframe.core.exceptions import ReframeFatalError +from reframe.core.logging import getlogger from reframe.core.systems import System @@ -141,12 +142,18 @@ def stage_prefix(self): def make_stagedir(self, *dirs): wipeout = self.get_option('general/0/clean_stagedir') - return self._makedir(self.stage_prefix, - *self._format_dirs(*dirs), wipeout=wipeout) + ret = self._makedir(self.stage_prefix, + *self._format_dirs(*dirs), wipeout=wipeout) + getlogger().debug( + f'Created stage directory {ret!r} [clean_stagedir: {wipeout}]' + ) + return ret def make_outputdir(self, *dirs): - return self._makedir(self.output_prefix, - *self._format_dirs(*dirs), wipeout=True) + ret = self._makedir(self.output_prefix, + *self._format_dirs(*dirs), wipeout=True) + getlogger().debug(f'Created output directory {ret!r}') + return ret @property def modules_system(self): diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index d204e00fe2..36daa0620f 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -16,7 +16,7 @@ import reframe.utility.typecheck as typ from reframe.core.exceptions import JobError, JobNotStartedError from reframe.core.launchers import JobLauncher -from reframe.core.logging import getlogger +from reframe.core.logging import (getlogger, DEBUG2) class JobScheduler(abc.ABC): @@ -102,6 +102,14 @@ def poll(self, *jobs): :meta private: ''' + def log(self, message, level=DEBUG2): + '''Convenience method for logging debug messages from the scheduler + backends. + + :meta private: + ''' + getlogger().log(level, f'[S] {self.registered_name}: {message}') + class Job: '''A job descriptor. @@ -328,6 +336,7 @@ def submit_time(self): def prepare(self, commands, environs=None, **gen_opts): environs = environs or [] if self.num_tasks <= 0: + getlogger().debug(f'[F] Flexible node allocation requested') num_tasks_per_node = self.num_tasks_per_node or 1 min_num_tasks = (-self.num_tasks if self.num_tasks else num_tasks_per_node) @@ -336,17 +345,17 @@ def prepare(self, commands, environs=None, **gen_opts): guessed_num_tasks = self.guess_num_tasks() except NotImplementedError as e: raise JobError('flexible node allocation is not supported by ' - 'this backend') from e + 'this scheduler backend') from e if guessed_num_tasks < min_num_tasks: raise JobError( 'could not satisfy the minimum task requirement: ' 'required %s, found %s' % - (min_num_tasks, guessed_num_tasks)) + (min_num_tasks, guessed_num_tasks) + ) self.num_tasks = guessed_num_tasks - getlogger().debug('flex_alloc_nodes: setting num_tasks to %s' % - self.num_tasks) + getlogger().debug(f'[F] Setting num_tasks to {self.num_tasks}') with shell.generate_script(self.script_filename, **gen_opts) as builder: @@ -365,8 +374,9 @@ def guess_num_tasks(self): return self.sched_flex_alloc_nodes * num_tasks_per_node available_nodes = self.scheduler.allnodes() - getlogger().debug('flex_alloc_nodes: total available nodes: %s ' % - len(available_nodes)) + getlogger().debug( + f'[F] Total available nodes: {len(available_nodes)}' + ) # Try to guess the number of tasks now available_nodes = self.scheduler.filternodes(self, available_nodes) @@ -374,7 +384,7 @@ def guess_num_tasks(self): available_nodes = {n for n in available_nodes if n.in_state(self.sched_flex_alloc_nodes)} getlogger().debug( - f'flex_alloc_nodes: selecting nodes in state ' + f'[F] Selecting nodes in state ' f'{self.sched_flex_alloc_nodes!r}: ' f'available nodes now: {len(available_nodes)}' ) diff --git a/reframe/core/schedulers/local.py b/reframe/core/schedulers/local.py index 173e281cc3..6dba63ea57 100644 --- a/reframe/core/schedulers/local.py +++ b/reframe/core/schedulers/local.py @@ -15,7 +15,6 @@ import reframe.utility.osext as osext from reframe.core.backends import register_scheduler from reframe.core.exceptions import ReframeError -from reframe.core.logging import getlogger class _TimeoutExpired(ReframeError): @@ -105,9 +104,7 @@ def _kill_all(self, job): except (ProcessLookupError, PermissionError): # The process group may already be dead or assigned to a different # group, so ignore this error - getlogger().debug( - f'pid {job.jobid} already dead or assigned elsewhere' - ) + self.log(f'pid {job.jobid} already dead or assigned elsewhere') finally: # Close file handles job.f_stdout.close() @@ -169,12 +166,15 @@ def _poll_job(self, job): except OSError as e: if e.errno == errno.ECHILD: # No unwaited children + self.log('no more unwaited children') return else: raise e if job.cancel_time: # Job has been cancelled; give it a grace period and kill it + self.log(f'Job {job.jobid} has been cancelled; ' + f'giving it a grace period') t_rem = self.CANCEL_GRACE_PERIOD - (time.time() - job.cancel_time) if t_rem > 0: time.sleep(t_rem) @@ -186,6 +186,7 @@ def _poll_job(self, job): # Job has not finished; check if we have reached a timeout t_elapsed = time.time() - job.submit_time if job.time_limit and t_elapsed > job.time_limit: + self.log(f'Job {job.jobid} timed out; kill it') self._kill_all(job) job._state = 'TIMEOUT' diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py index 2cb037dc61..a44097d1c2 100644 --- a/reframe/core/schedulers/pbs.py +++ b/reframe/core/schedulers/pbs.py @@ -21,7 +21,6 @@ from reframe.core.backends import register_scheduler from reframe.core.config import settings from reframe.core.exceptions import JobSchedulerError -from reframe.core.logging import getlogger from reframe.utility import seconds_to_hms @@ -148,7 +147,6 @@ def cancel(self, job): if time_from_submit < PBS_CANCEL_DELAY: time.sleep(PBS_CANCEL_DELAY - time_from_submit) - getlogger().debug(f'cancelling job (id={job.jobid})') _run_strict(f'qdel {job.jobid}', timeout=self._submit_timeout) job._cancelled = True diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 667e35bbbb..a329a16411 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -20,7 +20,6 @@ JobBlockedError, JobError, JobSchedulerError) -from reframe.core.logging import getlogger from reframe.utility import seconds_to_hms @@ -157,7 +156,7 @@ def emit_preamble(self, job): parsed_args, _ = jobarr_parser.parse_known_args(job.options) if parsed_args.array: job._is_array = True - getlogger().debug('Slurm job is a job array') + self.log('Slurm job is a job array') # Slurm replaces '%a' by the corresponding SLURM_ARRAY_TASK_ID outfile_fmt = '--output={0}' + ('_%a' if job.is_array else '') @@ -259,11 +258,10 @@ def _merge_files(self, job): with osext.change_dir(job.workdir): out_glob = glob.glob(job.stdout + '_*') err_glob = glob.glob(job.stderr + '_*') - getlogger().debug( - 'merging job array output files: %s' % ', '.join(out_glob)) + self.log(f'merging job array output files: {", ".join(out_glob)}') osext.concat_files(job.stdout, *out_glob, overwrite=True) - getlogger().debug( - 'merging job array error files: %s' % ','.join(err_glob)) + + self.log(f'merging job array error files: {", ".join(err_glob)}') osext.concat_files(job.stderr, *err_glob, overwrite=True) def filternodes(self, job, nodes): @@ -286,43 +284,38 @@ def filternodes(self, job, nodes): if reservation: reservation = reservation.strip() nodes &= self._get_reservation_nodes(reservation) - getlogger().debug( - 'flex_alloc_nodes: filtering nodes by reservation %s: ' - 'available nodes now: %s' % (reservation, len(nodes))) + self.log(f'[F] Filtering nodes by reservation {reservation}: ' + f'available nodes now: {len(nodes)}') if partitions: partitions = set(partitions.strip().split(',')) else: default_partition = self._get_default_partition() partitions = {default_partition} if default_partition else set() - getlogger().debug('flex_alloc_nodes: default partition: %s' % - default_partition) + self.log( + f'[F] No partition specified; using {default_partition!r}' + ) nodes = {n for n in nodes if n.partitions >= partitions} - getlogger().debug( - 'flex_alloc_nodes: filtering nodes by partition(s) %s: ' - 'available nodes now: %s' % (partitions, len(nodes))) - + self.log(f'[F] Filtering nodes by partition(s) {partitions}: ' + f'available nodes now: {len(nodes)}') if constraints: constraints = set(constraints.strip().split('&')) nodes = {n for n in nodes if n.active_features >= constraints} - getlogger().debug( - 'flex_alloc_nodes: filtering nodes by constraint(s) %s: ' - 'available nodes now: %s' % (constraints, len(nodes))) + self.log(f'[F] Filtering nodes by constraint(s) {constraints}: ' + f'available nodes now: {len(nodes)}') if nodelist: nodelist = nodelist.strip() nodes &= self._get_nodes_by_name(nodelist) - getlogger().debug( - 'flex_alloc_nodes: filtering nodes by nodelist: %s ' - 'available nodes now: %s' % (nodelist, len(nodes))) + self.log(f'[F] Filtering nodes by nodelist: {nodelist}: ' + f'available nodes now: {len(nodes)}') if exclude_nodes: exclude_nodes = exclude_nodes.strip() nodes -= self._get_nodes_by_name(exclude_nodes) - getlogger().debug( - 'flex_alloc_nodes: excluding node(s): %s ' - 'available nodes now: %s' % (exclude_nodes, len(nodes))) + self.log(f'[F] Excluding node(s): {exclude_nodes}: ' + f'available nodes now: {len(nodes)}') return nodes @@ -341,7 +334,7 @@ def _get_reservation_nodes(self, reservation): def _get_nodes_by_name(self, nodespec): completed = osext.run_command('scontrol -a show -o node %s' % - nodespec) + nodespec) node_descriptions = completed.stdout.splitlines() return _create_nodes(node_descriptions) @@ -394,8 +387,9 @@ def poll(self, *jobs): fr'(?P.*)', completed.stdout, re.MULTILINE) ) if not state_match: - getlogger().debug('job state not matched (stdout follows)\n%s' % - completed.stdout) + self.log( + f'Job state not matched (stdout follows)\n{completed.stdout}' + ) return job_info = {} @@ -436,6 +430,7 @@ def _cancel_if_pending_too_long(self, job): t_pending = time.time() - job.submit_time if t_pending >= job.max_pending_time: + self.log(f'maximum pending time for job exceeded; cancelling it') self.cancel(job) job._exception = JobError('maximum pending time exceeded') @@ -469,6 +464,7 @@ def _do_cancel_if_blocked(self, job, reason_descr): if reason in self._cancel_reasons: if reason == 'ReqNodeNotAvail' and reason_details: + self.log('Job blocked due to ReqNodeNotAvail') node_match = re.match( r'UnavailableNodes:(?P\S+)?', reason_details.strip() @@ -478,6 +474,8 @@ def _do_cancel_if_blocked(self, job, reason_descr): if node_names: # Retrieve the info of the unavailable nodes # and check if they are indeed down + self.log(f'Checking if nodes {node_names!r} ' + f'are indeed unavailable') nodes = self._get_nodes_by_name(node_names) if not any(n.is_down() for n in nodes): return @@ -511,7 +509,6 @@ def wait(self, job): self._merge_files(job) def cancel(self, job): - getlogger().debug(f'cancelling job (id={job.jobid})') _run_strict(f'scancel {job.jobid}', timeout=self._submit_timeout) job._is_cancelling = True diff --git a/reframe/core/schedulers/torque.py b/reframe/core/schedulers/torque.py index 28c5a297c5..995a6185a8 100644 --- a/reframe/core/schedulers/torque.py +++ b/reframe/core/schedulers/torque.py @@ -15,7 +15,6 @@ import reframe.utility.osext as osext from reframe.core.backends import register_scheduler from reframe.core.exceptions import JobError, JobSchedulerError -from reframe.core.logging import getlogger from reframe.core.schedulers.pbs import PbsJobScheduler, _run_strict @@ -60,10 +59,8 @@ def poll(self, *jobs): # Otherwise, it will return with return code 0 and print information # only for the jobs it could find. if completed.returncode == 153: - getlogger().debug( - 'return code = 153: jobids not known by scheduler, ' - 'assuming all jobs completed' - ) + self.log('Return code is 153: jobids not known by scheduler, ' + 'assuming all jobs completed') for job in jobs: job._state = 'COMPLETED' @@ -87,10 +84,8 @@ def poll(self, *jobs): for job in jobs: if job.jobid not in jobinfo: - getlogger().debug( - f'jobid {job.jobid} not known to scheduler, ' - f'assuming job completed' - ) + self.log(f'Job {job.jobid} not known to scheduler, ' + f'assuming job completed') job._state = 'COMPLETED' job._completed = True continue @@ -100,9 +95,7 @@ def poll(self, *jobs): r'^\s*job_state = (?P[A-Z])', info, re.MULTILINE ) if not state_match: - getlogger().debug( - f'job state not found (job info follows):\n{info}' - ) + self.log(f'Job state not found (job info follows):\n{info}') continue state = state_match.group('state') diff --git a/reframe/core/settings.py b/reframe/core/settings.py index 820417d5b3..0731139ee3 100644 --- a/reframe/core/settings.py +++ b/reframe/core/settings.py @@ -33,7 +33,6 @@ ], 'logging': [ { - 'level': 'debug', 'handlers': [ { 'type': 'stream', diff --git a/reframe/core/systems.py b/reframe/core/systems.py index 17fa370eca..4b2bb6e3da 100644 --- a/reframe/core/systems.py +++ b/reframe/core/systems.py @@ -8,6 +8,7 @@ import reframe.utility as utility from reframe.core.backends import (getlauncher, getscheduler) +from reframe.core.logging import getlogger from reframe.core.modules import ModulesSystem from reframe.core.environments import (Environment, ProgEnvironment) @@ -22,6 +23,7 @@ class SystemPartition: def __init__(self, parent, name, sched_type, launcher_type, descr, access, container_environs, resources, local_env, environs, max_jobs): + getlogger().debug(f'Initializing system partition {name!r}') self._parent_system = parent self._name = name self._sched_type = sched_type @@ -245,6 +247,7 @@ class System: def __init__(self, name, descr, hostnames, modules_system, preload_env, prefix, outputdir, resourcesdir, stagedir, partitions): + getlogger().debug(f'Initializing system {name!r}') self._name = name self._descr = descr self._hostnames = hostnames diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index d356354d54..734b57662d 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -111,6 +111,17 @@ def generate_report_filename(filepatt): return filepatt.format(sessionid=new_id) +def logfiles_message(): + log_files = logging.log_files() + msg = 'Log file(s) saved in: ' + if not log_files: + msg += '' + else: + msg += f'{", ".join(repr(f) for f in log_files)}' + + return msg + + def main(): # Setup command line options argparser = argparse.ArgumentParser() @@ -432,7 +443,8 @@ def main(): if options.upgrade_config_file is not None: old_config, *new_config = options.upgrade_config_file.split( - ':', maxsplit=1) + ':', maxsplit=1 + ) new_config = new_config[0] if new_config else None try: @@ -445,12 +457,12 @@ def main(): f'Conversion successful! ' f'The converted file can be found at {new_config!r}.' ) - sys.exit(0) # Now configure ReFrame according to the user configuration file try: try: + printer.debug('Loading user configuration') site_config = config.load_config(options.config_file) except warnings.ReframeDeprecationWarning as e: printer.warning(e) @@ -488,15 +500,18 @@ def main(): logging.configure_logging(site_config) except (OSError, errors.ConfigError) as e: printer.error(f'failed to load configuration: {e}') + printer.error(logfiles_message()) sys.exit(1) logging.getlogger().colorize = site_config.get('general/0/colorize') printer.colorize = site_config.get('general/0/colorize') printer.inc_verbosity(site_config.get('general/0/verbose')) try: + printer.debug('Initializing runtime') runtime.init_runtime(site_config) except errors.ConfigError as e: printer.error(f'failed to initialize runtime: {e}') + printer.error(logfiles_message()) sys.exit(1) rt = runtime.runtime() @@ -519,6 +534,7 @@ def main(): printer.error("stage and output refer to the same directory; " "if this is on purpose, please use the " "'--keep-stage-files' option.") + printer.error(logfiles_message()) sys.exit(1) # Show configuration after everything is set up @@ -582,6 +598,7 @@ def print_infoline(param, value): # Locate and load checks try: checks_found = loader.load_all() + printer.verbose(f'Loaded {len(checks_found)} test(s)') except OSError as e: raise errors.ReframeError from e @@ -589,27 +606,47 @@ def print_infoline(param, value): checks_matched = checks_found if options.exclude_names: for name in options.exclude_names: - checks_matched = filter(filters.have_not_name(name), - checks_matched) + checks_matched = [c for c in checks_matched + if filters.have_not_name(name)(c)] if options.names: - checks_matched = filter(filters.have_name('|'.join(options.names)), - checks_matched) + checks_matched = [c for c in checks_matched + if filters.have_name('|'.join(options.names))(c)] + + printer.verbose( + f'Filtering test(s) by name: {len(checks_matched)} remaining' + ) # Filter checks by tags for tag in options.tags: - checks_matched = filter(filters.have_tag(tag), checks_matched) + checks_matched = [c for c in checks_matched + if filters.have_tag(tag)(c)] + + printer.verbose( + f'Filtering test(s) by tags: {len(checks_matched)} remaining' + ) # Filter checks by prgenv if not options.skip_prgenv_check: for prgenv in options.prgenv: - checks_matched = filter(filters.have_prgenv(prgenv), - checks_matched) + checks_matched = [c for c in checks_matched + if filters.have_prgenv(prgenv)(c)] + + printer.verbose( + f'Filtering test(s) by programming environment: ' + f'{len(list(checks_matched))} remaining' + ) # Filter checks by system if not options.skip_system_check: - checks_matched = filter( - filters.have_partition(rt.system.partitions), checks_matched) + partitions = rt.system.partitions + checks_matched = [c for c in checks_matched + if filters.have_partition(partitions)(c)] + + printer.verbose( + f'Filtering test(s) by system: ' + f'{len(list(checks_matched))} remaining' + ) # Filter checks further if options.gpu_only and options.cpu_only: @@ -618,9 +655,11 @@ def print_infoline(param, value): sys.exit(1) if options.gpu_only: - checks_matched = filter(filters.have_gpu_only(), checks_matched) + checks_matched = [c for c in checks_matched + if filters.have_gpu_only()(c)] elif options.cpu_only: - checks_matched = filter(filters.have_cpu_only(), checks_matched) + checks_matched = [c for c in checks_matched + if filters.have_cpu_only()(c)] # Determine the allowed programming environments allowed_environs = {e.name @@ -628,9 +667,6 @@ def print_infoline(param, value): for p in rt.system.partitions for e in p.environs if re.match(env_patt, e.name)} - # Generate the test cases, validate dependencies and sort them - checks_matched = list(checks_matched) - # Disable hooks for c in checks_matched: for h in options.hooks: @@ -640,6 +676,8 @@ def print_infoline(param, value): options.skip_system_check, options.skip_prgenv_check, allowed_environs) + printer.debug(f'Generated {len(testcases)} test case(s)') + printer.debug('Building and validating the test DAG') testgraph = dependencies.build_deps(testcases) dependencies.validate_deps(testgraph) testcases = dependencies.toposort(testgraph) @@ -653,6 +691,7 @@ def print_infoline(param, value): # Load the environment for the current system try: + printer.debug(f'Loading environment for current system') runtime.loadenv(rt.system.preload_environ) except errors.EnvironError as e: printer.error("failed to load current system's environment; " @@ -660,6 +699,7 @@ def print_infoline(param, value): printer.debug(str(e)) raise + printer.debug(f'Loading user modules from command line') for m in site_config.get('general/0/user_modules'): try: rt.modules_system.load_module(m, force=True) @@ -673,7 +713,7 @@ def print_infoline(param, value): # Act on checks success = True if options.list or options.list_detailed: - list_checks(list(checks_matched), printer, options.list_detailed) + list_checks(checks_matched, printer, options.list_detailed) elif options.run: # Setup the execution policy if options.exec_policy == 'serial': @@ -814,9 +854,4 @@ def print_infoline(param, value): printer.error(f'could not save log file: {e}') sys.exit(1) finally: - if not log_files: - msg = '' - else: - msg = f'{", ".join(repr(f) for f in log_files)}' - - printer.info(f'Log file(s) saved in: {msg}') + printer.info(logfiles_message()) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 2978b565f7..ca80e8887e 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -58,8 +58,8 @@ def __eq__(self, other): self.partition.fullname == other.partition.fullname) def __repr__(self): - return '(%r, %r, %r)' % (self.check.name, - self.partition.fullname, self.environ.name) + c, p, e = self.check.name, self.partition.fullname, self.environ.name + return f'({c!r}, {p!r}, {e!r})' @property def check(self): @@ -243,7 +243,7 @@ def __exit__(this, exc_type, exc_value, traceback): try: with logging.logging_context(self.check) as logger: - logger.debug(f'entering stage: {self._current_stage}') + logger.debug(f'Entering stage: {self._current_stage}') with update_timestamps(): return fn(*args, **kwargs) @@ -299,7 +299,7 @@ def fail(self, exc_info=None): self._notify_listeners('on_task_failure') def abort(self, cause=None): - logging.getlogger().debug('aborting: %s' % self.check.info()) + logging.getlogger().debug2('Aborting test case: {self.testcase!r}') exc = AbortTaskError() exc.__cause__ = cause try: diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 0e075ea7fd..3ce01c5018 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -13,7 +13,7 @@ from datetime import datetime from reframe.core.exceptions import (TaskDependencyError, TaskExit) -from reframe.core.logging import getlogger +from reframe.core.logging import (getlogger, VERBOSE) from reframe.frontend.executors import (ExecutionPolicy, RegressionTask, TaskEventListener, ABORT_REASONS) @@ -61,12 +61,10 @@ def running_tasks(self, num_tasks): return self def snooze(self): - from reframe.core.logging import getlogger - t_elapsed = time.time() - self._t_init self._num_polls += 1 - getlogger().debug( - f'poll rate control: sleeping for {self._sleep_duration}s ' + getlogger().debug2( + f'Poll rate control: sleeping for {self._sleep_duration}s ' f'(current poll rate: {self._num_polls/t_elapsed} polls/s)' ) time.sleep(self._sleep_duration) @@ -167,7 +165,9 @@ def on_task_failure(self, task): 'sanity', 'performance', 'total']) - getlogger().verbose(f"==> {timings}") + getlogger().info(f'==> test failed during {task.failed_stage!r}: ' + f'test staged in {task.check.stagedir!r}') + getlogger().verbose(f'==> {timings}') def on_task_success(self, task): timings = task.pipeline_timings(['compile_complete', @@ -181,7 +181,8 @@ def on_task_success(self, task): 'sanity', 'performance', 'total']) - getlogger().verbose(f"==> {timings}") + getlogger().verbose(f'==> {timings}') + # update reference count of dependencies for c in task.testcase.deps: self._task_index[c].ref_count -= 1 @@ -227,14 +228,14 @@ def __init__(self): self.task_listeners.append(self) def _remove_from_running(self, task): - getlogger().debug( - 'removing task from running list: %s' % task.check.info() + getlogger().debug2( + f'Removing task from the running list: {task.testcase}' ) try: partname = task.check.current_partition.fullname self._running_tasks[partname].remove(task) except (ValueError, AttributeError, KeyError): - getlogger().debug('not in running tasks') + getlogger().debug2('Task was not running') pass def deps_failed(self, task): @@ -259,12 +260,18 @@ def on_task_failure(self, task): self._remove_from_running(task) self.printer.status('FAIL', msg, just='right') - getlogger().verbose(f"==> {task.pipeline_timings_all()}") + stagedir = task.check.stagedir + if not stagedir: + stagedir = '' + + getlogger().info(f'==> test failed during {task.failed_stage!r}: ' + f'test staged in {stagedir!r}') + getlogger().verbose(f'==> timings: {task.pipeline_timings_all()}') def on_task_success(self, task): msg = f'{task.check.info()} [{task.pipeline_timings_basic()}]' self.printer.status('OK', msg, just='right') - getlogger().verbose(f"==> {task.pipeline_timings_all()}") + getlogger().verbose(f'==> timings: {task.pipeline_timings_all()}') # update reference count of dependencies for c in task.testcase.deps: @@ -328,8 +335,10 @@ def runcase(self, case): if len(self._running_tasks[partname]) >= partition.max_jobs: # Make sure that we still exceeded the job limit - getlogger().debug('reached job limit (%s) for partition %s' % - (partition.max_jobs, partname)) + getlogger().debug2( + f'Reached concurrency limit for partition {partname!r}: ' + f'{partition.max_jobs} job(s)' + ) self._poll_tasks() if len(self._running_tasks[partname]) < partition.max_jobs: @@ -368,11 +377,10 @@ def split_jobs(tasks): return forced_local, normal - getlogger().debug('updating counts for running test cases') for part in self._partitions: partname = part.fullname num_tasks = len(self._running_tasks[partname]) - getlogger().debug(f'polling {num_tasks} task(s) in {partname!r}') + getlogger().debug2(f'Polling {num_tasks} task(s) in {partname!r}') forced_local_jobs, part_jobs = split_jobs( self._running_tasks[partname] ) @@ -392,18 +400,19 @@ def _setup_all(self): self._waiting_tasks[:] = still_waiting def _finalize_all(self): - getlogger().debug('finalizing tasks: %s', len(self._completed_tasks)) + getlogger().debug2(f'Finalizing {len(self._completed_tasks)} task(s)') while True: try: task = self._completed_tasks.pop() except IndexError: break - getlogger().debug('finalizing task: %s' % task.check.info()) + getlogger().debug2(f'Finalizing task {task.testcase}') with contextlib.suppress(TaskExit): self._finalize_task(task) def _finalize_task(self, task): + getlogger().debug2(f'Finalizing task {task.testcase}') if not self.skip_sanity_check: task.sanity() @@ -414,12 +423,12 @@ def _finalize_task(self, task): def _failall(self, cause): '''Mark all tests as failures''' + getlogger().debug2(f'Aborting all tasks due to {type(cause).__name__}') for task in list(itertools.chain(*self._running_tasks.values())): task.abort(cause) self._running_tasks = {} for ready_list in self._ready_tasks.values(): - getlogger().debug('ready list size: %s' % len(ready_list)) for task in ready_list: task.abort(cause) @@ -429,8 +438,7 @@ def _failall(self, cause): task.abort(cause) def _reschedule(self, task): - getlogger().debug('scheduling test case for running') - + getlogger().debug2(f'Scheduling test case {task.testcase} for running') task.compile() task.compile_wait() task.run() @@ -450,16 +458,17 @@ def _reschedule_all(self): num_rescheduled += 1 if num_rescheduled: - getlogger().debug('rescheduled %s job(s) on %s' % - (num_rescheduled, partname)) + getlogger().debug2( + f'Rescheduled {num_rescheduled} job(s) on {partname!r}' + ) def exit(self): self.printer.separator('short single line', 'waiting for spawned checks to finish') while (countall(self._running_tasks) or self._waiting_tasks or self._completed_tasks or countall(self._ready_tasks)): - getlogger().debug(f'running tasks: ' - f'{countall(self._running_tasks)}') + getlogger().debug2(f'Running tasks: ' + f'{countall(self._running_tasks)}') try: self._poll_tasks() diff --git a/reframe/frontend/loader.py b/reframe/frontend/loader.py index 7516860079..956ed5f5f6 100644 --- a/reframe/frontend/loader.py +++ b/reframe/frontend/loader.py @@ -64,8 +64,15 @@ def _validate_source(self, filename): with open(filename, 'r') as f: source_tree = ast.parse(f.read(), filename) + msg = f'Validating {filename!r}: ' validator = RegressionCheckValidator() validator.visit(source_tree) + if validator.valid: + msg += 'OK' + else: + msg += 'not a test file' + + getlogger().debug(msg) return validator.valid @property @@ -90,16 +97,20 @@ def load_from_module(self, module): # Warn in case of old syntax if hasattr(module, '_get_checks'): getlogger().warning( - '%s: _get_checks() is no more supported in test files: ' - 'please use @reframe.simple_test or ' - '@reframe.parameterized_test decorators' % module.__file__ + f'{module.__file__}: _get_checks() is no more supported ' + f'in test files: please use @reframe.simple_test or ' + f'@reframe.parameterized_test decorators' ) if not hasattr(module, '_rfm_gettests'): + getlogger().debug('No tests registered') return [] candidates = module._rfm_gettests() if not isinstance(candidates, collections.abc.Sequence): + getlogger().warning( + f'Tests not registered correctly in {module.__name__!r}' + ) return [] ret = [] @@ -122,6 +133,7 @@ def load_from_module(self, module): else: raise NameConflictError(msg) + getlogger().debug(f' > Loaded {len(ret)} test(s)') return ret def load_from_file(self, filename, **check_args): @@ -153,8 +165,10 @@ def load_all(self): If a prefix exists, it will be prepended to each path.''' checks = [] for d in self._load_path: + getlogger().debug(f'Looking for tests in {d!r}') if not os.path.exists(d): continue + if os.path.isdir(d): checks.extend(self.load_from_dir(d, self._recurse)) else: diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 5497cfe9c5..05ea54336e 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -31,8 +31,8 @@ }, "loglevel": { "type": "string", - "enum": ["critical", "error", "warning", - "info", "verbose", "debug"] + "enum": ["critical", "error", "warning", "info", + "verbose", "debug", "debug2", "undefined"] }, "handler_common": { "type": "object", @@ -416,7 +416,7 @@ "general/use_login_shell": false, "general/user_modules": [], "general/verbose": 0, - "logging/level": "debug", + "logging/level": "undefined", "logging/target_systems": ["*"], "logging/handlers*/*_level": "info", "logging/handlers*/*_format": "%(message)s", diff --git a/reframe/utility/osext.py b/reframe/utility/osext.py index de24345e08..bdac7bdade 100644 --- a/reframe/utility/osext.py +++ b/reframe/utility/osext.py @@ -97,7 +97,7 @@ def run_command_async(cmd, if log: from reframe.core.logging import getlogger - getlogger().debug('executing OS command: ' + cmd) + getlogger().debug2(f'[CMD] {cmd!r}') if not shell: cmd = shlex.split(cmd) diff --git a/unittests/test_exceptions.py b/unittests/test_exceptions.py index bbb9deaa61..01c516b92f 100644 --- a/unittests/test_exceptions.py +++ b/unittests/test_exceptions.py @@ -58,10 +58,12 @@ def test_spawned_process_error(): with pytest.raises( exc.ReframeError, match=(r"command 'foo bar' failed with exit code 1:\n" - r"=== STDOUT ===\n" + r"--- stdout ---\n" r'partial output\n' - r"=== STDERR ===\n" - r"error message") + r"--- stdout ---\n" + r"--- stderr ---\n" + r"error message\n" + r"--- stderr ---") ): raise e @@ -74,10 +76,12 @@ def test_spawned_process_error_list_args(): with pytest.raises( exc.ReframeError, match=(r"command 'foo bar' failed with exit code 1:\n" - r"=== STDOUT ===\n" + r"--- stdout ---\n" r'partial output\n' - r"=== STDERR ===\n" - r"error message") + r"--- stdout ---\n" + r"--- stderr ---\n" + r"error message\n" + r"--- stderr ---") ): raise e @@ -90,9 +94,11 @@ def test_spawned_process_error_nostdout(): with pytest.raises( exc.ReframeError, match=(r"command 'foo bar' failed with exit code 1:\n" - r"=== STDOUT ===\n" - r"=== STDERR ===\n" - r"error message") + r"--- stdout ---\n" + r"--- stdout ---\n" + r"--- stderr ---\n" + r"error message\n" + r"--- stderr ---") ): raise e @@ -103,9 +109,11 @@ def test_spawned_process_error_nostderr(): with pytest.raises( exc.ReframeError, match=(r"command 'foo bar' failed with exit code 1:\n" - r"=== STDOUT ===\n" + r"--- stdout ---\n" r'partial output\n' - r"=== STDERR ===") + r"--- stdout ---\n" + r"--- stderr ---\n" + r"--- stderr ---") ): raise e @@ -115,10 +123,12 @@ def test_spawned_process_timeout(): e = exc.SpawnedProcessTimeout(*exc_args) with pytest.raises(exc.ReframeError, match=(r"command 'foo bar' timed out after 10s:\n" - r"=== STDOUT ===\n" + r"--- stdout ---\n" r'partial output\n' - r"=== STDERR ===\n" - r"partial error")): + r"--- stdout ---\n" + r"--- stderr ---\n" + r"partial error\n" + r"--- stderr ---")): raise e @@ -127,9 +137,11 @@ def test_spawned_process_timeout_nostdout(): e = exc.SpawnedProcessTimeout(*exc_args) with pytest.raises(exc.ReframeError, match=(r"command 'foo bar' timed out after 10s:\n" - r"=== STDOUT ===\n" - r"=== STDERR ===\n" - r"partial error")): + r"--- stdout ---\n" + r"--- stdout ---\n" + r"--- stderr ---\n" + r"partial error\n" + r"--- stderr ---")): raise e @@ -138,9 +150,11 @@ def test_spawned_process_timeout_nostderr(): e = exc.SpawnedProcessTimeout(*exc_args) with pytest.raises(exc.ReframeError, match=(r"command 'foo bar' timed out after 10s:\n" - r"=== STDOUT ===\n" + r"--- stdout ---\n" r'partial output\n' - r"=== STDERR ===")): + r"--- stdout ---\n" + r"--- stderr ---\n" + r"--- stderr ---")): raise e