From 5402151b21e1b370f055baec4074e7c7e79be68e Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 6 Dec 2019 11:43:28 +0100 Subject: [PATCH 01/16] Add check_completion_time attribute to perflogs --- reframe/core/logging.py | 10 ++++++++++ reframe/core/schedulers/__init__.py | 9 ++++++++- reframe/core/schedulers/local.py | 4 ++++ reframe/core/schedulers/pbs.py | 10 +++++++--- reframe/core/schedulers/slurm.py | 15 +++++++++++++-- 5 files changed, 42 insertions(+), 6 deletions(-) diff --git a/reframe/core/logging.py b/reframe/core/logging.py index 03ced55af1..adaa249b63 100644 --- a/reframe/core/logging.py +++ b/reframe/core/logging.py @@ -374,6 +374,7 @@ def __init__(self, logger=None, check=None): 'check_outputdir': None, 'check_stagedir': None, 'check_num_tasks': None, + 'check_completion_time': None, 'check_perf_var': None, 'check_perf_value': None, 'check_perf_ref': None, @@ -427,6 +428,15 @@ def _update_check_extras(self): if self.check.job: self.extra['check_jobid'] = self.check.job.jobid + if self.check.job: + if self.check.job.completion_time: + # Format is the same for every handler + fmt = self.logger.handlers[0].formatter.datefmt + self.extra['check_completion_time'] = ( + self.check.job.completion_time.strftime(fmt)) + else: + self.extra['check_completion_time'] = "NA" + def log_performance(self, level, tag, value, ref, low_thres, upper_thres, unit=None, *, msg=None): diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index 622822b642..fe1d8ea203 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -3,6 +3,7 @@ # import abc +from datetime import datetime import reframe.core.environments as env import reframe.core.fields as fields @@ -189,6 +190,7 @@ def __init__(self, self._script_filename = script_filename or '%s.sh' % name self._stdout = stdout or '%s.out' % name self._stderr = stderr or '%s.err' % name + self._completion_time = None # Backend scheduler related information self._sched_flex_alloc_nodes = sched_flex_alloc_nodes @@ -259,6 +261,10 @@ def sched_account(self): def sched_exclusive_access(self): return self._sched_exclusive_access + @property + def completion_time(self): + return self.scheduler.completion_time or self._completion_time + def prepare(self, commands, environs=None, **gen_opts): environs = environs or [] if self.num_tasks <= 0: @@ -321,7 +327,8 @@ def wait(self): if self.jobid is None: raise JobNotStartedError('cannot wait an unstarted job') - return self.scheduler.wait(self) + self.scheduler.wait(self) + self._completion_time = datetime.now() def cancel(self): if self.jobid is None: diff --git a/reframe/core/schedulers/local.py b/reframe/core/schedulers/local.py index 561ebf2ca0..a7051fad38 100644 --- a/reframe/core/schedulers/local.py +++ b/reframe/core/schedulers/local.py @@ -30,6 +30,10 @@ def __init__(self): self._f_stdout = None self._f_stderr = None + @property + def completion_time(self): + return None + def submit(self, job): # `chmod +x' first, because we will execute the script locally os.chmod(job.script_filename, diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py index 7edb61ae25..bdb4d3e21b 100644 --- a/reframe/core/schedulers/pbs.py +++ b/reframe/core/schedulers/pbs.py @@ -31,11 +31,15 @@ class PbsJobScheduler(sched.JobScheduler): def __init__(self): self._prefix = '#PBS' - self._time_finished = None + self._completion_time = None # Optional part of the job id refering to the PBS server self._pbs_server = None + @property + def completion_time(self): + return self._completion_time + def _emit_lselect_option(self, job): num_tasks_per_node = job.num_tasks_per_node or 1 num_cpus_per_task = job.num_cpus_per_task or 1 @@ -127,7 +131,7 @@ def finished(self, job): if done: t_now = datetime.now() - self._time_finished = self._time_finished or t_now - time_from_finish = (t_now - self._time_finished).total_seconds() + self._completion_time = self._completion_time or t_now + time_from_finish = (t_now - self._completion_time).total_seconds() return done and time_from_finish > PBS_OUTPUT_WRITEBACK_WAIT diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 37e729f6dd..e5f5c741d0 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -3,6 +3,7 @@ import itertools import re import time +import os from argparse import ArgumentParser from contextlib import suppress from datetime import datetime @@ -88,6 +89,12 @@ def __init__(self): self._is_cancelling = False self._is_job_array = None self._update_state_count = 0 + self._completion_time = None + os.environ["SLURM_TIME_FORMAT"] = "standard" + + @property + def completion_time(self): + return self._completion_time def _format_option(self, var, option): if var is not None: @@ -291,7 +298,7 @@ def _update_state(self, job): '''Check the status of the job.''' completed = _run_strict( - 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist' % + 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist,end' % (datetime.now().strftime('%F'), job.jobid) ) self._update_state_count += 1 @@ -303,7 +310,7 @@ def _update_state(self, job): # See (`Job Array Support\d+(?:_\d+|_\[\d+-\d+\])?)\|(?P\S+)([^\|]*)\|' - r'(?P\d+)\:(?P\d+)\|(?P.*)', + r'(?P\d+)\:(?P\d+)\|(?P.*)\|(?P\S+)', completed.stdout, re.MULTILINE)) if not state_match: getlogger().debug('job state not matched (stdout follows)\n%s' % @@ -318,6 +325,10 @@ def _update_state(self, job): if slurm_state_completed(job.state): # Since Slurm exitcodes are positive take the maximum one job.exitcode = max(int(s.group('exitcode')) for s in state_match) + self._completion_time = max( + datetime.strptime(s.group('end'), '%Y-%m-%dT%H:%M:%S') + for s in state_match + ) # Use ',' to join nodes to be consistent with Slurm syntax self._set_nodelist( From 733bbf13eb0f71ec0d1d4f41855e68fd005aefe7 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 6 Dec 2019 13:45:22 +0100 Subject: [PATCH 02/16] Fix long line --- reframe/core/schedulers/slurm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index e5f5c741d0..2841e2bf53 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -310,7 +310,8 @@ def _update_state(self, job): # See (`Job Array Support\d+(?:_\d+|_\[\d+-\d+\])?)\|(?P\S+)([^\|]*)\|' - r'(?P\d+)\:(?P\d+)\|(?P.*)\|(?P\S+)', + r'(?P\d+)\:(?P\d+)\|(?P.*)\|' + r'(?P\S+)', completed.stdout, re.MULTILINE)) if not state_match: getlogger().debug('job state not matched (stdout follows)\n%s' % From e15780e23333ab6be7a6e9b6b96988ade5f9a9cc Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 11 Dec 2019 18:43:58 +0100 Subject: [PATCH 03/16] Address PR comments --- reframe/core/logging.py | 11 ++++------- reframe/core/schedulers/__init__.py | 14 ++++++++++++-- reframe/core/schedulers/slurm.py | 8 +++----- reframe/utility/os_ext.py | 7 +++++-- unittests/test_launchers.py | 4 ++++ 5 files changed, 28 insertions(+), 16 deletions(-) diff --git a/reframe/core/logging.py b/reframe/core/logging.py index adaa249b63..3719bcceab 100644 --- a/reframe/core/logging.py +++ b/reframe/core/logging.py @@ -427,15 +427,12 @@ def _update_check_extras(self): if self.check.job: self.extra['check_jobid'] = self.check.job.jobid - - if self.check.job: if self.check.job.completion_time: - # Format is the same for every handler + # Use the logging handlers' date format to format completion_time + # NOTE: All handlers use the same date format fmt = self.logger.handlers[0].formatter.datefmt - self.extra['check_completion_time'] = ( - self.check.job.completion_time.strftime(fmt)) - else: - self.extra['check_completion_time'] = "NA" + ct = self.check.job.completion_time.strftime(fmt) + self.extra['check_completion_time'] = ct def log_performance(self, level, tag, value, ref, low_thres, upper_thres, unit=None, *, msg=None): diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index fe1d8ea203..30f53cda56 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -15,6 +15,11 @@ class JobScheduler(abc.ABC): + @property + @abc.abstractmethod + def completion_time(self): + pass + @abc.abstractmethod def emit_preamble(self, job): pass @@ -44,6 +49,7 @@ def finished(self, job): pass + class Job: '''A job descriptor. @@ -328,7 +334,7 @@ def wait(self): raise JobNotStartedError('cannot wait an unstarted job') self.scheduler.wait(self) - self._completion_time = datetime.now() + self._completion_time = self._completion_time or datetime.now() def cancel(self): if self.jobid is None: @@ -340,7 +346,11 @@ def finished(self): if self.jobid is None: raise JobNotStartedError('cannot poll an unstarted job') - return self.scheduler.finished(self) + res = self.scheduler.finished(self) + if res: + self._completion_time = self._completion_time or datetime.now() + + return res class Node(abc.ABC): diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 2841e2bf53..53df13aa02 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -90,7 +90,6 @@ def __init__(self): self._is_job_array = None self._update_state_count = 0 self._completion_time = None - os.environ["SLURM_TIME_FORMAT"] = "standard" @property def completion_time(self): @@ -297,10 +296,9 @@ def _set_nodelist(self, job, nodespec): def _update_state(self, job): '''Check the status of the job.''' - completed = _run_strict( - 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist,end' % - (datetime.now().strftime('%F'), job.jobid) - ) + cmd = 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist,end' %( + datetime.now().strftime('%F'), job.jobid) + completed = _run_strict(cmd, env={'SLURM_TIME_FORMAT': 'standard'}) self._update_state_count += 1 # This matches the format for both normal jobs as well as job arrays. diff --git a/reframe/utility/os_ext.py b/reframe/utility/os_ext.py index dc8387fbd5..b9640c6fbd 100644 --- a/reframe/utility/os_ext.py +++ b/reframe/utility/os_ext.py @@ -19,9 +19,10 @@ SpawnedProcessTimeout) -def run_command(cmd, check=False, timeout=None, shell=False): +def run_command(cmd, check=False, timeout=None, shell=False, env=None): try: - proc = run_command_async(cmd, shell=shell, start_new_session=True) + proc = run_command_async(cmd, shell=shell, start_new_session=True, + env=env) proc_stdout, proc_stderr = proc.communicate(timeout=timeout) except subprocess.TimeoutExpired as e: os.killpg(proc.pid, signal.SIGKILL) @@ -65,6 +66,7 @@ def run_command_async(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False, + env=None, **popen_args): # Import logger here to avoid unnecessary circular dependencies from reframe.core.logging import getlogger @@ -78,6 +80,7 @@ def run_command_async(cmd, stderr=stderr, universal_newlines=True, shell=shell, + env=env, **popen_args) diff --git a/unittests/test_launchers.py b/unittests/test_launchers.py index c9ae9c0dae..74d858f430 100644 --- a/unittests/test_launchers.py +++ b/unittests/test_launchers.py @@ -7,6 +7,10 @@ class FakeJobScheduler(JobScheduler): + @property + def completion_time(self): + pass + def emit_preamble(self, job): pass From 3904cdea52845c2722da601d83a98985dc94f1d4 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 11 Dec 2019 18:49:30 +0100 Subject: [PATCH 04/16] Fix PEP8 issues --- reframe/core/logging.py | 3 ++- reframe/core/schedulers/slurm.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/reframe/core/logging.py b/reframe/core/logging.py index 3719bcceab..44f7dadaac 100644 --- a/reframe/core/logging.py +++ b/reframe/core/logging.py @@ -428,7 +428,8 @@ def _update_check_extras(self): if self.check.job: self.extra['check_jobid'] = self.check.job.jobid if self.check.job.completion_time: - # Use the logging handlers' date format to format completion_time + # Use the logging handlers' date format to format + # completion_time # NOTE: All handlers use the same date format fmt = self.logger.handlers[0].formatter.datefmt ct = self.check.job.completion_time.strftime(fmt) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 53df13aa02..2dfba5667f 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -296,8 +296,8 @@ def _set_nodelist(self, job, nodespec): def _update_state(self, job): '''Check the status of the job.''' - cmd = 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist,end' %( - datetime.now().strftime('%F'), job.jobid) + cmd = 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist,end' % ( + datetime.now().strftime('%F'), job.jobid) completed = _run_strict(cmd, env={'SLURM_TIME_FORMAT': 'standard'}) self._update_state_count += 1 From 3853a067935d2b752e31138d617096b41f951130 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Thu, 12 Dec 2019 10:08:13 +0100 Subject: [PATCH 05/16] Fix sacct environment --- reframe/core/schedulers/__init__.py | 6 +++--- reframe/core/schedulers/slurm.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index 30f53cda56..30dbdd3ba0 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -346,11 +346,11 @@ def finished(self): if self.jobid is None: raise JobNotStartedError('cannot poll an unstarted job') - res = self.scheduler.finished(self) - if res: + done = self.scheduler.finished(self) + if done: self._completion_time = self._completion_time or datetime.now() - return res + return done class Node(abc.ABC): diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 2dfba5667f..155707ca27 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -297,8 +297,11 @@ def _update_state(self, job): '''Check the status of the job.''' cmd = 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist,end' % ( - datetime.now().strftime('%F'), job.jobid) - completed = _run_strict(cmd, env={'SLURM_TIME_FORMAT': 'standard'}) + datetime.now().strftime('%F'), job.jobid + ) + sacct_env = os.environ.copy() + sacct_env['SLURM_TIME_FORMAT'] = 'standard' + completed = _run_strict(cmd, env=sacct_env) self._update_state_count += 1 # This matches the format for both normal jobs as well as job arrays. From 380d21e6ef97648de99c4d9fb76814484f2f024f Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 13 Dec 2019 16:38:41 +0100 Subject: [PATCH 06/16] Modify environment for sacct with context manager --- reframe/core/environments.py | 17 +++++++++++++++++ reframe/core/schedulers/slurm.py | 14 +++++++------- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/reframe/core/environments.py b/reframe/core/environments.py index fef08e7221..ffa0cfb251 100644 --- a/reframe/core/environments.py +++ b/reframe/core/environments.py @@ -144,6 +144,23 @@ def emit_load_commands(*environs): return commands +class modify_env: + '''Context manager to temporarily change the environment.''' + + def __init__(self, variables=None): + self._environ_save = snapshot() + self._new_vars = variables + + def __enter__(self): + if self._new_vars: + os.environ.update(self._new_vars) + + return snapshot() + + def __exit__(self, exc_type, exc_value, traceback): + self._environ_save.restore() + + class ProgEnvironment(Environment): '''A class representing a programming environment. diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 155707ca27..d81d0df32c 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -3,11 +3,11 @@ import itertools import re import time -import os from argparse import ArgumentParser from contextlib import suppress from datetime import datetime +import reframe.core.environments as env import reframe.core.schedulers as sched import reframe.utility.os_ext as os_ext from reframe.core.config import settings @@ -296,12 +296,12 @@ def _set_nodelist(self, job, nodespec): def _update_state(self, job): '''Check the status of the job.''' - cmd = 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist,end' % ( - datetime.now().strftime('%F'), job.jobid - ) - sacct_env = os.environ.copy() - sacct_env['SLURM_TIME_FORMAT'] = 'standard' - completed = _run_strict(cmd, env=sacct_env) + with env.modify_env(variables={'SLURM_TIME_FORMAT': 'standard'}): + completed = _run_strict( + 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist,end' % + (datetime.now().strftime('%F'), job.jobid) + ) + self._update_state_count += 1 # This matches the format for both normal jobs as well as job arrays. From f84b7c92cac5d627c4758f225aa1f07910e8d7e1 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Mon, 16 Dec 2019 16:08:43 +0100 Subject: [PATCH 07/16] Address PR comments - Change the environment context manager so that it uses Environment class - Add an option for logging in run_command - Set completion_time of slurm scheduler inside the respective function instead of update_state --- reframe/core/environments.py | 16 ++++----- reframe/core/schedulers/__init__.py | 6 ++-- reframe/core/schedulers/local.py | 3 +- reframe/core/schedulers/pbs.py | 11 +++---- reframe/core/schedulers/slurm.py | 51 +++++++++++++++++++++-------- reframe/utility/os_ext.py | 13 ++++---- unittests/test_launchers.py | 2 +- 7 files changed, 61 insertions(+), 41 deletions(-) diff --git a/reframe/core/environments.py b/reframe/core/environments.py index ffa0cfb251..6af9e062a5 100644 --- a/reframe/core/environments.py +++ b/reframe/core/environments.py @@ -144,18 +144,18 @@ def emit_load_commands(*environs): return commands -class modify_env: +class temp_environment: '''Context manager to temporarily change the environment.''' - def __init__(self, variables=None): - self._environ_save = snapshot() - self._new_vars = variables + def __init__(self, modules=[], variables=[]): + self._modules = modules + self._variables = variables def __enter__(self): - if self._new_vars: - os.environ.update(self._new_vars) - - return snapshot() + self._environ_save, _ = load(Environment(name='temp_env', + modules=self._modules, + variables=self._variables)) + return self._environ_save def __exit__(self, exc_type, exc_value, traceback): self._environ_save.restore() diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index 30dbdd3ba0..943e088deb 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -15,9 +15,8 @@ class JobScheduler(abc.ABC): - @property @abc.abstractmethod - def completion_time(self): + def completion_time(self, job): pass @abc.abstractmethod @@ -49,7 +48,6 @@ def finished(self, job): pass - class Job: '''A job descriptor. @@ -269,7 +267,7 @@ def sched_exclusive_access(self): @property def completion_time(self): - return self.scheduler.completion_time or self._completion_time + return self.scheduler.completion_time(self) or self._completion_time def prepare(self, commands, environs=None, **gen_opts): environs = environs or [] diff --git a/reframe/core/schedulers/local.py b/reframe/core/schedulers/local.py index a7051fad38..cf0eabe2e3 100644 --- a/reframe/core/schedulers/local.py +++ b/reframe/core/schedulers/local.py @@ -30,8 +30,7 @@ def __init__(self): self._f_stdout = None self._f_stderr = None - @property - def completion_time(self): + def completion_time(self, job): return None def submit(self, job): diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py index bdb4d3e21b..418e3a4bad 100644 --- a/reframe/core/schedulers/pbs.py +++ b/reframe/core/schedulers/pbs.py @@ -31,14 +31,13 @@ class PbsJobScheduler(sched.JobScheduler): def __init__(self): self._prefix = '#PBS' - self._completion_time = None + self._time_finished = None # Optional part of the job id refering to the PBS server self._pbs_server = None - @property - def completion_time(self): - return self._completion_time + def completion_time(self, job): + return None def _emit_lselect_option(self, job): num_tasks_per_node = job.num_tasks_per_node or 1 @@ -131,7 +130,7 @@ def finished(self, job): if done: t_now = datetime.now() - self._completion_time = self._completion_time or t_now - time_from_finish = (t_now - self._completion_time).total_seconds() + self._time_finished = self._time_finished or t_now + time_from_finish = (t_now - self._time_finished).total_seconds() return done and time_from_finish > PBS_OUTPUT_WRITEBACK_WAIT diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index d81d0df32c..41dbd6ce26 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -91,8 +91,35 @@ def __init__(self): self._update_state_count = 0 self._completion_time = None - @property - def completion_time(self): + def completion_time(self, job): + if (self._completion_time or + not slurm_state_completed(job.state)): + return self._completion_time + + slurm_variable = [('SLURM_TIME_FORMAT', 'standard')] + with env.temp_environment(variables=slurm_variable): + completed = _run_strict( + 'sacct -S %s -P -j %s -o jobid,end' % + (datetime.now().strftime('%F'), job.jobid), + log=False + ) + + # This matches the format for both normal jobs as well as job arrays. + # For job arrays the job_id has one of the following formats: + # * _ + # * _[-] + # See (`Job Array Support\d+(?:_\d+|_\[\d+-\d+\])?)\|(?P\S+)', + completed.stdout, re.MULTILINE)) + if not state_match: + return self._completion_time + + self._completion_time = max( + datetime.strptime(s.group('end'), '%Y-%m-%dT%H:%M:%S') + for s in state_match + ) + return self._completion_time def _format_option(self, var, option): @@ -296,12 +323,10 @@ def _set_nodelist(self, job, nodespec): def _update_state(self, job): '''Check the status of the job.''' - with env.modify_env(variables={'SLURM_TIME_FORMAT': 'standard'}): - completed = _run_strict( - 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist,end' % - (datetime.now().strftime('%F'), job.jobid) - ) - + completed = _run_strict( + 'sacct -S %s -P -j %s -o jobid,state,exitcode,nodelist' % + (datetime.now().strftime('%F'), job.jobid) + ) self._update_state_count += 1 # This matches the format for both normal jobs as well as job arrays. @@ -311,8 +336,7 @@ def _update_state(self, job): # See (`Job Array Support\d+(?:_\d+|_\[\d+-\d+\])?)\|(?P\S+)([^\|]*)\|' - r'(?P\d+)\:(?P\d+)\|(?P.*)\|' - r'(?P\S+)', + r'(?P\d+)\:(?P\d+)\|(?P.*)', completed.stdout, re.MULTILINE)) if not state_match: getlogger().debug('job state not matched (stdout follows)\n%s' % @@ -327,10 +351,6 @@ def _update_state(self, job): if slurm_state_completed(job.state): # Since Slurm exitcodes are positive take the maximum one job.exitcode = max(int(s.group('exitcode')) for s in state_match) - self._completion_time = max( - datetime.strptime(s.group('end'), '%Y-%m-%dT%H:%M:%S') - for s in state_match - ) # Use ',' to join nodes to be consistent with Slurm syntax self._set_nodelist( @@ -451,6 +471,9 @@ def __init__(self): self._squeue_delay = 2 self._cancelled = False + def completion_time(self, job): + return None + def submit(self, job): super().submit(job) self._submit_time = datetime.now() diff --git a/reframe/utility/os_ext.py b/reframe/utility/os_ext.py index b9640c6fbd..cbb36ff957 100644 --- a/reframe/utility/os_ext.py +++ b/reframe/utility/os_ext.py @@ -19,10 +19,10 @@ SpawnedProcessTimeout) -def run_command(cmd, check=False, timeout=None, shell=False, env=None): +def run_command(cmd, check=False, timeout=None, shell=False, log=True): try: proc = run_command_async(cmd, shell=shell, start_new_session=True, - env=env) + log=log) proc_stdout, proc_stderr = proc.communicate(timeout=timeout) except subprocess.TimeoutExpired as e: os.killpg(proc.pid, signal.SIGKILL) @@ -66,12 +66,14 @@ def run_command_async(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False, - env=None, + log=True, **popen_args): # Import logger here to avoid unnecessary circular dependencies - from reframe.core.logging import getlogger + if log: + from reframe.core.logging import getlogger + + getlogger().debug('executing OS command: ' + cmd) - getlogger().debug('executing OS command: ' + cmd) if not shell: cmd = shlex.split(cmd) @@ -80,7 +82,6 @@ def run_command_async(cmd, stderr=stderr, universal_newlines=True, shell=shell, - env=env, **popen_args) diff --git a/unittests/test_launchers.py b/unittests/test_launchers.py index 74d858f430..a1df4e57eb 100644 --- a/unittests/test_launchers.py +++ b/unittests/test_launchers.py @@ -8,7 +8,7 @@ class FakeJobScheduler(JobScheduler): @property - def completion_time(self): + def completion_time(self, job): pass def emit_preamble(self, job): From db48d69ac1ba20d172e7c8f3639ba62a3ff80689 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 17 Dec 2019 16:53:19 +0100 Subject: [PATCH 08/16] Address PR comments --- reframe/core/schedulers/slurm.py | 33 +++++++++++++++----------------- reframe/utility/os_ext.py | 1 - unittests/test_environments.py | 9 +++++++++ 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 41dbd6ce26..6436131849 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -70,6 +70,13 @@ class SlurmJobScheduler(sched.JobScheduler): # standard job state polling using sacct. SACCT_SQUEUE_RATIO = 10 + # This matches the format for both normal jobs as well as job arrays. + # For job arrays the job_id has one of the following formats: + # * _ + # * _[-] + # See (`Job Array Support_ - # * _[-] - # See (`Job Array Support\d+(?:_\d+|_\[\d+-\d+\])?)\|(?P\S+)', + r'^(?P%s)\|(?P\S+)' % self._state_patt, completed.stdout, re.MULTILINE)) if not state_match: - return self._completion_time + return None self._completion_time = max( datetime.strptime(s.group('end'), '%Y-%m-%dT%H:%M:%S') @@ -329,14 +331,9 @@ def _update_state(self, job): ) self._update_state_count += 1 - # This matches the format for both normal jobs as well as job arrays. - # For job arrays the job_id has one of the following formats: - # * _ - # * _[-] - # See (`Job Array Support\d+(?:_\d+|_\[\d+-\d+\])?)\|(?P\S+)([^\|]*)\|' - r'(?P\d+)\:(?P\d+)\|(?P.*)', + r'^(?P%s)\|(?P\S+)([^\|]*)\|(?P\d+)\:' + r'(?P\d+)\|(?P.*)' % self._state_patt, completed.stdout, re.MULTILINE)) if not state_match: getlogger().debug('job state not matched (stdout follows)\n%s' % @@ -345,7 +342,7 @@ def _update_state(self, job): # Join the states with ',' in case of job arrays job.state = ','.join(s.group('state') for s in state_match) - if not self._update_state_count % SlurmJobScheduler.SACCT_SQUEUE_RATIO: + if not self._update_state_count % self.SACCT_SQUEUE_RATIO: self._cancel_if_blocked(job) if slurm_state_completed(job.state): diff --git a/reframe/utility/os_ext.py b/reframe/utility/os_ext.py index cbb36ff957..917e353d5d 100644 --- a/reframe/utility/os_ext.py +++ b/reframe/utility/os_ext.py @@ -71,7 +71,6 @@ def run_command_async(cmd, # Import logger here to avoid unnecessary circular dependencies if log: from reframe.core.logging import getlogger - getlogger().debug('executing OS command: ' + cmd) if not shell: diff --git a/unittests/test_environments.py b/unittests/test_environments.py index 47741e9d84..f718b3e9db 100644 --- a/unittests/test_environments.py +++ b/unittests/test_environments.py @@ -88,6 +88,15 @@ def test_load_restore(self): assert not self.environ.is_loaded + @fixtures.switch_to_user_runtime + def test_temp_environment_context_manager(self): + with env.temp_environment(variables=[('_var0', 'val2'), + ('_var3', 'val3')], + modules=['testmod_foo']): + assert os.environ['_var0'] == 'val2' + assert os.environ['_var3'] == 'val3' + assert self.environ_save == env.snapshot() + @fixtures.switch_to_user_runtime def test_load_already_present(self): self.setup_modules_system() From de5294ce54eb8a478848dd0eac7f197a4c67f35d Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 17 Dec 2019 17:52:36 +0100 Subject: [PATCH 09/16] Return env in temp_environment context manager --- reframe/core/environments.py | 7 +++---- unittests/test_environments.py | 11 +++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/reframe/core/environments.py b/reframe/core/environments.py index 6af9e062a5..11f2d07c73 100644 --- a/reframe/core/environments.py +++ b/reframe/core/environments.py @@ -152,10 +152,9 @@ def __init__(self, modules=[], variables=[]): self._variables = variables def __enter__(self): - self._environ_save, _ = load(Environment(name='temp_env', - modules=self._modules, - variables=self._variables)) - return self._environ_save + new_env = Environment('_rfm_temp_env', self._modules, self._variables) + self._environ_save, _ = load(new_env) + return new_env def __exit__(self, exc_type, exc_value, traceback): self._environ_save.restore() diff --git a/unittests/test_environments.py b/unittests/test_environments.py index f718b3e9db..1e96ada453 100644 --- a/unittests/test_environments.py +++ b/unittests/test_environments.py @@ -90,12 +90,11 @@ def test_load_restore(self): @fixtures.switch_to_user_runtime def test_temp_environment_context_manager(self): - with env.temp_environment(variables=[('_var0', 'val2'), - ('_var3', 'val3')], - modules=['testmod_foo']): - assert os.environ['_var0'] == 'val2' - assert os.environ['_var3'] == 'val3' - assert self.environ_save == env.snapshot() + with env.temp_environment(['testmod_foo'], + [('_var0', 'val2'), ('_var3', 'val3')]) as environ: + assert environ.is_loaded + + assert not environ.is_loaded @fixtures.switch_to_user_runtime def test_load_already_present(self): From 7f1ff463f8b3314f37021707bb846c28d44d0128 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 18 Dec 2019 09:14:20 +0100 Subject: [PATCH 10/16] Fix PEP8 issues --- reframe/core/schedulers/slurm.py | 2 +- unittests/test_environments.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 6436131849..50f419825e 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -104,7 +104,7 @@ def completion_time(self, job): return self._completion_time with env.temp_environment( - variables=[('SLURM_TIME_FORMAT', 'standard')]): + variables=[('SLURM_TIME_FORMAT', 'standard')]): completed = os_ext.run_command( 'sacct -S %s -P -j %s -o jobid,end' % (datetime.now().strftime('%F'), job.jobid), diff --git a/unittests/test_environments.py b/unittests/test_environments.py index 1e96ada453..4fd7311dea 100644 --- a/unittests/test_environments.py +++ b/unittests/test_environments.py @@ -91,7 +91,8 @@ def test_load_restore(self): @fixtures.switch_to_user_runtime def test_temp_environment_context_manager(self): with env.temp_environment(['testmod_foo'], - [('_var0', 'val2'), ('_var3', 'val3')]) as environ: + [('_var0', 'val2'), + ('_var3', 'val3')]) as environ: assert environ.is_loaded assert not environ.is_loaded From b5db7dcd4beeb602e2649eb3414eb1c431429245 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 18 Dec 2019 12:02:31 +0100 Subject: [PATCH 11/16] Remove fixtures decorator --- unittests/test_environments.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unittests/test_environments.py b/unittests/test_environments.py index 4fd7311dea..576b0dcceb 100644 --- a/unittests/test_environments.py +++ b/unittests/test_environments.py @@ -88,7 +88,6 @@ def test_load_restore(self): assert not self.environ.is_loaded - @fixtures.switch_to_user_runtime def test_temp_environment_context_manager(self): with env.temp_environment(['testmod_foo'], [('_var0', 'val2'), From 5340b83e764fba74e186b62d104108245eb6f577 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 18 Dec 2019 15:38:27 +0100 Subject: [PATCH 12/16] Add back fixtures after merging with master --- unittests/test_environments.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unittests/test_environments.py b/unittests/test_environments.py index 576b0dcceb..4fd7311dea 100644 --- a/unittests/test_environments.py +++ b/unittests/test_environments.py @@ -88,6 +88,7 @@ def test_load_restore(self): assert not self.environ.is_loaded + @fixtures.switch_to_user_runtime def test_temp_environment_context_manager(self): with env.temp_environment(['testmod_foo'], [('_var0', 'val2'), From 93ca17b85675ecd7d8c1f937aca8f5a0e1bac1a1 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 18 Dec 2019 16:32:54 +0100 Subject: [PATCH 13/16] Fix unit tests using modules systems Plus some minor fine tuning of code style. --- reframe/core/schedulers/slurm.py | 3 +-- unittests/test_environments.py | 9 +++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 50f419825e..769048c393 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -103,8 +103,7 @@ def completion_time(self, job): not slurm_state_completed(job.state)): return self._completion_time - with env.temp_environment( - variables=[('SLURM_TIME_FORMAT', 'standard')]): + with env.temp_environment(variables={'SLURM_TIME_FORMAT': 'standard'}): completed = os_ext.run_command( 'sacct -S %s -P -j %s -o jobid,end' % (datetime.now().strftime('%F'), job.jobid), diff --git a/unittests/test_environments.py b/unittests/test_environments.py index 4fd7311dea..7d22c51b2a 100644 --- a/unittests/test_environments.py +++ b/unittests/test_environments.py @@ -89,10 +89,11 @@ def test_load_restore(self): assert not self.environ.is_loaded @fixtures.switch_to_user_runtime - def test_temp_environment_context_manager(self): - with env.temp_environment(['testmod_foo'], - [('_var0', 'val2'), - ('_var3', 'val3')]) as environ: + def test_temp_environment(self): + self.setup_modules_system() + with env.temp_environment( + ['testmod_foo'], {'_var0': 'val2', '_var3': 'val3'} + ) as environ: assert environ.is_loaded assert not environ.is_loaded From cc16120f6e16c8990cb5c5944e36f574320a3ddf Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 18 Dec 2019 17:54:33 +0100 Subject: [PATCH 14/16] Add documentation for check_completion_time --- docs/running.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/running.rst b/docs/running.rst index 396cb0ff43..c46adf07d5 100644 --- a/docs/running.rst +++ b/docs/running.rst @@ -840,6 +840,11 @@ All handlers accept the following set of attributes (keys) in their configuratio * ``format`` (default: ``'%(message)s'``): Format string for the printout of the log record. ReFrame supports all the `format strings `__ from Python's logging library and provides the following additional ones: + - ``check_completion_time``: A timestamp with the completion time of the regression test. + For the ``slurm`` scheduler, the timestamp relies on job accounting (``sacct`` command) and returns the termination time of the job. + The rest of the schedulers report as completion time the moment the framework realizes the test has finished, as soon as the ``wait`` phase ends. + The accuracy is dependent on the polling rate of the ``wait`` phase and can be lower for tests that take more time. + If a job or process is not yet completed, ``None`` will be printed. - ``check_environ``: The programming environment a test is currently executing for. - ``check_info``: Print live information of the currently executing check. By default this field has the form `` on using ``. From 8af1098fc3f3728960c98d5c5c026ce74222ef2f Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 18 Dec 2019 17:58:21 +0100 Subject: [PATCH 15/16] Add documentation for check_completion_time --- docs/running.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/running.rst b/docs/running.rst index c46adf07d5..28a4635844 100644 --- a/docs/running.rst +++ b/docs/running.rst @@ -841,7 +841,7 @@ All handlers accept the following set of attributes (keys) in their configuratio ReFrame supports all the `format strings `__ from Python's logging library and provides the following additional ones: - ``check_completion_time``: A timestamp with the completion time of the regression test. - For the ``slurm`` scheduler, the timestamp relies on job accounting (``sacct`` command) and returns the termination time of the job. + The ``slurm`` scheduler relies on job accounting (``sacct`` command) and returns the actual termination time of the job. The rest of the schedulers report as completion time the moment the framework realizes the test has finished, as soon as the ``wait`` phase ends. The accuracy is dependent on the polling rate of the ``wait`` phase and can be lower for tests that take more time. If a job or process is not yet completed, ``None`` will be printed. From 7a359d6fe4d5666d3503b5d713141f0a12340388 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 18 Dec 2019 19:47:29 +0100 Subject: [PATCH 16/16] Changed log variable to check_job_completion_time Also: - Fine tune documentation --- docs/running.rst | 13 ++++++++----- reframe/core/logging.py | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/docs/running.rst b/docs/running.rst index 28a4635844..448134e894 100644 --- a/docs/running.rst +++ b/docs/running.rst @@ -840,17 +840,20 @@ All handlers accept the following set of attributes (keys) in their configuratio * ``format`` (default: ``'%(message)s'``): Format string for the printout of the log record. ReFrame supports all the `format strings `__ from Python's logging library and provides the following additional ones: - - ``check_completion_time``: A timestamp with the completion time of the regression test. - The ``slurm`` scheduler relies on job accounting (``sacct`` command) and returns the actual termination time of the job. - The rest of the schedulers report as completion time the moment the framework realizes the test has finished, as soon as the ``wait`` phase ends. - The accuracy is dependent on the polling rate of the ``wait`` phase and can be lower for tests that take more time. - If a job or process is not yet completed, ``None`` will be printed. - ``check_environ``: The programming environment a test is currently executing for. - ``check_info``: Print live information of the currently executing check. By default this field has the form `` on using ``. It can be configured on a per test basis by overriding the :func:`info ` method of a specific regression test. - ``check_jobid``: Prints the job or process id of the job or process associated with the currently executing regression test. If a job or process is not yet created, ``-1`` will be printed. + - ``check_job_completion_time``: *[new in 2.21]* The completion time of the job spawned by this regression test. + This timestamp will be formatted according to ``datefmt`` (see below). + The accuracy of the timestamp depends on the backend scheduler. + The ``slurm`` scheduler backend relies on job accounting and returns the actual termination time of the job. + The rest of the backends report as completion time the moment when the framework realizes that the spawned job has finished. + In this case, the accuracy depends on the execution policy used. + If tests are executed with the serial execution policy, this is close to the real completion time, but if the asynchronous execution policy is used, it can differ significantly. + If the job completion time cannot be retrieved, ``None`` will be printed. - ``check_name``: Prints the name of the regression test on behalf of which ReFrame is currently executing. If ReFrame is not in the context of regression test, ``reframe`` will be printed. - ``check_num_tasks``: The number of tasks assigned to the regression test. diff --git a/reframe/core/logging.py b/reframe/core/logging.py index 44f7dadaac..1b021246cf 100644 --- a/reframe/core/logging.py +++ b/reframe/core/logging.py @@ -367,6 +367,7 @@ def __init__(self, logger=None, check=None): { 'check_name': 'reframe', 'check_jobid': '-1', + 'check_job_completion_time': None, 'check_info': 'reframe', 'check_system': None, 'check_partition': None, @@ -374,7 +375,6 @@ def __init__(self, logger=None, check=None): 'check_outputdir': None, 'check_stagedir': None, 'check_num_tasks': None, - 'check_completion_time': None, 'check_perf_var': None, 'check_perf_value': None, 'check_perf_ref': None, @@ -433,7 +433,7 @@ def _update_check_extras(self): # NOTE: All handlers use the same date format fmt = self.logger.handlers[0].formatter.datefmt ct = self.check.job.completion_time.strftime(fmt) - self.extra['check_completion_time'] = ct + self.extra['check_job_completion_time'] = ct def log_performance(self, level, tag, value, ref, low_thres, upper_thres, unit=None, *, msg=None):