From 13d5bc6f5cdb2bdaf316239255cb88f128dd95ad Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Fri, 25 May 2018 10:32:26 +0200 Subject: [PATCH 1/3] Introduce SACCT_SQUEUE_RATIO to reduce squeue calls * Reduce the `squeue` rate by introducing the constant `SACCT_SQUEUE_RATIO`. * Fix some warnings regarding regexes. --- reframe/core/schedulers/slurm.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 3e7e3ada6a..a77f105bb6 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -31,6 +31,9 @@ class SlurmJobState(sched.JobState): SLURM_JOB_SUSPENDED = SlurmJobState('SUSPENDED') SLURM_JOB_TIMEOUT = SlurmJobState('TIMEOUT') +# Number of _update_state calls per which _cancel_if_blocked is called +SACCT_SQUEUE_RATIO = 20 + @register_scheduler('slurm') class SlurmJob(sched.Job): @@ -61,6 +64,7 @@ def __init__(self, *args, **kwargs): 'ReqNodeNotAvail', # Inaccurate SLURM doc 'QOSUsageThreshold'] self._is_cancelling = False + self._update_state_count = 0 def _emit_job_option(self, var, option, builder): if var is not None: @@ -144,7 +148,7 @@ def prepare(self, builder): def submit(self): cmd = 'sbatch %s' % self.script_filename completed = self._run_command(cmd, settings().job_submit_timeout) - jobid_match = re.search('Submitted batch job (?P<jobid>\d+)', + jobid_match = re.search(r'Submitted batch job (?P<jobid>\d+)', completed.stdout) if not jobid_match: raise JobError( @@ -183,7 +187,7 @@ def _count_compatible_nodes(self, nodes): def _get_reservation_nodes(self): command = 'scontrol show res %s' % self.sched_reservation completed = os_ext.run_command(command, check=True) - node_match = re.search('(Nodes=\S+)', completed.stdout) + node_match = re.search(r'(Nodes=\S+)', completed.stdout) if node_match: reservation_nodes = node_match[1] else: @@ -217,6 +221,7 @@ def _update_state(self): 'sacct -S %s -P -j %s -o jobid,state,exitcode' % 
(datetime.now().strftime('%F'), self._jobid) ) + self._update_state_count += 1 state_match = re.search(r'^(?P<jobid>\d+)\|(?P<state>\S+)([^\|]*)\|' r'(?P<exitcode>\d+)\:(?P<signal>\d+)', completed.stdout, re.MULTILINE) @@ -226,7 +231,11 @@ def _update_state(self): return self._state = SlurmJobState(state_match.group('state')) - self._cancel_if_blocked() + + if self._update_state_count == SACCT_SQUEUE_RATIO: + self._update_state_count = 0 + self._cancel_if_blocked() + if self._state in self._completion_states: self._exitcode = int(state_match.group('exitcode')) From 42409e4eec66a1b4b82ef98605993e63cbeb2b8e Mon Sep 17 00:00:00 2001 From: Theofilos Manitaras Date: Mon, 28 May 2018 15:06:38 +0200 Subject: [PATCH 2/3] Address PR comments --- reframe/core/schedulers/slurm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index a77f105bb6..8291913410 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -31,12 +31,13 @@ class SlurmJobState(sched.JobState): SLURM_JOB_SUSPENDED = SlurmJobState('SUSPENDED') SLURM_JOB_TIMEOUT = SlurmJobState('TIMEOUT') -# Number of _update_state calls per which _cancel_if_blocked is called -SACCT_SQUEUE_RATIO = 20 - @register_scheduler('slurm') class SlurmJob(sched.Job): + # The following ratio was introduced in order to reduce the number of + # squeue calls during asynchronous execution of ReFrame + SACCT_SQUEUE_RATIO = 10 + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._prefix = '#SBATCH' @@ -232,8 +233,7 @@ def _update_state(self): self._state = SlurmJobState(state_match.group('state')) - if self._update_state_count == SACCT_SQUEUE_RATIO: - self._update_state_count = 0 + if not self._update_state_count % SlurmJob.SACCT_SQUEUE_RATIO: self._cancel_if_blocked() if self._state in self._completion_states: From bb30865866891ac8753650d68b3468eff84d2d57 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 28 
May 2018 15:51:04 +0200 Subject: [PATCH 3/3] Improve description of SACCT_SQUEUE_RATIO variable --- reframe/core/schedulers/slurm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 8291913410..d961d46b40 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -34,8 +34,11 @@ class SlurmJobState(sched.JobState): @register_scheduler('slurm') class SlurmJob(sched.Job): - # The following ratio was introduced in order to reduce the number of - # squeue calls during asynchronous execution of ReFrame + # In some systems, scheduler performance is sensitive to the squeue poll + # ratio. In this backend, squeue is used to obtain the reason a job is + # blocked, so as to cancel it if it is blocked indefinitely. The following + # variable controls the frequency of squeue polling compared to the + # standard job state polling using sacct. SACCT_SQUEUE_RATIO = 10 def __init__(self, *args, **kwargs):