diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 3e7e3ada6a..d961d46b40 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -34,6 +34,13 @@ class SlurmJobState(sched.JobState): @register_scheduler('slurm') class SlurmJob(sched.Job): + # In some systems, scheduler performance is sensitive to the squeue poll + # ratio. In this backend, squeue is used to obtain the reason a job is + # blocked, so as to cancel it if it is blocked indefinitely. The following + # variable controls the frequency of squeue polling compared to the + # standard job state polling using sacct. + SACCT_SQUEUE_RATIO = 10 + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._prefix = '#SBATCH' @@ -61,6 +68,7 @@ def __init__(self, *args, **kwargs): 'ReqNodeNotAvail', # Inaccurate SLURM doc 'QOSUsageThreshold'] self._is_cancelling = False + self._update_state_count = 0 def _emit_job_option(self, var, option, builder): if var is not None: @@ -144,7 +152,7 @@ def prepare(self, builder): def submit(self): cmd = 'sbatch %s' % self.script_filename completed = self._run_command(cmd, settings().job_submit_timeout) - jobid_match = re.search('Submitted batch job (?P<jobid>\d+)', + jobid_match = re.search(r'Submitted batch job (?P<jobid>\d+)', completed.stdout) if not jobid_match: raise JobError( @@ -183,7 +191,7 @@ def _count_compatible_nodes(self, nodes): def _get_reservation_nodes(self): command = 'scontrol show res %s' % self.sched_reservation completed = os_ext.run_command(command, check=True) - node_match = re.search('(Nodes=\S+)', completed.stdout) + node_match = re.search(r'(Nodes=\S+)', completed.stdout) if node_match: reservation_nodes = node_match[1] else: @@ -217,6 +225,7 @@ def _update_state(self): 'sacct -S %s -P -j %s -o jobid,state,exitcode' % (datetime.now().strftime('%F'), self._jobid) ) + self._update_state_count += 1 state_match = re.search(r'^(?P<jobid>\d+)\|(?P<state>\S+)([^\|]*)\|' r'(?P<exitcode>\d+)\:(?P<signal>\d+)', 
completed.stdout, re.MULTILINE) @@ -226,7 +235,10 @@ def _update_state(self): return self._state = SlurmJobState(state_match.group('state')) - self._cancel_if_blocked() + + if not self._update_state_count % SlurmJob.SACCT_SQUEUE_RATIO: + self._cancel_if_blocked() + if self._state in self._completion_states: self._exitcode = int(state_match.group('exitcode'))