Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions reframe/core/schedulers/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ class SlurmJobState(sched.JobState):

@register_scheduler('slurm')
class SlurmJob(sched.Job):
# On some systems, scheduler performance is sensitive to how often squeue is
# polled. In this backend, squeue is used to obtain the reason a job is
# blocked, so that the job can be cancelled if it is blocked indefinitely.
# The following variable sets how many standard job state polls (done with
# sacct) are performed for each squeue poll.
SACCT_SQUEUE_RATIO = 10

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._prefix = '#SBATCH'
Expand Down Expand Up @@ -61,6 +68,7 @@ def __init__(self, *args, **kwargs):
'ReqNodeNotAvail', # Inaccurate SLURM doc
'QOSUsageThreshold']
self._is_cancelling = False
self._update_state_count = 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it state or status?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@victorusu I think 'state' is fine.


def _emit_job_option(self, var, option, builder):
if var is not None:
Expand Down Expand Up @@ -144,7 +152,7 @@ def prepare(self, builder):
def submit(self):
cmd = 'sbatch %s' % self.script_filename
completed = self._run_command(cmd, settings().job_submit_timeout)
jobid_match = re.search('Submitted batch job (?P<jobid>\d+)',
jobid_match = re.search(r'Submitted batch job (?P<jobid>\d+)',
completed.stdout)
if not jobid_match:
raise JobError(
Expand Down Expand Up @@ -183,7 +191,7 @@ def _count_compatible_nodes(self, nodes):
def _get_reservation_nodes(self):
command = 'scontrol show res %s' % self.sched_reservation
completed = os_ext.run_command(command, check=True)
node_match = re.search('(Nodes=\S+)', completed.stdout)
node_match = re.search(r'(Nodes=\S+)', completed.stdout)
if node_match:
reservation_nodes = node_match[1]
else:
Expand Down Expand Up @@ -217,6 +225,7 @@ def _update_state(self):
'sacct -S %s -P -j %s -o jobid,state,exitcode' %
(datetime.now().strftime('%F'), self._jobid)
)
self._update_state_count += 1
state_match = re.search(r'^(?P<jobid>\d+)\|(?P<state>\S+)([^\|]*)\|'
r'(?P<exitcode>\d+)\:(?P<signal>\d+)',
completed.stdout, re.MULTILINE)
Expand All @@ -226,7 +235,10 @@ def _update_state(self):
return

self._state = SlurmJobState(state_match.group('state'))
self._cancel_if_blocked()

if not self._update_state_count % SlurmJob.SACCT_SQUEUE_RATIO:
self._cancel_if_blocked()

if self._state in self._completion_states:
self._exitcode = int(state_match.group('exitcode'))

Expand Down