-
Notifications
You must be signed in to change notification settings - Fork 117
[feat] Add SGE scheduler backend #1959
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
2f7e83f
Minimally working SGE scheduler
giordano db90f78
Use XML output
giordano aae98c4
Inherit `SgeJobScheduler` from `PbsJobScheduler`
giordano 624c44e
Apply suggestions from code review
giordano dc60d73
Some polling improvements to the SGE scheduler
1f6cfb3
Simplify completion assessment
b66901f
Merge pull request #1 from vkarak/mg/sge-scheduler-improvements
giordano d90431c
Fix variable names and improve error message
giordano ab5fcc2
Update documentation
b954398
WIP: Add unit test
833357c
WIP: Add unit test
f364ab5
Remove `-lselect` option from the SGE scheduler + add unit tests
393fc9e
Document how slots can be defined and used with the SGE backend
2bcc6f6
Merge branch 'master' into mg/sge-scheduler
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,146 @@ | ||
| # Copyright 2016-2021 Swiss National Supercomputing Centre (CSCS/ETH Zurich) | ||
| # ReFrame Project Developers. See the top-level LICENSE file for details. | ||
| # | ||
| # SPDX-License-Identifier: BSD-3-Clause | ||
|
|
||
| # | ||
| # SGE backend | ||
| # | ||
| # - Initial version submitted by Mosè Giordano, UCL (based on the PBS backend) | ||
| # | ||
|
|
||
| import functools | ||
| import re | ||
| import time | ||
| import xml.etree.ElementTree as ET | ||
|
|
||
| import reframe.core.runtime as rt | ||
| import reframe.utility.osext as osext | ||
| from reframe.core.backends import register_scheduler | ||
| from reframe.core.exceptions import JobSchedulerError | ||
| from reframe.core.schedulers.pbs import PbsJobScheduler | ||
| from reframe.utility import seconds_to_hms | ||
|
|
||
| # `osext.run_command` with `check=True` pre-bound; used for job submission | ||
| # (presumably this makes a failing command raise -- TODO confirm in osext). | ||
| _run_strict = functools.partial(osext.run_command, check=True) | ||
|
|
||
|
|
||
@register_scheduler('sge')
class SgeJobScheduler(PbsJobScheduler):
    '''Scheduler backend for SGE, registered under the name ``sge``.

    The backend derives from the PBS backend and overrides the directive
    prefix, the preamble generation, the job submission and the job polling.
    Jobs are submitted with ``qsub`` and polled with ``qstat -xml``.
    '''

    # States reported by ``qstat`` grouped by the job state recorded by this
    # backend. For the list of known statuses see `man 5 sge_status`
    # (https://arc.liv.ac.uk/SGE/htmlman/htmlman5/sge_status.html)
    _SGE_STATE_GROUPS = (
        ('RUNNING', ('r', 'hr', 't', 'Rr', 'Rt')),
        ('PENDING', ('qw', 'Rq', 'hqw', 'hRwq')),
        ('SUSPENDED', ('s', 'ts', 'S', 'tS', 'T', 'tT', 'Rs',
                       'Rts', 'RS', 'RtS', 'RT', 'RtT')),
        ('ERROR', ('Eqw', 'Ehqw', 'EhRqw')),
        ('DELETING', ('dr', 'dt', 'dRr', 'dRt', 'ds',
                      'dS', 'dT', 'dRs', 'dRS', 'dRT')),
        ('COMPLETED', ('z',)),
    )

    # Flat lookup table: SGE state -> job state
    _SGE_JOB_STATES = {
        sge_state: job_state
        for job_state, sge_states in _SGE_STATE_GROUPS
        for sge_state in sge_states
    }

    def __init__(self):
        '''Set the SGE directive prefix and read the job submit timeout
        from the runtime configuration.'''

        self._prefix = '#$'
        self._submit_timeout = rt.runtime().get_option(
            f'schedulers/@{self.registered_name}/job_submit_timeout'
        )

    def emit_preamble(self, job):
        '''Return the list of job script preamble lines for ``job``.

        The preamble sets the job name, the standard output and error files
        and the working directory, followed by the run time limit (only if
        ``job.time_limit`` is set) and any additional options found in
        ``job.options`` and ``job.cli_options``.

        :arg job: The job to generate the preamble for.
        :returns: A list of strings, one per preamble line.
        '''

        # NOTE: `_format_option()` is not defined in this class; it is
        # presumably inherited from the PBS backend.
        preamble = [
            self._format_option(f'-N "{job.name}"'),
            self._format_option(f'-o {job.stdout}'),
            self._format_option(f'-e {job.stderr}'),
            self._format_option(f'-wd {job.workdir}')
        ]

        if job.time_limit is not None:
            h, m, s = seconds_to_hms(job.time_limit)
            preamble.append(
                self._format_option('-l h_rt=%d:%d:%d' % (h, m, s))
            )

        # Emit the rest of the options; options that already start with `#`
        # are emitted verbatim, all the others are formatted as directives
        for opt in job.options + job.cli_options:
            if opt.startswith('#'):
                preamble.append(opt)
            else:
                preamble.append(self._format_option(opt))

        return preamble

    def submit(self, job):
        '''Submit ``job`` with ``qsub`` and record its id and submit time.

        :arg job: The job to submit.
        :raises reframe.core.exceptions.JobSchedulerError: If the job id
            cannot be retrieved from the output of ``qsub``.
        '''

        # `-o` and `-e` options are only recognized in command line by the PBS,
        # SGE, and Slurm wrappers.
        cmd = f'qsub -o {job.stdout} -e {job.stderr} {job.script_filename}'
        completed = _run_strict(cmd, timeout=self._submit_timeout)
        jobid_match = re.search(r'^Your job (?P<jobid>\S+)', completed.stdout)
        if not jobid_match:
            raise JobSchedulerError('could not retrieve the job id '
                                    'of the submitted job')

        job._jobid = jobid_match.group('jobid')
        job._submit_time = time.time()

    def poll(self, *jobs):
        '''Update the state of ``jobs`` from the output of ``qstat -xml``.

        ``None`` entries in ``jobs`` are ignored. Any job that is polled but
        is not listed by ``qstat`` for the current user is assumed to have
        completed.

        :arg jobs: The jobs to poll.
        :raises reframe.core.exceptions.JobSchedulerError: If ``qstat``
            exits with a non-zero exit code or if its output cannot be
            parsed as XML.
        '''

        if jobs:
            # Filter out non-jobs
            jobs = [job for job in jobs if job is not None]

        if not jobs:
            return

        user = osext.osuser()
        completed = osext.run_command(f'qstat -xml -u {user}')
        if completed.returncode != 0:
            raise JobSchedulerError(
                f'qstat failed with exit code {completed.returncode} '
                f'(standard error follows):\n{completed.stderr}'
            )

        # Index the jobs to poll on their jobid
        jobs_to_poll = {job.jobid: job for job in jobs}

        # Parse the XML; report malformed output as a scheduler error
        try:
            root = ET.fromstring(completed.stdout)
        except ET.ParseError as err:
            raise JobSchedulerError(
                f'could not parse the XML output of qstat: {err}'
            ) from err

        # We are iterating over the returned XML and update the status of the
        # jobs relevant to ReFrame; the naming convention of variables matches
        # that of SGE's XML output

        known_jobs = set()    # jobs known to the SGE scheduler
        for queue_info in root:
            for job_list in queue_info:
                # `findtext()` returns `None` for a missing element, so
                # incomplete entries are skipped instead of raising
                if job_list.findtext('JB_owner') != user:
                    # Not a job of this user.
                    continue

                jobid = job_list.findtext('JB_job_number')
                if not jobid or jobid not in jobs_to_poll:
                    # Not a reframe job
                    continue

                job = jobs_to_poll[jobid]
                known_jobs.add(job)

                sge_state = job_list.findtext('state')
                try:
                    job._state = self._SGE_JOB_STATES[sge_state]
                except KeyError:
                    # Leave the recorded job state untouched
                    self.log(f'Job {jobid} is in unrecognized SGE state '
                             f'{sge_state!r}, ignoring')

        # Mark any "unknown" job as completed
        unknown_jobs = set(jobs) - known_jobs
        for job in unknown_jobs:
            self.log(f'Job {job.jobid} not known to scheduler, '
                     f'assuming job completed')
            job._state = 'COMPLETED'

    def finished(self, job):
        '''Return ``True`` if the state of ``job`` is ``COMPLETED``.

        If ``job.exception`` is set, it is raised instead.
        '''

        if job.exception:
            raise job.exception

        return job.state == 'COMPLETED'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.