From b1984b186ca0b48bd080630086d1570c5acd31a1 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 22 Jan 2021 09:07:55 +0100 Subject: [PATCH 01/10] Add configuration option for blocking submission --- reframe/core/schedulers/slurm.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 90c4b9ba21..ffbb0e01d6 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -125,6 +125,9 @@ def __init__(self): self._use_nodes_opt = rt.runtime().get_option( f'schedulers/@{self.registered_name}/use_nodes_option' ) + self._block_submission = rt.runtime().get_option( + f'schedulers/@{self.registered_name}/block_submission' + ) def make_job(self, *args, **kwargs): return _SlurmJob(*args, **kwargs) @@ -227,7 +230,20 @@ def emit_preamble(self, job): def submit(self, job): cmd = f'sbatch {job.script_filename}' - completed = _run_strict(cmd, timeout=self._submit_timeout) + intervals = itertools.cycle([1, 2, 3]) + while True: + try: + completed = _run_strict(cmd, timeout=self._submit_timeout) + break + except SpawnedProcessError as e: + if (not self._block_submission or + e.exitcode != 1 or + 'sbatch: error: QOSMaxSubmitJobPerUserLimit' not in e.stderr + ): + raise e + + time.sleep(next(intervals)) + jobid_match = re.search(r'Submitted batch job (?P<jobid>\d+)', completed.stdout) if not jobid_match: From 8861b94487982e4ce04c29f7282820e89f77f947 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 22 Jan 2021 09:14:57 +0100 Subject: [PATCH 02/10] Add json schema --- reframe/schemas/config.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index a41895a85c..52c7813574 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -293,6 +293,7 @@ "enum": ["local", "pbs", "slurm", "squeue", "torque"] }, "ignore_reqnodenotavail": {"type": "boolean"}, + 
"block_submission": {"type": "boolean"}, "job_submit_timeout": {"type": "number"}, "target_systems": {"$ref": "#/defs/system_ref"}, "use_nodes_option": {"type": "boolean"} @@ -445,6 +446,7 @@ "modes/options": [], "modes/target_systems": ["*"], "schedulers/ignore_reqnodenotavail": false, + "schedulers/block_submission": false, "schedulers/job_submit_timeout": 60, "schedulers/target_systems": ["*"], "schedulers/use_nodes_option": false, From d111aa5765ee69fbfd3c09ecacf197ba300d6e99 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 22 Jan 2021 09:30:00 +0100 Subject: [PATCH 03/10] Fix PEP8 issue --- reframe/core/schedulers/slurm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index ffbb0e01d6..5ea5eb4406 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -236,9 +236,11 @@ def submit(self, job): completed = _run_strict(cmd, timeout=self._submit_timeout) break except SpawnedProcessError as e: - if (not self._block_submission or + sbatch_error = 'sbatch: error: QOSMaxSubmitJobPerUserLimit' + if ( + not self._block_submission or e.exitcode != 1 or - 'sbatch: error: QOSMaxSubmitJobPerUserLimit' not in e.stderr + sbatch_error not in e.stderr ): raise e From b481bfc3d71831a054a7078cd4d4cc93534e0af8 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Mon, 25 Jan 2021 09:38:59 +0100 Subject: [PATCH 04/10] add resubmit_on_qos_errors option in configuration --- docs/config_reference.rst | 14 ++++++++++++++ reframe/core/schedulers/slurm.py | 11 +++++------ reframe/schemas/config.json | 7 +++++-- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index c7aa95ebe1..71b0c77a7b 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -1006,6 +1006,20 @@ Common scheduler options In such cases, you may set this parameter to ``true`` to avoid this. +.. 
js:attribute:: .schedulers[].resubmit_on_qos_errors + + :required: No + :default: ``[]`` + + This option is relevant to the Slurm backends only. + + When a job is submitted certain errors can be ignored and the framework will try to submit again the job after some seconds. + ReFrame is checking with a regular expression if any of the given expressions is contained in the ``stderr`` of the submission command. + Keep in mind that the submission is blocking and ReFrame will not continue until the submission is successful or has a different error. + + .. versionadded:: 3.4 + + Execution Mode Configuration ---------------------------- diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 5ea5eb4406..11f44e5ceb 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -125,8 +125,8 @@ def __init__(self): self._use_nodes_opt = rt.runtime().get_option( f'schedulers/@{self.registered_name}/use_nodes_option' ) - self._block_submission = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/block_submission' + self._resubmit_on_qos_errors = rt.runtime().get_option( + f'schedulers/@{self.registered_name}/resubmit_on_qos_errors' ) def make_job(self, *args, **kwargs): @@ -236,11 +236,10 @@ def submit(self, job): completed = _run_strict(cmd, timeout=self._submit_timeout) break except SpawnedProcessError as e: - sbatch_error = 'sbatch: error: QOSMaxSubmitJobPerUserLimit' + sbatch_error = '|'.join(self._resubmit_on_qos_errors) if ( - not self._block_submission or - e.exitcode != 1 or - sbatch_error not in e.stderr + not self._resubmit_on_qos_errors or + not re.search(sbatch_error, e.stderr) ): raise e diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 52c7813574..4f7dc78523 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -293,7 +293,10 @@ "enum": ["local", "pbs", "slurm", "squeue", "torque"] }, "ignore_reqnodenotavail": {"type": "boolean"}, - 
"block_submission": {"type": "boolean"}, + "resubmit_on_qos_errors": { + "type": "array", + "items": {"type": "string"} + }, "job_submit_timeout": {"type": "number"}, "target_systems": {"$ref": "#/defs/system_ref"}, "use_nodes_option": {"type": "boolean"} @@ -446,7 +449,7 @@ "modes/options": [], "modes/target_systems": ["*"], "schedulers/ignore_reqnodenotavail": false, - "schedulers/block_submission": false, + "resubmit_on_qos_errors": [], "schedulers/job_submit_timeout": 60, "schedulers/target_systems": ["*"], "schedulers/use_nodes_option": false, From e7f28498964f6eddedcd17e0c18fd5d7ead747bd Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Thu, 28 Jan 2021 16:43:06 +0100 Subject: [PATCH 05/10] Change resubmit_on_qos_errors to a more generic option --- docs/config_reference.rst | 9 +++++---- reframe/core/schedulers/slurm.py | 12 +++++------- reframe/schemas/config.json | 4 ++-- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 71b0c77a7b..2bbb4352c8 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -1006,7 +1006,7 @@ Common scheduler options In such cases, you may set this parameter to ``true`` to avoid this. -.. js:attribute:: .schedulers[].resubmit_on_qos_errors +.. js:attribute:: .schedulers[].resubmit_on_errors :required: No :default: ``[]`` @@ -1014,10 +1014,11 @@ Common scheduler options This option is relevant to the Slurm backends only. When a job is submitted certain errors can be ignored and the framework will try to submit again the job after some seconds. - ReFrame is checking with a regular expression if any of the given expressions is contained in the ``stderr`` of the submission command. - Keep in mind that the submission is blocking and ReFrame will not continue until the submission is successful or has a different error. 
+ An example of this could be QOS errors like ``QOSMaxSubmitJobPerUserLimit``, in which case you would have to set the option to ``["QOSMaxSubmitJobPerUserLimit"]``. You can ignore multiple errors at the same time if you include all the error strings in the list. + Job submission is a synchronous operation in ReFrame. + If this option is set, it will block the whole execution until the error conditions specified in this list are raised. - .. versionadded:: 3.4 + .. versionadded:: 3.5 Execution Mode Configuration diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 11f44e5ceb..77fe4f3ce8 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -125,8 +125,8 @@ def __init__(self): self._use_nodes_opt = rt.runtime().get_option( f'schedulers/@{self.registered_name}/use_nodes_option' ) - self._resubmit_on_qos_errors = rt.runtime().get_option( - f'schedulers/@{self.registered_name}/resubmit_on_qos_errors' + self._resubmit_on_errors = rt.runtime().get_option( + f'schedulers/@{self.registered_name}/resubmit_on_errors' ) def make_job(self, *args, **kwargs): @@ -236,11 +236,9 @@ def submit(self, job): completed = _run_strict(cmd, timeout=self._submit_timeout) break except SpawnedProcessError as e: - sbatch_error = '|'.join(self._resubmit_on_qos_errors) - if ( - not self._resubmit_on_qos_errors or - not re.search(sbatch_error, e.stderr) - ): + sbatch_error = '|'.join(self._resubmit_on_errors) + if (not self._resubmit_on_errors or + not re.search(sbatch_error, e.stderr)): raise e time.sleep(next(intervals)) diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 4f7dc78523..519565ad8a 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -293,7 +293,7 @@ "enum": ["local", "pbs", "slurm", "squeue", "torque"] }, "ignore_reqnodenotavail": {"type": "boolean"}, - "resubmit_on_qos_errors": { + "resubmit_on_errors": { "type": "array", "items": {"type": "string"} }, @@ -449,7 
+449,7 @@ "modes/options": [], "modes/target_systems": ["*"], "schedulers/ignore_reqnodenotavail": false, - "resubmit_on_qos_errors": [], + "schedulers/resubmit_on_errors": [], "schedulers/job_submit_timeout": 60, "schedulers/target_systems": ["*"], "schedulers/use_nodes_option": false, From 787627a12e605c7350732bf2625e2c1372e8af85 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 29 Jan 2021 13:53:30 +0100 Subject: [PATCH 06/10] Fine tune documentation --- docs/config_reference.rst | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 2bbb4352c8..dd641745dc 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -1013,13 +1013,17 @@ Common scheduler options This option is relevant to the Slurm backends only. - When a job is submitted certain errors can be ignored and the framework will try to submit again the job after some seconds. - An example of this could be QOS errors like ``QOSMaxSubmitJobPerUserLimit``, in which case you would have to set the option to ``["QOSMaxSubmitJobPerUserLimit"]``. You can ignore multiple errors at the same time if you include all the error strings in the list. - Job submission is a synchronous operation in ReFrame. - If this option is set, it will block the whole execution until the error conditions specified in this list are raised. + If any of the listed errors occur, ReFrame will try to resubmit the job after some seconds. + As an example, you could have ReFrame trying to resubmit a job in case that the maximum submission limit per user is reached by setting this field to ``["QOSMaxSubmitJobPerUserLimit"]``. + You can ignore multiple errors at the same time if you add more error strings in the list. .. versionadded:: 3.5 + .. warning:: + Job submission is a synchronous operation in ReFrame. + If this option is set, ReFrame's execution will block until the error conditions specified in this list are resolved. 
+ No other test would be able to proceed. + Execution Mode Configuration ---------------------------- From f8fb2e526ce6cf64f72e52ba7bf2bb8763513c08 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 1 Feb 2021 14:33:59 +0100 Subject: [PATCH 07/10] Log error condition in case job submission is going to be retried --- reframe/core/schedulers/slurm.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 77fe4f3ce8..d0678b5271 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -236,12 +236,17 @@ def submit(self, job): completed = _run_strict(cmd, timeout=self._submit_timeout) break except SpawnedProcessError as e: - sbatch_error = '|'.join(self._resubmit_on_errors) + sbatch_error = rf'({"|".join(self._resubmit_on_errors)})' if (not self._resubmit_on_errors or not re.search(sbatch_error, e.stderr)): - raise e + raise - time.sleep(next(intervals)) + t = next(intervals) + self.log( + f'encountered a job submission error: {sbatch_error.group(1)}' + f'will resubmit after {t}s' + ) + time.sleep(t) jobid_match = re.search(r'Submitted batch job (?P<jobid>\d+)', completed.stdout) From 16788d4a1f9ccf04b1f2f65f8e435fb297369a16 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 1 Feb 2021 20:56:33 +0100 Subject: [PATCH 08/10] Fix error message matching --- reframe/core/schedulers/slurm.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index d0678b5271..05eb125a9c 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -236,9 +236,10 @@ def submit(self, job): completed = _run_strict(cmd, timeout=self._submit_timeout) break except SpawnedProcessError as e: - sbatch_error = rf'({"|".join(self._resubmit_on_errors)})' - if (not self._resubmit_on_errors or - not re.search(sbatch_error, e.stderr)): + error_match = 
re.search( + rf'({"|".join(self._resubmit_on_errors)})', e.stderr + ) + if not self._resubmit_on_errors or not error_match: raise t = next(intervals) From e63c838e8b977cd594727bc908003eb543743f0f Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 1 Feb 2021 21:10:47 +0100 Subject: [PATCH 09/10] Fix NameError --- reframe/core/schedulers/slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 05eb125a9c..26982b0a1f 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -244,7 +244,7 @@ def submit(self, job): t = next(intervals) self.log( - f'encountered a job submission error: {sbatch_error.group(1)}' + f'encountered a job submission error: {error_match.group(1)}' f'will resubmit after {t}s' ) time.sleep(t) From ad57a02d419476abdcf10b1e667450c5b2442509 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Mon, 1 Feb 2021 21:20:38 +0100 Subject: [PATCH 10/10] Fix PEP8 issue --- reframe/core/schedulers/slurm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 26982b0a1f..5e8aaca88e 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -244,8 +244,8 @@ def submit(self, job): t = next(intervals) self.log( - f'encountered a job submission error: {error_match.group(1)}' - f'will resubmit after {t}s' + f'encountered a job submission error: ' + f'{error_match.group(1)}: will resubmit after {t}s' ) time.sleep(t)