From b778a767c95d03afddb5b5ddc4ea28e4aa219ed7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mos=C3=A8=20Giordano?= Date: Mon, 11 Sep 2023 18:44:10 +0100 Subject: [PATCH 1/2] Check exit status of PBS Pro jobs --- reframe/core/schedulers/pbs.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py index bb93c0d150..e6193f29a3 100644 --- a/reframe/core/schedulers/pbs.py +++ b/reframe/core/schedulers/pbs.py @@ -178,7 +178,9 @@ def _update_nodelist(self, job, nodespec): job._nodelist = [x.split('/')[0] for x in nodespec.split('+')] job._nodelist.sort() - def poll(self, *jobs): + # The second argument is to specialise some code paths to PBS Pro only, but + # not Torque. + def _poll(self, is_pbs_pro, *jobs): def output_ready(job): # We report a job as finished only when its stdout/stderr are # written back to the working directory @@ -209,6 +211,19 @@ def output_ready(job): if job.cancelled or output_ready(job): self.log(f'Assuming job {job.jobid} completed') job._completed = True + if is_pbs_pro: + # With PBS Pro we can obtain the exit status of the job, + # in case it actually failed. 
+ extended_info = osext.run_command( + f'qstat -xf {job.jobid}' + ) + exit_status_match = re.search( + r'^ *Exit_status *= *(?P<exit_status>\d+)', + extended_info.stdout, + flags=re.MULTILINE, + ) + if exit_status_match: + job._exitcode = int(exit_status_match.group('exit_status')) return @@ -277,7 +292,13 @@ def output_ready(job): job._exception = JobError('maximum pending time exceeded', job.jobid) + def poll(self, *job): + self._poll(True, *job) + @register_scheduler('torque') class TorqueJobScheduler(PbsJobScheduler): TASKS_OPT = '-l nodes={num_nodes}:ppn={num_cpus_per_node}' + + def poll(self, *job): + self._poll(False, *job) From 722bca584069cf4e84886300380c8b293daa333c Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Sat, 23 Sep 2023 00:07:00 +0200 Subject: [PATCH 2/2] Address PR comments --- reframe/core/schedulers/pbs.py | 42 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py index e6193f29a3..4c357f8e1b 100644 --- a/reframe/core/schedulers/pbs.py +++ b/reframe/core/schedulers/pbs.py @@ -178,9 +178,21 @@ def _update_nodelist(self, job, nodespec): job._nodelist = [x.split('/')[0] for x in nodespec.split('+')] job._nodelist.sort() - # The second argument is to specialise some code paths to PBS Pro only, but - # not Torque. 
- def _poll(self, is_pbs_pro, *jobs): + def _query_exit_code(self, job): + '''Try to retrieve the exit code of a past job.''' + + # With PBS Pro we can obtain the exit status of a past job + extended_info = osext.run_command(f'qstat -xf {job.jobid}') + exit_status_match = re.search( + r'^ *Exit_status *= *(?P<exit_status>\d+)', extended_info.stdout, + flags=re.MULTILINE, + ) + if exit_status_match: + return int(exit_status_match.group('exit_status')) + + return None + + def poll(self, *jobs): def output_ready(job): # We report a job as finished only when its stdout/stderr are # written back to the working directory @@ -211,19 +223,7 @@ def output_ready(job): if job.cancelled or output_ready(job): self.log(f'Assuming job {job.jobid} completed') job._completed = True - if is_pbs_pro: - # With PBS Pro we can obtain the exit status of the job, - # in case it actually failed. - extended_info = osext.run_command( - f'qstat -xf {job.jobid}' - ) - exit_status_match = re.search( - r'^ *Exit_status *= *(?P<exit_status>\d+)', - extended_info.stdout, - flags=re.MULTILINE, - ) - if exit_status_match: - job._exitcode = int(exit_status_match.group('exit_status')) + job._exitcode = self._query_exit_code(job) return @@ -292,13 +292,13 @@ def output_ready(job): job._exception = JobError('maximum pending time exceeded', job.jobid) - def poll(self, *job): - self._poll(True, *job) - @register_scheduler('torque') class TorqueJobScheduler(PbsJobScheduler): TASKS_OPT = '-l nodes={num_nodes}:ppn={num_cpus_per_node}' - def poll(self, *job): - self._poll(False, *job) + def _query_exit_code(self, job): + '''Try to retrieve the exit code of a past job.''' + + # Torque does not provide a way to retrieve the history of jobs + return None