From 433a8debc7f6f0b303e2938b099d7e5089ebbb43 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Thu, 29 Jul 2021 11:40:27 +0300 Subject: [PATCH 01/76] Add the build_complete method --- reframe/core/pipeline.py | 29 ++++++++++++++++++++++++++ reframe/frontend/executors/__init__.py | 11 ++++++++++ 2 files changed, 40 insertions(+) diff --git a/reframe/core/pipeline.py b/reframe/core/pipeline.py index 571e6e0a3a..0b98321f3e 100644 --- a/reframe/core/pipeline.py +++ b/reframe/core/pipeline.py @@ -1500,6 +1500,35 @@ def run(self): if self.job.sched_flex_alloc_nodes: self.num_tasks = self.job.num_tasks + @final + def build_complete(self): + '''Check if the build phase has completed. + + :returns: :class:`True` if the associated build job has finished, + :class:`False` otherwise. + + If no job descriptor is yet associated with this test, + :class:`True` is returned. + :raises reframe.core.exceptions.ReframeError: In case of errors. + + .. warning:: + You may not override this method directly unless you are in + special test. See `here + `__ for + more details. + + + .. versionchanged:: 3.4 + Overriding this method directly in no longer allowed. See `here + `__ for + more details. + + ''' + if not self._build_job: + return True + + return self._build_job.finished() + @final def run_complete(self): '''Check if the run phase has completed. 
diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index a7699aa9b3..ac1b014ac9 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -302,6 +302,13 @@ def run_complete(self): return done + def build_complete(self): + done = self._safe_call(self.check.run_complete) + if done: + self._notify_listeners('on_task_build_exit') + + return done + def run_wait(self): self._safe_call(self.check.run_wait) self.zombie = False @@ -373,6 +380,10 @@ def on_task_run(self, task): def on_task_exit(self, task): '''Called whenever a RegressionTask finishes.''' + @abc.abstractmethod + def on_task_build_exit(self, task): + '''Called whenever a RegressionTask build finishes.''' + @abc.abstractmethod def on_task_skip(self, task): '''Called whenever a RegressionTask is skipped.''' From f4c1c6de5e1603cb1440e4ea7e2fdbc21a45d7a5 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Thu, 29 Jul 2021 13:13:59 +0300 Subject: [PATCH 02/76] Split build and run pipeline phase --- reframe/frontend/executors/policies.py | 73 +++++++++++++++++++------- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 56d0661e19..e3c5058644 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -161,6 +161,9 @@ def on_task_run(self, task): def on_task_exit(self, task): pass + def on_task_build_exit(self, task): + pass + def on_task_skip(self, task): msg = str(task.exc_info[1]) self.printer.status('SKIP', msg, just='right') @@ -261,6 +264,17 @@ def _remove_from_running(self, task): getlogger().debug2('Task was not running') pass + def _remove_from_building(self, task): + getlogger().debug2( + f'Removing task from the building list: {task.testcase}' + ) + try: + partname = task.check.current_partition.fullname + self._building_tasks[partname].remove(task) + except (ValueError, 
AttributeError, KeyError): + getlogger().debug2('Task was not building') + pass + # FIXME: The following functions are very similar and they are also reused # in the serial policy; we should refactor them def deps_failed(self, task): @@ -339,6 +353,11 @@ def on_task_exit(self, task): self._remove_from_running(task) self._completed_tasks.append(task) + def on_task_build_exit(self, task): + task.build_wait() + self._remove_from_building(task) + self._reschedule(task) + def _setup_task(self, task): if self.deps_skipped(task): try: @@ -394,20 +413,20 @@ def runcase(self, case): return - if len(self._running_tasks[partname]) >= partition.max_jobs: - # Make sure that we still exceeded the job limit - getlogger().debug2( - f'Reached concurrency limit for partition {partname!r}: ' - f'{partition.max_jobs} job(s)' - ) - self._poll_tasks() - - if len(self._running_tasks[partname]) < partition.max_jobs: - # Task was put in _ready_tasks during setup - self._ready_tasks[partname].pop() - self._reschedule(task) - else: - self.printer.status('HOLD', task.check.info(), just='right') + # if len(self._running_tasks[partname]) >= partition.max_jobs: + # # Make sure that we still exceeded the job limit + # getlogger().debug2( + # f'Reached concurrency limit for partition {partname!r}: ' + # f'{partition.max_jobs} job(s)' + # ) + # self._poll_tasks() + + # if len(self._running_tasks[partname]) < partition.max_jobs: + # # Task was put in _ready_tasks during setup + # self._ready_tasks[partname].pop() + # self._reschedule(task) + # else: + # self.printer.status('HOLD', task.check.info(), just='right') except TaskExit: if not task.failed and not task.skipped: with contextlib.suppress(TaskExit): @@ -424,12 +443,12 @@ def runcase(self, case): def _poll_tasks(self): '''Update the counts of running checks per partition.''' - def split_jobs(tasks): + def split_jobs(tasks, build_split=False): '''Split jobs into forced local and normal ones.''' forced_local = [] normal = [] for t in tasks: - if 
t.check.local: + if t.check.local or (build_split and t.check.build_locally): forced_local.append(t.check.job) else: normal.append(t.check.job) @@ -439,7 +458,7 @@ def split_jobs(tasks): for part in self._partitions: partname = part.fullname num_tasks = len(self._running_tasks[partname]) - getlogger().debug2(f'Polling {num_tasks} task(s) in {partname!r}') + getlogger().debug2(f'Polling {num_tasks} running task(s) in {partname!r}') forced_local_jobs, part_jobs = split_jobs( self._running_tasks[partname] ) @@ -447,9 +466,21 @@ def split_jobs(tasks): self.local_scheduler.poll(*forced_local_jobs) # Trigger notifications for finished jobs - for t in self._running_tasks[partname]: + for t in self._running_tasks[partname][:]: t.run_complete() + num_tasks = len(self._building_tasks[partname]) + getlogger().debug2(f'Polling {num_tasks} building task(s) in {partname!r}') + forced_local_jobs, part_jobs = split_jobs( + self._building_tasks[partname], build_split=True + ) + part.scheduler.poll(*part_jobs) + self.local_scheduler.poll(*forced_local_jobs) + + # Trigger notifications for finished jobs + for t in self._building_tasks[partname][:]: + t.build_complete() + def _setup_all(self): still_waiting = [] for task in self._waiting_tasks: @@ -496,10 +527,12 @@ def _failall(self, cause): self._completed_tasks): task.abort(cause) - def _reschedule(self, task): + def _reschedule_building(self, task): getlogger().debug2(f'Scheduling test case {task.testcase} for running') task.compile() - task.compile_wait() + + def _reschedule(self, task): + getlogger().debug2(f'Scheduling test case {task.testcase} for running') task.run() def _reschedule_all(self): From ca4755fe7e4a9b73f4789c30a5cd5b300fc7a13f Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 18 Aug 2021 10:23:05 +0200 Subject: [PATCH 03/76] Save intermediate work --- reframe/core/pipeline.py | 9 +++++++-- reframe/frontend/executors/__init__.py | 9 +++++++-- reframe/frontend/executors/policies.py | 26 
++++++++++++++++++++------ 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/reframe/core/pipeline.py b/reframe/core/pipeline.py index 0b98321f3e..73ce8e237c 100644 --- a/reframe/core/pipeline.py +++ b/reframe/core/pipeline.py @@ -1501,7 +1501,7 @@ def run(self): self.num_tasks = self.job.num_tasks @final - def build_complete(self): + def compile_complete(self): '''Check if the build phase has completed. :returns: :class:`True` if the associated build job has finished, @@ -1525,9 +1525,14 @@ def build_complete(self): ''' if not self._build_job: + print('no build job????\n\n\n') return True - return self._build_job.finished() + t = self._build_job.finished() + if t: + + print(f'Finished? {t}') + return t @final def run_complete(self): diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index ac1b014ac9..2ea7b27685 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -286,6 +286,7 @@ def setup(self, *args, **kwargs): def compile(self): self._safe_call(self.check.compile) + self._notify_listeners('on_task_build') def compile_wait(self): self._safe_call(self.check.compile_wait) @@ -302,8 +303,8 @@ def run_complete(self): return done - def build_complete(self): - done = self._safe_call(self.check.run_complete) + def compile_complete(self): + done = self._safe_call(self.check.compile_complete) if done: self._notify_listeners('on_task_build_exit') @@ -376,6 +377,10 @@ def on_task_setup(self, task): def on_task_run(self, task): '''Called whenever the run() method of a RegressionTask is called.''' + @abc.abstractmethod + def on_task_build(self, task): + '''Called whenever the build() method of a RegressionTask is called.''' + @abc.abstractmethod def on_task_exit(self, task): '''Called whenever a RegressionTask finishes.''' diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index e3c5058644..5c2f5a2a80 100644 --- 
a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -158,6 +158,9 @@ def on_task_setup(self, task): def on_task_run(self, task): pass + def on_task_build(self, task): + pass + def on_task_exit(self, task): pass @@ -230,6 +233,9 @@ def __init__(self): # Index tasks by test cases self._task_index = {} + # All currently building tasks per partition + self._building_tasks = {} + # All currently running tasks per partition self._running_tasks = {} @@ -300,6 +306,10 @@ def on_task_run(self, task): partname = task.check.current_partition.fullname self._running_tasks[partname].append(task) + def on_task_build(self, task): + partname = task.check.current_partition.fullname + self._building_tasks[partname].append(task) + def on_task_skip(self, task): # Remove the task from the running list if it was skipped after the # run phase @@ -354,9 +364,9 @@ def on_task_exit(self, task): self._completed_tasks.append(task) def on_task_build_exit(self, task): - task.build_wait() + task.compile_wait() self._remove_from_building(task) - self._reschedule(task) + self._reschedule_run(task) def _setup_task(self, task): if self.deps_skipped(task): @@ -390,6 +400,7 @@ def runcase(self, case): # Set partition-based counters, if not set already self._running_tasks.setdefault(partition.fullname, []) + self._building_tasks.setdefault(partition.fullname, []) self._ready_tasks.setdefault(partition.fullname, []) self._max_jobs.setdefault(partition.fullname, partition.max_jobs) @@ -479,7 +490,9 @@ def split_jobs(tasks, build_split=False): # Trigger notifications for finished jobs for t in self._building_tasks[partname][:]: - t.build_complete() + # print(f'There is a task: {t}') + t.compile_complete() + # print(f'Checked task: {t}') def _setup_all(self): still_waiting = [] @@ -527,11 +540,11 @@ def _failall(self, cause): self._completed_tasks): task.abort(cause) - def _reschedule_building(self, task): + def _reschedule(self, task): getlogger().debug2(f'Scheduling 
test case {task.testcase} for running') task.compile() - def _reschedule(self, task): + def _reschedule_run(self, task): getlogger().debug2(f'Scheduling test case {task.testcase} for running') task.run() @@ -558,7 +571,8 @@ def exit(self): self.printer.separator('short single line', 'waiting for spawned checks to finish') while (countall(self._running_tasks) or self._waiting_tasks or - self._completed_tasks or countall(self._ready_tasks)): + self._completed_tasks or countall(self._ready_tasks) or + countall(self._building_tasks)): getlogger().debug2(f'Running tasks: ' f'{countall(self._running_tasks)}') try: From 187a9f7cdd8b4478dfd557b936d52035605db3b4 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 15 Sep 2021 14:50:34 +0200 Subject: [PATCH 04/76] Remove print helping messages --- reframe/core/pipeline.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/reframe/core/pipeline.py b/reframe/core/pipeline.py index 87d5284c80..b071914fb0 100644 --- a/reframe/core/pipeline.py +++ b/reframe/core/pipeline.py @@ -1582,13 +1582,9 @@ def compile_complete(self): ''' if not self._build_job: - print('no build job????\n\n\n') return True t = self._build_job.finished() - if t: - - print(f'Finished? 
{t}') return t @final From e90e1e09810b957b163543b59140295f7773a98d Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 15 Sep 2021 14:58:53 +0200 Subject: [PATCH 05/76] Parameterize split for run and build jobs --- reframe/frontend/executors/policies.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 5c2f5a2a80..d78f81fc9b 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -454,15 +454,21 @@ def runcase(self, case): def _poll_tasks(self): '''Update the counts of running checks per partition.''' - def split_jobs(tasks, build_split=False): + def split_jobs(tasks, build_split=False, build_jobs=False): '''Split jobs into forced local and normal ones.''' forced_local = [] normal = [] for t in tasks: if t.check.local or (build_split and t.check.build_locally): - forced_local.append(t.check.job) + if build_jobs: + forced_local.append(t.check.build_job) + else: + forced_local.append(t.check.job) else: - normal.append(t.check.job) + if build_jobs: + normal.append(t.check.build_job) + else: + normal.append(t.check.job) return forced_local, normal @@ -483,16 +489,14 @@ def split_jobs(tasks, build_split=False): num_tasks = len(self._building_tasks[partname]) getlogger().debug2(f'Polling {num_tasks} building task(s) in {partname!r}') forced_local_jobs, part_jobs = split_jobs( - self._building_tasks[partname], build_split=True + self._building_tasks[partname], build_split=True, build_jobs=True ) part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) # Trigger notifications for finished jobs for t in self._building_tasks[partname][:]: - # print(f'There is a task: {t}') t.compile_complete() - # print(f'Checked task: {t}') def _setup_all(self): still_waiting = [] From c58b3908815899962f690ec8478b480529496c0d Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 15 Sep 2021 
15:00:33 +0200 Subject: [PATCH 06/76] Remove from building when test is skipped in 'compile_complete' and 'compile_wait' --- reframe/frontend/executors/policies.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index d78f81fc9b..2a7b2bf3f0 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -318,6 +318,9 @@ def on_task_skip(self, task): if task.failed_stage in ('run_complete', 'run_wait'): self._running_tasks[partname].remove(task) + if task.failed_stage in ('compile_complete', 'compile_wait'): + self._building_tasks[partname].remove(task) + msg = str(task.exc_info[1]) self.printer.status('SKIP', msg, just='right') From a51e6b11fa2a2db85f4592d094f0fc0eed2e1c2e Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Thu, 16 Sep 2021 09:57:41 +0200 Subject: [PATCH 07/76] Temporarily remove failing unittests --- unittests/test_policies.py | 56 +++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 5baf9ff94a..0d53019481 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -545,6 +545,12 @@ def on_task_exit(self, task): last = self.num_tasks[-1] self.num_tasks.append(last - 1) + def on_task_build(self, task): + pass + + def on_task_build_exit(self, task): + pass + def on_task_success(self, task): pass @@ -792,31 +798,31 @@ def test_run_complete_fails_busy_loop(async_runner, make_cases, assert isinstance(t.check, SleepCheck) -def test_compile_fail_reschedule_main_loop(async_runner, make_cases, - make_exec_ctx): - make_exec_ctx(options=max_jobs_opts(1)) - runner, _ = async_runner - num_checks = 2 - runner.runall(make_cases([SleepCheckPollFail(.1), CompileFailureCheck()])) - - stats = runner.stats - assert num_checks == stats.num_cases() - assert_runall(runner) - assert num_checks == len(stats.failed()) - - -def 
test_compile_fail_reschedule_busy_loop(async_runner, make_cases, - make_exec_ctx): - make_exec_ctx(options=max_jobs_opts(1)) - runner, _ = async_runner - num_checks = 2 - runner.runall( - make_cases([SleepCheckPollFailLate(1.5), CompileFailureCheck()]) - ) - stats = runner.stats - assert num_checks == stats.num_cases() - assert_runall(runner) - assert num_checks == len(stats.failed()) +# def test_compile_fail_reschedule_main_loop(async_runner, make_cases, +# make_exec_ctx): +# make_exec_ctx(options=max_jobs_opts(1)) +# runner, _ = async_runner +# num_checks = 2 +# runner.runall(make_cases([SleepCheckPollFail(.1), CompileFailureCheck()])) + +# stats = runner.stats +# assert num_checks == stats.num_cases() +# assert_runall(runner) +# assert num_checks == len(stats.failed()) + + +# def test_compile_fail_reschedule_busy_loop(async_runner, make_cases, +# make_exec_ctx): +# make_exec_ctx(options=max_jobs_opts(1)) +# runner, _ = async_runner +# num_checks = 2 +# runner.runall( +# make_cases([SleepCheckPollFailLate(1.5), CompileFailureCheck()]) +# ) +# stats = runner.stats +# assert num_checks == stats.num_cases() +# assert_runall(runner) +# assert num_checks == len(stats.failed()) @pytest.fixture From a3c588ed7ab0de3b32c953b9b88df61320be96e2 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Sep 2021 09:52:08 +0200 Subject: [PATCH 08/76] Remove building task from queue when it fails --- reframe/frontend/executors/policies.py | 1 + 1 file changed, 1 insertion(+) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 2a7b2bf3f0..f3837f70f3 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -334,6 +334,7 @@ def on_task_failure(self, task): self.printer.status('ERROR', msg, just='right') else: self._remove_from_running(task) + self._remove_from_building(task) self.printer.status('FAIL', msg, just='right') stagedir = task.check.stagedir From 
c8a7cc03b14607a92626fd928d211790a1b59902 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Sep 2021 09:52:44 +0200 Subject: [PATCH 09/76] Re-enable all unittests --- unittests/test_policies.py | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 0d53019481..9f862f0b57 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -798,31 +798,31 @@ def test_run_complete_fails_busy_loop(async_runner, make_cases, assert isinstance(t.check, SleepCheck) -# def test_compile_fail_reschedule_main_loop(async_runner, make_cases, -# make_exec_ctx): -# make_exec_ctx(options=max_jobs_opts(1)) -# runner, _ = async_runner -# num_checks = 2 -# runner.runall(make_cases([SleepCheckPollFail(.1), CompileFailureCheck()])) - -# stats = runner.stats -# assert num_checks == stats.num_cases() -# assert_runall(runner) -# assert num_checks == len(stats.failed()) - - -# def test_compile_fail_reschedule_busy_loop(async_runner, make_cases, -# make_exec_ctx): -# make_exec_ctx(options=max_jobs_opts(1)) -# runner, _ = async_runner -# num_checks = 2 -# runner.runall( -# make_cases([SleepCheckPollFailLate(1.5), CompileFailureCheck()]) -# ) -# stats = runner.stats -# assert num_checks == stats.num_cases() -# assert_runall(runner) -# assert num_checks == len(stats.failed()) +def test_compile_fail_reschedule_main_loop(async_runner, make_cases, + make_exec_ctx): + make_exec_ctx(options=max_jobs_opts(1)) + runner, _ = async_runner + num_checks = 2 + runner.runall(make_cases([SleepCheckPollFail(.1), CompileFailureCheck()])) + + stats = runner.stats + assert num_checks == stats.num_cases() + assert_runall(runner) + assert num_checks == len(stats.failed()) + + +def test_compile_fail_reschedule_busy_loop(async_runner, make_cases, + make_exec_ctx): + make_exec_ctx(options=max_jobs_opts(1)) + runner, _ = async_runner + num_checks = 2 + runner.runall( + 
make_cases([SleepCheckPollFailLate(1.5), CompileFailureCheck()]) + ) + stats = runner.stats + assert num_checks == stats.num_cases() + assert_runall(runner) + assert num_checks == len(stats.failed()) @pytest.fixture From 4439ad476e5144f61c206315885caafe5dee9037 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Sep 2021 10:56:14 +0200 Subject: [PATCH 10/76] Update documentation --- docs/manpage.rst | 6 +++--- docs/pipeline.rst | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/manpage.rst b/docs/manpage.rst index 5cb1a9ece4..9714d00ca5 100644 --- a/docs/manpage.rst +++ b/docs/manpage.rst @@ -299,13 +299,13 @@ Options controlling ReFrame execution - ``async``: Tests will be executed asynchronously. This is the default policy. - The ``async`` execution policy executes the run phase of tests asynchronously by submitting their associated jobs in a non-blocking way. - ReFrame's runtime monitors the progress of each test and will resume the pipeline execution of an asynchronously spawned test as soon as its run phase has finished. + The ``async`` execution policy executes the build and run phases of tests asynchronously by submitting their associated jobs in a non-blocking way. + ReFrame's runtime monitors the progress of each test and will resume the pipeline execution of an asynchronously spawned test as soon as its build or run phase have finished. Note that the rest of the pipeline stages are still executed sequentially in this policy. Concurrency can be controlled by setting the :js:attr:`max_jobs` system partition configuration parameter. As soon as the concurrency limit is reached, ReFrame will first poll the status of all its pending tests to check if any execution slots have been freed up. - If there are tests that have finished their run phase, ReFrame will keep pushing tests for execution until the concurrency limit is reached again. 
+ If there are tests that have finished their build or run phase, ReFrame will keep pushing tests for execution until the concurrency limit is reached again. If no execution slots are available, ReFrame will throttle job submission. .. option:: --force-local diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 3d9438fa6a..366c1362e0 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -52,7 +52,8 @@ A `job descriptor `__ this phase is a no-op. Before building the test, all the `resources `__ associated with it are copied to the test case's stage directory. @@ -114,7 +115,7 @@ As the figure below shows, this can lead to long idling times in the run phase, In the asynchronous execution policy, multiple tests can be simultaneously on-the-fly. -When a test enters the run phase, ReFrame does not block, but continues by picking the next test case to run. +When a test enters the build or run phase, ReFrame does not block, but continues by picking the next test case to run. This continues until no more test cases are left for execution or until a maximum concurrency limit is reached. At the end, ReFrame enters a busy-wait loop monitoring the spawned test cases. As soon as test case finishes, it resumes its pipeline and runs it to completion. 
From 692bf03df5ae9b0d1410cae281efd8d7243ae233 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Sep 2021 11:00:10 +0200 Subject: [PATCH 11/76] Split long lines --- reframe/frontend/executors/policies.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index f3837f70f3..744b01c66f 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -479,7 +479,8 @@ def split_jobs(tasks, build_split=False, build_jobs=False): for part in self._partitions: partname = part.fullname num_tasks = len(self._running_tasks[partname]) - getlogger().debug2(f'Polling {num_tasks} running task(s) in {partname!r}') + getlogger().debug2(f'Polling {num_tasks} running task(s) in ' + f'{partname!r}') forced_local_jobs, part_jobs = split_jobs( self._running_tasks[partname] ) @@ -491,9 +492,11 @@ def split_jobs(tasks, build_split=False, build_jobs=False): t.run_complete() num_tasks = len(self._building_tasks[partname]) - getlogger().debug2(f'Polling {num_tasks} building task(s) in {partname!r}') + getlogger().debug2(f'Polling {num_tasks} building task(s) in ' + f'{partname!r}') forced_local_jobs, part_jobs = split_jobs( - self._building_tasks[partname], build_split=True, build_jobs=True + self._building_tasks[partname], build_split=True, + build_jobs=True ) part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) From ef5063268e5b93c7f4da9f3419d4469a52f2e412 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Mon, 27 Sep 2021 14:18:35 +0200 Subject: [PATCH 12/76] Fix unittests --- unittests/test_policies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 9f862f0b57..9a5b5d61dc 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -728,8 +728,8 @@ def test_kbd_interrupt_in_wait_with_limited_concurrency( runner, _ 
= async_runner with pytest.raises(KeyboardInterrupt): runner.runall(make_cases([ - KeyboardInterruptCheck(), SleepCheck(10), - SleepCheck(10), SleepCheck(10) + SleepCheck(10), SleepCheck(10), + KeyboardInterruptCheck(), SleepCheck(10) ])) assert_interrupted_run(runner) From 91eea08987baa5dcb192a3c6eb57d5ee17014d1f Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 1 Oct 2021 15:55:47 +0200 Subject: [PATCH 13/76] Address PR comments --- docs/pipeline.rst | 2 +- reframe/core/pipeline.py | 9 +----- reframe/frontend/executors/__init__.py | 12 +++---- reframe/frontend/executors/policies.py | 45 +++++++++++++------------- unittests/test_policies.py | 4 +-- 5 files changed, 32 insertions(+), 40 deletions(-) diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 366c1362e0..e88c92fb3b 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -52,7 +52,7 @@ A `job descriptor `__ this phase is a no-op. diff --git a/reframe/core/pipeline.py b/reframe/core/pipeline.py index b071914fb0..3060168c8a 100644 --- a/reframe/core/pipeline.py +++ b/reframe/core/pipeline.py @@ -1574,18 +1574,11 @@ def compile_complete(self): `__ for more details. - - .. versionchanged:: 3.4 - Overriding this method directly in no longer allowed. See `here - `__ for - more details. 
- ''' if not self._build_job: return True - t = self._build_job.finished() - return t + return self._build_job.finished() @final def run_complete(self): diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 2ea7b27685..1c03621707 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -286,7 +286,7 @@ def setup(self, *args, **kwargs): def compile(self): self._safe_call(self.check.compile) - self._notify_listeners('on_task_build') + self._notify_listeners('on_task_compile') def compile_wait(self): self._safe_call(self.check.compile_wait) @@ -306,7 +306,7 @@ def run_complete(self): def compile_complete(self): done = self._safe_call(self.check.compile_complete) if done: - self._notify_listeners('on_task_build_exit') + self._notify_listeners('on_task_compile_exit') return done @@ -378,16 +378,16 @@ def on_task_run(self, task): '''Called whenever the run() method of a RegressionTask is called.''' @abc.abstractmethod - def on_task_build(self, task): - '''Called whenever the build() method of a RegressionTask is called.''' + def on_task_compile(self, task): + '''Called whenever the compile() method of a RegressionTask is called.''' @abc.abstractmethod def on_task_exit(self, task): '''Called whenever a RegressionTask finishes.''' @abc.abstractmethod - def on_task_build_exit(self, task): - '''Called whenever a RegressionTask build finishes.''' + def on_task_compile_exit(self, task): + '''Called whenever a RegressionTask compilation phase finishes.''' @abc.abstractmethod def on_task_skip(self, task): diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 744b01c66f..aa672a395b 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -158,13 +158,13 @@ def on_task_setup(self, task): def on_task_run(self, task): pass - def on_task_build(self, task): + def on_task_compile(self, task): pass def 
on_task_exit(self, task): pass - def on_task_build_exit(self, task): + def on_task_compile_exit(self, task): pass def on_task_skip(self, task): @@ -234,7 +234,7 @@ def __init__(self): self._task_index = {} # All currently building tasks per partition - self._building_tasks = {} + self._build_tasks = {} # All currently running tasks per partition self._running_tasks = {} @@ -276,7 +276,7 @@ def _remove_from_building(self, task): ) try: partname = task.check.current_partition.fullname - self._building_tasks[partname].remove(task) + self._build_tasks[partname].remove(task) except (ValueError, AttributeError, KeyError): getlogger().debug2('Task was not building') pass @@ -306,9 +306,9 @@ def on_task_run(self, task): partname = task.check.current_partition.fullname self._running_tasks[partname].append(task) - def on_task_build(self, task): + def on_task_compile(self, task): partname = task.check.current_partition.fullname - self._building_tasks[partname].append(task) + self._build_tasks[partname].append(task) def on_task_skip(self, task): # Remove the task from the running list if it was skipped after the @@ -319,7 +319,7 @@ def on_task_skip(self, task): self._running_tasks[partname].remove(task) if task.failed_stage in ('compile_complete', 'compile_wait'): - self._building_tasks[partname].remove(task) + self._build_tasks[partname].remove(task) msg = str(task.exc_info[1]) self.printer.status('SKIP', msg, just='right') @@ -367,7 +367,7 @@ def on_task_exit(self, task): self._remove_from_running(task) self._completed_tasks.append(task) - def on_task_build_exit(self, task): + def on_task_compile_exit(self, task): task.compile_wait() self._remove_from_building(task) self._reschedule_run(task) @@ -404,7 +404,7 @@ def runcase(self, case): # Set partition-based counters, if not set already self._running_tasks.setdefault(partition.fullname, []) - self._building_tasks.setdefault(partition.fullname, []) + self._build_tasks.setdefault(partition.fullname, []) 
self._ready_tasks.setdefault(partition.fullname, []) self._max_jobs.setdefault(partition.fullname, partition.max_jobs) @@ -445,7 +445,7 @@ def runcase(self, case): except TaskExit: if not task.failed and not task.skipped: with contextlib.suppress(TaskExit): - self._reschedule(task) + self._reschedule_compile(task) return except ABORT_REASONS as e: @@ -458,18 +458,18 @@ def runcase(self, case): def _poll_tasks(self): '''Update the counts of running checks per partition.''' - def split_jobs(tasks, build_split=False, build_jobs=False): + def split_jobs(tasks, split_build_jobs=False): '''Split jobs into forced local and normal ones.''' forced_local = [] normal = [] for t in tasks: - if t.check.local or (build_split and t.check.build_locally): - if build_jobs: + if t.check.local or (split_build_jobs and t.check.build_locally): + if split_build_jobs: forced_local.append(t.check.build_job) else: forced_local.append(t.check.job) else: - if build_jobs: + if split_build_jobs: normal.append(t.check.build_job) else: normal.append(t.check.job) @@ -491,18 +491,17 @@ def split_jobs(tasks, build_split=False, build_jobs=False): for t in self._running_tasks[partname][:]: t.run_complete() - num_tasks = len(self._building_tasks[partname]) + num_tasks = len(self._build_tasks[partname]) getlogger().debug2(f'Polling {num_tasks} building task(s) in ' f'{partname!r}') forced_local_jobs, part_jobs = split_jobs( - self._building_tasks[partname], build_split=True, - build_jobs=True + self._build_tasks[partname], split_build_jobs=True ) part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) - # Trigger notifications for finished jobs - for t in self._building_tasks[partname][:]: + # Trigger notifications for finished compilation jobs + for t in self._build_tasks[partname][:]: t.compile_complete() def _setup_all(self): @@ -551,8 +550,8 @@ def _failall(self, cause): self._completed_tasks): task.abort(cause) - def _reschedule(self, task): - getlogger().debug2(f'Scheduling 
test case {task.testcase} for running') + def _reschedule_compile(self, task): + getlogger().debug2(f'Scheduling test case {task.testcase} for compiling') task.compile() def _reschedule_run(self, task): @@ -570,7 +569,7 @@ def _reschedule_all(self): except IndexError: break - self._reschedule(task) + self._reschedule_compile(task) num_rescheduled += 1 if num_rescheduled: @@ -583,7 +582,7 @@ def exit(self): 'waiting for spawned checks to finish') while (countall(self._running_tasks) or self._waiting_tasks or self._completed_tasks or countall(self._ready_tasks) or - countall(self._building_tasks)): + countall(self._build_tasks)): getlogger().debug2(f'Running tasks: ' f'{countall(self._running_tasks)}') try: diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 9a5b5d61dc..e2d1afb159 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -545,10 +545,10 @@ def on_task_exit(self, task): last = self.num_tasks[-1] self.num_tasks.append(last - 1) - def on_task_build(self, task): + def on_task_compile(self, task): pass - def on_task_build_exit(self, task): + def on_task_compile_exit(self, task): pass def on_task_success(self, task): From 1b290ca9312cd148abcb52932d686629adaec3a5 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 13 Oct 2021 15:28:38 +0200 Subject: [PATCH 14/76] Address PR comments --- reframe/frontend/executors/policies.py | 34 +++++++++++++++----------- unittests/test_policies.py | 4 +-- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index e1dd34e29b..916c168cef 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -430,20 +430,22 @@ def runcase(self, case): return - # if len(self._running_tasks[partname]) >= partition.max_jobs: - # # Make sure that we still exceeded the job limit - # getlogger().debug2( - # f'Reached concurrency limit for partition {partname!r}: ' 
- # f'{partition.max_jobs} job(s)' - # ) - # self._poll_tasks() - - # if len(self._running_tasks[partname]) < partition.max_jobs: - # # Task was put in _ready_tasks during setup - # self._ready_tasks[partname].pop() - # self._reschedule(task) - # else: - # self.printer.status('HOLD', task.check.info(), just='right') + if (len(self._running_tasks[partname]) + + len(self._build_tasks[partname]) >= partition.max_jobs): + # Make sure that we still exceeded the job limit + getlogger().debug2( + f'Reached concurrency limit for partition {partname!r}: ' + f'{partition.max_jobs} job(s)' + ) + self._poll_tasks() + + if (len(self._running_tasks[partname]) + + len(self._build_tasks[partname]) < partition.max_jobs): + # Task was put in _ready_tasks during setup + self._ready_tasks[partname].pop() + self._reschedule_compile(task) + else: + self.printer.status('HOLD', task.check.info(), just='right') except TaskExit: if not task.failed and not task.skipped: with contextlib.suppress(TaskExit): @@ -544,6 +546,10 @@ def _failall(self, cause): task.abort(cause) self._running_tasks = {} + for task in list(itertools.chain(*self._build_tasks.values())): + task.abort(cause) + + self._build_tasks = {} for ready_list in self._ready_tasks.values(): for task in ready_list: task.abort(cause) diff --git a/unittests/test_policies.py b/unittests/test_policies.py index e2d1afb159..3f34a6e3b6 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -728,8 +728,8 @@ def test_kbd_interrupt_in_wait_with_limited_concurrency( runner, _ = async_runner with pytest.raises(KeyboardInterrupt): runner.runall(make_cases([ - SleepCheck(10), SleepCheck(10), - KeyboardInterruptCheck(), SleepCheck(10) + KeyboardInterruptCheck(), SleepCheck(10), + SleepCheck(10), SleepCheck(10) ])) assert_interrupted_run(runner) From 717f0ef2984a29bbcbdb40308bca70b21b03713b Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 13 Oct 2021 15:41:22 +0200 Subject: [PATCH 15/76] Change split_jobs arguments 
--- reframe/frontend/executors/policies.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 916c168cef..110f80de6a 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -462,20 +462,20 @@ def runcase(self, case): def _poll_tasks(self): '''Update the counts of running checks per partition.''' - def split_jobs(tasks, split_build_jobs=False): + def split_jobs(tasks, kind='run'): '''Split jobs into forced local and normal ones.''' forced_local = [] normal = [] for t in tasks: - if t.check.local or (split_build_jobs and t.check.build_locally): - if split_build_jobs: + if t.check.local or (kind=='build' and t.check.build_locally): + if kind=='build': forced_local.append(t.check.build_job) - else: + elif kind=='run': forced_local.append(t.check.job) else: - if split_build_jobs: + if kind=='build': normal.append(t.check.build_job) - else: + elif kind=='run': normal.append(t.check.job) return forced_local, normal @@ -499,7 +499,7 @@ def split_jobs(tasks, split_build_jobs=False): getlogger().debug2(f'Polling {num_tasks} building task(s) in ' f'{partname!r}') forced_local_jobs, part_jobs = split_jobs( - self._build_tasks[partname], split_build_jobs=True + self._build_tasks[partname], kind='build' ) part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) From bb3b563b1d9a2e64ae64829caeb8909e5bc6b23f Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 13 Oct 2021 15:44:22 +0200 Subject: [PATCH 16/76] Split long lines --- reframe/frontend/executors/__init__.py | 3 ++- reframe/frontend/executors/policies.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 1c03621707..18642600f2 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -379,7 
+379,8 @@ def on_task_run(self, task): @abc.abstractmethod def on_task_compile(self, task): - '''Called whenever the compile() method of a RegressionTask is called.''' + '''Called whenever the compile() method of a RegressionTask is + called.''' @abc.abstractmethod def on_task_exit(self, task): diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 110f80de6a..bf1286f5da 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -559,7 +559,8 @@ def _failall(self, cause): task.abort(cause) def _reschedule_compile(self, task): - getlogger().debug2(f'Scheduling test case {task.testcase} for compiling') + getlogger().debug2(f'Scheduling test case {task.testcase} for ' + f'compiling') task.compile() def _reschedule_run(self, task): From 40fc721622e51587a12ef3cec6fd1e392cfcf4e0 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 13 Oct 2021 15:52:30 +0200 Subject: [PATCH 17/76] Update split_jobs --- reframe/frontend/executors/policies.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index bf1286f5da..b6c24dcd11 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -467,15 +467,16 @@ def split_jobs(tasks, kind='run'): forced_local = [] normal = [] for t in tasks: - if t.check.local or (kind=='build' and t.check.build_locally): - if kind=='build': + if kind == 'build': + if t.check.local or t.check.build_locally: forced_local.append(t.check.build_job) - elif kind=='run': - forced_local.append(t.check.job) - else: - if kind=='build': + else: normal.append(t.check.build_job) - elif kind=='run': + + elif kind == 'run': + if t.check.local: + forced_local.append(t.check.job) + else: normal.append(t.check.job) return forced_local, normal From d9740f293ead102a63bd95210490e1598159d411 Mon Sep 17 00:00:00 2001 From: Eirini 
Koutsaniti Date: Wed, 20 Oct 2021 13:35:41 +0200 Subject: [PATCH 18/76] Address PR comments --- reframe/frontend/executors/policies.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index b6c24dcd11..b4a5553815 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -235,10 +235,10 @@ def __init__(self): # Index tasks by test cases self._task_index = {} - # All currently building tasks per partition + # All tasks currently in their build phase per partition self._build_tasks = {} - # All currently running tasks per partition + # All tasks currently in their run phase per partition self._running_tasks = {} # Tasks that need to be finalized @@ -492,7 +492,10 @@ def split_jobs(tasks, kind='run'): part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) - # Trigger notifications for finished jobs + # Trigger notifications for finished jobs. + # We need need a copy of the list here in order to not modify the + # list while looping over it. `run_complete` calls `on_task_exit`, + # which in turn will remove the task from `_running_tasks`. for t in self._running_tasks[partname][:]: t.run_complete() From 6b52963b7d2d26e885cf4636e614cd59e5c6e0f0 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 20 Oct 2021 13:42:06 +0200 Subject: [PATCH 19/76] Remove migration warning --- reframe/core/pipeline.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/reframe/core/pipeline.py b/reframe/core/pipeline.py index 3060168c8a..284badd68a 100644 --- a/reframe/core/pipeline.py +++ b/reframe/core/pipeline.py @@ -1568,12 +1568,6 @@ def compile_complete(self): :class:`True` is returned. :raises reframe.core.exceptions.ReframeError: In case of errors. - .. warning:: - You may not override this method directly unless you are in - special test. See `here - `__ for - more details. 
- ''' if not self._build_job: return True From 7d8035313250e0d34cba10d259a65fabb73a20e4 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 20 Oct 2021 21:48:28 +0200 Subject: [PATCH 20/76] Minor style fixes --- reframe/frontend/executors/policies.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index b4a5553815..97f7aec4b3 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -464,6 +464,7 @@ def _poll_tasks(self): def split_jobs(tasks, kind='run'): '''Split jobs into forced local and normal ones.''' + forced_local = [] normal = [] for t in tasks: @@ -493,9 +494,11 @@ def split_jobs(tasks, kind='run'): self.local_scheduler.poll(*forced_local_jobs) # Trigger notifications for finished jobs. - # We need need a copy of the list here in order to not modify the - # list while looping over it. `run_complete` calls `on_task_exit`, - # which in turn will remove the task from `_running_tasks`. + # + # NOTE: We need a copy of the list here in order to not modify the + # list while looping over it. `run_complete()` calls + # `on_task_exit()`, which in turn will remove the task from + # `_running_tasks`. 
for t in self._running_tasks[partname][:]: t.run_complete() From 0c0b2185f521fd92e64655be3a0cd902e208bd06 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 09:42:50 +0100 Subject: [PATCH 21/76] Adding rfm_max_jobs in config --- reframe/frontend/executors/policies.py | 4 ++++ reframe/schemas/config.json | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index b4a5553815..4fae2ab8f6 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -10,6 +10,7 @@ import sys import time +import reframe.core.runtime as rt from reframe.core.exceptions import (FailureLimitError, SkipTestError, TaskDependencyError, @@ -261,6 +262,9 @@ def __init__(self): self.task_listeners.append(self) + # Max jobs spawned by the reframe thread + self._rfm_max_jobs = rt.runtime().get_option(f'systems/0/rfm_max_jobs') + def _remove_from_running(self, task): getlogger().debug2( f'Removing task from the running list: {task.testcase}' diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 7c6f78979e..2a4c97a3a0 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -237,6 +237,7 @@ "stagedir": {"type": "string"}, "outputdir": {"type": "string"}, "resourcesdir": {"type": "string"}, + "rfm_max_jobs": {"type": "number"}, "partitions": { "type": "array", "items": { @@ -560,6 +561,7 @@ "systems/partitions/prepare_cmds": [], "systems/partitions/processor": {}, "systems/partitions/devices": [], - "systems/partitions/extras": {} + "systems/partitions/extras": {}, + "systems/rfm_max_jobs": 8 } } From 299c99220d67d40dfe0f3d737efb295c5d05ae77 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 09:59:48 +0100 Subject: [PATCH 22/76] Rename _ready_tasks to _ready_to_compile_tasks --- reframe/frontend/executors/policies.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) 
diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 4fae2ab8f6..34317f9670 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -249,7 +249,7 @@ def __init__(self): self._retired_tasks = [] # Ready tasks to be executed per partition - self._ready_tasks = {} + self._ready_to_compile_tasks = {} # Tasks that are waiting for dependencies self._waiting_tasks = [] @@ -306,7 +306,7 @@ def deps_skipped(self, task): def on_task_setup(self, task): partname = task.check.current_partition.fullname - self._ready_tasks[partname].append(task) + self._ready_to_compile_tasks[partname].append(task) def on_task_run(self, task): partname = task.check.current_partition.fullname @@ -411,7 +411,7 @@ def runcase(self, case): # Set partition-based counters, if not set already self._running_tasks.setdefault(partition.fullname, []) self._build_tasks.setdefault(partition.fullname, []) - self._ready_tasks.setdefault(partition.fullname, []) + self._ready_to_compile_tasks.setdefault(partition.fullname, []) self._max_jobs.setdefault(partition.fullname, partition.max_jobs) task = RegressionTask(case, self.task_listeners) @@ -445,8 +445,8 @@ def runcase(self, case): if (len(self._running_tasks[partname]) + len(self._build_tasks[partname]) < partition.max_jobs): - # Task was put in _ready_tasks during setup - self._ready_tasks[partname].pop() + # Task was put in _ready_to_compile_tasks during setup + self._ready_to_compile_tasks[partname].pop() self._reschedule_compile(task) else: self.printer.status('HOLD', task.check.info(), just='right') @@ -558,7 +558,7 @@ def _failall(self, cause): task.abort(cause) self._build_tasks = {} - for ready_list in self._ready_tasks.values(): + for ready_list in self._ready_to_compile_tasks.values(): for task in ready_list: task.abort(cause) @@ -582,7 +582,7 @@ def _reschedule_all(self): num_rescheduled = 0 for _ in range(num_empty_slots): try: - task = 
self._ready_tasks[partname].pop() + task = self._ready_to_compile_tasks[partname].pop() except IndexError: break @@ -598,7 +598,7 @@ def exit(self): self.printer.separator('short single line', 'waiting for spawned checks to finish') while (countall(self._running_tasks) or self._waiting_tasks or - self._completed_tasks or countall(self._ready_tasks) or + self._completed_tasks or countall(self._ready_to_compile_tasks) or countall(self._build_tasks)): getlogger().debug2(f'Running tasks: ' f'{countall(self._running_tasks)}') From 5133cc1a7e986fc7e1e1a1be275cff1095598efc Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 10:01:36 +0100 Subject: [PATCH 23/76] Rename _build_tasks to _compiling_tasks --- reframe/frontend/executors/policies.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 34317f9670..2406c58f1b 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -237,7 +237,7 @@ def __init__(self): self._task_index = {} # All tasks currently in their build phase per partition - self._build_tasks = {} + self._compiling_tasks = {} # All tasks currently in their run phase per partition self._running_tasks = {} @@ -282,7 +282,7 @@ def _remove_from_building(self, task): ) try: partname = task.check.current_partition.fullname - self._build_tasks[partname].remove(task) + self._compiling_tasks[partname].remove(task) except (ValueError, AttributeError, KeyError): getlogger().debug2('Task was not building') pass @@ -314,7 +314,7 @@ def on_task_run(self, task): def on_task_compile(self, task): partname = task.check.current_partition.fullname - self._build_tasks[partname].append(task) + self._compiling_tasks[partname].append(task) def on_task_skip(self, task): # Remove the task from the running list if it was skipped after the @@ -325,7 +325,7 @@ def on_task_skip(self, task): 
self._running_tasks[partname].remove(task) if task.failed_stage in ('compile_complete', 'compile_wait'): - self._build_tasks[partname].remove(task) + self._compiling_tasks[partname].remove(task) msg = str(task.exc_info[1]) self.printer.status('SKIP', msg, just='right') @@ -410,7 +410,7 @@ def runcase(self, case): # Set partition-based counters, if not set already self._running_tasks.setdefault(partition.fullname, []) - self._build_tasks.setdefault(partition.fullname, []) + self._compiling_tasks.setdefault(partition.fullname, []) self._ready_to_compile_tasks.setdefault(partition.fullname, []) self._max_jobs.setdefault(partition.fullname, partition.max_jobs) @@ -435,7 +435,7 @@ def runcase(self, case): return if (len(self._running_tasks[partname]) + - len(self._build_tasks[partname]) >= partition.max_jobs): + len(self._compiling_tasks[partname]) >= partition.max_jobs): # Make sure that we still exceeded the job limit getlogger().debug2( f'Reached concurrency limit for partition {partname!r}: ' @@ -444,7 +444,7 @@ def runcase(self, case): self._poll_tasks() if (len(self._running_tasks[partname]) + - len(self._build_tasks[partname]) < partition.max_jobs): + len(self._compiling_tasks[partname]) < partition.max_jobs): # Task was put in _ready_to_compile_tasks during setup self._ready_to_compile_tasks[partname].pop() self._reschedule_compile(task) @@ -503,17 +503,17 @@ def split_jobs(tasks, kind='run'): for t in self._running_tasks[partname][:]: t.run_complete() - num_tasks = len(self._build_tasks[partname]) + num_tasks = len(self._compiling_tasks[partname]) getlogger().debug2(f'Polling {num_tasks} building task(s) in ' f'{partname!r}') forced_local_jobs, part_jobs = split_jobs( - self._build_tasks[partname], kind='build' + self._compiling_tasks[partname], kind='build' ) part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) # Trigger notifications for finished compilation jobs - for t in self._build_tasks[partname][:]: + for t in 
self._compiling_tasks[partname][:]: t.compile_complete() def _setup_all(self): @@ -554,10 +554,10 @@ def _failall(self, cause): task.abort(cause) self._running_tasks = {} - for task in list(itertools.chain(*self._build_tasks.values())): + for task in list(itertools.chain(*self._compiling_tasks.values())): task.abort(cause) - self._build_tasks = {} + self._compiling_tasks = {} for ready_list in self._ready_to_compile_tasks.values(): for task in ready_list: task.abort(cause) @@ -599,7 +599,7 @@ def exit(self): 'waiting for spawned checks to finish') while (countall(self._running_tasks) or self._waiting_tasks or self._completed_tasks or countall(self._ready_to_compile_tasks) or - countall(self._build_tasks)): + countall(self._compiling_tasks)): getlogger().debug2(f'Running tasks: ' f'{countall(self._running_tasks)}') try: From a261d574c9d97b2046ee17736e5c1b9dcec83bf6 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 10:03:55 +0100 Subject: [PATCH 24/76] Update description of list --- reframe/frontend/executors/policies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 2406c58f1b..5ab5fe7ea5 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -248,7 +248,7 @@ def __init__(self): # Retired tasks that need to be cleaned up self._retired_tasks = [] - # Ready tasks to be executed per partition + # Tasks ready to be compiled per partition self._ready_to_compile_tasks = {} # Tasks that are waiting for dependencies From 461d973e1402264f60a40c5f9daf2703d3620f38 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 10:06:43 +0100 Subject: [PATCH 25/76] Update descr order --- reframe/frontend/executors/policies.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 
5ab5fe7ea5..c0a05333f3 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -236,9 +236,18 @@ def __init__(self): # Index tasks by test cases self._task_index = {} + # Tasks that are waiting for dependencies + self._waiting_tasks = [] + + # Tasks ready to be compiled per partition + self._ready_to_compile_tasks = {} + # All tasks currently in their build phase per partition self._compiling_tasks = {} + # Tasks ready to run per partition + self._ready_to_run_tasks = {} + # All tasks currently in their run phase per partition self._running_tasks = {} @@ -248,23 +257,17 @@ def __init__(self): # Retired tasks that need to be cleaned up self._retired_tasks = [] - # Tasks ready to be compiled per partition - self._ready_to_compile_tasks = {} - - # Tasks that are waiting for dependencies - self._waiting_tasks = [] - # Job limit per partition self._max_jobs = {} + # Max jobs spawned by the reframe thread + self._rfm_max_jobs = rt.runtime().get_option(f'systems/0/rfm_max_jobs') + # Keep a reference to all the partitions self._partitions = set() self.task_listeners.append(self) - # Max jobs spawned by the reframe thread - self._rfm_max_jobs = rt.runtime().get_option(f'systems/0/rfm_max_jobs') - def _remove_from_running(self, task): getlogger().debug2( f'Removing task from the running list: {task.testcase}' From 89199f24055895c7ba042ed4070d8f32573e30cf Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 11:00:38 +0100 Subject: [PATCH 26/76] Split scheduling of running and compiling jobs --- reframe/frontend/executors/policies.py | 27 +++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index c0a05333f3..63c2bf90fe 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -379,7 +379,8 @@ def on_task_exit(self, task): def on_task_compile_exit(self, task): 
task.compile_wait() self._remove_from_building(task) - self._reschedule_run(task) + partname = task.check.current_partition.fullname + self._ready_to_run_tasks[partname].append(task) def _setup_task(self, task): if self.deps_skipped(task): @@ -415,6 +416,7 @@ def runcase(self, case): self._running_tasks.setdefault(partition.fullname, []) self._compiling_tasks.setdefault(partition.fullname, []) self._ready_to_compile_tasks.setdefault(partition.fullname, []) + self._ready_to_run_tasks.setdefault(partition.fullname, []) self._max_jobs.setdefault(partition.fullname, partition.max_jobs) task = RegressionTask(case, self.task_listeners) @@ -578,23 +580,28 @@ def _reschedule_run(self, task): getlogger().debug2(f'Scheduling test case {task.testcase} for running') task.run() - def _reschedule_all(self): - for partname, tasks in self._running_tasks.items(): - num_tasks = len(tasks) + def _reschedule_all(self, phase='run'): + for part in self._partitions: + partname = part.fullname + num_tasks = ( + len(self._running_tasks[partname]) + + len(self._compiling_tasks[partname]) + ) num_empty_slots = self._max_jobs[partname] - num_tasks num_rescheduled = 0 for _ in range(num_empty_slots): try: - task = self._ready_to_compile_tasks[partname].pop() + queue = getattr(self, f'_ready_to_{phase}_tasks') + task = queue[partname].pop() except IndexError: break - self._reschedule_compile(task) + getattr(self, f'_reschedule_{phase}')(task) num_rescheduled += 1 if num_rescheduled: getlogger().debug2( - f'Rescheduled {num_rescheduled} job(s) on {partname!r}' + f'Rescheduled {num_rescheduled} {phase} job(s) on {partname!r}' ) def exit(self): @@ -616,14 +623,16 @@ def exit(self): num_running = countall(self._running_tasks) self._finalize_all() self._setup_all() - self._reschedule_all() + self._reschedule_all(phase='compile') + self._reschedule_all(phase='run') _cleanup_all(self._retired_tasks, not self.keep_stage_files) if num_running: self._pollctl.running_tasks(num_running).snooze() except 
TaskExit: with contextlib.suppress(TaskExit): - self._reschedule_all() + self._reschedule_all(phase='compile') + self._reschedule_all(phase='run') except ABORT_REASONS as e: self._failall(e) raise From 545234eb92be376a7af84ffb28d097a67a97ab3e Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 11:25:03 +0100 Subject: [PATCH 27/76] Skip no-op phases in tests --- reframe/frontend/executors/policies.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 63c2bf90fe..58928b06bb 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -16,6 +16,8 @@ TaskDependencyError, TaskExit) from reframe.core.logging import getlogger +from reframe.core.pipeline import (CompileOnlyRegressionTest, + RunOnlyRegressionTest) from reframe.frontend.executors import (ExecutionPolicy, RegressionTask, TaskEventListener, ABORT_REASONS) @@ -309,7 +311,10 @@ def deps_skipped(self, task): def on_task_setup(self, task): partname = task.check.current_partition.fullname - self._ready_to_compile_tasks[partname].append(task) + if (isinstance(task.check, RunOnlyRegressionTest)): + self._ready_to_run_tasks[partname].append(task) + else: + self._ready_to_compile_tasks[partname].append(task) def on_task_run(self, task): partname = task.check.current_partition.fullname @@ -380,7 +385,10 @@ def on_task_compile_exit(self, task): task.compile_wait() self._remove_from_building(task) partname = task.check.current_partition.fullname - self._ready_to_run_tasks[partname].append(task) + if (isinstance(task.check, CompileOnlyRegressionTest)): + self._completed_tasks.append(task) + else: + self._ready_to_run_tasks[partname].append(task) def _setup_task(self, task): if self.deps_skipped(task): @@ -450,9 +458,14 @@ def runcase(self, case): if (len(self._running_tasks[partname]) + len(self._compiling_tasks[partname]) < 
partition.max_jobs): - # Task was put in _ready_to_compile_tasks during setup - self._ready_to_compile_tasks[partname].pop() - self._reschedule_compile(task) + if isinstance(task.check, RunOnlyRegressionTest): + # Task was put in _ready_to_run_tasks during setup + self._ready_to_run_tasks[partname].pop() + self._reschedule_run(task) + else: + # Task was put in _ready_to_compile_tasks during setup + self._ready_to_compile_tasks[partname].pop() + self._reschedule_compile(task) else: self.printer.status('HOLD', task.check.info(), just='right') except TaskExit: From 4499dbbb71000e1bde4df0457b5f9470caad9edb Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 13:36:33 +0100 Subject: [PATCH 28/76] Create separate lists for forced_local jobs --- reframe/frontend/executors/policies.py | 79 ++++++++++++++------------ 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 58928b06bb..0e823bb943 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -23,7 +23,12 @@ def countall(d): - return functools.reduce(lambda l, r: l + len(r), d.values(), 0) + res = 0 + for (q1, q2) in d.values(): + res += len(q1) + res += len(q2) + + return res def _cleanup_all(tasks, *args, **kwargs): @@ -276,7 +281,7 @@ def _remove_from_running(self, task): ) try: partname = task.check.current_partition.fullname - self._running_tasks[partname].remove(task) + self._running_tasks[partname][0].remove(task) except (ValueError, AttributeError, KeyError): getlogger().debug2('Task was not running') pass @@ -287,7 +292,7 @@ def _remove_from_building(self, task): ) try: partname = task.check.current_partition.fullname - self._compiling_tasks[partname].remove(task) + self._compiling_tasks[partname][0].remove(task) except (ValueError, AttributeError, KeyError): getlogger().debug2('Task was not building') pass @@ -312,17 +317,17 @@ def deps_skipped(self, 
task): def on_task_setup(self, task): partname = task.check.current_partition.fullname if (isinstance(task.check, RunOnlyRegressionTest)): - self._ready_to_run_tasks[partname].append(task) + self._ready_to_run_tasks[partname][0].append(task) else: - self._ready_to_compile_tasks[partname].append(task) + self._ready_to_compile_tasks[partname][0].append(task) def on_task_run(self, task): partname = task.check.current_partition.fullname - self._running_tasks[partname].append(task) + self._running_tasks[partname][0].append(task) def on_task_compile(self, task): partname = task.check.current_partition.fullname - self._compiling_tasks[partname].append(task) + self._compiling_tasks[partname][0].append(task) def on_task_skip(self, task): # Remove the task from the running list if it was skipped after the @@ -330,10 +335,10 @@ def on_task_skip(self, task): if task.check.current_partition: partname = task.check.current_partition.fullname if task.failed_stage in ('run_complete', 'run_wait'): - self._running_tasks[partname].remove(task) + self._running_tasks[partname][0].remove(task) if task.failed_stage in ('compile_complete', 'compile_wait'): - self._compiling_tasks[partname].remove(task) + self._compiling_tasks[partname][0].remove(task) msg = str(task.exc_info[1]) self.printer.status('SKIP', msg, just='right') @@ -388,7 +393,7 @@ def on_task_compile_exit(self, task): if (isinstance(task.check, CompileOnlyRegressionTest)): self._completed_tasks.append(task) else: - self._ready_to_run_tasks[partname].append(task) + self._ready_to_run_tasks[partname][0].append(task) def _setup_task(self, task): if self.deps_skipped(task): @@ -421,10 +426,10 @@ def runcase(self, case): self._partitions.add(partition) # Set partition-based counters, if not set already - self._running_tasks.setdefault(partition.fullname, []) - self._compiling_tasks.setdefault(partition.fullname, []) - self._ready_to_compile_tasks.setdefault(partition.fullname, []) - 
self._ready_to_run_tasks.setdefault(partition.fullname, []) + self._running_tasks.setdefault(partition.fullname, ([], [])) + self._compiling_tasks.setdefault(partition.fullname, ([], [])) + self._ready_to_compile_tasks.setdefault(partition.fullname, ([], [])) + self._ready_to_run_tasks.setdefault(partition.fullname, ([], [])) self._max_jobs.setdefault(partition.fullname, partition.max_jobs) task = RegressionTask(case, self.task_listeners) @@ -447,8 +452,8 @@ def runcase(self, case): return - if (len(self._running_tasks[partname]) + - len(self._compiling_tasks[partname]) >= partition.max_jobs): + if (len(self._running_tasks[partname][0]) + + len(self._compiling_tasks[partname][0]) >= partition.max_jobs): # Make sure that we still exceeded the job limit getlogger().debug2( f'Reached concurrency limit for partition {partname!r}: ' @@ -456,15 +461,15 @@ def runcase(self, case): ) self._poll_tasks() - if (len(self._running_tasks[partname]) + - len(self._compiling_tasks[partname]) < partition.max_jobs): + if (len(self._running_tasks[partname][0]) + + len(self._compiling_tasks[partname][0]) < partition.max_jobs): if isinstance(task.check, RunOnlyRegressionTest): # Task was put in _ready_to_run_tasks during setup - self._ready_to_run_tasks[partname].pop() + self._ready_to_run_tasks[partname][0].pop() self._reschedule_run(task) else: # Task was put in _ready_to_compile_tasks during setup - self._ready_to_compile_tasks[partname].pop() + self._ready_to_compile_tasks[partname][0].pop() self._reschedule_compile(task) else: self.printer.status('HOLD', task.check.info(), just='right') @@ -505,11 +510,11 @@ def split_jobs(tasks, kind='run'): for part in self._partitions: partname = part.fullname - num_tasks = len(self._running_tasks[partname]) + num_tasks = len(self._running_tasks[partname][0]) getlogger().debug2(f'Polling {num_tasks} running task(s) in ' f'{partname!r}') forced_local_jobs, part_jobs = split_jobs( - self._running_tasks[partname] + self._running_tasks[partname][0] 
) part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) @@ -518,20 +523,20 @@ def split_jobs(tasks, kind='run'): # We need need a copy of the list here in order to not modify the # list while looping over it. `run_complete` calls `on_task_exit`, # which in turn will remove the task from `_running_tasks`. - for t in self._running_tasks[partname][:]: + for t in self._running_tasks[partname][0][:]: t.run_complete() - num_tasks = len(self._compiling_tasks[partname]) + num_tasks = len(self._compiling_tasks[partname][0]) getlogger().debug2(f'Polling {num_tasks} building task(s) in ' f'{partname!r}') forced_local_jobs, part_jobs = split_jobs( - self._compiling_tasks[partname], kind='build' + self._compiling_tasks[partname][0], kind='build' ) part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) # Trigger notifications for finished compilation jobs - for t in self._compiling_tasks[partname][:]: + for t in self._compiling_tasks[partname][0][:]: t.compile_complete() def _setup_all(self): @@ -568,18 +573,22 @@ def _finalize_task(self, task): def _failall(self, cause): '''Mark all tests as failures''' getlogger().debug2(f'Aborting all tasks due to {type(cause).__name__}') - for task in list(itertools.chain(*self._running_tasks.values())): + for task in list(itertools.chain(*itertools.chain(*self._running_tasks.values()))): task.abort(cause) self._running_tasks = {} - for task in list(itertools.chain(*self._compiling_tasks.values())): + for task in list(itertools.chain(*itertools.chain(*self._compiling_tasks.values()))): task.abort(cause) self._compiling_tasks = {} - for ready_list in self._ready_to_compile_tasks.values(): - for task in ready_list: - task.abort(cause) + for task in list(itertools.chain(*itertools.chain(*self._ready_to_compile_tasks.values()))): + task.abort(cause) + + self._ready_to_compile_tasks = {} + for task in list(itertools.chain(*itertools.chain(*self._ready_to_run_tasks.values()))): + task.abort(cause) + 
self._ready_to_run_tasks = {} for task in itertools.chain(self._waiting_tasks, self._completed_tasks): task.abort(cause) @@ -597,15 +606,15 @@ def _reschedule_all(self, phase='run'): for part in self._partitions: partname = part.fullname num_tasks = ( - len(self._running_tasks[partname]) + - len(self._compiling_tasks[partname]) + len(self._running_tasks[partname][0]) + + len(self._compiling_tasks[partname][0]) ) num_empty_slots = self._max_jobs[partname] - num_tasks num_rescheduled = 0 for _ in range(num_empty_slots): try: queue = getattr(self, f'_ready_to_{phase}_tasks') - task = queue[partname].pop() + task = queue[partname][0].pop() except IndexError: break @@ -622,7 +631,7 @@ def exit(self): 'waiting for spawned checks to finish') while (countall(self._running_tasks) or self._waiting_tasks or self._completed_tasks or countall(self._ready_to_compile_tasks) or - countall(self._compiling_tasks)): + countall(self._compiling_tasks) or countall(self._ready_to_run_tasks)): getlogger().debug2(f'Running tasks: ' f'{countall(self._running_tasks)}') try: From 258618d279c9790e25cd172e5144f1c53628700c Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 18:34:00 +0100 Subject: [PATCH 29/76] Enforce different limits for forced local jobs --- reframe/frontend/executors/policies.py | 101 +++++++++++++------------ 1 file changed, 54 insertions(+), 47 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 0e823bb943..c9f63e9816 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -281,7 +281,7 @@ def _remove_from_running(self, task): ) try: partname = task.check.current_partition.fullname - self._running_tasks[partname][0].remove(task) + self._running_tasks[partname][self.local_index(task, phase='run')].remove(task) except (ValueError, AttributeError, KeyError): getlogger().debug2('Task was not running') pass @@ -292,7 +292,7 @@ def 
_remove_from_building(self, task): ) try: partname = task.check.current_partition.fullname - self._compiling_tasks[partname][0].remove(task) + self._compiling_tasks[partname][self.local_index(task, phase='compile')].remove(task) except (ValueError, AttributeError, KeyError): getlogger().debug2('Task was not building') pass @@ -314,20 +314,26 @@ def deps_skipped(self, task): return any(self._task_index[c].skipped for c in task.testcase.deps if c in self._task_index) + def local_index(self, task, phase='run'): + return ( + task.check.local or + (phase == 'build' and task.check.build_locally) + ) + def on_task_setup(self, task): partname = task.check.current_partition.fullname if (isinstance(task.check, RunOnlyRegressionTest)): - self._ready_to_run_tasks[partname][0].append(task) + self._ready_to_run_tasks[partname][self.local_index(task, phase='run')].append(task) else: - self._ready_to_compile_tasks[partname][0].append(task) + self._ready_to_compile_tasks[partname][self.local_index(task, phase='compile')].append(task) def on_task_run(self, task): partname = task.check.current_partition.fullname - self._running_tasks[partname][0].append(task) + self._running_tasks[partname][self.local_index(task, phase='run')].append(task) def on_task_compile(self, task): partname = task.check.current_partition.fullname - self._compiling_tasks[partname][0].append(task) + self._compiling_tasks[partname][self.local_index(task, phase='compile')].append(task) def on_task_skip(self, task): # Remove the task from the running list if it was skipped after the @@ -335,10 +341,10 @@ def on_task_skip(self, task): if task.check.current_partition: partname = task.check.current_partition.fullname if task.failed_stage in ('run_complete', 'run_wait'): - self._running_tasks[partname][0].remove(task) + self._running_tasks[partname][self.local_index(task, phase='run')].remove(task) if task.failed_stage in ('compile_complete', 'compile_wait'): - self._compiling_tasks[partname][0].remove(task) + 
self._compiling_tasks[partname][self.local_index(task, phase='compile')].remove(task) msg = str(task.exc_info[1]) self.printer.status('SKIP', msg, just='right') @@ -393,7 +399,7 @@ def on_task_compile_exit(self, task): if (isinstance(task.check, CompileOnlyRegressionTest)): self._completed_tasks.append(task) else: - self._ready_to_run_tasks[partname][0].append(task) + self._ready_to_run_tasks[partname][self.local_index(task, phase='run')].append(task) def _setup_task(self, task): if self.deps_skipped(task): @@ -452,8 +458,13 @@ def runcase(self, case): return - if (len(self._running_tasks[partname][0]) + - len(self._compiling_tasks[partname][0]) >= partition.max_jobs): + if isinstance(task.check, RunOnlyRegressionTest): + local_index = self.local_index(task, phase='run') + else: + local_index = self.local_index(task, phase='compile') + + if (len(self._running_tasks[partname][local_index]) + + len(self._compiling_tasks[partname][local_index]) >= partition.max_jobs): # Make sure that we still exceeded the job limit getlogger().debug2( f'Reached concurrency limit for partition {partname!r}: ' @@ -461,15 +472,15 @@ def runcase(self, case): ) self._poll_tasks() - if (len(self._running_tasks[partname][0]) + - len(self._compiling_tasks[partname][0]) < partition.max_jobs): + if (len(self._running_tasks[partname][local_index]) + + len(self._compiling_tasks[partname][local_index]) < partition.max_jobs): if isinstance(task.check, RunOnlyRegressionTest): # Task was put in _ready_to_run_tasks during setup - self._ready_to_run_tasks[partname][0].pop() + self._ready_to_run_tasks[partname][local_index].pop() self._reschedule_run(task) else: # Task was put in _ready_to_compile_tasks during setup - self._ready_to_compile_tasks[partname][0].pop() + self._ready_to_compile_tasks[partname][local_index].pop() self._reschedule_compile(task) else: self.printer.status('HOLD', task.check.info(), just='right') @@ -488,34 +499,13 @@ def runcase(self, case): def _poll_tasks(self): '''Update the 
counts of running checks per partition.''' - - def split_jobs(tasks, kind='run'): - '''Split jobs into forced local and normal ones.''' - forced_local = [] - normal = [] - for t in tasks: - if kind == 'build': - if t.check.local or t.check.build_locally: - forced_local.append(t.check.build_job) - else: - normal.append(t.check.build_job) - - elif kind == 'run': - if t.check.local: - forced_local.append(t.check.job) - else: - normal.append(t.check.job) - - return forced_local, normal - for part in self._partitions: partname = part.fullname num_tasks = len(self._running_tasks[partname][0]) getlogger().debug2(f'Polling {num_tasks} running task(s) in ' f'{partname!r}') - forced_local_jobs, part_jobs = split_jobs( - self._running_tasks[partname][0] - ) + part_jobs = [t.check.job for t in self._running_tasks[partname][0]] + forced_local_jobs = [t.check.job for t in self._running_tasks[partname][1]] part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) @@ -523,20 +513,19 @@ def split_jobs(tasks, kind='run'): # We need need a copy of the list here in order to not modify the # list while looping over it. `run_complete` calls `on_task_exit`, # which in turn will remove the task from `_running_tasks`. 
- for t in self._running_tasks[partname][0][:]: + for t in self._running_tasks[partname][0] + self._running_tasks[partname][1]: t.run_complete() num_tasks = len(self._compiling_tasks[partname][0]) getlogger().debug2(f'Polling {num_tasks} building task(s) in ' f'{partname!r}') - forced_local_jobs, part_jobs = split_jobs( - self._compiling_tasks[partname][0], kind='build' - ) + part_jobs = [t.check.build_job for t in self._compiling_tasks[partname][0]] + forced_local_jobs = [t.check.build_job for t in self._compiling_tasks[partname][1]] part.scheduler.poll(*part_jobs) self.local_scheduler.poll(*forced_local_jobs) # Trigger notifications for finished compilation jobs - for t in self._compiling_tasks[partname][0][:]: + for t in self._compiling_tasks[partname][0] + self._compiling_tasks[partname][1]: t.compile_complete() def _setup_all(self): @@ -603,15 +592,21 @@ def _reschedule_run(self, task): task.run() def _reschedule_all(self, phase='run'): + local_tasks = 0 + for (_, lt) in self._running_tasks.values(): + local_tasks += len(lt) + + local_slots = self._rfm_max_jobs - local_tasks for part in self._partitions: partname = part.fullname - num_tasks = ( + part_tasks = ( len(self._running_tasks[partname][0]) + len(self._compiling_tasks[partname][0]) ) - num_empty_slots = self._max_jobs[partname] - num_tasks + part_slots = self._max_jobs[partname] - part_tasks num_rescheduled = 0 - for _ in range(num_empty_slots): + + for _ in range(part_slots): try: queue = getattr(self, f'_ready_to_{phase}_tasks') task = queue[partname][0].pop() @@ -621,9 +616,21 @@ def _reschedule_all(self, phase='run'): getattr(self, f'_reschedule_{phase}')(task) num_rescheduled += 1 + for _ in range(local_slots): + try: + queue = getattr(self, f'_ready_to_{phase}_tasks') + task = queue[partname][1].pop() + except IndexError: + break + + getattr(self, f'_reschedule_{phase}')(task) + local_slots -= 1 + num_rescheduled += 1 + if num_rescheduled: getlogger().debug2( - f'Rescheduled {num_rescheduled} 
{phase} job(s) on {partname!r}' + f'Rescheduled {num_rescheduled} {phase} job(s) on ' + f'{partname!r}' ) def exit(self): From 1fdbccd3da35309237921ebc869e98c8cc4b8a56 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 16 Nov 2021 20:17:15 +0100 Subject: [PATCH 30/76] Fix runcase checks --- reframe/frontend/executors/policies.py | 32 ++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index c9f63e9816..65c1c0dbe1 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -317,7 +317,7 @@ def deps_skipped(self, task): def local_index(self, task, phase='run'): return ( task.check.local or - (phase == 'build' and task.check.build_locally) + (phase == 'compile' and task.check.build_locally) ) def on_task_setup(self, task): @@ -463,8 +463,25 @@ def runcase(self, case): else: local_index = self.local_index(task, phase='compile') - if (len(self._running_tasks[partname][local_index]) + - len(self._compiling_tasks[partname][local_index]) >= partition.max_jobs): + job_limit = self._rfm_max_jobs if local_index else partition.max_jobs + + def all_submissions(local, partname=None): + if local: + local_tasks = 0 + for (_, lt) in self._running_tasks.values(): + local_tasks += len(lt) + + for (_, lt) in self._compiling_tasks.values(): + local_tasks += len(lt) + + return local_tasks + else: + return ( + len(self._running_tasks[partname][local]) + + len(self._compiling_tasks[partname][local]) + ) + + if (all_submissions(local_index, partname) >= job_limit): # Make sure that we still exceeded the job limit getlogger().debug2( f'Reached concurrency limit for partition {partname!r}: ' @@ -472,8 +489,7 @@ def runcase(self, case): ) self._poll_tasks() - if (len(self._running_tasks[partname][local_index]) + - len(self._compiling_tasks[partname][local_index]) < partition.max_jobs): + if (all_submissions(local_index, partname) 
< job_limit): if isinstance(task.check, RunOnlyRegressionTest): # Task was put in _ready_to_run_tasks during setup self._ready_to_run_tasks[partname][local_index].pop() @@ -484,6 +500,12 @@ def runcase(self, case): self._reschedule_compile(task) else: self.printer.status('HOLD', task.check.info(), just='right') + + # NOTE: If we don't schedule runs here and we have a lot of tests + # compiling we will begin submitting only after all the tests are + # processed. On the other hand I am not sure where to schedule + # runs here. + self._reschedule_all(phase='run') except TaskExit: if not task.failed and not task.skipped: with contextlib.suppress(TaskExit): From 30b273d3b87973ccfffa76f89656538614f16d6f Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Thu, 18 Nov 2021 18:56:35 +0100 Subject: [PATCH 31/76] Refactor async policy --- reframe/frontend/executors/__init__.py | 1 + reframe/frontend/executors/policies.py | 617 ++++++++++--------------- 2 files changed, 246 insertions(+), 372 deletions(-) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 0c0252197b..a9f7a1571f 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -128,6 +128,7 @@ def __init__(self, case, listeners=[]): self._case = case self._failed_stage = None self._current_stage = 'startup' + self.policy_stage = 'wait' self._exc_info = (None, None, None) self._listeners = list(listeners) self._skipped = False diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 65c1c0dbe1..c5c9bddcc1 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -235,7 +235,6 @@ def exit(self): class AsynchronousExecutionPolicy(ExecutionPolicy, TaskEventListener): def __init__(self): - super().__init__() self._pollctl = _PollController() @@ -243,25 +242,19 @@ def __init__(self): # Index tasks by test cases self._task_index = {} - # Tasks that are 
waiting for dependencies - self._waiting_tasks = [] - - # Tasks ready to be compiled per partition - self._ready_to_compile_tasks = {} + # A set of all the current tasks + self._current_tasks = set() - # All tasks currently in their build phase per partition - self._compiling_tasks = {} - - # Tasks ready to run per partition - self._ready_to_run_tasks = {} + # Keep a reference to all the partitions + self._partitions = set() - # All tasks currently in their run phase per partition - self._running_tasks = {} + # A set of the jobs that should be polled by this scheduler + self._local_scheduler_tasks = set() - # Tasks that need to be finalized - self._completed_tasks = [] + # Sets of the jobs that should be polled for each partition + self._scheduler_tasks = {} - # Retired tasks that need to be cleaned up + # self._retired_tasks = [] # Job limit per partition @@ -270,144 +263,87 @@ def __init__(self): # Max jobs spawned by the reframe thread self._rfm_max_jobs = rt.runtime().get_option(f'systems/0/rfm_max_jobs') - # Keep a reference to all the partitions - self._partitions = set() - self.task_listeners.append(self) - def _remove_from_running(self, task): - getlogger().debug2( - f'Removing task from the running list: {task.testcase}' - ) - try: - partname = task.check.current_partition.fullname - self._running_tasks[partname][self.local_index(task, phase='run')].remove(task) - except (ValueError, AttributeError, KeyError): - getlogger().debug2('Task was not running') - pass - - def _remove_from_building(self, task): - getlogger().debug2( - f'Removing task from the building list: {task.testcase}' - ) - try: - partname = task.check.current_partition.fullname - self._compiling_tasks[partname][self.local_index(task, phase='compile')].remove(task) - except (ValueError, AttributeError, KeyError): - getlogger().debug2('Task was not building') - pass - - # FIXME: The following functions are very similar and they are also reused - # in the serial policy; we should refactor them 
- def deps_failed(self, task): - # NOTE: Restored dependencies are not in the task_index - return any(self._task_index[c].failed - for c in task.testcase.deps if c in self._task_index) - - def deps_succeeded(self, task): - # NOTE: Restored dependencies are not in the task_index - return all(self._task_index[c].succeeded - for c in task.testcase.deps if c in self._task_index) - - def deps_skipped(self, task): - # NOTE: Restored dependencies are not in the task_index - return any(self._task_index[c].skipped - for c in task.testcase.deps if c in self._task_index) + def runcase(self, case): + super().runcase(case) + check, partition, environ = case + self._partitions.add(partition) - def local_index(self, task, phase='run'): - return ( - task.check.local or - (phase == 'compile' and task.check.build_locally) + # Set partition-based counters, if not set already + self._scheduler_tasks.setdefault(partition.fullname, set()) + self._max_jobs.setdefault(partition.fullname, partition.max_jobs) + task = RegressionTask(case, self.task_listeners) + self._task_index[case] = task + self.stats.add_task(task) + self.printer.status( + 'START', '%s on %s using %s' % + (check.name, partition.fullname, environ.name) ) + self._current_tasks.add(task) - def on_task_setup(self, task): - partname = task.check.current_partition.fullname - if (isinstance(task.check, RunOnlyRegressionTest)): - self._ready_to_run_tasks[partname][self.local_index(task, phase='run')].append(task) - else: - self._ready_to_compile_tasks[partname][self.local_index(task, phase='compile')].append(task) - - def on_task_run(self, task): - partname = task.check.current_partition.fullname - self._running_tasks[partname][self.local_index(task, phase='run')].append(task) - - def on_task_compile(self, task): - partname = task.check.current_partition.fullname - self._compiling_tasks[partname][self.local_index(task, phase='compile')].append(task) - - def on_task_skip(self, task): - # Remove the task from the running list if 
it was skipped after the - # run phase - if task.check.current_partition: - partname = task.check.current_partition.fullname - if task.failed_stage in ('run_complete', 'run_wait'): - self._running_tasks[partname][self.local_index(task, phase='run')].remove(task) - - if task.failed_stage in ('compile_complete', 'compile_wait'): - self._compiling_tasks[partname][self.local_index(task, phase='compile')].remove(task) - - msg = str(task.exc_info[1]) - self.printer.status('SKIP', msg, just='right') - - def on_task_failure(self, task): - if task.aborted: - return - - self._num_failed_tasks += 1 - msg = f'{task.check.info()} [{task.pipeline_timings_basic()}]' - if task.failed_stage == 'cleanup': - self.printer.status('ERROR', msg, just='right') - else: - self._remove_from_running(task) - self._remove_from_building(task) - self.printer.status('FAIL', msg, just='right') - - stagedir = task.check.stagedir - if not stagedir: - stagedir = '' - - getlogger().info(f'==> test failed during {task.failed_stage!r}: ' - f'test staged in {stagedir!r}') - getlogger().verbose(f'==> timings: {task.pipeline_timings_all()}') - if self._num_failed_tasks >= self.max_failures: - raise FailureLimitError( - f'maximum number of failures ({self.max_failures}) reached' - ) - - def on_task_success(self, task): - msg = f'{task.check.info()} [{task.pipeline_timings_basic()}]' - self.printer.status('OK', msg, just='right') - getlogger().verbose(f'==> timings: {task.pipeline_timings_all()}') - - # Update reference count of dependencies - for c in task.testcase.deps: - # NOTE: Restored dependencies are not in the task_index - if c in self._task_index: - self._task_index[c].ref_count -= 1 - - self._retired_tasks.append(task) + def exit(self): + self.printer.separator('short single line', + 'waiting for spawned checks to finish') + while self._current_tasks: + try: + self._poll_tasks() + num_running = sum( + 1 if t.policy_stage in ['running', 'compiling'] + else 0 for t in self._current_tasks + ) + 
self.advance_all(self._current_tasks) + _cleanup_all(self._retired_tasks, not self.keep_stage_files) + if num_running: + self._pollctl.running_tasks(num_running).snooze() + except ABORT_REASONS as e: + self._failall(e) + raise - def on_task_exit(self, task): - task.run_wait() - self._remove_from_running(task) - self._completed_tasks.append(task) + self.printer.separator('short single line', + 'all spawned checks have finished\n') - def on_task_compile_exit(self, task): - task.compile_wait() - self._remove_from_building(task) - partname = task.check.current_partition.fullname - if (isinstance(task.check, CompileOnlyRegressionTest)): - self._completed_tasks.append(task) - else: - self._ready_to_run_tasks[partname][self.local_index(task, phase='run')].append(task) + def _poll_tasks(self): + for part in self._partitions: + jobs = [] + for t in self._scheduler_tasks[part.fullname]: + if t.policy_stage == 'compiling': + jobs.append(t.check.build_job) + elif t.policy_stage == 'running': + jobs.append(t.check.job) + + part.scheduler.poll(*jobs) + + jobs = [] + for t in self._local_scheduler_tasks: + if t.policy_stage == 'compiling': + jobs.append(t.check.build_job) + elif t.policy_stage == 'running': + jobs.append(t.check.job) + + self.local_scheduler.poll(*jobs) + + def advance_all(self, tasks, timeout=None): + t_init = time.time() + num_prog = 0 + + # progress might remove the tasks that retire or fail + for t in list(tasks): + method = getattr(self, f'advance_{t.policy_stage}') + num_prog += method(t) + t_elapsed = time.time() - t_init + if timeout and t_elapsed > timeout and num_prog: + break - def _setup_task(self, task): + def advance_wait(self, task): if self.deps_skipped(task): try: raise SkipTestError('skipped due to skipped dependencies') except SkipTestError as e: task.skip() - return False + self._current_tasks.remove(task) + return 1 + elif self.deps_succeeded(task): try: task.setup(task.testcase.partition, @@ -415,278 +351,215 @@ def _setup_task(self, task): 
sched_flex_alloc_nodes=self.sched_flex_alloc_nodes, sched_options=self.sched_options) except TaskExit: - return False + self._current_tasks.remove(task) + return 1 else: - return True + if isinstance(task.check, RunOnlyRegressionTest): + task.policy_stage = 'ready_to_run' + else: + task.policy_stage = 'ready_to_compile' + + return 1 + elif self.deps_failed(task): exc = TaskDependencyError('dependencies failed') task.fail((type(exc), exc, None)) - return False + self._current_tasks.remove(task) + return 1 else: # Not all dependencies have finished yet - return False + return 0 - def runcase(self, case): - super().runcase(case) - check, partition, environ = case - self._partitions.add(partition) - - # Set partition-based counters, if not set already - self._running_tasks.setdefault(partition.fullname, ([], [])) - self._compiling_tasks.setdefault(partition.fullname, ([], [])) - self._ready_to_compile_tasks.setdefault(partition.fullname, ([], [])) - self._ready_to_run_tasks.setdefault(partition.fullname, ([], [])) - self._max_jobs.setdefault(partition.fullname, partition.max_jobs) + def advance_ready_to_compile(self, task): + if task.check.local or task.check.build_locally: + if len(self._local_scheduler_tasks) <= self._rfm_max_jobs: + try: + task.compile() + task.policy_stage = 'compiling' + self._local_scheduler_tasks.add(task) + except TaskExit: + self._current_tasks.remove(task) - task = RegressionTask(case, self.task_listeners) - self._task_index[case] = task - self.stats.add_task(task) - self.printer.status( - 'RUN', '%s on %s using %s' % - (check.name, partition.fullname, environ.name) - ) - try: - partname = partition.fullname - if not self._setup_task(task): - if not task.skipped and not task.failed: - self.printer.status( - 'DEP', '%s on %s using %s' % - (check.name, partname, environ.name), - just='right' - ) - self._waiting_tasks.append(task) - - return - - if isinstance(task.check, RunOnlyRegressionTest): - local_index = self.local_index(task, phase='run') 
+ return 1 else: - local_index = self.local_index(task, phase='compile') + return 0 - job_limit = self._rfm_max_jobs if local_index else partition.max_jobs + partname = task.check.current_partition.fullname + if len(self._scheduler_tasks[partname]) <= self._max_jobs[partname]: + try: + task.compile() + task.policy_stage = 'compiling' + self._scheduler_tasks[partname].add(task) + except TaskExit: + self._current_tasks.remove(task) - def all_submissions(local, partname=None): - if local: - local_tasks = 0 - for (_, lt) in self._running_tasks.values(): - local_tasks += len(lt) + return 1 - for (_, lt) in self._compiling_tasks.values(): - local_tasks += len(lt) + return 0 - return local_tasks + def advance_compiling(self, task): + try: + if task.compile_complete(): + if task.check.local or task.check.build_locally: + self._local_scheduler_tasks.remove(task) else: - return ( - len(self._running_tasks[partname][local]) + - len(self._compiling_tasks[partname][local]) - ) - - if (all_submissions(local_index, partname) >= job_limit): - # Make sure that we still exceeded the job limit - getlogger().debug2( - f'Reached concurrency limit for partition {partname!r}: ' - f'{partition.max_jobs} job(s)' - ) - self._poll_tasks() + partname = task.check.current_partition.fullname + self._scheduler_tasks[partname].remove(task) - if (all_submissions(local_index, partname) < job_limit): - if isinstance(task.check, RunOnlyRegressionTest): - # Task was put in _ready_to_run_tasks during setup - self._ready_to_run_tasks[partname][local_index].pop() - self._reschedule_run(task) + if isinstance(task.check, CompileOnlyRegressionTest): + task.policy_stage = 'completed' else: - # Task was put in _ready_to_compile_tasks during setup - self._ready_to_compile_tasks[partname][local_index].pop() - self._reschedule_compile(task) + task.policy_stage = 'ready_to_run' + + return 1 else: - self.printer.status('HOLD', task.check.info(), just='right') + return 0 - # NOTE: If we don't schedule runs here and 
we have a lot of tests - # compiling we will begin submitting only after all the tests are - # processed. On the other hand I am not sure where to schedule - # runs here. - self._reschedule_all(phase='run') except TaskExit: - if not task.failed and not task.skipped: - with contextlib.suppress(TaskExit): - self._reschedule_compile(task) + self._current_tasks.remove(task) + return 1 - return - except ABORT_REASONS as e: - # If abort was caused due to failure elsewhere, abort current - # task as well - task.abort(e) - self._failall(e) - raise + def advance_ready_to_run(self, task): + if task.check.local: + if len(self._local_scheduler_tasks) <= self._rfm_max_jobs: + try: + task.run() + task.policy_stage = 'running' + self._local_scheduler_tasks.add(task) + except TaskExit: + self._current_tasks.remove(task) - def _poll_tasks(self): - '''Update the counts of running checks per partition.''' - for part in self._partitions: - partname = part.fullname - num_tasks = len(self._running_tasks[partname][0]) - getlogger().debug2(f'Polling {num_tasks} running task(s) in ' - f'{partname!r}') - part_jobs = [t.check.job for t in self._running_tasks[partname][0]] - forced_local_jobs = [t.check.job for t in self._running_tasks[partname][1]] - part.scheduler.poll(*part_jobs) - self.local_scheduler.poll(*forced_local_jobs) - - # Trigger notifications for finished jobs. - # We need need a copy of the list here in order to not modify the - # list while looping over it. `run_complete` calls `on_task_exit`, - # which in turn will remove the task from `_running_tasks`. 
- for t in self._running_tasks[partname][0] + self._running_tasks[partname][1]: - t.run_complete() - - num_tasks = len(self._compiling_tasks[partname][0]) - getlogger().debug2(f'Polling {num_tasks} building task(s) in ' - f'{partname!r}') - part_jobs = [t.check.build_job for t in self._compiling_tasks[partname][0]] - forced_local_jobs = [t.check.build_job for t in self._compiling_tasks[partname][1]] - part.scheduler.poll(*part_jobs) - self.local_scheduler.poll(*forced_local_jobs) - - # Trigger notifications for finished compilation jobs - for t in self._compiling_tasks[partname][0] + self._compiling_tasks[partname][1]: - t.compile_complete() - - def _setup_all(self): - still_waiting = [] - for task in self._waiting_tasks: - if (not self._setup_task(task) and - not task.failed and not task.skipped): - still_waiting.append(task) - - self._waiting_tasks[:] = still_waiting - - def _finalize_all(self): - getlogger().debug2(f'Finalizing {len(self._completed_tasks)} task(s)') - while True: + return 1 + else: + return 0 + + partname = task.check.current_partition.fullname + if len(self._scheduler_tasks[partname]) <= self._max_jobs[partname]: try: - task = self._completed_tasks.pop() - except IndexError: - break + task.run() + task.policy_stage = 'running' + self._scheduler_tasks[partname].add(task) + except TaskExit: + self._current_tasks.remove(task) - getlogger().debug2(f'Finalizing task {task.testcase}') - with contextlib.suppress(TaskExit): - self._finalize_task(task) + return 1 - def _finalize_task(self, task): - getlogger().debug2(f'Finalizing task {task.testcase}') - if not self.skip_sanity_check: - task.sanity() + return 0 - if not self.skip_performance_check: - task.performance() + def advance_running(self, task): + try: + if task.run_complete(): + if task.check.local: + self._local_scheduler_tasks.remove(task) + else: + partname = task.check.current_partition.fullname + self._scheduler_tasks[partname].remove(task) - task.finalize() + task.policy_stage = 
'completed' + return 1 + else: + return 0 - def _failall(self, cause): - '''Mark all tests as failures''' - getlogger().debug2(f'Aborting all tasks due to {type(cause).__name__}') - for task in list(itertools.chain(*itertools.chain(*self._running_tasks.values()))): - task.abort(cause) + except TaskExit: + self._current_tasks.remove(task) + return 1 - self._running_tasks = {} - for task in list(itertools.chain(*itertools.chain(*self._compiling_tasks.values()))): - task.abort(cause) + def advance_completed(self, task): + try: + if not self.skip_sanity_check: + task.sanity() - self._compiling_tasks = {} - for task in list(itertools.chain(*itertools.chain(*self._ready_to_compile_tasks.values()))): - task.abort(cause) + if not self.skip_performance_check: + task.performance() - self._ready_to_compile_tasks = {} - for task in list(itertools.chain(*itertools.chain(*self._ready_to_run_tasks.values()))): - task.abort(cause) + task.finalize() + self._retired_tasks.append(task) + self._current_tasks.remove(task) + except TaskExit: + self._current_tasks.remove(task) + finally: + return 1 - self._ready_to_run_tasks = {} - for task in itertools.chain(self._waiting_tasks, - self._completed_tasks): - task.abort(cause) + def deps_failed(self, task): + # NOTE: Restored dependencies are not in the task_index + return any(self._task_index[c].failed + for c in task.testcase.deps if c in self._task_index) - def _reschedule_compile(self, task): - getlogger().debug2(f'Scheduling test case {task.testcase} for ' - f'compiling') - task.compile() + def deps_succeeded(self, task): + # NOTE: Restored dependencies are not in the task_index + return all(self._task_index[c].succeeded + for c in task.testcase.deps if c in self._task_index) - def _reschedule_run(self, task): - getlogger().debug2(f'Scheduling test case {task.testcase} for running') - task.run() + def deps_skipped(self, task): + # NOTE: Restored dependencies are not in the task_index + return any(self._task_index[c].skipped + for c in 
task.testcase.deps if c in self._task_index) - def _reschedule_all(self, phase='run'): - local_tasks = 0 - for (_, lt) in self._running_tasks.values(): - local_tasks += len(lt) + def _failall(self, cause): + '''Mark all tests as failures''' + getlogger().debug2(f'Aborting all tasks due to {type(cause).__name__}') + for task in self._current_tasks: + task.abort(cause) - local_slots = self._rfm_max_jobs - local_tasks - for part in self._partitions: - partname = part.fullname - part_tasks = ( - len(self._running_tasks[partname][0]) + - len(self._compiling_tasks[partname][0]) - ) - part_slots = self._max_jobs[partname] - part_tasks - num_rescheduled = 0 + # TODO all this prints have to obviously leave from here... + def on_task_setup(self, task): + print(task.check.name, 'setup') - for _ in range(part_slots): - try: - queue = getattr(self, f'_ready_to_{phase}_tasks') - task = queue[partname][0].pop() - except IndexError: - break + def on_task_run(self, task): + print(task.check.name, 'run') - getattr(self, f'_reschedule_{phase}')(task) - num_rescheduled += 1 + def on_task_compile(self, task): + print(task.check.name, 'compile') - for _ in range(local_slots): - try: - queue = getattr(self, f'_ready_to_{phase}_tasks') - task = queue[partname][1].pop() - except IndexError: - break + def on_task_exit(self, task): + print(task.check.name, 'run exit') - getattr(self, f'_reschedule_{phase}')(task) - local_slots -= 1 - num_rescheduled += 1 + def on_task_compile_exit(self, task): + print(task.check.name, 'compile exit') - if num_rescheduled: - getlogger().debug2( - f'Rescheduled {num_rescheduled} {phase} job(s) on ' - f'{partname!r}' - ) + def on_task_skip(self, task): + print(task.check.name, 'skip') - def exit(self): - self.printer.separator('short single line', - 'waiting for spawned checks to finish') - while (countall(self._running_tasks) or self._waiting_tasks or - self._completed_tasks or countall(self._ready_to_compile_tasks) or - countall(self._compiling_tasks) or 
countall(self._ready_to_run_tasks)): - getlogger().debug2(f'Running tasks: ' - f'{countall(self._running_tasks)}') - try: - self._poll_tasks() + def on_task_failure(self, task): + self._num_failed_tasks += 1 + timings = task.pipeline_timings(['compile_complete', + 'run_complete', + 'total']) + msg = f'{task.check.info()} [{timings}]' + if task.failed_stage == 'cleanup': + self.printer.status('ERROR', msg, just='right') + else: + self.printer.status('FAIL', msg, just='right') - # We count running tasks just after polling in order to check - # more reliably that the state has changed, so that we - # decrease the sleep time. Otherwise if the number of tasks - # rescheduled was the as the number of tasks retired, the - # sleep time would be increased. - num_running = countall(self._running_tasks) - self._finalize_all() - self._setup_all() - self._reschedule_all(phase='compile') - self._reschedule_all(phase='run') - _cleanup_all(self._retired_tasks, not self.keep_stage_files) - if num_running: - self._pollctl.running_tasks(num_running).snooze() + timings = task.pipeline_timings(['setup', + 'compile_complete', + 'run_complete', + 'sanity', + 'performance', + 'total']) + getlogger().info(f'==> test failed during {task.failed_stage!r}: ' + f'test staged in {task.check.stagedir!r}') + getlogger().verbose(f'==> {timings}') + if self._num_failed_tasks >= self.max_failures: + raise FailureLimitError( + f'maximum number of failures ({self.max_failures}) reached' + ) - except TaskExit: - with contextlib.suppress(TaskExit): - self._reschedule_all(phase='compile') - self._reschedule_all(phase='run') - except ABORT_REASONS as e: - self._failall(e) - raise + def on_task_success(self, task): + timings = task.pipeline_timings(['compile_complete', + 'run_complete', + 'total']) + msg = f'{task.check.info()} [{timings}]' + self.printer.status('OK', msg, just='right') + timings = task.pipeline_timings(['setup', + 'compile_complete', + 'run_complete', + 'sanity', + 'performance', + 
'total']) + getlogger().verbose(f'==> {timings}') - self.printer.separator('short single line', - 'all spawned checks have finished\n') + for c in task.testcase.deps: + # NOTE: Restored dependencies are not in the task_index + if c in self._task_index: + self._task_index[c].ref_count -= 1 From 35b1981d8e6cf1be4d3fdbd3243aad310b029690 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Thu, 25 Nov 2021 18:00:10 +0100 Subject: [PATCH 32/76] Add compile_wait/run_wait methods to run hooks --- reframe/frontend/executors/policies.py | 30 ++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index c5c9bddcc1..a7df496891 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -276,9 +276,9 @@ def runcase(self, case): task = RegressionTask(case, self.task_listeners) self._task_index[case] = task self.stats.add_task(task) - self.printer.status( - 'START', '%s on %s using %s' % - (check.name, partition.fullname, environ.name) + self.printer.info( + f'==> added {check.name} on {partition.fullname} ' + f'using {environ.name}' ) self._current_tasks.add(task) @@ -400,6 +400,7 @@ def advance_ready_to_compile(self, task): def advance_compiling(self, task): try: if task.compile_complete(): + task.compile_wait() if task.check.local or task.check.build_locally: self._local_scheduler_tasks.remove(task) else: @@ -416,6 +417,12 @@ def advance_compiling(self, task): return 0 except TaskExit: + if task.check.local or task.check.build_locally: + self._local_scheduler_tasks.remove(task) + else: + partname = task.check.current_partition.fullname + self._scheduler_tasks[partname].remove(task) + self._current_tasks.remove(task) return 1 @@ -449,6 +456,7 @@ def advance_ready_to_run(self, task): def advance_running(self, task): try: if task.run_complete(): + task.run_wait() if task.check.local: self._local_scheduler_tasks.remove(task) 
else: @@ -461,6 +469,12 @@ def advance_running(self, task): return 0 except TaskExit: + if task.check.local: + self._local_scheduler_tasks.remove(task) + else: + partname = task.check.current_partition.fullname + self._scheduler_tasks[partname].remove(task) + self._current_tasks.remove(task) return 1 @@ -503,21 +517,29 @@ def _failall(self, cause): # TODO all this prints have to obviously leave from here... def on_task_setup(self, task): - print(task.check.name, 'setup') + self.printer.status( + 'START', '%s on %s using %s' % + (task.check.name, task.check.current_partition.fullname, task.check.current_environ.name) + ) def on_task_run(self, task): + pass print(task.check.name, 'run') def on_task_compile(self, task): + pass print(task.check.name, 'compile') def on_task_exit(self, task): + pass print(task.check.name, 'run exit') def on_task_compile_exit(self, task): + pass print(task.check.name, 'compile exit') def on_task_skip(self, task): + pass print(task.check.name, 'skip') def on_task_failure(self, task): From 411a58b2e76e39965392c1d016a95f0d4e1d6703 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Thu, 25 Nov 2021 18:14:22 +0100 Subject: [PATCH 33/76] Fix weird attribute error bug The AttributeError appears somehow in some RunOnlyRegressionTest tests where access to a property raises an AttributeError in the pipeline. 
--- reframe/frontend/executors/policies.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index a7df496891..6aaa169780 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -354,10 +354,10 @@ def advance_wait(self, task): self._current_tasks.remove(task) return 1 else: - if isinstance(task.check, RunOnlyRegressionTest): - task.policy_stage = 'ready_to_run' - else: - task.policy_stage = 'ready_to_compile' + # if isinstance(task.check, RunOnlyRegressionTest): + # task.policy_stage = 'ready_to_run' + # else: + task.policy_stage = 'ready_to_compile' return 1 @@ -407,10 +407,10 @@ def advance_compiling(self, task): partname = task.check.current_partition.fullname self._scheduler_tasks[partname].remove(task) - if isinstance(task.check, CompileOnlyRegressionTest): - task.policy_stage = 'completed' - else: - task.policy_stage = 'ready_to_run' + # if isinstance(task.check, CompileOnlyRegressionTest): + # task.policy_stage = 'completed' + # else: + task.policy_stage = 'ready_to_run' return 1 else: From 3a52512ff4286a2c12223093f05deb076c1ddf3a Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 26 Nov 2021 10:51:20 +0100 Subject: [PATCH 34/76] Fix maxlimit bug --- reframe/frontend/executors/policies.py | 65 ++++++++++++++++++-------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 6aaa169780..aa46c0da51 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -353,13 +353,22 @@ def advance_wait(self, task): except TaskExit: self._current_tasks.remove(task) return 1 + + if isinstance(task.check, RunOnlyRegressionTest): + try: + task.compile() + task.compile_wait() + except TaskExit: + # Run and run_wait are no-ops for + # CompileOnlyRegressionTest. This shouldn't fail. 
+ self._current_tasks.remove(task) + return 1 + + task.policy_stage = 'ready_to_run' else: - # if isinstance(task.check, RunOnlyRegressionTest): - # task.policy_stage = 'ready_to_run' - # else: task.policy_stage = 'ready_to_compile' - return 1 + return 1 elif self.deps_failed(task): exc = TaskDependencyError('dependencies failed') @@ -407,10 +416,19 @@ def advance_compiling(self, task): partname = task.check.current_partition.fullname self._scheduler_tasks[partname].remove(task) - # if isinstance(task.check, CompileOnlyRegressionTest): - # task.policy_stage = 'completed' - # else: - task.policy_stage = 'ready_to_run' + if isinstance(task.check, CompileOnlyRegressionTest): + try: + task.run() + task.run_wait() + except TaskExit: + # Run and run_wait are no-ops for + # CompileOnlyRegressionTest. This shouldn't fail. + self._current_tasks.remove(task) + return 1 + + task.policy_stage = 'completed' + else: + task.policy_stage = 'ready_to_run' return 1 else: @@ -513,34 +531,41 @@ def _failall(self, cause): '''Mark all tests as failures''' getlogger().debug2(f'Aborting all tasks due to {type(cause).__name__}') for task in self._current_tasks: - task.abort(cause) + with contextlib.suppress(FailureLimitError): + task.abort(cause) # TODO all this prints have to obviously leave from here... 
def on_task_setup(self, task): - self.printer.status( - 'START', '%s on %s using %s' % - (task.check.name, task.check.current_partition.fullname, task.check.current_environ.name) - ) + # print(task.check.name, 'setup') + pass def on_task_run(self, task): - pass - print(task.check.name, 'run') + if isinstance(task.check, RunOnlyRegressionTest): + self.printer.status( + 'RUN', '%s on %s using %s' % + (task.check.name, task.check.current_partition.fullname, task.check.current_environ.name) + ) def on_task_compile(self, task): - pass - print(task.check.name, 'compile') + if isinstance(task.check, RunOnlyRegressionTest): + return + + self.printer.status( + 'BUILD', '%s on %s using %s' % + (task.check.name, task.check.current_partition.fullname, task.check.current_environ.name) + ) def on_task_exit(self, task): pass - print(task.check.name, 'run exit') + # print(task.check.name, 'run exit') def on_task_compile_exit(self, task): pass - print(task.check.name, 'compile exit') + # print(task.check.name, 'compile exit') def on_task_skip(self, task): pass - print(task.check.name, 'skip') + # print(task.check.name, 'skip') def on_task_failure(self, task): self._num_failed_tasks += 1 From 844a6c7e335dd66d938f7e1b30f614a0fa108122 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 26 Nov 2021 13:22:11 +0100 Subject: [PATCH 35/76] add printing of statistics --- reframe/frontend/executors/policies.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index aa46c0da51..4556be830a 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -282,10 +282,36 @@ def runcase(self, case): ) self._current_tasks.add(task) + def print_state_of_tasks(self, tasks): + stats = { + 'wait': [], + 'ready_to_compile': {}, + 'compiling': {}, + 'ready_to_run': {}, + 'running': {}, + 'completed': {} + } + print(f'Total tasks: {len(tasks)}') + for t in 
tasks: + if t.policy_stage == 'wait': + stats['wait'].append(t) + else: + stats[t.policy_stage].setdefault(t.check.current_partition.fullname, []) + stats[t.policy_stage][t.check.current_partition.fullname].append(t) + + print(f"Tasks in wait: {len(stats['wait'])}") + phases = ['ready_to_compile', 'compiling', 'ready_to_run', 'running', 'completed'] + for phase in phases: + print(f"Tasks in {phase}:") + for part in stats[phase]: + print(f" {part}: {len(stats[phase][part])}") + def exit(self): self.printer.separator('short single line', 'waiting for spawned checks to finish') while self._current_tasks: + # print() + # self.print_state_of_tasks(self._current_tasks) try: self._poll_tasks() num_running = sum( From ac971819c7c49593eb77029940f4044ac7eab93d Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 26 Nov 2021 14:11:01 +0100 Subject: [PATCH 36/76] Fix concurrency limits --- reframe/frontend/executors/policies.py | 8 ++++---- unittests/test_policies.py | 9 ++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 4556be830a..1ca1dcde31 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -407,7 +407,7 @@ def advance_wait(self, task): def advance_ready_to_compile(self, task): if task.check.local or task.check.build_locally: - if len(self._local_scheduler_tasks) <= self._rfm_max_jobs: + if len(self._local_scheduler_tasks) < self._rfm_max_jobs: try: task.compile() task.policy_stage = 'compiling' @@ -420,7 +420,7 @@ def advance_ready_to_compile(self, task): return 0 partname = task.check.current_partition.fullname - if len(self._scheduler_tasks[partname]) <= self._max_jobs[partname]: + if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: try: task.compile() task.policy_stage = 'compiling' @@ -472,7 +472,7 @@ def advance_compiling(self, task): def advance_ready_to_run(self, task): if 
task.check.local: - if len(self._local_scheduler_tasks) <= self._rfm_max_jobs: + if len(self._local_scheduler_tasks) < self._rfm_max_jobs: try: task.run() task.policy_stage = 'running' @@ -485,7 +485,7 @@ def advance_ready_to_run(self, task): return 0 partname = task.check.current_partition.fullname - if len(self._scheduler_tasks[partname]) <= self._max_jobs[partname]: + if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: try: task.run() task.policy_stage = 'running' diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 3f34a6e3b6..6f1d352f87 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -597,9 +597,10 @@ def _read_timestamps(tasks): def test_concurrency_unlimited(async_runner, make_cases, make_exec_ctx): num_checks = 3 - make_exec_ctx(options=max_jobs_opts(num_checks)) + # make_exec_ctx(options=max_jobs_opts(num_checks)) runner, monitor = async_runner + runner.policy._rfm_max_jobs = num_checks runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) # Ensure that all tests were run and without failures. @@ -624,9 +625,10 @@ def test_concurrency_unlimited(async_runner, make_cases, make_exec_ctx): def test_concurrency_limited(async_runner, make_cases, make_exec_ctx): # The number of checks must be <= 2*max_jobs. num_checks, max_jobs = 5, 3 - make_exec_ctx(options=max_jobs_opts(max_jobs)) + # make_exec_ctx(options=max_jobs_opts(max_jobs)) runner, monitor = async_runner + runner.policy._rfm_max_jobs = 3 runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) # Ensure that all tests were run and without failures. 
@@ -665,9 +667,10 @@ def test_concurrency_limited(async_runner, make_cases, make_exec_ctx): def test_concurrency_none(async_runner, make_cases, make_exec_ctx): num_checks = 3 - make_exec_ctx(options=max_jobs_opts(1)) + # make_exec_ctx(options=max_jobs_opts(1)) runner, monitor = async_runner + runner.policy._rfm_max_jobs = 1 runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) # Ensure that all tests were run and without failures. From 8a9f5e14b5771d47b6582358429f7422d9c2dd8e Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 26 Nov 2021 15:12:39 +0100 Subject: [PATCH 37/76] Remove unused imports --- reframe/frontend/executors/policies.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 1ca1dcde31..ae782aa666 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -4,8 +4,6 @@ # SPDX-License-Identifier: BSD-3-Clause import contextlib -import functools -import itertools import math import sys import time From 7da9b3cb8512c7ea10f5faa1757a6a9d6ec6714b Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Mon, 13 Dec 2021 17:48:28 +0100 Subject: [PATCH 38/76] Address PR comments --- reframe/frontend/executors/__init__.py | 18 +--- reframe/frontend/executors/policies.py | 131 ++++++++----------------- unittests/test_policies.py | 12 ++- 3 files changed, 56 insertions(+), 105 deletions(-) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index a9f7a1571f..399b692b4b 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -498,26 +498,16 @@ def print_separator(check, prefix): '%s %s (%s)' % (prefix, check.name, check.descr) ) + self._printer.separator('short single line', + 'start processing checks') self._policy.enter() self._printer.reset_progress(len(testcases)) - last_check = None for t in testcases: - if last_check is None or 
last_check.name != t.check.name: - if last_check is not None: - print_separator(last_check, 'finished processing') - self._printer.info('') - - print_separator(t.check, 'started processing') - last_check = t.check - self._policy.runcase(t) - # Close the last visual box - if last_check is not None: - print_separator(last_check, 'finished processing') - self._printer.info('') - self._policy.exit() + self._printer.separator('short single line', + 'all spawned checks have finished\n') class ExecutionPolicy(abc.ABC): diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index ae782aa666..4133c2bd75 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -246,20 +246,18 @@ def __init__(self): # Keep a reference to all the partitions self._partitions = set() - # A set of the jobs that should be polled by this scheduler - self._local_scheduler_tasks = set() - # Sets of the jobs that should be polled for each partition - self._scheduler_tasks = {} + self._scheduler_tasks = { + '_rfm_local' : set() + } - # + # Retired tasks that need to be cleaned up self._retired_tasks = [] # Job limit per partition - self._max_jobs = {} - - # Max jobs spawned by the reframe thread - self._rfm_max_jobs = rt.runtime().get_option(f'systems/0/rfm_max_jobs') + self._max_jobs = { + '_rfm_local' : rt.runtime().get_option(f'systems/0/rfm_max_jobs') + } self.task_listeners.append(self) @@ -271,15 +269,17 @@ def runcase(self, case): # Set partition-based counters, if not set already self._scheduler_tasks.setdefault(partition.fullname, set()) self._max_jobs.setdefault(partition.fullname, partition.max_jobs) + task = RegressionTask(case, self.task_listeners) self._task_index[case] = task self.stats.add_task(task) - self.printer.info( + getlogger().debug2( f'==> added {check.name} on {partition.fullname} ' f'using {environ.name}' ) self._current_tasks.add(task) + # TODO: This is only for testing purposes here and should 
be deleted def print_state_of_tasks(self, tasks): stats = { 'wait': [], @@ -305,16 +305,14 @@ def print_state_of_tasks(self, tasks): print(f" {part}: {len(stats[phase][part])}") def exit(self): - self.printer.separator('short single line', - 'waiting for spawned checks to finish') while self._current_tasks: # print() # self.print_state_of_tasks(self._current_tasks) try: self._poll_tasks() num_running = sum( - 1 if t.policy_stage in ['running', 'compiling'] - else 0 for t in self._current_tasks + 1 if t.policy_stage in ('running', 'compiling') else 0 + for t in self._current_tasks ) self.advance_all(self._current_tasks) _cleanup_all(self._retired_tasks, not self.keep_stage_files) @@ -324,9 +322,6 @@ def exit(self): self._failall(e) raise - self.printer.separator('short single line', - 'all spawned checks have finished\n') - def _poll_tasks(self): for part in self._partitions: jobs = [] @@ -339,7 +334,7 @@ def _poll_tasks(self): part.scheduler.poll(*jobs) jobs = [] - for t in self._local_scheduler_tasks: + for t in self._scheduler_tasks['_rfm_local']: if t.policy_stage == 'compiling': jobs.append(t.check.build_job) elif t.policy_stage == 'running': @@ -349,14 +344,14 @@ def _poll_tasks(self): def advance_all(self, tasks, timeout=None): t_init = time.time() - num_prog = 0 + num_progressed = 0 # progress might remove the tasks that retire or fail for t in list(tasks): - method = getattr(self, f'advance_{t.policy_stage}') - num_prog += method(t) + bump_state = getattr(self, f'advance_{t.policy_stage}') + num_progressed += bump_state(t) t_elapsed = time.time() - t_init - if timeout and t_elapsed > timeout and num_prog: + if timeout and t_elapsed > timeout and num_progressed: break def advance_wait(self, task): @@ -370,6 +365,10 @@ def advance_wait(self, task): elif self.deps_succeeded(task): try: + self.printer.status( + 'RUN', '%s on %s using %s' % + (task.check.name, task.testcase.partition.fullname, task.testcase.environ.name) + ) task.setup(task.testcase.partition, 
task.testcase.environ, sched_flex_alloc_nodes=self.sched_flex_alloc_nodes, @@ -404,20 +403,10 @@ def advance_wait(self, task): return 0 def advance_ready_to_compile(self, task): - if task.check.local or task.check.build_locally: - if len(self._local_scheduler_tasks) < self._rfm_max_jobs: - try: - task.compile() - task.policy_stage = 'compiling' - self._local_scheduler_tasks.add(task) - except TaskExit: - self._current_tasks.remove(task) - - return 1 - else: - return 0 - - partname = task.check.current_partition.fullname + partname = ( + '_rfm_local' if task.check.local or task.check.build_locally + else task.check.current_partition.fullname + ) if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: try: task.compile() @@ -431,14 +420,14 @@ def advance_ready_to_compile(self, task): return 0 def advance_compiling(self, task): + partname = ( + '_rfm_local' if task.check.local or task.check.build_locally + else task.check.current_partition.fullname + ) try: if task.compile_complete(): task.compile_wait() - if task.check.local or task.check.build_locally: - self._local_scheduler_tasks.remove(task) - else: - partname = task.check.current_partition.fullname - self._scheduler_tasks[partname].remove(task) + self._scheduler_tasks[partname].remove(task) if isinstance(task.check, CompileOnlyRegressionTest): try: @@ -459,30 +448,15 @@ def advance_compiling(self, task): return 0 except TaskExit: - if task.check.local or task.check.build_locally: - self._local_scheduler_tasks.remove(task) - else: - partname = task.check.current_partition.fullname - self._scheduler_tasks[partname].remove(task) - + self._scheduler_tasks[partname].remove(task) self._current_tasks.remove(task) return 1 def advance_ready_to_run(self, task): - if task.check.local: - if len(self._local_scheduler_tasks) < self._rfm_max_jobs: - try: - task.run() - task.policy_stage = 'running' - self._local_scheduler_tasks.add(task) - except TaskExit: - self._current_tasks.remove(task) - - return 1 - else: - 
return 0 - - partname = task.check.current_partition.fullname + partname = ( + '_rfm_local' if task.check.local + else task.check.current_partition.fullname + ) if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: try: task.run() @@ -496,14 +470,14 @@ def advance_ready_to_run(self, task): return 0 def advance_running(self, task): + partname = ( + '_rfm_local' if task.check.local + else task.check.current_partition.fullname + ) try: if task.run_complete(): task.run_wait() - if task.check.local: - self._local_scheduler_tasks.remove(task) - else: - partname = task.check.current_partition.fullname - self._scheduler_tasks[partname].remove(task) + self._scheduler_tasks[partname].remove(task) task.policy_stage = 'completed' return 1 @@ -511,12 +485,7 @@ def advance_running(self, task): return 0 except TaskExit: - if task.check.local: - self._local_scheduler_tasks.remove(task) - else: - partname = task.check.current_partition.fullname - self._scheduler_tasks[partname].remove(task) - + self._scheduler_tasks[partname].remove(task) self._current_tasks.remove(task) return 1 @@ -560,36 +529,22 @@ def _failall(self, cause): # TODO all this prints have to obviously leave from here... 
def on_task_setup(self, task): - # print(task.check.name, 'setup') pass def on_task_run(self, task): - if isinstance(task.check, RunOnlyRegressionTest): - self.printer.status( - 'RUN', '%s on %s using %s' % - (task.check.name, task.check.current_partition.fullname, task.check.current_environ.name) - ) + pass def on_task_compile(self, task): - if isinstance(task.check, RunOnlyRegressionTest): - return - - self.printer.status( - 'BUILD', '%s on %s using %s' % - (task.check.name, task.check.current_partition.fullname, task.check.current_environ.name) - ) + pass def on_task_exit(self, task): pass - # print(task.check.name, 'run exit') def on_task_compile_exit(self, task): pass - # print(task.check.name, 'compile exit') def on_task_skip(self, task): pass - # print(task.check.name, 'skip') def on_task_failure(self, task): self._num_failed_tasks += 1 diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 6f1d352f87..dce2a9a638 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -600,7 +600,9 @@ def test_concurrency_unlimited(async_runner, make_cases, make_exec_ctx): # make_exec_ctx(options=max_jobs_opts(num_checks)) runner, monitor = async_runner - runner.policy._rfm_max_jobs = num_checks + runner.policy._max_jobs = { + '_rfm_local' : num_checks + } runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) # Ensure that all tests were run and without failures. @@ -628,7 +630,9 @@ def test_concurrency_limited(async_runner, make_cases, make_exec_ctx): # make_exec_ctx(options=max_jobs_opts(max_jobs)) runner, monitor = async_runner - runner.policy._rfm_max_jobs = 3 + runner.policy._max_jobs = { + '_rfm_local' : 3 + } runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) # Ensure that all tests were run and without failures. 
@@ -670,7 +674,9 @@ def test_concurrency_none(async_runner, make_cases, make_exec_ctx): # make_exec_ctx(options=max_jobs_opts(1)) runner, monitor = async_runner - runner.policy._rfm_max_jobs = 1 + runner.policy._max_jobs = { + '_rfm_local' : 1 + } runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) # Ensure that all tests were run and without failures. From 0ec8e75ff758476f744a3bde7faa648233b9ade8 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Mon, 13 Dec 2021 17:53:35 +0100 Subject: [PATCH 39/76] Remove empty lines --- reframe/frontend/executors/policies.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 4133c2bd75..f3ee366ddb 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -362,7 +362,6 @@ def advance_wait(self, task): task.skip() self._current_tasks.remove(task) return 1 - elif self.deps_succeeded(task): try: self.printer.status( @@ -392,7 +391,6 @@ def advance_wait(self, task): task.policy_stage = 'ready_to_compile' return 1 - elif self.deps_failed(task): exc = TaskDependencyError('dependencies failed') task.fail((type(exc), exc, None)) From bce15bdd7e6d68fb12b9420fda913f3572a10306 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Thu, 16 Dec 2021 14:14:59 +0100 Subject: [PATCH 40/76] Update policies unittests --- unittests/test_policies.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/unittests/test_policies.py b/unittests/test_policies.py index dce2a9a638..2fa736b767 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -595,9 +595,8 @@ def _read_timestamps(tasks): return begin_stamps, end_stamps -def test_concurrency_unlimited(async_runner, make_cases, make_exec_ctx): +def test_concurrency_unlimited(async_runner, make_cases): num_checks = 3 - # make_exec_ctx(options=max_jobs_opts(num_checks)) runner, monitor = async_runner 
runner.policy._max_jobs = { @@ -624,10 +623,9 @@ def test_concurrency_unlimited(async_runner, make_cases, make_exec_ctx): pytest.skip('the system seems too much loaded.') -def test_concurrency_limited(async_runner, make_cases, make_exec_ctx): +def test_concurrency_limited(async_runner, make_cases): # The number of checks must be <= 2*max_jobs. num_checks, max_jobs = 5, 3 - # make_exec_ctx(options=max_jobs_opts(max_jobs)) runner, monitor = async_runner runner.policy._max_jobs = { @@ -669,9 +667,8 @@ def test_concurrency_limited(async_runner, make_cases, make_exec_ctx): pytest.skip('the system seems too loaded.') -def test_concurrency_none(async_runner, make_cases, make_exec_ctx): +def test_concurrency_none(async_runner, make_cases): num_checks = 3 - # make_exec_ctx(options=max_jobs_opts(1)) runner, monitor = async_runner runner.policy._max_jobs = { From 6902740b605135eeeaafd305b057b8b64588c30e Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 17 Dec 2021 10:25:39 +0100 Subject: [PATCH 41/76] Change documentation output --- docs/tutorial_basics.rst | 55 +++------------- docs/tutorial_deps.rst | 117 ++++++++++---------------------- docs/tutorial_fixtures.rst | 121 ++++++++++------------------------ docs/tutorial_tips_tricks.rst | 8 +-- 4 files changed, 80 insertions(+), 221 deletions(-) diff --git a/docs/tutorial_basics.rst b/docs/tutorial_basics.rst index b77b5e4200..08f8f536f4 100644 --- a/docs/tutorial_basics.rst +++ b/docs/tutorial_basics.rst @@ -113,11 +113,8 @@ Now it's time to run our first test: [==========] Running 1 check(s) [==========] Started on Mon Oct 12 18:23:30 2020 - [----------] started processing HelloTest (HelloTest) + [----------] start processing checks [ RUN ] HelloTest on generic:default using builtin - [----------] finished processing HelloTest (HelloTest) - - [----------] waiting for spawned checks to finish [ OK ] (1/1) HelloTest on generic:default using builtin [compile: 0.389s run: 0.406s total: 0.811s] [----------] all spawned 
checks have finished @@ -283,17 +280,11 @@ Let's run the test now: [==========] Running 2 check(s) [==========] Started on Tue Mar 9 23:25:22 2021 - [----------] started processing HelloMultiLangTest_c (HelloMultiLangTest_c) + [----------] start processing checks [ RUN ] HelloMultiLangTest_c on generic:default using builtin - [----------] finished processing HelloMultiLangTest_c (HelloMultiLangTest_c) - - [----------] started processing HelloMultiLangTest_cpp (HelloMultiLangTest_cpp) [ RUN ] HelloMultiLangTest_cpp on generic:default using builtin [ FAIL ] (1/2) HelloMultiLangTest_cpp on generic:default using builtin [compile: 0.006s run: n/a total: 0.023s] ==> test failed during 'compile': test staged in '/Users/user/Repositories/reframe/stage/generic/default/builtin/HelloMultiLangTest_cpp' - [----------] finished processing HelloMultiLangTest_cpp (HelloMultiLangTest_cpp) - - [----------] waiting for spawned checks to finish [ OK ] (2/2) HelloMultiLangTest_c on generic:default using builtin [compile: 0.981s run: 0.468s total: 1.475s] [----------] all spawned checks have finished @@ -397,17 +388,11 @@ Let's now rerun our "Hello, World!" 
tests: [==========] Running 2 check(s) [==========] Started on Tue Mar 9 23:28:00 2021 - [----------] started processing HelloMultiLangTest_c (HelloMultiLangTest_c) + [----------] start processing checks [ RUN ] HelloMultiLangTest_c on catalina:default using gnu [ RUN ] HelloMultiLangTest_c on catalina:default using clang - [----------] finished processing HelloMultiLangTest_c (HelloMultiLangTest_c) - - [----------] started processing HelloMultiLangTest_cpp (HelloMultiLangTest_cpp) [ RUN ] HelloMultiLangTest_cpp on catalina:default using gnu [ RUN ] HelloMultiLangTest_cpp on catalina:default using clang - [----------] finished processing HelloMultiLangTest_cpp (HelloMultiLangTest_cpp) - - [----------] waiting for spawned checks to finish [ OK ] (1/4) HelloMultiLangTest_cpp on catalina:default using gnu [compile: 0.768s run: 1.115s total: 1.909s] [ OK ] (2/4) HelloMultiLangTest_c on catalina:default using gnu [compile: 0.600s run: 2.230s total: 2.857s] [ OK ] (3/4) HelloMultiLangTest_c on catalina:default using clang [compile: 0.238s run: 2.129s total: 2.393s] @@ -499,12 +484,9 @@ Let's run the test now: [==========] Running 1 check(s) [==========] Started on Mon Oct 12 20:02:37 2020 - [----------] started processing HelloThreadedTest (HelloThreadedTest) + [----------] start processing checks [ RUN ] HelloThreadedTest on catalina:default using gnu [ RUN ] HelloThreadedTest on catalina:default using clang - [----------] finished processing HelloThreadedTest (HelloThreadedTest) - - [----------] waiting for spawned checks to finish [ OK ] (1/2) HelloThreadedTest on catalina:default using gnu [compile: 1.591s run: 1.205s total: 2.816s] [ OK ] (2/2) HelloThreadedTest on catalina:default using clang [compile: 1.141s run: 0.309s total: 1.465s] [----------] all spawned checks have finished @@ -592,12 +574,9 @@ Let's run this version of the test now and see if it fails: [==========] Running 1 check(s) [==========] Started on Mon Oct 12 20:04:59 2020 - [----------] started 
processing HelloThreadedExtendedTest (HelloThreadedExtendedTest) + [----------] start processing checks [ RUN ] HelloThreadedExtendedTest on catalina:default using gnu [ RUN ] HelloThreadedExtendedTest on catalina:default using clang - [----------] finished processing HelloThreadedExtendedTest (HelloThreadedExtendedTest) - - [----------] waiting for spawned checks to finish [ FAIL ] (1/2) HelloThreadedExtendedTest on catalina:default using gnu [compile: 1.222s run: 0.891s total: 2.130s] [ FAIL ] (2/2) HelloThreadedExtendedTest on catalina:default using clang [compile: 0.835s run: 0.167s total: 1.018s] [----------] all spawned checks have finished @@ -718,11 +697,8 @@ The :option:`--performance-report` will generate a short report at the end for e [==========] Running 1 check(s) [==========] Started on Mon Oct 12 20:06:09 2020 - [----------] started processing StreamTest (StreamTest) + [----------] start processing checks [ RUN ] StreamTest on catalina:default using gnu - [----------] finished processing StreamTest (StreamTest) - - [----------] waiting for spawned checks to finish [ OK ] (1/1) StreamTest on catalina:default using gnu [compile: 1.386s run: 2.377s total: 3.780s] [----------] all spawned checks have finished @@ -967,7 +943,7 @@ We will only do so with the final versions of the tests from the previous sectio [==========] Running 4 check(s) [==========] Started on Mon Jan 25 00:34:32 2021 - [----------] started processing HelloMultiLangTest_c (HelloMultiLangTest_c) + [----------] start processing checks [ RUN ] HelloMultiLangTest_c on daint:login using builtin [ RUN ] HelloMultiLangTest_c on daint:login using gnu [ RUN ] HelloMultiLangTest_c on daint:login using intel @@ -981,9 +957,6 @@ We will only do so with the final versions of the tests from the previous sectio [ RUN ] HelloMultiLangTest_c on daint:mc using intel [ RUN ] HelloMultiLangTest_c on daint:mc using pgi [ RUN ] HelloMultiLangTest_c on daint:mc using cray - [----------] finished processing 
HelloMultiLangTest_c (HelloMultiLangTest_c) - - [----------] started processing HelloMultiLangTest_cpp (HelloMultiLangTest_cpp) [ RUN ] HelloMultiLangTest_cpp on daint:login using builtin [ RUN ] HelloMultiLangTest_cpp on daint:login using gnu [ RUN ] HelloMultiLangTest_cpp on daint:login using intel @@ -997,9 +970,6 @@ We will only do so with the final versions of the tests from the previous sectio [ RUN ] HelloMultiLangTest_cpp on daint:mc using intel [ RUN ] HelloMultiLangTest_cpp on daint:mc using pgi [ RUN ] HelloMultiLangTest_cpp on daint:mc using cray - [----------] finished processing HelloMultiLangTest_cpp (HelloMultiLangTest_cpp) - - [----------] started processing HelloThreadedExtended2Test (HelloThreadedExtended2Test) [ RUN ] HelloThreadedExtended2Test on daint:login using builtin [ RUN ] HelloThreadedExtended2Test on daint:login using gnu [ RUN ] HelloThreadedExtended2Test on daint:login using intel @@ -1013,15 +983,9 @@ We will only do so with the final versions of the tests from the previous sectio [ RUN ] HelloThreadedExtended2Test on daint:mc using intel [ RUN ] HelloThreadedExtended2Test on daint:mc using pgi [ RUN ] HelloThreadedExtended2Test on daint:mc using cray - [----------] finished processing HelloThreadedExtended2Test (HelloThreadedExtended2Test) - - [----------] started processing StreamWithRefTest (StreamWithRefTest) [ RUN ] StreamWithRefTest on daint:login using gnu [ RUN ] StreamWithRefTest on daint:gpu using gnu [ RUN ] StreamWithRefTest on daint:mc using gnu - [----------] finished processing StreamWithRefTest (StreamWithRefTest) - - [----------] waiting for spawned checks to finish [ OK ] ( 1/42) HelloThreadedExtended2Test on daint:login using cray [compile: 0.959s run: 56.203s total: 57.189s] [ OK ] ( 2/42) HelloThreadedExtended2Test on daint:login using intel [compile: 2.096s run: 61.438s total: 64.062s] [ OK ] ( 3/42) HelloMultiLangTest_cpp on daint:login using cray [compile: 0.479s run: 98.909s total: 99.406s] @@ -1205,7 
+1169,7 @@ Let's run our adapted test now: [==========] Running 1 check(s) [==========] Started on Mon Oct 12 20:16:03 2020 - [----------] started processing StreamMultiSysTest (StreamMultiSysTest) + [----------] start processing checks [ RUN ] StreamMultiSysTest on daint:login using gnu [ RUN ] StreamMultiSysTest on daint:login using intel [ RUN ] StreamMultiSysTest on daint:login using pgi @@ -1218,9 +1182,6 @@ Let's run our adapted test now: [ RUN ] StreamMultiSysTest on daint:mc using intel [ RUN ] StreamMultiSysTest on daint:mc using pgi [ RUN ] StreamMultiSysTest on daint:mc using cray - [----------] finished processing StreamMultiSysTest (StreamMultiSysTest) - - [----------] waiting for spawned checks to finish [ OK ] ( 1/12) StreamMultiSysTest on daint:gpu using pgi [compile: 2.092s run: 11.201s total: 13.307s] [ OK ] ( 2/12) StreamMultiSysTest on daint:gpu using gnu [compile: 2.349s run: 17.140s total: 19.509s] [ OK ] ( 3/12) StreamMultiSysTest on daint:login using pgi [compile: 2.230s run: 20.946s total: 23.189s] diff --git a/docs/tutorial_deps.rst b/docs/tutorial_deps.rst index fea0450671..e8bc2a80bd 100644 --- a/docs/tutorial_deps.rst +++ b/docs/tutorial_deps.rst @@ -114,7 +114,7 @@ Here is the output when running the OSU tests with the asynchronous execution po .. 
code-block:: none [ReFrame Setup] - version: 3.6.0-dev.0+4de0fee1 + version: 3.10.0-dev.2 command: './bin/reframe -c tutorials/deps/osu_benchmarks.py -r' launched by: user@daint101 working directory: '/users/user/Devel/reframe' @@ -126,96 +126,51 @@ Here is the output when running the OSU tests with the asynchronous execution po [==========] Running 8 check(s) [==========] Started on Wed Mar 10 20:53:56 2021 - [----------] started processing OSUDownloadTest (OSU benchmarks download sources) + [----------] start processing checks [ RUN ] OSUDownloadTest on daint:login using builtin - [----------] finished processing OSUDownloadTest (OSU benchmarks download sources) - - [----------] started processing OSUBuildTest (OSU benchmarks build test) + [ OK ] ( 1/22) OSUDownloadTest on daint:login using builtin [compile: 0.035s run: 2.520s total: 2.716s] [ RUN ] OSUBuildTest on daint:gpu using gnu - [ DEP ] OSUBuildTest on daint:gpu using gnu [ RUN ] OSUBuildTest on daint:gpu using intel - [ DEP ] OSUBuildTest on daint:gpu using intel [ RUN ] OSUBuildTest on daint:gpu using pgi - [ DEP ] OSUBuildTest on daint:gpu using pgi - [----------] finished processing OSUBuildTest (OSU benchmarks build test) - - [----------] started processing OSULatencyTest (OSU latency test) + [ OK ] ( 2/22) OSUBuildTest on daint:gpu using gnu [compile: 156.713s run: 10.222s total: 170.501s] [ RUN ] OSULatencyTest on daint:gpu using gnu - [ DEP ] OSULatencyTest on daint:gpu using gnu - [ RUN ] OSULatencyTest on daint:gpu using intel - [ DEP ] OSULatencyTest on daint:gpu using intel - [ RUN ] OSULatencyTest on daint:gpu using pgi - [ DEP ] OSULatencyTest on daint:gpu using pgi - [----------] finished processing OSULatencyTest (OSU latency test) - - [----------] started processing OSUBandwidthTest (OSU bandwidth test) [ RUN ] OSUBandwidthTest on daint:gpu using gnu - [ DEP ] OSUBandwidthTest on daint:gpu using gnu - [ RUN ] OSUBandwidthTest on daint:gpu using intel - [ DEP ] OSUBandwidthTest on 
daint:gpu using intel - [ RUN ] OSUBandwidthTest on daint:gpu using pgi - [ DEP ] OSUBandwidthTest on daint:gpu using pgi - [----------] finished processing OSUBandwidthTest (OSU bandwidth test) - - [----------] started processing OSUAllreduceTest_2 (OSU Allreduce test) [ RUN ] OSUAllreduceTest_2 on daint:gpu using gnu - [ DEP ] OSUAllreduceTest_2 on daint:gpu using gnu - [ RUN ] OSUAllreduceTest_2 on daint:gpu using intel - [ DEP ] OSUAllreduceTest_2 on daint:gpu using intel - [ RUN ] OSUAllreduceTest_2 on daint:gpu using pgi - [ DEP ] OSUAllreduceTest_2 on daint:gpu using pgi - [----------] finished processing OSUAllreduceTest_2 (OSU Allreduce test) - - [----------] started processing OSUAllreduceTest_4 (OSU Allreduce test) [ RUN ] OSUAllreduceTest_4 on daint:gpu using gnu - [ DEP ] OSUAllreduceTest_4 on daint:gpu using gnu - [ RUN ] OSUAllreduceTest_4 on daint:gpu using intel - [ DEP ] OSUAllreduceTest_4 on daint:gpu using intel - [ RUN ] OSUAllreduceTest_4 on daint:gpu using pgi - [ DEP ] OSUAllreduceTest_4 on daint:gpu using pgi - [----------] finished processing OSUAllreduceTest_4 (OSU Allreduce test) - - [----------] started processing OSUAllreduceTest_8 (OSU Allreduce test) + [ RUN ] OSUAllreduceTest_16 on daint:gpu using gnu [ RUN ] OSUAllreduceTest_8 on daint:gpu using gnu - [ DEP ] OSUAllreduceTest_8 on daint:gpu using gnu - [ RUN ] OSUAllreduceTest_8 on daint:gpu using intel - [ DEP ] OSUAllreduceTest_8 on daint:gpu using intel + [ OK ] ( 3/22) OSUBuildTest on daint:gpu using pgi [compile: 168.692s run: 0.751s total: 171.227s] [ RUN ] OSUAllreduceTest_8 on daint:gpu using pgi - [ DEP ] OSUAllreduceTest_8 on daint:gpu using pgi - [----------] finished processing OSUAllreduceTest_8 (OSU Allreduce test) - - [----------] started processing OSUAllreduceTest_16 (OSU Allreduce test) - [ RUN ] OSUAllreduceTest_16 on daint:gpu using gnu - [ DEP ] OSUAllreduceTest_16 on daint:gpu using gnu - [ RUN ] OSUAllreduceTest_16 on daint:gpu using intel - [ DEP ] 
OSUAllreduceTest_16 on daint:gpu using intel + [ RUN ] OSULatencyTest on daint:gpu using pgi + [ RUN ] OSUBandwidthTest on daint:gpu using pgi + [ RUN ] OSUAllreduceTest_2 on daint:gpu using pgi + [ RUN ] OSUAllreduceTest_4 on daint:gpu using pgi [ RUN ] OSUAllreduceTest_16 on daint:gpu using pgi - [ DEP ] OSUAllreduceTest_16 on daint:gpu using pgi - [----------] finished processing OSUAllreduceTest_16 (OSU Allreduce test) - - [----------] waiting for spawned checks to finish - [ OK ] ( 1/22) OSUDownloadTest on daint:login using builtin [compile: 0.007s run: 2.033s total: 2.078s] - [ OK ] ( 2/22) OSUBuildTest on daint:gpu using gnu [compile: 20.531s run: 0.039s total: 83.089s] - [ OK ] ( 3/22) OSUBuildTest on daint:gpu using pgi [compile: 27.193s run: 55.871s total: 83.082s] - [ OK ] ( 4/22) OSUAllreduceTest_16 on daint:gpu using gnu [compile: 0.007s run: 30.713s total: 33.470s] - [ OK ] ( 5/22) OSUBuildTest on daint:gpu using intel [compile: 35.256s run: 54.218s total: 116.712s] - [ OK ] ( 6/22) OSULatencyTest on daint:gpu using pgi [compile: 0.011s run: 23.738s total: 51.190s] - [ OK ] ( 7/22) OSUAllreduceTest_2 on daint:gpu using gnu [compile: 0.008s run: 31.879s total: 51.187s] - [ OK ] ( 8/22) OSUAllreduceTest_4 on daint:gpu using gnu [compile: 0.006s run: 37.447s total: 51.194s] - [ OK ] ( 9/22) OSUAllreduceTest_8 on daint:gpu using gnu [compile: 0.007s run: 42.914s total: 51.202s] - [ OK ] (10/22) OSUAllreduceTest_16 on daint:gpu using pgi [compile: 0.006s run: 51.172s total: 51.197s] - [ OK ] (11/22) OSULatencyTest on daint:gpu using gnu [compile: 0.007s run: 21.500s total: 51.730s] - [ OK ] (12/22) OSUAllreduceTest_2 on daint:gpu using pgi [compile: 0.007s run: 35.083s total: 51.700s] - [ OK ] (13/22) OSUAllreduceTest_8 on daint:gpu using pgi [compile: 0.007s run: 46.187s total: 51.681s] - [ OK ] (14/22) OSUAllreduceTest_4 on daint:gpu using pgi [compile: 0.007s run: 41.060s total: 52.030s] - [ OK ] (15/22) OSUAllreduceTest_2 on daint:gpu using intel 
[compile: 0.008s run: 27.401s total: 35.900s] - [ OK ] (16/22) OSUBandwidthTest on daint:gpu using gnu [compile: 0.008s run: 82.553s total: 107.334s] - [ OK ] (17/22) OSUBandwidthTest on daint:gpu using pgi [compile: 0.009s run: 87.559s total: 109.613s] - [ OK ] (18/22) OSUAllreduceTest_16 on daint:gpu using intel [compile: 0.006s run: 99.899s total: 99.924s] - [ OK ] (19/22) OSUBandwidthTest on daint:gpu using intel [compile: 0.007s run: 116.771s total: 128.125s] - [ OK ] (20/22) OSULatencyTest on daint:gpu using intel [compile: 0.008s run: 114.236s total: 128.398s] - [ OK ] (21/22) OSUAllreduceTest_8 on daint:gpu using intel [compile: 0.008s run: 125.541s total: 128.387s] - [ OK ] (22/22) OSUAllreduceTest_4 on daint:gpu using intel [compile: 0.007s run: 123.079s total: 128.651s] + [ OK ] ( 4/22) OSULatencyTest on daint:gpu using gnu [compile: 0.031s run: 63.644s total: 64.558s] + [ OK ] ( 5/22) OSUAllreduceTest_2 on daint:gpu using gnu [compile: 0.016s run: 53.954s total: 64.619s] + [ OK ] ( 6/22) OSULatencyTest on daint:gpu using pgi [compile: 0.032s run: 28.134s total: 65.222s] + [ OK ] ( 7/22) OSUAllreduceTest_4 on daint:gpu using gnu [compile: 0.015s run: 49.682s total: 65.862s] + [ OK ] ( 8/22) OSUAllreduceTest_16 on daint:gpu using gnu [compile: 0.011s run: 44.188s total: 66.009s] + [ OK ] ( 9/22) OSUAllreduceTest_8 on daint:gpu using gnu [compile: 0.014s run: 38.366s total: 66.076s] + [ OK ] (10/22) OSUAllreduceTest_8 on daint:gpu using pgi [compile: 0.009s run: 34.306s total: 66.546s] + [ OK ] (11/22) OSUBuildTest on daint:gpu using intel [compile: 245.878s run: 0.555s total: 246.570s] + [ RUN ] OSUAllreduceTest_8 on daint:gpu using intel + [ RUN ] OSUAllreduceTest_4 on daint:gpu using intel + [ RUN ] OSULatencyTest on daint:gpu using intel + [ RUN ] OSUBandwidthTest on daint:gpu using intel + [ RUN ] OSUAllreduceTest_2 on daint:gpu using intel + [ RUN ] OSUAllreduceTest_16 on daint:gpu using intel + [ OK ] (12/22) OSUBandwidthTest on daint:gpu using gnu 
[compile: 0.017s run: 98.239s total: 104.363s] + [ OK ] (13/22) OSUAllreduceTest_2 on daint:gpu using pgi [compile: 0.014s run: 58.084s total: 93.705s] + [ OK ] (14/22) OSUAllreduceTest_4 on daint:gpu using pgi [compile: 0.023s run: 53.762s total: 82.721s] + [ OK ] (15/22) OSUAllreduceTest_16 on daint:gpu using pgi [compile: 0.052s run: 49.170s total: 82.695s] + [ OK ] (16/22) OSUBandwidthTest on daint:gpu using pgi [compile: 0.048s run: 89.141s total: 125.222s] + [ OK ] (17/22) OSUAllreduceTest_2 on daint:gpu using intel [compile: 0.024s run: 46.974s total: 65.742s] + [ OK ] (18/22) OSUAllreduceTest_8 on daint:gpu using intel [compile: 0.010s run: 70.032s total: 71.045s] + [ OK ] (19/22) OSUAllreduceTest_4 on daint:gpu using intel [compile: 0.045s run: 67.585s total: 72.897s] + [ OK ] (20/22) OSULatencyTest on daint:gpu using intel [compile: 0.013s run: 61.913s total: 73.029s] + [ OK ] (21/22) OSUAllreduceTest_16 on daint:gpu using intel [compile: 0.024s run: 59.141s total: 81.230s] + [ OK ] (22/22) OSUBandwidthTest on daint:gpu using intel [compile: 0.044s run: 121.324s total: 136.121s] [----------] all spawned checks have finished [ PASSED ] Ran 22/22 test case(s) from 8 check(s) (0 failure(s)) @@ -226,7 +181,7 @@ Here is the output when running the OSU tests with the asynchronous execution po Before starting running the tests, ReFrame topologically sorts them based on their dependencies and schedules them for running using the selected execution policy. With the serial execution policy, ReFrame simply executes the tests to completion as they "arrive," since the tests are already topologically sorted. In the asynchronous execution policy, tests are spawned and not waited for. -If a test's dependencies have not yet completed, it will not start its execution and a ``DEP`` message will be printed to denote this. +If a test's dependencies have not yet completed, it will not start its execution immediately. 
ReFrame's runtime takes care of properly cleaning up the resources of the tests respecting dependencies. Normally when an individual test finishes successfully, its stage directory is cleaned up. diff --git a/docs/tutorial_fixtures.rst b/docs/tutorial_fixtures.rst index e8410f461f..eb51b33b1e 100644 --- a/docs/tutorial_fixtures.rst +++ b/docs/tutorial_fixtures.rst @@ -171,102 +171,51 @@ The following listing shows the output of running the tutorial examples. [==========] Running 10 check(s) [==========] Started on Sun Oct 31 22:00:28 2021 - [----------] started processing fetch_osu_benchmarks~daint (Fetch OSU benchmarks) + [----------] start processing checks [ RUN ] fetch_osu_benchmarks~daint on daint:gpu using gnu - [----------] finished processing fetch_osu_benchmarks~daint (Fetch OSU benchmarks) - - [----------] started processing build_osu_benchmarks~daint:gpu+gnu (Build OSU benchmarks) - [ RUN ] build_osu_benchmarks~daint:gpu+gnu on daint:gpu using gnu - [ DEP ] build_osu_benchmarks~daint:gpu+gnu on daint:gpu using gnu - [----------] finished processing build_osu_benchmarks~daint:gpu+gnu (Build OSU benchmarks) - - [----------] started processing build_osu_benchmarks~daint:gpu+intel (Build OSU benchmarks) + [ OK ] ( 1/22) fetch_osu_benchmarks~daint on daint:gpu using gnu [compile: 0.007s run: 2.960s total: 2.988s] [ RUN ] build_osu_benchmarks~daint:gpu+intel on daint:gpu using intel - [ DEP ] build_osu_benchmarks~daint:gpu+intel on daint:gpu using intel - [----------] finished processing build_osu_benchmarks~daint:gpu+intel (Build OSU benchmarks) - - [----------] started processing build_osu_benchmarks~daint:gpu+pgi (Build OSU benchmarks) [ RUN ] build_osu_benchmarks~daint:gpu+pgi on daint:gpu using pgi - [ DEP ] build_osu_benchmarks~daint:gpu+pgi on daint:gpu using pgi - [----------] finished processing build_osu_benchmarks~daint:gpu+pgi (Build OSU benchmarks) - - [----------] started processing osu_allreduce_test_16 (OSU Allreduce test) + [ RUN ] 
build_osu_benchmarks~daint:gpu+gnu on daint:gpu using gnu + [ OK ] ( 2/22) build_osu_benchmarks~daint:gpu+gnu on daint:gpu using gnu [compile: 26.322s run: 2.609s total: 30.214s] [ RUN ] osu_allreduce_test_16 on daint:gpu using gnu - [ DEP ] osu_allreduce_test_16 on daint:gpu using gnu - [ RUN ] osu_allreduce_test_16 on daint:gpu using intel - [ DEP ] osu_allreduce_test_16 on daint:gpu using intel - [ RUN ] osu_allreduce_test_16 on daint:gpu using pgi - [ DEP ] osu_allreduce_test_16 on daint:gpu using pgi - [----------] finished processing osu_allreduce_test_16 (OSU Allreduce test) - - [----------] started processing osu_allreduce_test_8 (OSU Allreduce test) + [ RUN ] osu_bandwidth_test on daint:gpu using gnu + [ RUN ] osu_latency_test on daint:gpu using gnu + [ RUN ] osu_allreduce_test_2 on daint:gpu using gnu [ RUN ] osu_allreduce_test_8 on daint:gpu using gnu - [ DEP ] osu_allreduce_test_8 on daint:gpu using gnu - [ RUN ] osu_allreduce_test_8 on daint:gpu using intel - [ DEP ] osu_allreduce_test_8 on daint:gpu using intel - [ RUN ] osu_allreduce_test_8 on daint:gpu using pgi - [ DEP ] osu_allreduce_test_8 on daint:gpu using pgi - [----------] finished processing osu_allreduce_test_8 (OSU Allreduce test) - - [----------] started processing osu_allreduce_test_4 (OSU Allreduce test) [ RUN ] osu_allreduce_test_4 on daint:gpu using gnu - [ DEP ] osu_allreduce_test_4 on daint:gpu using gnu + [ OK ] ( 3/22) build_osu_benchmarks~daint:gpu+intel on daint:gpu using intel [compile: 53.068s run: 0.650s total: 53.773s] + [ RUN ] osu_allreduce_test_2 on daint:gpu using intel + [ RUN ] osu_latency_test on daint:gpu using intel [ RUN ] osu_allreduce_test_4 on daint:gpu using intel - [ DEP ] osu_allreduce_test_4 on daint:gpu using intel + [ RUN ] osu_allreduce_test_16 on daint:gpu using intel + [ RUN ] osu_allreduce_test_8 on daint:gpu using intel + [ OK ] ( 4/22) build_osu_benchmarks~daint:gpu+pgi on daint:gpu using pgi [compile: 52.482s run: 0.803s total: 53.981s] [ RUN ] 
osu_allreduce_test_4 on daint:gpu using pgi - [ DEP ] osu_allreduce_test_4 on daint:gpu using pgi - [----------] finished processing osu_allreduce_test_4 (OSU Allreduce test) - - [----------] started processing osu_allreduce_test_2 (OSU Allreduce test) - [ RUN ] osu_allreduce_test_2 on daint:gpu using gnu - [ DEP ] osu_allreduce_test_2 on daint:gpu using gnu - [ RUN ] osu_allreduce_test_2 on daint:gpu using intel - [ DEP ] osu_allreduce_test_2 on daint:gpu using intel - [ RUN ] osu_allreduce_test_2 on daint:gpu using pgi - [ DEP ] osu_allreduce_test_2 on daint:gpu using pgi - [----------] finished processing osu_allreduce_test_2 (OSU Allreduce test) - - [----------] started processing osu_bandwidth_test (OSU bandwidth test) - [ RUN ] osu_bandwidth_test on daint:gpu using gnu - [ DEP ] osu_bandwidth_test on daint:gpu using gnu [ RUN ] osu_bandwidth_test on daint:gpu using intel - [ DEP ] osu_bandwidth_test on daint:gpu using intel - [ RUN ] osu_bandwidth_test on daint:gpu using pgi - [ DEP ] osu_bandwidth_test on daint:gpu using pgi - [----------] finished processing osu_bandwidth_test (OSU bandwidth test) - - [----------] started processing osu_latency_test (OSU latency test) - [ RUN ] osu_latency_test on daint:gpu using gnu - [ DEP ] osu_latency_test on daint:gpu using gnu - [ RUN ] osu_latency_test on daint:gpu using intel - [ DEP ] osu_latency_test on daint:gpu using intel + [ OK ] ( 5/22) osu_allreduce_test_16 on daint:gpu using gnu [compile: 0.015s run: 23.535s total: 23.922s] [ RUN ] osu_latency_test on daint:gpu using pgi - [ DEP ] osu_latency_test on daint:gpu using pgi - [----------] finished processing osu_latency_test (OSU latency test) - - [----------] waiting for spawned checks to finish - [ OK ] ( 1/22) fetch_osu_benchmarks~daint on daint:gpu using gnu [compile: 0.009s run: 2.761s total: 2.802s] - [ OK ] ( 2/22) build_osu_benchmarks~daint:gpu+gnu on daint:gpu using gnu [compile: 25.758s run: 0.056s total: 104.626s] - [ OK ] ( 3/22) 
build_osu_benchmarks~daint:gpu+pgi on daint:gpu using pgi [compile: 33.936s run: 70.452s total: 104.473s] - [ OK ] ( 4/22) build_osu_benchmarks~daint:gpu+intel on daint:gpu using intel [compile: 44.565s run: 65.010s total: 143.664s] - [ OK ] ( 5/22) osu_allreduce_test_4 on daint:gpu using gnu [compile: 0.011s run: 78.717s total: 101.428s] - [ OK ] ( 6/22) osu_allreduce_test_2 on daint:gpu using pgi [compile: 0.014s run: 88.060s total: 101.409s] - [ OK ] ( 7/22) osu_latency_test on daint:gpu using pgi [compile: 0.009s run: 101.325s total: 101.375s] - [ OK ] ( 8/22) osu_allreduce_test_8 on daint:gpu using pgi [compile: 0.013s run: 76.031s total: 102.005s] - [ OK ] ( 9/22) osu_allreduce_test_2 on daint:gpu using gnu [compile: 0.011s run: 85.525s total: 101.974s] - [ OK ] (10/22) osu_allreduce_test_4 on daint:gpu using pgi [compile: 0.011s run: 82.847s total: 102.407s] - [ OK ] (11/22) osu_allreduce_test_8 on daint:gpu using gnu [compile: 0.010s run: 77.818s total: 106.993s] - [ OK ] (12/22) osu_latency_test on daint:gpu using gnu [compile: 0.012s run: 103.641s total: 106.858s] - [ OK ] (13/22) osu_bandwidth_test on daint:gpu using pgi [compile: 0.011s run: 157.129s total: 164.087s] - [ OK ] (14/22) osu_bandwidth_test on daint:gpu using gnu [compile: 0.010s run: 154.343s total: 164.540s] - [ OK ] (15/22) osu_allreduce_test_8 on daint:gpu using intel [compile: 0.010s run: 194.643s total: 207.980s] - [ OK ] (16/22) osu_allreduce_test_2 on daint:gpu using intel [compile: 0.013s run: 201.145s total: 207.983s] - [ OK ] (17/22) osu_allreduce_test_4 on daint:gpu using intel [compile: 0.016s run: 198.143s total: 208.335s] - [ OK ] (18/22) osu_latency_test on daint:gpu using intel [compile: 0.010s run: 208.271s total: 208.312s] - [ OK ] (19/22) osu_allreduce_test_16 on daint:gpu using pgi [compile: 0.013s run: 215.854s total: 248.101s] - [ OK ] (20/22) osu_allreduce_test_16 on daint:gpu using gnu [compile: 0.010s run: 213.190s total: 248.731s] - [ OK ] (21/22) 
osu_allreduce_test_16 on daint:gpu using intel [compile: 0.010s run: 194.339s total: 210.962s] - [ OK ] (22/22) osu_bandwidth_test on daint:gpu using intel [compile: 0.022s run: 267.171s total: 270.475s] + [ RUN ] osu_bandwidth_test on daint:gpu using pgi + [ RUN ] osu_allreduce_test_2 on daint:gpu using pgi + [ RUN ] osu_allreduce_test_16 on daint:gpu using pgi + [ RUN ] osu_allreduce_test_8 on daint:gpu using pgi + [ OK ] ( 6/22) osu_latency_test on daint:gpu using gnu [compile: 0.010s run: 47.016s total: 54.703s] + [ OK ] ( 7/22) osu_allreduce_test_2 on daint:gpu using intel [compile: 0.009s run: 41.732s total: 42.313s] + [ OK ] ( 8/22) osu_allreduce_test_2 on daint:gpu using gnu [compile: 0.012s run: 54.571s total: 65.684s] + [ OK ] ( 9/22) osu_allreduce_test_8 on daint:gpu using gnu [compile: 0.011s run: 51.414s total: 65.712s] + [ OK ] (10/22) osu_allreduce_test_4 on daint:gpu using gnu [compile: 0.010s run: 48.378s total: 65.741s] + [ OK ] (11/22) osu_latency_test on daint:gpu using intel [compile: 0.008s run: 39.131s total: 42.877s] + [ OK ] (12/22) osu_allreduce_test_4 on daint:gpu using intel [compile: 0.009s run: 35.861s total: 42.898s] + [ OK ] (13/22) osu_allreduce_test_16 on daint:gpu using intel [compile: 0.008s run: 32.300s total: 42.901s] + [ OK ] (14/22) osu_allreduce_test_8 on daint:gpu using intel [compile: 0.009s run: 29.237s total: 42.914s] + [ OK ] (15/22) osu_allreduce_test_4 on daint:gpu using pgi [compile: 0.009s run: 26.134s total: 42.904s] + [ OK ] (16/22) osu_latency_test on daint:gpu using pgi [compile: 0.009s run: 23.085s total: 47.232s] + [ OK ] (17/22) osu_allreduce_test_2 on daint:gpu using pgi [compile: 0.008s run: 17.401s total: 41.728s] + [ OK ] (18/22) osu_allreduce_test_16 on daint:gpu using pgi [compile: 0.008s run: 15.895s total: 36.613s] + [ OK ] (19/22) osu_allreduce_test_8 on daint:gpu using pgi [compile: 0.009s run: 13.485s total: 34.296s] + [ OK ] (20/22) osu_bandwidth_test on daint:gpu using gnu [compile: 0.011s run: 
80.564s total: 85.070s] + [ OK ] (21/22) osu_bandwidth_test on daint:gpu using intel [compile: 0.008s run: 76.772s total: 97.828s] + [ OK ] (22/22) osu_bandwidth_test on daint:gpu using pgi [compile: 0.009s run: 83.003s total: 110.656s] [----------] all spawned checks have finished [ PASSED ] Ran 22/22 test case(s) from 10 check(s) (0 failure(s), 0 skipped) diff --git a/docs/tutorial_tips_tricks.rst b/docs/tutorial_tips_tricks.rst index 304a87bf5e..34efaa0713 100644 --- a/docs/tutorial_tips_tricks.rst +++ b/docs/tutorial_tips_tricks.rst @@ -129,7 +129,6 @@ If we run the test, we can see that the correct standard output filename will be .. code-block:: none - [----------] waiting for spawned checks to finish rfm_HelloMultiLangTest_cpp_job.out [ OK ] (1/4) HelloMultiLangTest_cpp on catalina:default using gnu [compile: 0.677s run: 0.700s total: 1.394s] rfm_HelloMultiLangTest_c_job.out @@ -417,7 +416,6 @@ Let's run the whole test DAG: - [----------] waiting for spawned checks to finish [ OK ] ( 1/10) T0 on generic:default using builtin [compile: 0.014s run: 0.297s total: 0.337s] [ OK ] ( 2/10) T4 on generic:default using builtin [compile: 0.010s run: 0.171s total: 0.207s] [ OK ] ( 3/10) T5 on generic:default using builtin [compile: 0.010s run: 0.192s total: 0.225s] @@ -478,11 +476,8 @@ Notice how only the :class:`T6` test was rerun and none of its dependencies, sin [==========] Running 1 check(s) [==========] Started on Thu Jan 21 14:27:18 2021 - [----------] started processing T6 (T6) + [----------] start processing checks [ RUN ] T6 on generic:default using builtin - [----------] finished processing T6 (T6) - - [----------] waiting for spawned checks to finish [ OK ] (1/1) T6 on generic:default using builtin [compile: 0.012s run: 0.428s total: 0.464s] [----------] all spawned checks have finished @@ -498,7 +493,6 @@ If we tried to run :class:`T6` without restoring the session, we would have to r .. 
code-block:: none - [----------] waiting for spawned checks to finish [ OK ] (1/5) T0 on generic:default using builtin [compile: 0.012s run: 0.424s total: 0.464s] [ OK ] (2/5) T4 on generic:default using builtin [compile: 0.011s run: 0.348s total: 0.381s] [ OK ] (3/5) T5 on generic:default using builtin [compile: 0.007s run: 0.225s total: 0.248s] From 95820adee6c3cac7ea40db796e357d101ad385d9 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 17 Dec 2021 10:46:25 +0100 Subject: [PATCH 42/76] Fix formatting issues --- reframe/frontend/executors/__init__.py | 2 +- reframe/frontend/executors/policies.py | 40 +++++--------------------- unittests/test_policies.py | 6 ++-- 3 files changed, 11 insertions(+), 37 deletions(-) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 399b692b4b..94bb04a862 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -507,7 +507,7 @@ def print_separator(check, prefix): self._policy.exit() self._printer.separator('short single line', - 'all spawned checks have finished\n') + 'all spawned checks have finished\n') class ExecutionPolicy(abc.ABC): diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index f3ee366ddb..b857a7323c 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -96,8 +96,8 @@ def runcase(self, case): check, partition, environ = case self.printer.status( - 'RUN', '%s on %s using %s' % - (check.name, partition.fullname, environ.name) + 'RUN', + f'{check.name} on {partition.fullname} using {environ.name}' ) task = RegressionTask(case, self.task_listeners) self._task_index[case] = task @@ -248,7 +248,7 @@ def __init__(self): # Sets of the jobs that should be polled for each partition self._scheduler_tasks = { - '_rfm_local' : set() + '_rfm_local': set() } # Retired tasks that need to be cleaned up @@ -256,7 +256,7 @@ def __init__(self): # Job 
limit per partition self._max_jobs = { - '_rfm_local' : rt.runtime().get_option(f'systems/0/rfm_max_jobs') + '_rfm_local': rt.runtime().get_option(f'systems/0/rfm_max_jobs') } self.task_listeners.append(self) @@ -279,35 +279,8 @@ def runcase(self, case): ) self._current_tasks.add(task) - # TODO: This is only for testing purposes here and should be deleted - def print_state_of_tasks(self, tasks): - stats = { - 'wait': [], - 'ready_to_compile': {}, - 'compiling': {}, - 'ready_to_run': {}, - 'running': {}, - 'completed': {} - } - print(f'Total tasks: {len(tasks)}') - for t in tasks: - if t.policy_stage == 'wait': - stats['wait'].append(t) - else: - stats[t.policy_stage].setdefault(t.check.current_partition.fullname, []) - stats[t.policy_stage][t.check.current_partition.fullname].append(t) - - print(f"Tasks in wait: {len(stats['wait'])}") - phases = ['ready_to_compile', 'compiling', 'ready_to_run', 'running', 'completed'] - for phase in phases: - print(f"Tasks in {phase}:") - for part in stats[phase]: - print(f" {part}: {len(stats[phase][part])}") - def exit(self): while self._current_tasks: - # print() - # self.print_state_of_tasks(self._current_tasks) try: self._poll_tasks() num_running = sum( @@ -365,8 +338,9 @@ def advance_wait(self, task): elif self.deps_succeeded(task): try: self.printer.status( - 'RUN', '%s on %s using %s' % - (task.check.name, task.testcase.partition.fullname, task.testcase.environ.name) + 'RUN', f'{task.check.name} on ' + f'{task.testcase.partition.fullname} using ' + f'{task.testcase.environ.name}' ) task.setup(task.testcase.partition, task.testcase.environ, diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 2fa736b767..71b69edeb2 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -600,7 +600,7 @@ def test_concurrency_unlimited(async_runner, make_cases): runner, monitor = async_runner runner.policy._max_jobs = { - '_rfm_local' : num_checks + '_rfm_local': num_checks } 
runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) @@ -629,7 +629,7 @@ def test_concurrency_limited(async_runner, make_cases): runner, monitor = async_runner runner.policy._max_jobs = { - '_rfm_local' : 3 + '_rfm_local': 3 } runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) @@ -672,7 +672,7 @@ def test_concurrency_none(async_runner, make_cases): runner, monitor = async_runner runner.policy._max_jobs = { - '_rfm_local' : 1 + '_rfm_local': 1 } runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) From 8adc8678526a859ef7bc0375cea05f72e3574630 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 17 Dec 2021 13:26:32 +0100 Subject: [PATCH 43/76] Add pipeline explanation in code --- reframe/frontend/executors/policies.py | 48 +++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index b857a7323c..59e53d6a35 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -231,6 +231,49 @@ def exit(self): _cleanup_all(self._retired_tasks, not self.keep_stage_files) +######################### Stages of the pipeline ######################### +# +# Each test starts from the `wait` stage and in the last step of the +# policy there a loop where all tests are bumped to the next phase +# if possible. 
+# +# +--------------------[ wait ] +# | | +# | if all deps finished and +# | test is not RunOnly +# | | +# | ↓ +# | [ ready_to_compile ] +# | | +# | if there are available +# | slots +# if all deps finished and | +# test is RunOnly ↓ +# | [ compiling ]--------------+ +# | | | +# | if compilation has finished and | +# | test is not CompileOnly | +# | | | +# | ↓ | +# +--------------->[ ready_to_run ] | +# | | +# if there are available | +# slots | +# | if compilation has finished and +# ↓ test is CompileOnly +# [ running ] | +# | | +# if job has finished | +# tests can exit the | | +# pipeline at any point ↓ | +# if they fail [ completed ]<-------------+ +# : | +# : if sanity and performance +# | succeed +# | | +# ↓ ↓ +# ( failed ) ( retired ) + class AsynchronousExecutionPolicy(ExecutionPolicy, TaskEventListener): def __init__(self): super().__init__() @@ -274,7 +317,7 @@ def runcase(self, case): self._task_index[case] = task self.stats.add_task(task) getlogger().debug2( - f'==> added {check.name} on {partition.fullname} ' + f'Added {check.name} on {partition.fullname} ' f'using {environ.name}' ) self._current_tasks.add(task) @@ -319,6 +362,7 @@ def advance_all(self, tasks, timeout=None): t_init = time.time() num_progressed = 0 + getlogger().debug2(f"Current tests: {len(tasks)}") # progress might remove the tasks that retire or fail for t in list(tasks): bump_state = getattr(self, f'advance_{t.policy_stage}') @@ -327,6 +371,8 @@ def advance_all(self, tasks, timeout=None): if timeout and t_elapsed > timeout and num_progressed: break + getlogger().debug2(f"Bumped {num_progressed} test(s).") + def advance_wait(self, task): if self.deps_skipped(task): try: From 1b5a4560584c39be6a79c80cca6e80ba69bd051f Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 17 Dec 2021 18:02:04 +0100 Subject: [PATCH 44/76] Address comments --- reframe/frontend/executors/__init__.py | 17 +++++++-- reframe/frontend/executors/policies.py | 53 +++++++++++--------------- 
reframe/schemas/config.json | 4 +- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 94bb04a862..6bac640696 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -128,7 +128,6 @@ def __init__(self, case, listeners=[]): self._case = case self._failed_stage = None self._current_stage = 'startup' - self.policy_stage = 'wait' self._exc_info = (None, None, None) self._listeners = list(listeners) self._skipped = False @@ -216,6 +215,10 @@ def failed(self): return (self._failed_stage is not None and not self._aborted and not self._skipped) + @property + def current_stage(self): + return self._current_stage + @property def failed_stage(self): return self._failed_stage @@ -249,7 +252,11 @@ class update_timestamps: # we don't want to masquerade the self argument of our containing # function def __enter__(this): - if fn.__name__ != 'poll': + if ( + fn.__name__ != 'poll' and + fn.__name__ != 'run_complete' and + fn.__name__ != 'compile_complete' + ): stage = self._current_stage self._timestamps[f'{stage}_start'] = time.time() @@ -258,7 +265,11 @@ def __exit__(this, exc_type, exc_value, traceback): self._timestamps[f'{stage}_finish'] = time.time() self._timestamps['pipeline_end'] = time.time() - if fn.__name__ != 'poll': + if ( + fn.__name__ != 'poll' and + fn.__name__ != 'run_complete' and + fn.__name__ != 'compile_complete' + ): self._current_stage = fn.__name__ try: diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 59e53d6a35..a812cf7c9a 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -233,40 +233,40 @@ def exit(self): ######################### Stages of the pipeline ######################### # -# Each test starts from the `wait` stage and in the last step of the +# Each test starts from the `startup` stage and in the last step of 
the # policy there a loop where all tests are bumped to the next phase # if possible. # -# +--------------------[ wait ] +# +-------------------[ startup ] # | | # | if all deps finished and # | test is not RunOnly # | | # | ↓ -# | [ ready_to_compile ] +# | [ setup ] # | | # | if there are available # | slots # if all deps finished and | # test is RunOnly ↓ -# | [ compiling ]--------------+ +# | [ compile ]---------------+ # | | | # | if compilation has finished and | # | test is not CompileOnly | # | | | # | ↓ | -# +--------------->[ ready_to_run ] | +# +---------------->[ compile_wait ] | # | | # if there are available | # slots | # | if compilation has finished and # ↓ test is CompileOnly -# [ running ] | +# [ run ] | # | | # if job has finished | # tests can exit the | | # pipeline at any point ↓ | -# if they fail [ completed ]<-------------+ +# if they fail [ run_wait ]<--------------+ # : | # : if sanity and performance # | succeed @@ -299,7 +299,7 @@ def __init__(self): # Job limit per partition self._max_jobs = { - '_rfm_local': rt.runtime().get_option(f'systems/0/rfm_max_jobs') + '_rfm_local': rt.runtime().get_option(f'systems/0/max_local_jobs') } self.task_listeners.append(self) @@ -327,7 +327,7 @@ def exit(self): try: self._poll_tasks() num_running = sum( - 1 if t.policy_stage in ('running', 'compiling') else 0 + 1 if t._current_stage in ('run', 'compile') else 0 for t in self._current_tasks ) self.advance_all(self._current_tasks) @@ -342,18 +342,18 @@ def _poll_tasks(self): for part in self._partitions: jobs = [] for t in self._scheduler_tasks[part.fullname]: - if t.policy_stage == 'compiling': + if t._current_stage == 'compile': jobs.append(t.check.build_job) - elif t.policy_stage == 'running': + elif t._current_stage == 'run': jobs.append(t.check.job) part.scheduler.poll(*jobs) jobs = [] for t in self._scheduler_tasks['_rfm_local']: - if t.policy_stage == 'compiling': + if t._current_stage == 'compile': jobs.append(t.check.build_job) - elif 
t.policy_stage == 'running': + elif t._current_stage == 'run': jobs.append(t.check.job) self.local_scheduler.poll(*jobs) @@ -365,7 +365,7 @@ def advance_all(self, tasks, timeout=None): getlogger().debug2(f"Current tests: {len(tasks)}") # progress might remove the tasks that retire or fail for t in list(tasks): - bump_state = getattr(self, f'advance_{t.policy_stage}') + bump_state = getattr(self, f'advance_{t._current_stage}') num_progressed += bump_state(t) t_elapsed = time.time() - t_init if timeout and t_elapsed > timeout and num_progressed: @@ -373,7 +373,7 @@ def advance_all(self, tasks, timeout=None): getlogger().debug2(f"Bumped {num_progressed} test(s).") - def advance_wait(self, task): + def advance_startup(self, task): if self.deps_skipped(task): try: raise SkipTestError('skipped due to skipped dependencies') @@ -399,6 +399,7 @@ def advance_wait(self, task): if isinstance(task.check, RunOnlyRegressionTest): try: task.compile() + task.compile_complete() task.compile_wait() except TaskExit: # Run and run_wait are no-ops for @@ -406,10 +407,6 @@ def advance_wait(self, task): self._current_tasks.remove(task) return 1 - task.policy_stage = 'ready_to_run' - else: - task.policy_stage = 'ready_to_compile' - return 1 elif self.deps_failed(task): exc = TaskDependencyError('dependencies failed') @@ -420,7 +417,7 @@ def advance_wait(self, task): # Not all dependencies have finished yet return 0 - def advance_ready_to_compile(self, task): + def advance_setup(self, task): partname = ( '_rfm_local' if task.check.local or task.check.build_locally else task.check.current_partition.fullname @@ -428,7 +425,6 @@ def advance_ready_to_compile(self, task): if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: try: task.compile() - task.policy_stage = 'compiling' self._scheduler_tasks[partname].add(task) except TaskExit: self._current_tasks.remove(task) @@ -437,7 +433,7 @@ def advance_ready_to_compile(self, task): return 0 - def advance_compiling(self, task): + def 
advance_compile(self, task): partname = ( '_rfm_local' if task.check.local or task.check.build_locally else task.check.current_partition.fullname @@ -450,6 +446,7 @@ def advance_compiling(self, task): if isinstance(task.check, CompileOnlyRegressionTest): try: task.run() + task.run_complete() task.run_wait() except TaskExit: # Run and run_wait are no-ops for @@ -457,10 +454,6 @@ def advance_compiling(self, task): self._current_tasks.remove(task) return 1 - task.policy_stage = 'completed' - else: - task.policy_stage = 'ready_to_run' - return 1 else: return 0 @@ -470,7 +463,7 @@ def advance_compiling(self, task): self._current_tasks.remove(task) return 1 - def advance_ready_to_run(self, task): + def advance_compile_wait(self, task): partname = ( '_rfm_local' if task.check.local else task.check.current_partition.fullname @@ -478,7 +471,6 @@ def advance_ready_to_run(self, task): if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: try: task.run() - task.policy_stage = 'running' self._scheduler_tasks[partname].add(task) except TaskExit: self._current_tasks.remove(task) @@ -487,7 +479,7 @@ def advance_ready_to_run(self, task): return 0 - def advance_running(self, task): + def advance_run(self, task): partname = ( '_rfm_local' if task.check.local else task.check.current_partition.fullname @@ -497,7 +489,6 @@ def advance_running(self, task): task.run_wait() self._scheduler_tasks[partname].remove(task) - task.policy_stage = 'completed' return 1 else: return 0 @@ -507,7 +498,7 @@ def advance_running(self, task): self._current_tasks.remove(task) return 1 - def advance_completed(self, task): + def advance_run_wait(self, task): try: if not self.skip_sanity_check: task.sanity() diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 2c3507c7e9..f682ed8987 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -237,7 +237,7 @@ "stagedir": {"type": "string"}, "outputdir": {"type": "string"}, "resourcesdir": {"type": 
"string"}, - "rfm_max_jobs": {"type": "number"}, + "max_local_jobs": {"type": "number"}, "partitions": { "type": "array", "items": { @@ -568,6 +568,6 @@ "systems/partitions/processor": {}, "systems/partitions/devices": [], "systems/partitions/extras": {}, - "systems/rfm_max_jobs": 8 + "systems/max_local_jobs": 8 } } From 75a65ab7d469218be21aadb41a53592b6e7a5310 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Dec 2021 18:45:38 +0100 Subject: [PATCH 45/76] Address PR comments --- reframe/frontend/executors/policies.py | 81 +++++++++++++------------- reframe/schemas/config.json | 2 + 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index a812cf7c9a..48eabd0a38 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -301,6 +301,7 @@ def __init__(self): self._max_jobs = { '_rfm_local': rt.runtime().get_option(f'systems/0/max_local_jobs') } + self._policy_timeout = rt.runtime().get_option(f'systems/0/policy_timeout') self.task_listeners.append(self) @@ -330,7 +331,7 @@ def exit(self): 1 if t._current_stage in ('run', 'compile') else 0 for t in self._current_tasks ) - self.advance_all(self._current_tasks) + self.advance_all(self._current_tasks, self._policy_timeout) _cleanup_all(self._retired_tasks, not self.keep_stage_files) if num_running: self._pollctl.running_tasks(num_running).snooze() @@ -339,24 +340,33 @@ def exit(self): raise def _poll_tasks(self): - for part in self._partitions: + pairs = [(p.fullname, p.scheduler) for p in self._partitions] + pairs.append(('_rfm_local', self.local_scheduler)) + for partname, sched in pairs: jobs = [] - for t in self._scheduler_tasks[part.fullname]: + for t in self._scheduler_tasks[partname]: if t._current_stage == 'compile': jobs.append(t.check.build_job) elif t._current_stage == 'run': jobs.append(t.check.job) - part.scheduler.poll(*jobs) + sched.poll(*jobs) - jobs = [] - for 
t in self._scheduler_tasks['_rfm_local']: - if t._current_stage == 'compile': - jobs.append(t.check.build_job) - elif t._current_stage == 'run': - jobs.append(t.check.job) + def _execute_stage(self, task, methods): + try: + for m in methods: + m() + + return True + except TaskExit: + self._current_tasks.remove(task) + with contextlib.suppress(KeyError, AttributeError): + self._scheduler_tasks[task.check.current_partition.fullname].remove(task) - self.local_scheduler.poll(*jobs) + with contextlib.suppress(KeyError): + self._scheduler_tasks['_rfm_local'].remove(task) + + return False def advance_all(self, tasks, timeout=None): t_init = time.time() @@ -371,7 +381,7 @@ def advance_all(self, tasks, timeout=None): if timeout and t_elapsed > timeout and num_progressed: break - getlogger().debug2(f"Bumped {num_progressed} test(s).") + getlogger().debug2(f'Bumped {num_progressed} test(s)') def advance_startup(self, task): if self.deps_skipped(task): @@ -397,15 +407,11 @@ def advance_startup(self, task): return 1 if isinstance(task.check, RunOnlyRegressionTest): - try: - task.compile() - task.compile_complete() - task.compile_wait() - except TaskExit: - # Run and run_wait are no-ops for - # CompileOnlyRegressionTest. This shouldn't fail. 
- self._current_tasks.remove(task) - return 1 + # All tests should pass from all the pipeline stages, even if + # they are no-ops + self._execute_stage(task, [task.compile, + task.compile_complete, + task.compile_wait]) return 1 elif self.deps_failed(task): @@ -415,6 +421,7 @@ def advance_startup(self, task): return 1 else: # Not all dependencies have finished yet + getlogger().debug2(f'{task.check.info()} waiting for dependencies') return 0 def advance_setup(self, task): @@ -423,14 +430,12 @@ def advance_setup(self, task): else task.check.current_partition.fullname ) if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: - try: - task.compile() + if self._execute_stage(task, [task.compile]): self._scheduler_tasks[partname].add(task) - except TaskExit: - self._current_tasks.remove(task) return 1 + getlogger().debug2(f'Hit the max job limit of {partname}') return 0 def advance_compile(self, task): @@ -444,15 +449,9 @@ def advance_compile(self, task): self._scheduler_tasks[partname].remove(task) if isinstance(task.check, CompileOnlyRegressionTest): - try: - task.run() - task.run_complete() - task.run_wait() - except TaskExit: - # Run and run_wait are no-ops for - # CompileOnlyRegressionTest. This shouldn't fail. 
- self._current_tasks.remove(task) - return 1 + # All tests should pass from all the pipeline stages, + # even if they are no-ops + self._execute_stage(task, [task.run, task.run_complete, task.run_wait]) return 1 else: @@ -469,14 +468,12 @@ def advance_compile_wait(self, task): partname = ( '_rfm_local' if task.check.local else task.check.current_partition.fullname ) if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: - try: - task.run() + if self._execute_stage(task, [task.run]): self._scheduler_tasks[partname].add(task) - except TaskExit: - self._current_tasks.remove(task) return 1 + getlogger().debug2(f'Hit the max job limit of {partname}') return 0 def advance_run(self, task): partname = ( '_rfm_local' if task.check.local else task.check.current_partition.fullname ) try: if task.run_complete(): - task.run_wait() - self._scheduler_tasks[partname].remove(task) + if self._execute_stage(task, [task.run_wait]): + self._scheduler_tasks[partname].remove(task) return 1 else: @@ -536,7 +533,9 @@ def _failall(self, cause): with contextlib.suppress(FailureLimitError): task.abort(cause) - # TODO all this prints have to obviously leave from here... + # These functions can be useful for tracking statistics of the framework + # like number of tests that have finished setup etc, so we will keep them + # for now.
def on_task_setup(self, task): pass diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index f682ed8987..8be2c989e3 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -237,6 +237,7 @@ "stagedir": {"type": "string"}, "outputdir": {"type": "string"}, "resourcesdir": {"type": "string"}, + "policy_timeout": {"type": "number"}, "max_local_jobs": {"type": "number"}, "partitions": { "type": "array", @@ -568,6 +569,7 @@ "systems/partitions/processor": {}, "systems/partitions/devices": [], "systems/partitions/extras": {}, + "systems/policy_timeout": 10, "systems/max_local_jobs": 8 } } From 862b38a96c13a4dc9376bd826b26b3df554c3f8f Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Dec 2021 19:04:35 +0100 Subject: [PATCH 46/76] Remove countall --- reframe/frontend/executors/policies.py | 31 +++++++++----------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 48eabd0a38..91f2488085 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -20,15 +20,6 @@ TaskEventListener, ABORT_REASONS) -def countall(d): - res = 0 - for (q1, q2) in d.values(): - res += len(q1) - res += len(q2) - - return res - - def _cleanup_all(tasks, *args, **kwargs): for task in tasks: if task.ref_count == 0: @@ -242,36 +233,36 @@ def exit(self): # | if all deps finished and # | test is not RunOnly # | | -# | ↓ -# | [ setup ] +# | v +# | [ ready_compile ] # | | # | if there are available # | slots # if all deps finished and | -# test is RunOnly ↓ -# | [ compile ]---------------+ +# test is RunOnly v +# | [ compiling ]--------------+ # | | | # | if compilation has finished and | # | test is not CompileOnly | # | | | -# | ↓ | -# +---------------->[ compile_wait ] | +# | v | +# +----------------->[ ready_run ] | # | | # if there are available | # slots | # | if compilation has finished and -# ↓ test is 
CompileOnly -# [ run ] | +# v test is CompileOnly +# [ running ] | # | | # if job has finished | # tests can exit the | | -# pipeline at any point ↓ | -# if they fail [ run_wait ]<--------------+ +# pipeline at any point v | +# if they fail [ completed ]<--------------+ # : | # : if sanity and performance # | succeed # | | -# ↓ ↓ +# v v # ( failed ) ( retired ) class AsynchronousExecutionPolicy(ExecutionPolicy, TaskEventListener): From e9b206dae3e63031deeb7b3cf96e1cb6abcf7cb6 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Dec 2021 19:48:32 +0100 Subject: [PATCH 47/76] Add policy state property to RegressionTask --- reframe/frontend/executors/__init__.py | 14 ++++++++++++-- reframe/frontend/executors/policies.py | 20 ++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 6bac640696..be3d5895b9 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -216,8 +216,18 @@ def failed(self): not self._aborted and not self._skipped) @property - def current_stage(self): - return self._current_stage + def policy_state(self): + states = { + 'startup': 'startup', + 'setup': 'ready_compile', + 'compile': 'compiling', + 'compile_wait': 'ready_run', + 'run': 'running', + 'run_wait': 'completed', + 'finalize': 'retired', + 'cleanup': 'finished', + } + return states[self._current_stage] @property def failed_stage(self): diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 91f2488085..bef5b7036e 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -257,7 +257,7 @@ def exit(self): # if job has finished | # tests can exit the | | # pipeline at any point v | -# if they fail [ completed ]<--------------+ +# if they fail [ completed ]<-------------+ # : | # : if sanity and performance # | succeed @@ -319,7 +319,7 @@ def 
exit(self): try: self._poll_tasks() num_running = sum( - 1 if t._current_stage in ('run', 'compile') else 0 + 1 if t.policy_state in ('running', 'compiling') else 0 for t in self._current_tasks ) self.advance_all(self._current_tasks, self._policy_timeout) @@ -336,9 +336,9 @@ def _poll_tasks(self): for partname, sched in pairs: jobs = [] for t in self._scheduler_tasks[partname]: - if t._current_stage == 'compile': + if t.policy_state == 'compiling': jobs.append(t.check.build_job) - elif t._current_stage == 'run': + elif t.policy_state == 'running': jobs.append(t.check.job) sched.poll(*jobs) @@ -366,7 +366,7 @@ def advance_all(self, tasks, timeout=None): getlogger().debug2(f"Current tests: {len(tasks)}") # progress might remove the tasks that retire or fail for t in list(tasks): - bump_state = getattr(self, f'advance_{t._current_stage}') + bump_state = getattr(self, f'advance_{t.policy_state}') num_progressed += bump_state(t) t_elapsed = time.time() - t_init if timeout and t_elapsed > timeout and num_progressed: @@ -415,7 +415,7 @@ def advance_startup(self, task): getlogger().debug2(f'{task.check.info()} waiting for dependencies') return 0 - def advance_setup(self, task): + def advance_ready_compile(self, task): partname = ( '_rfm_local' if task.check.local or task.check.build_locally else task.check.current_partition.fullname @@ -429,7 +429,7 @@ def advance_setup(self, task): getlogger().debug2(f'Hit the max job limit of {partname}') return 0 - def advance_compile(self, task): + def advance_compiling(self, task): partname = ( '_rfm_local' if task.check.local or task.check.build_locally else task.check.current_partition.fullname @@ -453,7 +453,7 @@ def advance_compile(self, task): self._current_tasks.remove(task) return 1 - def advance_compile_wait(self, task): + def advance_ready_run(self, task): partname = ( '_rfm_local' if task.check.local else task.check.current_partition.fullname @@ -467,7 +467,7 @@ def advance_compile_wait(self, task): getlogger().debug2(f'Hit 
the max job limit of {partname}') return 0 - def advance_run(self, task): + def advance_running(self, task): partname = ( '_rfm_local' if task.check.local else task.check.current_partition.fullname @@ -486,7 +486,7 @@ def advance_run(self, task): self._current_tasks.remove(task) return 1 - def advance_run_wait(self, task): + def advance_completed(self, task): try: if not self.skip_sanity_check: task.sanity() From caf49b28df54cd8ad2e45a9c9f986693859cdd2a Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Dec 2021 19:52:24 +0100 Subject: [PATCH 48/76] Fix long lines --- reframe/frontend/executors/policies.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index bef5b7036e..abb34ee59e 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -290,9 +290,11 @@ def __init__(self): # Job limit per partition self._max_jobs = { - '_rfm_local': rt.runtime().get_option(f'systems/0/max_local_jobs') + '_rfm_local': rt.runtime().get_option('systems/0/max_local_jobs') } - self._policy_timeout = rt.runtime().get_option(f'systems/0/policy_timeout') + self._policy_timeout = rt.runtime().get_option( + 'systems/0/policy_timeout' + ) self.task_listeners.append(self) @@ -352,7 +354,8 @@ def _execute_stage(self, task, methods): except TaskExit: self._current_tasks.remove(task) with contextlib.suppress(KeyError, AttributeError): - self._scheduler_tasks[task.check.current_partition.fullname].remove(task) + partname = task.check.current_partition.fullname + self._scheduler_tasks[partname].remove(task) with contextlib.suppress(KeyError): self._scheduler_tasks['_rfm_local'].remove(task) @@ -442,7 +445,9 @@ def advance_compiling(self, task): if isinstance(task.check, CompileOnlyRegressionTest): # All tests should pass from all the pipeline stages, # even if they are no-ops - self._execute_stage(task, [task.run, task.run_complete, 
task.run_wait]) + self._execute_stage(task, [task.run, + task.run_complete, + task.run_wait]) return 1 else: From 2f20e0aa1f08a28eb584d0fa3e7807afda0ce403 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Dec 2021 19:55:48 +0100 Subject: [PATCH 49/76] Rename policy states --- reframe/frontend/executors/__init__.py | 4 ++-- reframe/frontend/executors/policies.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index be3d5895b9..3281733ca9 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -223,9 +223,9 @@ def policy_state(self): 'compile': 'compiling', 'compile_wait': 'ready_run', 'run': 'running', - 'run_wait': 'completed', + 'run_wait': 'completing', 'finalize': 'retired', - 'cleanup': 'finished', + 'cleanup': 'completed', } return states[self._current_stage] diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index abb34ee59e..d79b246603 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -257,7 +257,7 @@ def exit(self): # if job has finished | # tests can exit the | | # pipeline at any point v | -# if they fail [ completed ]<-------------+ +# if they fail [ completing ]<------------+ # : | # : if sanity and performance # | succeed @@ -491,7 +491,7 @@ def advance_running(self, task): self._current_tasks.remove(task) return 1 - def advance_completed(self, task): + def advance_completing(self, task): try: if not self.skip_sanity_check: task.sanity() From d2af207f0e957837944bb4b315b9df42706e78eb Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Dec 2021 20:10:44 +0100 Subject: [PATCH 50/76] Address PR comments --- reframe/frontend/executors/__init__.py | 14 ++++-------- reframe/frontend/executors/policies.py | 30 ++++++++++++-------------- 2 files changed, 18 insertions(+), 26 deletions(-) diff --git 
a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 3281733ca9..60f4c1c269 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -262,11 +262,9 @@ class update_timestamps: # we don't want to masquerade the self argument of our containing # function def __enter__(this): - if ( - fn.__name__ != 'poll' and - fn.__name__ != 'run_complete' and - fn.__name__ != 'compile_complete' - ): + if fn.__name__ in ('poll', + 'run_complete', + 'compile_complete'): stage = self._current_stage self._timestamps[f'{stage}_start'] = time.time() @@ -275,11 +273,7 @@ def __exit__(this, exc_type, exc_value, traceback): self._timestamps[f'{stage}_finish'] = time.time() self._timestamps['pipeline_end'] = time.time() - if ( - fn.__name__ != 'poll' and - fn.__name__ != 'run_complete' and - fn.__name__ != 'compile_complete' - ): + if fn.__name__ in ('poll', 'run_complete', 'compile_complete'): self._current_stage = fn.__name__ try: diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index d79b246603..c09119246b 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -20,6 +20,16 @@ TaskEventListener, ABORT_REASONS) +def _get_partition_name(task, phase='run'): + if ( + task.check.local or + phase == 'build' and task.check.build_locally + ): + return '_rfm_local' + else: + return task.check.current_partition.fullname + + def _cleanup_all(tasks, *args, **kwargs): for task in tasks: if task.ref_count == 0: @@ -419,10 +429,7 @@ def advance_startup(self, task): return 0 def advance_ready_compile(self, task): - partname = ( - '_rfm_local' if task.check.local or task.check.build_locally - else task.check.current_partition.fullname - ) + partname = _get_partition_name(task, phase='build') if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: if self._execute_stage(task, [task.compile]): 
self._scheduler_tasks[partname].add(task) @@ -433,10 +440,7 @@ def advance_ready_compile(self, task): return 0 def advance_compiling(self, task): - partname = ( - '_rfm_local' if task.check.local or task.check.build_locally - else task.check.current_partition.fullname - ) + partname = _get_partition_name(task, phase='build') try: if task.compile_complete(): task.compile_wait() @@ -459,10 +463,7 @@ def advance_compiling(self, task): return 1 def advance_ready_run(self, task): - partname = ( - '_rfm_local' if task.check.local - else task.check.current_partition.fullname - ) + partname = _get_partition_name(task, phase='run') if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: if self._execute_stage(task, [task.run]): self._scheduler_tasks[partname].add(task) @@ -473,10 +474,7 @@ def advance_ready_run(self, task): return 0 def advance_running(self, task): - partname = ( - '_rfm_local' if task.check.local - else task.check.current_partition.fullname - ) + partname = _get_partition_name(task, phase='run') try: if task.run_complete(): if self._execute_stage(task, [task.run_wait]): From 39ab172c290691e942440161e98b938abbc05bed Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 21 Dec 2021 20:13:05 +0100 Subject: [PATCH 51/76] Fix bug --- reframe/frontend/executors/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 60f4c1c269..6b867787ea 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -262,9 +262,9 @@ class update_timestamps: # we don't want to masquerade the self argument of our containing # function def __enter__(this): - if fn.__name__ in ('poll', - 'run_complete', - 'compile_complete'): + if fn.__name__ not in ('poll', + 'run_complete', + 'compile_complete'): stage = self._current_stage self._timestamps[f'{stage}_start'] = time.time() @@ -273,7 +273,7 @@ def __exit__(this, 
exc_type, exc_value, traceback): self._timestamps[f'{stage}_finish'] = time.time() self._timestamps['pipeline_end'] = time.time() - if fn.__name__ in ('poll', 'run_complete', 'compile_complete'): + if fn.__name__ not in ('poll', 'run_complete', 'compile_complete'): self._current_stage = fn.__name__ try: From c6e98b2de7bf7ff4f36b50e10852525d0d100873 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 22 Dec 2021 00:40:08 +0100 Subject: [PATCH 52/76] Update docs --- docs/_static/img/async-exec-policy.svg | 2 +- .../img/regression-task-state-machine.svg | 3 ++ docs/_static/img/serial-exec-policy.svg | 2 +- docs/pipeline.rst | 31 ++++++++++++- reframe/frontend/executors/policies.py | 45 +------------------ 5 files changed, 36 insertions(+), 47 deletions(-) create mode 100644 docs/_static/img/regression-task-state-machine.svg diff --git a/docs/_static/img/async-exec-policy.svg b/docs/_static/img/async-exec-policy.svg index 198a661d3e..5dce160ee6 100644 --- a/docs/_static/img/async-exec-policy.svg +++ b/docs/_static/img/async-exec-policy.svg @@ -1,3 +1,3 @@ -
SE
SE
BU
BU
RU
RU
SA
SA
PE
PE
CL
CL
SE
SE
BU
BU
RU
RU
SE
SE
BU
BU
RU
RU
SA
SA
PE
PE
CL
CL
SA
SA
PE
PE
CL
CL
Viewer does not support full SVG 1.1
\ No newline at end of file +
SE
SE
BU
BU
RU
RU
SA
SA
PE
PE
CL
CL
BU
BU
SE
SE
SE
SE
BU
BU
RU
RU
RU
RU
SA
SA
CL
CL
PE
PE
SA
SA
CL
CL
PE
PE
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/docs/_static/img/regression-task-state-machine.svg b/docs/_static/img/regression-task-state-machine.svg new file mode 100644 index 0000000000..2e822b3443 --- /dev/null +++ b/docs/_static/img/regression-task-state-machine.svg @@ -0,0 +1,3 @@ + + +
exception
exception
STARTUP
STARTUP
exception
exception
READY
COMPILE
READY...
exception
exception
COMPILING
COMPILING
exception
exception
READY
RUN
READY...
exception
exception
RUNNING
RUNNING
Sanity & perf. check
Sanity & perf. check
exception
exception
COMPLETING
COMPLETING
Deps ready &
not RunOnly
Deps ready...
Exec. slots
available
Exec. slot...
Finished and
not CompileOnly
Finished a...
COMPLETED
COMPLETED
Deps
pending
Deps...
No exec. slots
No exec. slots
Not finished
Not finished
Exec. slots
available
Exec. slot...
No exec.
slots
No exec....
Not finished
Not finished
Cleanup
Cleanup
Cleanup
failure
Cleanup...
RETIRED
(success)
RETIRED...
Finished
Finished
FAILED
FAILED
ERROR
ERROR
compile_complete
compile_complete
compile_wait
compile_wait
setup
setup
compile
compile
run
run
run_complete
run_complete
run_wait
run_wait
sanity
sanity
performance
performance
cleanup
cleanup
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/docs/_static/img/serial-exec-policy.svg b/docs/_static/img/serial-exec-policy.svg index 1955dfbacc..449540ba67 100644 --- a/docs/_static/img/serial-exec-policy.svg +++ b/docs/_static/img/serial-exec-policy.svg @@ -1,3 +1,3 @@ -
SE
SE
BU
BU
RU
RU
SA
SA
PE
PE
CL
CL
Idling
Idling
SE
SE
BU
BU
RU
RU
SA
SA
PE
PE
CL
CL
Idling
Idling
Viewer does not support full SVG 1.1
\ No newline at end of file +
SE
SE
BU
BU
RU
RU
SA
SA
PE
PE
CL
CL
Idling
Idling
SE
SE
BU
BU
RU
RU
SA
SA
PE
PE
CL
CL
Idling
Idling
Idling
Idling
Idling
Idling
Viewer does not support full SVG 1.1
\ No newline at end of file diff --git a/docs/pipeline.rst b/docs/pipeline.rst index e88c92fb3b..8e0cc65b41 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -101,10 +101,10 @@ Execution Policies All regression tests in ReFrame will execute the pipeline stages described above. However, how exactly this pipeline will be executed is responsibility of the test execution policy. -There are two execution policies in ReFrame: the serial and the asynchronous one. +There are two execution policies in ReFrame: the serial and the asynchronous execution policy. In the serial execution policy, a new test gets into the pipeline after the previous one has exited. -As the figure below shows, this can lead to long idling times in the run phase, since the execution blocks until the associated test job finishes. +As the figure below shows, this can lead to long idling times in the build and run phases, since the execution blocks until the associated test job finishes. .. figure:: _static/img/serial-exec-policy.svg @@ -134,6 +134,33 @@ When the `concurrency limit [ ready_run ] | -# | | -# if there are available | -# slots | -# | if compilation has finished and -# v test is CompileOnly -# [ running ] | -# | | -# if job has finished | -# tests can exit the | | -# pipeline at any point v | -# if they fail [ completing ]<------------+ -# : | -# : if sanity and performance -# | succeed -# | | -# v v -# ( failed ) ( retired ) - class AsynchronousExecutionPolicy(ExecutionPolicy, TaskEventListener): + '''The asynchronous execution policy.''' + def __init__(self): super().__init__() From bad2f90734a909ecc2c2e743b90b7732647fe09e Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 22 Dec 2021 10:01:41 +0100 Subject: [PATCH 53/76] Address PR comments --- docs/config_reference.rst | 25 +++++ docs/pipeline.rst | 2 +- reframe/frontend/executors/__init__.py | 2 +- reframe/frontend/executors/policies.py | 148 +++++++++++++------------ reframe/schemas/config.json | 10 +- 
unittests/test_policies.py | 79 ++++++------- 6 files changed, 152 insertions(+), 114 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 7629c1a199..2d895aee1c 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -101,6 +101,18 @@ System Configuration A list of hostname regular expression patterns in Python `syntax `__, which will be used by the framework in order to automatically select a system configuration. For the auto-selection process, see `here `__. +.. js:attribute:: .systems[].max_local_jobs + + The maximum number of forced local build or run jobs allowed. + + Forced local jobs run within the execution context of ReFrame. + + :required: No + :default: ``8`` + + .. versionadded:: 3.9.3 + + .. js:attribute:: .systems[].modules_system :required: No @@ -1289,6 +1301,19 @@ General Configuration Timeout value in seconds used when checking if a git repository exists. +.. js:attribute:: .general[].pipeline_timeout + + Timeout in seconds for advancing the pipeline in the asynchronous execution policy. + + ReFrame's asynchronous execution policy will try to advance as many tests as possible in their pipeline, but some tests may take too long to proceed (e.g., due to copying of large files) blocking the advancement of previously started tests. + If this timeout value is exceeded and at least one test has progressed, ReFrame will stop processing new tests and it will try to further advance tests that have already started. + + :required: No + :default: ``10`` + + .. versionadded:: 3.9.3 + + .. js:attribute:: .general[].remote_detect :required: No diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 8e0cc65b41..61664580f4 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -148,7 +148,7 @@ The following state diagram shows how test tasks are scheduled, as well as when There are a number of things to notice in this diagram: - If a test encounters an exception it is marked as a failure. 
- Even normal failures, such as dependency failures and sanity of performance failures are also exceptions raised explicitly by the framework during a pipeline stage. + Even normal failures, such as dependency failures and sanity or performance failures are also exceptions raised explicitly by the framework during a pipeline stage. - The pipeline stages that are executed asynchronously, namely the ``compile`` and ``run`` stages, are split in sub-stages for submitting the corresponding job and for checking or waiting its completion. This is why in ReFrame error messages you may see ``compile_complete`` or ``run_complete`` being reported as the failing stage. - The execution of a test may be stalled if there are not enough execution slots available for submitting compile or run jobs on the target partition. diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 6b867787ea..216fa3e6a9 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -216,7 +216,7 @@ def failed(self): not self._aborted and not self._skipped) @property - def policy_state(self): + def state(self): states = { 'startup': 'startup', 'setup': 'ready_compile', diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 5680af8168..e2d3fc952f 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -9,6 +9,7 @@ import time import reframe.core.runtime as rt +import reframe.utility as util from reframe.core.exceptions import (FailureLimitError, SkipTestError, TaskDependencyError, @@ -21,10 +22,8 @@ def _get_partition_name(task, phase='run'): - if ( - task.check.local or - phase == 'build' and task.check.build_locally - ): + if (task.check.local or + (phase == 'build' and task.check.build_locally)): return '_rfm_local' else: return task.check.current_partition.fullname @@ -243,15 +242,19 @@ def __init__(self): # Index tasks by test cases 
self._task_index = {} - # A set of all the current tasks - self._current_tasks = set() + # A set of all the current tasks. We use an ordered set here, because + # we want to preserve the order of the tasks. + self._current_tasks = util.OrderedSet() - # Keep a reference to all the partitions - self._partitions = set() + # Quick look up for the partition schedulers including the + # `_rfm_local` pseudo-partition + self._schedulers = { + '_rfm_local': self.local_scheduler + } - # Sets of the jobs that should be polled for each partition - self._scheduler_tasks = { - '_rfm_local': set() + # Tasks per partition + self._partition_tasks = { + '_rfm_local': util.OrderedSet() } # Retired tasks that need to be cleaned up @@ -261,19 +264,15 @@ def __init__(self): self._max_jobs = { '_rfm_local': rt.runtime().get_option('systems/0/max_local_jobs') } - self._policy_timeout = rt.runtime().get_option( - 'systems/0/policy_timeout' - ) - self.task_listeners.append(self) def runcase(self, case): super().runcase(case) check, partition, environ = case - self._partitions.add(partition) + self._schedulers[partition.fullname] = partition.scheduler # Set partition-based counters, if not set already - self._scheduler_tasks.setdefault(partition.fullname, set()) + self._partition_tasks.setdefault(partition.fullname, util.OrderedSet()) self._max_jobs.setdefault(partition.fullname, partition.max_jobs) task = RegressionTask(case, self.task_listeners) @@ -290,10 +289,13 @@ def exit(self): try: self._poll_tasks() num_running = sum( - 1 if t.policy_state in ('running', 'compiling') else 0 + 1 if t.state in ('running', 'compiling') else 0 for t in self._current_tasks ) - self.advance_all(self._current_tasks, self._policy_timeout) + timeout = rt.runtime().get_option( + 'general/pipeline_timeout' + ) + self._advance_all(self._current_tasks, timeout) _cleanup_all(self._retired_tasks, not self.keep_stage_files) if num_running: self._pollctl.running_tasks(num_running).snooze() @@ -302,43 +304,54 @@ def 
exit(self): raise def _poll_tasks(self): - pairs = [(p.fullname, p.scheduler) for p in self._partitions] - pairs.append(('_rfm_local', self.local_scheduler)) - for partname, sched in pairs: + for partname, sched in self._schedulers.items(): jobs = [] - for t in self._scheduler_tasks[partname]: - if t.policy_state == 'compiling': + for t in self._partition_tasks[partname]: + if t.state == 'compiling': jobs.append(t.check.build_job) - elif t.policy_state == 'running': + elif t.state == 'running': jobs.append(t.check.job) sched.poll(*jobs) - def _execute_stage(self, task, methods): - try: - for m in methods: - m() + def _exec_stage(self, task, stage_methods): + '''Execute a series of pipeline stages. - return True + Return True on success, False otherwise. + ''' + + try: + for stage in stage_methods: + stage() except TaskExit: self._current_tasks.remove(task) - with contextlib.suppress(KeyError, AttributeError): + if task.check.current_partition: partname = task.check.current_partition.fullname - self._scheduler_tasks[partname].remove(task) + else: + partname = None + # Remove tasks from the partition tasks if there with contextlib.suppress(KeyError): - self._scheduler_tasks['_rfm_local'].remove(task) + self._partition_tasks['_rfm_local'].remove(task) + if partname: + self._partition_tasks[partname].remove(task) return False + else: + return True + + def _advance_all(self, tasks, timeout=None): + print(tasks) - def advance_all(self, tasks, timeout=None): t_init = time.time() num_progressed = 0 - getlogger().debug2(f"Current tests: {len(tasks)}") - # progress might remove the tasks that retire or fail + getlogger().debug2(f'Current tests: {len(tasks)}') + + # We take a snapshot of the tasks to advance by doing a shallow copy, + # since the tasks may removed by the individual advance functions. 
for t in list(tasks): - bump_state = getattr(self, f'advance_{t.policy_state}') + bump_state = getattr(self, f'_advance_{t.state}') num_progressed += bump_state(t) t_elapsed = time.time() - t_init if timeout and t_elapsed > timeout and num_progressed: @@ -346,7 +359,7 @@ def advance_all(self, tasks, timeout=None): getlogger().debug2(f'Bumped {num_progressed} test(s)') - def advance_startup(self, task): + def _advance_startup(self, task): if self.deps_skipped(task): try: raise SkipTestError('skipped due to skipped dependencies') @@ -370,11 +383,11 @@ def advance_startup(self, task): return 1 if isinstance(task.check, RunOnlyRegressionTest): - # All tests should pass from all the pipeline stages, even if + # All tests should execute all the pipeline stages, even if # they are no-ops - self._execute_stage(task, [task.compile, - task.compile_complete, - task.compile_wait]) + self._exec_stage(task, [task.compile, + task.compile_complete, + task.compile_wait]) return 1 elif self.deps_failed(task): @@ -387,68 +400,68 @@ def advance_startup(self, task): getlogger().debug2(f'{task.check.info()} waiting for dependencies') return 0 - def advance_ready_compile(self, task): + def _advance_ready_compile(self, task): partname = _get_partition_name(task, phase='build') - if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: - if self._execute_stage(task, [task.compile]): - self._scheduler_tasks[partname].add(task) + max_jobs = self._max_jobs[partname] + if len(self._partition_tasks[partname]) < max_jobs: + if self._exec_stage(task, [task.compile]): + self._partition_tasks[partname].add(task) return 1 - getlogger().debug2(f'Hit the max job limit of {partname}') + getlogger().debug2(f'Hit the max job limit of {partname}: {max_jobs}') return 0 - def advance_compiling(self, task): + def _advance_compiling(self, task): partname = _get_partition_name(task, phase='build') try: if task.compile_complete(): task.compile_wait() - self._scheduler_tasks[partname].remove(task) - + 
self._partition_tasks[partname].remove(task) if isinstance(task.check, CompileOnlyRegressionTest): # All tests should pass from all the pipeline stages, # even if they are no-ops - self._execute_stage(task, [task.run, - task.run_complete, - task.run_wait]) + self._exec_stage(task, [task.run, + task.run_complete, + task.run_wait]) return 1 else: return 0 - except TaskExit: - self._scheduler_tasks[partname].remove(task) + self._partition_tasks[partname].remove(task) self._current_tasks.remove(task) return 1 - def advance_ready_run(self, task): + def _advance_ready_run(self, task): partname = _get_partition_name(task, phase='run') - if len(self._scheduler_tasks[partname]) < self._max_jobs[partname]: - if self._execute_stage(task, [task.run]): - self._scheduler_tasks[partname].add(task) + max_jobs = self._max_jobs[partname] + if len(self._partition_tasks[partname]) < max_jobs: + if self._exec_stage(task, [task.run]): + self._partition_tasks[partname].add(task) return 1 - getlogger().debug2(f'Hit the max job limit of {partname}') + getlogger().debug2(f'Hit the max job limit of {partname}: {max_jobs}') return 0 - def advance_running(self, task): + def _advance_running(self, task): partname = _get_partition_name(task, phase='run') try: if task.run_complete(): - if self._execute_stage(task, [task.run_wait]): - self._scheduler_tasks[partname].remove(task) + if self._exec_stage(task, [task.run_wait]): + self._partition_tasks[partname].remove(task) return 1 else: return 0 except TaskExit: - self._scheduler_tasks[partname].remove(task) + self._partition_tasks[partname].remove(task) self._current_tasks.remove(task) return 1 - def advance_completing(self, task): + def _advance_completing(self, task): try: if not self.skip_sanity_check: task.sanity() @@ -486,9 +499,8 @@ def _failall(self, cause): with contextlib.suppress(FailureLimitError): task.abort(cause) - # These function can be useful for tracking statistics of the framework - # like number of tests that have finished setup 
etc, so we will keep them - # for now. + # These function can be useful for tracking statistics of the framework, + # such as number of tests that have finished setup etc. def on_task_setup(self, task): pass diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 8be2c989e3..2f5a37f61d 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -226,6 +226,7 @@ "type": "array", "items": {"type": "string"} }, + "max_local_jobs": {"type": "number"}, "modules_system": { "type": "string", "enum": ["tmod", "tmod31", "tmod32", "tmod4", @@ -237,8 +238,6 @@ "stagedir": {"type": "string"}, "outputdir": {"type": "string"}, "resourcesdir": {"type": "string"}, - "policy_timeout": {"type": "number"}, - "max_local_jobs": {"type": "number"}, "partitions": { "type": "array", "items": { @@ -466,6 +465,7 @@ "items": {"type": "string"} }, "non_default_craype": {"type": "boolean"}, + "pipeline_timeout": {"type": "number"}, "purge_environment": {"type": "boolean"}, "remote_detect": {"type": "boolean"}, "remote_workdir": {"type": "string"}, @@ -499,6 +499,7 @@ "environments/ldflags": [], "environments/extras": {}, "environments/target_systems": ["*"], + "general/pipeline_timeout": 10, "general/check_search_path": ["${RFM_INSTALL_PREFIX}/checks/"], "general/check_search_recursive": false, "general/clean_stagedir": true, @@ -547,6 +548,7 @@ "schedulers/target_systems": ["*"], "schedulers/use_nodes_option": false, "systems/descr": "", + "systems/max_local_jobs": 8, "systems/modules_system": "nomod", "systems/modules": [], "systems/variables": [], @@ -568,8 +570,6 @@ "systems/partitions/prepare_cmds": [], "systems/partitions/processor": {}, "systems/partitions/devices": [], - "systems/partitions/extras": {}, - "systems/policy_timeout": 10, - "systems/max_local_jobs": 8 + "systems/partitions/extras": {} } } diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 71b69edeb2..a12a7d63af 100644 --- a/unittests/test_policies.py +++ 
b/unittests/test_policies.py @@ -565,16 +565,24 @@ def on_task_setup(self, task): def max_jobs_opts(n): - return {'systems/partitions/max_jobs': n} + return {'systems/partitions/max_jobs': n, + 'systems/max_local_jobs': n} @pytest.fixture -def async_runner(): - evt_monitor = _TaskEventMonitor() - ret = executors.Runner(policies.AsynchronousExecutionPolicy()) - ret.policy.keep_stage_files = True - ret.policy.task_listeners.append(evt_monitor) - return ret, evt_monitor +def make_async_runner(): + # We need to have control in the unit tests where the policy is created, + # because in some cases we need it to be initialized after the execution + # context. For this reason, we use a constructor fixture here. + + def _make_runner(): + evt_monitor = _TaskEventMonitor() + ret = executors.Runner(policies.AsynchronousExecutionPolicy()) + ret.policy.keep_stage_files = True + ret.policy.task_listeners.append(evt_monitor) + return ret, evt_monitor + + return _make_runner def _read_timestamps(tasks): @@ -595,13 +603,10 @@ def _read_timestamps(tasks): return begin_stamps, end_stamps -def test_concurrency_unlimited(async_runner, make_cases): +def test_concurrency_unlimited(make_async_runner, make_cases, make_exec_ctx): num_checks = 3 - - runner, monitor = async_runner - runner.policy._max_jobs = { - '_rfm_local': num_checks - } + make_exec_ctx(options=max_jobs_opts(num_checks)) + runner, monitor = make_async_runner() runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) # Ensure that all tests were run and without failures. @@ -623,14 +628,12 @@ def test_concurrency_unlimited(async_runner, make_cases): pytest.skip('the system seems too much loaded.') -def test_concurrency_limited(async_runner, make_cases): +def test_concurrency_limited(make_async_runner, make_cases, make_exec_ctx): # The number of checks must be <= 2*max_jobs. 
num_checks, max_jobs = 5, 3 + make_exec_ctx(options=max_jobs_opts(max_jobs)) - runner, monitor = async_runner - runner.policy._max_jobs = { - '_rfm_local': 3 - } + runner, monitor = make_async_runner() runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) # Ensure that all tests were run and without failures. @@ -667,13 +670,11 @@ def test_concurrency_limited(async_runner, make_cases): pytest.skip('the system seems too loaded.') -def test_concurrency_none(async_runner, make_cases): +def test_concurrency_none(make_async_runner, make_cases, make_exec_ctx): num_checks = 3 + make_exec_ctx(options=max_jobs_opts(1)) - runner, monitor = async_runner - runner.policy._max_jobs = { - '_rfm_local': 1 - } + runner, monitor = make_async_runner() runner.runall(make_cases([SleepCheck(.5) for i in range(num_checks)])) # Ensure that all tests were run and without failures. @@ -709,10 +710,10 @@ def assert_interrupted_run(runner): assert t.exc_info[0] == AbortTaskError -def test_kbd_interrupt_in_wait_with_concurrency(async_runner, make_cases, +def test_kbd_interrupt_in_wait_with_concurrency(make_async_runner, make_cases, make_exec_ctx): make_exec_ctx(options=max_jobs_opts(4)) - runner, _ = async_runner + runner, _ = make_async_runner() with pytest.raises(KeyboardInterrupt): runner.runall(make_cases([ KeyboardInterruptCheck(), SleepCheck(10), @@ -723,7 +724,7 @@ def test_kbd_interrupt_in_wait_with_concurrency(async_runner, make_cases, def test_kbd_interrupt_in_wait_with_limited_concurrency( - async_runner, make_cases, make_exec_ctx + make_async_runner, make_cases, make_exec_ctx ): # The general idea for this test is to allow enough time for all the # four checks to be submitted and at the same time we need the @@ -731,7 +732,7 @@ def test_kbd_interrupt_in_wait_with_limited_concurrency( # trigger the failure), so as to make the framework kill the remaining # three. 
make_exec_ctx(options=max_jobs_opts(2)) - runner, _ = async_runner + runner, _ = make_async_runner() with pytest.raises(KeyboardInterrupt): runner.runall(make_cases([ KeyboardInterruptCheck(), SleepCheck(10), @@ -741,10 +742,10 @@ def test_kbd_interrupt_in_wait_with_limited_concurrency( assert_interrupted_run(runner) -def test_kbd_interrupt_in_setup_with_concurrency(async_runner, make_cases, +def test_kbd_interrupt_in_setup_with_concurrency(make_async_runner, make_cases, make_exec_ctx): make_exec_ctx(options=max_jobs_opts(4)) - runner, _ = async_runner + runner, _ = make_async_runner() with pytest.raises(KeyboardInterrupt): runner.runall(make_cases([ SleepCheck(1), SleepCheck(1), SleepCheck(1), @@ -755,10 +756,10 @@ def test_kbd_interrupt_in_setup_with_concurrency(async_runner, make_cases, def test_kbd_interrupt_in_setup_with_limited_concurrency( - async_runner, make_cases, make_exec_ctx + make_async_runner, make_cases, make_exec_ctx ): make_exec_ctx(options=max_jobs_opts(2)) - runner, _ = async_runner + runner, _ = make_async_runner() with pytest.raises(KeyboardInterrupt): runner.runall(make_cases([ SleepCheck(1), SleepCheck(1), SleepCheck(1), @@ -768,10 +769,10 @@ def test_kbd_interrupt_in_setup_with_limited_concurrency( assert_interrupted_run(runner) -def test_run_complete_fails_main_loop(async_runner, make_cases, +def test_run_complete_fails_main_loop(make_async_runner, make_cases, make_exec_ctx): make_exec_ctx(options=max_jobs_opts(1)) - runner, _ = async_runner + runner, _ = make_async_runner() num_checks = 3 runner.runall(make_cases([SleepCheckPollFail(10), SleepCheck(0.1), SleepCheckPollFail(10)])) @@ -786,10 +787,10 @@ def test_run_complete_fails_main_loop(async_runner, make_cases, assert isinstance(t.check, SleepCheck) -def test_run_complete_fails_busy_loop(async_runner, make_cases, +def test_run_complete_fails_busy_loop(make_async_runner, make_cases, make_exec_ctx): make_exec_ctx(options=max_jobs_opts(1)) - runner, _ = async_runner + runner, _ = 
make_async_runner() num_checks = 3 runner.runall(make_cases([SleepCheckPollFailLate(1), SleepCheck(0.1), SleepCheckPollFailLate(0.5)])) @@ -804,10 +805,10 @@ def test_run_complete_fails_busy_loop(async_runner, make_cases, assert isinstance(t.check, SleepCheck) -def test_compile_fail_reschedule_main_loop(async_runner, make_cases, +def test_compile_fail_reschedule_main_loop(make_async_runner, make_cases, make_exec_ctx): make_exec_ctx(options=max_jobs_opts(1)) - runner, _ = async_runner + runner, _ = make_async_runner() num_checks = 2 runner.runall(make_cases([SleepCheckPollFail(.1), CompileFailureCheck()])) @@ -817,10 +818,10 @@ def test_compile_fail_reschedule_main_loop(async_runner, make_cases, assert num_checks == len(stats.failed()) -def test_compile_fail_reschedule_busy_loop(async_runner, make_cases, +def test_compile_fail_reschedule_busy_loop(make_async_runner, make_cases, make_exec_ctx): make_exec_ctx(options=max_jobs_opts(1)) - runner, _ = async_runner + runner, _ = make_async_runner() num_checks = 2 runner.runall( make_cases([SleepCheckPollFailLate(1.5), CompileFailureCheck()]) From a8b12204d8cbe68a0ffcf1b1defe22af5246c912 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 22 Dec 2021 15:45:34 +0100 Subject: [PATCH 54/76] Document execution contexts --- docs/pipeline.rst | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 61664580f4..9224065c1c 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -161,6 +161,40 @@ There are a number of things to notice in this diagram: The ``compile`` stage is now also executed asynchronously. +-------------------------------------- +Where each pipeline stage is executed? +-------------------------------------- + +There are two executions contexts where a pipeline stage can be executed: the ReFrame execution context and the partition execution context. +The *ReFrame execution context* is where ReFrame executes. 
+This is always the local host. +The *partition execution context* can either be local or remote depending on how the partition is configured. +The following table shows in which context each pipeline stage executes: + +.. table:: + :align: center + + ============== ================= + Pipeline Stage Execution Context + ============== ================= + *Setup* ReFrame + *Compile* ReFrame if :attr:`~reframe.core.pipeline.RegressionTest.build_locally` is :obj:`True`, partition otherwise. + *Run* ReFrame if :attr:`~reframe.core.pipeline.RegressionTest.local` is :obj:`True` or if :option:`--force-local` is passed, partition otherwise. + *Sanity* ReFrame + *Performance* ReFrame + *Cleanup* ReFrame + ============== ================= + +It should be noted that even if the partition execution context is local, it is treated differently from the ReFrame execution context. +For example, a test executing in the ReFrame context will not respect the :js:attr:`max_jobs` partition configuration option, even if the partition is local. +To control the concurrency of the ReFrame execution context, users should set the :js:attr:`.systems[].max_local_jobs` option instead. + + +.. versionchanged:: 3.9.3 + + Execution contexts were formalized. 
+ + Timing the Test Pipeline ------------------------ From d29f2e80f670ea64b232d9f2722f24d1b64534fd Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 22 Dec 2021 16:00:13 +0100 Subject: [PATCH 55/76] Remove stale print statement --- reframe/frontend/executors/policies.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index e2d3fc952f..c69e88196c 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -341,8 +341,6 @@ def _exec_stage(self, task, stage_methods): return True def _advance_all(self, tasks, timeout=None): - print(tasks) - t_init = time.time() num_progressed = 0 From 59c2c381ff9241dcf5221d4b7c0092f6b6f77ceb Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 11 Jan 2022 21:33:05 +0100 Subject: [PATCH 56/76] Change default pipeline timeout and environment variable --- reframe/frontend/cli.py | 7 +++++++ reframe/schemas/config.json | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 4a9fd734f2..8a3dbaa168 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -508,6 +508,13 @@ def main(): action='store_true', help='Use a compact test naming scheme' ) + argparser.add_argument( + dest='pipeline_timeout', + envvar='RFM_PIPELINE_TIMEOUT', + configvar='general/pipeline_timeout', + action='store', + help='Timeout for advancing the pipeline' + ) argparser.add_argument( dest='remote_detect', envvar='RFM_REMOTE_DETECT', diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 2f5a37f61d..7c7ee2487a 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -465,7 +465,7 @@ "items": {"type": "string"} }, "non_default_craype": {"type": "boolean"}, - "pipeline_timeout": {"type": "number"}, + "pipeline_timeout": {"type": ["number", "null"]}, "purge_environment": {"type": "boolean"}, 
"remote_detect": {"type": "boolean"}, "remote_workdir": {"type": "string"}, @@ -499,7 +499,7 @@ "environments/ldflags": [], "environments/extras": {}, "environments/target_systems": ["*"], - "general/pipeline_timeout": 10, + "general/pipeline_timeout": null, "general/check_search_path": ["${RFM_INSTALL_PREFIX}/checks/"], "general/check_search_recursive": false, "general/clean_stagedir": true, From d6c66a9abad95bff574de04711f5544595b97d1a Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 11 Jan 2022 23:48:49 +0100 Subject: [PATCH 57/76] Dump pipeline progress --- reframe/frontend/executors/policies.py | 45 ++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index c69e88196c..1db8a4b3ff 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -266,6 +266,35 @@ def __init__(self): } self.task_listeners.append(self) + def _init_pipeline_history(self, num_tasks): + self._pipeline_history = [ + { + 'startup': num_tasks, + 'ready_compile': 0, + 'compiling': 0, + 'ready_run': 0, + 'running': 0, + 'completing': 0, + 'retired': 0, + 'completed': 0, + 'fail': 0, + 'skip': 0 + } + ] + + def _update_pipeline_history(self, old_state, new_state, num_tasks=1): + prev_step = self._pipeline_history[-1] + next_step = {**prev_step} + next_step[old_state] -= num_tasks + next_step[new_state] += num_tasks + self._pipeline_history.append(next_step) + + def _dump_pipeline_history(self, filename): + import json + + with open(filename, 'w') as fp: + json.dump(self._pipeline_history, fp, indent=2) + def runcase(self, case): super().runcase(case) check, partition, environ = case @@ -285,6 +314,7 @@ def runcase(self, case): self._current_tasks.add(task) def exit(self): + self._init_pipeline_history(len(self._current_tasks)) while self._current_tasks: try: self._poll_tasks() @@ -296,13 +326,18 @@ def exit(self): 'general/pipeline_timeout' ) 
self._advance_all(self._current_tasks, timeout) + num_retired = len(self._retired_tasks) _cleanup_all(self._retired_tasks, not self.keep_stage_files) + self._update_pipeline_history('retired', 'completed', + num_retired) if num_running: self._pollctl.running_tasks(num_running).snooze() except ABORT_REASONS as e: self._failall(e) raise + self._dump_pipeline_history('pipeline-history.json') + def _poll_tasks(self): for partname, sched in self._schedulers.items(): jobs = [] @@ -349,12 +384,22 @@ def _advance_all(self, tasks, timeout=None): # We take a snapshot of the tasks to advance by doing a shallow copy, # since the tasks may removed by the individual advance functions. for t in list(tasks): + old_state = t.state bump_state = getattr(self, f'_advance_{t.state}') num_progressed += bump_state(t) + if t.failed: + new_state = 'fail' + elif t.skipped: + new_state = 'skip' + else: + new_state = t.state + t_elapsed = time.time() - t_init if timeout and t_elapsed > timeout and num_progressed: break + self._update_pipeline_history(old_state, new_state, 1) + getlogger().debug2(f'Bumped {num_progressed} test(s)') def _advance_startup(self, task): From d012d75545b9af3128230cf29e0108fb1c58661c Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Tue, 11 Jan 2022 23:59:21 +0100 Subject: [PATCH 58/76] Dump compact json --- reframe/frontend/executors/policies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 1db8a4b3ff..a09bf8a36a 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -293,7 +293,7 @@ def _dump_pipeline_history(self, filename): import json with open(filename, 'w') as fp: - json.dump(self._pipeline_history, fp, indent=2) + json.dump(self._pipeline_history, fp) def runcase(self, case): super().runcase(case) From 692aaf5625f0fc1a93d67a90961ab9ba6bf0a93c Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 
12 Jan 2022 15:27:57 +0100 Subject: [PATCH 59/76] Fix pipeline progress + plot script --- plot_pipeline_progress.py | 20 ++++++++ reframe/frontend/executors/policies.py | 65 ++++++++++++++------------ 2 files changed, 55 insertions(+), 30 deletions(-) create mode 100644 plot_pipeline_progress.py diff --git a/plot_pipeline_progress.py b/plot_pipeline_progress.py new file mode 100644 index 0000000000..577369579e --- /dev/null +++ b/plot_pipeline_progress.py @@ -0,0 +1,20 @@ +import json +import matplotlib.pyplot as plt +import sys + + +if __name__ == '__main__': + with open(sys.argv[1]) as fp: + raw_data = json.load(fp) + + for state, steps in raw_data.items(): + print(state, len(steps)) + + fig, ax = plt.subplots() + steps = range(len(raw_data['startup'])) + ax.stackplot(steps, raw_data.values(), labels=raw_data.keys(), alpha=0.8) + ax.legend(loc='upper left') + ax.set_title('Pipeline progress') + ax.set_xlabel('Step') + ax.set_ylabel('Number of tasks') + plt.show() diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index a09bf8a36a..9623045ca6 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -266,34 +266,39 @@ def __init__(self): } self.task_listeners.append(self) - def _init_pipeline_history(self, num_tasks): - self._pipeline_history = [ - { - 'startup': num_tasks, - 'ready_compile': 0, - 'compiling': 0, - 'ready_run': 0, - 'running': 0, - 'completing': 0, - 'retired': 0, - 'completed': 0, - 'fail': 0, - 'skip': 0 - } - ] - - def _update_pipeline_history(self, old_state, new_state, num_tasks=1): - prev_step = self._pipeline_history[-1] - next_step = {**prev_step} - next_step[old_state] -= num_tasks - next_step[new_state] += num_tasks - self._pipeline_history.append(next_step) - - def _dump_pipeline_history(self, filename): + def _init_pipeline_progress(self, num_tasks): + self._pipeline_progress = { + 'startup': [num_tasks], + 'ready_compile': [0], + 'compiling': [0], + 
'ready_run': [0], + 'running': [0], + 'completing': [0], + 'retired': [0], + 'completed': [0], + 'fail': [0], + 'skip': [0] + } + self._pipeline_step = 0 + + def _update_pipeline_progress(self, old_state, new_state, num_tasks=1): + for state in self._pipeline_progress: + count = self._pipeline_progress[state][self._pipeline_step] + if old_state != new_state: + if state == old_state: + count -= num_tasks + elif state == new_state: + count += num_tasks + + self._pipeline_progress[state].append(count) + + self._pipeline_step += 1 + + def _dump_pipeline_progress(self, filename): import json with open(filename, 'w') as fp: - json.dump(self._pipeline_history, fp) + json.dump(self._pipeline_progress, fp, indent=2) def runcase(self, case): super().runcase(case) @@ -314,7 +319,7 @@ def runcase(self, case): self._current_tasks.add(task) def exit(self): - self._init_pipeline_history(len(self._current_tasks)) + self._init_pipeline_progress(len(self._current_tasks)) while self._current_tasks: try: self._poll_tasks() @@ -328,15 +333,15 @@ def exit(self): self._advance_all(self._current_tasks, timeout) num_retired = len(self._retired_tasks) _cleanup_all(self._retired_tasks, not self.keep_stage_files) - self._update_pipeline_history('retired', 'completed', - num_retired) + self._update_pipeline_progress('retired', 'completed', + num_retired) if num_running: self._pollctl.running_tasks(num_running).snooze() except ABORT_REASONS as e: self._failall(e) raise - self._dump_pipeline_history('pipeline-history.json') + self._dump_pipeline_progress('pipeline-progress.json') def _poll_tasks(self): for partname, sched in self._schedulers.items(): @@ -398,7 +403,7 @@ def _advance_all(self, tasks, timeout=None): if timeout and t_elapsed > timeout and num_progressed: break - self._update_pipeline_history(old_state, new_state, 1) + self._update_pipeline_progress(old_state, new_state, 1) getlogger().debug2(f'Bumped {num_progressed} test(s)') From e2692297f7c793f5ff57c05183e3d7233c8e2c89 Mon Sep 
17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 12 Jan 2022 17:23:33 +0100 Subject: [PATCH 60/76] Add time support in pipeline progress --- plot_pipeline_progress.py | 30 +++++++++++++++++++++++--- reframe/frontend/executors/policies.py | 30 ++++++++++++++------------ 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/plot_pipeline_progress.py b/plot_pipeline_progress.py index 577369579e..9a59252e99 100644 --- a/plot_pipeline_progress.py +++ b/plot_pipeline_progress.py @@ -1,5 +1,6 @@ import json import matplotlib.pyplot as plt +import os import sys @@ -10,11 +11,34 @@ for state, steps in raw_data.items(): print(state, len(steps)) + try: + mode = sys.argv[2] + if mode not in ('steps', 'time'): + print(f'unknown mode: {mode}') + sys.exit(1) + except IndexError: + mode = 'steps' + + if mode == 'steps': + x_label = '# Steps' + x_values = range(len(raw_data['startup'])) + else: + x_label = 'Time (s)' + x_values = [x[1] for x in raw_data['startup']] + + y_values = [] + for x in raw_data.values(): + step_values = [s[0] for s in x] + y_values.append(step_values) + + print(x_values) + print(y_values) fig, ax = plt.subplots() - steps = range(len(raw_data['startup'])) - ax.stackplot(steps, raw_data.values(), labels=raw_data.keys(), alpha=0.8) + ax.stackplot(x_values, y_values, labels=raw_data.keys(), alpha=0.8) ax.legend(loc='upper left') ax.set_title('Pipeline progress') - ax.set_xlabel('Step') + ax.set_xlabel(x_label) ax.set_ylabel('Number of tasks') + figname = os.path.splitext(sys.argv[1])[0] + '.png' + plt.savefig(figname) plt.show() diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 9623045ca6..2abfcbc0ca 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -268,37 +268,39 @@ def __init__(self): def _init_pipeline_progress(self, num_tasks): self._pipeline_progress = { - 'startup': [num_tasks], - 'ready_compile': [0], - 'compiling': [0], - 'ready_run': [0], 
- 'running': [0], - 'completing': [0], - 'retired': [0], - 'completed': [0], - 'fail': [0], - 'skip': [0] + 'startup': [(num_tasks, 0)], + 'ready_compile': [(0, 0)], + 'compiling': [(0, 0)], + 'ready_run': [(0, 0)], + 'running': [(0, 0)], + 'completing': [(0, 0)], + 'retired': [(0, 0)], + 'completed': [(0, 0)], + 'fail': [(0, 0)], + 'skip': [(0, 0)] } self._pipeline_step = 0 + self._t_pipeline_start = time.time() def _update_pipeline_progress(self, old_state, new_state, num_tasks=1): + timestamp = time.time() - self._t_pipeline_start for state in self._pipeline_progress: - count = self._pipeline_progress[state][self._pipeline_step] + count = self._pipeline_progress[state][self._pipeline_step][0] if old_state != new_state: if state == old_state: count -= num_tasks elif state == new_state: count += num_tasks - self._pipeline_progress[state].append(count) + self._pipeline_progress[state].append((count, timestamp)) self._pipeline_step += 1 def _dump_pipeline_progress(self, filename): - import json + import reframe.utility.jsonext as jsonext with open(filename, 'w') as fp: - json.dump(self._pipeline_progress, fp, indent=2) + jsonext.dump(self._pipeline_progress, fp, indent=2) def runcase(self, case): super().runcase(case) From 341e2303b89c89b2375a0026d61e07e203671db1 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 12 Jan 2022 17:25:15 +0100 Subject: [PATCH 61/76] Remove stale prints --- plot_pipeline_progress.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/plot_pipeline_progress.py b/plot_pipeline_progress.py index 9a59252e99..ac027080cc 100644 --- a/plot_pipeline_progress.py +++ b/plot_pipeline_progress.py @@ -31,8 +31,6 @@ step_values = [s[0] for s in x] y_values.append(step_values) - print(x_values) - print(y_values) fig, ax = plt.subplots() ax.stackplot(x_values, y_values, labels=raw_data.keys(), alpha=0.8) ax.legend(loc='upper left') From 513418d8354bb75a5786c3a4ace2cbe211a33ec8 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 12 
Jan 2022 18:00:21 +0100 Subject: [PATCH 62/76] Change legend location --- plot_pipeline_progress.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/plot_pipeline_progress.py b/plot_pipeline_progress.py index ac027080cc..671d393ad0 100644 --- a/plot_pipeline_progress.py +++ b/plot_pipeline_progress.py @@ -32,8 +32,8 @@ y_values.append(step_values) fig, ax = plt.subplots() - ax.stackplot(x_values, y_values, labels=raw_data.keys(), alpha=0.8) - ax.legend(loc='upper left') + ax.stackplot(x_values, y_values, labels=raw_data.keys(), alpha=0.9) + ax.legend(loc='upper right') ax.set_title('Pipeline progress') ax.set_xlabel(x_label) ax.set_ylabel('Number of tasks') From ecf0ae203479b8d043c48f49913734d22bee463e Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 12 Jan 2022 18:07:31 +0100 Subject: [PATCH 63/76] Fix retired-completed bug --- reframe/frontend/executors/__init__.py | 6 ++++++ reframe/frontend/executors/policies.py | 12 +++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 216fa3e6a9..9749256e20 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -217,6 +217,12 @@ def failed(self): @property def state(self): + if self.failed: + return 'fail' + + if self.skipped: + return 'skip' + states = { 'startup': 'startup', 'setup': 'ready_compile', diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 2abfcbc0ca..75e54b0061 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -335,8 +335,11 @@ def exit(self): self._advance_all(self._current_tasks, timeout) num_retired = len(self._retired_tasks) _cleanup_all(self._retired_tasks, not self.keep_stage_files) + new_num_retired = len(self._retired_tasks) + # Some tests might not be cleaned up because they are waiting + # for dependencies or because their 
dependencies have failed. self._update_pipeline_progress('retired', 'completed', - num_retired) + num_retired - new_num_retired) if num_running: self._pollctl.running_tasks(num_running).snooze() except ABORT_REASONS as e: @@ -394,12 +397,7 @@ def _advance_all(self, tasks, timeout=None): old_state = t.state bump_state = getattr(self, f'_advance_{t.state}') num_progressed += bump_state(t) - if t.failed: - new_state = 'fail' - elif t.skipped: - new_state = 'skip' - else: - new_state = t.state + new_state = t.state t_elapsed = time.time() - t_init if timeout and t_elapsed > timeout and num_progressed: From 594fb91cb7c3026449545a60a794ade081bfd068 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 12 Jan 2022 19:04:28 +0100 Subject: [PATCH 64/76] Fix how pipeline timeout is read --- reframe/frontend/executors/policies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 75e54b0061..90c0186552 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -330,7 +330,7 @@ def exit(self): for t in self._current_tasks ) timeout = rt.runtime().get_option( - 'general/pipeline_timeout' + 'general/0/pipeline_timeout' ) self._advance_all(self._current_tasks, timeout) num_retired = len(self._retired_tasks) From f487031a08656a541314c644eda8ebca9dfa72ec Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 12 Jan 2022 19:05:59 +0100 Subject: [PATCH 65/76] Always convert pipeline timeout to float --- reframe/frontend/executors/policies.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 90c0186552..e6066069e8 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -329,9 +329,11 @@ def exit(self): 1 if t.state in ('running', 'compiling') else 0 for t in self._current_tasks ) - 
timeout = rt.runtime().get_option( + + # FIXME: Always convert due to #GH 2246 + timeout = float(rt.runtime().get_option( 'general/0/pipeline_timeout' - ) + )) self._advance_all(self._current_tasks, timeout) num_retired = len(self._retired_tasks) _cleanup_all(self._retired_tasks, not self.keep_stage_files) From 9e03263c12816120b55e482a8787d0a7cbef2347 Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 12 Jan 2022 19:08:04 +0100 Subject: [PATCH 66/76] Always convert pipeline timeout to float (correct fix) --- reframe/frontend/executors/policies.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index e6066069e8..4c8908f9ac 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -330,10 +330,14 @@ def exit(self): for t in self._current_tasks ) - # FIXME: Always convert due to #GH 2246 - timeout = float(rt.runtime().get_option( + timeout = rt.runtime().get_option( 'general/0/pipeline_timeout' - )) + ) + + # FIXME: Always convert due to #GH 2246 + if timeout is not None: + timeout = float(timeout) + self._advance_all(self._current_tasks, timeout) num_retired = len(self._retired_tasks) _cleanup_all(self._retired_tasks, not self.keep_stage_files) From 12ba66f54a225fe8dcd7e6e787cb20863ced1a8f Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Wed, 12 Jan 2022 19:26:12 +0100 Subject: [PATCH 67/76] Remove transparency in plot --- plot_pipeline_progress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plot_pipeline_progress.py b/plot_pipeline_progress.py index 671d393ad0..9e91805800 100644 --- a/plot_pipeline_progress.py +++ b/plot_pipeline_progress.py @@ -32,7 +32,7 @@ y_values.append(step_values) fig, ax = plt.subplots() - ax.stackplot(x_values, y_values, labels=raw_data.keys(), alpha=0.9) + ax.stackplot(x_values, y_values, labels=raw_data.keys(), alpha=1) ax.legend(loc='upper right') 
ax.set_title('Pipeline progress') ax.set_xlabel(x_label) From 7aa1c137a9f7d58ccd2aff928c28a87c2446f1c1 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Tue, 18 Jan 2022 20:39:20 +0100 Subject: [PATCH 68/76] Small fixes --- docs/config_reference.rst | 14 +++++++++++-- docs/pipeline.rst | 4 ++-- reframe/frontend/cli.py | 10 +++++++++ reframe/frontend/executors/policies.py | 29 ++++++++++++++++++-------- reframe/schemas/config.json | 2 ++ 5 files changed, 46 insertions(+), 13 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 40c9b1705f..5b8c1c273d 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -110,7 +110,7 @@ System Configuration :required: No :default: ``8`` - .. versionadded:: 3.9.3 + .. versionadded:: 3.10 .. js:attribute:: .systems[].modules_system @@ -1301,6 +1301,16 @@ General Configuration Timeout value in seconds used when checking if a git repository exists. +.. js:attribute:: .general[].pipeline_statistics + + A boolean value indicating whether we want to collect statistic information for the execution of the asynchronous pipeline. + + :required: No + :default: ``False`` + + .. versionadded:: 3.10 + + .. js:attribute:: .general[].pipeline_timeout Timeout in seconds for advancing the pipeline in the asynchronous execution policy. @@ -1311,7 +1321,7 @@ General Configuration :required: No :default: ``10`` - .. versionadded:: 3.9.3 + .. versionadded:: 3.10 .. js:attribute:: .general[].remote_detect diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 9224065c1c..309ca7c86e 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -157,7 +157,7 @@ There are a number of things to notice in this diagram: If the ``cleanup`` phase fails, the test is not marked as a failure, but this condition is marked as an error. -.. versionchanged:: 3.9.3 +.. versionchanged:: 3.10 The ``compile`` stage is now also executed asynchronously. 
@@ -190,7 +190,7 @@ For example, a test executing in the ReFrame context will not respect the :js:at To control the concurrency of the ReFrame execution context, users should set the :js:attr:`.systems[].max_local_jobs` option instead. -.. versionchanged:: 3.9.3 +.. versionchanged:: 3.10 Execution contexts were formalized. diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 4958f40f69..ed14e83e88 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -517,6 +517,13 @@ def main(): action='store_true', help='Use a compact test naming scheme' ) + argparser.add_argument( + dest='pipeline_statistics', + envvar='RFM_PIPELINE_STATISTICS', + configvar='general/pipeline_statistics', + action='store_true', + help='Gather statistics for the async execution' + ) argparser.add_argument( dest='pipeline_timeout', envvar='RFM_PIPELINE_TIMEOUT', @@ -1067,6 +1074,9 @@ def module_unuse(*paths): exec_policy.keep_stage_files = site_config.get( 'general/0/keep_stage_files' ) + exec_policy.pipeline_statistics = site_config.get( + 'general/0/pipeline_statistics' + ) try: errmsg = "invalid option for --flex-alloc-nodes: '{0}'" sched_flex_alloc_nodes = int(options.flex_alloc_nodes) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 4c8908f9ac..53d0d82e28 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -264,6 +264,7 @@ def __init__(self): self._max_jobs = { '_rfm_local': rt.runtime().get_option('systems/0/max_local_jobs') } + self._pipeline_statistics = rt.runtime().get_option('systems/0/pipeline_statistics') self.task_listeners.append(self) def _init_pipeline_progress(self, num_tasks): @@ -321,7 +322,9 @@ def runcase(self, case): self._current_tasks.add(task) def exit(self): - self._init_pipeline_progress(len(self._current_tasks)) + if self._pipeline_statistics: + self._init_pipeline_progress(len(self._current_tasks)) + while self._current_tasks: try: 
self._poll_tasks() @@ -339,20 +342,27 @@ def exit(self): timeout = float(timeout) self._advance_all(self._current_tasks, timeout) - num_retired = len(self._retired_tasks) + if self._pipeline_statistics: + num_retired = len(self._retired_tasks) + _cleanup_all(self._retired_tasks, not self.keep_stage_files) - new_num_retired = len(self._retired_tasks) - # Some tests might not be cleaned up because they are waiting - # for dependencies or because their dependencies have failed. - self._update_pipeline_progress('retired', 'completed', - num_retired - new_num_retired) + if self._pipeline_statistics: + new_num_retired = len(self._retired_tasks) + # Some tests might not be cleaned up because they are + # waiting for dependencies or because their dependencies + # have failed. + self._update_pipeline_progress( + 'retired', 'completed', num_retired - new_num_retired + ) + if num_running: self._pollctl.running_tasks(num_running).snooze() except ABORT_REASONS as e: self._failall(e) raise - self._dump_pipeline_progress('pipeline-progress.json') + if self._pipeline_statistics: + self._dump_pipeline_progress('pipeline-progress.json') def _poll_tasks(self): for partname, sched in self._schedulers.items(): @@ -409,7 +419,8 @@ def _advance_all(self, tasks, timeout=None): if timeout and t_elapsed > timeout and num_progressed: break - self._update_pipeline_progress(old_state, new_state, 1) + if self._pipeline_statistics: + self._update_pipeline_progress(old_state, new_state, 1) getlogger().debug2(f'Bumped {num_progressed} test(s)') diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 7c7ee2487a..7ef70bb315 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -465,6 +465,7 @@ "items": {"type": "string"} }, "non_default_craype": {"type": "boolean"}, + "pipeline_statistics": {"type": "boolean"}, "pipeline_timeout": {"type": ["number", "null"]}, "purge_environment": {"type": "boolean"}, "remote_detect": {"type": "boolean"}, @@ -499,6 +500,7 
@@ "environments/ldflags": [], "environments/extras": {}, "environments/target_systems": ["*"], + "general/pipeline_statistics": false, "general/pipeline_timeout": null, "general/check_search_path": ["${RFM_INSTALL_PREFIX}/checks/"], "general/check_search_recursive": false, From 1f3d53d8618ada015616a0a14eea03c707711493 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 19 Jan 2022 09:17:43 +0100 Subject: [PATCH 69/76] Change to full version --- docs/config_reference.rst | 6 +++--- docs/pipeline.rst | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 5b8c1c273d..02fc12f71c 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -110,7 +110,7 @@ System Configuration :required: No :default: ``8`` - .. versionadded:: 3.10 + .. versionadded:: 3.10.0 .. js:attribute:: .systems[].modules_system @@ -1308,7 +1308,7 @@ General Configuration :required: No :default: ``False`` - .. versionadded:: 3.10 + .. versionadded:: 3.10.0 .. js:attribute:: .general[].pipeline_timeout @@ -1321,7 +1321,7 @@ General Configuration :required: No :default: ``10`` - .. versionadded:: 3.10 + .. versionadded:: 3.10.0 .. js:attribute:: .general[].remote_detect diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 309ca7c86e..64cbe9c9d6 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -157,7 +157,7 @@ There are a number of things to notice in this diagram: If the ``cleanup`` phase fails, the test is not marked as a failure, but this condition is marked as an error. -.. versionchanged:: 3.10 +.. versionchanged:: 3.10.0 The ``compile`` stage is now also executed asynchronously. @@ -190,7 +190,7 @@ For example, a test executing in the ReFrame context will not respect the :js:at To control the concurrency of the ReFrame execution context, users should set the :js:attr:`.systems[].max_local_jobs` option instead. -.. versionchanged:: 3.10 +.. 
versionchanged:: 3.10.0 Execution contexts were formalized. From 437c82e8866fad4f85fd0bc0a3485f84a28520e7 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 19 Jan 2022 13:23:08 +0100 Subject: [PATCH 70/76] Small fixes --- plot_pipeline_progress.py | 6 +++--- reframe/frontend/cli.py | 3 --- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/plot_pipeline_progress.py b/plot_pipeline_progress.py index 9e91805800..b894bded95 100644 --- a/plot_pipeline_progress.py +++ b/plot_pipeline_progress.py @@ -33,10 +33,10 @@ fig, ax = plt.subplots() ax.stackplot(x_values, y_values, labels=raw_data.keys(), alpha=1) - ax.legend(loc='upper right') + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) ax.set_title('Pipeline progress') ax.set_xlabel(x_label) ax.set_ylabel('Number of tasks') - figname = os.path.splitext(sys.argv[1])[0] + '.png' - plt.savefig(figname) + figname = os.path.splitext(sys.argv[1])[0] + '_' + mode + '.png' + plt.savefig(figname, bbox_inches='tight') plt.show() diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index ed14e83e88..327502e032 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -1074,9 +1074,6 @@ def module_unuse(*paths): exec_policy.keep_stage_files = site_config.get( 'general/0/keep_stage_files' ) - exec_policy.pipeline_statistics = site_config.get( - 'general/0/pipeline_statistics' - ) try: errmsg = "invalid option for --flex-alloc-nodes: '{0}'" sched_flex_alloc_nodes = int(options.flex_alloc_nodes) From 7f3b4ad9c1e52b39ddbe312174da6e3033bdc519 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 19 Jan 2022 14:42:51 +0100 Subject: [PATCH 71/76] Split long line --- reframe/frontend/executors/policies.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 53d0d82e28..1108d6a0e8 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -264,7 
+264,9 @@ def __init__(self): self._max_jobs = { '_rfm_local': rt.runtime().get_option('systems/0/max_local_jobs') } - self._pipeline_statistics = rt.runtime().get_option('systems/0/pipeline_statistics') + self._pipeline_statistics = rt.runtime().get_option( + 'systems/0/pipeline_statistics' + ) self.task_listeners.append(self) def _init_pipeline_progress(self, num_tasks): From 9fa29bc4dfbe306343523784edf33c1836ebd3f0 Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 19 Jan 2022 14:44:46 +0100 Subject: [PATCH 72/76] Correct documentation --- docs/pipeline.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 64cbe9c9d6..e7cf037a92 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -178,7 +178,7 @@ The following table show in which context each pipeline stage executes: Pipeline Stage Execution Context ============== ================= *Setup* ReFrame - *Compile* ReFrame if :attr:`~reframe.core.pipeline.RegressionTest.build_locally` is :obj:`True`, partition otherwise. + *Compile* ReFrame if :attr:`~reframe.core.pipeline.RegressionTest.build_locally` or :attr:`~reframe.core.pipeline.RegressionTest.local` is :obj:`True` or if :option:`--force-local` is passed, partition otherwise. *Run* ReFrame if :attr:`~reframe.core.pipeline.RegressionTest.local` is :obj:`True` or if :option:`--force-local` is passed, partition otherwise. 
*Sanity* ReFrame *Performance* ReFrame From 95ba0bfa96e243c954dd237912642b9e0b5d58df Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Wed, 19 Jan 2022 16:33:18 +0100 Subject: [PATCH 73/76] Move plotting tool --- plot_pipeline_progress.py => tools/plot_pipeline_progress.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename plot_pipeline_progress.py => tools/plot_pipeline_progress.py (100%) diff --git a/plot_pipeline_progress.py b/tools/plot_pipeline_progress.py similarity index 100% rename from plot_pipeline_progress.py rename to tools/plot_pipeline_progress.py From ab2b0d07b1e5b6301437e5ff4ea8895419f41f1c Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 21 Jan 2022 16:11:41 +0100 Subject: [PATCH 74/76] Address PR comments --- docs/config_reference.rst | 5 +++-- reframe/frontend/cli.py | 8 ++++---- reframe/frontend/executors/policies.py | 3 ++- reframe/schemas/config.json | 4 ++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 02fc12f71c..6820840120 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -1301,9 +1301,10 @@ General Configuration Timeout value in seconds used when checking if a git repository exists. -.. js:attribute:: .general[].pipeline_statistics +.. js:attribute:: .general[].dump_pipeline_progress - A boolean value indicating whether we want to collect statistic information for the execution of the asynchronous pipeline. + Dump pipeline progress for the asynchronous execution policy in ``pipeline-progress.json``. + This option is meant for debug purposes only. 
:required: No :default: ``False`` diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 327502e032..7449157ad9 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -518,11 +518,11 @@ def main(): help='Use a compact test naming scheme' ) argparser.add_argument( - dest='pipeline_statistics', - envvar='RFM_PIPELINE_STATISTICS', - configvar='general/pipeline_statistics', + dest='dump_pipeline_progress', + envvar='RFM_DUMP_PIPELINE_PROGRESS', + configvar='general/dump_pipeline_progress', action='store_true', - help='Gather statistics for the async execution' + help='Dump progress information for the async execution' ) argparser.add_argument( dest='pipeline_timeout', diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 1108d6a0e8..7cae491396 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -265,7 +265,7 @@ def __init__(self): '_rfm_local': rt.runtime().get_option('systems/0/max_local_jobs') } self._pipeline_statistics = rt.runtime().get_option( - 'systems/0/pipeline_statistics' + 'systems/0/dump_pipeline_progress' ) self.task_listeners.append(self) @@ -350,6 +350,7 @@ def exit(self): _cleanup_all(self._retired_tasks, not self.keep_stage_files) if self._pipeline_statistics: new_num_retired = len(self._retired_tasks) + # Some tests might not be cleaned up because they are # waiting for dependencies or because their dependencies # have failed. 
diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index 7ef70bb315..045267f08b 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -465,7 +465,7 @@ "items": {"type": "string"} }, "non_default_craype": {"type": "boolean"}, - "pipeline_statistics": {"type": "boolean"}, + "dump_pipeline_progress": {"type": "boolean"}, "pipeline_timeout": {"type": ["number", "null"]}, "purge_environment": {"type": "boolean"}, "remote_detect": {"type": "boolean"}, @@ -500,7 +500,7 @@ "environments/ldflags": [], "environments/extras": {}, "environments/target_systems": ["*"], - "general/pipeline_statistics": false, + "general/dump_pipeline_progress": false, "general/pipeline_timeout": null, "general/check_search_path": ["${RFM_INSTALL_PREFIX}/checks/"], "general/check_search_recursive": false, From c389f3d10570cadc20e51e53774508d2b904373c Mon Sep 17 00:00:00 2001 From: Eirini Koutsaniti Date: Fri, 21 Jan 2022 16:17:59 +0100 Subject: [PATCH 75/76] Address PR comments --- reframe/frontend/executors/policies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index 7cae491396..f808e00113 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -349,13 +349,13 @@ def exit(self): _cleanup_all(self._retired_tasks, not self.keep_stage_files) if self._pipeline_statistics: - new_num_retired = len(self._retired_tasks) + num_retired_actual = len(self._retired_tasks) # Some tests might not be cleaned up because they are # waiting for dependencies or because their dependencies # have failed. 
self._update_pipeline_progress( - 'retired', 'completed', num_retired - new_num_retired + 'retired', 'completed', num_retired - num_retired_actual ) if num_running: From c616f5370c93a30a31e723cd8278952cc5296c5c Mon Sep 17 00:00:00 2001 From: Vasileios Karakasis Date: Fri, 21 Jan 2022 18:20:36 +0100 Subject: [PATCH 76/76] Address PR comments --- reframe/frontend/executors/policies.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index f808e00113..aab069501d 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -349,13 +349,13 @@ def exit(self): _cleanup_all(self._retired_tasks, not self.keep_stage_files) if self._pipeline_statistics: - num_retired_actual = len(self._retired_tasks) + num_retired_actual = num_retired - len(self._retired_tasks) # Some tests might not be cleaned up because they are # waiting for dependencies or because their dependencies # have failed. self._update_pipeline_progress( - 'retired', 'completed', num_retired - num_retired_actual + 'retired', 'completed', num_retired_actual ) if num_running: