From a72c2b0c410575011c9bbe2a3fbbdaa942c05dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jan=C3=9Fen?= Date: Sun, 27 Jul 2025 08:36:55 +0200 Subject: [PATCH 01/83] Move scheduler to standalone --- executorlib/standalone/scheduler.py | 66 ++++++++++++++++++ .../task_scheduler/file/queue_spawner.py | 67 +------------------ .../task_scheduler/file/task_scheduler.py | 6 +- tests/test_fluxclusterexecutor.py | 3 +- tests/test_interactive_slurmspawner.py | 8 +-- 5 files changed, 77 insertions(+), 73 deletions(-) create mode 100644 executorlib/standalone/scheduler.py diff --git a/executorlib/standalone/scheduler.py b/executorlib/standalone/scheduler.py new file mode 100644 index 00000000..27668c13 --- /dev/null +++ b/executorlib/standalone/scheduler.py @@ -0,0 +1,66 @@ +import contextlib +import subprocess +from typing import Optional, Union + +from pysqa import QueueAdapter + + + +def terminate_with_pysqa( + queue_id: int, + config_directory: Optional[str] = None, + backend: Optional[str] = None, +): + """ + Delete job from queuing system + + Args: + queue_id (int): Queuing system ID of the job to delete. + config_directory (str, optional): path to the config directory. + backend (str, optional): name of the backend used to spawn tasks ["slurm", "flux"]. + """ + qa = QueueAdapter( + directory=config_directory, + queue_type=backend, + execute_command=pysqa_execute_command, + ) + status = qa.get_status_of_job(process_id=queue_id) + if status is not None and status not in ["finished", "error"]: + with contextlib.suppress(subprocess.CalledProcessError): + qa.delete_job(process_id=queue_id) + + +def pysqa_execute_command( + commands: str, + working_directory: Optional[str] = None, + split_output: bool = True, + shell: bool = False, + error_filename: str = "pysqa.err", +) -> Union[str, list[str]]: + """ + A wrapper around the subprocess.check_output function. Modified from pysqa to raise an exception if the subprocess + fails to submit the job to the queue. + + Args: + commands (str): The command(s) to be executed on the command line + working_directory (str, optional): The directory where the command is executed. Defaults to None. + split_output (bool, optional): Boolean flag to split newlines in the output. Defaults to True. + shell (bool, optional): Additional switch to convert commands to a single string. Defaults to False. + error_filename (str, optional): In case the execution fails, the output is written to this file. Defaults to "pysqa.err". + + Returns: + Union[str, List[str]]: Output of the shell command either as a string or as a list of strings + """ + if shell and isinstance(commands, list): + commands = " ".join(commands) + out = subprocess.check_output( + commands, + cwd=working_directory, + stderr=subprocess.STDOUT, + universal_newlines=True, + shell=not isinstance(commands, list), + ) + if out is not None and split_output: + return out.split("\n") + else: + return out diff --git a/executorlib/task_scheduler/file/queue_spawner.py b/executorlib/task_scheduler/file/queue_spawner.py index 16dff14f..3cd55587 100644 --- a/executorlib/task_scheduler/file/queue_spawner.py +++ b/executorlib/task_scheduler/file/queue_spawner.py @@ -1,11 +1,10 @@ -import contextlib import os -import subprocess -from typing import Optional, Union +from typing import Optional from pysqa import QueueAdapter from executorlib.standalone.inputcheck import check_file_exists +from executorlib.standalone.scheduler import terminate_with_pysqa, pysqa_execute_command from executorlib.task_scheduler.file.hdf import dump, get_queue_id @@ -43,7 +42,7 @@ def execute_with_pysqa( qa = QueueAdapter( directory=config_directory, queue_type=backend, - execute_command=_pysqa_execute_command, + execute_command=pysqa_execute_command, ) queue_id = get_queue_id(file_name=file_name) if os.path.exists(file_name) and ( @@ -91,30 +90,6 @@ def execute_with_pysqa( return queue_id -def terminate_with_pysqa( - queue_id: int, - config_directory: Optional[str] = None, - backend: Optional[str] = None, -): - """ - Delete job from queuing system - - Args: - queue_id (int): Queuing system ID of the job to delete. - config_directory (str, optional): path to the config directory. - backend (str, optional): name of the backend used to spawn tasks ["slurm", "flux"]. - """ - qa = QueueAdapter( - directory=config_directory, - queue_type=backend, - execute_command=_pysqa_execute_command, - ) - status = qa.get_status_of_job(process_id=queue_id) - if status is not None and status not in ["finished", "error"]: - with contextlib.suppress(subprocess.CalledProcessError): - qa.delete_job(process_id=queue_id) - - def terminate_tasks_in_cache( cache_directory: str, config_directory: Optional[str] = None, @@ -140,39 +115,3 @@ def terminate_tasks_in_cache( config_directory=config_directory, backend=backend, ) - - -def _pysqa_execute_command( - commands: str, - working_directory: Optional[str] = None, - split_output: bool = True, - shell: bool = False, - error_filename: str = "pysqa.err", -) -> Union[str, list[str]]: - """ - A wrapper around the subprocess.check_output function. Modified from pysqa to raise an exception if the subprocess - fails to submit the job to the queue. - - Args: - commands (str): The command(s) to be executed on the command line - working_directory (str, optional): The directory where the command is executed. Defaults to None. - split_output (bool, optional): Boolean flag to split newlines in the output. Defaults to True. - shell (bool, optional): Additional switch to convert commands to a single string. Defaults to False. - error_filename (str, optional): In case the execution fails, the output is written to this file. Defaults to "pysqa.err". - - Returns: - Union[str, List[str]]: Output of the shell command either as a string or as a list of strings - """ - if shell and isinstance(commands, list): - commands = " ".join(commands) - out = subprocess.check_output( - commands, - cwd=working_directory, - stderr=subprocess.STDOUT, - universal_newlines=True, - shell=not isinstance(commands, list), - ) - if out is not None and split_output: - return out.split("\n") - else: - return out diff --git a/executorlib/task_scheduler/file/task_scheduler.py b/executorlib/task_scheduler/file/task_scheduler.py index fe719d8b..47bcda04 100644 --- a/executorlib/task_scheduler/file/task_scheduler.py +++ b/executorlib/task_scheduler/file/task_scheduler.py @@ -17,10 +17,8 @@ ) try: - from executorlib.task_scheduler.file.queue_spawner import ( - execute_with_pysqa, - terminate_with_pysqa, - ) + from executorlib.standalone.scheduler import terminate_with_pysqa + from executorlib.task_scheduler.file.queue_spawner import execute_with_pysqa except ImportError: # If pysqa is not available fall back to executing tasks in a subprocess execute_with_pysqa = execute_in_subprocess # type: ignore diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 51b18500..27645d86 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -11,7 +11,8 @@ try: import flux.job from executorlib.task_scheduler.file.hdf import dump - from executorlib.task_scheduler.file.queue_spawner import terminate_with_pysqa, terminate_tasks_in_cache, execute_with_pysqa + from executorlib.task_scheduler.file.queue_spawner import terminate_tasks_in_cache, execute_with_pysqa + from executorlib.standalone.scheduler import terminate_with_pysqa skip_flux_test = "FLUX_URI" not in os.environ pmi = os.environ.get("EXECUTORLIB_PMIX", None) diff --git a/tests/test_interactive_slurmspawner.py b/tests/test_interactive_slurmspawner.py index a0af5b67..2617b9e9 100644 --- a/tests/test_interactive_slurmspawner.py +++ b/tests/test_interactive_slurmspawner.py @@ -2,7 +2,7 @@ from executorlib.task_scheduler.interactive.slurmspawner import generate_slurm_command try: - from executorlib.task_scheduler.file.queue_spawner import _pysqa_execute_command + from executorlib.standalone.scheduler import pysqa_execute_command skip_pysqa_test = False except ImportError: @@ -14,7 +14,7 @@ ) class TestPysqaExecuteCommand(unittest.TestCase): def test_pysqa_execute_command_list(self): - out = _pysqa_execute_command( + out = pysqa_execute_command( commands=["echo", "test"], working_directory=None, split_output=True, @@ -25,7 +25,7 @@ def test_pysqa_execute_command_list(self): self.assertEqual("test", out[0]) def test_pysqa_execute_command_string(self): - out = _pysqa_execute_command( + out = pysqa_execute_command( commands="echo test", working_directory=None, split_output=False, @@ -37,7 +37,7 @@ def test_pysqa_execute_command_string(self): def test_pysqa_execute_command_fail(self): with self.assertRaises(FileNotFoundError): - _pysqa_execute_command( + pysqa_execute_command( commands=["no/executable/available"], working_directory=None, split_output=True, From 2ad819ead91729f87e6f3d1678a1b56d16085230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jan=C3=9Fen?= Date: Sun, 27 Jul 2025 10:12:01 +0200 Subject: [PATCH 02/83] fix subprocess spawner docstring --- executorlib/standalone/interactive/spawner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/standalone/interactive/spawner.py b/executorlib/standalone/interactive/spawner.py index 72f98cfb..85f92218 100644 --- a/executorlib/standalone/interactive/spawner.py +++ b/executorlib/standalone/interactive/spawner.py @@ -73,7 +73,7 @@ def __init__( cwd (str, optional): The current working directory. Defaults to None. cores (int, optional): The number of cores to use. Defaults to 1. threads_per_core (int, optional): The number of threads per core. Defaults to 1. - oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False. + openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False. """ super().__init__( cwd=cwd, From fc5f1991fbf7f947185302e6e732bf8c8b9796aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jan=C3=9Fen?= Date: Sun, 27 Jul 2025 10:53:39 +0200 Subject: [PATCH 03/83] file executor fix parallel execution --- executorlib/standalone/command.py | 29 ++++++++++++++++++----- executorlib/task_scheduler/file/shared.py | 1 + 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/executorlib/standalone/command.py b/executorlib/standalone/command.py index aa396caa..0f6835b6 100644 --- a/executorlib/standalone/command.py +++ b/executorlib/standalone/command.py @@ -1,6 +1,7 @@ import importlib.util import os import sys +from typing import Optional def get_command_path(executable: str) -> str: @@ -16,24 +17,40 @@ def get_command_path(executable: str) -> str: return os.path.abspath(os.path.join(__file__, "..", "..", "backend", executable)) -def get_cache_execute_command(file_name: str, cores: int = 1) -> list: +def get_cache_execute_command(file_name: str, cores: int = 1, backend: Optional[str] = None) -> list: """ Get command to call backend as a list of two strings Args: file_name (str): The name of the file. cores (int, optional): Number of cores used to execute the task. Defaults to 1. + backend (str, optional): name of the backend used to spawn tasks ["slurm", "flux"]. Returns: list[str]: List of strings containing the python executable path and the backend script to execute """ command_lst = [sys.executable] if cores > 1 and importlib.util.find_spec("mpi4py") is not None: - command_lst = ( - ["mpiexec", "-n", str(cores)] - + command_lst - + [get_command_path(executable="cache_parallel.py"), file_name] - ) + if backend is None: + command_lst = ( + ["mpiexec", "-n", str(cores)] + + command_lst + + [get_command_path(executable="cache_parallel.py"), file_name] + ) + elif backend == "slurm": + command_lst = ( + ["srun", "-n", str(cores)] + + command_lst + + [get_command_path(executable="cache_parallel.py"), file_name] + ) + elif backend == "flux": + command_lst = ( + ["flux", "run", "-n", str(cores)] + + command_lst + + [get_command_path(executable="cache_parallel.py"), file_name] + ) + else: + raise ValueError("backend should be None, slurm or flux, not {}".format(backend)) elif cores > 1: raise ImportError( "mpi4py is required for parallel calculations. Please install mpi4py." diff --git a/executorlib/task_scheduler/file/shared.py b/executorlib/task_scheduler/file/shared.py index 0c5ac882..5d8a90f9 100644 --- a/executorlib/task_scheduler/file/shared.py +++ b/executorlib/task_scheduler/file/shared.py @@ -154,6 +154,7 @@ def execute_tasks_h5( command=get_cache_execute_command( file_name=file_name, cores=task_resource_dict["cores"], + backend=backend, ), file_name=file_name, data_dict=data_dict, From 293adc32ddb2a0ba337b20b79e4d493b31c60983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jan=C3=9Fen?= Date: Sun, 27 Jul 2025 11:07:19 +0200 Subject: [PATCH 04/83] add command tests --- tests/test_standalone_command.py | 57 ++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 tests/test_standalone_command.py diff --git a/tests/test_standalone_command.py b/tests/test_standalone_command.py new file mode 100644 index 00000000..f89821d8 --- /dev/null +++ b/tests/test_standalone_command.py @@ -0,0 +1,57 @@ +import sys +from unittest import TestCase +from executorlib.standalone.command import get_cache_execute_command, get_interactive_execute_command + + +class TestCommands(TestCase): + def test_get_interactive_execute_command_serial(self): + output = get_interactive_execute_command(cores=1) + self.assertEqual(output[0], sys.executable) + self.assertEqual(output[1].split("/")[-1], "interactive_serial.py") + + def test_get_interactive_execute_command_parallel(self): + output = get_interactive_execute_command(cores=2) + self.assertEqual(output[0], sys.executable) + self.assertEqual(output[1].split("/")[-1], "interactive_parallel.py") + + def test_get_cache_execute_command_serial(self): + file_name = "test.txt" + output = get_cache_execute_command(cores=1, file_name=file_name) + self.assertEqual(output[0], sys.executable) + self.assertEqual(output[1].split("/")[-1], "cache_serial.py") + self.assertEqual(output[2], file_name) + output = get_cache_execute_command(cores=1, file_name=file_name, backend="slurm") + self.assertEqual(output[0], sys.executable) + self.assertEqual(output[1].split("/")[-1], "cache_serial.py") + self.assertEqual(output[2], file_name) + output = get_cache_execute_command(cores=1, file_name=file_name, backend="flux") + self.assertEqual(output[0], sys.executable) + self.assertEqual(output[1].split("/")[-1], "cache_serial.py") + self.assertEqual(output[2], file_name) + + def test_get_cache_execute_command_parallel(self): + file_name = "test.txt" + output = get_cache_execute_command(cores=2, file_name=file_name) + self.assertEqual(output[0], "mpiexec") + self.assertEqual(output[1], "-n") + self.assertEqual(output[2], str(2)) + self.assertEqual(output[3], sys.executable) + self.assertEqual(output[4].split("/")[-1], "cache_parallel.py") + self.assertEqual(output[5], file_name) + output = get_cache_execute_command(cores=2, file_name=file_name, backend="slurm") + self.assertEqual(output[0], "srun") + self.assertEqual(output[1], "-n") + self.assertEqual(output[2], str(2)) + self.assertEqual(output[3], sys.executable) + self.assertEqual(output[4].split("/")[-1], "cache_parallel.py") + self.assertEqual(output[5], file_name) + output = get_cache_execute_command(cores=2, file_name=file_name, backend="flux") + self.assertEqual(output[0], "flux") + self.assertEqual(output[1], "run") + self.assertEqual(output[2], "-n") + self.assertEqual(output[3], str(2)) + self.assertEqual(output[4], sys.executable) + self.assertEqual(output[5].split("/")[-1], "cache_parallel.py") + self.assertEqual(output[6], file_name) + with self.assertRaises(ValueError): + get_cache_execute_command(cores=2, file_name=file_name, backend="test") From 07e84092e5d167569a09da68467bad9cac0042ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jan=C3=9Fen?= Date: Sun, 27 Jul 2025 11:08:44 +0200 Subject: [PATCH 05/83] move slurm command to standalone --- executorlib/standalone/slurm_command.py | 48 +++++++++++++++++++ .../interactive/slurmspawner.py | 47 +----------------- tests/test_interactive_slurmspawner.py | 2 +- 3 files changed, 50 insertions(+), 47 deletions(-) create mode 100644 executorlib/standalone/slurm_command.py diff --git a/executorlib/standalone/slurm_command.py b/executorlib/standalone/slurm_command.py new file mode 100644 index 00000000..7b390b48 --- /dev/null +++ b/executorlib/standalone/slurm_command.py @@ -0,0 +1,48 @@ +from typing import Optional + + +SLURM_COMMAND = "srun" + + +def generate_slurm_command( + cores: int, + cwd: Optional[str], + threads_per_core: int = 1, + gpus_per_core: int = 0, + num_nodes: Optional[int] = None, + exclusive: bool = False, + openmpi_oversubscribe: bool = False, + slurm_cmd_args: Optional[list[str]] = None, +) -> list[str]: + """ + Generate the command list for the SLURM interface. + + Args: + cores (int): The number of cores. + cwd (str): The current working directory. + threads_per_core (int, optional): The number of threads per core. Defaults to 1. + gpus_per_core (int, optional): The number of GPUs per core. Defaults to 0. + num_nodes (int, optional): The number of compute nodes to use for executing the task. Defaults to None. + exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing compute notes. Defaults to False. + openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False. + slurm_cmd_args (list[str], optional): Additional command line arguments. Defaults to []. + + Returns: + list[str]: The generated command list. + """ + command_prepend_lst = [SLURM_COMMAND, "-n", str(cores)] + if cwd is not None: + command_prepend_lst += ["-D", cwd] + if num_nodes is not None: + command_prepend_lst += ["-N", str(num_nodes)] + if threads_per_core > 1: + command_prepend_lst += ["--cpus-per-task=" + str(threads_per_core)] + if gpus_per_core > 0: + command_prepend_lst += ["--gpus-per-task=" + str(gpus_per_core)] + if exclusive: + command_prepend_lst += ["--exact"] + if openmpi_oversubscribe: + command_prepend_lst += ["--oversubscribe"] + if slurm_cmd_args is not None and len(slurm_cmd_args) > 0: + command_prepend_lst += slurm_cmd_args + return command_prepend_lst diff --git a/executorlib/task_scheduler/interactive/slurmspawner.py b/executorlib/task_scheduler/interactive/slurmspawner.py index 8426012d..309c43d9 100644 --- a/executorlib/task_scheduler/interactive/slurmspawner.py +++ b/executorlib/task_scheduler/interactive/slurmspawner.py @@ -2,8 +2,7 @@ from typing import Optional from executorlib.standalone.interactive.spawner import SubprocessSpawner - -SLURM_COMMAND = "srun" +from executorlib.standalone.slurm_command import generate_slurm_command def validate_max_workers(max_workers: int, cores: int, threads_per_core: int): @@ -79,47 +78,3 @@ def generate_command(self, command_lst: list[str]) -> list[str]: return super().generate_command( command_lst=command_prepend_lst + command_lst, ) - - -def generate_slurm_command( - cores: int, - cwd: Optional[str], - threads_per_core: int = 1, - gpus_per_core: int = 0, - num_nodes: Optional[int] = None, - exclusive: bool = False, - openmpi_oversubscribe: bool = False, - slurm_cmd_args: Optional[list[str]] = None, -) -> list[str]: - """ - Generate the command list for the SLURM interface. - - Args: - cores (int): The number of cores. - cwd (str): The current working directory. - threads_per_core (int, optional): The number of threads per core. Defaults to 1. - gpus_per_core (int, optional): The number of GPUs per core. Defaults to 0. - num_nodes (int, optional): The number of compute nodes to use for executing the task. Defaults to None. - exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing compute notes. Defaults to False. - openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False. - slurm_cmd_args (list[str], optional): Additional command line arguments. Defaults to []. - - Returns: - list[str]: The generated command list. - """ - command_prepend_lst = [SLURM_COMMAND, "-n", str(cores)] - if cwd is not None: - command_prepend_lst += ["-D", cwd] - if num_nodes is not None: - command_prepend_lst += ["-N", str(num_nodes)] - if threads_per_core > 1: - command_prepend_lst += ["--cpus-per-task=" + str(threads_per_core)] - if gpus_per_core > 0: - command_prepend_lst += ["--gpus-per-task=" + str(gpus_per_core)] - if exclusive: - command_prepend_lst += ["--exact"] - if openmpi_oversubscribe: - command_prepend_lst += ["--oversubscribe"] - if slurm_cmd_args is not None and len(slurm_cmd_args) > 0: - command_prepend_lst += slurm_cmd_args - return command_prepend_lst diff --git a/tests/test_interactive_slurmspawner.py b/tests/test_interactive_slurmspawner.py index 2617b9e9..bb04ad34 100644 --- a/tests/test_interactive_slurmspawner.py +++ b/tests/test_interactive_slurmspawner.py @@ -1,5 +1,5 @@ import unittest -from executorlib.task_scheduler.interactive.slurmspawner import generate_slurm_command +from executorlib.standalone.slurm_command import generate_slurm_command try: from executorlib.standalone.scheduler import pysqa_execute_command From 12208680c1d1331e8cb1dc0d7a333038aa5f588c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jan=C3=9Fen?= Date: Sun, 27 Jul 2025 11:12:48 +0200 Subject: [PATCH 06/83] implement spawner for pysqa --- executorlib/task_scheduler/worker/__init__.py | 0 executorlib/task_scheduler/worker/spawner.py | 114 ++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 executorlib/task_scheduler/worker/__init__.py create mode 100644 executorlib/task_scheduler/worker/spawner.py diff --git a/executorlib/task_scheduler/worker/__init__.py b/executorlib/task_scheduler/worker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/executorlib/task_scheduler/worker/spawner.py b/executorlib/task_scheduler/worker/spawner.py new file mode 100644 index 00000000..269ab64d --- /dev/null +++ b/executorlib/task_scheduler/worker/spawner.py @@ -0,0 +1,114 @@ +from typing import Optional + +from pysqa import QueueAdapter + +from executorlib.standalone.interactive.spawner import BaseSpawner +from executorlib.standalone.scheduler import pysqa_execute_command, terminate_with_pysqa + + +class PysqaSpawner(BaseSpawner): + def __init__( + self, + cwd: Optional[str] = None, + cores: int = 1, + openmpi_oversubscribe: bool = False, + threads_per_core: int = 1, + config_directory: Optional[str] = None, + backend: Optional[str] = None, + submission_kwargs: Optional[dict] = None, + ): + """ + Subprocess interface implementation. + + Args: + cwd (str, optional): The current working directory. Defaults to None. + cores (int, optional): The number of cores to use. Defaults to 1. + threads_per_core (int, optional): The number of threads per core. Defaults to 1. + openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False. + """ + super().__init__( + cwd=cwd, + cores=cores, + openmpi_oversubscribe=openmpi_oversubscribe, + ) + self._process: Optional[int] = None + self._threads_per_core = threads_per_core + self._config_directory = config_directory + self._backend = backend + self._submission_kwargs = submission_kwargs + + def bootup( + self, + command_lst: list[str], + ): + """ + Method to start the subprocess interface. + + Args: + command_lst (list[str]): The command list to execute. + """ + qa = QueueAdapter( + directory=self._config_directory, + queue_type=self._backend, + execute_command=pysqa_execute_command, + ) + self._process = qa.submit_job( + command=" ".join(self.generate_command(command_lst=command_lst)), + working_directory=self._cwd, + cores=self._cores, + **self._submission_kwargs, + ) + + def generate_command(self, command_lst: list[str]) -> list[str]: + """ + Method to generate the command list. + + Args: + command_lst (list[str]): The command list. + + Returns: + list[str]: The generated command list. + """ + if self._cores > 1 and self._backend is None: + command_prepend = ["mpiexec", "-n", str(self._cores)] + elif self._cores > 1 and self._backend == "slurm": + command_prepend = ["srun", "-n", str(self._cores)] + elif self._cores > 1 and self._backend == "flux": + command_prepend = ["flux", "run", "-n", str(self._cores)] + elif self._cores > 1: + raise ValueError("backend should be None, slurm or flux, not {}".format(self._backend)) + else: + command_prepend = [] + return command_prepend + command_lst + + def shutdown(self, wait: bool = True): + """ + Method to shutdown the subprocess interface. + + Args: + wait (bool, optional): Whether to wait for the interface to shutdown. Defaults to True. + """ + if self._process is not None: + terminate_with_pysqa( + queue_id=self._process, + config_directory=self._config_directory, + backend=self._backend, + ) + self._process = None + + def poll(self) -> bool: + """ + Method to check if the subprocess interface is running. + + Returns: + bool: True if the interface is running, False otherwise. + """ + qa = QueueAdapter( + directory=self._config_directory, + queue_type=self._backend, + execute_command=pysqa_execute_command, + ) + if self._process is not None: + return qa.get_status_of_job(process_id=self._process) in ["running", "pending"] + else: + return False From 62c4c9173192e7c0835cfd4a02ecfe2d6e122e2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Jan=C3=9Fen?= Date: Sun, 27 Jul 2025 21:36:31 +0200 Subject: [PATCH 07/83] transfer changes --- executorlib/standalone/slurm_command.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/executorlib/standalone/slurm_command.py b/executorlib/standalone/slurm_command.py index 7b390b48..4816625c 100644 --- a/executorlib/standalone/slurm_command.py +++ b/executorlib/standalone/slurm_command.py @@ -13,6 +13,7 @@ def generate_slurm_command( exclusive: bool = False, openmpi_oversubscribe: bool = False, slurm_cmd_args: Optional[list[str]] = None, + pmi_mode: Optional[str] = None, ) -> list[str]: """ Generate the command list for the SLURM interface. @@ -26,6 +27,7 @@ def generate_slurm_command( exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing compute notes. Defaults to False. openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False. slurm_cmd_args (list[str], optional): Additional command line arguments. Defaults to []. + pmi_mode (str): PMI interface to use (OpenMPI v5 requires pmix) default is None Returns: list[str]: The generated command list. @@ -33,6 +35,8 @@ def generate_slurm_command( command_prepend_lst = [SLURM_COMMAND, "-n", str(cores)] if cwd is not None: command_prepend_lst += ["-D", cwd] + if pmi_mode is not None: + command_prepend_lst += ["--mpi=" + pmi_mode] if num_nodes is not None: command_prepend_lst += ["-N", str(num_nodes)] if threads_per_core > 1: @@ -45,4 +49,4 @@ def generate_slurm_command( command_prepend_lst += ["--oversubscribe"] if slurm_cmd_args is not None and len(slurm_cmd_args) > 0: command_prepend_lst += slurm_cmd_args - return command_prepend_lst + return command_prepend_lst \ No newline at end of file From 778658569e7283c23433112a70cff0f08896bdce Mon Sep 17 00:00:00 2001 From: pyiron-runner Date: Sun, 27 Jul 2025 21:15:05 +0000 Subject: [PATCH 08/83] Format black --- executorlib/standalone/slurm_command.py | 2 +- executorlib/task_scheduler/worker/spawner.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/executorlib/standalone/slurm_command.py b/executorlib/standalone/slurm_command.py index 4816625c..3d9ca47d 100644 --- a/executorlib/standalone/slurm_command.py +++ b/executorlib/standalone/slurm_command.py @@ -49,4 +49,4 @@ def generate_slurm_command( command_prepend_lst += ["--oversubscribe"] if slurm_cmd_args is not None and len(slurm_cmd_args) > 0: command_prepend_lst += slurm_cmd_args - return command_prepend_lst \ No newline at end of file + return command_prepend_lst diff --git a/executorlib/task_scheduler/worker/spawner.py b/executorlib/task_scheduler/worker/spawner.py index 269ab64d..30328141 100644 --- a/executorlib/task_scheduler/worker/spawner.py +++ b/executorlib/task_scheduler/worker/spawner.py @@ -76,7 +76,9 @@ def generate_command(self, command_lst: list[str]) -> list[str]: elif self._cores > 1 and self._backend == "flux": command_prepend = ["flux", "run", "-n", str(self._cores)] elif self._cores > 1: - raise ValueError("backend should be None, slurm or flux, not {}".format(self._backend)) + raise ValueError( + "backend should be None, slurm or flux, not {}".format(self._backend) + ) else: command_prepend = [] return command_prepend + command_lst @@ -109,6 +111,9 @@ def poll(self) -> bool: execute_command=pysqa_execute_command, ) if self._process is not None: - return qa.get_status_of_job(process_id=self._process) in ["running", "pending"] + return qa.get_status_of_job(process_id=self._process) in [ + "running", + "pending", + ] else: return False From 36b0b47651a5d4ca22acf2bc49c1e15fecdef540 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 27 Jul 2025 21:16:01 +0000 Subject: [PATCH 09/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/standalone/slurm_command.py | 1 - executorlib/task_scheduler/worker/spawner.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/executorlib/standalone/slurm_command.py b/executorlib/standalone/slurm_command.py index 3d9ca47d..f9f4f8db 100644 --- a/executorlib/standalone/slurm_command.py +++ b/executorlib/standalone/slurm_command.py @@ -1,6 +1,5 @@ from typing import Optional - SLURM_COMMAND = "srun" diff --git a/executorlib/task_scheduler/worker/spawner.py b/executorlib/task_scheduler/worker/spawner.py index 30328141..af615126 100644 --- a/executorlib/task_scheduler/worker/spawner.py +++ b/executorlib/task_scheduler/worker/spawner.py @@ -77,7 +77,7 @@ def generate_command(self, command_lst: list[str]) -> list[str]: command_prepend = ["flux", "run", "-n", str(self._cores)] elif self._cores > 1: raise ValueError( - "backend should be None, slurm or flux, not {}".format(self._backend) + f"backend should be None, slurm or flux, not {self._backend}" ) else: command_prepend = [] From 1cc704414aad5b64be20d35a0240a58bc8b4f23c Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 11:20:06 +0200 Subject: [PATCH 10/83] block_allocation --- executorlib/executor/flux.py | 58 +++++++++----- executorlib/executor/slurm.py | 59 +++++++++----- .../pysqaspawner.py} | 76 ++++++++++++++++++- 3 files changed, 149 insertions(+), 44 deletions(-) rename executorlib/task_scheduler/{worker/spawner.py => interactive/pysqaspawner.py} (59%) diff --git a/executorlib/executor/flux.py b/executorlib/executor/flux.py index 864548d6..e9016576 100644 --- a/executorlib/executor/flux.py +++ b/executorlib/executor/flux.py @@ -357,28 +357,46 @@ def __init__( if not plot_dependency_graph: import pysqa # noqa - from executorlib.task_scheduler.file.task_scheduler import ( - create_file_executor, - ) + if block_allocation: + from executorlib.task_scheduler.interactive.pysqaspawner import create_pysqa_block_allocation_scheduler + + super().__init__( + executor=create_pysqa_block_allocation_scheduler( + max_cores=max_cores, + cache_directory=cache_directory, + hostname_localhost=hostname_localhost, + log_obj_size=log_obj_size, + pmi_mode=pmi_mode, + init_function=init_function, + max_workers=max_workers, + resource_dict=resource_dict, + pysqa_config_directory=pysqa_config_directory, + backend="flux", + ) + ) + else: + from executorlib.task_scheduler.file.task_scheduler import ( + create_file_executor, + ) - super().__init__( - executor=create_file_executor( - max_workers=max_workers, - backend="flux", - max_cores=max_cores, - cache_directory=cache_directory, - resource_dict=resource_dict, - flux_executor=None, - pmi_mode=pmi_mode, - flux_executor_nesting=False, - flux_log_files=False, - pysqa_config_directory=pysqa_config_directory, - hostname_localhost=hostname_localhost, - block_allocation=block_allocation, - init_function=init_function, - disable_dependencies=disable_dependencies, + super().__init__( + executor=create_file_executor( + max_workers=max_workers, + backend="flux", + max_cores=max_cores, + cache_directory=cache_directory, + resource_dict=resource_dict, + flux_executor=None, + pmi_mode=pmi_mode, + flux_executor_nesting=False, + flux_log_files=False, + pysqa_config_directory=pysqa_config_directory, + hostname_localhost=hostname_localhost, + block_allocation=block_allocation, + init_function=init_function, + disable_dependencies=disable_dependencies, + ) ) - ) else: super().__init__( executor=DependencyTaskScheduler( diff --git a/executorlib/executor/slurm.py b/executorlib/executor/slurm.py index 3a4e202b..655a2d80 100644 --- a/executorlib/executor/slurm.py +++ b/executorlib/executor/slurm.py @@ -165,28 +165,47 @@ def __init__( if not plot_dependency_graph: import pysqa # noqa - from executorlib.task_scheduler.file.task_scheduler import ( - create_file_executor, - ) + if block_allocation: + from executorlib.task_scheduler.interactive.pysqaspawner import create_pysqa_block_allocation_scheduler + + super().__init__( + executor=create_pysqa_block_allocation_scheduler( + max_cores=max_cores, + cache_directory=cache_directory, + hostname_localhost=hostname_localhost, + log_obj_size=log_obj_size, + pmi_mode=pmi_mode, + init_function=init_function, + max_workers=max_workers, + resource_dict=resource_dict, + pysqa_config_directory=pysqa_config_directory, + backend="slurm", + ), + ) - super().__init__( - executor=create_file_executor( - max_workers=max_workers, - backend="slurm", - max_cores=max_cores, - cache_directory=cache_directory, - resource_dict=resource_dict, - pmi_mode=pmi_mode, - flux_executor=None, - flux_executor_nesting=False, - flux_log_files=False, - pysqa_config_directory=pysqa_config_directory, - hostname_localhost=hostname_localhost, - block_allocation=block_allocation, - init_function=init_function, - disable_dependencies=disable_dependencies, + else: + from executorlib.task_scheduler.file.task_scheduler import ( + create_file_executor, + ) + + super().__init__( + executor=create_file_executor( + max_workers=max_workers, + backend="slurm", + max_cores=max_cores, + cache_directory=cache_directory, + resource_dict=resource_dict, + pmi_mode=pmi_mode, + flux_executor=None, + flux_executor_nesting=False, + flux_log_files=False, + pysqa_config_directory=pysqa_config_directory, + hostname_localhost=hostname_localhost, + block_allocation=block_allocation, + init_function=init_function, + disable_dependencies=disable_dependencies, + ) ) - ) else: super().__init__( executor=DependencyTaskScheduler( diff --git a/executorlib/task_scheduler/worker/spawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py similarity index 59% rename from executorlib/task_scheduler/worker/spawner.py rename to executorlib/task_scheduler/interactive/pysqaspawner.py index af615126..0fbc321b 100644 --- a/executorlib/task_scheduler/worker/spawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -1,9 +1,14 @@ -from typing import Optional +from time import sleep +from typing import Callable, Optional from pysqa import QueueAdapter +from executorlib.standalone.inputcheck import validate_number_of_cores from executorlib.standalone.interactive.spawner import BaseSpawner from executorlib.standalone.scheduler import pysqa_execute_command, terminate_with_pysqa +from executorlib.task_scheduler.interactive.blockallocation import ( + BlockAllocationTaskScheduler, +) class PysqaSpawner(BaseSpawner): @@ -11,11 +16,15 @@ def __init__( self, cwd: Optional[str] = None, cores: int = 1, - openmpi_oversubscribe: bool = False, threads_per_core: int = 1, + gpus_per_core: int = 0, + num_nodes: Optional[int] = None, + exclusive: bool = False, + openmpi_oversubscribe: bool = False, + slurm_cmd_args: Optional[list[str]] = None, + pmi_mode: Optional[str] = None, config_directory: Optional[str] = None, backend: Optional[str] = None, - submission_kwargs: Optional[dict] = None, ): """ Subprocess interface implementation. @@ -33,9 +42,13 @@ def __init__( ) self._process: Optional[int] = None self._threads_per_core = threads_per_core + self._gpus_per_core = gpus_per_core + self._num_nodes = num_nodes + self._exclusive = exclusive + self._slurm_cmd_args = slurm_cmd_args + self._pmi_mode = pmi_mode self._config_directory = config_directory self._backend = backend - self._submission_kwargs = submission_kwargs def bootup( self, @@ -52,12 +65,30 @@ def bootup( queue_type=self._backend, execute_command=pysqa_execute_command, ) + if self._gpus_per_core > 0: + raise ValueError() + if self._num_nodes is not None: + raise ValueError() + if self._exclusive: + raise ValueError() + if self._pmi_mode is not None: + raise ValueError() self._process = qa.submit_job( command=" ".join(self.generate_command(command_lst=command_lst)), working_directory=self._cwd, cores=self._cores, **self._submission_kwargs, ) + while True: + status = qa.get_status_of_job(process_id=self._process) + if status in ["running", "pending"]: + break + elif status is None: + raise RuntimeError( + f"Failed to start the process with command: {command_lst}" + ) + else: + sleep(1) # Wait for the process to start def generate_command(self, command_lst: list[str]) -> list[str]: """ @@ -117,3 +148,40 @@ def poll(self) -> bool: ] else: return False + + +def create_pysqa_block_allocation_scheduler( + max_cores: Optional[int] = None, + cache_directory: Optional[str] = None, + hostname_localhost: Optional[bool] = None, + log_obj_size: bool = False, + pmi_mode: Optional[str] = None, + init_function: Optional[Callable] = None, + max_workers: Optional[int] = None, + resource_dict: Optional[dict] = None, + pysqa_config_directory: Optional[str] = None, + backend: Optional[str] = None, +): + if backend is None: + raise ValueError("Backend must be either 'slurm' or 'flux'.") + if resource_dict is None: + resource_dict = {} + cores_per_worker = resource_dict.get("cores", 1) + resource_dict["cache_directory"] = cache_directory + resource_dict["hostname_localhost"] = hostname_localhost + resource_dict["log_obj_size"] = log_obj_size + resource_dict["pmi_mode"] = pmi_mode + resource_dict["init_function"] = init_function + resource_dict["config_directory"] = pysqa_config_directory + resource_dict["backend"] = backend + max_workers = validate_number_of_cores( + max_cores=max_cores, + max_workers=max_workers, + cores_per_worker=cores_per_worker, + set_local_cores=False, + ) + return BlockAllocationTaskScheduler( + max_workers=max_workers, + executor_kwargs=resource_dict, + spawner=PysqaSpawner, + ) From 02f0ce788cee5efaf7869ee6fc4d5fb0375b139d Mon Sep 17 00:00:00 2001 From: pyiron-runner Date: Tue, 19 Aug 2025 09:20:57 +0000 Subject: [PATCH 11/83] Format black --- executorlib/executor/flux.py | 6 ++++-- executorlib/executor/slurm.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/executorlib/executor/flux.py b/executorlib/executor/flux.py index e9016576..1d844f28 100644 --- a/executorlib/executor/flux.py +++ b/executorlib/executor/flux.py @@ -358,8 +358,10 @@ def __init__( import pysqa # noqa if block_allocation: - from executorlib.task_scheduler.interactive.pysqaspawner import create_pysqa_block_allocation_scheduler - + from executorlib.task_scheduler.interactive.pysqaspawner import ( + create_pysqa_block_allocation_scheduler, + ) + super().__init__( executor=create_pysqa_block_allocation_scheduler( max_cores=max_cores, diff --git a/executorlib/executor/slurm.py b/executorlib/executor/slurm.py index 655a2d80..ed2410d3 100644 --- a/executorlib/executor/slurm.py +++ b/executorlib/executor/slurm.py @@ -166,8 +166,10 @@ def __init__( import pysqa # noqa if block_allocation: - from executorlib.task_scheduler.interactive.pysqaspawner import create_pysqa_block_allocation_scheduler - + from executorlib.task_scheduler.interactive.pysqaspawner import ( + create_pysqa_block_allocation_scheduler, + ) + super().__init__( executor=create_pysqa_block_allocation_scheduler( max_cores=max_cores, From 2804562add35fa36919756028a364fc3766ede73 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 11:23:28 +0200 Subject: [PATCH 12/83] fix type hint --- executorlib/task_scheduler/interactive/pysqaspawner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 0fbc321b..f58b0975 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -77,7 +77,7 @@ def bootup( command=" ".join(self.generate_command(command_lst=command_lst)), working_directory=self._cwd, cores=self._cores, - **self._submission_kwargs, + **self._slurm_cmd_args, ) while True: status = qa.get_status_of_job(process_id=self._process) From 6fb86f7b8591e523c55822a8b32c7a9d6addc37d Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 11:52:33 +0200 Subject: [PATCH 13/83] implement additional options for SLURM --- .../interactive/pysqaspawner.py | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index f58b0975..afedada7 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -65,18 +65,10 @@ def bootup( queue_type=self._backend, execute_command=pysqa_execute_command, ) - if self._gpus_per_core > 0: - raise ValueError() - if self._num_nodes is not None: - raise ValueError() - if self._exclusive: - raise ValueError() - if self._pmi_mode is not None: - raise ValueError() self._process = qa.submit_job( command=" ".join(self.generate_command(command_lst=command_lst)), working_directory=self._cwd, - cores=self._cores, + cores=int(self._cores * self._threads_per_core), **self._slurm_cmd_args, ) while True: @@ -100,12 +92,34 @@ def generate_command(self, command_lst: list[str]) -> list[str]: Returns: list[str]: The generated command list. """ - if self._cores > 1 and self._backend is None: - command_prepend = ["mpiexec", "-n", str(self._cores)] - elif self._cores > 1 and self._backend == "slurm": + if self._cores > 1 and self._backend == "slurm": command_prepend = ["srun", "-n", str(self._cores)] + if self._pmi_mode is not None: + command_prepend += ["--mpi=" + self._pmi_mode] + if self._num_nodes is not None: + command_prepend_lst += ["-N", str(self._num_nodes)] + if self._threads_per_core > 1: + command_prepend_lst += ["--cpus-per-task=" + str(self._threads_per_core)] + if self._gpus_per_core > 0: + command_prepend_lst += ["--gpus-per-task=" + str(self._gpus_per_core)] + if self._exclusive: + command_prepend_lst += ["--exact"] + if self._openmpi_oversubscribe: + command_prepend_lst += ["--oversubscribe"] elif self._cores > 1 and self._backend == "flux": command_prepend = ["flux", "run", "-n", str(self._cores)] + if self._pmi_mode is not None: + command_prepend += ["-o", "pmi=" + self._pmi_mode] + if self._num_nodes is not None: + raise ValueError() + if self._threads_per_core > 1: + raise ValueError() + if self._gpus_per_core > 0: + raise ValueError() + if self._exclusive: + raise ValueError() + if self._openmpi_oversubscribe: + raise ValueError() elif self._cores > 1: raise ValueError( f"backend should be None, slurm or flux, not {self._backend}" @@ -162,8 +176,6 @@ def create_pysqa_block_allocation_scheduler( pysqa_config_directory: Optional[str] = None, backend: Optional[str] = None, ): - if backend is None: - raise ValueError("Backend must be either 'slurm' or 'flux'.") if resource_dict is None: resource_dict = {} cores_per_worker = resource_dict.get("cores", 1) From ff10b0d67f0c0ea4aeacf43900074df6d116f0f0 Mon Sep 17 00:00:00 2001 From: pyiron-runner Date: Tue, 19 Aug 2025 09:53:15 +0000 Subject: [PATCH 14/83] Format black --- executorlib/task_scheduler/interactive/pysqaspawner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index afedada7..459e2132 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -99,7 +99,9 @@ def generate_command(self, command_lst: list[str]) -> list[str]: if self._num_nodes is not None: command_prepend_lst += ["-N", str(self._num_nodes)] if self._threads_per_core > 1: - command_prepend_lst += ["--cpus-per-task=" + str(self._threads_per_core)] + command_prepend_lst += [ + "--cpus-per-task=" + str(self._threads_per_core) + ] if self._gpus_per_core > 0: command_prepend_lst += ["--gpus-per-task=" + str(self._gpus_per_core)] if self._exclusive: @@ -113,7 +115,7 @@ def generate_command(self, command_lst: list[str]) -> list[str]: if self._num_nodes is not None: raise ValueError() if self._threads_per_core > 1: - raise ValueError() + raise ValueError() if self._gpus_per_core > 0: raise ValueError() if self._exclusive: From 38e022078f0714796fba78ac66fbe92ef84c17f6 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 12:07:14 +0200 Subject: [PATCH 15/83] fixes --- executorlib/task_scheduler/interactive/pysqaspawner.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 459e2132..8784686a 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -97,17 +97,17 @@ def generate_command(self, command_lst: list[str]) -> list[str]: if self._pmi_mode is not None: command_prepend += ["--mpi=" + self._pmi_mode] if self._num_nodes is not None: - command_prepend_lst += ["-N", str(self._num_nodes)] + command_prepend += ["-N", str(self._num_nodes)] if self._threads_per_core > 1: - command_prepend_lst += [ + command_prepend += [ "--cpus-per-task=" + str(self._threads_per_core) ] if self._gpus_per_core > 0: - command_prepend_lst += ["--gpus-per-task=" + str(self._gpus_per_core)] + command_prepend += ["--gpus-per-task=" + str(self._gpus_per_core)] if self._exclusive: - command_prepend_lst += ["--exact"] + command_prepend += ["--exact"] if self._openmpi_oversubscribe: - command_prepend_lst += ["--oversubscribe"] + command_prepend += ["--oversubscribe"] elif self._cores > 1 and self._backend == "flux": command_prepend = ["flux", "run", "-n", str(self._cores)] if self._pmi_mode is not None: From 713887841cdffe537af659ccde622039a3c755ff Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 19 Aug 2025 10:07:24 +0000 Subject: [PATCH 16/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/pysqaspawner.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 8784686a..9fa06e0e 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -99,9 +99,7 @@ def generate_command(self, command_lst: list[str]) -> list[str]: if self._num_nodes is not None: command_prepend += ["-N", str(self._num_nodes)] if self._threads_per_core > 1: - command_prepend += [ - "--cpus-per-task=" + str(self._threads_per_core) - ] + command_prepend += ["--cpus-per-task=" + str(self._threads_per_core)] if self._gpus_per_core > 0: command_prepend += ["--gpus-per-task=" + str(self._gpus_per_core)] if self._exclusive: From a9c4c687fdab55026692cea29ecc8220dcaa050f Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 17:40:52 +0200 Subject: [PATCH 17/83] add test for flux block allocation --- tests/test_fluxclusterexecutor.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 0968fabb..fc2c1bc9 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -51,6 +51,20 @@ def test_executor(self): self.assertEqual(len(os.listdir("executorlib_cache")), 4) self.assertTrue(fs1.done()) + def test_executor_blockallocation(self): + with FluxClusterExecutor( + resource_dict={"cores": 2, "cwd": "executorlib_cache"}, + block_allocation=True, + cache_directory="executorlib_cache", + pmi_mode=pmi, + ) as exe: + cloudpickle_register(ind=1) + fs1 = exe.submit(mpi_funct, 1) + self.assertFalse(fs1.done()) + self.assertEqual(fs1.result(), [(1, 2, 0), (1, 2, 1)]) + self.assertEqual(len(os.listdir("executorlib_cache")), 4) + self.assertTrue(fs1.done()) + def test_executor_no_cwd(self): with FluxClusterExecutor( resource_dict={"cores": 2}, From 0e60b287749174a6cab94c2ee72bf17d72eb91f8 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 17:47:00 +0200 Subject: [PATCH 18/83] fixes --- executorlib/task_scheduler/worker/__init__.py | 0 tests/test_fluxclusterexecutor.py | 1 + 2 files changed, 1 insertion(+) delete mode 100644 executorlib/task_scheduler/worker/__init__.py diff --git a/executorlib/task_scheduler/worker/__init__.py b/executorlib/task_scheduler/worker/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index fc2c1bc9..9bcc1f5a 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -57,6 +57,7 @@ def test_executor_blockallocation(self): block_allocation=True, cache_directory="executorlib_cache", pmi_mode=pmi, + max_workers=2, ) as exe: cloudpickle_register(ind=1) fs1 = exe.submit(mpi_funct, 1) From faf4c50691651a8612d9756070c2738514a0d28d Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 17:55:58 +0200 Subject: [PATCH 19/83] more fixes --- executorlib/task_scheduler/interactive/pysqaspawner.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 9fa06e0e..2e3e52d2 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -25,6 +25,7 @@ def __init__( pmi_mode: Optional[str] = None, config_directory: Optional[str] = None, backend: Optional[str] = None, + **kwargs, ): """ Subprocess interface implementation. @@ -49,6 +50,7 @@ def __init__( self._pmi_mode = pmi_mode self._config_directory = config_directory self._backend = backend + self._pysqa_submission_kwargs = kwargs def bootup( self, @@ -69,7 +71,7 @@ def bootup( command=" ".join(self.generate_command(command_lst=command_lst)), working_directory=self._cwd, cores=int(self._cores * self._threads_per_core), - **self._slurm_cmd_args, + **self._pysqa_submission_kwargs, ) while True: status = qa.get_status_of_job(process_id=self._process) @@ -106,6 +108,8 @@ def generate_command(self, command_lst: list[str]) -> list[str]: command_prepend += ["--exact"] if self._openmpi_oversubscribe: command_prepend += ["--oversubscribe"] + if self._slurm_cmd_args is not None and len(self._slurm_cmd_args) > 0: + command_prepend += self._slurm_cmd_args elif self._cores > 1 and self._backend == "flux": command_prepend = ["flux", "run", "-n", str(self._cores)] if self._pmi_mode is not None: From 4bd0001bbc687e6ba2b7276a6970b0221a2b268f Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 21:10:19 +0200 Subject: [PATCH 20/83] fixes --- executorlib/task_scheduler/interactive/pysqaspawner.py | 4 +++- tests/test_fluxclusterexecutor.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 2e3e52d2..e724b111 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -1,3 +1,4 @@ +import os from time import sleep from typing import Callable, Optional @@ -183,7 +184,8 @@ def create_pysqa_block_allocation_scheduler( if resource_dict is None: resource_dict = {} cores_per_worker = resource_dict.get("cores", 1) - resource_dict["cache_directory"] = cache_directory + resource_dict["cwd"] = os.path.abspath(resource_dict["cwd"]) + resource_dict["cache_directory"] = os.path.abspath(cache_directory) resource_dict["hostname_localhost"] = hostname_localhost resource_dict["log_obj_size"] = log_obj_size resource_dict["pmi_mode"] = pmi_mode diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 9bcc1f5a..0231a14e 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -57,7 +57,7 @@ def test_executor_blockallocation(self): block_allocation=True, cache_directory="executorlib_cache", pmi_mode=pmi, - max_workers=2, + max_workers=1, ) as exe: cloudpickle_register(ind=1) fs1 = exe.submit(mpi_funct, 1) From cf1cfe9899766a8f4aad70bf91a4ccfb59532c20 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 21:26:23 +0200 Subject: [PATCH 21/83] handle different types --- executorlib/task_scheduler/interactive/pysqaspawner.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index e724b111..a4225301 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -184,8 +184,12 @@ def create_pysqa_block_allocation_scheduler( if resource_dict is None: resource_dict = {} cores_per_worker = resource_dict.get("cores", 1) - resource_dict["cwd"] = os.path.abspath(resource_dict["cwd"]) - resource_dict["cache_directory"] = os.path.abspath(cache_directory) + if "cwd" in resource_dict: + resource_dict["cwd"] = os.path.abspath(resource_dict["cwd"]) + if cache_directory is None: + resource_dict["cache_directory"] = os.path.abspath(cache_directory) + else: + resource_dict["cache_directory"] = None resource_dict["hostname_localhost"] = hostname_localhost resource_dict["log_obj_size"] = log_obj_size resource_dict["pmi_mode"] = pmi_mode From 3887d1616624f7c4262d7ec41ac184531f8669a0 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Tue, 19 Aug 2025 23:09:05 +0200 Subject: [PATCH 22/83] fixes --- executorlib/task_scheduler/interactive/pysqaspawner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index a4225301..9a36fede 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -186,7 +186,7 @@ def create_pysqa_block_allocation_scheduler( cores_per_worker = resource_dict.get("cores", 1) if "cwd" in resource_dict: resource_dict["cwd"] = os.path.abspath(resource_dict["cwd"]) - if cache_directory is None: + if cache_directory is not None: resource_dict["cache_directory"] = os.path.abspath(cache_directory) else: resource_dict["cache_directory"] = None From b3ab3a25470888cdec714962f6acca7370a56d65 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 24 Aug 2025 17:10:12 +0200 Subject: [PATCH 23/83] Add print commands --- executorlib/standalone/scheduler.py | 2 ++ .../task_scheduler/interactive/blockallocation.py | 5 +++++ .../task_scheduler/interactive/pysqaspawner.py | 11 +++++------ executorlib/task_scheduler/interactive/shared.py | 4 ++++ 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/executorlib/standalone/scheduler.py b/executorlib/standalone/scheduler.py index bc68187b..ce36a15e 100644 --- a/executorlib/standalone/scheduler.py +++ b/executorlib/standalone/scheduler.py @@ -23,6 +23,7 @@ def terminate_with_pysqa( queue_type=backend, execute_command=pysqa_execute_command, ) + print(qa.get_queue_status()) status = qa.get_status_of_job(process_id=queue_id) if status is not None and status not in ["finished", "error"]: with contextlib.suppress(subprocess.CalledProcessError): @@ -52,6 +53,7 @@ def pysqa_execute_command( """ if shell and isinstance(commands, list): commands = " ".join(commands) + print(commands, working_directory) out = subprocess.check_output( commands, cwd=working_directory, diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index 96cec2c1..a6e0ae7a 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -156,14 +156,19 @@ def shutdown(self, wait: bool = True, *, cancel_futures: bool = False): if cancel_futures: cancel_items_in_queue(que=self._future_queue) if isinstance(self._process, list): + print(len(self._process), wait) for _ in range(len(self._process)): self._future_queue.put({"shutdown": True, "wait": wait}) + print("after submission", wait) if wait: for process in self._process: + print("join") process.join() + print("join done") self._future_queue.join() self._process = None self._future_queue = None + print("block shutdown done") def _set_process(self, process: list[Thread]): # type: ignore """ diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 9a36fede..8ff95e0d 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -68,12 +68,14 @@ def bootup( queue_type=self._backend, execute_command=pysqa_execute_command, ) + print(self._process, self) self._process = qa.submit_job( command=" ".join(self.generate_command(command_lst=command_lst)), working_directory=self._cwd, cores=int(self._cores * self._threads_per_core), **self._pysqa_submission_kwargs, ) + print(self._process, self) while True: status = qa.get_status_of_job(process_id=self._process) if status in ["running", "pending"]: @@ -147,6 +149,7 @@ def shutdown(self, wait: bool = True): backend=self._backend, ) self._process = None + print("terminate done") def poll(self) -> bool: """ @@ -184,12 +187,8 @@ def create_pysqa_block_allocation_scheduler( if resource_dict is None: resource_dict = {} cores_per_worker = resource_dict.get("cores", 1) - if "cwd" in resource_dict: - resource_dict["cwd"] = os.path.abspath(resource_dict["cwd"]) - if cache_directory is not None: - resource_dict["cache_directory"] = os.path.abspath(cache_directory) - else: - resource_dict["cache_directory"] = None + resource_dict["cwd"] = os.path.abspath(resource_dict["cwd"]) + resource_dict["cache_directory"] = os.path.abspath(cache_directory) resource_dict["hostname_localhost"] = hostname_localhost resource_dict["log_obj_size"] = log_obj_size resource_dict["pmi_mode"] = pmi_mode diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 02162308..baf754a7 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -68,10 +68,14 @@ def execute_tasks( while True: task_dict = future_queue.get() if "shutdown" in task_dict and task_dict["shutdown"]: + print("before shutdown", interface, interface._process, interface._spawner, interface._spawner._process) interface.shutdown(wait=task_dict["wait"]) + print("before done") _task_done(future_queue=future_queue) + print("before join", queue_join_on_shutdown) if queue_join_on_shutdown: future_queue.join() + print("break") break elif "fn" in task_dict and "future" in task_dict: if error_log_file is not None: From 3936620830897cf26402d8d5cd3f7631e7c57f1c Mon Sep 17 00:00:00 2001 From: pyiron-runner Date: Sun, 24 Aug 2025 15:10:48 +0000 Subject: [PATCH 24/83] Format black --- executorlib/task_scheduler/interactive/shared.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index baf754a7..eebcc4e9 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -68,7 +68,13 @@ def execute_tasks( while True: task_dict = future_queue.get() if "shutdown" in task_dict and task_dict["shutdown"]: - print("before shutdown", interface, interface._process, interface._spawner, interface._spawner._process) + print( + "before shutdown", + interface, + interface._process, + interface._spawner, + interface._spawner._process, + ) interface.shutdown(wait=task_dict["wait"]) print("before done") _task_done(future_queue=future_queue) From a9f4eea39668622e1788cb7f33ed0c7d8525784c Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Sun, 24 Aug 2025 18:26:35 +0200 Subject: [PATCH 25/83] hash for worker directory --- executorlib/standalone/scheduler.py | 2 -- .../task_scheduler/interactive/blockallocation.py | 5 ----- executorlib/task_scheduler/interactive/pysqaspawner.py | 7 +++---- executorlib/task_scheduler/interactive/shared.py | 10 ---------- 4 files changed, 3 insertions(+), 21 deletions(-) diff --git a/executorlib/standalone/scheduler.py b/executorlib/standalone/scheduler.py index ce36a15e..bc68187b 100644 --- a/executorlib/standalone/scheduler.py +++ b/executorlib/standalone/scheduler.py @@ -23,7 +23,6 @@ def terminate_with_pysqa( queue_type=backend, execute_command=pysqa_execute_command, ) - print(qa.get_queue_status()) status = qa.get_status_of_job(process_id=queue_id) if status is not None and status not in ["finished", "error"]: with contextlib.suppress(subprocess.CalledProcessError): @@ -53,7 +52,6 @@ def pysqa_execute_command( """ if shell and isinstance(commands, list): commands = " ".join(commands) - print(commands, working_directory) out = subprocess.check_output( commands, cwd=working_directory, diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index a6e0ae7a..96cec2c1 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -156,19 +156,14 @@ def shutdown(self, wait: bool = True, *, cancel_futures: bool = False): if cancel_futures: cancel_items_in_queue(que=self._future_queue) if isinstance(self._process, list): - print(len(self._process), wait) for _ in range(len(self._process)): self._future_queue.put({"shutdown": True, "wait": wait}) - print("after submission", wait) if wait: for process in self._process: - print("join") process.join() - print("join done") self._future_queue.join() self._process = None self._future_queue = None - print("block shutdown done") def _set_process(self, process: list[Thread]): # type: ignore """ diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 8ff95e0d..a2caa965 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -1,4 +1,5 @@ import os +import hashlib from time import sleep from typing import Callable, Optional @@ -68,14 +69,13 @@ def bootup( queue_type=self._backend, execute_command=pysqa_execute_command, ) - print(self._process, self) + hash = hashlib.md5(str(self).encode()).hexdigest() self._process = qa.submit_job( command=" ".join(self.generate_command(command_lst=command_lst)), - working_directory=self._cwd, + working_directory=os.path.join(self._cwd, hash), cores=int(self._cores * self._threads_per_core), **self._pysqa_submission_kwargs, ) - print(self._process, self) while True: status = qa.get_status_of_job(process_id=self._process) if status in ["running", "pending"]: @@ -149,7 +149,6 @@ def shutdown(self, wait: bool = True): backend=self._backend, ) self._process = None - print("terminate done") def poll(self) -> bool: """ diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index eebcc4e9..02162308 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -68,20 +68,10 @@ def execute_tasks( while True: task_dict = future_queue.get() if "shutdown" in task_dict and task_dict["shutdown"]: - print( - "before shutdown", - interface, - interface._process, - interface._spawner, - interface._spawner._process, - ) interface.shutdown(wait=task_dict["wait"]) - print("before done") _task_done(future_queue=future_queue) - print("before join", queue_join_on_shutdown) if queue_join_on_shutdown: future_queue.join() - print("break") break elif "fn" in task_dict and "future" in task_dict: if error_log_file is not None: From 6fb2decc7b12e096ab3effb72132e9450b645d60 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 24 Aug 2025 16:26:44 +0000 Subject: [PATCH 26/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/pysqaspawner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index a2caa965..62846957 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -1,5 +1,5 @@ -import os import hashlib +import os from time import sleep from typing import Callable, Optional From ef7f5bf540bce7574ed819da44a8184d619ff607 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 24 Aug 2025 18:51:38 +0200 Subject: [PATCH 27/83] Update test_fluxclusterexecutor.py --- tests/test_fluxclusterexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 0231a14e..654a719a 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -63,7 +63,7 @@ def test_executor_blockallocation(self): fs1 = exe.submit(mpi_funct, 1) self.assertFalse(fs1.done()) self.assertEqual(fs1.result(), [(1, 2, 0), (1, 2, 1)]) - self.assertEqual(len(os.listdir("executorlib_cache")), 4) + self.assertEqual(len(os.listdir("executorlib_cache")), 2) self.assertTrue(fs1.done()) def test_executor_no_cwd(self): From 199d3d834c269f331f4abef4a712d77537bba1f0 Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Sun, 24 Aug 2025 20:13:12 +0200 Subject: [PATCH 28/83] fixes --- .../interactive/pysqaspawner.py | 11 +++++++-- tests/test_fluxclusterexecutor.py | 24 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 62846957..83daf90c 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -70,9 +70,13 @@ def bootup( execute_command=pysqa_execute_command, ) hash = hashlib.md5(str(self).encode()).hexdigest() + if self._cwd is not None: + working_directory = os.path.join(self._cwd, hash) + else: + working_directory = os.path.abspath(hash) self._process = qa.submit_job( command=" ".join(self.generate_command(command_lst=command_lst)), - working_directory=os.path.join(self._cwd, hash), + working_directory=working_directory, cores=int(self._cores * self._threads_per_core), **self._pysqa_submission_kwargs, ) @@ -187,7 +191,10 @@ def create_pysqa_block_allocation_scheduler( resource_dict = {} cores_per_worker = resource_dict.get("cores", 1) resource_dict["cwd"] = os.path.abspath(resource_dict["cwd"]) - resource_dict["cache_directory"] = os.path.abspath(cache_directory) + if cache_directory is not None: + resource_dict["cache_directory"] = os.path.abspath(cache_directory) + else: + resource_dict["cache_directory"] = os.path.abspath(".") resource_dict["hostname_localhost"] = hostname_localhost resource_dict["log_obj_size"] = log_obj_size resource_dict["pmi_mode"] = pmi_mode diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 654a719a..9b6c36d0 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -24,6 +24,11 @@ skip_mpi4py_test = importlib.util.find_spec("mpi4py") is None +def echo(i): + sleep(1) + return i + + def mpi_funct(i): from mpi4py import MPI @@ -66,6 +71,25 @@ def test_executor_blockallocation(self): self.assertEqual(len(os.listdir("executorlib_cache")), 2) self.assertTrue(fs1.done()) + def test_executor_blockallocation_echo(self): + with FluxClusterExecutor( + resource_dict={"cores": 1, "cwd": "executorlib_cache"}, + block_allocation=True, + cache_directory="executorlib_cache", + pmi_mode=pmi, + max_workers=2, + ) as exe: + cloudpickle_register(ind=1) + fs1 = exe.submit(echo, 1) + fs2 = exe.submit(echo, 2) + self.assertFalse(fs1.done()) + self.assertFalse(fs2.done()) + self.assertEqual(fs1.result(), 1) + self.assertEqual(fs2.result(), 2) + self.assertEqual(len(os.listdir("executorlib_cache")), 2) + self.assertTrue(fs1.done()) + self.assertTrue(fs2.done()) + def test_executor_no_cwd(self): with FluxClusterExecutor( resource_dict={"cores": 2}, From 18e2b016be8b5f6a2a92a83a63ef0032cba58e8d Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Sun, 24 Aug 2025 20:15:45 +0200 Subject: [PATCH 29/83] fix test --- tests/test_fluxclusterexecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 9b6c36d0..6a40bed4 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -86,7 +86,7 @@ def test_executor_blockallocation_echo(self): self.assertFalse(fs2.done()) self.assertEqual(fs1.result(), 1) self.assertEqual(fs2.result(), 2) - self.assertEqual(len(os.listdir("executorlib_cache")), 2) + self.assertEqual(len(os.listdir("executorlib_cache")), 4) self.assertTrue(fs1.done()) self.assertTrue(fs2.done()) From 15b69d24b709b49576ef36fe221086d003444b2b Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Fri, 29 Aug 2025 13:21:38 +0200 Subject: [PATCH 30/83] only receive jobs when worker is running --- executorlib/task_scheduler/interactive/pysqaspawner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 83daf90c..edcb289f 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -82,7 +82,7 @@ def bootup( ) while True: status = qa.get_status_of_job(process_id=self._process) - if status in ["running", "pending"]: + if status == "running": break elif status is None: raise RuntimeError( From fc7a3825783e1b7fdd9f3df3f40e4291fa497d29 Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Fri, 29 Aug 2025 13:56:58 +0200 Subject: [PATCH 31/83] fix job resubmission --- .../interactive/pysqaspawner.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index edcb289f..8cc3249b 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -69,25 +69,18 @@ def bootup( queue_type=self._backend, execute_command=pysqa_execute_command, ) - hash = hashlib.md5(str(self).encode()).hexdigest() - if self._cwd is not None: - working_directory = os.path.join(self._cwd, hash) - else: - working_directory = os.path.abspath(hash) - self._process = qa.submit_job( - command=" ".join(self.generate_command(command_lst=command_lst)), - working_directory=working_directory, - cores=int(self._cores * self._threads_per_core), - **self._pysqa_submission_kwargs, - ) + job_id = self._start_process_helper(command_lst=command_lst, queue_adapter=qa) while True: - status = qa.get_status_of_job(process_id=self._process) + status = qa.get_status_of_job(process_id=job_id) if status == "running": + self._process = job_id break elif status is None: raise RuntimeError( f"Failed to start the process with command: {command_lst}" ) + elif status == "error": + job_id = self._start_process_helper(command_lst=command_lst, queue_adapter=qa) else: sleep(1) # Wait for the process to start @@ -173,6 +166,19 @@ def poll(self) -> bool: ] else: return False + + def _start_process_helper(self, command_lst: str, queue_adapter: QueueAdapter) -> int: + hash = hashlib.md5(str(self).encode()).hexdigest() + if self._cwd is not None: + working_directory = os.path.join(self._cwd, hash) + else: + working_directory = os.path.abspath(hash) + return queue_adapter.submit_job( + command=" ".join(self.generate_command(command_lst=command_lst)), + working_directory=working_directory, + cores=int(self._cores * self._threads_per_core), + **self._pysqa_submission_kwargs, + ) def create_pysqa_block_allocation_scheduler( From 7a3b19125decf26e73b192e036f0f03e05a83fc7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 Aug 2025 11:57:09 +0000 Subject: [PATCH 32/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/pysqaspawner.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 8cc3249b..2f73b725 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -80,7 +80,9 @@ def bootup( f"Failed to start the process with command: {command_lst}" ) elif status == "error": - job_id = self._start_process_helper(command_lst=command_lst, queue_adapter=qa) + job_id = self._start_process_helper( + command_lst=command_lst, queue_adapter=qa + ) else: sleep(1) # Wait for the process to start @@ -166,8 +168,10 @@ def poll(self) -> bool: ] else: return False - - def _start_process_helper(self, command_lst: str, queue_adapter: QueueAdapter) -> int: + + def _start_process_helper( + self, command_lst: str, queue_adapter: QueueAdapter + ) -> int: hash = hashlib.md5(str(self).encode()).hexdigest() if self._cwd is not None: working_directory = os.path.join(self._cwd, hash) From 4033bf39acf3fcd78327501f8bf6f0c11ffd440d Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Fri, 29 Aug 2025 13:59:20 +0200 Subject: [PATCH 33/83] fix type hints --- executorlib/task_scheduler/interactive/pysqaspawner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 2f73b725..6ad2265e 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -170,7 +170,7 @@ def poll(self) -> bool: return False def _start_process_helper( - self, command_lst: str, queue_adapter: QueueAdapter + self, command_lst: list[str], queue_adapter: QueueAdapter ) -> int: hash = hashlib.md5(str(self).encode()).hexdigest() if self._cwd is not None: From 1c3e26307014975d17c21becb5b7a11abd1df665 Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Fri, 29 Aug 2025 17:14:11 +0200 Subject: [PATCH 34/83] restart workers after they were killed --- .../standalone/interactive/communication.py | 18 +++++++++-- .../task_scheduler/interactive/shared.py | 32 +++++++++++++++---- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/executorlib/standalone/interactive/communication.py b/executorlib/standalone/interactive/communication.py index 4a198882..78d82f79 100644 --- a/executorlib/standalone/interactive/communication.py +++ b/executorlib/standalone/interactive/communication.py @@ -7,6 +7,10 @@ import zmq +class ExecutorlibSockerError(RuntimeError): + pass + + class SocketInterface: """ The SocketInterface is an abstraction layer on top of the zero message queue. @@ -16,7 +20,7 @@ class SocketInterface: log_obj_size (boolean): Enable debug mode which reports the size of the communicated objects. """ - def __init__(self, spawner=None, log_obj_size=False): + def __init__(self, spawner=None, log_obj_size: bool = False, time_out_ms: int = 1000): """ Initialize the SocketInterface. @@ -25,12 +29,16 @@ def __init__(self, spawner=None, log_obj_size=False): """ self._context = zmq.Context() self._socket = self._context.socket(zmq.PAIR) + self._poller = zmq.Poller() + self._poller.register(self._socket, zmq.POLLIN) self._process = None + self._time_out_ms = time_out_ms if log_obj_size: self._logger = logging.getLogger("executorlib") else: self._logger = None self._spawner = spawner + self._command_lst = [] def send_dict(self, input_dict: dict): """ @@ -52,7 +60,12 @@ def receive_dict(self) -> dict: Returns: dict: dictionary with response received from the connected client """ - data = self._socket.recv() + response_lst = [] + while len(response_lst) == 0: + response_lst = self._poller.poll(self._time_out_ms) + if not self._spawner.poll(): + raise ExecutorlibSockerError() + data = self._socket.recv(zmq.NOBLOCK) if self._logger is not None: self._logger.warning( "Received dictionary of size: " + str(sys.getsizeof(data)) @@ -97,6 +110,7 @@ def bootup( Args: command_lst (list): list of strings to start the client process """ + self._command_lst = command_lst self._spawner.bootup( command_lst=command_lst, ) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 02162308..06b884b4 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -3,10 +3,12 @@ import queue import time from typing import Callable, Optional +from concurrent.futures._base import PENDING from executorlib.standalone.command import get_interactive_execute_command from executorlib.standalone.interactive.communication import ( SocketInterface, + ExecutorlibSockerError, interface_bootup, ) from executorlib.standalone.interactive.spawner import BaseSpawner, MpiExecSpawner @@ -107,9 +109,17 @@ def _execute_task_without_cache( try: f.set_result(interface.send_and_receive_dict(input_dict=task_dict)) except Exception as thread_exception: - interface.shutdown(wait=True) - _task_done(future_queue=future_queue) - f.set_exception(exception=thread_exception) + if isinstance(thread_exception, ExecutorlibSockerError): + f._state = PENDING + _task_done(future_queue=future_queue) + future_queue.put(task_dict | {"future": f}) + interface._spawner.bootup( + command_lst=interface._command_lst, + ) + else: + interface.shutdown(wait=True) + _task_done(future_queue=future_queue) + f.set_exception(exception=thread_exception) else: _task_done(future_queue=future_queue) @@ -154,10 +164,18 @@ def _execute_task_with_cache( dump(file_name=file_name, data_dict=data_dict) f.set_result(result) except Exception as thread_exception: - interface.shutdown(wait=True) - _task_done(future_queue=future_queue) - f.set_exception(exception=thread_exception) - raise thread_exception + if isinstance(thread_exception, ExecutorlibSockerError): + f._state = PENDING + _task_done(future_queue=future_queue) + future_queue.put(task_dict | {"future": f}) + interface._spawner.bootup( + command_lst=interface._command_lst, + ) + else: + interface.shutdown(wait=True) + _task_done(future_queue=future_queue) + f.set_exception(exception=thread_exception) + raise thread_exception else: _task_done(future_queue=future_queue) else: From cea4ca15101b530d49e606a034f19fdd600b56c5 Mon Sep 17 00:00:00 2001 From: pyiron-runner Date: Fri, 29 Aug 2025 15:14:48 +0000 Subject: [PATCH 35/83] Format black --- executorlib/standalone/interactive/communication.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/executorlib/standalone/interactive/communication.py b/executorlib/standalone/interactive/communication.py index 78d82f79..d9f7ffed 100644 --- a/executorlib/standalone/interactive/communication.py +++ b/executorlib/standalone/interactive/communication.py @@ -20,7 +20,9 @@ class SocketInterface: log_obj_size (boolean): Enable debug mode which reports the size of the communicated objects. """ - def __init__(self, spawner=None, log_obj_size: bool = False, time_out_ms: int = 1000): + def __init__( + self, spawner=None, log_obj_size: bool = False, time_out_ms: int = 1000 + ): """ Initialize the SocketInterface. @@ -30,7 +32,7 @@ def __init__(self, spawner=None, log_obj_size: bool = False, time_out_ms: int = self._context = zmq.Context() self._socket = self._context.socket(zmq.PAIR) self._poller = zmq.Poller() - self._poller.register(self._socket, zmq.POLLIN) + self._poller.register(self._socket, zmq.POLLIN) self._process = None self._time_out_ms = time_out_ms if log_obj_size: From 35a937224081d904d0e539e5cc5b1ea4bc0faf1e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 Aug 2025 15:15:38 +0000 Subject: [PATCH 36/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/shared.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 06b884b4..1cbc564a 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -2,13 +2,13 @@ import os import queue import time -from typing import Callable, Optional from concurrent.futures._base import PENDING +from typing import Callable, Optional from executorlib.standalone.command import get_interactive_execute_command from executorlib.standalone.interactive.communication import ( - SocketInterface, ExecutorlibSockerError, + SocketInterface, interface_bootup, ) from executorlib.standalone.interactive.spawner import BaseSpawner, MpiExecSpawner From 17f1c3ad6e61ee94c0e92a15bdc01503a43bf05f Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Fri, 29 Aug 2025 17:24:24 +0200 Subject: [PATCH 37/83] type fixes --- executorlib/standalone/interactive/communication.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/executorlib/standalone/interactive/communication.py b/executorlib/standalone/interactive/communication.py index d9f7ffed..d4d7696c 100644 --- a/executorlib/standalone/interactive/communication.py +++ b/executorlib/standalone/interactive/communication.py @@ -1,7 +1,7 @@ import logging import sys from socket import gethostname -from typing import Optional +from typing import Any, Optional import cloudpickle import zmq @@ -35,12 +35,11 @@ def __init__( self._poller.register(self._socket, zmq.POLLIN) self._process = None self._time_out_ms = time_out_ms + self._logger: Optional[logging.Logger] = None if log_obj_size: self._logger = logging.getLogger("executorlib") - else: - self._logger = None self._spawner = spawner - self._command_lst = [] + self._command_lst: list[str] = [] def send_dict(self, input_dict: dict): """ @@ -62,7 +61,7 @@ def receive_dict(self) -> dict: Returns: dict: dictionary with response received from the connected client """ - response_lst = [] + response_lst: list[tuple[Any, int]] = [] while len(response_lst) == 0: response_lst = self._poller.poll(self._time_out_ms) if not self._spawner.poll(): From acd91fe85e6a3ebee3d37cde9f2be978c618ffc7 Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Fri, 29 Aug 2025 17:50:42 +0200 Subject: [PATCH 38/83] helper function --- executorlib/task_scheduler/interactive/shared.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 1cbc564a..2c8038cd 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -2,6 +2,7 @@ import os import queue import time +from concurrent.futures import Future from concurrent.futures._base import PENDING from typing import Callable, Optional @@ -110,9 +111,7 @@ def _execute_task_without_cache( f.set_result(interface.send_and_receive_dict(input_dict=task_dict)) except Exception as thread_exception: if isinstance(thread_exception, ExecutorlibSockerError): - f._state = PENDING - _task_done(future_queue=future_queue) - future_queue.put(task_dict | {"future": f}) + _reset_task_dict(future_obj=f, future_queue=future_queue, task_dict=task_dict) interface._spawner.bootup( command_lst=interface._command_lst, ) @@ -165,9 +164,7 @@ def _execute_task_with_cache( f.set_result(result) except Exception as thread_exception: if isinstance(thread_exception, ExecutorlibSockerError): - f._state = PENDING - _task_done(future_queue=future_queue) - future_queue.put(task_dict | {"future": f}) + _reset_task_dict(future_obj=f, future_queue=future_queue, task_dict=task_dict) interface._spawner.bootup( command_lst=interface._command_lst, ) @@ -188,3 +185,9 @@ def _execute_task_with_cache( def _task_done(future_queue: queue.Queue): with contextlib.suppress(ValueError): future_queue.task_done() + + +def _reset_task_dict(future_obj: Future, future_queue: queue.Queue, task_dict: dict): + future_obj._state = PENDING + _task_done(future_queue=future_queue) + future_queue.put(task_dict | {"future": future_obj}) From 337fa450d6f58fad41006cd8558e82f9c206978e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 29 Aug 2025 15:50:52 +0000 Subject: [PATCH 39/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/shared.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 2c8038cd..6c8cfbbb 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -111,7 +111,9 @@ def _execute_task_without_cache( f.set_result(interface.send_and_receive_dict(input_dict=task_dict)) except Exception as thread_exception: if isinstance(thread_exception, ExecutorlibSockerError): - _reset_task_dict(future_obj=f, future_queue=future_queue, task_dict=task_dict) + _reset_task_dict( + future_obj=f, future_queue=future_queue, task_dict=task_dict + ) interface._spawner.bootup( command_lst=interface._command_lst, ) @@ -164,7 +166,9 @@ def _execute_task_with_cache( f.set_result(result) except Exception as thread_exception: if isinstance(thread_exception, ExecutorlibSockerError): - _reset_task_dict(future_obj=f, future_queue=future_queue, task_dict=task_dict) + _reset_task_dict( + future_obj=f, future_queue=future_queue, task_dict=task_dict + ) interface._spawner.bootup( command_lst=interface._command_lst, ) From 19e4cbf93390aef7f9825a1c2ac149407beaaea5 Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Fri, 29 Aug 2025 17:54:50 +0200 Subject: [PATCH 40/83] introduce restart function --- executorlib/standalone/interactive/communication.py | 8 ++++++++ executorlib/task_scheduler/interactive/shared.py | 8 ++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/executorlib/standalone/interactive/communication.py b/executorlib/standalone/interactive/communication.py index d4d7696c..9824e838 100644 --- a/executorlib/standalone/interactive/communication.py +++ b/executorlib/standalone/interactive/communication.py @@ -116,6 +116,14 @@ def bootup( command_lst=command_lst, ) + def restart(self): + """ + Restart the client process to onnect to the SocketInterface. + """ + self._spawner.bootup( + command_lst=self._command_lst, + ) + def shutdown(self, wait: bool = True): """ Shutdown the SocketInterface and the connected client process. diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 6c8cfbbb..7e3c8c69 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -114,9 +114,7 @@ def _execute_task_without_cache( _reset_task_dict( future_obj=f, future_queue=future_queue, task_dict=task_dict ) - interface._spawner.bootup( - command_lst=interface._command_lst, - ) + interface.restart() else: interface.shutdown(wait=True) _task_done(future_queue=future_queue) @@ -169,9 +167,7 @@ def _execute_task_with_cache( _reset_task_dict( future_obj=f, future_queue=future_queue, task_dict=task_dict ) - interface._spawner.bootup( - command_lst=interface._command_lst, - ) + interface.restart() else: interface.shutdown(wait=True) _task_done(future_queue=future_queue) From 9053074f4adc1babc8c2e8cdaf2b932dfd81be20 Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Fri, 29 Aug 2025 18:00:36 +0200 Subject: [PATCH 41/83] fix spelling --- executorlib/standalone/interactive/communication.py | 4 ++-- executorlib/task_scheduler/interactive/shared.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/executorlib/standalone/interactive/communication.py b/executorlib/standalone/interactive/communication.py index 9824e838..0900a8cf 100644 --- a/executorlib/standalone/interactive/communication.py +++ b/executorlib/standalone/interactive/communication.py @@ -7,7 +7,7 @@ import zmq -class ExecutorlibSockerError(RuntimeError): +class ExecutorlibSocketError(RuntimeError): pass @@ -65,7 +65,7 @@ def receive_dict(self) -> dict: while len(response_lst) == 0: response_lst = self._poller.poll(self._time_out_ms) if not self._spawner.poll(): - raise ExecutorlibSockerError() + raise ExecutorlibSocketError() data = self._socket.recv(zmq.NOBLOCK) if self._logger is not None: self._logger.warning( diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 7e3c8c69..883c3dac 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -8,7 +8,7 @@ from executorlib.standalone.command import get_interactive_execute_command from executorlib.standalone.interactive.communication import ( - ExecutorlibSockerError, + ExecutorlibSocketError, SocketInterface, interface_bootup, ) @@ -110,7 +110,7 @@ def _execute_task_without_cache( try: f.set_result(interface.send_and_receive_dict(input_dict=task_dict)) except Exception as thread_exception: - if isinstance(thread_exception, ExecutorlibSockerError): + if isinstance(thread_exception, ExecutorlibSocketError): _reset_task_dict( future_obj=f, future_queue=future_queue, task_dict=task_dict ) @@ -163,7 +163,7 @@ def _execute_task_with_cache( dump(file_name=file_name, data_dict=data_dict) f.set_result(result) except Exception as thread_exception: - if isinstance(thread_exception, ExecutorlibSockerError): + if isinstance(thread_exception, ExecutorlibSocketError): _reset_task_dict( future_obj=f, future_queue=future_queue, task_dict=task_dict ) From 5362c73e27ffe464ebe0e8dd3cd2b143bf3e410d Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sat, 30 Aug 2025 11:39:58 +0200 Subject: [PATCH 42/83] shutdown on del --- .../interactive/pysqaspawner.py | 52 ++++++++++--------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 6ad2265e..48a14291 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -43,7 +43,6 @@ def __init__( cores=cores, openmpi_oversubscribe=openmpi_oversubscribe, ) - self._process: Optional[int] = None self._threads_per_core = threads_per_core self._gpus_per_core = gpus_per_core self._num_nodes = num_nodes @@ -53,6 +52,8 @@ def __init__( self._config_directory = config_directory self._backend = backend self._pysqa_submission_kwargs = kwargs + self._process: Optional[int] = None + self._queue_adapter: Optional[QueueAdapter] = None def bootup( self, @@ -64,25 +65,18 @@ def bootup( Args: command_lst (list[str]): The command list to execute. """ - qa = QueueAdapter( + self._queue_adapter = QueueAdapter( directory=self._config_directory, queue_type=self._backend, execute_command=pysqa_execute_command, ) - job_id = self._start_process_helper(command_lst=command_lst, queue_adapter=qa) + self._process = self._start_process_helper( + command_lst=command_lst, + queue_adapter=self._queue_adapter, + ) while True: - status = qa.get_status_of_job(process_id=job_id) - if status == "running": - self._process = job_id + if self._check_process_helper(command_lst=command_lst): break - elif status is None: - raise RuntimeError( - f"Failed to start the process with command: {command_lst}" - ) - elif status == "error": - job_id = self._start_process_helper( - command_lst=command_lst, queue_adapter=qa - ) else: sleep(1) # Wait for the process to start @@ -147,7 +141,7 @@ def shutdown(self, wait: bool = True): config_directory=self._config_directory, backend=self._backend, ) - self._process = None + self._process = None def poll(self) -> bool: """ @@ -156,16 +150,9 @@ def poll(self) -> bool: Returns: bool: True if the interface is running, False otherwise. """ - qa = QueueAdapter( - directory=self._config_directory, - queue_type=self._backend, - execute_command=pysqa_execute_command, - ) if self._process is not None: - return qa.get_status_of_job(process_id=self._process) in [ - "running", - "pending", - ] + status = self._queue_adapter.get_status_of_job(process_id=self._process) + return status in ["running", "pending"] else: return False @@ -183,6 +170,23 @@ def _start_process_helper( cores=int(self._cores * self._threads_per_core), **self._pysqa_submission_kwargs, ) + + def _check_process_helper(self, command_lst: list[str]) -> bool: + status = self._queue_adapter.get_status_of_job(process_id=self._process) + if status == "running": + return True + elif status is None: + raise RuntimeError( + f"Failed to start the process with command: {command_lst}" + ) + elif status == "error": + self._process = self._start_process_helper( + command_lst=command_lst, queue_adapter=self._queue_adapter + ) + return False + + def __del__(self): + self.shutdown(wait=True) def create_pysqa_block_allocation_scheduler( From 1b4baa90a96ff9b3de01a9a82bd740c47c16eb9f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 30 Aug 2025 09:40:07 +0000 Subject: [PATCH 43/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/pysqaspawner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 48a14291..a20eca60 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -71,7 +71,7 @@ def bootup( execute_command=pysqa_execute_command, ) self._process = self._start_process_helper( - command_lst=command_lst, + command_lst=command_lst, queue_adapter=self._queue_adapter, ) while True: @@ -170,7 +170,7 @@ def _start_process_helper( cores=int(self._cores * self._threads_per_core), **self._pysqa_submission_kwargs, ) - + def _check_process_helper(self, command_lst: list[str]) -> bool: status = self._queue_adapter.get_status_of_job(process_id=self._process) if status == "running": @@ -184,7 +184,7 @@ def _check_process_helper(self, command_lst: list[str]) -> bool: command_lst=command_lst, queue_adapter=self._queue_adapter ) return False - + def __del__(self): self.shutdown(wait=True) From 0855ee99f7e8981043454cbd754b2f32ff815563 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sat, 30 Aug 2025 11:43:10 +0200 Subject: [PATCH 44/83] type fixes --- executorlib/task_scheduler/interactive/pysqaspawner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index a20eca60..73a8cb87 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -150,7 +150,7 @@ def poll(self) -> bool: Returns: bool: True if the interface is running, False otherwise. """ - if self._process is not None: + if self._process is not None and self._queue_adapter is not None: status = self._queue_adapter.get_status_of_job(process_id=self._process) return status in ["running", "pending"] else: @@ -172,7 +172,10 @@ def _start_process_helper( ) def _check_process_helper(self, command_lst: list[str]) -> bool: - status = self._queue_adapter.get_status_of_job(process_id=self._process) + if self._queue_adapter is not None: + status = self._queue_adapter.get_status_of_job(process_id=self._process) + else: + status = None if status == "running": return True elif status is None: From 8cb53caec7ae1ae75cb9644e120a739c9a7aaadd Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sat, 30 Aug 2025 18:22:21 +0200 Subject: [PATCH 45/83] Introduce stop function (#791) * all tasks are stopped with stop function * Format black * add additional break * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix typing * fixes * shutdown * restructure * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * the interface can only be none when it was cancelled before it started * fix type hints * fixes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * be more explizit with types --------- Co-authored-by: pyiron-runner Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../standalone/interactive/communication.py | 40 ++++++++++++++----- executorlib/standalone/interactive/spawner.py | 9 +++-- .../interactive/blockallocation.py | 13 +++++- .../task_scheduler/interactive/fluxspawner.py | 6 ++- .../interactive/pysqaspawner.py | 6 ++- .../task_scheduler/interactive/shared.py | 35 +++++++++++----- 6 files changed, 80 insertions(+), 29 deletions(-) diff --git a/executorlib/standalone/interactive/communication.py b/executorlib/standalone/interactive/communication.py index b5af3c56..b0c4bc39 100644 --- a/executorlib/standalone/interactive/communication.py +++ b/executorlib/standalone/interactive/communication.py @@ -1,7 +1,7 @@ import logging import sys from socket import gethostname -from typing import Any, Optional +from typing import Any, Callable, Optional import cloudpickle import zmq @@ -43,6 +43,7 @@ def __init__( self._logger = logging.getLogger("executorlib") self._spawner = spawner self._command_lst: list[str] = [] + self._stop_function: Optional[Callable] = None def send_dict(self, input_dict: dict): """ @@ -107,7 +108,8 @@ def bind_to_random_port(self) -> int: def bootup( self, command_lst: list[str], - ): + stop_function: Optional[Callable] = None, + ) -> bool: """ Boot up the client process to connect to the SocketInterface. @@ -115,17 +117,26 @@ def bootup( command_lst (list): list of strings to start the client process """ self._command_lst = command_lst - self._spawner.bootup( + self._stop_function = stop_function + if not self._spawner.bootup( command_lst=command_lst, - ) + stop_function=stop_function, + ): + self._reset_socket() + return False + return True def restart(self): """ Restart the client process to onnect to the SocketInterface. """ - self._spawner.bootup( + if not self._spawner.bootup( command_lst=self._command_lst, - ) + stop_function=self._stop_function, + ): + self._reset_socket() + return False + return True def shutdown(self, wait: bool = True): """ @@ -140,6 +151,10 @@ def shutdown(self, wait: bool = True): input_dict={"shutdown": True, "wait": wait} ) self._spawner.shutdown(wait=wait) + self._reset_socket() + return result + + def _reset_socket(self): if self._socket is not None: self._socket.close() if self._context is not None: @@ -147,7 +162,6 @@ def shutdown(self, wait: bool = True): self._process = None self._socket = None self._context = None - return result def __del__(self): """ @@ -163,7 +177,8 @@ def interface_bootup( hostname_localhost: Optional[bool] = None, log_obj_size: bool = False, worker_id: Optional[int] = None, -) -> SocketInterface: + stop_function: Optional[Callable] = None, +) -> Optional[SocketInterface]: """ Start interface for ZMQ communication @@ -202,10 +217,13 @@ def interface_bootup( "--zmqport", str(interface.bind_to_random_port()), ] - interface.bootup( + if interface.bootup( command_lst=command_lst, - ) - return interface + stop_function=stop_function, + ): + return interface + else: + return None def interface_connect(host: str, port: str) -> tuple[zmq.Context, zmq.Socket]: diff --git a/executorlib/standalone/interactive/spawner.py b/executorlib/standalone/interactive/spawner.py index 4a5cb390..ce90052b 100644 --- a/executorlib/standalone/interactive/spawner.py +++ b/executorlib/standalone/interactive/spawner.py @@ -1,7 +1,7 @@ import os import subprocess from abc import ABC, abstractmethod -from typing import Optional +from typing import Callable, Optional MPI_COMMAND = "mpiexec" @@ -29,7 +29,8 @@ def __init__( def bootup( self, command_lst: list[str], - ): + stop_function: Optional[Callable] = None, + ) -> bool: """ Method to start the interface. @@ -87,7 +88,8 @@ def __init__( def bootup( self, command_lst: list[str], - ): + stop_function: Optional[Callable] = None, + ) -> bool: """ Method to start the subprocess interface. @@ -101,6 +103,7 @@ def bootup( cwd=self._cwd, stdin=subprocess.DEVNULL, ) + return True def generate_command(self, command_lst: list[str]) -> list[str]: """ diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index 96cec2c1..2e1d1f02 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -12,6 +12,8 @@ from executorlib.task_scheduler.base import TaskSchedulerBase from executorlib.task_scheduler.interactive.shared import execute_tasks +_task_schedulder_dict: dict = {} + class BlockAllocationTaskScheduler(TaskSchedulerBase): """ @@ -61,11 +63,18 @@ def __init__( executor_kwargs["queue_join_on_shutdown"] = False self._process_kwargs = executor_kwargs self._max_workers = max_workers + self_id = id(self) + self._self_id = self_id + _task_schedulder_dict[self._self_id] = False self._set_process( process=[ Thread( target=execute_tasks, - kwargs=executor_kwargs | {"worker_id": worker_id}, + kwargs=executor_kwargs + | { + "worker_id": worker_id, + "stop_function": lambda: _task_schedulder_dict[self_id], + }, ) for worker_id in range(self._max_workers) ], @@ -155,7 +164,9 @@ def shutdown(self, wait: bool = True, *, cancel_futures: bool = False): if self._future_queue is not None: if cancel_futures: cancel_items_in_queue(que=self._future_queue) + self._shutdown_flag = True if isinstance(self._process, list): + _task_schedulder_dict[self._self_id] = True for _ in range(len(self._process)): self._future_queue.put({"shutdown": True, "wait": wait}) if wait: diff --git a/executorlib/task_scheduler/interactive/fluxspawner.py b/executorlib/task_scheduler/interactive/fluxspawner.py index 5a35dd5c..378bbe92 100644 --- a/executorlib/task_scheduler/interactive/fluxspawner.py +++ b/executorlib/task_scheduler/interactive/fluxspawner.py @@ -1,5 +1,5 @@ import os -from typing import Optional +from typing import Callable, Optional import flux import flux.job @@ -75,7 +75,8 @@ def __init__( def bootup( self, command_lst: list[str], - ): + stop_function: Optional[Callable] = None, + ) -> bool: """ Boot up the client process to connect to the SocketInterface. @@ -126,6 +127,7 @@ def bootup( ) else: self._future = self._flux_executor.submit(jobspec=jobspec) + return True def shutdown(self, wait: bool = True): """ diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/pysqaspawner.py index 73a8cb87..31f57c8b 100644 --- a/executorlib/task_scheduler/interactive/pysqaspawner.py +++ b/executorlib/task_scheduler/interactive/pysqaspawner.py @@ -58,6 +58,7 @@ def __init__( def bootup( self, command_lst: list[str], + stop_function: Optional[Callable] = None, ): """ Method to start the subprocess interface. @@ -76,7 +77,10 @@ def bootup( ) while True: if self._check_process_helper(command_lst=command_lst): - break + return True + elif stop_function is not None and stop_function(): + self.shutdown(wait=True) + return False else: sleep(1) # Wait for the process to start diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 883c3dac..fea9f86a 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -28,6 +28,7 @@ def execute_tasks( log_obj_size: bool = False, error_log_file: Optional[str] = None, worker_id: Optional[int] = None, + stop_function: Optional[Callable] = None, **kwargs, ) -> None: """ @@ -63,15 +64,17 @@ def execute_tasks( hostname_localhost=hostname_localhost, log_obj_size=log_obj_size, worker_id=worker_id, + stop_function=stop_function, ) - if init_function is not None: + if init_function is not None and interface is not None: interface.send_dict( input_dict={"init": True, "fn": init_function, "args": (), "kwargs": {}} ) while True: task_dict = future_queue.get() if "shutdown" in task_dict and task_dict["shutdown"]: - interface.shutdown(wait=task_dict["wait"]) + if interface is not None: + interface.shutdown(wait=task_dict["wait"]) _task_done(future_queue=future_queue) if queue_join_on_shutdown: future_queue.join() @@ -79,23 +82,31 @@ def execute_tasks( elif "fn" in task_dict and "future" in task_dict: if error_log_file is not None: task_dict["error_log_file"] = error_log_file - if cache_directory is None: - _execute_task_without_cache( - interface=interface, task_dict=task_dict, future_queue=future_queue + if cache_directory is None and interface is not None: + result_flag = _execute_task_without_cache( + interface=interface, + task_dict=task_dict, + future_queue=future_queue, ) - else: - _execute_task_with_cache( + elif cache_directory is not None and interface is not None: + result_flag = _execute_task_with_cache( interface=interface, task_dict=task_dict, future_queue=future_queue, cache_directory=cache_directory, cache_key=cache_key, ) + else: + raise ValueError() + if not result_flag: + if queue_join_on_shutdown: + future_queue.join() + break def _execute_task_without_cache( interface: SocketInterface, task_dict: dict, future_queue: queue.Queue -): +) -> bool: """ Execute the task in the task_dict by communicating it via the interface. @@ -114,13 +125,14 @@ def _execute_task_without_cache( _reset_task_dict( future_obj=f, future_queue=future_queue, task_dict=task_dict ) - interface.restart() + return interface.restart() else: interface.shutdown(wait=True) _task_done(future_queue=future_queue) f.set_exception(exception=thread_exception) else: _task_done(future_queue=future_queue) + return True def _execute_task_with_cache( @@ -129,7 +141,7 @@ def _execute_task_with_cache( future_queue: queue.Queue, cache_directory: str, cache_key: Optional[str] = None, -): +) -> bool: """ Execute the task in the task_dict by communicating it via the interface using the cache in the cache directory. @@ -167,7 +179,7 @@ def _execute_task_with_cache( _reset_task_dict( future_obj=f, future_queue=future_queue, task_dict=task_dict ) - interface.restart() + return interface.restart() else: interface.shutdown(wait=True) _task_done(future_queue=future_queue) @@ -180,6 +192,7 @@ def _execute_task_with_cache( future = task_dict["future"] future.set_result(result) _task_done(future_queue=future_queue) + return True def _task_done(future_queue: queue.Queue): From 79842e6efc08ef0ea80499725a4806be49de3333 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 12:26:22 +0200 Subject: [PATCH 46/83] merge changes --- executorlib/task_scheduler/interactive/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 69c2d2e9..404fde91 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -81,7 +81,7 @@ def execute_multiple_tasks( break elif "fn" in task_dict and "future" in task_dict: if interface is not None: - result_flag =_execute_task_dict( + result_flag = _execute_task_dict( task_dict=task_dict, interface=interface, cache_directory=cache_directory, From 8551eda9d92b6b97625d27ed3d5732bd4dd36e67 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 12:27:06 +0200 Subject: [PATCH 47/83] fix docstring --- executorlib/task_scheduler/interactive/shared.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 404fde91..16fd9002 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -113,7 +113,8 @@ def execute_single_task( Execute a single tasks in parallel using the message passing interface (MPI). Args: - future_queue (queue.Queue): task queue of dictionary objects which are submitted to the parallel process + task_dict (dict): task submitted to the executor as dictionary. This dictionary has the following keys + {"fn": Callable, "args": (), "kwargs": {}, "resource_dict": {}} cores (int): defines the total number of MPI ranks to use spawner (BaseSpawner): Spawner to start process on selected compute resources hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the @@ -123,11 +124,9 @@ def execute_single_task( points to the same address as localhost. Still MacOS >= 12 seems to disable this look up for security reasons. So on MacOS it is required to set this option to true - init_function (Callable): optional function to preset arguments for functions which are submitted later cache_directory (str, optional): The directory to store cache files. Defaults to "executorlib_cache". cache_key (str, optional): By default the cache_key is generated based on the function hash, this can be overwritten by setting the cache_key. - queue_join_on_shutdown (bool): Join communication queue when thread is closed. Defaults to True. log_obj_size (bool): Enable debug mode which reports the size of the communicated objects. error_log_file (str): Name of the error log file to use for storing exceptions raised by the Python functions submitted to the Executor. From a969dd9929150d955282e12907e1d10c62e91642 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 12:33:01 +0200 Subject: [PATCH 48/83] fixes --- .../task_scheduler/interactive/shared.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 16fd9002..bea60460 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -88,7 +88,15 @@ def execute_multiple_tasks( cache_key=cache_key, error_log_file=error_log_file, ) - _task_done(future_queue=future_queue) + if not result_flag: + _task_done(future_queue=future_queue) + f = task_dict.pop("future") + _reset_task_dict( + future_obj=f, future_queue=future_queue, task_dict=task_dict + ) + interface.restart() + else: + _task_done(future_queue=future_queue) else: raise ValueError() if not result_flag: @@ -198,10 +206,7 @@ def _execute_task_without_cache(interface: SocketInterface, task_dict: dict) -> f.set_result(interface.send_and_receive_dict(input_dict=task_dict)) except Exception as thread_exception: if isinstance(thread_exception, ExecutorlibSocketError): - _reset_task_dict( - future_obj=f, future_queue=future_queue, task_dict=task_dict - ) - return interface.restart() + return False else: interface.shutdown(wait=True) f.set_exception(exception=thread_exception) @@ -247,10 +252,7 @@ def _execute_task_with_cache( f.set_result(result) except Exception as thread_exception: if isinstance(thread_exception, ExecutorlibSocketError): - _reset_task_dict( - future_obj=f, future_queue=future_queue, task_dict=task_dict - ) - return interface.restart() + return False else: interface.shutdown(wait=True) f.set_exception(exception=thread_exception) From 928465b6099993115e8fcca95e0535524b5863ac Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 12:37:32 +0200 Subject: [PATCH 49/83] fix types --- .../task_scheduler/interactive/shared.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index bea60460..91b5953e 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -80,25 +80,23 @@ def execute_multiple_tasks( future_queue.join() break elif "fn" in task_dict and "future" in task_dict: - if interface is not None: - result_flag = _execute_task_dict( - task_dict=task_dict, - interface=interface, - cache_directory=cache_directory, - cache_key=cache_key, - error_log_file=error_log_file, + result_flag = _execute_task_dict( + task_dict=task_dict, + interface=interface, + cache_directory=cache_directory, + cache_key=cache_key, + error_log_file=error_log_file, + ) + if not result_flag: + _task_done(future_queue=future_queue) + f = task_dict.pop("future") + _reset_task_dict( + future_obj=f, future_queue=future_queue, task_dict=task_dict ) - if not result_flag: - _task_done(future_queue=future_queue) - f = task_dict.pop("future") - _reset_task_dict( - future_obj=f, future_queue=future_queue, task_dict=task_dict - ) + if interface is not None: interface.restart() - else: - _task_done(future_queue=future_queue) else: - raise ValueError() + _task_done(future_queue=future_queue) if not result_flag: if queue_join_on_shutdown: future_queue.join() @@ -160,7 +158,7 @@ def execute_single_task( def _execute_task_dict( task_dict: dict, - interface: SocketInterface, + interface: Optional[SocketInterface] = None, cache_directory: Optional[str] = None, cache_key: Optional[str] = None, error_log_file: Optional[str] = None, @@ -180,15 +178,17 @@ def _execute_task_dict( """ if error_log_file is not None: task_dict["error_log_file"] = error_log_file - if cache_directory is None: + if cache_directory is None and interface is not None: return _execute_task_without_cache(interface=interface, task_dict=task_dict) - else: + elif cache_directory is not None and interface is not None: return _execute_task_with_cache( interface=interface, task_dict=task_dict, cache_directory=cache_directory, cache_key=cache_key, ) + else: + raise ValueError() def _execute_task_without_cache(interface: SocketInterface, task_dict: dict) -> bool: From 3e36a5a1b2071186265e7f1aa1a603c4f7405461 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 13:29:05 +0200 Subject: [PATCH 50/83] consistent naming scheme --- executorlib/executor/flux.py | 2 +- executorlib/executor/slurm.py | 2 +- .../interactive/{pysqaspawner.py => spawner_pysqa.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename executorlib/task_scheduler/interactive/{pysqaspawner.py => spawner_pysqa.py} (100%) diff --git a/executorlib/executor/flux.py b/executorlib/executor/flux.py index e8952047..4d7c2826 100644 --- a/executorlib/executor/flux.py +++ b/executorlib/executor/flux.py @@ -358,7 +358,7 @@ def __init__( import pysqa # noqa if block_allocation: - from executorlib.task_scheduler.interactive.pysqaspawner import ( + from executorlib.task_scheduler.interactive.spawner_pysqa import ( create_pysqa_block_allocation_scheduler, ) diff --git a/executorlib/executor/slurm.py b/executorlib/executor/slurm.py index 2624dd91..f0bd3342 100644 --- a/executorlib/executor/slurm.py +++ b/executorlib/executor/slurm.py @@ -166,7 +166,7 @@ def __init__( import pysqa # noqa if block_allocation: - from executorlib.task_scheduler.interactive.pysqaspawner import ( + from executorlib.task_scheduler.interactive.spawner_pysqa import ( create_pysqa_block_allocation_scheduler, ) diff --git a/executorlib/task_scheduler/interactive/pysqaspawner.py b/executorlib/task_scheduler/interactive/spawner_pysqa.py similarity index 100% rename from executorlib/task_scheduler/interactive/pysqaspawner.py rename to executorlib/task_scheduler/interactive/spawner_pysqa.py From ad6ca173f8910ea8732ab027fe2bbff0ff43400e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 31 Aug 2025 15:56:13 +0000 Subject: [PATCH 51/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/blockallocation.py | 6 +++++- executorlib/task_scheduler/interactive/shared.py | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index b334b528..2142ea2f 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -12,7 +12,11 @@ from executorlib.standalone.interactive.spawner import BaseSpawner, MpiExecSpawner from executorlib.standalone.queue import cancel_items_in_queue from executorlib.task_scheduler.base import TaskSchedulerBase -from executorlib.task_scheduler.interactive.shared import execute_task_dict, task_done, reset_task_dict +from executorlib.task_scheduler.interactive.shared import ( + execute_task_dict, + reset_task_dict, + task_done, +) _task_schedulder_dict: dict = {} diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 38994e0d..4d61de7a 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -6,7 +6,10 @@ from concurrent.futures._base import PENDING from typing import Optional -from executorlib.standalone.interactive.communication import ExecutorlibSocketError, SocketInterface +from executorlib.standalone.interactive.communication import ( + ExecutorlibSocketError, + SocketInterface, +) from executorlib.standalone.serialize import serialize_funct From f1d0afffd13bc57ecafe8febed1bda50b1457e53 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 18:05:05 +0200 Subject: [PATCH 52/83] remove duplicated task_done() call --- executorlib/task_scheduler/interactive/blockallocation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index b334b528..1a3bdde6 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -261,7 +261,6 @@ def _execute_multiple_tasks( cache_key=cache_key, error_log_file=error_log_file, ) - task_done(future_queue=future_queue) if not result_flag: task_done(future_queue=future_queue) f = task_dict.pop("future") From 88d0cd60363fe00a01cf72e5940e08ae0b914688 Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Sun, 31 Aug 2025 18:33:29 +0200 Subject: [PATCH 53/83] fixes --- executorlib/task_scheduler/interactive/blockallocation.py | 5 ++++- executorlib/task_scheduler/interactive/shared.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index 1e969603..e02cf741 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -234,6 +234,7 @@ def _execute_multiple_tasks( worker_id (int): Communicate the worker which ID was assigned to it for future reference and resource distribution. """ + # The interface becomes None when the job was cancelled before computing resources were allocated. interface = interface_bootup( command_lst=get_interactive_execute_command( cores=cores, @@ -259,7 +260,7 @@ def _execute_multiple_tasks( break elif "fn" in task_dict and "future" in task_dict: result_flag = execute_task_dict( - task_dict=task_dict, + task_dict=task_dict.copy(), # this copy is expensive and should be fixed interface=interface, cache_directory=cache_directory, cache_key=cache_key, @@ -273,5 +274,7 @@ def _execute_multiple_tasks( ) if interface is not None: interface.restart() + else: + break else: task_done(future_queue=future_queue) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 4d61de7a..80fa0acc 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -45,7 +45,7 @@ def execute_task_dict( cache_key=cache_key, ) else: - raise ValueError() + return False def task_done(future_queue: queue.Queue): From 68b14082e6b0967004805f20e9bb455aa4fd1e4a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 31 Aug 2025 16:33:38 +0000 Subject: [PATCH 54/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/blockallocation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index e02cf741..f71afa41 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -234,7 +234,7 @@ def _execute_multiple_tasks( worker_id (int): Communicate the worker which ID was assigned to it for future reference and resource distribution. """ - # The interface becomes None when the job was cancelled before computing resources were allocated. + # The interface becomes None when the job was cancelled before computing resources were allocated. interface = interface_bootup( command_lst=get_interactive_execute_command( cores=cores, From 14580797e1fa8a3c6b9c159d01de74d5dfa39c3b Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Sun, 31 Aug 2025 18:41:06 +0200 Subject: [PATCH 55/83] cancel items in queue --- executorlib/task_scheduler/interactive/blockallocation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index f71afa41..0c22fc15 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -172,12 +172,13 @@ def shutdown(self, wait: bool = True, *, cancel_futures: bool = False): cancel_items_in_queue(que=self._future_queue) self._shutdown_flag = True if isinstance(self._process, list): - _task_schedulder_dict[self._self_id] = True + _task_schedulder_dict[self._self_id] = True # This is a hard shutdown for _ in range(len(self._process)): self._future_queue.put({"shutdown": True, "wait": wait}) if wait: for process in self._process: process.join() + cancel_items_in_queue(que=self._future_queue) self._future_queue.join() self._process = None self._future_queue = None From 095385f6c7fb0428de0fc7a63a738a5f556e2698 Mon Sep 17 00:00:00 2001 From: jan-janssen Date: Sun, 31 Aug 2025 19:00:45 +0200 Subject: [PATCH 56/83] fixes --- .../interactive/blockallocation.py | 5 ++- .../task_scheduler/interactive/onetoone.py | 8 +++- .../task_scheduler/interactive/shared.py | 42 +++++++++++++------ 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index a3d5c7c8..929a5d45 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -260,8 +260,10 @@ def _execute_multiple_tasks( future_queue.join() break elif "fn" in task_dict and "future" in task_dict: + f = task_dict.pop("future") result_flag = execute_task_dict( - task_dict=task_dict.copy(), # this copy is expensive and should be fixed + future_obj=f, + task_dict=task_dict, interface=interface, cache_directory=cache_directory, cache_key=cache_key, @@ -269,7 +271,6 @@ def _execute_multiple_tasks( ) if not result_flag: task_done(future_queue=future_queue) - f = task_dict.pop("future") reset_task_dict( future_obj=f, future_queue=future_queue, task_dict=task_dict ) diff --git a/executorlib/task_scheduler/interactive/onetoone.py b/executorlib/task_scheduler/interactive/onetoone.py index b3ffddbd..dbdd2c82 100644 --- a/executorlib/task_scheduler/interactive/onetoone.py +++ b/executorlib/task_scheduler/interactive/onetoone.py @@ -1,6 +1,7 @@ import queue from threading import Thread from typing import Optional +from concurrent.futures import Future from executorlib.standalone.command import get_interactive_execute_command from executorlib.standalone.interactive.communication import interface_bootup @@ -186,6 +187,7 @@ def _wrap_execute_task_in_separate_process( dictionary containing the future objects and the number of cores they require """ resource_dict = task_dict.pop("resource_dict").copy() + f = task_dict.pop("future") if "cores" not in resource_dict or ( resource_dict["cores"] == 1 and executor_kwargs["cores"] >= 1 ): @@ -197,12 +199,13 @@ def _wrap_execute_task_in_separate_process( max_cores=max_cores, max_workers=max_workers, ) - active_task_dict[task_dict["future"]] = slots_required + active_task_dict[f] = slots_required task_kwargs = executor_kwargs.copy() task_kwargs.update(resource_dict) task_kwargs.update( { "task_dict": task_dict, + "future_obj": f, "spawner": spawner, "hostname_localhost": hostname_localhost, } @@ -217,6 +220,7 @@ def _wrap_execute_task_in_separate_process( def _execute_task_in_thread( task_dict: dict, + future_obj: Future, cores: int = 1, spawner: type[BaseSpawner] = MpiExecSpawner, hostname_localhost: Optional[bool] = None, @@ -233,6 +237,7 @@ def _execute_task_in_thread( Args: task_dict (dict): task submitted to the executor as dictionary. This dictionary has the following keys {"fn": Callable, "args": (), "kwargs": {}, "resource_dict": {}} + future_obj (Future): A Future representing the given call. cores (int): defines the total number of MPI ranks to use spawner (BaseSpawner): Spawner to start process on selected compute resources hostname_localhost (boolean): use localhost instead of the hostname to establish the zmq connection. In the @@ -253,6 +258,7 @@ def _execute_task_in_thread( """ execute_task_dict( task_dict=task_dict, + future_obj=future_obj, interface=interface_bootup( command_lst=get_interactive_execute_command( cores=cores, diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 80fa0acc..cf4de6ff 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -15,6 +15,7 @@ def execute_task_dict( task_dict: dict, + future_obj: Future, interface: Optional[SocketInterface] = None, cache_directory: Optional[str] = None, cache_key: Optional[str] = None, @@ -26,6 +27,7 @@ def execute_task_dict( Args: task_dict (dict): task submitted to the executor as dictionary. This dictionary has the following keys {"fn": Callable, "args": (), "kwargs": {}, "resource_dict": {}} + future_obj (Future): A Future representing the given call. interface (SocketInterface): socket interface for zmq communication cache_directory (str, optional): The directory to store cache files. Defaults to "executorlib_cache". cache_key (str, optional): By default the cache_key is generated based on the function hash, this can be @@ -36,30 +38,46 @@ def execute_task_dict( if error_log_file is not None: task_dict["error_log_file"] = error_log_file if cache_directory is None and interface is not None: - return _execute_task_without_cache(interface=interface, task_dict=task_dict) + return _execute_task_without_cache(interface=interface, task_dict=task_dict, future_obj=future_obj) elif cache_directory is not None and interface is not None: return _execute_task_with_cache( interface=interface, task_dict=task_dict, cache_directory=cache_directory, cache_key=cache_key, + future_obj=future_obj, ) else: return False def task_done(future_queue: queue.Queue): + """ + Mark the current task as done in the current queue. + + Args: + future_queue (queue): Queue of task dictionaries waiting for execution. + """ with contextlib.suppress(ValueError): future_queue.task_done() def reset_task_dict(future_obj: Future, future_queue: queue.Queue, task_dict: dict): + """ + Reset the task dictionary for resubmission to the queue. + + Args: + future_obj (Future): A Future representing the given call. + future_queue (queue): Queue of task dictionaries waiting for execution. + task_dict (dict): task submitted to the executor as dictionary. This dictionary has the following keys + {"fn": Callable, "args": (), "kwargs": {}, "resource_dict": {}} + """ future_obj._state = PENDING _task_done(future_queue=future_queue) future_queue.put(task_dict | {"future": future_obj}) -def _execute_task_without_cache(interface: SocketInterface, task_dict: dict) -> bool: +def _execute_task_without_cache(interface: SocketInterface, task_dict: dict, future_obj: Future) -> bool: """ Execute the task in the task_dict by communicating it via the interface. @@ -67,23 +85,24 @@ def _execute_task_without_cache(interface: SocketInterface, task_dict: dict) -> interface (SocketInterface): socket interface for zmq communication task_dict (dict): task submitted to the executor as dictionary. This dictionary has the following keys {"fn": Callable, "args": (), "kwargs": {}, "resource_dict": {}} + future_obj (Future): A Future representing the given call. """ - f = task_dict.pop("future") - if not f.done() and f.set_running_or_notify_cancel(): + if not future_obj.done() and future_obj.set_running_or_notify_cancel(): try: - f.set_result(interface.send_and_receive_dict(input_dict=task_dict)) + future_obj.set_result(interface.send_and_receive_dict(input_dict=task_dict)) except Exception as thread_exception: if isinstance(thread_exception, ExecutorlibSocketError): return False else: interface.shutdown(wait=True) - f.set_exception(exception=thread_exception) + future_obj.set_exception(exception=thread_exception) return True def _execute_task_with_cache( interface: SocketInterface, task_dict: dict, + future_obj: Future, cache_directory: str, cache_key: Optional[str] = None, ) -> bool: @@ -94,6 +113,7 @@ def _execute_task_with_cache( interface (SocketInterface): socket interface for zmq communication task_dict (dict): task submitted to the executor as dictionary. This dictionary has the following keys {"fn": Callable, "args": (), "kwargs": {}, "resource_dict": {}} + future_obj (Future): A Future representing the given call. cache_directory (str): The directory to store cache files. cache_key (str, optional): By default the cache_key is generated based on the function hash, this can be overwritten by setting the cache_key. @@ -109,25 +129,23 @@ def _execute_task_with_cache( ) file_name = os.path.abspath(os.path.join(cache_directory, task_key + "_o.h5")) if file_name not in get_cache_files(cache_directory=cache_directory): - f = task_dict.pop("future") - if f.set_running_or_notify_cancel(): + if future_obj.set_running_or_notify_cancel(): try: time_start = time.time() result = interface.send_and_receive_dict(input_dict=task_dict) data_dict["output"] = result data_dict["runtime"] = time.time() - time_start dump(file_name=file_name, data_dict=data_dict) - f.set_result(result) + future_obj.set_result(result) except Exception as thread_exception: if isinstance(thread_exception, ExecutorlibSocketError): return False else: interface.shutdown(wait=True) - f.set_exception(exception=thread_exception) + future_obj.set_exception(exception=thread_exception) else: _, _, result = get_output(file_name=file_name) - future = task_dict["future"] - future.set_result(result) + future_obj.set_result(result) return True From ae7ac003f1e2c5442b95cdecafa1224e711e9383 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 31 Aug 2025 17:00:54 +0000 Subject: [PATCH 57/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/onetoone.py | 2 +- executorlib/task_scheduler/interactive/shared.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/executorlib/task_scheduler/interactive/onetoone.py b/executorlib/task_scheduler/interactive/onetoone.py index dbdd2c82..c6f678cd 100644 --- a/executorlib/task_scheduler/interactive/onetoone.py +++ b/executorlib/task_scheduler/interactive/onetoone.py @@ -1,7 +1,7 @@ import queue +from concurrent.futures import Future from threading import Thread from typing import Optional -from concurrent.futures import Future from executorlib.standalone.command import get_interactive_execute_command from executorlib.standalone.interactive.communication import interface_bootup diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index cf4de6ff..74032551 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -38,7 +38,9 @@ def execute_task_dict( if error_log_file is not None: task_dict["error_log_file"] = error_log_file if cache_directory is None and interface is not None: - return _execute_task_without_cache(interface=interface, task_dict=task_dict, future_obj=future_obj) + return _execute_task_without_cache( + interface=interface, task_dict=task_dict, future_obj=future_obj + ) elif cache_directory is not None and interface is not None: return _execute_task_with_cache( interface=interface, @@ -53,8 +55,8 @@ def execute_task_dict( def task_done(future_queue: queue.Queue): """ - Mark the current task as done in the current queue. - + Mark the current task as done in the current queue. + Args: future_queue (queue): Queue of task dictionaries waiting for execution. """ @@ -77,7 +79,9 @@ def reset_task_dict(future_obj: Future, future_queue: queue.Queue, task_dict: di future_queue.put(task_dict | {"future": future_obj}) -def _execute_task_without_cache(interface: SocketInterface, task_dict: dict, future_obj: Future) -> bool: +def _execute_task_without_cache( + interface: SocketInterface, task_dict: dict, future_obj: Future +) -> bool: """ Execute the task in the task_dict by communicating it via the interface. From c7f9eaf50392bb48470e5d6fc6bfb5021ae8ac89 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 20:17:15 +0200 Subject: [PATCH 58/83] fix return --- executorlib/task_scheduler/interactive/shared.py | 1 + 1 file changed, 1 insertion(+) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 2ba6126b..3cf20362 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -102,6 +102,7 @@ def _execute_task_without_cache( else: interface.shutdown(wait=True) future_obj.set_exception(exception=thread_exception) + return True def _execute_task_with_cache( From d4babd8d651faf8f2597620b5fd66bbfe54f4ac1 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 20:21:50 +0200 Subject: [PATCH 59/83] fix duplicated arguments --- executorlib/task_scheduler/interactive/onetoone.py | 1 - 1 file changed, 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/onetoone.py b/executorlib/task_scheduler/interactive/onetoone.py index 74182203..c6f678cd 100644 --- a/executorlib/task_scheduler/interactive/onetoone.py +++ b/executorlib/task_scheduler/interactive/onetoone.py @@ -208,7 +208,6 @@ def _wrap_execute_task_in_separate_process( "future_obj": f, "spawner": spawner, "hostname_localhost": hostname_localhost, - "future_obj": f, } ) process = Thread( From 72da39d43c523e7edc5f3ce92952f974539eeb90 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 20:44:48 +0200 Subject: [PATCH 60/83] resort --- executorlib/task_scheduler/interactive/onetoone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/onetoone.py b/executorlib/task_scheduler/interactive/onetoone.py index c6f678cd..3b631565 100644 --- a/executorlib/task_scheduler/interactive/onetoone.py +++ b/executorlib/task_scheduler/interactive/onetoone.py @@ -205,9 +205,9 @@ def _wrap_execute_task_in_separate_process( task_kwargs.update( { "task_dict": task_dict, - "future_obj": f, "spawner": spawner, "hostname_localhost": hostname_localhost, + "future_obj": f, } ) process = Thread( From 60e2deeed161ca5a5cfd8d72eacd26bea0fefeea Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 21:18:39 +0200 Subject: [PATCH 61/83] remove unused statement --- executorlib/task_scheduler/interactive/blockallocation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index 929a5d45..756068ce 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -170,7 +170,6 @@ def shutdown(self, wait: bool = True, *, cancel_futures: bool = False): if self._future_queue is not None: if cancel_futures: cancel_items_in_queue(que=self._future_queue) - self._shutdown_flag = True if isinstance(self._process, list): _task_schedulder_dict[self._self_id] = True # This is a hard shutdown for _ in range(len(self._process)): From dbf3e65224a681f01b213b3833f49b203b1d5ba6 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 21:55:49 +0200 Subject: [PATCH 62/83] rename variable --- executorlib/task_scheduler/interactive/blockallocation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index 756068ce..fdb1e5c6 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -18,7 +18,7 @@ task_done, ) -_task_schedulder_dict: dict = {} +_interrupt_interface_bootup_dict: dict = {} class BlockAllocationTaskScheduler(TaskSchedulerBase): @@ -71,7 +71,7 @@ def __init__( self._max_workers = max_workers self_id = id(self) self._self_id = self_id - _task_schedulder_dict[self._self_id] = False + _interrupt_interface_bootup_dict[self._self_id] = False self._set_process( process=[ Thread( @@ -79,7 +79,7 @@ def __init__( kwargs=executor_kwargs | { "worker_id": worker_id, - "stop_function": lambda: _task_schedulder_dict[self_id], + "stop_function": lambda: _interrupt_interface_bootup_dict[self_id], }, ) for worker_id in range(self._max_workers) @@ -171,7 +171,7 @@ def shutdown(self, wait: bool = True, *, cancel_futures: bool = False): if cancel_futures: cancel_items_in_queue(que=self._future_queue) if isinstance(self._process, list): - _task_schedulder_dict[self._self_id] = True # This is a hard shutdown + _interrupt_interface_bootup_dict[self._self_id] = True for _ in range(len(self._process)): self._future_queue.put({"shutdown": True, "wait": wait}) if wait: From 6b73ab7b928299aac05f67bdbe94bd9ac7b887f2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 31 Aug 2025 19:55:57 +0000 Subject: [PATCH 63/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/blockallocation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index fdb1e5c6..bf74ff5b 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -79,7 +79,9 @@ def __init__( kwargs=executor_kwargs | { "worker_id": worker_id, - "stop_function": lambda: _interrupt_interface_bootup_dict[self_id], + "stop_function": lambda: _interrupt_interface_bootup_dict[ + self_id + ], }, ) for worker_id in range(self._max_workers) From fd9b630c8d0a0bbe5e4cd7235c7fed0687b3f028 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 22:28:47 +0200 Subject: [PATCH 64/83] Update shared.py --- executorlib/task_scheduler/interactive/shared.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/executorlib/task_scheduler/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py index 3cf20362..b81a1c93 100644 --- a/executorlib/task_scheduler/interactive/shared.py +++ b/executorlib/task_scheduler/interactive/shared.py @@ -152,8 +152,3 @@ def _execute_task_with_cache( _, _, result = get_output(file_name=file_name) future_obj.set_result(result) return True - - -def _task_done(future_queue: queue.Queue): - with contextlib.suppress(ValueError): - future_queue.task_done() From b60d3a205957530a03e08985613026a3e5952037 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Sun, 31 Aug 2025 22:30:22 +0200 Subject: [PATCH 65/83] Update blockallocation.py --- .../task_scheduler/interactive/blockallocation.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index bf74ff5b..54aaf4cc 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -18,7 +18,7 @@ task_done, ) -_interrupt_interface_bootup_dict: dict = {} +_interrupt_bootup_dict: dict = {} class BlockAllocationTaskScheduler(TaskSchedulerBase): @@ -71,17 +71,14 @@ def __init__( self._max_workers = max_workers self_id = id(self) self._self_id = self_id - _interrupt_interface_bootup_dict[self._self_id] = False + _interrupt_bootup_dict[self._self_id] = False self._set_process( process=[ Thread( target=_execute_multiple_tasks, - kwargs=executor_kwargs - | { + kwargs=executor_kwargs | { "worker_id": worker_id, - "stop_function": lambda: _interrupt_interface_bootup_dict[ - self_id - ], + "stop_function": lambda: _interrupt_bootup_dict[self_id], }, ) for worker_id in range(self._max_workers) @@ -173,7 +170,7 @@ def shutdown(self, wait: bool = True, *, cancel_futures: bool = False): if cancel_futures: cancel_items_in_queue(que=self._future_queue) if isinstance(self._process, list): - _interrupt_interface_bootup_dict[self._self_id] = True + _interrupt_bootup_dict[self._self_id] = True for _ in range(len(self._process)): self._future_queue.put({"shutdown": True, "wait": wait}) if wait: From c27713f2b0de95cbbc6a5f3feea380cebe29d4b2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 31 Aug 2025 20:30:27 +0000 Subject: [PATCH 66/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/blockallocation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py index 54aaf4cc..5338deba 100644 --- a/executorlib/task_scheduler/interactive/blockallocation.py +++ b/executorlib/task_scheduler/interactive/blockallocation.py @@ -76,7 +76,8 @@ def __init__( process=[ Thread( target=_execute_multiple_tasks, - kwargs=executor_kwargs | { + kwargs=executor_kwargs + | { "worker_id": worker_id, "stop_function": lambda: _interrupt_bootup_dict[self_id], }, From a647574878a99f02e321b2a59f594b5e1bc1e273 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 09:46:10 +0200 Subject: [PATCH 67/83] Add docstrings --- .../interactive/spawner_pysqa.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/executorlib/task_scheduler/interactive/spawner_pysqa.py b/executorlib/task_scheduler/interactive/spawner_pysqa.py index 31f57c8b..c2f60d70 100644 --- a/executorlib/task_scheduler/interactive/spawner_pysqa.py +++ b/executorlib/task_scheduler/interactive/spawner_pysqa.py @@ -34,9 +34,17 @@ def __init__( Args: cwd (str, optional): The current working directory. Defaults to None. - cores (int, optional): The number of cores to use. Defaults to 1. - threads_per_core (int, optional): The number of threads per core. Defaults to 1. - openmpi_oversubscribe (bool, optional): Whether to oversubscribe the cores. Defaults to False. + cores (int): The number of cores to use. Defaults to 1. + threads_per_core (int): The number of threads per core. Defaults to 1. + gpus_per_core (int): number of GPUs per worker - defaults to 0 + num_nodes (int, optional): The number of compute nodes to use for executing the task. Defaults to None. + exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing compute notes. Defaults + to False. + openmpi_oversubscribe (bool): Whether to oversubscribe the cores. Defaults to False. + slurm_cmd_args (list, optional): Additional command line arguments for the srun call (SLURM only) + pmi_mode (str, optional): PMI interface to use (OpenMPI v5 requires pmix) default is None + config_directory (str, optional): path to the pysqa config directory (only for pysqa based backend). + backend (str): name of the backend used to spawn tasks. """ super().__init__( cwd=cwd, @@ -65,6 +73,10 @@ def bootup( Args: command_lst (list[str]): The command list to execute. + stop_function (Callable): Function to stop the interface. + + Returns: + bool: Whether the interface was successfully started. """ self._queue_adapter = QueueAdapter( directory=self._config_directory, From 4e097b6cfdff14da6d6d940a5d4de47f890cfcbe Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 09:46:25 +0200 Subject: [PATCH 68/83] test for generate_command() --- tests/test_standalone_interactive_backend.py | 42 ++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/test_standalone_interactive_backend.py b/tests/test_standalone_interactive_backend.py index f4b46e30..efdaf6af 100644 --- a/tests/test_standalone_interactive_backend.py +++ b/tests/test_standalone_interactive_backend.py @@ -6,6 +6,13 @@ from executorlib.standalone.interactive.spawner import MpiExecSpawner from executorlib.task_scheduler.interactive.spawner_slurm import SrunSpawner +try: + from executorlib.task_scheduler.interactive.spawner_pysqa import PysqaSpawner + + skip_pysqa_test = False +except ImportError: + skip_pysqa_test = True + class TestParser(unittest.TestCase): def test_command_local(self): @@ -121,3 +128,38 @@ def test_command_slurm_user_command(self): ), ) self.assertEqual(result_dict, parse_arguments(command_lst)) + + @unittest.skipIf(skip_pysqa_test, "pysqa is not installed, so the pysqa tests are skipped.") + def test_command_pysqa(self): + interface_slurm = PysqaSpawner(backend="slurm", cores=2, pmi_mode="pmix", num_nodes=2, threads_per_core=2, gpus_per_core=1, exclusive=True, openmpi_oversubscribe=True, slurm_cmd_args=["test"]) + output = ['srun', '-n', '2', '--mpi=pmix', '-N', '2', '--cpus-per-task=2', '--gpus-per-task=1', '--exact', '--oversubscribe', 'test'] + self.assertEqual(interface_slurm.generate_command(command_lst=[]), output) + + interface_flux = PysqaSpawner(backend="flux", cores=2, pmi_mode="pmix") + output = ['flux', 'run', '-n', '2', '-o', 'pmi=pmix'] + self.assertEqual(interface_flux.generate_command(command_lst=[]), output) + + interface_flux = PysqaSpawner(backend="flux", cores=2, pmi_mode="pmix", num_nodes=2) + with self.assertRaises(ValueError): + interface_flux.generate_command(command_lst=[]) + + interface_flux = PysqaSpawner(backend="flux", cores=2, pmi_mode="pmix", threads_per_core=2) + with self.assertRaises(ValueError): + interface_flux.generate_command(command_lst=[]) + + interface_flux = PysqaSpawner(backend="flux", cores=2, pmi_mode="pmix", gpus_per_core=1) + with self.assertRaises(ValueError): + interface_flux.generate_command(command_lst=[]) + + interface_flux = PysqaSpawner(backend="flux", cores=2, pmi_mode="pmix", exclusive=True) + with self.assertRaises(ValueError): + interface_flux.generate_command(command_lst=[]) + + interface_flux = PysqaSpawner(backend="flux", cores=2, pmi_mode="pmix", openmpi_oversubscribe=True) + with self.assertRaises(ValueError): + interface_flux.generate_command(command_lst=[]) + + interface_nobackend = PysqaSpawner(cores=2) + with self.assertRaises(ValueError): + interface_nobackend.generate_command(command_lst=[]) + \ No newline at end of file From 1c0148a1517bb125910f627a8a25b82767a74c9c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 8 Sep 2025 07:46:34 +0000 Subject: [PATCH 69/83] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- executorlib/task_scheduler/interactive/spawner_pysqa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/spawner_pysqa.py b/executorlib/task_scheduler/interactive/spawner_pysqa.py index c2f60d70..8fb2ccd6 100644 --- a/executorlib/task_scheduler/interactive/spawner_pysqa.py +++ b/executorlib/task_scheduler/interactive/spawner_pysqa.py @@ -38,7 +38,7 @@ def __init__( threads_per_core (int): The number of threads per core. Defaults to 1. gpus_per_core (int): number of GPUs per worker - defaults to 0 num_nodes (int, optional): The number of compute nodes to use for executing the task. Defaults to None. - exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing compute notes. Defaults + exclusive (bool): Whether to exclusively reserve the compute nodes, or allow sharing compute notes. Defaults to False. openmpi_oversubscribe (bool): Whether to oversubscribe the cores. Defaults to False. slurm_cmd_args (list, optional): Additional command line arguments for the srun call (SLURM only) From 62d48987e7c279badf67e7a6a4d5b55891a3b73f Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 11:36:58 +0200 Subject: [PATCH 70/83] Add more tests --- tests/test_fluxclusterexecutor.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 54b64ff1..8a89faf4 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -14,6 +14,7 @@ from executorlib.standalone.hdf import dump from executorlib.task_scheduler.file.spawner_pysqa import execute_with_pysqa from executorlib.standalone.scheduler import terminate_with_pysqa + from executorlib.task_scheduler.interactive.spawner_pysqa import PysqaSpawner skip_flux_test = "FLUX_URI" not in os.environ pmi = os.environ.get("EXECUTORLIB_PMIX", None) @@ -37,6 +38,10 @@ def mpi_funct(i): return i, size, rank +def stop_function(): + return True + + @unittest.skipIf( skip_flux_test or skip_mpi4py_test, "h5py or mpi4py or flux are not installed, so the h5py, flux and mpi4py tests are skipped.", @@ -161,3 +166,17 @@ def test_terminate_tasks_in_cache(self): def tearDown(self): shutil.rmtree("executorlib_cache", ignore_errors=True) + + +@unittest.skipIf( + skip_flux_test, + "flux is not installed, so the flux tests are skipped.", +) +class TestPysqaSpawner(unittest.TestCase): + def test_pysqa_spawner_sleep(self): + interface_flux = PysqaSpawner(backend="flux", cores=1) + self.assertTrue(interface_flux.bootup(command_lst=["sleep", "1"])) + + def test_pysqa_spawner_stop_function(self): + interface_flux = PysqaSpawner(backend="flux", cores=1) + self.assertFalse(interface_flux.bootup(command_lst=["exit"], stop_function=stop_function)) From dba3b48a147fa8f434e0929fef90814479cf3477 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 11:40:15 +0200 Subject: [PATCH 71/83] smaller tests --- tests/test_fluxclusterexecutor.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 8a89faf4..2a0f39ae 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -38,10 +38,6 @@ def mpi_funct(i): return i, size, rank -def stop_function(): - return True - - @unittest.skipIf( skip_flux_test or skip_mpi4py_test, "h5py or mpi4py or flux are not installed, so the h5py, flux and mpi4py tests are skipped.", @@ -176,7 +172,3 @@ class TestPysqaSpawner(unittest.TestCase): def test_pysqa_spawner_sleep(self): interface_flux = PysqaSpawner(backend="flux", cores=1) self.assertTrue(interface_flux.bootup(command_lst=["sleep", "1"])) - - def test_pysqa_spawner_stop_function(self): - interface_flux = PysqaSpawner(backend="flux", cores=1) - self.assertFalse(interface_flux.bootup(command_lst=["exit"], stop_function=stop_function)) From ad4e45c47711ee983561465eeaeeffa8e2f69488 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 11:42:54 +0200 Subject: [PATCH 72/83] submit a big job --- tests/test_fluxclusterexecutor.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 2a0f39ae..4d6bab66 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -38,6 +38,10 @@ def mpi_funct(i): return i, size, rank +def stop_function(): + return True + + @unittest.skipIf( skip_flux_test or skip_mpi4py_test, "h5py or mpi4py or flux are not installed, so the h5py, flux and mpi4py tests are skipped.", @@ -172,3 +176,7 @@ class TestPysqaSpawner(unittest.TestCase): def test_pysqa_spawner_sleep(self): interface_flux = PysqaSpawner(backend="flux", cores=1) self.assertTrue(interface_flux.bootup(command_lst=["sleep", "1"])) + + def test_pysqa_spawner_big(self): + interface_flux = PysqaSpawner(backend="flux", cores=100) + self.assertFalse(interface_flux.bootup(command_lst=["sleep", "1"], stop_function=stop_function)) From 624856182d1cdffd4dc26cd10309b32e34d0efbf Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 11:59:00 +0200 Subject: [PATCH 73/83] extend tests --- tests/test_fluxclusterexecutor.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index 4d6bab66..c0bc6458 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -176,6 +176,19 @@ class TestPysqaSpawner(unittest.TestCase): def test_pysqa_spawner_sleep(self): interface_flux = PysqaSpawner(backend="flux", cores=1) self.assertTrue(interface_flux.bootup(command_lst=["sleep", "1"])) + self.assertTrue(interface_flux._check_process_helper(command_lst=[])) + self.assertTrue(interface_flux.poll()) + process_id = interface_flux._process + interface_flux.shutdown(wait=True) + interface_flux._process = process_id + self.assertFalse(interface_flux.poll()) + self.assertFalse(interface_flux._check_process_helper(command_lst=["sleep", "1"])) + self.assertTrue(interface_flux.poll()) + + def test_pysqa_spawner_error(self): + interface_flux = PysqaSpawner(backend="flux", cores=1) + with self.assertRaises(RuntimeError): + interface_flux.bootup(command_lst=["--unknonwn", "1"]) def test_pysqa_spawner_big(self): interface_flux = PysqaSpawner(backend="flux", cores=100) From 1e2d21c031727feddaf7d4e04cacbc50ef766ade Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:03:57 +0200 Subject: [PATCH 74/83] no command --- tests/test_fluxclusterexecutor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index c0bc6458..a21b62d5 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -183,12 +183,11 @@ def test_pysqa_spawner_sleep(self): interface_flux._process = process_id self.assertFalse(interface_flux.poll()) self.assertFalse(interface_flux._check_process_helper(command_lst=["sleep", "1"])) - self.assertTrue(interface_flux.poll()) def test_pysqa_spawner_error(self): interface_flux = PysqaSpawner(backend="flux", cores=1) with self.assertRaises(RuntimeError): - interface_flux.bootup(command_lst=["--unknonwn", "1"]) + interface_flux.bootup(command_lst=[]) def test_pysqa_spawner_big(self): interface_flux = PysqaSpawner(backend="flux", cores=100) From 10db91ce09372d65ba3d4edd110646e5ad9c6708 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:07:51 +0200 Subject: [PATCH 75/83] remove error test --- tests/test_fluxclusterexecutor.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/test_fluxclusterexecutor.py b/tests/test_fluxclusterexecutor.py index a21b62d5..04680c34 100644 --- a/tests/test_fluxclusterexecutor.py +++ b/tests/test_fluxclusterexecutor.py @@ -184,11 +184,6 @@ def test_pysqa_spawner_sleep(self): self.assertFalse(interface_flux.poll()) self.assertFalse(interface_flux._check_process_helper(command_lst=["sleep", "1"])) - def test_pysqa_spawner_error(self): - interface_flux = PysqaSpawner(backend="flux", cores=1) - with self.assertRaises(RuntimeError): - interface_flux.bootup(command_lst=[]) - def test_pysqa_spawner_big(self): interface_flux = PysqaSpawner(backend="flux", cores=100) self.assertFalse(interface_flux.bootup(command_lst=["sleep", "1"], stop_function=stop_function)) From 8daa42c8ad31a3be43da2332d80930462e70f85c Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:16:36 +0200 Subject: [PATCH 76/83] extend tests --- tests/test_standalone_interactive_backend.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_standalone_interactive_backend.py b/tests/test_standalone_interactive_backend.py index efdaf6af..b146a526 100644 --- a/tests/test_standalone_interactive_backend.py +++ b/tests/test_standalone_interactive_backend.py @@ -7,7 +7,7 @@ from executorlib.task_scheduler.interactive.spawner_slurm import SrunSpawner try: - from executorlib.task_scheduler.interactive.spawner_pysqa import PysqaSpawner + from executorlib.task_scheduler.interactive.spawner_pysqa import PysqaSpawner, create_pysqa_block_allocation_scheduler skip_pysqa_test = False except ImportError: @@ -135,6 +135,9 @@ def test_command_pysqa(self): output = ['srun', '-n', '2', '--mpi=pmix', '-N', '2', '--cpus-per-task=2', '--gpus-per-task=1', '--exact', '--oversubscribe', 'test'] self.assertEqual(interface_slurm.generate_command(command_lst=[]), output) + with self.assertRaises(RuntimeError): + interface_slurm.bootup(command_lst=["sleep", "1"]) + interface_flux = PysqaSpawner(backend="flux", cores=2, pmi_mode="pmix") output = ['flux', 'run', '-n', '2', '-o', 'pmi=pmix'] self.assertEqual(interface_flux.generate_command(command_lst=[]), output) @@ -162,4 +165,7 @@ def test_command_pysqa(self): interface_nobackend = PysqaSpawner(cores=2) with self.assertRaises(ValueError): interface_nobackend.generate_command(command_lst=[]) + + with self.assertRaises(ValueError): + create_pysqa_block_allocation_scheduler() \ No newline at end of file From 77ae767f341d4d28f8b6cf4e3ed09dd36cc25edd Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:19:32 +0200 Subject: [PATCH 77/83] change error name --- tests/test_standalone_interactive_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_standalone_interactive_backend.py b/tests/test_standalone_interactive_backend.py index b146a526..bdbb9922 100644 --- a/tests/test_standalone_interactive_backend.py +++ b/tests/test_standalone_interactive_backend.py @@ -166,6 +166,6 @@ def test_command_pysqa(self): with self.assertRaises(ValueError): interface_nobackend.generate_command(command_lst=[]) - with self.assertRaises(ValueError): + with self.assertRaises(FileNotFoundError): create_pysqa_block_allocation_scheduler() \ No newline at end of file From 54c5a23efdd4559f8ad5558a9bac2f4175923512 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:21:52 +0200 Subject: [PATCH 78/83] check more errors --- tests/test_standalone_interactive_backend.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_standalone_interactive_backend.py b/tests/test_standalone_interactive_backend.py index bdbb9922..7be32234 100644 --- a/tests/test_standalone_interactive_backend.py +++ b/tests/test_standalone_interactive_backend.py @@ -166,6 +166,9 @@ def test_command_pysqa(self): with self.assertRaises(ValueError): interface_nobackend.generate_command(command_lst=[]) + with self.assertRaises(RuntimeError): + interface_nobackend._check_process_helper(command_lst=[]) + with self.assertRaises(FileNotFoundError): create_pysqa_block_allocation_scheduler() \ No newline at end of file From 676b4ecedc0c4d774cbfd241cb396932e6f19c92 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:27:58 +0200 Subject: [PATCH 79/83] clean up --- tests/test_standalone_interactive_backend.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_standalone_interactive_backend.py b/tests/test_standalone_interactive_backend.py index 7be32234..3279c8fd 100644 --- a/tests/test_standalone_interactive_backend.py +++ b/tests/test_standalone_interactive_backend.py @@ -7,7 +7,7 @@ from executorlib.task_scheduler.interactive.spawner_slurm import SrunSpawner try: - from executorlib.task_scheduler.interactive.spawner_pysqa import PysqaSpawner, create_pysqa_block_allocation_scheduler + from executorlib.task_scheduler.interactive.spawner_pysqa import PysqaSpawner skip_pysqa_test = False except ImportError: @@ -135,7 +135,7 @@ def test_command_pysqa(self): output = ['srun', '-n', '2', '--mpi=pmix', '-N', '2', '--cpus-per-task=2', '--gpus-per-task=1', '--exact', '--oversubscribe', 'test'] self.assertEqual(interface_slurm.generate_command(command_lst=[]), output) - with self.assertRaises(RuntimeError): + with self.assertRaises(FileNotFoundError): interface_slurm.bootup(command_lst=["sleep", "1"]) interface_flux = PysqaSpawner(backend="flux", cores=2, pmi_mode="pmix") @@ -168,7 +168,4 @@ def test_command_pysqa(self): with self.assertRaises(RuntimeError): interface_nobackend._check_process_helper(command_lst=[]) - - with self.assertRaises(FileNotFoundError): - create_pysqa_block_allocation_scheduler() \ No newline at end of file From 9b497d27d51a59ba56894852276c818348d6d56e Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:33:17 +0200 Subject: [PATCH 80/83] extend tests --- tests/test_standalone_interactive_backend.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_standalone_interactive_backend.py b/tests/test_standalone_interactive_backend.py index 3279c8fd..9fa00536 100644 --- a/tests/test_standalone_interactive_backend.py +++ b/tests/test_standalone_interactive_backend.py @@ -7,7 +7,7 @@ from executorlib.task_scheduler.interactive.spawner_slurm import SrunSpawner try: - from executorlib.task_scheduler.interactive.spawner_pysqa import PysqaSpawner + from executorlib.task_scheduler.interactive.spawner_pysqa import PysqaSpawner, create_pysqa_block_allocation_scheduler skip_pysqa_test = False except ImportError: @@ -168,4 +168,9 @@ def test_command_pysqa(self): with self.assertRaises(RuntimeError): interface_nobackend._check_process_helper(command_lst=[]) - \ No newline at end of file + + with self.assertRaises(KeyError): + create_pysqa_block_allocation_scheduler() + + with self.assertRaises(ValueError): + create_pysqa_block_allocation_scheduler(resource_dict={"cwd": "."}) \ No newline at end of file From 6c624bdf95d4c293df002129ad8c757f734951e6 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:38:58 +0200 Subject: [PATCH 81/83] more tests --- .../task_scheduler/interactive/spawner_pysqa.py | 3 ++- tests/test_slurmclusterexecutor.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/executorlib/task_scheduler/interactive/spawner_pysqa.py b/executorlib/task_scheduler/interactive/spawner_pysqa.py index 8fb2ccd6..b91178be 100644 --- a/executorlib/task_scheduler/interactive/spawner_pysqa.py +++ b/executorlib/task_scheduler/interactive/spawner_pysqa.py @@ -223,7 +223,8 @@ def create_pysqa_block_allocation_scheduler( if resource_dict is None: resource_dict = {} cores_per_worker = resource_dict.get("cores", 1) - resource_dict["cwd"] = os.path.abspath(resource_dict["cwd"]) + if "cwd" in resource_dict and resource_dict["cwd"] is not None: + resource_dict["cwd"] = os.path.abspath(resource_dict["cwd"]) if cache_directory is not None: resource_dict["cache_directory"] = os.path.abspath(cache_directory) else: diff --git a/tests/test_slurmclusterexecutor.py b/tests/test_slurmclusterexecutor.py index a26524e7..704b3a91 100644 --- a/tests/test_slurmclusterexecutor.py +++ b/tests/test_slurmclusterexecutor.py @@ -20,6 +20,13 @@ except ImportError: skip_h5py_test = True +try: + import pysqa + + skip_pysqa_test = False +except ImportError: + skip_pysqa_test = True + submission_template = """\ #!/bin/bash #SBATCH --output=time.out @@ -108,3 +115,10 @@ def test_executor_existing_files(self): def tearDown(self): shutil.rmtree("executorlib_cache", ignore_errors=True) + + +@unittest.skipIf(skip_pysqa_test, "pysqa is not installed, so the pysqa tests are skipped.") +class TestSlurmClusterInit(unittest.TestCase): + def test_slurm_cluster_init(self): + with self.assertRaises(ValueError): + SlurmClusterExecutor(block_allocation=True) \ No newline at end of file From 6c0284560a5b5118e688df173dd20966188d4086 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:42:13 +0200 Subject: [PATCH 82/83] validate initialization --- tests/test_slurmclusterexecutor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_slurmclusterexecutor.py b/tests/test_slurmclusterexecutor.py index 704b3a91..41057119 100644 --- a/tests/test_slurmclusterexecutor.py +++ b/tests/test_slurmclusterexecutor.py @@ -119,6 +119,9 @@ def tearDown(self): @unittest.skipIf(skip_pysqa_test, "pysqa is not installed, so the pysqa tests are skipped.") class TestSlurmClusterInit(unittest.TestCase): - def test_slurm_cluster_init(self): + def test_slurm_cluster_block_allocation(self): with self.assertRaises(ValueError): - SlurmClusterExecutor(block_allocation=True) \ No newline at end of file + SlurmClusterExecutor(block_allocation=True) + + def test_slurm_cluster_file(self): + self.assertTrue(SlurmClusterExecutor(block_allocation=False)) \ No newline at end of file From 5f7b676bee4f36c988fc67ca26dade6e85fbe9c4 Mon Sep 17 00:00:00 2001 From: Jan Janssen Date: Mon, 8 Sep 2025 12:43:03 +0200 Subject: [PATCH 83/83] fix test --- tests/test_standalone_interactive_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_standalone_interactive_backend.py b/tests/test_standalone_interactive_backend.py index 9fa00536..ed3745e3 100644 --- a/tests/test_standalone_interactive_backend.py +++ b/tests/test_standalone_interactive_backend.py @@ -169,7 +169,7 @@ def test_command_pysqa(self): with self.assertRaises(RuntimeError): interface_nobackend._check_process_helper(command_lst=[]) - with self.assertRaises(KeyError): + with self.assertRaises(ValueError): create_pysqa_block_allocation_scheduler() with self.assertRaises(ValueError):