diff --git a/executorlib/__init__.py b/executorlib/__init__.py index 127fd879..7b47910b 100644 --- a/executorlib/__init__.py +++ b/executorlib/__init__.py @@ -1,10 +1,10 @@ from executorlib._version import get_versions as _get_versions -from executorlib.interfaces.flux import ( +from executorlib.executor.flux import ( FluxClusterExecutor, FluxJobExecutor, ) -from executorlib.interfaces.single import SingleNodeExecutor -from executorlib.interfaces.slurm import ( +from executorlib.executor.single import SingleNodeExecutor +from executorlib.executor.slurm import ( SlurmClusterExecutor, SlurmJobExecutor, ) diff --git a/executorlib/backend/cache_parallel.py b/executorlib/backend/cache_parallel.py index f094446d..9b1d25d8 100644 --- a/executorlib/backend/cache_parallel.py +++ b/executorlib/backend/cache_parallel.py @@ -4,7 +4,10 @@ import cloudpickle -from executorlib.cache.backend import backend_load_file, backend_write_file +from executorlib.task_scheduler.file.backend import ( + backend_load_file, + backend_write_file, +) def main() -> None: diff --git a/executorlib/backend/cache_serial.py b/executorlib/backend/cache_serial.py index daf5a520..c627fe8d 100644 --- a/executorlib/backend/cache_serial.py +++ b/executorlib/backend/cache_serial.py @@ -1,6 +1,6 @@ import sys -from executorlib.cache.backend import backend_execute_task_in_file +from executorlib.task_scheduler.file.backend import backend_execute_task_in_file if __name__ == "__main__": backend_execute_task_in_file(file_name=sys.argv[1]) diff --git a/executorlib/base/__init__.py b/executorlib/executor/__init__.py similarity index 100% rename from executorlib/base/__init__.py rename to executorlib/executor/__init__.py diff --git a/executorlib/executor/base.py b/executorlib/executor/base.py new file mode 100644 index 00000000..78a410a4 --- /dev/null +++ b/executorlib/executor/base.py @@ -0,0 +1,118 @@ +import queue +from concurrent.futures import ( + Executor as FutureExecutor, +) +from concurrent.futures import ( + 
Future, +) +from typing import Callable, Optional + +from executorlib.task_scheduler.base import TaskSchedulerBase + + +class ExecutorBase(FutureExecutor): + """ + Interface class for the executor. + + Args: + executor (TaskSchedulerBase): internal executor + """ + + def __init__(self, executor: TaskSchedulerBase): + self._task_scheduler = executor + + @property + def max_workers(self) -> Optional[int]: + return self._task_scheduler.max_workers + + @max_workers.setter + def max_workers(self, max_workers: int): + self._task_scheduler.max_workers = max_workers + + @property + def info(self) -> Optional[dict]: + """ + Get the information about the executor. + + Returns: + Optional[dict]: Information about the executor. + """ + return self._task_scheduler.info + + @property + def future_queue(self) -> Optional[queue.Queue]: + """ + Get the future queue. + + Returns: + queue.Queue: The future queue. + """ + return self._task_scheduler.future_queue + + def submit( # type: ignore + self, + fn: Callable, + /, + *args, + resource_dict: Optional[dict] = None, + **kwargs, + ) -> Future: + """ + Submits a callable to be executed with the given arguments. + + Schedules the callable to be executed as fn(*args, **kwargs) and returns + a Future instance representing the execution of the callable. + + Args: + fn (callable): function to submit for execution + args: arguments for the submitted function + kwargs: keyword arguments for the submitted function + resource_dict (dict): resource dictionary, which defines the resources used for the execution of the + function. Example resource dictionary: { + cores: 1, + threads_per_core: 1, + gpus_per_worker: 0, + oversubscribe: False, + cwd: None, + executor: None, + hostname_localhost: False, + } + + Returns: + Future: A Future representing the given call. 
+ """ + return self._task_scheduler.submit( + *([fn] + list(args)), resource_dict=resource_dict, **kwargs + ) + + def shutdown(self, wait: bool = True, *, cancel_futures: bool = False): + """ + Clean-up the resources associated with the Executor. + + It is safe to call this method several times. Otherwise, no other + methods can be called after this one. + + Args: + wait (bool): If True then shutdown will not return until all running + futures have finished executing and the resources used by the + parallel_executors have been reclaimed. + cancel_futures (bool): If True then shutdown will cancel all pending + futures. Futures that are completed or running will not be + cancelled. + """ + self._task_scheduler.shutdown(wait=wait, cancel_futures=cancel_futures) + + def __len__(self) -> int: + """ + Get the length of the executor. + + Returns: + int: The length of the executor. + """ + return len(self._task_scheduler) + + def __exit__(self, *args, **kwargs) -> None: + """ + Exit method called when exiting the context manager. 
+ """ + self._task_scheduler.__exit__(*args, **kwargs) diff --git a/executorlib/interfaces/flux.py b/executorlib/executor/flux.py similarity index 96% rename from executorlib/interfaces/flux.py rename to executorlib/executor/flux.py index 0d85b5b3..c7568eb9 100644 --- a/executorlib/interfaces/flux.py +++ b/executorlib/executor/flux.py @@ -1,10 +1,7 @@ import contextlib from typing import Callable, Optional, Union -from executorlib.base.executor import ExecutorInterface -from executorlib.interactive.blockallocation import BlockAllocationExecutor -from executorlib.interactive.dependency import DependencyExecutor -from executorlib.interactive.onetoone import OneTaskPerProcessExecutor +from executorlib.executor.base import ExecutorBase from executorlib.standalone.inputcheck import ( check_command_line_argument_lst, check_init_function, @@ -14,15 +11,20 @@ check_refresh_rate, validate_number_of_cores, ) +from executorlib.task_scheduler.interactive.blockallocation import ( + BlockAllocationTaskScheduler, +) +from executorlib.task_scheduler.interactive.dependency import DependencyTaskScheduler +from executorlib.task_scheduler.interactive.onetoone import OneProcessTaskScheduler with contextlib.suppress(ImportError): - from executorlib.interactive.fluxspawner import ( + from executorlib.task_scheduler.interactive.fluxspawner import ( FluxPythonSpawner, validate_max_workers, ) -class FluxJobExecutor(ExecutorInterface): +class FluxJobExecutor(ExecutorBase): """ The executorlib.Executor leverages either the message passing interface (MPI), the SLURM workload manager or preferable the flux framework for distributing python functions within a given resource allocation. 
In contrast to @@ -70,7 +72,7 @@ class FluxJobExecutor(ExecutorInterface): Examples: ``` >>> import numpy as np - >>> from executorlib.interfaces.flux import FluxJobExecutor + >>> from executorlib.executor.flux import FluxJobExecutor >>> >>> def calc(i, j, k): >>> from mpi4py import MPI @@ -167,7 +169,7 @@ def __init__( ) if not disable_dependencies: super().__init__( - executor=DependencyExecutor( + executor=DependencyTaskScheduler( executor=create_flux_executor( max_workers=max_workers, cache_directory=cache_directory, @@ -207,7 +209,7 @@ def __init__( ) -class FluxClusterExecutor(ExecutorInterface): +class FluxClusterExecutor(ExecutorBase): """ The executorlib.Executor leverages either the message passing interface (MPI), the SLURM workload manager or preferable the flux framework for distributing python functions within a given resource allocation. In contrast to @@ -251,7 +253,7 @@ class FluxClusterExecutor(ExecutorInterface): Examples: ``` >>> import numpy as np - >>> from executorlib.interfaces.flux import FluxClusterExecutor + >>> from executorlib.executor.flux import FluxClusterExecutor >>> >>> def calc(i, j, k): >>> from mpi4py import MPI @@ -341,7 +343,9 @@ def __init__( {k: v for k, v in default_resource_dict.items() if k not in resource_dict} ) if not plot_dependency_graph: - from executorlib.cache.executor import create_file_executor + from executorlib.task_scheduler.file.task_scheduler import ( + create_file_executor, + ) super().__init__( executor=create_file_executor( @@ -363,7 +367,7 @@ def __init__( ) else: super().__init__( - executor=DependencyExecutor( + executor=DependencyTaskScheduler( executor=create_flux_executor( max_workers=max_workers, cache_directory=cache_directory, @@ -397,7 +401,7 @@ def create_flux_executor( hostname_localhost: Optional[bool] = None, block_allocation: bool = False, init_function: Optional[Callable] = None, -) -> Union[OneTaskPerProcessExecutor, BlockAllocationExecutor]: +) -> Union[OneProcessTaskScheduler, 
BlockAllocationTaskScheduler]: """ Create a flux executor @@ -468,13 +472,13 @@ def create_flux_executor( cores=cores_per_worker, threads_per_core=resource_dict.get("threads_per_core", 1), ) - return BlockAllocationExecutor( + return BlockAllocationTaskScheduler( max_workers=max_workers, executor_kwargs=resource_dict, spawner=FluxPythonSpawner, ) else: - return OneTaskPerProcessExecutor( + return OneProcessTaskScheduler( max_cores=max_cores, max_workers=max_workers, executor_kwargs=resource_dict, diff --git a/executorlib/interfaces/single.py b/executorlib/executor/single.py similarity index 96% rename from executorlib/interfaces/single.py rename to executorlib/executor/single.py index 667104a8..f4af810e 100644 --- a/executorlib/interfaces/single.py +++ b/executorlib/executor/single.py @@ -1,9 +1,6 @@ from typing import Callable, Optional, Union -from executorlib.base.executor import ExecutorInterface -from executorlib.interactive.blockallocation import BlockAllocationExecutor -from executorlib.interactive.dependency import DependencyExecutor -from executorlib.interactive.onetoone import OneTaskPerProcessExecutor +from executorlib.executor.base import ExecutorBase from executorlib.standalone.inputcheck import ( check_command_line_argument_lst, check_gpus_per_worker, @@ -13,9 +10,14 @@ validate_number_of_cores, ) from executorlib.standalone.interactive.spawner import MpiExecSpawner +from executorlib.task_scheduler.interactive.blockallocation import ( + BlockAllocationTaskScheduler, +) +from executorlib.task_scheduler.interactive.dependency import DependencyTaskScheduler +from executorlib.task_scheduler.interactive.onetoone import OneProcessTaskScheduler -class SingleNodeExecutor(ExecutorInterface): +class SingleNodeExecutor(ExecutorBase): """ The executorlib.Executor leverages either the message passing interface (MPI), the SLURM workload manager or preferable the flux framework for distributing python functions within a given resource allocation. 
In contrast to @@ -58,7 +60,7 @@ class SingleNodeExecutor(ExecutorInterface): Examples: ``` >>> import numpy as np - >>> from executorlib.interfaces.single import SingleNodeExecutor + >>> from executorlib.executor.single import SingleNodeExecutor >>> >>> def calc(i, j, k): >>> from mpi4py import MPI @@ -147,7 +149,7 @@ def __init__( ) if not disable_dependencies: super().__init__( - executor=DependencyExecutor( + executor=DependencyTaskScheduler( executor=create_single_node_executor( max_workers=max_workers, cache_directory=cache_directory, @@ -187,7 +189,7 @@ def create_single_node_executor( hostname_localhost: Optional[bool] = None, block_allocation: bool = False, init_function: Optional[Callable] = None, -) -> Union[OneTaskPerProcessExecutor, BlockAllocationExecutor]: +) -> Union[OneProcessTaskScheduler, BlockAllocationTaskScheduler]: """ Create a single node executor @@ -241,7 +243,7 @@ def create_single_node_executor( del resource_dict["slurm_cmd_args"] if block_allocation: resource_dict["init_function"] = init_function - return BlockAllocationExecutor( + return BlockAllocationTaskScheduler( max_workers=validate_number_of_cores( max_cores=max_cores, max_workers=max_workers, @@ -252,7 +254,7 @@ def create_single_node_executor( spawner=MpiExecSpawner, ) else: - return OneTaskPerProcessExecutor( + return OneProcessTaskScheduler( max_cores=max_cores, max_workers=max_workers, executor_kwargs=resource_dict, diff --git a/executorlib/interfaces/slurm.py b/executorlib/executor/slurm.py similarity index 96% rename from executorlib/interfaces/slurm.py rename to executorlib/executor/slurm.py index d9365fd2..fe604386 100644 --- a/executorlib/interfaces/slurm.py +++ b/executorlib/executor/slurm.py @@ -1,19 +1,24 @@ from typing import Callable, Optional, Union -from executorlib.base.executor import ExecutorInterface -from executorlib.interactive.blockallocation import BlockAllocationExecutor -from executorlib.interactive.dependency import DependencyExecutor -from 
executorlib.interactive.onetoone import OneTaskPerProcessExecutor -from executorlib.interactive.slurmspawner import SrunSpawner, validate_max_workers +from executorlib.executor.base import ExecutorBase from executorlib.standalone.inputcheck import ( check_init_function, check_plot_dependency_graph, check_refresh_rate, validate_number_of_cores, ) +from executorlib.task_scheduler.interactive.blockallocation import ( + BlockAllocationTaskScheduler, +) +from executorlib.task_scheduler.interactive.dependency import DependencyTaskScheduler +from executorlib.task_scheduler.interactive.onetoone import OneProcessTaskScheduler +from executorlib.task_scheduler.interactive.slurmspawner import ( + SrunSpawner, + validate_max_workers, +) -class SlurmClusterExecutor(ExecutorInterface): +class SlurmClusterExecutor(ExecutorBase): """ The executorlib.Executor leverages either the message passing interface (MPI), the SLURM workload manager or preferable the flux framework for distributing python functions within a given resource allocation. 
In contrast to @@ -57,7 +62,7 @@ class SlurmClusterExecutor(ExecutorInterface): Examples: ``` >>> import numpy as np - >>> from executorlib.interfaces.slurm import SlurmClusterExecutor + >>> from executorlib.executor.slurm import SlurmClusterExecutor >>> >>> def calc(i, j, k): >>> from mpi4py import MPI @@ -147,7 +152,9 @@ def __init__( {k: v for k, v in default_resource_dict.items() if k not in resource_dict} ) if not plot_dependency_graph: - from executorlib.cache.executor import create_file_executor + from executorlib.task_scheduler.file.task_scheduler import ( + create_file_executor, + ) super().__init__( executor=create_file_executor( @@ -169,7 +176,7 @@ def __init__( ) else: super().__init__( - executor=DependencyExecutor( + executor=DependencyTaskScheduler( executor=create_slurm_executor( max_workers=max_workers, cache_directory=cache_directory, @@ -187,7 +194,7 @@ def __init__( ) -class SlurmJobExecutor(ExecutorInterface): +class SlurmJobExecutor(ExecutorBase): """ The executorlib.Executor leverages either the message passing interface (MPI), the SLURM workload manager or preferable the flux framework for distributing python functions within a given resource allocation. 
In contrast to @@ -234,7 +241,7 @@ class SlurmJobExecutor(ExecutorInterface): Examples: ``` >>> import numpy as np - >>> from executorlib.interfaces.slurm import SlurmJobExecutor + >>> from executorlib.executor.slurm import SlurmJobExecutor >>> >>> def calc(i, j, k): >>> from mpi4py import MPI @@ -327,7 +334,7 @@ def __init__( ) if not disable_dependencies: super().__init__( - executor=DependencyExecutor( + executor=DependencyTaskScheduler( executor=create_slurm_executor( max_workers=max_workers, cache_directory=cache_directory, @@ -367,7 +374,7 @@ def create_slurm_executor( hostname_localhost: Optional[bool] = None, block_allocation: bool = False, init_function: Optional[Callable] = None, -) -> Union[OneTaskPerProcessExecutor, BlockAllocationExecutor]: +) -> Union[OneProcessTaskScheduler, BlockAllocationTaskScheduler]: """ Create a SLURM executor @@ -425,13 +432,13 @@ def create_slurm_executor( cores=cores_per_worker, threads_per_core=resource_dict.get("threads_per_core", 1), ) - return BlockAllocationExecutor( + return BlockAllocationTaskScheduler( max_workers=max_workers, executor_kwargs=resource_dict, spawner=SrunSpawner, ) else: - return OneTaskPerProcessExecutor( + return OneProcessTaskScheduler( max_cores=max_cores, max_workers=max_workers, executor_kwargs=resource_dict, diff --git a/executorlib/cache/__init__.py b/executorlib/task_scheduler/__init__.py similarity index 100% rename from executorlib/cache/__init__.py rename to executorlib/task_scheduler/__init__.py diff --git a/executorlib/base/executor.py b/executorlib/task_scheduler/base.py similarity index 62% rename from executorlib/base/executor.py rename to executorlib/task_scheduler/base.py index 59a42322..36c46c21 100644 --- a/executorlib/base/executor.py +++ b/executorlib/task_scheduler/base.py @@ -14,7 +14,7 @@ from executorlib.standalone.serialize import cloudpickle_register -class ExecutorBase(FutureExecutor): +class TaskSchedulerBase(FutureExecutor): """ Base class for the executor. 
@@ -179,111 +179,3 @@ def __del__(self): """ with contextlib.suppress(AttributeError, RuntimeError): self.shutdown(wait=False) - - -class ExecutorInterface(FutureExecutor): - """ - Interface class for the executor. - - Args: - executor (ExecutorBase): internal executor - """ - - def __init__(self, executor: ExecutorBase): - self._task_scheduler = executor - - @property - def max_workers(self) -> Optional[int]: - return self._task_scheduler.max_workers - - @max_workers.setter - def max_workers(self, max_workers: int): - self._task_scheduler.max_workers = max_workers - - @property - def info(self) -> Optional[dict]: - """ - Get the information about the executor. - - Returns: - Optional[dict]: Information about the executor. - """ - return self._task_scheduler.info - - @property - def future_queue(self) -> Optional[queue.Queue]: - """ - Get the future queue. - - Returns: - queue.Queue: The future queue. - """ - return self._task_scheduler.future_queue - - def submit( # type: ignore - self, - fn: Callable, - /, - *args, - resource_dict: Optional[dict] = None, - **kwargs, - ) -> Future: - """ - Submits a callable to be executed with the given arguments. - - Schedules the callable to be executed as fn(*args, **kwargs) and returns - a Future instance representing the execution of the callable. - - Args: - fn (callable): function to submit for execution - args: arguments for the submitted function - kwargs: keyword arguments for the submitted function - resource_dict (dict): resource dictionary, which defines the resources used for the execution of the - function. Example resource dictionary: { - cores: 1, - threads_per_core: 1, - gpus_per_worker: 0, - oversubscribe: False, - cwd: None, - executor: None, - hostname_localhost: False, - } - - Returns: - Future: A Future representing the given call. 
- """ - return self._task_scheduler.submit( - *([fn] + list(args)), resource_dict=resource_dict, **kwargs - ) - - def shutdown(self, wait: bool = True, *, cancel_futures: bool = False): - """ - Clean-up the resources associated with the Executor. - - It is safe to call this method several times. Otherwise, no other - methods can be called after this one. - - Args: - wait (bool): If True then shutdown will not return until all running - futures have finished executing and the resources used by the - parallel_executors have been reclaimed. - cancel_futures (bool): If True then shutdown will cancel all pending - futures. Futures that are completed or running will not be - cancelled. - """ - self._task_scheduler.shutdown(wait=wait, cancel_futures=cancel_futures) - - def __len__(self) -> int: - """ - Get the length of the executor. - - Returns: - int: The length of the executor. - """ - return len(self._task_scheduler) - - def __exit__(self, *args, **kwargs) -> None: - """ - Exit method called when exiting the context manager. 
- """ - self._task_scheduler.__exit__(*args, **kwargs) diff --git a/executorlib/interactive/__init__.py b/executorlib/task_scheduler/file/__init__.py similarity index 100% rename from executorlib/interactive/__init__.py rename to executorlib/task_scheduler/file/__init__.py diff --git a/executorlib/cache/backend.py b/executorlib/task_scheduler/file/backend.py similarity index 97% rename from executorlib/cache/backend.py rename to executorlib/task_scheduler/file/backend.py index cbb649e8..63fe6ea5 100644 --- a/executorlib/cache/backend.py +++ b/executorlib/task_scheduler/file/backend.py @@ -2,8 +2,8 @@ import time from typing import Any -from executorlib.cache.shared import FutureItem from executorlib.standalone.hdf import dump, load +from executorlib.task_scheduler.file.shared import FutureItem def backend_load_file(file_name: str) -> dict: diff --git a/executorlib/cache/queue_spawner.py b/executorlib/task_scheduler/file/queue_spawner.py similarity index 100% rename from executorlib/cache/queue_spawner.py rename to executorlib/task_scheduler/file/queue_spawner.py diff --git a/executorlib/cache/shared.py b/executorlib/task_scheduler/file/shared.py similarity index 100% rename from executorlib/cache/shared.py rename to executorlib/task_scheduler/file/shared.py diff --git a/executorlib/cache/subprocess_spawner.py b/executorlib/task_scheduler/file/subprocess_spawner.py similarity index 100% rename from executorlib/cache/subprocess_spawner.py rename to executorlib/task_scheduler/file/subprocess_spawner.py diff --git a/executorlib/cache/executor.py b/executorlib/task_scheduler/file/task_scheduler.py similarity index 93% rename from executorlib/cache/executor.py rename to executorlib/task_scheduler/file/task_scheduler.py index 9f9582c3..2a2ca099 100644 --- a/executorlib/cache/executor.py +++ b/executorlib/task_scheduler/file/task_scheduler.py @@ -2,12 +2,6 @@ from threading import Thread from typing import Callable, Optional -from executorlib.base.executor import 
ExecutorBase -from executorlib.cache.shared import execute_tasks_h5 -from executorlib.cache.subprocess_spawner import ( - execute_in_subprocess, - terminate_subprocess, -) from executorlib.standalone.inputcheck import ( check_executor, check_flux_executor_pmi_mode, @@ -16,15 +10,21 @@ check_max_workers_and_cores, check_nested_flux_executor, ) +from executorlib.task_scheduler.base import TaskSchedulerBase +from executorlib.task_scheduler.file.shared import execute_tasks_h5 +from executorlib.task_scheduler.file.subprocess_spawner import ( + execute_in_subprocess, + terminate_subprocess, +) try: - from executorlib.cache.queue_spawner import execute_with_pysqa + from executorlib.task_scheduler.file.queue_spawner import execute_with_pysqa except ImportError: # If pysqa is not available fall back to executing tasks in a subprocess execute_with_pysqa = execute_in_subprocess # type: ignore -class FileExecutor(ExecutorBase): +class FileTaskScheduler(TaskSchedulerBase): def __init__( self, cache_directory: str = "cache", @@ -113,7 +113,7 @@ def create_file_executor( check_executor(executor=flux_executor) check_nested_flux_executor(nested_flux_executor=flux_executor_nesting) check_flux_log_files(flux_log_files=flux_log_files) - return FileExecutor( + return FileTaskScheduler( cache_directory=cache_directory, resource_dict=resource_dict, pysqa_config_directory=pysqa_config_directory, diff --git a/executorlib/interfaces/__init__.py b/executorlib/task_scheduler/interactive/__init__.py similarity index 100% rename from executorlib/interfaces/__init__.py rename to executorlib/task_scheduler/interactive/__init__.py diff --git a/executorlib/interactive/blockallocation.py b/executorlib/task_scheduler/interactive/blockallocation.py similarity index 93% rename from executorlib/interactive/blockallocation.py rename to executorlib/task_scheduler/interactive/blockallocation.py index 9e6500b7..7f474d6a 100644 --- a/executorlib/interactive/blockallocation.py +++ 
b/executorlib/task_scheduler/interactive/blockallocation.py @@ -3,18 +3,19 @@ from threading import Thread from typing import Callable, Optional -from executorlib.base.executor import ExecutorBase, cancel_items_in_queue -from executorlib.interactive.shared import execute_tasks from executorlib.standalone.inputcheck import ( check_resource_dict, check_resource_dict_is_empty, ) from executorlib.standalone.interactive.spawner import BaseSpawner, MpiExecSpawner +from executorlib.standalone.queue import cancel_items_in_queue +from executorlib.task_scheduler.base import TaskSchedulerBase +from executorlib.task_scheduler.interactive.shared import execute_tasks -class BlockAllocationExecutor(ExecutorBase): +class BlockAllocationTaskScheduler(TaskSchedulerBase): """ - The executorlib.interactive.executor.InteractiveExecutor leverages the exeutorlib interfaces to distribute python + The executorlib.interactive.executor.InteractiveExecutor leverages the exeutorlib executor to distribute python tasks on a workstation or inside a queuing system allocation. In contrast to the mpi4py.futures.MPIPoolExecutor the executorlib.interactive.executor.InteractiveExecutor can be executed in a serial python process and does not require the python script to be executed with MPI. 
Consequently, it is primarily an abstraction of its functionality to @@ -28,7 +29,7 @@ class BlockAllocationExecutor(ExecutorBase): Examples: >>> import numpy as np - >>> from executorlib.interactive.blockallocation import BlockAllocationExecutor + >>> from executorlib.interactive.blockallocation import BlockAllocationTaskScheduler >>> >>> def calc(i, j, k): >>> from mpi4py import MPI @@ -39,7 +40,7 @@ class BlockAllocationExecutor(ExecutorBase): >>> def init_k(): >>> return {"k": 3} >>> - >>> with BlockAllocationExecutor(max_workers=2, executor_kwargs={"init_function": init_k}) as p: + >>> with BlockAllocationTaskScheduler(max_workers=2, executor_kwargs={"init_function": init_k}) as p: >>> fs = p.submit(calc, 2, j=4) >>> print(fs.result()) [(array([2, 4, 3]), 2, 0), (array([2, 4, 3]), 2, 1)] diff --git a/executorlib/interactive/dependency.py b/executorlib/task_scheduler/interactive/dependency.py similarity index 97% rename from executorlib/interactive/dependency.py rename to executorlib/task_scheduler/interactive/dependency.py index 5f27ff4a..07f9f2a3 100644 --- a/executorlib/interactive/dependency.py +++ b/executorlib/task_scheduler/interactive/dependency.py @@ -4,7 +4,6 @@ from time import sleep from typing import Any, Callable, Optional -from executorlib.base.executor import ExecutorBase from executorlib.standalone.interactive.arguments import ( check_exception_was_raised, get_exception_lst, @@ -16,9 +15,10 @@ generate_nodes_and_edges, generate_task_hash, ) +from executorlib.task_scheduler.base import TaskSchedulerBase -class DependencyExecutor(ExecutorBase): +class DependencyTaskScheduler(TaskSchedulerBase): """ ExecutorWithDependencies is a class that extends ExecutorBase and provides functionality for executing tasks with dependencies. 
@@ -38,7 +38,7 @@ class DependencyExecutor(ExecutorBase): def __init__( self, - executor: ExecutorBase, + executor: TaskSchedulerBase, max_cores: Optional[int] = None, refresh_rate: float = 0.01, plot_dependency_graph: bool = False, @@ -188,7 +188,7 @@ def __exit__( def _execute_tasks_with_dependencies( future_queue: queue.Queue, executor_queue: queue.Queue, - executor: ExecutorBase, + executor: TaskSchedulerBase, refresh_rate: float = 0.01, ): """ @@ -198,7 +198,7 @@ def _execute_tasks_with_dependencies( Args: future_queue (Queue): Queue for receiving new tasks. executor_queue (Queue): Queue for the internal executor. - executor (ExecutorBase): Executor to execute the tasks with after the dependencies are resolved. + executor (TaskSchedulerBase): Executor to execute the tasks with after the dependencies are resolved. refresh_rate (float): Set the refresh rate in seconds, how frequently the input queue is checked. """ wait_lst = [] diff --git a/executorlib/interactive/fluxspawner.py b/executorlib/task_scheduler/interactive/fluxspawner.py similarity index 100% rename from executorlib/interactive/fluxspawner.py rename to executorlib/task_scheduler/interactive/fluxspawner.py diff --git a/executorlib/interactive/onetoone.py b/executorlib/task_scheduler/interactive/onetoone.py similarity index 96% rename from executorlib/interactive/onetoone.py rename to executorlib/task_scheduler/interactive/onetoone.py index ca38ca52..d28f014b 100644 --- a/executorlib/interactive/onetoone.py +++ b/executorlib/task_scheduler/interactive/onetoone.py @@ -2,14 +2,14 @@ from threading import Thread from typing import Optional -from executorlib.base.executor import ExecutorBase -from executorlib.interactive.shared import execute_tasks from executorlib.standalone.interactive.spawner import BaseSpawner, MpiExecSpawner +from executorlib.task_scheduler.base import TaskSchedulerBase +from executorlib.task_scheduler.interactive.shared import execute_tasks -class 
OneTaskPerProcessExecutor(ExecutorBase): +class OneProcessTaskScheduler(TaskSchedulerBase): """ - The executorlib.interactive.executor.InteractiveStepExecutor leverages the executorlib interfaces to distribute python + The executorlib.interactive.executor.InteractiveStepExecutor leverages the executorlib executor to distribute python tasks. In contrast to the mpi4py.futures.MPIPoolExecutor the executorlib.interactive.executor.InteractiveStepExecutor can be executed in a serial python process and does not require the python script to be executed with MPI. Consequently, it is primarily an abstraction of its functionality to improve the usability in particular when used @@ -23,7 +23,7 @@ class OneTaskPerProcessExecutor(ExecutorBase): Examples: >>> import numpy as np - >>> from executorlib.interactive.onetoone import OneTaskPerProcessExecutor + >>> from executorlib.task_scheduler.interactive.onetoone import OneProcessTaskScheduler >>> >>> def calc(i, j, k): >>> from mpi4py import MPI @@ -31,7 +31,7 @@ class OneTaskPerProcessExecutor(ExecutorBase): >>> rank = MPI.COMM_WORLD.Get_rank() >>> return np.array([i, j, k]), size, rank >>> - >>> with OneTaskPerProcessExecutor(max_cores=2) as p: + >>> with OneProcessTaskScheduler(max_cores=2) as p: >>> fs = p.submit(calc, 2, j=4, k=3, resource_dict={"cores": 2}) >>> print(fs.result()) diff --git a/executorlib/interactive/shared.py b/executorlib/task_scheduler/interactive/shared.py similarity index 100% rename from executorlib/interactive/shared.py rename to executorlib/task_scheduler/interactive/shared.py diff --git a/executorlib/interactive/slurmspawner.py b/executorlib/task_scheduler/interactive/slurmspawner.py similarity index 100% rename from executorlib/interactive/slurmspawner.py rename to executorlib/task_scheduler/interactive/slurmspawner.py diff --git a/notebooks/1-single-node.ipynb b/notebooks/1-single-node.ipynb index 91fd7349..60689144 100644 --- a/notebooks/1-single-node.ipynb +++ b/notebooks/1-single-node.ipynb @@ -530,7 +530,7 @@ 
], "source": [ "%%time\n", - "with SingleNodeExecutor(cache_directory=\"./cache\") as exe:\n", + "with SingleNodeExecutor(cache_directory=\"./file\") as exe:\n", " future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]\n", " print([f.result() for f in future_lst])" ] @@ -563,7 +563,7 @@ ], "source": [ "%%time\n", - "with SingleNodeExecutor(cache_directory=\"./cache\") as exe:\n", + "with SingleNodeExecutor(cache_directory=\"./file\") as exe:\n", " future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]\n", " print([f.result() for f in future_lst])" ] @@ -664,7 +664,7 @@ "import pandas\n", "from executorlib import get_cache_data\n", "\n", - "df = pandas.DataFrame(get_cache_data(cache_directory=\"./cache\"))\n", + "df = pandas.DataFrame(get_cache_data(cache_directory=\"./file\"))\n", "df" ] }, @@ -694,7 +694,7 @@ "import os\n", "import shutil\n", "\n", - "cache_dir = \"./cache\"\n", + "cache_dir = \"./file\"\n", "if os.path.exists(cache_dir):\n", " print(os.listdir(cache_dir))\n", " try:\n", diff --git a/notebooks/2-hpc-cluster.ipynb b/notebooks/2-hpc-cluster.ipynb index ec3d7007..8bc85182 100644 --- a/notebooks/2-hpc-cluster.ipynb +++ b/notebooks/2-hpc-cluster.ipynb @@ -1 +1,194 @@ -{"metadata":{"kernelspec":{"display_name":"Flux","language":"python","name":"flux"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.12.9"}},"nbformat_minor":5,"nbformat":4,"cells":[{"id":"ddf66f38-dc4a-4306-8b1c-b923fdb76922","cell_type":"markdown","source":"# HPC Cluster Executor\nIn contrast to the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) and the [HPC Job Executor](https://executorlib.readthedocs.io/en/latest/3-hpc-job.html) the HPC Submission Executors do not communicate via the [zero message queue](https://zeromq.org) but instead store the python functions on the file system 
and uses the job scheduler to handle the dependencies of the Python functions. Consequently, the block allocation `block_allocation` and the init function `init_function` are not available in the HPC Cluster Executors. At the same time it is possible to close the Python process which created the `Executor`, wait until the execution of the submitted Python functions is completed and afterwards reload the results from the cache.\n\nInternally the HPC submission mode is using the [Python simple queuing system adatper (pysqa)](https://pysqa.readthedocs.io) to connect to HPC job schedulers and the [h5py](https://www.h5py.org) package for serializing the Python functions to store them on the file system. Both packages are optional dependency of executorlib. The installation of the [pysqa](https://pysqa.readthedocs.io) package and the [h5py](https://www.h5py.org) package are covered in the installation section. ","metadata":{}},{"id":"d56862a6-8279-421d-a090-7ca2a3c4d416","cell_type":"markdown","source":"## SLURM\nThe [Simple Linux Utility for Resource Management (SLURM)](https://slurm.schedmd.com) job scheduler is currently the most commonly used job scheduler for HPC clusters. In the HPC submission mode executorlib internally uses the [sbatch](https://slurm.schedmd.com/sbatch.html) command this is in contrast to the [HPC allocatiom mode] which internally uses the [srun](https://slurm.schedmd.com/srun.html) command. \n\nThe connection to the job scheduler is based on the [Python simple queuing system adatper (pysqa)](https://pysqa.readthedocs.io). It provides a default configuration for most commonly used job schedulers including SLURM, in addition it is also possible to provide the submission template as part of the resource dictionary `resource_dict` or via the path to the configuration directory with the `pysqa_config_directory` parameter. 
All three options are covered in more detail on the [pysqa documentation](https://pysqa.readthedocs.io).","metadata":{}},{"id":"db7760e8-35a6-4a1c-8b0f-410b536c3835","cell_type":"markdown","source":"```python\nfrom executorlib import SlurmClusterExecutor\n```","metadata":{}},{"id":"b20913f3-59e4-418c-a399-866124f8e497","cell_type":"markdown","source":"In comparison to the [SingleNodeExecutor](https://executorlib.readthedocs.io/en/latest/1-single-node.html), the only parameter which is changed in the `SlurmClusterExecutor` is the requirement to specify the cache directory using the `cache_directory=\"./cache\"`. The rest of the syntax remains exactly the same, to simplify the up-scaling of simulation workflows.","metadata":{}},{"id":"0b8f3b77-6199-4736-9f28-3058c5230777","cell_type":"markdown","source":"```python\nwith SlurmClusterExecutor(cache_directory=\"./cache\") as exe:\n future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]\n print([f.result() for f in future_lst])\n```","metadata":{}},{"id":"37bef7ac-ce3e-4d8a-b848-b1474c370bca","cell_type":"markdown","source":"Specific parameters for `SlurmClusterExecutor` like the maximum run time `\"run_time_max\"`, the maximum memory `\"memory_max\"` or the submission template for the job submission script `\"submission_template\"` can be specified as part of the resource dictionary. 
Again it is possible to specify the resource dictonary `resource_dicionary` either for each function in the `submit()` function or during the initialization of the `SlurmClusterExecutor`.","metadata":{}},{"id":"658781de-f222-4235-8c26-b0f77a0831b3","cell_type":"markdown","source":"```python\nsubmission_template = \"\"\"\\\n#!/bin/bash\n#SBATCH --output=time.out\n#SBATCH --job-name={{job_name}}\n#SBATCH --chdir={{working_directory}}\n#SBATCH --get-user-env=L\n#SBATCH --partition={{partition}}\n{%- if run_time_max %}\n#SBATCH --time={{ [1, run_time_max // 60]|max }}\n{%- endif %}\n{%- if dependency %}\n#SBATCH --dependency=afterok:{{ dependency | join(',') }}\n{%- endif %}\n{%- if memory_max %}\n#SBATCH --mem={{memory_max}}G\n{%- endif %}\n#SBATCH --cpus-per-task={{cores}}\n\n{{command}}\n\"\"\"\n\nwith SlurmClusterExecutor(cache_directory=\"./cache\") as exe:\n future = exe.submit(\n sum, [4, 4], \n resource_dict={\n \"submission_template\": submission_template, \n \"run_time_max\": 180, # in seconds \n })\n print(future.result())\n```","metadata":{}},{"id":"f7ad9c97-7743-4f87-9344-4299b2b31a56","cell_type":"markdown","source":"With these options executorlib in combination with the SLURM job scheduler provides a lot flexibility to configure the submission of Python functions depending on the specific configuration of the job scheduler. ","metadata":{}},{"id":"2a814efb-2fbc-41ba-98df-cf121d19ea66","cell_type":"markdown","source":"## Flux\nWhile most HPC job schedulers require extensive configuration before they can be tested, the [flux framework](http://flux-framework.org) can be installed with the conda package manager, as explained in the [installation section](https://executorlib.readthedocs.io/en/latest/installation.html#alternative-installations). This simple installation makes the flux framework especially suitable for demonstrations, testing and continous integration. 
So below a number of features for the HPC submission mode are demonstrated based on the example of the [flux framework](http://flux-framework.org) still the same applies to other job schedulers like SLURM introduced above.","metadata":{}},{"id":"29d7aa18-357e-416e-805c-1322b59abec1","cell_type":"markdown","source":"### Dependencies\nAs already demonstrated for the [SingleNodeExecutor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) the `Executor` classes from executorlib are capable of resolving the dependencies of serial functions, when [concurrent futures Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) objects are used as inputs for subsequent function calls. For the case of the HPC submission these dependencies are communicated to the job scheduler, which allows to stop the Python process which created the `Executor` class, wait until the execution of the submitted Python functions is completed and afterwards restart the Python process for the `Executor` class and reload the calculation results from the cache defined by the `cache_directory` parameter.","metadata":{}},{"id":"0f7fc37a-1248-492d-91ab-9db1d737eaee","cell_type":"code","source":"def add_funct(a, b):\n return a + b","metadata":{"trusted":false},"outputs":[],"execution_count":1},{"id":"ae308683-6083-4e78-afc2-bff6c6dc297b","cell_type":"code","source":"from executorlib import FluxClusterExecutor\n\nwith FluxClusterExecutor(cache_directory=\"./cache\") as exe:\n future = 0\n for i in range(4, 8):\n future = exe.submit(add_funct, i, future)\n print(future.result())","metadata":{"trusted":false},"outputs":[{"name":"stdout","output_type":"stream","text":"22\n"}],"execution_count":2},{"id":"ca75cb6c-c50f-4bee-9b09-d8d29d6c263b","cell_type":"markdown","source":"### Resource Assignment\nIn analogy to the [SingleNodeExecutor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) the resource assignment for the `FluxClusterExecutor` is handled by 
either including the resource dictionary parameter `resource_dict` in the initialization of the `FluxClusterExecutor` class or in every call of the `submit()` function.\n\nBelow this is demonstrated once for the assignment of multiple CPU cores for the execution of a Python function which internally uses the message passing interface (MPI) via the [mpi4py](https://mpi4py.readthedocs.io) package.","metadata":{}},{"id":"eded3a0f-e54f-44f6-962f-eedde4bd2158","cell_type":"code","source":"def calc(i):\n from mpi4py import MPI\n\n size = MPI.COMM_WORLD.Get_size()\n rank = MPI.COMM_WORLD.Get_rank()\n return i, size, rank\n","metadata":{"trusted":false},"outputs":[],"execution_count":3},{"id":"669b05df-3cb2-4f69-9d94-8b2442745ebb","cell_type":"code","source":"with FluxClusterExecutor(cache_directory=\"./cache\") as exe:\n fs = exe.submit(calc, 3, resource_dict={\"cores\": 2})\n print(fs.result())","metadata":{"trusted":false},"outputs":[{"name":"stdout","output_type":"stream","text":"[(3, 2, 0), (3, 2, 1)]\n"}],"execution_count":4},{"id":"d91499d7-5c6c-4c10-b7b7-bfc4b87ddaa8","cell_type":"markdown","source":"Beyond CPU cores and threads which were previously also introduced for the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) the HPC Cluster Executors also provide the option to select the available accelerator cards or GPUs, by specifying the `\"gpus_per_core\"` parameter in the resource dictionary `resource_dict`. 
For demonstration we create a Python function which reads the GPU device IDs and submit it to the `FluxClusterExecutor` class:\n```python\ndef get_available_gpus():\n import socket\n from tensorflow.python.client import device_lib\n local_device_protos = device_lib.list_local_devices()\n return [\n (x.name, x.physical_device_desc, socket.gethostname()) \n for x in local_device_protos if x.device_type == 'GPU'\n ]\n```\n\n```python\nwith FluxClusterExecutor(\n cache_directory=\"./cache\",\n resource_dict={\"gpus_per_core\": 1}\n) as exe:\n fs_1 = exe.submit(get_available_gpus)\n fs_2 = exe.submit(get_available_gpus)\n print(fs_1.result(), fs_2.result())\n```","metadata":{}},{"id":"3f47fd34-04d1-42a7-bb06-6821dc99a648","cell_type":"markdown","source":"### Cleaning Cache\nFinally, as the HPC Cluster Executors leverage the file system to communicate serialized Python functions, it is important to clean up the cache directory specified by the `cache_directory` parameter once the results of the submitted Python functions are no longer needed. The serialized Python functions are stored in binary format using the [cloudpickle](https://github.com/cloudpipe/cloudpickle) library for serialization. This format is design for caching but not for long-term storage. 
The user is responsible for the long-term storage of their data.","metadata":{}},{"id":"f537b4f6-cc98-43da-8aca-94a823bcbcbd","cell_type":"code","source":"import os\nimport shutil\n\ncache_dir = \"./cache\"\nif os.path.exists(cache_dir):\n print(os.listdir(cache_dir))\n try:\n shutil.rmtree(cache_dir)\n except OSError:\n pass","metadata":{"trusted":false},"outputs":[{"name":"stdout","output_type":"stream","text":"['add_functdce32a0e7f6eac9e4e19fec335b79726', 'calc76234667eef65c770fecf54645ef8ada', 'add_functee0545e0d3edb8a4a6ceb6d5ae712d39', 'add_funct3263a1038c0d088677685b6eccd9f7b7', 'add_funct6034ded02bdb3ff97695f3a94455ca4d']\n"}],"execution_count":5}]} \ No newline at end of file +{ + "metadata": { + "kernelspec": { + "display_name": "Flux", + "language": "python", + "name": "flux" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat_minor": 5, + "nbformat": 4, + "cells": [ + { + "id": "ddf66f38-dc4a-4306-8b1c-b923fdb76922", + "cell_type": "markdown", + "source": "# HPC Cluster Executor\nIn contrast to the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) and the [HPC Job Executor](https://executorlib.readthedocs.io/en/latest/3-hpc-job.html) the HPC Submission Executors do not communicate via the [zero message queue](https://zeromq.org) but instead store the python functions on the file system and uses the job scheduler to handle the dependencies of the Python functions. Consequently, the block allocation `block_allocation` and the init function `init_function` are not available in the HPC Cluster Executors. 
At the same time it is possible to close the Python process which created the `Executor`, wait until the execution of the submitted Python functions is completed and afterwards reload the results from the cache.\n\nInternally the HPC submission mode is using the [Python simple queuing system adapter (pysqa)](https://pysqa.readthedocs.io) to connect to HPC job schedulers and the [h5py](https://www.h5py.org) package for serializing the Python functions to store them on the file system. Both packages are optional dependencies of executorlib. The installation of the [pysqa](https://pysqa.readthedocs.io) package and the [h5py](https://www.h5py.org) package are covered in the installation section. ", + "metadata": {} + }, + { + "id": "d56862a6-8279-421d-a090-7ca2a3c4d416", + "cell_type": "markdown", + "source": "## SLURM\nThe [Simple Linux Utility for Resource Management (SLURM)](https://slurm.schedmd.com) job scheduler is currently the most commonly used job scheduler for HPC clusters. In the HPC submission mode executorlib internally uses the [sbatch](https://slurm.schedmd.com/sbatch.html) command this is in contrast to the [HPC allocation mode] which internally uses the [srun](https://slurm.schedmd.com/srun.html) command. \n\nThe connection to the job scheduler is based on the [Python simple queuing system adapter (pysqa)](https://pysqa.readthedocs.io). It provides a default configuration for most commonly used job schedulers including SLURM, in addition it is also possible to provide the submission template as part of the resource dictionary `resource_dict` or via the path to the configuration directory with the `pysqa_config_directory` parameter. 
All three options are covered in more detail on the [pysqa documentation](https://pysqa.readthedocs.io).", + "metadata": {} + }, + { + "id": "db7760e8-35a6-4a1c-8b0f-410b536c3835", + "cell_type": "markdown", + "source": "```python\nfrom executorlib import SlurmClusterExecutor\n```", + "metadata": {} + }, + { + "id": "b20913f3-59e4-418c-a399-866124f8e497", + "cell_type": "markdown", + "source": "In comparison to the [SingleNodeExecutor](https://executorlib.readthedocs.io/en/latest/1-single-node.html), the only parameter which is changed in the `SlurmClusterExecutor` is the requirement to specify the cache directory using the `cache_directory=\"./cache\"`. The rest of the syntax remains exactly the same, to simplify the up-scaling of simulation workflows.", + "metadata": {} + }, + { + "id": "0b8f3b77-6199-4736-9f28-3058c5230777", + "cell_type": "markdown", + "source": "```python\nwith SlurmClusterExecutor(cache_directory=\"./cache\") as exe:\n future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]\n print([f.result() for f in future_lst])\n```", + "metadata": {} + }, + { + "id": "37bef7ac-ce3e-4d8a-b848-b1474c370bca", + "cell_type": "markdown", + "source": "Specific parameters for `SlurmClusterExecutor` like the maximum run time `\"run_time_max\"`, the maximum memory `\"memory_max\"` or the submission template for the job submission script `\"submission_template\"` can be specified as part of the resource dictionary. 
Again it is possible to specify the resource dictionary `resource_dict` either for each function in the `submit()` function or during the initialization of the `SlurmClusterExecutor`.", + "metadata": {} + }, + { + "id": "658781de-f222-4235-8c26-b0f77a0831b3", + "cell_type": "markdown", + "source": "```python\nsubmission_template = \"\"\"\\\n#!/bin/bash\n#SBATCH --output=time.out\n#SBATCH --job-name={{job_name}}\n#SBATCH --chdir={{working_directory}}\n#SBATCH --get-user-env=L\n#SBATCH --partition={{partition}}\n{%- if run_time_max %}\n#SBATCH --time={{ [1, run_time_max // 60]|max }}\n{%- endif %}\n{%- if dependency %}\n#SBATCH --dependency=afterok:{{ dependency | join(',') }}\n{%- endif %}\n{%- if memory_max %}\n#SBATCH --mem={{memory_max}}G\n{%- endif %}\n#SBATCH --cpus-per-task={{cores}}\n\n{{command}}\n\"\"\"\n\nwith SlurmClusterExecutor(cache_directory=\"./cache\") as exe:\n future = exe.submit(\n sum, [4, 4], \n resource_dict={\n \"submission_template\": submission_template, \n \"run_time_max\": 180, # in seconds \n })\n print(future.result())\n```", + "metadata": {} + }, + { + "id": "f7ad9c97-7743-4f87-9344-4299b2b31a56", + "cell_type": "markdown", + "source": "With these options executorlib in combination with the SLURM job scheduler provides a lot of flexibility to configure the submission of Python functions depending on the specific configuration of the job scheduler. ", + "metadata": {} + }, + { + "id": "2a814efb-2fbc-41ba-98df-cf121d19ea66", + "cell_type": "markdown", + "source": "## Flux\nWhile most HPC job schedulers require extensive configuration before they can be tested, the [flux framework](http://flux-framework.org) can be installed with the conda package manager, as explained in the [installation section](https://executorlib.readthedocs.io/en/latest/installation.html#alternative-installations). This simple installation makes the flux framework especially suitable for demonstrations, testing and continuous integration. 
So below a number of features for the HPC submission mode are demonstrated based on the example of the [flux framework](http://flux-framework.org) still the same applies to other job schedulers like SLURM introduced above.", + "metadata": {} + }, + { + "id": "29d7aa18-357e-416e-805c-1322b59abec1", + "cell_type": "markdown", + "source": "### Dependencies\nAs already demonstrated for the [SingleNodeExecutor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) the `Executor` classes from executorlib are capable of resolving the dependencies of serial functions, when [concurrent futures Future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) objects are used as inputs for subsequent function calls. For the case of the HPC submission these dependencies are communicated to the job scheduler, which allows to stop the Python process which created the `Executor` class, wait until the execution of the submitted Python functions is completed and afterwards restart the Python process for the `Executor` class and reload the calculation results from the cache defined by the `cache_directory` parameter.", + "metadata": {} + }, + { + "id": "0f7fc37a-1248-492d-91ab-9db1d737eaee", + "cell_type": "code", + "source": "def add_funct(a, b):\n return a + b", + "metadata": { + "trusted": false + }, + "outputs": [], + "execution_count": 1 + }, + { + "id": "ae308683-6083-4e78-afc2-bff6c6dc297b", + "cell_type": "code", + "source": [ + "from executorlib import FluxClusterExecutor\n", + "\n", + "with FluxClusterExecutor(cache_directory=\"./file\") as exe:\n", + " future = 0\n", + " for i in range(4, 8):\n", + " future = exe.submit(add_funct, i, future)\n", + " print(future.result())" + ], + "metadata": { + "trusted": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "22\n" + } + ], + "execution_count": 2 + }, + { + "id": "ca75cb6c-c50f-4bee-9b09-d8d29d6c263b", + "cell_type": "markdown", + "source": "### Resource 
Assignment\nIn analogy to the [SingleNodeExecutor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) the resource assignment for the `FluxClusterExecutor` is handled by either including the resource dictionary parameter `resource_dict` in the initialization of the `FluxClusterExecutor` class or in every call of the `submit()` function.\n\nBelow this is demonstrated once for the assignment of multiple CPU cores for the execution of a Python function which internally uses the message passing interface (MPI) via the [mpi4py](https://mpi4py.readthedocs.io) package.", + "metadata": {} + }, + { + "id": "eded3a0f-e54f-44f6-962f-eedde4bd2158", + "cell_type": "code", + "source": "def calc(i):\n from mpi4py import MPI\n\n size = MPI.COMM_WORLD.Get_size()\n rank = MPI.COMM_WORLD.Get_rank()\n return i, size, rank\n", + "metadata": { + "trusted": false + }, + "outputs": [], + "execution_count": 3 + }, + { + "id": "669b05df-3cb2-4f69-9d94-8b2442745ebb", + "cell_type": "code", + "source": [ + "with FluxClusterExecutor(cache_directory=\"./file\") as exe:\n", + " fs = exe.submit(calc, 3, resource_dict={\"cores\": 2})\n", + " print(fs.result())" + ], + "metadata": { + "trusted": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "[(3, 2, 0), (3, 2, 1)]\n" + } + ], + "execution_count": 4 + }, + { + "id": "d91499d7-5c6c-4c10-b7b7-bfc4b87ddaa8", + "cell_type": "markdown", + "source": "Beyond CPU cores and threads which were previously also introduced for the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) the HPC Cluster Executors also provide the option to select the available accelerator cards or GPUs, by specifying the `\"gpus_per_core\"` parameter in the resource dictionary `resource_dict`. 
For demonstration we create a Python function which reads the GPU device IDs and submits it to the `FluxClusterExecutor` class:\n```python\ndef get_available_gpus():\n import socket\n from tensorflow.python.client import device_lib\n local_device_protos = device_lib.list_local_devices()\n return [\n (x.name, x.physical_device_desc, socket.gethostname()) \n for x in local_device_protos if x.device_type == 'GPU'\n ]\n```\n\n```python\nwith FluxClusterExecutor(\n cache_directory=\"./cache\",\n resource_dict={\"gpus_per_core\": 1}\n) as exe:\n fs_1 = exe.submit(get_available_gpus)\n fs_2 = exe.submit(get_available_gpus)\n print(fs_1.result(), fs_2.result())\n```", + "metadata": {} + }, + { + "id": "3f47fd34-04d1-42a7-bb06-6821dc99a648", + "cell_type": "markdown", + "source": "### Cleaning Cache\nFinally, as the HPC Cluster Executors leverage the file system to communicate serialized Python functions, it is important to clean up the cache directory specified by the `cache_directory` parameter once the results of the submitted Python functions are no longer needed. The serialized Python functions are stored in binary format using the [cloudpickle](https://github.com/cloudpipe/cloudpickle) library for serialization. This format is designed for caching but not for long-term storage. 
The user is responsible for the long-term storage of their data.", + "metadata": {} + }, + { + "id": "f537b4f6-cc98-43da-8aca-94a823bcbcbd", + "cell_type": "code", + "source": [ + "import os\n", + "import shutil\n", + "\n", + "cache_dir = \"./file\"\n", + "if os.path.exists(cache_dir):\n", + " print(os.listdir(cache_dir))\n", + " try:\n", + " shutil.rmtree(cache_dir)\n", + " except OSError:\n", + " pass" + ], + "metadata": { + "trusted": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "['add_functdce32a0e7f6eac9e4e19fec335b79726', 'calc76234667eef65c770fecf54645ef8ada', 'add_functee0545e0d3edb8a4a6ceb6d5ae712d39', 'add_funct3263a1038c0d088677685b6eccd9f7b7', 'add_funct6034ded02bdb3ff97695f3a94455ca4d']\n" + } + ], + "execution_count": 5 + } + ] +} diff --git a/notebooks/3-hpc-job.ipynb b/notebooks/3-hpc-job.ipynb index e21fdaf8..dd4e5f3b 100644 --- a/notebooks/3-hpc-job.ipynb +++ b/notebooks/3-hpc-job.ipynb @@ -1 +1,322 @@ -{"metadata":{"kernelspec":{"name":"flux","display_name":"Flux","language":"python"},"language_info":{"name":"python","version":"3.12.9","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":5,"nbformat":4,"cells":[{"id":"87c3425d-5abe-4e0b-a948-e371808c322c","cell_type":"markdown","source":"# HPC Job Executor\nIn contrast to the [HPC Cluster Executor](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html) which submits individual Python functions to HPC job schedulers, the HPC Job Executors take a given job allocation of the HPC job scheduler and executes Python functions with the resources available in this job allocation. 
In this regard it is similar to the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) as it communicates with the individual Python processes using the [zero message queue](https://zeromq.org/), still it is more advanced as it can access the computational resources of all compute nodes of the given HPC job allocation and also provides the option to assign GPUs as accelerators for parallel execution.\n\nAvailable Functionality: \n* Submit Python functions with the [submit() function or the map() function](https://executorlib.readthedocs.io/en/latest/1-single-node.html#basic-functionality).\n* Support for parallel execution, either using the [message passing interface (MPI)](https://executorlib.readthedocs.io/en/latest/1-single-node.html#mpi-parallel-functions), [thread based parallelism](https://executorlib.readthedocs.io/en/latest/1-single-node.html#thread-parallel-functions) or by [assigning dedicated GPUs](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html#resource-assignment) to selected Python functions. All these resources assignments are handled via the [resource dictionary parameter resource_dict](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary).\n* Performance optimization features, like [block allocation](https://executorlib.readthedocs.io/en/latest/1-single-node.html#block-allocation), [dependency resolution](https://executorlib.readthedocs.io/en/latest/1-single-node.html#dependencies) and [caching](https://executorlib.readthedocs.io/en/latest/1-single-node.html#cache).\n\nThe only parameter the user has to change is the `backend` parameter. ","metadata":{}},{"id":"8c788b9f-6b54-4ce0-a864-4526b7f6f170","cell_type":"markdown","source":"## SLURM\nWith the [Simple Linux Utility for Resource Management (SLURM)](https://slurm.schedmd.com/) currently being the most commonly used job scheduler, executorlib provides an interface to submit Python functions to SLURM. 
Internally, this is based on the [srun](https://slurm.schedmd.com/srun.html) command of the SLURM scheduler, which creates job steps in a given allocation. Given that all resource requests in SLURM are communicated via a central database a large number of submitted Python functions and resulting job steps can slow down the performance of SLURM. To address this limitation it is recommended to install the hierarchical job scheduler [flux](https://flux-framework.org/) in addition to SLURM, to use flux for distributing the resources within a given allocation. This configuration is discussed in more detail below in the section [SLURM with flux](https://executorlib.readthedocs.io/en/latest/3-hpc-job.html#slurm-with-flux).","metadata":{}},{"id":"133b751f-0925-4d11-99f0-3f8dd9360b54","cell_type":"code","source":"from executorlib import SlurmJobExecutor","metadata":{"trusted":true},"outputs":[],"execution_count":1},{"id":"9b74944e-2ccd-4cb0-860a-d876310ea870","cell_type":"markdown","source":"```python\nwith SlurmAllocationExecutor() as exe:\n future = exe.submit(sum, [1, 1])\n print(future.result())\n```","metadata":{}},{"id":"36e2d68a-f093-4082-933a-d95bfe7a60c6","cell_type":"markdown","source":"## SLURM with Flux \nAs discussed in the installation section it is important to select the [flux](https://flux-framework.org/) version compatible to the installation of a given HPC cluster. Which GPUs are available? Who manufactured these GPUs? Does the HPC use [mpich](https://www.mpich.org/) or [OpenMPI](https://www.open-mpi.org/) or one of their commercial counter parts like cray MPI or intel MPI? 
Depending on the configuration different installation options can be choosen, as explained in the [installation section](https://executorlib.readthedocs.io/en/latest/installation.html#hpc-job-executor).\n\nAfterwards flux can be started in an [sbatch](https://slurm.schedmd.com/sbatch.html) submission script using:\n```\nsrun flux start python \n```\nIn this Python script `` the `\"flux_allocation\"` backend can be used.","metadata":{}},{"id":"68be70c3-af18-4165-862d-7022d35bf9e4","cell_type":"markdown","source":"### Resource Assignment\nIndependent of the selected Executor [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html), [HPC Cluster Executor](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html) or HPC job executor the assignment of the computational resources remains the same. They can either be specified in the `submit()` function by adding the resource dictionary parameter [resource_dict](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary) or alternatively during the initialization of the `Executor` class by adding the resource dictionary parameter [resource_dict](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary) there.\n\nThis functionality of executorlib is commonly used to rewrite individual Python functions to use MPI while the rest of the Python program remains serial.","metadata":{}},{"id":"8a2c08df-cfea-4783-ace6-68fcd8ebd330","cell_type":"code","source":"def calc_mpi(i):\n from mpi4py import MPI\n\n size = MPI.COMM_WORLD.Get_size()\n rank = MPI.COMM_WORLD.Get_rank()\n return i, size, rank","metadata":{"trusted":true},"outputs":[],"execution_count":2},{"id":"715e0c00-7b17-40bb-bd55-b0e097bfef07","cell_type":"markdown","source":"Depending on the choice of MPI version, it is recommended to specify the pmi standard which [flux](https://flux-framework.org/) should use internally for the resource assignment. 
For example for OpenMPI >=5 `\"pmix\"` is the recommended pmi standard.","metadata":{}},{"id":"5802c7d7-9560-4909-9d30-a915a91ac0a1","cell_type":"code","source":"from executorlib import FluxJobExecutor\n\nwith FluxJobExecutor(flux_executor_pmi_mode=\"pmix\") as exe:\n fs = exe.submit(calc_mpi, 3, resource_dict={\"cores\": 2})\n print(fs.result())","metadata":{"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"[(3, 2, 0), (3, 2, 1)]\n"}],"execution_count":3},{"id":"da862425-08b6-4ced-999f-89a74e85f410","cell_type":"markdown","source":"### Block Allocation\nThe block allocation for the HPC allocation mode follows the same implementation as the [block allocation for the Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html#block-allocation). It starts by defining the initialization function `init_function()` which returns a dictionary which is internally used to look up input parameters for Python functions submitted to the `FluxJobExecutor` class. 
Commonly this functionality is used to store large data objects inside the Python process created for the block allocation, rather than reloading these Python objects for each submitted function.","metadata":{}},{"id":"cdc742c0-35f7-47ff-88c0-1b0dbeabe51b","cell_type":"code","source":"def init_function():\n return {\"j\": 4, \"k\": 3, \"l\": 2}","metadata":{"trusted":true},"outputs":[],"execution_count":4},{"id":"5ddf8343-ab2c-4469-ac9f-ee568823d4ad","cell_type":"code","source":"def calc_with_preload(i, j, k):\n return i + j + k","metadata":{"trusted":true},"outputs":[],"execution_count":5},{"id":"0da13efa-1941-416f-b9e6-bba15b5cdfa2","cell_type":"code","source":"with FluxJobExecutor(\n flux_executor_pmi_mode=\"pmix\",\n max_workers=2,\n init_function=init_function,\n block_allocation=True,\n) as exe:\n fs = exe.submit(calc_with_preload, 2, j=5)\n print(fs.result())","metadata":{"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"10\n"}],"execution_count":6},{"id":"82f3b947-e662-4a0d-b590-9475e0b4f7dd","cell_type":"markdown","source":"In this example the parameter `k` is used from the dataset created by the initialization function while the parameters `i` and `j` are specified by the call of the `submit()` function. \n\nWhen using the block allocation mode, it is recommended to set either the maxium number of workers using the `max_workers` parameter or the maximum number of CPU cores using the `max_cores` parameter to prevent oversubscribing the available resources. ","metadata":{}},{"id":"8ced8359-8ecb-480b-966b-b85d8446d85c","cell_type":"markdown","source":"### Dependencies\nPython functions with rather different computational resource requirements should not be merged into a single function. 
So to able to execute a series of Python functions which each depend on the output of the previous Python function executorlib internally handles the dependencies based on the [concurrent futures future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) objects from the Python standard library. This implementation is independent of the selected backend and works for HPC allocation mode just like explained in the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html#dependencies) section.","metadata":{}},{"id":"bd26d97b-46fd-4786-9ad1-1e534b31bf36","cell_type":"code","source":"def add_funct(a, b):\n return a + b","metadata":{"trusted":true},"outputs":[],"execution_count":7},{"id":"1a2d440f-3cfc-4ff2-b74d-e21823c65f69","cell_type":"code","source":"with FluxJobExecutor(flux_executor_pmi_mode=\"pmix\") as exe:\n future = 0\n for i in range(1, 4):\n future = exe.submit(add_funct, i, future)\n print(future.result())","metadata":{"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"6\n"}],"execution_count":8},{"id":"f526c2bf-fdf5-463b-a955-020753138415","cell_type":"markdown","source":"### Caching\nFinally, also the caching is available for HPC allocation mode, in analogy to the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html#cache). Again this functionality is not designed to identify function calls with the same parameters, but rather provides the option to reload previously cached results even after the Python processes which contained the executorlib `Executor` class is closed. As the cache is stored on the file system, this option can decrease the performance of executorlib. 
Consequently the caching option should primarily be used during the prototyping phase.","metadata":{}},{"id":"dcba63e0-72f5-49d1-ab04-2092fccc1c47","cell_type":"code","source":"with FluxJobExecutor(flux_executor_pmi_mode=\"pmix\", cache_directory=\"./cache\") as exe:\n future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]\n print([f.result() for f in future_lst])","metadata":{"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"[2, 4, 6]\n"}],"execution_count":9},{"id":"c3958a14-075b-4c10-9729-d1c559a9231c","cell_type":"code","source":"import os\nimport shutil\n\ncache_dir = \"./cache\"\nif os.path.exists(cache_dir):\n print(os.listdir(cache_dir))\n try:\n shutil.rmtree(cache_dir)\n except OSError:\n pass","metadata":{"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"['sum0d968285d17368d1c34ea7392309bcc5', 'sum6270955d7c8022a0c1027aafaee64439', 'sum0102e33bb2921ae07a3bbe3db5d3dec9']\n"}],"execution_count":10},{"id":"c24ca82d-60bd-4fb9-a082-bf9a81e838bf","cell_type":"markdown","source":"### Nested executors\nThe hierarchical nature of the [flux](https://flux-framework.org/) job scheduler allows the creation of additional executorlib Executors inside the functions submitted to the Executor. This hierarchy can be beneficial to separate the logic to saturate the available computational resources. 
","metadata":{}},{"id":"06fb2d1f-65fc-4df6-9402-5e9837835484","cell_type":"code","source":"def calc_nested():\n from executorlib import FluxJobExecutor\n\n with FluxJobExecutor(flux_executor_pmi_mode=\"pmix\") as exe:\n fs = exe.submit(sum, [1, 1])\n return fs.result()","metadata":{"trusted":true},"outputs":[],"execution_count":11},{"id":"89b7d0fd-5978-4913-a79a-f26cc8047445","cell_type":"code","source":"with FluxJobExecutor(flux_executor_pmi_mode=\"pmix\", flux_executor_nesting=True) as exe:\n fs = exe.submit(calc_nested)\n print(fs.result())","metadata":{"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":"2\n"}],"execution_count":12},{"id":"34a8c690-ca5a-41d1-b38f-c67eff085750","cell_type":"markdown","source":"### Resource Monitoring\nFor debugging it is commonly helpful to keep track of the computational resources. [flux](https://flux-framework.org/) provides a number of features to analyse the resource utilization, so here only the two most commonly used ones are introduced. Starting with the option to list all the resources available in a given allocation with the `flux resource list` command:","metadata":{}},{"id":"7481eb0a-a41b-4d46-bb48-b4db299fcd86","cell_type":"code","source":"! flux resource list","metadata":{"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":" STATE NNODES NCORES NGPUS NODELIST\n free 1 24 0 jupyter-pyiron-executorlib-slqpe5j5\n allocated 0 0 0 \n down 0 0 0 \n"}],"execution_count":13},{"id":"08d98134-a0e0-4841-be82-e09e1af29e7f","cell_type":"markdown","source":"Followed by the list of jobs which were executed in a given flux session. This can be retrieved using the `flux jobs -a` command:","metadata":{}},{"id":"1ee6e147-f53a-4526-8ed0-fd036f2ee6bf","cell_type":"code","source":"! 
flux jobs -a","metadata":{"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":" JOBID USER NAME ST NTASKS NNODES TIME INFO\n\u001b[01;32m ƒ5c7bbtT jovyan flux CD 1 1 4.227s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m\u001b[01;32m ƒ47tyNMM jovyan python CD 1 1 2.982s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m\u001b[01;32m ƒ47sVP51 jovyan python CD 1 1 2.902s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m\u001b[01;32m ƒ427vAfR jovyan python CD 1 1 2.986s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m\u001b[01;32m ƒ3jUnECw jovyan python CD 1 1 0.455s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m\u001b[01;32m ƒ3P1G9Uj jovyan python CD 1 1 0.643s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m\u001b[01;32m ƒ38sQze3 jovyan python CD 1 1 0.606s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m\u001b[01;32m ƒ2HHH1w5 jovyan python CD 1 1 1.665s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m\u001b[01;32m ƒ2EvtA1M jovyan python CD 1 1 1.734s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m\u001b[01;32m ƒV4qQRd jovyan python CD 2 1 1.463s jupyter-pyiron-executorlib-slqpe5j5\n\u001b[0;0m"}],"execution_count":14},{"id":"021f165b-27cc-4676-968b-cbcfd1f0210a","cell_type":"markdown","source":"## Flux\nWhile the number of HPC clusters which use [flux](https://flux-framework.org/) as primary job scheduler is currently still limited the setup and functionality provided by executorlib for running [SLURM with flux](https://executorlib.readthedocs.io/en/latest/3-hpc-job.html#slurm-with-flux) also applies to HPCs which use [flux](https://flux-framework.org/) as primary job scheduler.","metadata":{}},{"id":"04f03ebb-3f9e-4738-b9d2-5cb0db9b63c3","cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]} \ No newline at end of file +{ + "metadata": { + "kernelspec": { + "name": "flux", + "display_name": "Flux", + "language": "python" + }, + "language_info": { + "name": "python", + "version": "3.12.9", + 
"mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + } + }, + "nbformat_minor": 5, + "nbformat": 4, + "cells": [ + { + "id": "87c3425d-5abe-4e0b-a948-e371808c322c", + "cell_type": "markdown", + "source": "# HPC Job Executor\nIn contrast to the [HPC Cluster Executor](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html) which submits individual Python functions to HPC job schedulers, the HPC Job Executors take a given job allocation of the HPC job scheduler and executes Python functions with the resources available in this job allocation. In this regard it is similar to the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) as it communicates with the individual Python processes using the [zero message queue](https://zeromq.org/), still it is more advanced as it can access the computational resources of all compute nodes of the given HPC job allocation and also provides the option to assign GPUs as accelerators for parallel execution.\n\nAvailable Functionality: \n* Submit Python functions with the [submit() function or the map() function](https://executorlib.readthedocs.io/en/latest/1-single-node.html#basic-functionality).\n* Support for parallel execution, either using the [message passing interface (MPI)](https://executorlib.readthedocs.io/en/latest/1-single-node.html#mpi-parallel-functions), [thread based parallelism](https://executorlib.readthedocs.io/en/latest/1-single-node.html#thread-parallel-functions) or by [assigning dedicated GPUs](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html#resource-assignment) to selected Python functions. 
All these resources assignments are handled via the [resource dictionary parameter resource_dict](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary).\n* Performance optimization features, like [block allocation](https://executorlib.readthedocs.io/en/latest/1-single-node.html#block-allocation), [dependency resolution](https://executorlib.readthedocs.io/en/latest/1-single-node.html#dependencies) and [caching](https://executorlib.readthedocs.io/en/latest/1-single-node.html#cache).\n\nThe only parameter the user has to change is the `backend` parameter. ", + "metadata": {} + }, + { + "id": "8c788b9f-6b54-4ce0-a864-4526b7f6f170", + "cell_type": "markdown", + "source": "## SLURM\nWith the [Simple Linux Utility for Resource Management (SLURM)](https://slurm.schedmd.com/) currently being the most commonly used job scheduler, executorlib provides an interface to submit Python functions to SLURM. Internally, this is based on the [srun](https://slurm.schedmd.com/srun.html) command of the SLURM scheduler, which creates job steps in a given allocation. Given that all resource requests in SLURM are communicated via a central database a large number of submitted Python functions and resulting job steps can slow down the performance of SLURM. To address this limitation it is recommended to install the hierarchical job scheduler [flux](https://flux-framework.org/) in addition to SLURM, to use flux for distributing the resources within a given allocation. 
This configuration is discussed in more detail below in the section [SLURM with flux](https://executorlib.readthedocs.io/en/latest/3-hpc-job.html#slurm-with-flux).", + "metadata": {} + }, + { + "id": "133b751f-0925-4d11-99f0-3f8dd9360b54", + "cell_type": "code", + "source": "from executorlib import SlurmJobExecutor", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 1 + }, + { + "id": "9b74944e-2ccd-4cb0-860a-d876310ea870", + "cell_type": "markdown", + "source": "```python\nwith SlurmJobExecutor() as exe:\n future = exe.submit(sum, [1, 1])\n print(future.result())\n```", + "metadata": {} + }, + { + "id": "36e2d68a-f093-4082-933a-d95bfe7a60c6", + "cell_type": "markdown", + "source": "## SLURM with Flux \nAs discussed in the installation section it is important to select the [flux](https://flux-framework.org/) version compatible to the installation of a given HPC cluster. Which GPUs are available? Who manufactured these GPUs? Does the HPC use [mpich](https://www.mpich.org/) or [OpenMPI](https://www.open-mpi.org/) or one of their commercial counterparts like cray MPI or intel MPI? Depending on the configuration different installation options can be chosen, as explained in the [installation section](https://executorlib.readthedocs.io/en/latest/installation.html#hpc-job-executor).\n\nAfterwards flux can be started in an [sbatch](https://slurm.schedmd.com/sbatch.html) submission script using:\n```\nsrun flux start python <script.py>\n```\nIn this Python script `<script.py>` the `\"flux_allocation\"` backend can be used.", + "metadata": {} + }, + { + "id": "68be70c3-af18-4165-862d-7022d35bf9e4", + "cell_type": "markdown", + "source": "### Resource Assignment\nIndependent of the selected Executor [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html), [HPC Cluster Executor](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html) or HPC job executor the assignment of the computational resources remains the same. 
They can either be specified in the `submit()` function by adding the resource dictionary parameter [resource_dict](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary) or alternatively during the initialization of the `Executor` class by adding the resource dictionary parameter [resource_dict](https://executorlib.readthedocs.io/en/latest/trouble_shooting.html#resource-dictionary) there.\n\nThis functionality of executorlib is commonly used to rewrite individual Python functions to use MPI while the rest of the Python program remains serial.", + "metadata": {} + }, + { + "id": "8a2c08df-cfea-4783-ace6-68fcd8ebd330", + "cell_type": "code", + "source": "def calc_mpi(i):\n from mpi4py import MPI\n\n size = MPI.COMM_WORLD.Get_size()\n rank = MPI.COMM_WORLD.Get_rank()\n return i, size, rank", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 2 + }, + { + "id": "715e0c00-7b17-40bb-bd55-b0e097bfef07", + "cell_type": "markdown", + "source": "Depending on the choice of MPI version, it is recommended to specify the pmi standard which [flux](https://flux-framework.org/) should use internally for the resource assignment. 
For example for OpenMPI >=5 `\"pmix\"` is the recommended pmi standard.", + "metadata": {} + }, + { + "id": "5802c7d7-9560-4909-9d30-a915a91ac0a1", + "cell_type": "code", + "source": "from executorlib import FluxJobExecutor\n\nwith FluxJobExecutor(flux_executor_pmi_mode=\"pmix\") as exe:\n fs = exe.submit(calc_mpi, 3, resource_dict={\"cores\": 2})\n print(fs.result())", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "[(3, 2, 0), (3, 2, 1)]\n" + } + ], + "execution_count": 3 + }, + { + "id": "da862425-08b6-4ced-999f-89a74e85f410", + "cell_type": "markdown", + "source": "### Block Allocation\nThe block allocation for the HPC allocation mode follows the same implementation as the [block allocation for the Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html#block-allocation). It starts by defining the initialization function `init_function()` which returns a dictionary which is internally used to look up input parameters for Python functions submitted to the `FluxJobExecutor` class. 
Commonly this functionality is used to store large data objects inside the Python process created for the block allocation, rather than reloading these Python objects for each submitted function.", + "metadata": {} + }, + { + "id": "cdc742c0-35f7-47ff-88c0-1b0dbeabe51b", + "cell_type": "code", + "source": "def init_function():\n return {\"j\": 4, \"k\": 3, \"l\": 2}", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 4 + }, + { + "id": "5ddf8343-ab2c-4469-ac9f-ee568823d4ad", + "cell_type": "code", + "source": "def calc_with_preload(i, j, k):\n return i + j + k", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 5 + }, + { + "id": "0da13efa-1941-416f-b9e6-bba15b5cdfa2", + "cell_type": "code", + "source": "with FluxJobExecutor(\n flux_executor_pmi_mode=\"pmix\",\n max_workers=2,\n init_function=init_function,\n block_allocation=True,\n) as exe:\n fs = exe.submit(calc_with_preload, 2, j=5)\n print(fs.result())", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "10\n" + } + ], + "execution_count": 6 + }, + { + "id": "82f3b947-e662-4a0d-b590-9475e0b4f7dd", + "cell_type": "markdown", + "source": "In this example the parameter `k` is used from the dataset created by the initialization function while the parameters `i` and `j` are specified by the call of the `submit()` function. \n\nWhen using the block allocation mode, it is recommended to set either the maximum number of workers using the `max_workers` parameter or the maximum number of CPU cores using the `max_cores` parameter to prevent oversubscribing the available resources. ", + "metadata": {} + }, + { + "id": "8ced8359-8ecb-480b-966b-b85d8446d85c", + "cell_type": "markdown", + "source": "### Dependencies\nPython functions with rather different computational resource requirements should not be merged into a single function. 
So to be able to execute a series of Python functions which each depend on the output of the previous Python function executorlib internally handles the dependencies based on the [concurrent futures future](https://docs.python.org/3/library/concurrent.futures.html#future-objects) objects from the Python standard library. This implementation is independent of the selected backend and works for HPC allocation mode just like explained in the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html#dependencies) section.", + "metadata": {} + }, + { + "id": "bd26d97b-46fd-4786-9ad1-1e534b31bf36", + "cell_type": "code", + "source": "def add_funct(a, b):\n return a + b", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 7 + }, + { + "id": "1a2d440f-3cfc-4ff2-b74d-e21823c65f69", + "cell_type": "code", + "source": "with FluxJobExecutor(flux_executor_pmi_mode=\"pmix\") as exe:\n future = 0\n for i in range(1, 4):\n future = exe.submit(add_funct, i, future)\n print(future.result())", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "6\n" + } + ], + "execution_count": 8 + }, + { + "id": "f526c2bf-fdf5-463b-a955-020753138415", + "cell_type": "markdown", + "source": "### Caching\nFinally, also the caching is available for HPC allocation mode, in analogy to the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html#cache). Again this functionality is not designed to identify function calls with the same parameters, but rather provides the option to reload previously cached results even after the Python processes which contained the executorlib `Executor` class is closed. As the cache is stored on the file system, this option can decrease the performance of executorlib. 
Consequently the caching option should primarily be used during the prototyping phase.", + "metadata": {} + }, + { + "id": "dcba63e0-72f5-49d1-ab04-2092fccc1c47", + "cell_type": "code", + "source": [ + "with FluxJobExecutor(flux_executor_pmi_mode=\"pmix\", cache_directory=\"./file\") as exe:\n", + " future_lst = [exe.submit(sum, [i, i]) for i in range(1, 4)]\n", + " print([f.result() for f in future_lst])" + ], + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "[2, 4, 6]\n" + } + ], + "execution_count": 9 + }, + { + "id": "c3958a14-075b-4c10-9729-d1c559a9231c", + "cell_type": "code", + "source": [ + "import os\n", + "import shutil\n", + "\n", + "cache_dir = \"./file\"\n", + "if os.path.exists(cache_dir):\n", + " print(os.listdir(cache_dir))\n", + " try:\n", + " shutil.rmtree(cache_dir)\n", + " except OSError:\n", + " pass" + ], + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "['sum0d968285d17368d1c34ea7392309bcc5', 'sum6270955d7c8022a0c1027aafaee64439', 'sum0102e33bb2921ae07a3bbe3db5d3dec9']\n" + } + ], + "execution_count": 10 + }, + { + "id": "c24ca82d-60bd-4fb9-a082-bf9a81e838bf", + "cell_type": "markdown", + "source": "### Nested executors\nThe hierarchical nature of the [flux](https://flux-framework.org/) job scheduler allows the creation of additional executorlib Executors inside the functions submitted to the Executor. This hierarchy can be beneficial to separate the logic to saturate the available computational resources. 
", + "metadata": {} + }, + { + "id": "06fb2d1f-65fc-4df6-9402-5e9837835484", + "cell_type": "code", + "source": "def calc_nested():\n from executorlib import FluxJobExecutor\n\n with FluxJobExecutor(flux_executor_pmi_mode=\"pmix\") as exe:\n fs = exe.submit(sum, [1, 1])\n return fs.result()", + "metadata": { + "trusted": true + }, + "outputs": [], + "execution_count": 11 + }, + { + "id": "89b7d0fd-5978-4913-a79a-f26cc8047445", + "cell_type": "code", + "source": "with FluxJobExecutor(flux_executor_pmi_mode=\"pmix\", flux_executor_nesting=True) as exe:\n fs = exe.submit(calc_nested)\n print(fs.result())", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": "2\n" + } + ], + "execution_count": 12 + }, + { + "id": "34a8c690-ca5a-41d1-b38f-c67eff085750", + "cell_type": "markdown", + "source": "### Resource Monitoring\nFor debugging it is commonly helpful to keep track of the computational resources. [flux](https://flux-framework.org/) provides a number of features to analyse the resource utilization, so here only the two most commonly used ones are introduced. Starting with the option to list all the resources available in a given allocation with the `flux resource list` command:", + "metadata": {} + }, + { + "id": "7481eb0a-a41b-4d46-bb48-b4db299fcd86", + "cell_type": "code", + "source": "! flux resource list", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": " STATE NNODES NCORES NGPUS NODELIST\n free 1 24 0 jupyter-pyiron-executorlib-slqpe5j5\n allocated 0 0 0 \n down 0 0 0 \n" + } + ], + "execution_count": 13 + }, + { + "id": "08d98134-a0e0-4841-be82-e09e1af29e7f", + "cell_type": "markdown", + "source": "Followed by the list of jobs which were executed in a given flux session. 
This can be retrieved using the `flux jobs -a` command:", + "metadata": {} + }, + { + "id": "1ee6e147-f53a-4526-8ed0-fd036f2ee6bf", + "cell_type": "code", + "source": "! flux jobs -a", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": " JOBID USER NAME ST NTASKS NNODES TIME INFO\n\u001B[01;32m ƒ5c7bbtT jovyan flux CD 1 1 4.227s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m\u001B[01;32m ƒ47tyNMM jovyan python CD 1 1 2.982s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m\u001B[01;32m ƒ47sVP51 jovyan python CD 1 1 2.902s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m\u001B[01;32m ƒ427vAfR jovyan python CD 1 1 2.986s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m\u001B[01;32m ƒ3jUnECw jovyan python CD 1 1 0.455s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m\u001B[01;32m ƒ3P1G9Uj jovyan python CD 1 1 0.643s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m\u001B[01;32m ƒ38sQze3 jovyan python CD 1 1 0.606s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m\u001B[01;32m ƒ2HHH1w5 jovyan python CD 1 1 1.665s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m\u001B[01;32m ƒ2EvtA1M jovyan python CD 1 1 1.734s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m\u001B[01;32m ƒV4qQRd jovyan python CD 2 1 1.463s jupyter-pyiron-executorlib-slqpe5j5\n\u001B[0;0m" + } + ], + "execution_count": 14 + }, + { + "id": "021f165b-27cc-4676-968b-cbcfd1f0210a", + "cell_type": "markdown", + "source": "## Flux\nWhile the number of HPC clusters which use [flux](https://flux-framework.org/) as primary job scheduler is currently still limited the setup and functionality provided by executorlib for running [SLURM with flux](https://executorlib.readthedocs.io/en/latest/3-hpc-job.html#slurm-with-flux) also applies to HPCs which use [flux](https://flux-framework.org/) as primary job scheduler.", + "metadata": {} + }, + { + "id": "04f03ebb-3f9e-4738-b9d2-5cb0db9b63c3", + "cell_type": "code", + "source": "", + "metadata": { + 
"trusted": true + }, + "outputs": [], + "execution_count": null + } + ] +} diff --git a/notebooks/4-developer.ipynb b/notebooks/4-developer.ipynb index 97f7b0c9..29653b93 100644 --- a/notebooks/4-developer.ipynb +++ b/notebooks/4-developer.ipynb @@ -71,32 +71,32 @@ "## Modules\n", "While it is not recommended to link to specific internal components of executorlib in external Python packages but rather only the `Executor` classes should be used as central interfaces to executorlib, the internal architecture is briefly outlined below. \n", "* `backend` - the backend module contains the functionality for the Python processes created by executorlib to execute the submitted Python functions.\n", - "* `base` - the base module contains the definition of the executorlib `ExecutorBase` class which is internally used to create the different interfaces. To compare if an given `Executor` class is based on executorlib compare with the `ExecutorBase` class which can be imported as `from executorlib.base.executor import ExecutorBase`.\n", - "* `cache` - the cache module defines the file based communication for the [HPC Cluster Executor](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html).\n", - "* `interactive` - the interactive modules defines the [zero message queue](https://zeromq.org) based communication for the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) and the [HPC Job Executor](https://executorlib.readthedocs.io/en/latest/3-hpc-job.html).\n", - "* `interfaces` - the different `Executor` classes are defined here, namely `SingleNodeExecutor`, `SlurmClusterExecutor`, `SlurmJobExecutor`, `FluxClusterExecutor` and `FluxJobExecutor`.\n", + "* `executor` - the executor module defines the different `Executor` classes, namely `SingleNodeExecutor`, `SlurmClusterExecutor`, `SlurmJobExecutor`, `FluxClusterExecutor` and `FluxJobExecutor`. 
These are the interfaces the user interacts with.\n", "* `standalone` - the standalone module contains a number of utility functions which only depend on external libraries and do not have any internal dependency to other parts of `executorlib`. This includes the functionality to generate executable commands, the [h5py](https://www.h5py.org) based interface for caching, a number of input checks, routines to plot the dependencies of a number of future objects, functionality to interact with the [queues defined in the Python standard library](https://docs.python.org/3/library/queue.html), the interface for serialization based on [cloudpickle](https://github.com/cloudpipe/cloudpickle) and finally an extension to the [threading](https://docs.python.org/3/library/threading.html) of the Python standard library.\n", + "* `task_scheduler` - the internal task scheduler module defines the task schedulers, namely `BlockAllocationTaskScheduler`, `DependencyTaskScheduler`, `FileTaskScheduler` and `OneProcessTaskScheduler`. They are divided into two sub modules:\n", + " * `file` - the file based task scheduler module defines the file based communication for the [HPC Cluster Executor](https://executorlib.readthedocs.io/en/latest/2-hpc-cluster.html).\n", + " * `interactive` - the interactive task scheduler module defines the [zero message queue](https://zeromq.org) based communication for the [Single Node Executor](https://executorlib.readthedocs.io/en/latest/1-single-node.html) and the [HPC Job Executor](https://executorlib.readthedocs.io/en/latest/3-hpc-job.html).\n", "\n", "Given the level of separation the integration of submodules from the standalone module in external software packages should be the easiest way to benefit from the developments in executorlib beyond just using the `Executor` class. 
\n", "\n", "## Interface Class Hierarchy\n", - "executorlib provides five different interfaces, namely `SingleNodeExecutor`, `SlurmClusterExecutor`, `SlurmJobExecutor`, `FluxClusterExecutor` and `FluxJobExecutor`, internally these are mapped to four types of task schedulers `Executor._task_scheduler`, namely `BlockAllocationExecutor`, `DependencyExecutor`, `FileExecutor` and `OneTaskPerProcessExecutor` depending on which options are selected. The dependence is illustrated in the following table:\n", + "executorlib provides five different interfaces, namely `SingleNodeExecutor`, `SlurmClusterExecutor`, `SlurmJobExecutor`, `FluxClusterExecutor` and `FluxJobExecutor`. Internally, these are mapped to four types of task schedulers, namely `BlockAllocationTaskScheduler`, `DependencyTaskScheduler`, `FileTaskScheduler` and `OneProcessTaskScheduler`, depending on which options are selected. Finally, the task schedulers are connected to spawners to start new processes, namely the `MpiExecSpawner`, `SrunSpawner` and `FluxPythonSpawner`. 
The dependence is illustrated in the following table:\n", "\n", - "| | `BlockAllocationExecutor` | `DependencyExecutor` | `FileExecutor` | `OneTaskPerProcessExecutor` |\n", - "|-------------------------------------------------------------------------|---------------------------|--------------------------|----------------|-----------------------------|\n", - "| `SingleNodeExecutor(disable_dependencies=False)` | | with `MpiExecSpawner` | | |\n", - "| `SingleNodeExecutor(disable_dependencies=True, block_allocation=False)` | | | | with `MpiExecSpawner` |\n", - "| `SingleNodeExecutor(disable_dependencies=True, block_allocation=True)` | with `MpiExecSpawner` | | | |\n", - "| `SlurmClusterExecutor(plot_dependency_graph=False)` | | | with `pysqa` | |\n", - "| `SlurmClusterExecutor(plot_dependency_graph=True)` | | with `SrunSpawner` | | |\n", - "| `SlurmJobExecutor(disable_dependencies=False)` | | with `SrunSpawner` | | |\n", - "| `SlurmJobExecutor(disable_dependencies=True, block_allocation=False)` | | | | with `SrunSpawner` |\n", - "| `SlurmJobExecutor(disable_dependencies=True, block_allocation=True)` | with `SrunSpawner` | | | |\n", - "| `FluxClusterExecutor(plot_dependency_graph=False)` | | | with `pysqa` | |\n", - "| `FluxClusterExecutor(plot_dependency_graph=True)` | | with `FluxPythonSpawner` | | |\n", - "| `FluxJobExecutor(disable_dependencies=False)` | | with `FluxPythonSpawner` | | |\n", - "| `FluxJobExecutor(disable_dependencies=True, block_allocation=False)` | | | | with `FluxPythonSpawner` |\n", - "| `FluxJobExecutor(disable_dependencies=True, block_allocation=True)` | with `FluxPythonSpawner` | | | |" + "| | `BlockAllocationTaskScheduler` | `DependencyTaskScheduler` | `FileTaskScheduler` | `OneProcessTaskScheduler` |\n", + "|-------------------------------------------------------------------------|--------------------------------|---------------------------|---------------------|---------------------------|\n", + "| 
`SingleNodeExecutor(disable_dependencies=False)` | | with `MpiExecSpawner` | | |\n", + "| `SingleNodeExecutor(disable_dependencies=True, block_allocation=False)` | | | | with `MpiExecSpawner` |\n", + "| `SingleNodeExecutor(disable_dependencies=True, block_allocation=True)` | with `MpiExecSpawner` | | | |\n", + "| `SlurmClusterExecutor(plot_dependency_graph=False)` | | | with `pysqa` | |\n", + "| `SlurmClusterExecutor(plot_dependency_graph=True)` | | with `SrunSpawner` | | |\n", + "| `SlurmJobExecutor(disable_dependencies=False)` | | with `SrunSpawner` | | |\n", + "| `SlurmJobExecutor(disable_dependencies=True, block_allocation=False)` | | | | with `SrunSpawner` |\n", + "| `SlurmJobExecutor(disable_dependencies=True, block_allocation=True)` | with `SrunSpawner` | | | |\n", + "| `FluxClusterExecutor(plot_dependency_graph=False)` | | | with `pysqa` | |\n", + "| `FluxClusterExecutor(plot_dependency_graph=True)` | | with `FluxPythonSpawner` | | |\n", + "| `FluxJobExecutor(disable_dependencies=False)` | | with `FluxPythonSpawner` | | |\n", + "| `FluxJobExecutor(disable_dependencies=True, block_allocation=False)` | | | | with `FluxPythonSpawner` |\n", + "| `FluxJobExecutor(disable_dependencies=True, block_allocation=True)` | with `FluxPythonSpawner` | | | |" ], "metadata": {} }, diff --git a/tests/test_base_executor_queue.py b/tests/test_base_executor_queue.py index d27840dd..7e6f40b3 100644 --- a/tests/test_base_executor_queue.py +++ b/tests/test_base_executor_queue.py @@ -2,7 +2,7 @@ from queue import Queue import unittest -from executorlib.base.executor import cancel_items_in_queue +from executorlib.standalone.queue import cancel_items_in_queue class TestQueue(unittest.TestCase): diff --git a/tests/test_cache_backend_execute.py b/tests/test_cache_backend_execute.py index 0dce06c6..2e38f2cf 100644 --- a/tests/test_cache_backend_execute.py +++ b/tests/test_cache_backend_execute.py @@ -5,8 +5,8 @@ try: - from executorlib.cache.backend import backend_execute_task_in_file - 
from executorlib.cache.shared import _check_task_output, FutureItem + from executorlib.task_scheduler.file.backend import backend_execute_task_in_file + from executorlib.task_scheduler.file.shared import _check_task_output, FutureItem from executorlib.standalone.hdf import dump, get_runtime from executorlib.standalone.serialize import serialize_funct_h5 diff --git a/tests/test_cache_fileexecutor_mpi.py b/tests/test_cache_fileexecutor_mpi.py index e30dd699..d4a4f3a7 100644 --- a/tests/test_cache_fileexecutor_mpi.py +++ b/tests/test_cache_fileexecutor_mpi.py @@ -3,11 +3,11 @@ import shutil import unittest -from executorlib.cache.subprocess_spawner import execute_in_subprocess +from executorlib.task_scheduler.file.subprocess_spawner import execute_in_subprocess try: - from executorlib.cache.executor import FileExecutor + from executorlib.task_scheduler.file.task_scheduler import FileTaskScheduler skip_h5py_test = False except ImportError: @@ -31,7 +31,7 @@ def mpi_funct(i): ) class TestCacheExecutorMPI(unittest.TestCase): def test_executor(self): - with FileExecutor( + with FileTaskScheduler( resource_dict={"cores": 2}, execute_function=execute_in_subprocess ) as exe: fs1 = exe.submit(mpi_funct, 1) diff --git a/tests/test_cache_fileexecutor_serial.py b/tests/test_cache_fileexecutor_serial.py index c6ac1b1a..c28c858b 100644 --- a/tests/test_cache_fileexecutor_serial.py +++ b/tests/test_cache_fileexecutor_serial.py @@ -5,14 +5,14 @@ import unittest from threading import Thread -from executorlib.cache.subprocess_spawner import ( +from executorlib.task_scheduler.file.subprocess_spawner import ( execute_in_subprocess, terminate_subprocess, ) try: - from executorlib.cache.executor import FileExecutor, create_file_executor - from executorlib.cache.shared import execute_tasks_h5 + from executorlib.task_scheduler.file.task_scheduler import FileTaskScheduler, create_file_executor + from executorlib.task_scheduler.file.shared import execute_tasks_h5 skip_h5py_test = False except 
ImportError: @@ -36,14 +36,14 @@ def get_error(a): ) class TestCacheExecutorSerial(unittest.TestCase): def test_executor_mixed(self): - with FileExecutor(execute_function=execute_in_subprocess) as exe: + with FileTaskScheduler(execute_function=execute_in_subprocess) as exe: fs1 = exe.submit(my_funct, 1, b=2) self.assertFalse(fs1.done()) self.assertEqual(fs1.result(), 3) self.assertTrue(fs1.done()) def test_executor_dependence_mixed(self): - with FileExecutor(execute_function=execute_in_subprocess) as exe: + with FileTaskScheduler(execute_function=execute_in_subprocess) as exe: fs1 = exe.submit(my_funct, 1, b=2) fs2 = exe.submit(my_funct, 1, b=fs1) self.assertFalse(fs2.done()) @@ -58,7 +58,7 @@ def test_create_file_executor_error(self): def test_executor_dependence_error(self): with self.assertRaises(ValueError): - with FileExecutor( + with FileTaskScheduler( execute_function=execute_in_subprocess, disable_dependencies=True ) as exe: fs = exe.submit(my_funct, 1, b=exe.submit(my_funct, 1, b=2)) @@ -66,7 +66,7 @@ def test_executor_dependence_error(self): def test_executor_working_directory(self): cwd = os.path.join(os.path.dirname(__file__), "executables") - with FileExecutor( + with FileTaskScheduler( resource_dict={"cwd": cwd}, execute_function=execute_in_subprocess ) as exe: fs1 = exe.submit(list_files_in_working_directory) @@ -74,7 +74,7 @@ def test_executor_working_directory(self): def test_executor_error(self): cwd = os.path.join(os.path.dirname(__file__), "executables") - with FileExecutor( + with FileTaskScheduler( resource_dict={"cwd": cwd}, execute_function=execute_in_subprocess ) as exe: fs1 = exe.submit(get_error, a=1) diff --git a/tests/test_fluxjobexecutor.py b/tests/test_fluxjobexecutor.py index d5505e68..8cfa8e9a 100644 --- a/tests/test_fluxjobexecutor.py +++ b/tests/test_fluxjobexecutor.py @@ -8,7 +8,7 @@ try: import flux.job - from executorlib.interactive.fluxspawner import FluxPythonSpawner + from executorlib.task_scheduler.interactive.fluxspawner 
import FluxPythonSpawner skip_flux_test = "FLUX_URI" not in os.environ pmi = os.environ.get("EXECUTORLIB_PMIX", None) diff --git a/tests/test_fluxjobexecutor_plot.py b/tests/test_fluxjobexecutor_plot.py index 74b920f6..1b71239e 100644 --- a/tests/test_fluxjobexecutor_plot.py +++ b/tests/test_fluxjobexecutor_plot.py @@ -10,7 +10,7 @@ try: import pygraphviz import flux.job - from executorlib.interactive.fluxspawner import FluxPythonSpawner + from executorlib.task_scheduler.interactive.fluxspawner import FluxPythonSpawner skip_graphviz_flux_test = "FLUX_URI" not in os.environ except ImportError: diff --git a/tests/test_fluxpythonspawner.py b/tests/test_fluxpythonspawner.py index 7cf0ad89..bf8eb939 100644 --- a/tests/test_fluxpythonspawner.py +++ b/tests/test_fluxpythonspawner.py @@ -5,14 +5,14 @@ import numpy as np -from executorlib.interactive.shared import execute_tasks -from executorlib.interactive.blockallocation import BlockAllocationExecutor +from executorlib.task_scheduler.interactive.shared import execute_tasks +from executorlib.task_scheduler.interactive.blockallocation import BlockAllocationTaskScheduler from executorlib.standalone.serialize import cloudpickle_register try: import flux.job - from executorlib.interactive.fluxspawner import FluxPythonSpawner + from executorlib.task_scheduler.interactive.fluxspawner import FluxPythonSpawner skip_flux_test = "FLUX_URI" not in os.environ pmi = os.environ.get("EXECUTORLIB_PMIX", None) @@ -48,7 +48,7 @@ def setUp(self): self.flux_executor = flux.job.FluxExecutor() def test_flux_executor_serial(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=2, executor_kwargs={"flux_executor": self.flux_executor, "priority": 20}, spawner=FluxPythonSpawner, @@ -61,7 +61,7 @@ def test_flux_executor_serial(self): self.assertTrue(fs_2.done()) def test_flux_executor_threads(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={ "flux_executor": 
self.flux_executor, @@ -77,7 +77,7 @@ def test_flux_executor_threads(self): self.assertTrue(fs_2.done()) def test_flux_executor_parallel(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={ "flux_executor": self.flux_executor, @@ -91,7 +91,7 @@ def test_flux_executor_parallel(self): self.assertTrue(fs_1.done()) def test_single_task(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={ "flux_executor": self.flux_executor, @@ -138,7 +138,7 @@ def test_execute_task_threads(self): q.join() def test_internal_memory(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={ "flux_executor": self.flux_executor, diff --git a/tests/test_interactive_dependencies.py b/tests/test_interactive_dependencies.py index 24ed3c04..74d48d2b 100644 --- a/tests/test_interactive_dependencies.py +++ b/tests/test_interactive_dependencies.py @@ -5,7 +5,7 @@ import numpy as np -from executorlib.interactive.blockallocation import BlockAllocationExecutor +from executorlib.task_scheduler.interactive.blockallocation import BlockAllocationTaskScheduler from executorlib.standalone.interactive.spawner import MpiExecSpawner @@ -18,7 +18,7 @@ def calc(i): class TestFuture(unittest.TestCase): def test_pool_serial(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 1}, spawner=MpiExecSpawner, @@ -34,7 +34,7 @@ def test_pool_serial(self): skip_mpi4py_test, "mpi4py is not installed, so the mpi4py tests are skipped." 
) def test_pool_serial_multi_core(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 2}, spawner=MpiExecSpawner, @@ -67,7 +67,7 @@ def callback(future): def submit(): # Executor only exists in this scope and can get garbage collected after # this function is exits - future = BlockAllocationExecutor( + future = BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={}, spawner=MpiExecSpawner, @@ -108,7 +108,7 @@ def __init__(self): def run(self): self.running = True - future = BlockAllocationExecutor( + future = BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={}, spawner=MpiExecSpawner, diff --git a/tests/test_interactive_slurmspawner.py b/tests/test_interactive_slurmspawner.py index 147e871d..a0af5b67 100644 --- a/tests/test_interactive_slurmspawner.py +++ b/tests/test_interactive_slurmspawner.py @@ -1,8 +1,8 @@ import unittest -from executorlib.interactive.slurmspawner import generate_slurm_command +from executorlib.task_scheduler.interactive.slurmspawner import generate_slurm_command try: - from executorlib.cache.queue_spawner import _pysqa_execute_command + from executorlib.task_scheduler.file.queue_spawner import _pysqa_execute_command skip_pysqa_test = False except ImportError: diff --git a/tests/test_mpiexecspawner.py b/tests/test_mpiexecspawner.py index 4d36fb86..a1d08cdc 100644 --- a/tests/test_mpiexecspawner.py +++ b/tests/test_mpiexecspawner.py @@ -7,11 +7,11 @@ import numpy as np -from executorlib.base.executor import ExecutorBase +from executorlib.task_scheduler.base import TaskSchedulerBase from executorlib.standalone.interactive.spawner import MpiExecSpawner -from executorlib.interactive.shared import execute_tasks -from executorlib.interactive.blockallocation import BlockAllocationExecutor -from executorlib.interactive.onetoone import OneTaskPerProcessExecutor +from executorlib.task_scheduler.interactive.shared import execute_tasks +from 
executorlib.task_scheduler.interactive.blockallocation import BlockAllocationTaskScheduler +from executorlib.task_scheduler.interactive.onetoone import OneProcessTaskScheduler from executorlib.standalone.interactive.backend import call_funct from executorlib.standalone.serialize import cloudpickle_register @@ -64,7 +64,7 @@ def sleep_one(i): class TestPyMpiExecutorSerial(unittest.TestCase): def test_pympiexecutor_two_workers(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=2, executor_kwargs={}, spawner=MpiExecSpawner, @@ -78,7 +78,7 @@ def test_pympiexecutor_two_workers(self): self.assertTrue(fs_2.done()) def test_max_workers(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=2, executor_kwargs={}, spawner=MpiExecSpawner, @@ -86,7 +86,7 @@ def test_max_workers(self): self.assertEqual(exe.max_workers, 2) def test_pympiexecutor_one_worker(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={}, spawner=MpiExecSpawner, @@ -102,7 +102,7 @@ def test_pympiexecutor_one_worker(self): class TestPyMpiExecutorStepSerial(unittest.TestCase): def test_pympiexecutor_two_workers(self): - with OneTaskPerProcessExecutor( + with OneProcessTaskScheduler( max_cores=2, executor_kwargs={}, spawner=MpiExecSpawner, @@ -116,7 +116,7 @@ def test_pympiexecutor_two_workers(self): self.assertTrue(fs_2.done()) def test_max_workers(self): - with OneTaskPerProcessExecutor( + with OneProcessTaskScheduler( max_workers=2, executor_kwargs={}, spawner=MpiExecSpawner, @@ -124,7 +124,7 @@ def test_max_workers(self): self.assertEqual(exe.max_workers, 2) def test_pympiexecutor_one_worker(self): - with OneTaskPerProcessExecutor( + with OneProcessTaskScheduler( max_cores=1, executor_kwargs={}, spawner=MpiExecSpawner, @@ -143,7 +143,7 @@ def test_pympiexecutor_one_worker(self): ) class TestPyMpiExecutorMPI(unittest.TestCase): def test_pympiexecutor_one_worker_with_mpi(self): - 
with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 2}, spawner=MpiExecSpawner, @@ -154,7 +154,7 @@ def test_pympiexecutor_one_worker_with_mpi(self): self.assertTrue(fs_1.done()) def test_pympiexecutor_one_worker_with_mpi_multiple_submissions(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 2}, spawner=MpiExecSpawner, @@ -174,7 +174,7 @@ def test_pympiexecutor_one_worker_with_mpi_multiple_submissions(self): ) def test_pympiexecutor_one_worker_with_mpi_echo(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 2}, spawner=MpiExecSpawner, @@ -189,7 +189,7 @@ def test_pympiexecutor_one_worker_with_mpi_echo(self): ) class TestPyMpiStepExecutorMPI(unittest.TestCase): def test_pympiexecutor_one_worker_with_mpi(self): - with OneTaskPerProcessExecutor( + with OneProcessTaskScheduler( max_cores=2, executor_kwargs={"cores": 2}, spawner=MpiExecSpawner, @@ -200,7 +200,7 @@ def test_pympiexecutor_one_worker_with_mpi(self): self.assertTrue(fs_1.done()) def test_pympiexecutor_one_worker_with_mpi_multiple_submissions(self): - with OneTaskPerProcessExecutor( + with OneProcessTaskScheduler( max_cores=2, executor_kwargs={"cores": 2}, spawner=MpiExecSpawner, @@ -220,7 +220,7 @@ def test_pympiexecutor_one_worker_with_mpi_multiple_submissions(self): ) def test_pympiexecutor_one_worker_with_mpi_echo(self): - with OneTaskPerProcessExecutor( + with OneProcessTaskScheduler( max_cores=2, executor_kwargs={"cores": 2}, spawner=MpiExecSpawner, @@ -232,7 +232,7 @@ def test_pympiexecutor_one_worker_with_mpi_echo(self): class TestPyMpiExecutorInitFunction(unittest.TestCase): def test_internal_memory(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={ "cores": 1, @@ -273,7 +273,7 @@ def test_execute_task(self): class TestFuturePool(unittest.TestCase): def 
test_pool_serial(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 1}, spawner=MpiExecSpawner, @@ -288,7 +288,7 @@ def test_pool_serial(self): self.assertEqual(output.result(), np.array(4)) def test_executor_multi_submission(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 1}, spawner=MpiExecSpawner, @@ -301,7 +301,7 @@ def test_executor_multi_submission(self): self.assertTrue(fs_2.done()) def test_shutdown(self): - p = BlockAllocationExecutor( + p = BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 1}, spawner=MpiExecSpawner, @@ -317,7 +317,7 @@ def test_shutdown(self): fs2.result() def test_pool_serial_map(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 1}, spawner=MpiExecSpawner, @@ -327,7 +327,7 @@ def test_pool_serial_map(self): def test_executor_exception(self): with self.assertRaises(RuntimeError): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 1}, spawner=MpiExecSpawner, @@ -337,7 +337,7 @@ def test_executor_exception(self): def test_executor_exception_future(self): with self.assertRaises(RuntimeError): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 1}, spawner=MpiExecSpawner, @@ -358,7 +358,7 @@ def test_meta(self): "openmpi_oversubscribe": False, "max_workers": 1, } - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={ "cores": 2, @@ -374,7 +374,7 @@ def test_meta(self): self.assertEqual(exe.info[k], v) else: self.assertEqual(str(exe.info[k]), v) - with ExecutorBase() as exe: + with TaskSchedulerBase() as exe: self.assertIsNone(exe.info) def test_meta_step(self): @@ -386,7 +386,7 @@ def test_meta_step(self): "openmpi_oversubscribe": False, "max_cores": 2, } - with 
OneTaskPerProcessExecutor( + with OneProcessTaskScheduler( max_cores=2, executor_kwargs={ "cores": 2, @@ -406,7 +406,7 @@ def test_meta_step(self): skip_mpi4py_test, "mpi4py is not installed, so the mpi4py tests are skipped." ) def test_pool_multi_core(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 2}, spawner=MpiExecSpawner, @@ -424,7 +424,7 @@ def test_pool_multi_core(self): skip_mpi4py_test, "mpi4py is not installed, so the mpi4py tests are skipped." ) def test_pool_multi_core_map(self): - with BlockAllocationExecutor( + with BlockAllocationTaskScheduler( max_workers=1, executor_kwargs={"cores": 2}, spawner=MpiExecSpawner, diff --git a/tests/test_singlenodeexecutor_dependencies.py b/tests/test_singlenodeexecutor_dependencies.py index 4e4780d3..cd453bce 100644 --- a/tests/test_singlenodeexecutor_dependencies.py +++ b/tests/test_singlenodeexecutor_dependencies.py @@ -5,8 +5,8 @@ from threading import Thread from executorlib import SingleNodeExecutor -from executorlib.interfaces.single import create_single_node_executor -from executorlib.interactive.dependency import _execute_tasks_with_dependencies +from executorlib.executor.single import create_single_node_executor +from executorlib.task_scheduler.interactive.dependency import _execute_tasks_with_dependencies from executorlib.standalone.serialize import cloudpickle_register from executorlib.standalone.interactive.spawner import MpiExecSpawner diff --git a/tests/test_singlenodeexecutor_shell_executor.py b/tests/test_singlenodeexecutor_shell_executor.py index 7a69cf22..df97ecd2 100644 --- a/tests/test_singlenodeexecutor_shell_executor.py +++ b/tests/test_singlenodeexecutor_shell_executor.py @@ -5,7 +5,7 @@ from executorlib import SingleNodeExecutor from executorlib.standalone.serialize import cloudpickle_register -from executorlib.interactive.shared import execute_tasks +from executorlib.task_scheduler.interactive.shared import execute_tasks from 
executorlib.standalone.interactive.spawner import MpiExecSpawner diff --git a/tests/test_singlenodeexecutor_shell_interactive.py b/tests/test_singlenodeexecutor_shell_interactive.py index 6b4e2d77..0adc54bf 100644 --- a/tests/test_singlenodeexecutor_shell_interactive.py +++ b/tests/test_singlenodeexecutor_shell_interactive.py @@ -6,7 +6,7 @@ from executorlib import SingleNodeExecutor from executorlib.standalone.serialize import cloudpickle_register -from executorlib.interactive.shared import execute_tasks +from executorlib.task_scheduler.interactive.shared import execute_tasks from executorlib.standalone.interactive.spawner import MpiExecSpawner diff --git a/tests/test_standalone_interactive_backend.py b/tests/test_standalone_interactive_backend.py index 40bda2e1..cfa961af 100644 --- a/tests/test_standalone_interactive_backend.py +++ b/tests/test_standalone_interactive_backend.py @@ -4,7 +4,7 @@ from executorlib.standalone.interactive.backend import parse_arguments from executorlib.standalone.interactive.spawner import MpiExecSpawner -from executorlib.interactive.slurmspawner import SrunSpawner +from executorlib.task_scheduler.interactive.slurmspawner import SrunSpawner class TestParser(unittest.TestCase):