From 4e1164204a4d3980532191c77498c7e1c653c056 Mon Sep 17 00:00:00 2001
From: Jan Janssen
Date: Thu, 8 Jun 2023 14:39:27 -0600
Subject: [PATCH 1/3] Add GPU support

---
Review notes (text between the "---" line and the first "diff" line is
not part of the commit message and is skipped when the patch is applied):
 - Server.__init__ assigned None to self._gpus and dropped the `gpus`
   argument; it now stores the argument.
 - Server.to_hdf wrote the GPU count under the "accept_crash" key,
   overwriting that flag; it now writes it under "gpus".
 - Server.from_hdf tested for "gpus" but read "accept_crash"; it now
   reads "gpus".
 - Only the content of three added lines changed; hunk line counts and
   the diffstat are unchanged. The "index" blob ids are those of the
   original commit and do not describe the corrected content.

 pyiron_base/jobs/job/extension/server/generic.py | 15 ++++++++++++++-
 pyiron_base/jobs/job/runfunction.py              | 13 +++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/pyiron_base/jobs/job/extension/server/generic.py b/pyiron_base/jobs/job/extension/server/generic.py
index 40a676006..431c07f38 100644
--- a/pyiron_base/jobs/job/extension/server/generic.py
+++ b/pyiron_base/jobs/job/extension/server/generic.py
@@ -74,10 +74,11 @@ class Server: # add the option to return the job id and the hold id to the serv
     """
 
     def __init__(
-        self, host=None, queue=None, cores=1, threads=1, run_mode="modal", new_hdf=True
+        self, host=None, queue=None, cores=1, threads=1, gpus=None, run_mode="modal", new_hdf=True
     ):
         self._cores = cores
         self._threads = threads
+        self._gpus = gpus
         self._run_time = None
         self._memory_limit = None
         self._host = self._init_host(host=host)
@@ -230,6 +231,14 @@ def threads(self):
     def threads(self, number_of_threads):
         self._threads = number_of_threads
 
+    @property
+    def gpus(self):
+        return self._gpus
+
+    @gpus.setter
+    def gpus(self, number_of_gpus):
+        self._gpus = number_of_gpus
+
     @property
     def cores(self):
         """
@@ -447,6 +456,8 @@ def to_hdf(self, hdf, group_name=None):
         hdf_dict["accept_crash"] = self.accept_crash
         if len(self.additional_arguments) > 0:
             hdf_dict["additional_arguments"] = self.additional_arguments
+        if self._gpus is not None:
+            hdf_dict["gpus"] = self._gpus
 
         if group_name is not None:
             with hdf.open(group_name) as hdf_group:
@@ -490,6 +501,8 @@ def from_hdf(self, hdf, group_name=None):
             self._threads = hdf_dict["threads"]
         if "additional_arguments" in hdf_dict.keys():
             self.additional_arguments = hdf_dict["additional_arguments"]
+        if "gpus" in hdf_dict.keys():
+            self._gpus = hdf_dict["gpus"]
         self._new_hdf = hdf_dict["new_h5"] == 1
 
     def db_entry(self):
diff --git a/pyiron_base/jobs/job/runfunction.py b/pyiron_base/jobs/job/runfunction.py
index 64dcf6fa2..b52abdc49 100644
--- a/pyiron_base/jobs/job/runfunction.py
+++ b/pyiron_base/jobs/job/runfunction.py
@@ -113,7 +113,15 @@ def run_job_with_status_created(job):
     elif job.server.run_mode.srun:
         run_job_with_runmode_srun(job=job)
     elif job.server.run_mode.flux:
-        return run_job_with_runmode_flux(job=job, executor=job.flux_executor)
+        if job.server.gpus is not None:
+            gpus_per_slot = int(job.server.gpus/job.server.cores)
+        else:
+            gpus_per_slot = None
+        return run_job_with_runmode_flux(
+            job=job,
+            executor=job.flux_executor,
+            gpus_per_slot=gpus_per_slot,
+        )
     elif (
         job.server.run_mode.non_modal
         or job.server.run_mode.thread
@@ -443,7 +451,7 @@ def run_job_with_runmode_srun(job):
     )
 
 
-def run_job_with_runmode_flux(job, executor):
+def run_job_with_runmode_flux(job, executor, gpus_per_slot=None):
     if not flux_available:
         raise ModuleNotFoundError(
             "No module named 'flux'. No linux you can install flux via conda."
@@ -480,6 +488,7 @@ def run_job_with_runmode_flux(job, executor):
         script=exeuctable_str,
         num_nodes=1,
         cores_per_slot=1,
+        gpus_per_slot=gpus_per_slot,
         num_slots=job.server.cores,
     )
     jobspec.cwd = job.project_hdf5.working_directory

From 45c21899299558c81cbc54e8b4094c91349cdb47 Mon Sep 17 00:00:00 2001
From: pyiron-runner
Date: Thu, 8 Jun 2023 20:42:29 +0000
Subject: [PATCH 2/3] Format black

---
 pyiron_base/jobs/job/extension/server/generic.py | 9 ++++++++-
 pyiron_base/jobs/job/runfunction.py              | 2 +-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/pyiron_base/jobs/job/extension/server/generic.py b/pyiron_base/jobs/job/extension/server/generic.py
index 431c07f38..c46adf048 100644
--- a/pyiron_base/jobs/job/extension/server/generic.py
+++ b/pyiron_base/jobs/job/extension/server/generic.py
@@ -74,7 +74,14 @@ class Server: # add the option to return the job id and the hold id to the serv
     """
 
     def __init__(
-        self, host=None, queue=None, cores=1, threads=1, gpus=None, run_mode="modal", new_hdf=True
+        self,
+        host=None,
+        queue=None,
+        cores=1,
+        threads=1,
+        gpus=None,
+        run_mode="modal",
+        new_hdf=True,
     ):
         self._cores = cores
         self._threads = threads
diff --git a/pyiron_base/jobs/job/runfunction.py b/pyiron_base/jobs/job/runfunction.py
index b52abdc49..4a463ba4c 100644
--- a/pyiron_base/jobs/job/runfunction.py
+++ b/pyiron_base/jobs/job/runfunction.py
@@ -114,7 +114,7 @@ def run_job_with_status_created(job):
         run_job_with_runmode_srun(job=job)
     elif job.server.run_mode.flux:
         if job.server.gpus is not None:
-            gpus_per_slot = int(job.server.gpus/job.server.cores)
+            gpus_per_slot = int(job.server.gpus / job.server.cores)
         else:
             gpus_per_slot = None
         return run_job_with_runmode_flux(

From 932ee35f3c9a8b3ba3b1fb6aa55033b4504b051d Mon Sep 17 00:00:00 2001
From: Jan Janssen
Date: Thu, 8 Jun 2023 14:52:21 -0600
Subject: [PATCH 3/3] Add GPU support for shell script based executables

---
 pyiron_base/jobs/job/extension/executable.py | 21 +++++++++-----------
 pyiron_base/jobs/job/runfunction.py          |  2 +-
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/pyiron_base/jobs/job/extension/executable.py b/pyiron_base/jobs/job/extension/executable.py
index b92493370..97c9c940b 100644
--- a/pyiron_base/jobs/job/extension/executable.py
+++ b/pyiron_base/jobs/job/extension/executable.py
@@ -217,13 +217,14 @@ def executable_path(self, new_path):
         else:
             self.storage.mpi = False
 
-    def get_input_for_subprocess_call(self, cores, threads):
+    def get_input_for_subprocess_call(self, cores, threads, gpus=None):
         """
         Get the input parameters for the subprocess call to execute the job
 
         Args:
             cores (int): number of cores
             threads (int): number of threads
+            gpus (int/None): number of gpus
 
         Returns:
             str/ list, boolean: executable and shell variables
@@ -231,18 +232,14 @@ def get_input_for_subprocess_call(self, cores, threads):
         if cores == 1 or not self.mpi:
             executable = self.__str__()
             shell = True
-        elif isinstance(self.executable_path, list):
-            executable = self.executable_path[:] + [
-                str(cores),
-                str(threads),
-            ]
-            shell = False
         else:
-            executable = [
-                self.executable_path,
-                str(cores),
-                str(threads),
-            ]
+            if isinstance(self.executable_path, list):
+                executable = self.executable_path[:]
+            else:
+                executable = [self.executable_path]
+            executable += [str(cores), str(threads)]
+            if gpus is not None:
+                executable += [str(gpus)]
             shell = False
         return executable, shell
 
diff --git a/pyiron_base/jobs/job/runfunction.py b/pyiron_base/jobs/job/runfunction.py
index b52abdc49..d868c0317 100644
--- a/pyiron_base/jobs/job/runfunction.py
+++ b/pyiron_base/jobs/job/runfunction.py
@@ -524,7 +524,7 @@ def execute_job_with_external_executable(job):
         raise ValueError("No executable set!")
     job.status.running = True
     executable, shell = job.executable.get_input_for_subprocess_call(
-        cores=job.server.cores, threads=job.server.threads
+        cores=job.server.cores, threads=job.server.threads, gpus=job.server.gpus
     )
     job_crashed, out = False, None
     try: