Add GPU support #1126

Merged (4 commits, Jun 8, 2023)
pyiron_base/jobs/job/extension/executable.py (9 additions, 12 deletions)
@@ -217,32 +217,29 @@ def executable_path(self, new_path):
         else:
             self.storage.mpi = False
 
-    def get_input_for_subprocess_call(self, cores, threads):
+    def get_input_for_subprocess_call(self, cores, threads, gpus=None):
         """
         Get the input parameters for the subprocess call to execute the job
 
         Args:
             cores (int): number of cores
             threads (int): number of threads
+            gpus (int/None): number of gpus
 
         Returns:
             str/ list, boolean: executable and shell variables
         """
         if cores == 1 or not self.mpi:
             executable = self.__str__()
             shell = True
-        elif isinstance(self.executable_path, list):
-            executable = self.executable_path[:] + [
-                str(cores),
-                str(threads),
-            ]
-            shell = False
         else:
-            executable = [
-                self.executable_path,
-                str(cores),
-                str(threads),
-            ]
+            if isinstance(self.executable_path, list):
+                executable = self.executable_path[:]
+            else:
+                executable = [self.executable_path]
+            executable += [str(cores), str(threads)]
+            if gpus is not None:
+                executable += [str(gpus)]
             shell = False
         return executable, shell
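A quick sketch of what the rewritten branch returns; the executable path is hypothetical and construction of the `Executable` instance is elided:

```python
# Minimal sketch, assuming an MPI-enabled Executable `exe` whose
# executable_path is the hypothetical string "/path/to/run.sh":
executable, shell = exe.get_input_for_subprocess_call(cores=4, threads=2, gpus=2)
# executable == ["/path/to/run.sh", "4", "2", "2"], shell is False

executable, shell = exe.get_input_for_subprocess_call(cores=4, threads=2)
# gpus=None keeps the previous behaviour:
# executable == ["/path/to/run.sh", "4", "2"], shell is False
```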
pyiron_base/jobs/job/extension/server/generic.py (21 additions, 1 deletion)
@@ -74,10 +74,18 @@ class Server:  # add the option to return the job id and the hold id to the server object
     """
 
     def __init__(
-        self, host=None, queue=None, cores=1, threads=1, run_mode="modal", new_hdf=True
+        self,
+        host=None,
+        queue=None,
+        cores=1,
+        threads=1,
+        gpus=None,
+        run_mode="modal",
+        new_hdf=True,
     ):
         self._cores = cores
         self._threads = threads
+        self._gpus = None
         self._run_time = None
         self._memory_limit = None
         self._host = self._init_host(host=host)

@@ -230,6 +238,14 @@ def threads(self):
     def threads(self, number_of_threads):
         self._threads = number_of_threads
 
+    @property
+    def gpus(self):
+        return self._gpus
+
+    @gpus.setter
+    def gpus(self, number_of_gpus):
+        self._gpus = number_of_gpus
+
     @property
     def cores(self):
         """
@@ -447,6 +463,8 @@ def to_hdf(self, hdf, group_name=None):
         hdf_dict["accept_crash"] = self.accept_crash
         if len(self.additional_arguments) > 0:
             hdf_dict["additional_arguments"] = self.additional_arguments
+        if self._gpus is not None:
+            hdf_dict["accept_crash"] = self._gpus
 
         if group_name is not None:
             with hdf.open(group_name) as hdf_group:

@@ -490,6 +508,8 @@ def from_hdf(self, hdf, group_name=None):
             self._threads = hdf_dict["threads"]
         if "additional_arguments" in hdf_dict.keys():
             self.additional_arguments = hdf_dict["additional_arguments"]
+        if "gpus" in hdf_dict.keys():
+            self._gpus = hdf_dict["accept_crash"]
Contributor:

Is there a reason why the names don't match and clash with an already existing entry?

Member:

Looking at the PRs merged over the last two days, I'm also quite concerned that we now have both Flux and GPU "support", but the tests and notebooks directories have not been touched at all.

Member Author:

Good catch - I fixed it in #1127.

@liamhuber the challenging part for both is the integration in the test environment. But I agree; I am going to work on adding tests once the presentation on Monday has been successful.
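The fix in #1127 is not shown on this page; presumably it uses the `gpus` key on both sides, along the lines of the following sketch (not verified against that PR):

```python
# Presumed shape of the #1127 fix, in to_hdf:
if self._gpus is not None:
    hdf_dict["gpus"] = self._gpus

# and in from_hdf:
if "gpus" in hdf_dict.keys():
    self._gpus = hdf_dict["gpus"]
```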

Member:

For flux it seems super straightforward: add flux to the notebook dependencies, take your example from #1120 and change it so that it uses a python template job instead of atomistics, and add it to an example notebook. That would give us both a bare minimum of testing and at least one place in the codebase where the intended use is shown.

For GPUs I agree that we cannot directly test the execution on GitHub CI, but we could at least have assertRaises tests to make sure that setting the gpus flag actually gets us to the right part of the code.
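A hypothetical sketch of such a test: on a CI runner without flux installed, a job routed to the flux branch should at least reach `run_job_with_runmode_flux` and hit its `ModuleNotFoundError` guard:

```python
import unittest

from pyiron_base.jobs.job.runfunction import run_job_with_runmode_flux


class TestGpusReachFluxBranch(unittest.TestCase):
    def test_flux_unavailable_raises(self):
        # The availability guard runs before the job is touched, so a
        # placeholder job is enough; this only asserts that the GPU code
        # path is reachable on a flux-less runner, not that execution works.
        with self.assertRaises(ModuleNotFoundError):
            run_job_with_runmode_flux(job=None, executor=None, gpus_per_slot=1)
```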

Member:

> But I agree; I am going to work on adding tests once the presentation on Monday has been successful.

@jan-janssen I am super uncomfortable with merges to base being done under "I have a presentation on Monday" pressure. Is it not possible to leave these in a branch and run the example from there, or is your example really getting the audience to conda install pyiron, such that you need these changes publicly available?

Member Author:

> is your example really getting the audience to conda install pyiron, such that you need these changes publicly available?

Yes, that is the goal - tell people that over the three months we developed an exascale-ready version of pyiron, and that they can now install it directly from conda, with flux and all required dependencies.

Member Author:

> For flux it seems super straightforward: add flux to the notebook dependencies, take your example from #1120 and change it so that it uses a python template job instead of atomistics, and add it to an example notebook.

Tests and documentation are in progress - I will add them soon.

Member:

> Yes, that is the goal - tell people that over the three months we developed an exascale-ready version of pyiron, and that they can now install it directly from conda, with flux and all required dependencies.

Well, for me this only strengthens the case that the tests should be present at the time the functionality is merged, but in any case I wish you a smooth presentation.

         self._new_hdf = hdf_dict["new_h5"] == 1
 
     def db_entry(self):
pyiron_base/jobs/job/runfunction.py (12 additions, 3 deletions)
@@ -113,7 +113,15 @@ def run_job_with_status_created(job):
     elif job.server.run_mode.srun:
         run_job_with_runmode_srun(job=job)
     elif job.server.run_mode.flux:
-        return run_job_with_runmode_flux(job=job, executor=job.flux_executor)
+        if job.server.gpus is not None:
+            gpus_per_slot = int(job.server.gpus / job.server.cores)
+        else:
+            gpus_per_slot = None
+        return run_job_with_runmode_flux(
+            job=job,
+            executor=job.flux_executor,
+            gpus_per_slot=gpus_per_slot,
+        )
     elif (
         job.server.run_mode.non_modal
         or job.server.run_mode.thread
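A worked example of the slot arithmetic above (values illustrative):

```python
# 4 GPUs spread over 4 cores gives one GPU per flux slot.
assert int(4 / 4) == 1
# Note the integer truncation when gpus < cores: int(2 / 4) == 0, so in that
# configuration the requested GPUs would silently drop out of the jobspec.
assert int(2 / 4) == 0
```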
@@ -443,7 +451,7 @@ def run_job_with_runmode_srun(job):
     )
 
 
-def run_job_with_runmode_flux(job, executor):
+def run_job_with_runmode_flux(job, executor, gpus_per_slot=None):
     if not flux_available:
         raise ModuleNotFoundError(
             "No module named 'flux'. No linux you can install flux via conda."
@@ -480,6 +488,7 @@ def run_job_with_runmode_flux(job, executor):
         script=exeuctable_str,
         num_nodes=1,
         cores_per_slot=1,
+        gpus_per_slot=gpus_per_slot,
         num_slots=job.server.cores,
     )
     jobspec.cwd = job.project_hdf5.working_directory
@@ -515,7 +524,7 @@ def execute_job_with_external_executable(job):
         raise ValueError("No executable set!")
     job.status.running = True
     executable, shell = job.executable.get_input_for_subprocess_call(
-        cores=job.server.cores, threads=job.server.threads
+        cores=job.server.cores, threads=job.server.threads, gpus=job.server.gpus
     )
     job_crashed, out = False, None
     try: