Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[train] support memory per worker #42999

Merged
merged 10 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions python/ray/train/_internal/worker_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,13 @@ def __init__(
self.num_cpus_per_worker = num_cpus_per_worker
self.num_gpus_per_worker = num_gpus_per_worker
self.additional_resources_per_worker = additional_resources_per_worker

self.memory_per_worker = None
if self.additional_resources_per_worker is not None:
matthewdeng marked this conversation as resolved.
Show resolved Hide resolved
self.memory_per_worker = self.additional_resources_per_worker.pop(
"memory", None
)

self.workers = []
self._base_cls = create_executable_class(actor_cls)
assert issubclass(self._base_cls, RayTrainWorker)
Expand All @@ -188,6 +195,7 @@ def __init__(
self._remote_cls = ray.remote(
num_cpus=self.num_cpus_per_worker,
num_gpus=self.num_gpus_per_worker,
memory=self.memory_per_worker,
resources=self.additional_resources_per_worker,
)(self._base_cls)
self.start()
Expand Down
13 changes: 13 additions & 0 deletions python/ray/train/tests/test_worker_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ def ray_start_2_cpus_and_neuron_core_accelerator():
ray.shutdown()


@pytest.fixture
def ray_start_2_cpus_and_10kb_memory():
address_info = ray.init(num_cpus=2, _memory=10_000)
yield address_info
# The code after the yield will run as teardown code.
ray.shutdown()


def test_worker_creation(ray_start_2_cpus):
assert ray.available_resources()["CPU"] == 2
wg = WorkerGroup(num_workers=2)
Expand All @@ -52,6 +60,11 @@ def test_worker_creation_num_cpus(ray_start_2_cpus):
wg.shutdown()


def test_worker_creation_with_memory(ray_start_2_cpus_and_10kb_memory):
wg = WorkerGroup(num_workers=2, additional_resources_per_worker={"memory": 1_000})
assert len(wg.workers) == 2
matthewdeng marked this conversation as resolved.
Show resolved Hide resolved


def test_worker_shutdown(ray_start_2_cpus):
assert ray.available_resources()["CPU"] == 2
wg = WorkerGroup(num_workers=2)
Expand Down