[Core] Support Intel GPU #38553

Merged 10 commits on Oct 25, 2023
11 changes: 10 additions & 1 deletion python/ray/_private/accelerators/__init__.py
@@ -2,6 +2,7 @@
 
 from ray._private.accelerators.accelerator import AcceleratorManager
 from ray._private.accelerators.nvidia_gpu import NvidiaGPUAcceleratorManager
+from ray._private.accelerators.intel_gpu import IntelGPUAcceleratorManager
 from ray._private.accelerators.tpu import TPUAcceleratorManager
 from ray._private.accelerators.neuron import NeuronAcceleratorManager
 
@@ -10,6 +11,7 @@ def get_all_accelerator_managers() -> Set[AcceleratorManager]:
     """Get all accelerator managers supported by Ray."""
     return {
         NvidiaGPUAcceleratorManager,
+        IntelGPUAcceleratorManager,
         TPUAcceleratorManager,
         NeuronAcceleratorManager,
     }
@@ -35,11 +37,18 @@ def get_accelerator_manager_for_resource(resource_name: str) -> AcceleratorManager:
 
     E.g., TPUAcceleratorManager is returned if resource name is "TPU"
     """
-    return _resource_name_to_accelerator_manager.get(resource_name, None)
+    if resource_name == "GPU":
+        if IntelGPUAcceleratorManager.get_current_node_num_accelerators() > 0:
+            return IntelGPUAcceleratorManager
+        else:
+            return NvidiaGPUAcceleratorManager
+    else:
+        return _resource_name_to_accelerator_manager.get(resource_name, None)
 
 
 __all__ = [
     "NvidiaGPUAcceleratorManager",
+    "IntelGPUAcceleratorManager",
     "TPUAcceleratorManager",
     "NeuronAcceleratorManager",
     "get_all_accelerator_managers",
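For orientation, the dispatch added above can be exercised directly; a minimal sketch using only names that appear in this diff (ray._private.accelerators is a private module, so this is illustrative rather than a supported API):

# Resolve the manager backing the generic "GPU" resource.
from ray._private.accelerators import get_accelerator_manager_for_resource

manager = get_accelerator_manager_for_resource("GPU")
# IntelGPUAcceleratorManager when dpctl reports at least one Level Zero GPU,
# otherwise NvidiaGPUAcceleratorManager.
print(manager.__name__, manager.get_current_node_num_accelerators())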
103 changes: 103 additions & 0 deletions python/ray/_private/accelerators/intel_gpu.py
@@ -0,0 +1,103 @@
import os
import logging
from typing import Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager

logger = logging.getLogger(__name__)

ONEAPI_DEVICE_SELECTOR_ENV_VAR = "ONEAPI_DEVICE_SELECTOR"
NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR"
ONEAPI_DEVICE_BACKEND_TYPE = "level_zero"
ONEAPI_DEVICE_TYPE = "gpu"


class IntelGPUAcceleratorManager(AcceleratorManager):
"""Intel GPU accelerators."""

@staticmethod
def get_resource_name() -> str:
return "GPU"

@staticmethod
def get_visible_accelerator_ids_env_var() -> str:
return ONEAPI_DEVICE_SELECTOR_ENV_VAR

@staticmethod
def get_current_process_visible_accelerator_ids() -> Optional[List[str]]:
        oneapi_visible_devices = os.environ.get(
            IntelGPUAcceleratorManager.get_visible_accelerator_ids_env_var(), None
        )
        if oneapi_visible_devices is None:
            return None

        if oneapi_visible_devices == "":
            return []

        if oneapi_visible_devices == "NoDevFiles":
            return []

        prefix = ONEAPI_DEVICE_BACKEND_TYPE + ":"

        return list(oneapi_visible_devices.split(prefix)[1].split(","))

    @staticmethod
    def get_current_node_num_accelerators() -> int:
        try:
            import dpctl
        except ImportError:
            dpctl = None
        if dpctl is None:
            return 0

        num_gpus = 0
        try:
            dev_info = ONEAPI_DEVICE_BACKEND_TYPE + ":" + ONEAPI_DEVICE_TYPE
            context = dpctl.SyclContext(dev_info)
            num_gpus = context.device_count
        except Exception:
            num_gpus = 0
        return num_gpus

    @staticmethod
    def get_current_node_accelerator_type() -> Optional[str]:
        """Get the name of the first Intel GPU (assumes a single GPU type per node).

        Example:
            device name: 'Intel(R) Data Center GPU Max 1550'
            returned type: 'Intel-GPU-Max-1550'

        Returns:
            A string representing the Intel GPU type.
        """
        try:
            import dpctl
        except ImportError:
            dpctl = None
        if dpctl is None:
            return None

        accelerator_type = None
        try:
            dev_info = ONEAPI_DEVICE_BACKEND_TYPE + ":" + ONEAPI_DEVICE_TYPE + ":0"
            dev = dpctl.SyclDevice(dev_info)
            accelerator_type = "Intel-GPU-" + "-".join(dev.name.split(" ")[-2:])
        except Exception:
            accelerator_type = None
        return accelerator_type

    @staticmethod
    def validate_resource_request_quantity(
        quantity: float,
    ) -> Tuple[bool, Optional[str]]:
        return (True, None)

    @staticmethod
    def set_current_process_visible_accelerator_ids(
        visible_xpu_devices: List[str],
    ) -> None:
        if os.environ.get(NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR):
            return

        prefix = ONEAPI_DEVICE_BACKEND_TYPE + ":"
        os.environ[
            IntelGPUAcceleratorManager.get_visible_accelerator_ids_env_var()
        ] = prefix + ",".join([str(i) for i in visible_xpu_devices])
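To illustrate the ONEAPI_DEVICE_SELECTOR format this manager reads and writes, a short sketch using only the methods defined above (the device IDs are made up):

import os

from ray._private.accelerators.intel_gpu import IntelGPUAcceleratorManager

# Parsing: the value has the form "<backend>:<comma-separated ids>".
os.environ["ONEAPI_DEVICE_SELECTOR"] = "level_zero:0,2"
assert IntelGPUAcceleratorManager.get_current_process_visible_accelerator_ids() == ["0", "2"]

# Writing: restricting the current process to device 1 rewrites the same variable.
IntelGPUAcceleratorManager.set_current_process_visible_accelerator_ids(["1"])
assert os.environ["ONEAPI_DEVICE_SELECTOR"] == "level_zero:1"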
4 changes: 2 additions & 2 deletions python/ray/_private/utils.py
@@ -334,8 +334,8 @@ def set_omp_num_threads_if_unset() -> bool:
 
 
 def set_visible_accelerator_ids() -> None:
-    """Set (CUDA_VISIBLE_DEVICES, NEURON_RT_VISIBLE_CORES, TPU_VISIBLE_CHIPS ,...)
-    environment variables based on the accelerator runtime.
+    """Set (CUDA_VISIBLE_DEVICES, ONEAPI_DEVICE_SELECTOR, NEURON_RT_VISIBLE_CORES,
+    TPU_VISIBLE_CHIPS ,...) environment variables based on the accelerator runtime.
     """
     for resource_name, accelerator_ids in (
         ray.get_runtime_context().get_resource_ids().items()
9 changes: 5 additions & 4 deletions python/ray/_private/worker.py
@@ -424,8 +424,8 @@ def __init__(self):
         self.mode = None
         self.actors = {}
         # When the worker is constructed. Record the original value of the
-        # (CUDA_VISIBLE_DEVICES, NEURON_RT_VISIBLE_CORES, TPU_VISIBLE_CHIPS, ..)
-        # environment variables.
+        # (CUDA_VISIBLE_DEVICES, ONEAPI_DEVICE_SELECTOR, NEURON_RT_VISIBLE_CORES,
+        # TPU_VISIBLE_CHIPS, ..) environment variables.
         self.original_visible_accelerator_ids = (
             ray._private.utils.get_visible_accelerator_ids()
         )
@@ -870,8 +870,9 @@ def get_accelerator_ids_for_accelerator_resource(
                 assigned_ids.add(resource_id)
 
         # If the user had already set the environment variables
-        # (CUDA_VISIBLE_DEVICES, NEURON_RT_VISIBLE_CORES, TPU_VISIBLE_CHIPS, ..) then
-        # respect that in the sense that only IDs that appear in (CUDA_VISIBLE_DEVICES,
+        # (CUDA_VISIBLE_DEVICES, ONEAPI_DEVICE_SELECTOR, NEURON_RT_VISIBLE_CORES,
+        # TPU_VISIBLE_CHIPS, ..) then respect that in the sense that only IDs
+        # that appear in (CUDA_VISIBLE_DEVICES, ONEAPI_DEVICE_SELECTOR,
         # NEURON_RT_VISIBLE_CORES, TPU_VISIBLE_CHIPS, ..) should be returned.
         if self.original_visible_accelerator_ids.get(resource_name, None) is not None:
             original_ids = self.original_visible_accelerator_ids[resource_name]
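Taken together with the utils.py change above, the worker narrows the selector per task. A hedged end-to-end sketch (assumes a node where the Intel manager is selected, i.e. dpctl reports at least one Level Zero GPU; on an NVIDIA node the same flow goes through CUDA_VISIBLE_DEVICES instead):

import os
import ray

ray.init()  # records any pre-set ONEAPI_DEVICE_SELECTOR as the original value

@ray.remote(num_gpus=1)
def show_selector():
    # Inside the task, Ray has set ONEAPI_DEVICE_SELECTOR to the device
    # assigned to this task, e.g. "level_zero:0".
    return os.environ.get("ONEAPI_DEVICE_SELECTOR")

print(ray.get(show_selector.remote()))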
1 change: 1 addition & 0 deletions python/ray/tests/BUILD
@@ -122,6 +122,7 @@ py_test_module_list(
         "accelerators/test_neuron.py",
         "accelerators/test_tpu.py",
         "accelerators/test_nvidia_gpu.py",
+        "accelerators/test_intel_gpu.py",
         "test_log_dedup.py",
         "test_logging.py",
         "test_memory_scheduling.py",
16 changes: 16 additions & 0 deletions python/ray/tests/accelerators/mock_dpctl_1.py
@@ -0,0 +1,16 @@
class SyclContext:
    def __init__(self, info):
        pass

    @property
    def device_count(self):
        return 6


class SyclDevice:
    def __init__(self, info):
        pass

    @property
    def name(self):
        return "Intel(R) Data Center GPU Max 1550"
16 changes: 16 additions & 0 deletions python/ray/tests/accelerators/mock_dpctl_2.py
@@ -0,0 +1,16 @@
class SyclContext:
    def __init__(self, info):
        pass

    @property
    def device_count(self):
        return 4


class SyclDevice:
    def __init__(self, info):
        pass

    @property
    def name(self):
        return "Intel(R) Data Center GPU Max 1100"
104 changes: 104 additions & 0 deletions python/ray/tests/accelerators/test_intel_gpu.py
@@ -0,0 +1,104 @@
import os
import sys
import pytest
from unittest.mock import patch

import ray
from ray._private.accelerators import IntelGPUAcceleratorManager as Accelerator
from ray.util.accelerators import INTEL_MAX_1550, INTEL_MAX_1100


def test_visible_intel_gpu_ids(shutdown_only):
    with patch.object(Accelerator, "get_current_node_num_accelerators", return_value=4):
        os.environ["ONEAPI_DEVICE_SELECTOR"] = "level_zero:0,1,2"
        ray.init()
        manager = ray._private.accelerators.get_accelerator_manager_for_resource("GPU")
        assert manager.get_current_node_num_accelerators() == 4
        assert manager.__name__ == "IntelGPUAcceleratorManager"
        assert ray.available_resources()["GPU"] == 3


def test_visible_intel_gpu_type(shutdown_only):
    with patch.object(
        Accelerator, "get_current_node_num_accelerators", return_value=4
    ), patch.object(
        Accelerator, "get_current_node_accelerator_type", return_value=INTEL_MAX_1550
    ):
        os.environ["ONEAPI_DEVICE_SELECTOR"] = "level_zero:0,1,2"
        ray.init()
        manager = ray._private.accelerators.get_accelerator_manager_for_resource("GPU")
        assert manager.get_current_node_accelerator_type() == INTEL_MAX_1550


@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
def test_get_current_node_num_accelerators():
    old_dpctl = None
    if "dpctl" in sys.modules:
        old_dpctl = sys.modules["dpctl"]

    sys.modules["dpctl"] = __import__("mock_dpctl_1")
    assert Accelerator.get_current_node_num_accelerators() == 6

    sys.modules["dpctl"] = __import__("mock_dpctl_2")
    assert Accelerator.get_current_node_num_accelerators() == 4

    if old_dpctl is not None:
        sys.modules["dpctl"] = old_dpctl


@pytest.mark.skipif(sys.platform == "win32", reason="Not supported mock on Windows")
def test_get_current_node_accelerator_type():
    old_dpctl = None
    if "dpctl" in sys.modules:
        old_dpctl = sys.modules["dpctl"]

    sys.modules["dpctl"] = __import__("mock_dpctl_1")
    assert Accelerator.get_current_node_accelerator_type() == INTEL_MAX_1550

    sys.modules["dpctl"] = __import__("mock_dpctl_2")
    assert Accelerator.get_current_node_accelerator_type() == INTEL_MAX_1100

    if old_dpctl is not None:
        sys.modules["dpctl"] = old_dpctl


def test_intel_gpu_accelerator_manager_api():
    assert Accelerator.get_resource_name() == "GPU"
    assert Accelerator.get_visible_accelerator_ids_env_var() == "ONEAPI_DEVICE_SELECTOR"
    assert Accelerator.validate_resource_request_quantity(0.1) == (True, None)


def test_get_current_process_visible_accelerator_ids():
    os.environ["ONEAPI_DEVICE_SELECTOR"] = "level_zero:0,1,2"
    assert Accelerator.get_current_process_visible_accelerator_ids() == ["0", "1", "2"]

    del os.environ["ONEAPI_DEVICE_SELECTOR"]
    assert Accelerator.get_current_process_visible_accelerator_ids() is None

    os.environ["ONEAPI_DEVICE_SELECTOR"] = ""
    assert Accelerator.get_current_process_visible_accelerator_ids() == []

    os.environ["ONEAPI_DEVICE_SELECTOR"] = "NoDevFiles"
    assert Accelerator.get_current_process_visible_accelerator_ids() == []

    del os.environ["ONEAPI_DEVICE_SELECTOR"]


def test_set_current_process_visible_accelerator_ids():
    Accelerator.set_current_process_visible_accelerator_ids(["0"])
    assert os.environ["ONEAPI_DEVICE_SELECTOR"] == "level_zero:0"

    Accelerator.set_current_process_visible_accelerator_ids(["0", "1"])
    assert os.environ["ONEAPI_DEVICE_SELECTOR"] == "level_zero:0,1"

    Accelerator.set_current_process_visible_accelerator_ids(["0", "1", "2"])
    assert os.environ["ONEAPI_DEVICE_SELECTOR"] == "level_zero:0,1,2"

    del os.environ["ONEAPI_DEVICE_SELECTOR"]


if __name__ == "__main__":
    if os.environ.get("PARALLEL_CI"):
        sys.exit(pytest.main(["-n", "auto", "--boxed", "-vs", __file__]))
    else:
        sys.exit(pytest.main(["-sv", __file__]))
4 changes: 4 additions & 0 deletions python/ray/util/accelerators/__init__.py
@@ -6,6 +6,8 @@
     NVIDIA_TESLA_K80,
     NVIDIA_TESLA_A100,
     NVIDIA_TESLA_A10G,
+    INTEL_MAX_1550,
+    INTEL_MAX_1100,
     AWS_NEURON_CORE,
     GOOGLE_TPU_V2,
     GOOGLE_TPU_V3,
@@ -20,6 +22,8 @@
     "NVIDIA_TESLA_K80",
     "NVIDIA_TESLA_A100",
     "NVIDIA_TESLA_A10G",
+    "INTEL_MAX_1550",
+    "INTEL_MAX_1100",
     "AWS_NEURON_CORE",
     "GOOGLE_TPU_V2",
     "GOOGLE_TPU_V3",
2 changes: 2 additions & 0 deletions python/ray/util/accelerators/accelerators.py
@@ -5,6 +5,8 @@
 NVIDIA_TESLA_K80 = "K80"
 NVIDIA_TESLA_A100 = "A100"
 NVIDIA_TESLA_A10G = "A10G"
+INTEL_MAX_1550 = "Intel-GPU-Max-1550"
+INTEL_MAX_1100 = "Intel-GPU-Max-1100"
 AWS_NEURON_CORE = "aws-neuron-core"
 GOOGLE_TPU_V2 = "TPU-V2"
 GOOGLE_TPU_V3 = "TPU-V3"
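The new constants are intended as accelerator_type values for scheduling, just like the existing NVIDIA ones; a brief sketch (assumes a cluster that actually contains Intel Max 1550 nodes):

import ray
from ray.util.accelerators import INTEL_MAX_1550

ray.init()

@ray.remote(num_gpus=1, accelerator_type=INTEL_MAX_1550)
def on_intel_max_1550():
    # Schedulable only on nodes whose detected type is "Intel-GPU-Max-1550".
    return "running on an Intel Data Center GPU Max 1550"

print(ray.get(on_intel_max_1550.remote()))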