Numba patch: Support linking code from memory (#60)

Adds support for linking code from memory to Numba's `@cuda.jit` decorator (in addition to the already-supported linking files from disk). New classes are added to Numba's top-level `cuda` module: * `Archive`: An archive of objects * `CUSource`: A CUDA C/C++ source * `Cubin`: A cubin ELF * `Fatbin`: A fatbin ELF * `Object`: An object file * `PTXSource`: PTX assembly source code. These are all used by constructing them with a single argument, the code in memory to use. Once created, they can then be passed to the `link=` kwarg of the `@cuda.jit` decorator. An example showing a use case with a CUDA C/C++ source is added. This implementation is inspired by the approach outlined by @ed-o-saurus in numba/numba#9470. Notes on changes: * Various tests now run on the GPU, so test binaries need to be generated using the relevant compute capability - this change is applied in the `Makefile`, along with some refactoring to tidy it up a little. * Test for new functionality are added. In addition, existing tests that used the test binaries are all modified such that they use a relevant compute capability (usually the one in the test machine, for most of them) because the test binaries are now built for the CC of the current GPU - it's no longer sufficient to hard code CCs like 7.5 or 7.0 for tests. Since fixtures are needed across test files, I've started moving them into `conftest.py`. --------- Co-authored-by: Bradley Dice <bdice@bradleydice.com> Co-authored-by: jakirkham <jakirkham@gmail.com>
rapidsai · Mar 19, 2024 · a92e60f · a92e60f
1 parent 2dae4c2
commit a92e60f
Show file tree

Hide file tree

Showing 8 changed files with 464 additions and 81 deletions.
diff --git a/examples/jit_link_data.py b/examples/jit_link_data.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+# Demonstrates the use of a CUSource object to link code to a @cuda.jit
+# function where the linked source is supplied as a string in memory, rather
+# than on-disk. The CUSource object is passed in the `link` list, just as paths
+# to files are passed when linking source files from disk.
+#
+# In addition to CUSource files, PTXSource objects can be used to link PTX from
+# memory (not shown in this example).
+
+from numba import cuda
+from pynvjitlink import patch
+
+import numpy as np
+
+patch.patch_numba_linker()
+
+source = cuda.CUSource(
+    """
+typedef unsigned int uint32_t;
+
+extern "C" __device__ int cu_add(uint32_t* result, uint32_t a, uint32_t b)
+{
+    *result = a + b;
+    return 0;
+}
+"""
+)
+
+
+cu_add = cuda.declare_device("cu_add", "uint32(uint32, uint32)")
+
+
+@cuda.jit(link=[source])
+def kernel(result, a, b):
+    result[0] = cu_add(a, b)
+
+
+a = 1
+b = 2
+result = np.zeros(1, dtype=np.uint32)
+kernel[1, 1](result, a, b)
+
+print(f"According to a CUDA kernel, {a} + {b} = {result[0]}")
diff --git a/pynvjitlink/patch.py b/pynvjitlink/patch.py
@@ -29,6 +29,7 @@
 
 if _numba_version_ok:
     from numba.core import config
+    from numba import cuda
     from numba.cuda.cudadrv import nvrtc
     from numba.cuda.cudadrv.driver import (
         driver,
@@ -42,6 +43,61 @@
     Linker = object
 
 
+class LinkableCode:
+    """An object that can be passed in the `link` list argument to `@cuda.jit`
+    kernels to supply code to be linked from memory."""
+
+    def __init__(self, data, name=None):
+        self.data = data
+        self._name = name
+
+    @property
+    def name(self):
+        return self._name or self.default_name
+
+
+class PTXSource(LinkableCode):
+    """PTX Source code in memory"""
+
+    kind = FILE_EXTENSION_MAP["ptx"]
+    default_name = "<unnamed-ptx>"
+
+
+class CUSource(LinkableCode):
+    """CUDA C/C++ Source code in memory"""
+
+    kind = "cu"
+    default_name = "<unnamed-cu>"
+
+
+class Fatbin(LinkableCode):
+    """A fatbin ELF in memory"""
+
+    kind = FILE_EXTENSION_MAP["fatbin"]
+    default_name = "<unnamed-fatbin>"
+
+
+class Cubin(LinkableCode):
+    """A cubin ELF in memory"""
+
+    kind = FILE_EXTENSION_MAP["cubin"]
+    default_name = "<unnamed-cubin>"
+
+
+class Archive(LinkableCode):
+    """An archive of objects in memory"""
+
+    kind = FILE_EXTENSION_MAP["a"]
+    default_name = "<unnamed-archive>"
+
+
+class Object(LinkableCode):
+    """An object file in memory"""
+
+    kind = FILE_EXTENSION_MAP["o"]
+    default_name = "<unnamed-object>"
+
+
 class PatchedLinker(Linker):
     def __init__(
         self,
@@ -91,6 +147,34 @@ def add_ltoir(self, ltoir, name="<external-ltoir>"):
     def add_object(self, obj, name="<external-object>"):
         self._linker.add_object(obj, name)
 
+    def add_file_guess_ext(self, path_or_code):
+        # Numba's add_file_guess_ext expects to always be passed a path to a
+        # file that it will load from the filesystem to link. We augment it
+        # here with the ability to provide a file from memory.
+
+        # To maintain compatibility with the original interface, all strings
+        # are treated as paths in the filesystem.
+        if isinstance(path_or_code, str):
+            # Upstream numba does not yet recognize LTOIR, so handle that
+            # separately here.
+            extension = pathlib.Path(path_or_code).suffix
+            if extension == ".ltoir":
+                self.add_file(path_or_code, "ltoir")
+            else:
+                # Use Numba's logic for non-LTOIR
+                super().add_file_guess_ext(path_or_code)
+
+            return
+
+        # Otherwise, we should have been given a LinkableCode object
+        if not isinstance(path_or_code, LinkableCode):
+            raise TypeError("Expected path to file or a LinkableCode object")
+
+        if path_or_code.kind == "cu":
+            self.add_cu(path_or_code.data, path_or_code.name)
+        else:
+            self.add_data(path_or_code.data, path_or_code.kind, path_or_code.name)
+
     def add_file(self, path, kind):
         try:
             with open(path, "rb") as f:
@@ -99,6 +183,9 @@ def add_file(self, path, kind):
             raise LinkerError(f"{path} not found")
 
         name = pathlib.Path(path).name
+        self.add_data(data, kind, name)
+
+    def add_data(self, data, kind, name):
         if kind == FILE_EXTENSION_MAP["cubin"]:
             fn = self._linker.add_cubin
         elif kind == FILE_EXTENSION_MAP["fatbin"]:
@@ -109,6 +196,8 @@ def add_file(self, path, kind):
             return self.add_ptx(data, name)
         elif kind == FILE_EXTENSION_MAP["o"]:
             fn = self._linker.add_object
+        elif kind == "ltoir":
+            fn = self._linker.add_ltoir
         else:
             raise LinkerError(f"Don't know how to link {kind}")
 
@@ -159,4 +248,14 @@ def patch_numba_linker():
         msg = f"Cannot patch Numba: {_numba_error}"
         raise RuntimeError(msg)
 
+    # Replace the built-in linker that uses the Driver API with our linker that
+    # uses nvJitLink
     Linker.new = new_patched_linker
+
+    # Add linkable code objects to Numba's top-level API
+    cuda.Archive = Archive
+    cuda.CUSource = CUSource
+    cuda.Cubin = Cubin
+    cuda.Fatbin = Fatbin
+    cuda.Object = Object
+    cuda.PTXSource = PTXSource
diff --git a/pynvjitlink/tests/conftest.py b/pynvjitlink/tests/conftest.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import os
+import pytest
+
+from numba import cuda
+from pynvjitlink.patch import Archive, Cubin, CUSource, Fatbin, Object, PTXSource
+
+
+@pytest.fixture(scope="session")
+def gpu_compute_capability():
+    """Compute capability of the current GPU"""
+    return cuda.get_current_device().compute_capability
+
+
+@pytest.fixture(scope="session")
+def alt_gpu_compute_capability(gpu_compute_capability):
+    """A compute capability that does not match the current GPU"""
+    # For sufficient incompatibility for the test suites, the major number of
+    # the compute capabilities must differ (for example one can load a 7.0
+    # cubin when linking for 7.5)
+    if gpu_compute_capability[0] == 7:
+        return (8, 0)
+    else:
+        return (7, 0)
+
+
+@pytest.fixture(scope="session")
+def absent_gpu_compute_capability(gpu_compute_capability, alt_gpu_compute_capability):
+    """A compute capability not used in any cubin or fatbin test binary"""
+    cc_majors = {6, 7, 8}
+    cc_majors.remove(gpu_compute_capability[0])
+    cc_majors.remove(alt_gpu_compute_capability[0])
+    return (cc_majors.pop(), 0)
+
+
+@pytest.fixture(scope="session")
+def gpu_arch_flag(gpu_compute_capability):
+    """nvJitLink arch flag to link for the current GPU"""
+    major, minor = gpu_compute_capability
+    return f"-arch=sm_{major}{minor}"
+
+
+@pytest.fixture(scope="session")
+def alt_gpu_arch_flag(alt_gpu_compute_capability):
+    """nvJitLink arch flag to link for a different kind of GPU to the current
+    one"""
+    major, minor = alt_gpu_compute_capability
+    return f"-arch=sm_{major}{minor}"
+
+
+@pytest.fixture(scope="session")
+def absent_gpu_arch_flag(absent_gpu_compute_capability):
+    """nvJitLink arch flag to link for an architecture not in any cubin or
+    fatbin"""
+    major, minor = absent_gpu_compute_capability
+    return f"-arch=sm_{major}{minor}"
+
+
+@pytest.fixture(scope="session")
+def device_functions_archive():
+    test_dir = os.path.dirname(os.path.abspath(__file__))
+    path = os.path.join(test_dir, "test_device_functions.a")
+    with open(path, "rb") as f:
+        return f.read()
+
+
+@pytest.fixture(scope="session")
+def device_functions_cubin():
+    test_dir = os.path.dirname(os.path.abspath(__file__))
+    path = os.path.join(test_dir, "test_device_functions.cubin")
+    with open(path, "rb") as f:
+        return f.read()
+
+
+@pytest.fixture(scope="session")
+def device_functions_cusource():
+    test_dir = os.path.dirname(os.path.abspath(__file__))
+    path = os.path.join(test_dir, "test_device_functions.cu")
+    with open(path, "r") as f:
+        return f.read()
+
+
+@pytest.fixture(scope="session")
+def device_functions_fatbin():
+    test_dir = os.path.dirname(os.path.abspath(__file__))
+    path = os.path.join(test_dir, "test_device_functions.fatbin")
+    with open(path, "rb") as f:
+        return f.read()
+
+
+@pytest.fixture(scope="session")
+def device_functions_object():
+    test_dir = os.path.dirname(os.path.abspath(__file__))
+    path = os.path.join(test_dir, "test_device_functions.o")
+    with open(path, "rb") as f:
+        return f.read()
+
+
+@pytest.fixture(scope="session")
+def device_functions_ptx():
+    test_dir = os.path.dirname(os.path.abspath(__file__))
+    path = os.path.join(test_dir, "test_device_functions.ptx")
+    with open(path, "rb") as f:
+        return f.read()
+
+
+@pytest.fixture(scope="session")
+def undefined_extern_cubin():
+    test_dir = os.path.dirname(os.path.abspath(__file__))
+    fatbin_path = os.path.join(test_dir, "undefined_extern.cubin")
+    with open(fatbin_path, "rb") as f:
+        return f.read()
+
+
+@pytest.fixture(scope="session")
+def linkable_code_archive(device_functions_archive):
+    return Archive(device_functions_archive)
+
+
+@pytest.fixture(scope="session")
+def linkable_code_cubin(device_functions_cubin):
+    return Cubin(device_functions_cubin)
+
+
+@pytest.fixture(scope="session")
+def linkable_code_cusource(device_functions_cusource):
+    return CUSource(device_functions_cusource)
+
+
+@pytest.fixture(scope="session")
+def linkable_code_fatbin(device_functions_fatbin):
+    return Fatbin(device_functions_fatbin)
+
+
+@pytest.fixture(scope="session")
+def linkable_code_object(device_functions_object):
+    return Object(device_functions_object)
+
+
+@pytest.fixture(scope="session")
+def linkable_code_ptx(device_functions_ptx):
+    return PTXSource(device_functions_ptx)