Don't import allocators by default
This will make importing RMM faster if you don't want the hooks for
cupy, numba, or pytorch: those hooks are no longer pulled in (or
re-exported) by `import rmm` and must instead be imported from the
`rmm.allocators` submodules.
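
A minimal sketch of the new import pattern (import only the hooks you
actually use; the names are taken from the diff below):

    # The hooks previously re-exported as rmm.rmm_cupy_allocator,
    # rmm.RMMNumbaManager and rmm.rmm_torch_allocator now live in
    # dedicated submodules:
    from rmm.allocators.cupy import rmm_cupy_allocator
    from rmm.allocators.numba import RMMNumbaManager
    from rmm.allocators.torch import rmm_torch_allocator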

Before, a sampling trace of `import rmm` with pyinstrument shows:

    $ pyinstrument -i 0.01 importrmm.py

      _     ._   __/__   _ _  _  _ _/_   Recorded: 10:19:56  Samples:  67
     /_//_/// /_\ / //_// / //_'/ //     Duration: 0.839     CPU time: 0.837
    /   _/                      v4.4.0

    Program: importrmm.py

    0.839 <module>  importrmm.py:1
    └─ 0.839 <module>  rmm/__init__.py:1
       ├─ 0.315 <module>  rmm/allocators/torch.py:1
       │  └─ 0.315 <module>  torch/__init__.py:1
       │        [96 frames hidden]  torch, <built-in>, enum, inspect, tok...
       ├─ 0.297 <module>  rmm/mr.py:1
       │  └─ 0.297 <module>  rmm/_lib/__init__.py:1
       │     ├─ 0.216 <module>  numba/__init__.py:1
       │     │     [140 frames hidden]  numba, abc, <built-in>, importlib, em...
       │     ├─ 0.040 <module>  numba/cuda/__init__.py:1
       │     │     [34 frames hidden]  numba, asyncio, ssl, <built-in>, re, ...
       │     ├─ 0.030 __new__  enum.py:180
       │     │     [5 frames hidden]  enum, <built-in>
       │     └─ 0.011 [self]  None
       └─ 0.227 <module>  rmm/allocators/cupy.py:1
          └─ 0.227 <module>  cupy/__init__.py:1
                [123 frames hidden]  cupy, pytest, _pytest, attr, <built-i...

After:

    $ pyinstrument -i 0.01 importrmm.py

      _     ._   __/__   _ _  _  _ _/_   Recorded: 10:20:10  Samples:  28
     /_//_/// /_\ / //_// / //_'/ //     Duration: 0.297     CPU time: 0.297
    /   _/                      v4.4.0

    Program: importrmm.py

    0.296 <module>  importrmm.py:1
    └─ 0.296 <module>  rmm/__init__.py:1
       └─ 0.296 <module>  rmm/mr.py:1
          └─ 0.296 <module>  rmm/_lib/__init__.py:1
             ├─ 0.216 <module>  numba/__init__.py:1
             │     [141 frames hidden]  numba, <built-in>, importlib, email, ...
             ├─ 0.040 <module>  numba/cuda/__init__.py:1
             │     [19 frames hidden]  numba, asyncio, ssl, <built-in>, unit...
             ├─ 0.031 [self]  None
             └─ 0.010 __new__  enum.py:180
                   [4 frames hidden]  enum, <built-in>
wence- committed Feb 23, 2023
1 parent 6f204e0 commit 359fcc0
Showing 4 changed files with 13 additions and 17 deletions.
6 changes: 0 additions & 6 deletions python/rmm/__init__.py
@@ -14,9 +14,6 @@

 from rmm import mr
 from rmm._lib.device_buffer import DeviceBuffer
-from rmm.allocators.cupy import rmm_cupy_allocator
-from rmm.allocators.numba import RMMNumbaManager, _numba_memory_manager
-from rmm.allocators.torch import rmm_torch_allocator
 from rmm.mr import disable_logging, enable_logging, get_log_filenames
 from rmm.rmm import (
     RMMError,
@@ -29,16 +26,13 @@
 __all__ = [
     "DeviceBuffer",
     "RMMError",
-    "RMMNumbaManager",
     "disable_logging",
     "enable_logging",
     "get_log_filenames",
     "is_initialized",
     "mr",
     "register_reinitialize_hook",
     "reinitialize",
-    "rmm_cupy_allocator",
-    "rmm_torch_allocator",
     "unregister_reinitialize_hook",
 ]

2 changes: 1 addition & 1 deletion python/rmm/allocators/cupy.py
@@ -26,7 +26,7 @@ def rmm_cupy_allocator(nbytes):
     Examples
     --------
-    >>> import rmm
+    >>> from rmm.allocators.cupy import rmm_cupy_allocator
     >>> import cupy
     >>> cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
     """
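For context, a minimal end-to-end sketch of the new CuPy usage, mirroring
the updated docstring and the tests below (assumes cupy is installed and a
CUDA device is available):

    # Route CuPy allocations through RMM using the explicit import.
    import cupy
    import rmm
    from rmm.allocators.cupy import rmm_cupy_allocator

    rmm.reinitialize(pool_allocator=True)        # optional: back allocations with a pool
    cupy.cuda.set_allocator(rmm_cupy_allocator)  # install the RMM allocator for CuPy

    a = cupy.arange(10)
    assert isinstance(a.data.mem._owner, rmm.DeviceBuffer)  # memory owned by RMM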
18 changes: 10 additions & 8 deletions python/rmm/tests/test_rmm.py
@@ -24,6 +24,8 @@

 import rmm
 import rmm._cuda.stream
+from rmm.allocators.cupy import rmm_cupy_allocator
+from rmm.allocators.numba import RMMNumbaManager

 if sys.version_info < (3, 8):
     try:
@@ -33,7 +35,7 @@
 else:
     import pickle

-cuda.set_memory_manager(rmm.RMMNumbaManager)
+cuda.set_memory_manager(RMMNumbaManager)

 _driver_version = rmm._cuda.gpu.driverGetVersion()
 _runtime_version = rmm._cuda.gpu.runtimeGetVersion()
@@ -303,17 +305,17 @@ def test_rmm_pool_numba_stream(stream):
 def test_rmm_cupy_allocator():
     cupy = pytest.importorskip("cupy")

-    m = rmm.rmm_cupy_allocator(42)
+    m = rmm_cupy_allocator(42)
     assert m.mem.size == 42
     assert m.mem.ptr != 0
     assert isinstance(m.mem._owner, rmm.DeviceBuffer)

-    m = rmm.rmm_cupy_allocator(0)
+    m = rmm_cupy_allocator(0)
     assert m.mem.size == 0
     assert m.mem.ptr == 0
     assert isinstance(m.mem._owner, rmm.DeviceBuffer)

-    cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
+    cupy.cuda.set_allocator(rmm_cupy_allocator)
     a = cupy.arange(10)
     assert isinstance(a.data.mem._owner, rmm.DeviceBuffer)

@@ -323,20 +325,20 @@ def test_rmm_pool_cupy_allocator_with_stream(stream):
     cupy = pytest.importorskip("cupy")

     rmm.reinitialize(pool_allocator=True)
-    cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
+    cupy.cuda.set_allocator(rmm_cupy_allocator)

     if stream == "null":
         stream = cupy.cuda.stream.Stream.null
     else:
         stream = cupy.cuda.stream.Stream()

     with stream:
-        m = rmm.rmm_cupy_allocator(42)
+        m = rmm_cupy_allocator(42)
         assert m.mem.size == 42
         assert m.mem.ptr != 0
         assert isinstance(m.mem._owner, rmm.DeviceBuffer)

-        m = rmm.rmm_cupy_allocator(0)
+        m = rmm_cupy_allocator(0)
         assert m.mem.size == 0
         assert m.mem.ptr == 0
         assert isinstance(m.mem._owner, rmm.DeviceBuffer)
@@ -355,7 +357,7 @@ def test_rmm_pool_cupy_allocator_stream_lifetime():
     cupy = pytest.importorskip("cupy")

     rmm.reinitialize(pool_allocator=True)
-    cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
+    cupy.cuda.set_allocator(rmm_cupy_allocator)

     stream = cupy.cuda.stream.Stream()

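Similarly, a short sketch of the Numba hook under the new layout (assumes
numba is installed and a CUDA device is available); as in the test module
above, the memory manager is set before any CUDA work:

    # Make Numba's CUDA allocations go through RMM via the explicit import.
    import numpy as np
    from numba import cuda

    from rmm.allocators.numba import RMMNumbaManager

    cuda.set_memory_manager(RMMNumbaManager)  # set before the CUDA context is created

    d_arr = cuda.device_array(10, dtype=np.int32)  # device memory now comes from RMM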
4 changes: 2 additions & 2 deletions python/rmm/tests/test_rmm_pytorch.py
@@ -2,7 +2,7 @@

 import pytest

-import rmm
+from rmm.allocators.torch import rmm_torch_allocator

 torch = pytest.importorskip("torch")

@@ -13,7 +13,7 @@ def torch_allocator():
         from torch.cuda.memory import change_current_allocator
     except ImportError:
         pytest.skip("pytorch pluggable allocator not available")
-    change_current_allocator(rmm.rmm_torch_allocator)
+    change_current_allocator(rmm_torch_allocator)


 def test_rmm_torch_allocator(torch_allocator, stats_mr):
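And a short sketch of the PyTorch hook under the new layout (assumes a
PyTorch build that exposes the pluggable-allocator API and a CUDA device):

    # Register RMM as PyTorch's CUDA allocator via the explicit import.
    import torch
    from rmm.allocators.torch import rmm_torch_allocator

    try:
        from torch.cuda.memory import change_current_allocator
    except ImportError:
        raise SystemExit("pytorch pluggable allocator not available")

    change_current_allocator(rmm_torch_allocator)  # swap in before any CUDA allocations
    t = torch.empty(8, device="cuda")              # allocated through RMM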
