Skip to content

Commit

Permalink
fix(python): further improved lazy loading (#5459)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Nov 10, 2022
1 parent 9f5049f commit 7ef7de3
Show file tree
Hide file tree
Showing 4 changed files with 142 additions and 129 deletions.
4 changes: 3 additions & 1 deletion py-polars/docs/source/_static/css/custom.css
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ footer.bd-footer{
ToC, so hide until there's actually something to put there...
*/
div.bd-sidebar-secondary {
position: absolute;
display: none;
}
label.sidebar-toggle.secondary-toggle {
display: none !important;
}
89 changes: 43 additions & 46 deletions py-polars/polars/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

from _ctypes import _SimpleCData # type: ignore[import]

from polars.dependencies import _PYARROW_AVAILABLE
from polars.dependencies import pyarrow as pa

try:
Expand Down Expand Up @@ -486,51 +485,49 @@ def NUMPY_CHAR_CODE_TO_DTYPE(self) -> dict[str, PolarsDataType]:
"?": Boolean,
}

if _PYARROW_AVAILABLE:

@property
@cache
def PY_TYPE_TO_ARROW_TYPE(self) -> dict[type, Callable[[], pa.lib.DataType]]:
return {
float: pa.float64(),
int: pa.int64(),
str: pa.large_utf8(),
bool: pa.bool_(),
date: pa.date32(),
time: pa.time64("us"),
datetime: pa.timestamp("us"),
timedelta: pa.duration("us"),
}

@property
@cache
def DTYPE_TO_ARROW_TYPE(
self,
) -> dict[PolarsDataType, Callable[[], pa.lib.DataType]]:
return {
Int8: pa.int8(),
Int16: pa.int16(),
Int32: pa.int32(),
Int64: pa.int64(),
UInt8: pa.uint8(),
UInt16: pa.uint16(),
UInt32: pa.uint32(),
UInt64: pa.uint64(),
Float32: pa.float32(),
Float64: pa.float64(),
Boolean: pa.bool_(),
Utf8: pa.large_utf8(),
Date: pa.date32(),
Datetime: pa.timestamp("us"),
Datetime("ms"): pa.timestamp("ms"),
Datetime("us"): pa.timestamp("us"),
Datetime("ns"): pa.timestamp("ns"),
Duration: pa.duration("us"),
Duration("ms"): pa.duration("ms"),
Duration("us"): pa.duration("us"),
Duration("ns"): pa.duration("ns"),
Time: pa.time64("us"),
}
@property
@cache
def PY_TYPE_TO_ARROW_TYPE(self) -> dict[type, pa.lib.DataType]:
    """
    Map native Python types to their pyarrow data types.

    The result is memoised by the ``cache`` decorator. Temporal types
    (``time``, ``datetime``, ``timedelta``) use microsecond ("us") resolution.

    Returns
    -------
    dict
        Python type -> instantiated pyarrow ``DataType`` (the values are
        ``DataType`` objects, not factories/callables).
    """
    return {
        float: pa.float64(),
        int: pa.int64(),
        str: pa.large_utf8(),
        bool: pa.bool_(),
        date: pa.date32(),
        time: pa.time64("us"),
        datetime: pa.timestamp("us"),
        timedelta: pa.duration("us"),
    }

@property
@cache
def DTYPE_TO_ARROW_TYPE(
    self,
) -> dict[PolarsDataType, pa.lib.DataType]:
    """
    Map polars dtypes to their pyarrow data types.

    The result is memoised by the ``cache`` decorator. The unparameterised
    ``Datetime`` and ``Duration`` dtypes (and ``Time``) map to the microsecond
    ("us") variants; explicit "ms"/"us"/"ns" units map to the matching
    pyarrow time unit.

    Returns
    -------
    dict
        Polars dtype -> instantiated pyarrow ``DataType`` (the values are
        ``DataType`` objects, not factories/callables).
    """
    return {
        # integer types
        Int8: pa.int8(),
        Int16: pa.int16(),
        Int32: pa.int32(),
        Int64: pa.int64(),
        UInt8: pa.uint8(),
        UInt16: pa.uint16(),
        UInt32: pa.uint32(),
        UInt64: pa.uint64(),
        # floating point, boolean and string types
        Float32: pa.float32(),
        Float64: pa.float64(),
        Boolean: pa.bool_(),
        Utf8: pa.large_utf8(),
        # temporal types
        Date: pa.date32(),
        Datetime: pa.timestamp("us"),
        Datetime("ms"): pa.timestamp("ms"),
        Datetime("us"): pa.timestamp("us"),
        Datetime("ns"): pa.timestamp("ns"),
        Duration: pa.duration("us"),
        Duration("ms"): pa.duration("ms"),
        Duration("us"): pa.duration("us"),
        Duration("ns"): pa.duration("ns"),
        Time: pa.time64("us"),
    }


# initialise once (poor man's singleton :)
Expand Down
170 changes: 92 additions & 78 deletions py-polars/polars/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,11 @@
import inspect
import re
import sys
from importlib.machinery import ModuleSpec
from importlib.util import LazyLoader, find_spec, module_from_spec
from importlib import import_module
from importlib.util import find_spec
from types import ModuleType
from typing import TYPE_CHECKING, Any

_mod_pfx = {
"numpy": "np.",
"pandas": "pd.",
"pyarrow": "pa.",
}

_FSSPEC_AVAILABLE = True
_NUMPY_AVAILABLE = True
_PANDAS_AVAILABLE = True
Expand All @@ -22,45 +16,82 @@
_HYPOTHESIS_AVAILABLE = True


def _proxy_module(module_name: str) -> ModuleType:
class _LazyModule(ModuleType):
"""
Create a module that raises a helpful/explanatory exception on attribute access.
Module that can act both as a lazy-loader and as a proxy.
Notes
-----
We do NOT register this module with `sys.modules` so as not to cause confusion
in the global environment. This way we have a valid lazy/proxy module for our
own use, but it is scoped _exclusively_ for use within polars.
Parameters
----------
module_name : str
the name of the new/proxy module.
We do NOT register this module with `sys.modules` so as not to cause
confusion in the global environment. This way we have a valid proxy
module for our own use, but it lives _exclusively_ within polars.
"""
# module-level getattr for the proxy
def __getattr__(*args: Any, **kwargs: Any) -> None:
attr = args[0]
# allow some very minimal introspection on private module
# attrs to avoid unnecessary error-handling elsewhere
if re.match(r"^__\w+__$", attr):
return None

# all other attribute access raises an exception
pfx = _mod_pfx.get(module_name, "")
raise ModuleNotFoundError(
f"{pfx}{attr} requires '{module_name}' module to be installed"
) from None

# create the module (do NOT register with sys.globals)
proxy_module = module_from_spec(ModuleSpec(module_name, None))
for name, obj in (("__getattr__", __getattr__),):
setattr(proxy_module, name, obj)

return proxy_module


def lazy_import(module_name: str) -> tuple[ModuleType, bool]:
_mod_pfx: dict[str, str] = {
"numpy": "np.",
"pandas": "pd.",
"pyarrow": "pa.",
}

def __init__(
self,
module_name: str,
module_available: bool,
) -> None:
"""
Initialise lazy-loading proxy module.
Parameters
----------
module_name : str
the name of the module to lazy-load (if available).
module_available : bool
indicate if the referenced module is actually available (we will proxy it
in both cases, but raise a helpful error when invoked if it doesn't exist).
"""
self._module_available = module_available
self._module_name = module_name
self._globals = globals()
super().__init__(module_name)

def _import(self) -> ModuleType:
# import the referenced module, replacing the proxy in this module's globals
module = import_module(self.__name__)
self._globals[self._module_name] = module
self.__dict__.update(module.__dict__)
return module

def __getattr__(self, attr: Any) -> Any:
# have "hasattr('__wrapped__')" return False without triggering import
# (it's for decorators, not modules, but keeps "make doctest" happy)
if attr == "__wrapped__":
raise AttributeError(
f"{self._module_name!r} object has no attribute {attr!r}"
)

# accessing the proxy module's attributes triggers import of the real thing
if self._module_available:
# import the module and return the requested attribute
module = self._import()
return getattr(module, attr)
else:
# user has not installed the proxied module
if re.match(r"^__\w+__$", attr):
# allow some minimal introspection on private module
# attrs to avoid unnecessary error-handling elsewhere
return None

# all other attribute access raises a helpful exception
pfx = self._mod_pfx.get(self._module_name, "")
raise ModuleNotFoundError(
f"{pfx}{attr} requires '{self._module_name}' module to be installed"
) from None


def _lazy_import(module_name: str) -> tuple[ModuleType, bool]:
"""
Lazy import the given module; avoids up-front import costs.
Expand Down Expand Up @@ -89,37 +120,20 @@ def lazy_import(module_name: str) -> tuple[ModuleType, bool]:

# check if module is AVAILABLE
try:
spec = find_spec(module_name)
if spec is None or spec.loader is None:
spec = None
module_spec = find_spec(module_name)
module_available = not (module_spec is None or module_spec.loader is None)
except ModuleNotFoundError:
spec = None

# if NOT available, return proxy module that raises on attribute access
if spec is None:
return _proxy_module(module_name), False
else:
# handle modules that have old-style loaders (ref: #5326)
if not hasattr(spec.loader, "exec_module"):
if hasattr(spec.loader, "load_module"):
spec.loader.exec_module = ( # type: ignore[assignment, union-attr]
# wrap deprecated 'load_module' for use with 'exec_module'
lambda module: spec.loader.load_module(module.__name__) # type: ignore[union-attr] # noqa: E501
)
if not hasattr(spec.loader, "create_module"):
spec.loader.create_module = ( # type: ignore[assignment, union-attr]
# note: returning 'None' implies use of the standard machinery
lambda spec: None
)

# module IS available, but not yet imported into the environment; create
# a lazy loader that proxies (then replaces) the module in sys.modules
loader = LazyLoader(spec.loader) # type: ignore[arg-type]
spec.loader = loader
module = module_from_spec(spec)
sys.modules[module_name] = module
loader.exec_module(module)
return module, True
module_available = False

# create lazy/proxy module that imports the real one on first use
# (or raises an explanatory ModuleNotFoundError if not available)
return (
_LazyModule(
module_name=module_name,
module_available=module_available,
),
module_available,
)


if TYPE_CHECKING:
Expand All @@ -134,15 +148,15 @@ def lazy_import(module_name: str) -> tuple[ModuleType, bool]:
else:
from backports import zoneinfo
else:
fsspec, _FSSPEC_AVAILABLE = lazy_import("fsspec")
numpy, _NUMPY_AVAILABLE = lazy_import("numpy")
pandas, _PANDAS_AVAILABLE = lazy_import("pandas")
pyarrow, _PYARROW_AVAILABLE = lazy_import("pyarrow")
hypothesis, _HYPOTHESIS_AVAILABLE = lazy_import("hypothesis")
fsspec, _FSSPEC_AVAILABLE = _lazy_import("fsspec")
numpy, _NUMPY_AVAILABLE = _lazy_import("numpy")
pandas, _PANDAS_AVAILABLE = _lazy_import("pandas")
pyarrow, _PYARROW_AVAILABLE = _lazy_import("pyarrow")
hypothesis, _HYPOTHESIS_AVAILABLE = _lazy_import("hypothesis")
zoneinfo, _ZONEINFO_AVAILABLE = (
lazy_import("zoneinfo")
_lazy_import("zoneinfo")
if sys.version_info >= (3, 9)
else lazy_import("backports.zoneinfo")
else _lazy_import("backports.zoneinfo")
)


Expand Down Expand Up @@ -173,7 +187,7 @@ def _PYARROW_TYPE(obj: Any) -> bool:
"pandas",
"pyarrow",
"zoneinfo",
"_proxy_module",
"_LazyModule",
"_FSSPEC_AVAILABLE",
"_NUMPY_AVAILABLE",
"_NUMPY_TYPE",
Expand Down
8 changes: 4 additions & 4 deletions py-polars/tests/unit/test_interop.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,20 +312,20 @@ def test_from_pandas_series() -> None:


def test_from_optional_not_available() -> None:
from polars.dependencies import _proxy_module
from polars.dependencies import _LazyModule

# proxy module is created dynamically if the required module is not available
# (see the polars.dependencies source code for additional detail/comments)

np = _proxy_module("numpy")
np = _LazyModule("numpy", module_available=False)
with pytest.raises(ImportError, match=r"np\.array requires 'numpy'"):
pl.from_numpy(np.array([[1, 2], [3, 4]]), columns=["a", "b"])

pa = _proxy_module("pyarrow")
pa = _LazyModule("pyarrow", module_available=False)
with pytest.raises(ImportError, match=r"pa\.table requires 'pyarrow'"):
pl.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))

pd = _proxy_module("pandas")
pd = _LazyModule("pandas", module_available=False)
with pytest.raises(ImportError, match=r"pd\.Series requires 'pandas'"):
pl.from_pandas(pd.Series([1, 2, 3]))

Expand Down

0 comments on commit 7ef7de3

Please sign in to comment.