feat(python): much faster lazy type-checks (#6064)

pola-rs · Jan 5, 2023 · 41f0aa6 · 41f0aa6
1 parent 2feb833
commit 41f0aa6
Show file tree

Hide file tree

Showing 8 changed files with 97 additions and 80 deletions.
diff --git a/py-polars/polars/dependencies.py b/py-polars/polars/dependencies.py
@@ -1,12 +1,12 @@
 from __future__ import annotations
 
-import inspect
 import re
 import sys
+from functools import lru_cache
 from importlib import import_module
 from importlib.util import find_spec
 from types import ModuleType
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, Hashable, cast
 
 _FSSPEC_AVAILABLE = True
 _NUMPY_AVAILABLE = True
@@ -29,6 +29,8 @@ class _LazyModule(ModuleType):
 
     """
 
+    __lazy__ = True
+
     _mod_pfx: dict[str, str] = {
         "numpy": "np.",
         "pandas": "pd.",
@@ -163,25 +165,26 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]:
     )
 
 
-def _NUMPY_TYPE(obj: Any) -> bool:
-    return _NUMPY_AVAILABLE and any(
-        "numpy." in str(o)
-        for o in (obj if inspect.isclass(obj) else obj.__class__).mro()
-    )
+@lru_cache(maxsize=None)
+def _might_be(cls: type, type_: str) -> bool:
+    # infer whether the given class "might" be associated with the given
+    # module (in which case it's reasonable to do a real isinstance check)
+    try:
+        return any(f"{type_}." in str(o) for o in cls.mro())
+    except TypeError:
+        return False
 
 
-def _PANDAS_TYPE(obj: Any) -> bool:
-    return _PANDAS_AVAILABLE and any(
-        "pandas." in str(o)
-        for o in (obj if inspect.isclass(obj) else obj.__class__).mro()
-    )
+def _check_for_numpy(obj: Any) -> bool:
+    return _NUMPY_AVAILABLE and _might_be(cast(Hashable, type(obj)), "numpy")
 
 
-def _PYARROW_TYPE(obj: Any) -> bool:
-    return _PYARROW_AVAILABLE and any(
-        "pyarrow." in str(o)
-        for o in (obj if inspect.isclass(obj) else obj.__class__).mro()
-    )
+def _check_for_pandas(obj: Any) -> bool:
+    return _PANDAS_AVAILABLE and _might_be(cast(Hashable, type(obj)), "pandas")
+
+
+def _check_for_pyarrow(obj: Any) -> bool:
+    return _PYARROW_AVAILABLE and _might_be(cast(Hashable, type(obj)), "pyarrow")
 
 
 __all__ = [
@@ -194,11 +197,11 @@ def _PYARROW_TYPE(obj: Any) -> bool:
     "_LazyModule",
     "_FSSPEC_AVAILABLE",
     "_NUMPY_AVAILABLE",
-    "_NUMPY_TYPE",
+    "_check_for_numpy",
     "_PANDAS_AVAILABLE",
-    "_PANDAS_TYPE",
+    "_check_for_pandas",
     "_PYARROW_AVAILABLE",
-    "_PYARROW_TYPE",
+    "_check_for_pyarrow",
     "_ZONEINFO_AVAILABLE",
     "_HYPOTHESIS_AVAILABLE",
     "_DELTALAKE_AVAILABLE",

diff --git a/py-polars/polars/internals/construction.py b/py-polars/polars/internals/construction.py
@@ -42,9 +42,9 @@
 )
 from polars.dependencies import (
     _NUMPY_AVAILABLE,
-    _NUMPY_TYPE,
-    _PANDAS_TYPE,
     _PYARROW_AVAILABLE,
+    _check_for_numpy,
+    _check_for_pandas,
 )
 from polars.dependencies import numpy as np
 from polars.dependencies import pandas as pd
@@ -374,7 +374,7 @@ def sequence_to_pyseries(
             return PySeries.new_object(name, values, strict)
 
         elif (
-            _NUMPY_TYPE(value)
+            _check_for_numpy(value)
             and isinstance(value, np.ndarray)
             and len(value.shape) == 1
         ):
@@ -591,7 +591,11 @@ def dict_to_pydf(
         count_numpy = 0
         for val in data.values():
             # only start a thread pool from a reasonable size.
-            count_numpy += int(isinstance(val, np.ndarray) and len(val) > 1000)
+            count_numpy += int(
+                _check_for_numpy(val)
+                and isinstance(val, np.ndarray)
+                and len(val) > 1000
+            )
 
         # if we have at least 3 numpy arrays with len > 1000 we multi-thread
         if count_numpy > 2:
@@ -661,7 +665,7 @@ def sequence_to_pydf(
             pydf = _post_apply_columns(pydf, column_names)
         return pydf
 
-    elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
+    elif isinstance(data[0], (list, tuple, Sequence)) and not isinstance(data[0], str):
         if is_namedtuple(data[0]):
             if columns is None:
                 columns = data[0]._fields  # type: ignore[attr-defined]
@@ -728,7 +732,9 @@ def sequence_to_pydf(
             pydf = _post_apply_columns(pydf, columns, categoricals)
         return pydf
 
-    elif _PANDAS_TYPE(data[0]) and isinstance(data[0], (pd.Series, pd.DatetimeIndex)):
+    elif _check_for_pandas(data[0]) and isinstance(
+        data[0], (pd.Series, pd.DatetimeIndex)
+    ):
         dtypes = {}
         if columns is not None:
             columns, dtypes = _unpack_columns(columns, n_expected=1)

diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py
@@ -46,7 +46,7 @@
     get_idx_type,
     py_type_to_dtype,
 )
-from polars.dependencies import _NUMPY_TYPE, _PANDAS_TYPE, _PYARROW_TYPE
+from polars.dependencies import _check_for_numpy, _check_for_pandas, _check_for_pyarrow
 from polars.dependencies import numpy as np
 from polars.dependencies import pandas as pd
 from polars.dependencies import pyarrow as pa
@@ -274,23 +274,23 @@ def __init__(
         elif isinstance(data, dict):
             self._df = dict_to_pydf(data, columns=columns)
 
-        elif _NUMPY_TYPE(data) and isinstance(data, np.ndarray):
-            self._df = numpy_to_pydf(data, columns=columns, orient=orient)
-
-        elif _PYARROW_TYPE(data) and isinstance(data, pa.Table):
-            self._df = arrow_to_pydf(data, columns=columns)
+        elif isinstance(data, pli.Series):
+            self._df = series_to_pydf(data, columns=columns)
 
-        elif isinstance(data, Sequence) and not isinstance(data, str):
+        elif isinstance(data, (list, tuple, Sequence)):
             self._df = sequence_to_pydf(
                 data, columns=columns, orient=orient, infer_schema_length=50
             )
-        elif isinstance(data, pli.Series):
-            self._df = series_to_pydf(data, columns=columns)
+        elif _check_for_numpy(data) and isinstance(data, np.ndarray):
+            self._df = numpy_to_pydf(data, columns=columns, orient=orient)
+
+        elif _check_for_pyarrow(data) and isinstance(data, pa.Table):
+            self._df = arrow_to_pydf(data, columns=columns)
 
-        elif _PANDAS_TYPE(data) and isinstance(data, pd.DataFrame):
+        elif _check_for_pandas(data) and isinstance(data, pd.DataFrame):
             self._df = pandas_to_pydf(data, columns=columns)
 
-        elif isinstance(data, (Generator, Iterable)) and not isinstance(data, Sized):
+        elif not isinstance(data, Sized) and isinstance(data, (Generator, Iterable)):
             self._df = iterable_to_pydf(data, columns=columns, orient=orient)
         else:
             raise ValueError(
@@ -1176,7 +1176,7 @@ def _pos_idxs(
 
                 return idxs.cast(idx_type)
 
-        if _NUMPY_TYPE(idxs) and isinstance(idxs, np.ndarray):
+        if _check_for_numpy(idxs) and isinstance(idxs, np.ndarray):
             if idxs.ndim != 1:
                 raise ValueError("Only 1D numpy array is supported as index.")
             if idxs.dtype.kind in ("i", "u"):
@@ -1301,7 +1301,8 @@ def __getitem__(
             # df[2, :] (select row as df)
             if isinstance(row_selection, int):
                 if isinstance(col_selection, (slice, list)) or (
-                    _NUMPY_TYPE(col_selection) and isinstance(col_selection, np.ndarray)
+                    _check_for_numpy(col_selection)
+                    and isinstance(col_selection, np.ndarray)
                 ):
                     df = self[:, col_selection]
                     return df.slice(row_selection, 1)
@@ -1348,7 +1349,7 @@ def __getitem__(
         # select rows by numpy mask or index
         # df[np.array([1, 2, 3])]
         # df[np.array([True, False, True])]
-        if _NUMPY_TYPE(item) and isinstance(item, np.ndarray):
+        if _check_for_numpy(item) and isinstance(item, np.ndarray):
             if item.ndim != 1:
                 raise ValueError("Only a 1D-Numpy array is supported as index.")
             if item.dtype.kind in ("i", "u"):
@@ -2556,7 +2557,7 @@ def filter(
         └─────┴─────┴─────┘
 
         """
-        if _NUMPY_TYPE(predicate) and isinstance(predicate, np.ndarray):
+        if _check_for_numpy(predicate) and isinstance(predicate, np.ndarray):
             predicate = pli.Series(predicate)
 
         return (

diff --git a/py-polars/polars/internals/expr/expr.py b/py-polars/polars/internals/expr/expr.py
@@ -15,7 +15,7 @@
     is_polars_dtype,
     py_type_to_dtype,
 )
-from polars.dependencies import _NUMPY_TYPE
+from polars.dependencies import _check_for_numpy
 from polars.dependencies import numpy as np
 from polars.internals.expr.binary import ExprBinaryNameSpace
 from polars.internals.expr.categorical import ExprCatNameSpace
@@ -1999,7 +1999,7 @@ def take(
 
         """
         if isinstance(indices, list) or (
-            _NUMPY_TYPE(indices) and isinstance(indices, np.ndarray)
+            _check_for_numpy(indices) and isinstance(indices, np.ndarray)
         ):
             indices = cast("np.ndarray[Any, Any]", indices)
             indices_lit = pli.lit(pli.Series("", indices, dtype=UInt32))

diff --git a/py-polars/polars/internals/lazy_functions.py b/py-polars/polars/internals/lazy_functions.py
@@ -19,7 +19,7 @@
     is_polars_dtype,
     py_type_to_dtype,
 )
-from polars.dependencies import _NUMPY_TYPE
+from polars.dependencies import _check_for_numpy
 from polars.dependencies import numpy as np
 from polars.internals.type_aliases import EpochTimeUnit
 from polars.utils import (
@@ -194,7 +194,7 @@ def col(
     if isinstance(name, DataType):
         return pli.wrap_expr(_dtype_cols([name]))
 
-    elif not isinstance(name, str) and isinstance(name, Sequence):
+    elif not isinstance(name, str) and isinstance(name, (list, tuple, Sequence)):
         if len(name) == 0 or isinstance(name[0], str):
             return pli.wrap_expr(pycols(name))
         elif is_polars_dtype(name[0]):
@@ -1101,7 +1101,7 @@ def lit(
             return e
         return e.alias(name)
 
-    if _NUMPY_TYPE(value) and isinstance(value, np.ndarray):
+    if _check_for_numpy(value) and isinstance(value, np.ndarray):
         return lit(pli.Series("", value))
 
     if dtype:

diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py
@@ -46,10 +46,10 @@
     supported_numpy_char_code,
 )
 from polars.dependencies import (
-    _NUMPY_TYPE,
-    _PANDAS_TYPE,
     _PYARROW_AVAILABLE,
-    _PYARROW_TYPE,
+    _check_for_numpy,
+    _check_for_pandas,
+    _check_for_pyarrow,
 )
 from polars.dependencies import numpy as np
 from polars.dependencies import pandas as pd
@@ -232,25 +232,6 @@ def __init__(
         elif isinstance(values, Series):
             self._s = series_to_pyseries(name, values)
 
-        elif _PYARROW_TYPE(values) and isinstance(values, (pa.Array, pa.ChunkedArray)):
-            self._s = arrow_to_pyseries(name, values)
-
-        elif _NUMPY_TYPE(values) and isinstance(values, np.ndarray):
-            self._s = numpy_to_pyseries(name, values, strict, nan_to_null)
-            if values.dtype.type == np.datetime64:
-                # cast to appropriate dtype, handling NaT values
-                dtype = _resolve_datetime_dtype(dtype, values.dtype)
-                if dtype is not None:
-                    self._s = (
-                        self.cast(dtype)
-                        .set_at_idx(np.argwhere(np.isnat(values)).flatten(), None)
-                        ._s
-                    )
-                    return
-
-            if dtype is not None:
-                self._s = self.cast(dtype, strict=True)._s
-
         elif isinstance(values, range):
             self._s = (
                 pli.arange(
@@ -272,7 +253,30 @@ def __init__(
                 dtype_if_empty=dtype_if_empty,
                 nan_to_null=nan_to_null,
             )
-        elif _PANDAS_TYPE(values) and isinstance(values, (pd.Series, pd.DatetimeIndex)):
+        elif _check_for_numpy(values) and isinstance(values, np.ndarray):
+            self._s = numpy_to_pyseries(name, values, strict, nan_to_null)
+            if values.dtype.type == np.datetime64:
+                # cast to appropriate dtype, handling NaT values
+                dtype = _resolve_datetime_dtype(dtype, values.dtype)
+                if dtype is not None:
+                    self._s = (
+                        self.cast(dtype)
+                        .set_at_idx(np.argwhere(np.isnat(values)).flatten(), None)
+                        ._s
+                    )
+                    return
+
+            if dtype is not None:
+                self._s = self.cast(dtype, strict=True)._s
+
+        elif _check_for_pyarrow(values) and isinstance(
+            values, (pa.Array, pa.ChunkedArray)
+        ):
+            self._s = arrow_to_pyseries(name, values)
+
+        elif _check_for_pandas(values) and isinstance(
+            values, (pd.Series, pd.DatetimeIndex)
+        ):
             self._s = pandas_to_pyseries(name, values)
 
         elif _is_generator(values):
@@ -627,7 +631,7 @@ def __rpow__(self, other: Any) -> Series:
 
     def __matmul__(self, other: Any) -> float | Series | None:
         if isinstance(other, Sequence) or (
-            _NUMPY_TYPE(other) and isinstance(other, np.ndarray)
+            _check_for_numpy(other) and isinstance(other, np.ndarray)
         ):
             other = Series(other)
         # elif isinstance(other, pli.DataFrame):
@@ -636,7 +640,7 @@ def __matmul__(self, other: Any) -> float | Series | None:
 
     def __rmatmul__(self, other: Any) -> float | Series | None:
         if isinstance(other, Sequence) or (
-            _NUMPY_TYPE(other) and isinstance(other, np.ndarray)
+            _check_for_numpy(other) and isinstance(other, np.ndarray)
         ):
             other = Series(other)
         return other.dot(self)
@@ -706,7 +710,7 @@ def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series:
 
                 return idxs.cast(idx_type)
 
-        elif _NUMPY_TYPE(idxs) and isinstance(idxs, np.ndarray):
+        elif _check_for_numpy(idxs) and isinstance(idxs, np.ndarray):
             if idxs.ndim != 1:
                 raise ValueError("Only 1D numpy array is supported as index.")
             if idxs.dtype.kind in ("i", "u"):
@@ -785,7 +789,7 @@ def __getitem__(
             return wrap_s(self._s.take_with_series(self._pos_idxs(item)._s))
 
         elif (
-            _NUMPY_TYPE(item)
+            _check_for_numpy(item)
             and isinstance(item, np.ndarray)
             and item.dtype.kind in ("i", "u")
         ):
@@ -843,7 +847,7 @@ def __getitem__(
             return wrap_s(self._s.filter(item._s))
 
         elif (
-            _NUMPY_TYPE(item)
+            _check_for_numpy(item)
             and isinstance(item, np.ndarray)
             and item.dtype.kind == "b"
         ):
@@ -897,7 +901,7 @@ def __setitem__(
                 self._s = self.set_at_idx(key, value)._s
 
         # TODO: implement for these types without casting to series
-        elif _NUMPY_TYPE(key) and isinstance(key, np.ndarray):
+        elif _check_for_numpy(key) and isinstance(key, np.ndarray):
             if key.dtype == np.bool_:
                 # boolean numpy mask
                 self._s = self.set_at_idx(np.argwhere(key)[:, 0], value)._s