Skip to content

Commit

Permalink
feat(python): much faster lazy type-checks (#6064)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Jan 5, 2023
1 parent 2feb833 commit 41f0aa6
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 80 deletions.
43 changes: 23 additions & 20 deletions py-polars/polars/dependencies.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from __future__ import annotations

import inspect
import re
import sys
from functools import lru_cache
from importlib import import_module
from importlib.util import find_spec
from types import ModuleType
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Hashable, cast

_FSSPEC_AVAILABLE = True
_NUMPY_AVAILABLE = True
Expand All @@ -29,6 +29,8 @@ class _LazyModule(ModuleType):
"""

__lazy__ = True

_mod_pfx: dict[str, str] = {
"numpy": "np.",
"pandas": "pd.",
Expand Down Expand Up @@ -163,25 +165,26 @@ def _lazy_import(module_name: str) -> tuple[ModuleType, bool]:
)


def _NUMPY_TYPE(obj: Any) -> bool:
    """
    Report whether ``obj`` (a class or an instance) looks numpy-related.

    The check is a string match: it is True when numpy is flagged as available
    and the text "numpy." occurs in the ``str()`` of any class in the MRO of
    ``obj`` (if it is a class) or of ``obj.__class__`` (otherwise).
    """
    if not _NUMPY_AVAILABLE:
        return False
    # classes are inspected directly; instances are inspected via their class
    klass = obj if inspect.isclass(obj) else obj.__class__
    for base in klass.mro():
        if "numpy." in str(base):
            return True
    return False
@lru_cache(maxsize=None)
def _might_be(cls: type, type_: str) -> bool:
    """
    Guess whether ``cls`` "might" be associated with the module ``type_``.

    This is a cheap textual pre-filter: it is True when ``"<type_>."`` occurs
    in the ``str()`` of any class in ``cls.mro()``. A True result only means it
    is reasonable to follow up with a real ``isinstance`` check. Results are
    memoised per ``(cls, type_)`` pair by ``lru_cache``.

    Returns False if walking the MRO raises ``TypeError``.
    """
    try:
        needle = f"{type_}."
        for ancestor in cls.mro():
            if needle in str(ancestor):
                return True
    except TypeError:
        return False
    return False


def _PANDAS_TYPE(obj: Any) -> bool:
    """
    Report whether ``obj`` (a class or an instance) looks pandas-related.

    The check is a string match: it is True when pandas is flagged as available
    and the text "pandas." occurs in the ``str()`` of any class in the MRO of
    ``obj`` (if it is a class) or of ``obj.__class__`` (otherwise).
    """
    if not _PANDAS_AVAILABLE:
        return False
    # classes are inspected directly; instances are inspected via their class
    klass = obj if inspect.isclass(obj) else obj.__class__
    for base in klass.mro():
        if "pandas." in str(base):
            return True
    return False
def _check_for_numpy(obj: Any) -> bool:
    """
    Cheap pre-filter: could ``obj`` be a numpy object?

    True only when numpy is flagged as available and ``_might_be`` finds
    "numpy." in the MRO of ``type(obj)``. Meant to guard a subsequent real
    ``isinstance`` check against a numpy type.
    """
    # the cast only satisfies the type-checker; it is a no-op at runtime
    obj_type = cast(Hashable, type(obj))
    return _NUMPY_AVAILABLE and _might_be(obj_type, "numpy")


def _PYARROW_TYPE(obj: Any) -> bool:
    """
    Report whether ``obj`` (a class or an instance) looks pyarrow-related.

    The check is a string match: it is True when pyarrow is flagged as
    available and the text "pyarrow." occurs in the ``str()`` of any class in
    the MRO of ``obj`` (if it is a class) or of ``obj.__class__`` (otherwise).
    """
    if not _PYARROW_AVAILABLE:
        return False
    # classes are inspected directly; instances are inspected via their class
    klass = obj if inspect.isclass(obj) else obj.__class__
    for base in klass.mro():
        if "pyarrow." in str(base):
            return True
    return False
def _check_for_pandas(obj: Any) -> bool:
    """
    Cheap pre-filter: could ``obj`` be a pandas object?

    True only when pandas is flagged as available and ``_might_be`` finds
    "pandas." in the MRO of ``type(obj)``. Meant to guard a subsequent real
    ``isinstance`` check against a pandas type.
    """
    # the cast only satisfies the type-checker; it is a no-op at runtime
    obj_type = cast(Hashable, type(obj))
    return _PANDAS_AVAILABLE and _might_be(obj_type, "pandas")


def _check_for_pyarrow(obj: Any) -> bool:
    """
    Cheap pre-filter: could ``obj`` be a pyarrow object?

    True only when pyarrow is flagged as available and ``_might_be`` finds
    "pyarrow." in the MRO of ``type(obj)``. Meant to guard a subsequent real
    ``isinstance`` check against a pyarrow type.
    """
    # the cast only satisfies the type-checker; it is a no-op at runtime
    obj_type = cast(Hashable, type(obj))
    return _PYARROW_AVAILABLE and _might_be(obj_type, "pyarrow")


__all__ = [
Expand All @@ -194,11 +197,11 @@ def _PYARROW_TYPE(obj: Any) -> bool:
"_LazyModule",
"_FSSPEC_AVAILABLE",
"_NUMPY_AVAILABLE",
"_NUMPY_TYPE",
"_check_for_numpy",
"_PANDAS_AVAILABLE",
"_PANDAS_TYPE",
"_check_for_pandas",
"_PYARROW_AVAILABLE",
"_PYARROW_TYPE",
"_check_for_pyarrow",
"_ZONEINFO_AVAILABLE",
"_HYPOTHESIS_AVAILABLE",
"_DELTALAKE_AVAILABLE",
Expand Down
18 changes: 12 additions & 6 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@
)
from polars.dependencies import (
_NUMPY_AVAILABLE,
_NUMPY_TYPE,
_PANDAS_TYPE,
_PYARROW_AVAILABLE,
_check_for_numpy,
_check_for_pandas,
)
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
Expand Down Expand Up @@ -374,7 +374,7 @@ def sequence_to_pyseries(
return PySeries.new_object(name, values, strict)

elif (
_NUMPY_TYPE(value)
_check_for_numpy(value)
and isinstance(value, np.ndarray)
and len(value.shape) == 1
):
Expand Down Expand Up @@ -591,7 +591,11 @@ def dict_to_pydf(
count_numpy = 0
for val in data.values():
# only start a thread pool from a reasonable size.
count_numpy += int(isinstance(val, np.ndarray) and len(val) > 1000)
count_numpy += int(
_check_for_numpy(val)
and isinstance(val, np.ndarray)
and len(val) > 1000
)

# if we have more than 3 numpy arrays we multi-thread
if count_numpy > 2:
Expand Down Expand Up @@ -661,7 +665,7 @@ def sequence_to_pydf(
pydf = _post_apply_columns(pydf, column_names)
return pydf

elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
elif isinstance(data[0], (list, tuple, Sequence)) and not isinstance(data[0], str):
if is_namedtuple(data[0]):
if columns is None:
columns = data[0]._fields # type: ignore[attr-defined]
Expand Down Expand Up @@ -728,7 +732,9 @@ def sequence_to_pydf(
pydf = _post_apply_columns(pydf, columns, categoricals)
return pydf

elif _PANDAS_TYPE(data[0]) and isinstance(data[0], (pd.Series, pd.DatetimeIndex)):
elif _check_for_pandas(data[0]) and isinstance(
data[0], (pd.Series, pd.DatetimeIndex)
):
dtypes = {}
if columns is not None:
columns, dtypes = _unpack_columns(columns, n_expected=1)
Expand Down
31 changes: 16 additions & 15 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@
get_idx_type,
py_type_to_dtype,
)
from polars.dependencies import _NUMPY_TYPE, _PANDAS_TYPE, _PYARROW_TYPE
from polars.dependencies import _check_for_numpy, _check_for_pandas, _check_for_pyarrow
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
Expand Down Expand Up @@ -274,23 +274,23 @@ def __init__(
elif isinstance(data, dict):
self._df = dict_to_pydf(data, columns=columns)

elif _NUMPY_TYPE(data) and isinstance(data, np.ndarray):
self._df = numpy_to_pydf(data, columns=columns, orient=orient)

elif _PYARROW_TYPE(data) and isinstance(data, pa.Table):
self._df = arrow_to_pydf(data, columns=columns)
elif isinstance(data, pli.Series):
self._df = series_to_pydf(data, columns=columns)

elif isinstance(data, Sequence) and not isinstance(data, str):
elif isinstance(data, (list, tuple, Sequence)):
self._df = sequence_to_pydf(
data, columns=columns, orient=orient, infer_schema_length=50
)
elif isinstance(data, pli.Series):
self._df = series_to_pydf(data, columns=columns)
elif _check_for_numpy(data) and isinstance(data, np.ndarray):
self._df = numpy_to_pydf(data, columns=columns, orient=orient)

elif _check_for_pyarrow(data) and isinstance(data, pa.Table):
self._df = arrow_to_pydf(data, columns=columns)

elif _PANDAS_TYPE(data) and isinstance(data, pd.DataFrame):
elif _check_for_pandas(data) and isinstance(data, pd.DataFrame):
self._df = pandas_to_pydf(data, columns=columns)

elif isinstance(data, (Generator, Iterable)) and not isinstance(data, Sized):
elif not isinstance(data, Sized) and isinstance(data, (Generator, Iterable)):
self._df = iterable_to_pydf(data, columns=columns, orient=orient)
else:
raise ValueError(
Expand Down Expand Up @@ -1176,7 +1176,7 @@ def _pos_idxs(

return idxs.cast(idx_type)

if _NUMPY_TYPE(idxs) and isinstance(idxs, np.ndarray):
if _check_for_numpy(idxs) and isinstance(idxs, np.ndarray):
if idxs.ndim != 1:
raise ValueError("Only 1D numpy array is supported as index.")
if idxs.dtype.kind in ("i", "u"):
Expand Down Expand Up @@ -1301,7 +1301,8 @@ def __getitem__(
# df[2, :] (select row as df)
if isinstance(row_selection, int):
if isinstance(col_selection, (slice, list)) or (
_NUMPY_TYPE(col_selection) and isinstance(col_selection, np.ndarray)
_check_for_numpy(col_selection)
and isinstance(col_selection, np.ndarray)
):
df = self[:, col_selection]
return df.slice(row_selection, 1)
Expand Down Expand Up @@ -1348,7 +1349,7 @@ def __getitem__(
# select rows by numpy mask or index
# df[np.array([1, 2, 3])]
# df[np.array([True, False, True])]
if _NUMPY_TYPE(item) and isinstance(item, np.ndarray):
if _check_for_numpy(item) and isinstance(item, np.ndarray):
if item.ndim != 1:
raise ValueError("Only a 1D-Numpy array is supported as index.")
if item.dtype.kind in ("i", "u"):
Expand Down Expand Up @@ -2556,7 +2557,7 @@ def filter(
└─────┴─────┴─────┘
"""
if _NUMPY_TYPE(predicate) and isinstance(predicate, np.ndarray):
if _check_for_numpy(predicate) and isinstance(predicate, np.ndarray):
predicate = pli.Series(predicate)

return (
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/internals/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
is_polars_dtype,
py_type_to_dtype,
)
from polars.dependencies import _NUMPY_TYPE
from polars.dependencies import _check_for_numpy
from polars.dependencies import numpy as np
from polars.internals.expr.binary import ExprBinaryNameSpace
from polars.internals.expr.categorical import ExprCatNameSpace
Expand Down Expand Up @@ -1999,7 +1999,7 @@ def take(
"""
if isinstance(indices, list) or (
_NUMPY_TYPE(indices) and isinstance(indices, np.ndarray)
_check_for_numpy(indices) and isinstance(indices, np.ndarray)
):
indices = cast("np.ndarray[Any, Any]", indices)
indices_lit = pli.lit(pli.Series("", indices, dtype=UInt32))
Expand Down
6 changes: 3 additions & 3 deletions py-polars/polars/internals/lazy_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
is_polars_dtype,
py_type_to_dtype,
)
from polars.dependencies import _NUMPY_TYPE
from polars.dependencies import _check_for_numpy
from polars.dependencies import numpy as np
from polars.internals.type_aliases import EpochTimeUnit
from polars.utils import (
Expand Down Expand Up @@ -194,7 +194,7 @@ def col(
if isinstance(name, DataType):
return pli.wrap_expr(_dtype_cols([name]))

elif not isinstance(name, str) and isinstance(name, Sequence):
elif not isinstance(name, str) and isinstance(name, (list, tuple, Sequence)):
if len(name) == 0 or isinstance(name[0], str):
return pli.wrap_expr(pycols(name))
elif is_polars_dtype(name[0]):
Expand Down Expand Up @@ -1101,7 +1101,7 @@ def lit(
return e
return e.alias(name)

if _NUMPY_TYPE(value) and isinstance(value, np.ndarray):
if _check_for_numpy(value) and isinstance(value, np.ndarray):
return lit(pli.Series("", value))

if dtype:
Expand Down
62 changes: 33 additions & 29 deletions py-polars/polars/internals/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@
supported_numpy_char_code,
)
from polars.dependencies import (
_NUMPY_TYPE,
_PANDAS_TYPE,
_PYARROW_AVAILABLE,
_PYARROW_TYPE,
_check_for_numpy,
_check_for_pandas,
_check_for_pyarrow,
)
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
Expand Down Expand Up @@ -232,25 +232,6 @@ def __init__(
elif isinstance(values, Series):
self._s = series_to_pyseries(name, values)

elif _PYARROW_TYPE(values) and isinstance(values, (pa.Array, pa.ChunkedArray)):
self._s = arrow_to_pyseries(name, values)

elif _NUMPY_TYPE(values) and isinstance(values, np.ndarray):
self._s = numpy_to_pyseries(name, values, strict, nan_to_null)
if values.dtype.type == np.datetime64:
# cast to appropriate dtype, handling NaT values
dtype = _resolve_datetime_dtype(dtype, values.dtype)
if dtype is not None:
self._s = (
self.cast(dtype)
.set_at_idx(np.argwhere(np.isnat(values)).flatten(), None)
._s
)
return

if dtype is not None:
self._s = self.cast(dtype, strict=True)._s

elif isinstance(values, range):
self._s = (
pli.arange(
Expand All @@ -272,7 +253,30 @@ def __init__(
dtype_if_empty=dtype_if_empty,
nan_to_null=nan_to_null,
)
elif _PANDAS_TYPE(values) and isinstance(values, (pd.Series, pd.DatetimeIndex)):
elif _check_for_numpy(values) and isinstance(values, np.ndarray):
self._s = numpy_to_pyseries(name, values, strict, nan_to_null)
if values.dtype.type == np.datetime64:
# cast to appropriate dtype, handling NaT values
dtype = _resolve_datetime_dtype(dtype, values.dtype)
if dtype is not None:
self._s = (
self.cast(dtype)
.set_at_idx(np.argwhere(np.isnat(values)).flatten(), None)
._s
)
return

if dtype is not None:
self._s = self.cast(dtype, strict=True)._s

elif _check_for_pyarrow(values) and isinstance(
values, (pa.Array, pa.ChunkedArray)
):
self._s = arrow_to_pyseries(name, values)

elif _check_for_pandas(values) and isinstance(
values, (pd.Series, pd.DatetimeIndex)
):
self._s = pandas_to_pyseries(name, values)

elif _is_generator(values):
Expand Down Expand Up @@ -627,7 +631,7 @@ def __rpow__(self, other: Any) -> Series:

def __matmul__(self, other: Any) -> float | Series | None:
if isinstance(other, Sequence) or (
_NUMPY_TYPE(other) and isinstance(other, np.ndarray)
_check_for_numpy(other) and isinstance(other, np.ndarray)
):
other = Series(other)
# elif isinstance(other, pli.DataFrame):
Expand All @@ -636,7 +640,7 @@ def __matmul__(self, other: Any) -> float | Series | None:

def __rmatmul__(self, other: Any) -> float | Series | None:
if isinstance(other, Sequence) or (
_NUMPY_TYPE(other) and isinstance(other, np.ndarray)
_check_for_numpy(other) and isinstance(other, np.ndarray)
):
other = Series(other)
return other.dot(self)
Expand Down Expand Up @@ -706,7 +710,7 @@ def _pos_idxs(self, idxs: np.ndarray[Any, Any] | Series) -> Series:

return idxs.cast(idx_type)

elif _NUMPY_TYPE(idxs) and isinstance(idxs, np.ndarray):
elif _check_for_numpy(idxs) and isinstance(idxs, np.ndarray):
if idxs.ndim != 1:
raise ValueError("Only 1D numpy array is supported as index.")
if idxs.dtype.kind in ("i", "u"):
Expand Down Expand Up @@ -785,7 +789,7 @@ def __getitem__(
return wrap_s(self._s.take_with_series(self._pos_idxs(item)._s))

elif (
_NUMPY_TYPE(item)
_check_for_numpy(item)
and isinstance(item, np.ndarray)
and item.dtype.kind in ("i", "u")
):
Expand Down Expand Up @@ -843,7 +847,7 @@ def __getitem__(
return wrap_s(self._s.filter(item._s))

elif (
_NUMPY_TYPE(item)
_check_for_numpy(item)
and isinstance(item, np.ndarray)
and item.dtype.kind == "b"
):
Expand Down Expand Up @@ -897,7 +901,7 @@ def __setitem__(
self._s = self.set_at_idx(key, value)._s

# TODO: implement for these types without casting to series
elif _NUMPY_TYPE(key) and isinstance(key, np.ndarray):
elif _check_for_numpy(key) and isinstance(key, np.ndarray):
if key.dtype == np.bool_:
# boolean numpy mask
self._s = self.set_at_idx(np.argwhere(key)[:, 0], value)._s
Expand Down

0 comments on commit 41f0aa6

Please sign in to comment.