Skip to content

Commit

Permalink
feat(python): enable frame init from sequence of pandas series, and i…
Browse files Browse the repository at this point in the history
…mprove lazy typechecks (handle subclasses) (#5383)
  • Loading branch information
alexander-beedie committed Oct 31, 2022
1 parent d5c2022 commit 55a58a3
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 5 deletions.
18 changes: 14 additions & 4 deletions py-polars/polars/dependencies.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import inspect
import re
import sys
from importlib.machinery import ModuleSpec
Expand Down Expand Up @@ -41,7 +42,7 @@ def __getattr__(*args: Any, **kwargs: Any) -> None:
if re.match(r"^__\w+__$", attr):
return None

# other attribute access raises exception
# all other attribute access raises exception
pfx = _mod_pfx.get(module_name, "")
raise ModuleNotFoundError(
f"{pfx}{attr} requires '{module_name}' module to be installed"
Expand Down Expand Up @@ -145,15 +146,24 @@ def lazy_import(module_name: str) -> tuple[ModuleType, bool]:


def _NUMPY_TYPE(obj: Any) -> bool:
    """
    Report whether `obj` is, or derives from, a numpy type.

    `obj` may be either an instance or a class. Every class in its method
    resolution order is inspected, so subclasses of numpy types are matched
    as well as the numpy types themselves. The check short-circuits to a
    falsy result when `_NUMPY_AVAILABLE` is falsy.
    """
    # Read `__mro__` instead of calling `.mro()`: when `obj` is a metaclass
    # (e.g. `type` itself), `.mro` resolves to the unbound `type.mro` and
    # calling it without an argument raises TypeError.
    return _NUMPY_AVAILABLE and any(
        "numpy." in str(o)
        for o in (obj if inspect.isclass(obj) else obj.__class__).__mro__
    )


def _PANDAS_TYPE(obj: Any) -> bool:
    """
    Report whether `obj` is, or derives from, a pandas type.

    `obj` may be either an instance or a class. Every class in its method
    resolution order is inspected, so subclasses of pandas types are matched
    as well as the pandas types themselves. The check short-circuits to a
    falsy result when `_PANDAS_AVAILABLE` is falsy.
    """
    # Read `__mro__` instead of calling `.mro()`: when `obj` is a metaclass
    # (e.g. `type` itself), `.mro` resolves to the unbound `type.mro` and
    # calling it without an argument raises TypeError.
    return _PANDAS_AVAILABLE and any(
        "pandas." in str(o)
        for o in (obj if inspect.isclass(obj) else obj.__class__).__mro__
    )


def _PYARROW_TYPE(obj: Any) -> bool:
    """
    Report whether `obj` is, or derives from, a pyarrow type.

    `obj` may be either an instance or a class. Every class in its method
    resolution order is inspected, so subclasses of pyarrow types are matched
    as well as the pyarrow types themselves. The check short-circuits to a
    falsy result when `_PYARROW_AVAILABLE` is falsy.
    """
    # Read `__mro__` instead of calling `.mro()`: when `obj` is a metaclass
    # (e.g. `type` itself), `.mro` resolves to the unbound `type.mro` and
    # calling it without an argument raises TypeError.
    return _PYARROW_AVAILABLE and any(
        "pyarrow." in str(o)
        for o in (obj if inspect.isclass(obj) else obj.__class__).__mro__
    )


__all__ = [
Expand Down
19 changes: 18 additions & 1 deletion py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
polars_type_to_constructor,
py_type_to_constructor,
)
from polars.dependencies import _NUMPY_AVAILABLE, _PYARROW_AVAILABLE
from polars.dependencies import _NUMPY_AVAILABLE, _PANDAS_TYPE, _PYARROW_AVAILABLE
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
Expand Down Expand Up @@ -613,6 +613,7 @@ def sequence_to_pydf(
raise ValueError(
f"orient must be one of {{'col', 'row', None}}, got {orient} instead."
)

elif is_dataclass(data[0]):
columns = columns or [
(col, py_type_to_dtype(tp, raise_unmatched=False))
Expand All @@ -623,6 +624,22 @@ def sequence_to_pydf(
columns=columns,
)
return pydf

elif _PANDAS_TYPE(data[0]) and isinstance(data[0], (pd.Series, pd.DatetimeIndex)):
dtypes = {}
if columns is not None:
columns, dtypes = _unpack_columns(columns, n_expected=1)

data_series = []
for i, s in enumerate(data):
name = columns[i] if columns else s.name
dtype = dtypes.get(name, None)
pyseries = pandas_to_pyseries(name=name, values=s)
if dtype is not None and dtype != pyseries.dtype():
pyseries = pyseries.cast(dtype, strict=True)
data_series.append(pyseries)

columns = None
else:
columns, dtypes = _unpack_columns(columns, n_expected=1)
data_series = [pli.Series(columns[0], data, dtypes.get(columns[0]))._s]
Expand Down
29 changes: 29 additions & 0 deletions py-polars/tests/unit/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,35 @@ def test_init_pandas(monkeypatch: Any) -> None:
assert df.schema == {"x": pl.Float64, "y": pl.Float64}
assert df.rows() == [(1.0, 2.0), (3.0, 4.0)]

# subclassed pandas object, with/without data & overrides
class XSeries(pd.Series):
@property
def _constructor(self) -> type:
return XSeries

df = pl.DataFrame(
data=[
XSeries(name="x", data=[], dtype=np.dtype("<M8[ns]")),
XSeries(name="y", data=[], dtype=np.dtype("f8")),
XSeries(name="z", data=[], dtype=np.dtype("?")),
],
)
assert df.schema == {"x": pl.Datetime("ns"), "y": pl.Float64, "z": pl.Boolean}
assert df.rows() == []

df = pl.DataFrame(
data=[
XSeries(
name="x",
data=[datetime(2022, 10, 31, 10, 30, 45, 123456)],
dtype=np.dtype("<M8[ns]"),
)
],
columns={"colx": pl.Datetime("us")},
)
assert df.schema == {"colx": pl.Datetime("us")}
assert df.rows() == [(datetime(2022, 10, 31, 10, 30, 45, 123456),)]

# pandas is not available
monkeypatch.setattr(pl.internals.dataframe.frame, "_PANDAS_TYPE", lambda x: False)
with pytest.raises(ValueError):
Expand Down

0 comments on commit 55a58a3

Please sign in to comment.