Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): handle Series init from python sequence of numpy arrays #5918

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions py-polars/polars/datatypes_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any, Callable, Sequence

from polars.datatypes import (
DTYPE_TEMPORAL_UNITS,
Boolean,
Categorical,
Date,
Expand Down Expand Up @@ -92,6 +93,22 @@ def _set_numpy_to_constructor() -> None:
}


def numpy_values_and_dtype(
    values: np.ndarray[Any, Any]
) -> tuple[np.ndarray[Any, Any], type]:
    """
    Pair a numpy array with the scalar type that describes it.

    Parameters
    ----------
    values
        The numpy array to inspect.

    Returns
    -------
    tuple
        ``(values, dtype)``, where both elements may have been adjusted:

        * float16 input is cast to float32, and float32 is the reported type.
        * datetime64 input whose time unit is listed in
          ``DTYPE_TEMPORAL_UNITS`` is cast to int64, while the reported type
          stays ``np.datetime64``.
        * datetime64 input with any other time unit is returned untouched and
          reported as ``object``.
        * every other input is returned untouched with its own scalar type.

    """
    scalar_type = values.dtype.type

    if scalar_type == np.float16:
        # Widen half-precision floats; report the type of the widened array.
        widened = values.astype(np.float32)
        return widened, widened.dtype.type

    if scalar_type != np.datetime64:
        # Nothing to adjust for the remaining (non-datetime) types.
        return values, scalar_type

    # The first item of `np.datetime_data` is the unit string of the dtype.
    time_unit = np.datetime_data(values.dtype)[0]
    if time_unit not in DTYPE_TEMPORAL_UNITS:
        return values, object

    # Known unit: hand back the int64 representation, still tagged datetime64.
    return values.astype(np.int64), scalar_type


def numpy_type_to_constructor(dtype: type[np.dtype[Any]]) -> Callable[..., PySeries]:
"""Get the right PySeries constructor for the given Polars dtype."""
if _NUMPY_TYPE_TO_CONSTRUCTOR is None:
Expand Down
44 changes: 24 additions & 20 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

from polars import internals as pli
from polars.datatypes import (
DTYPE_TEMPORAL_UNITS,
Categorical,
ColumnsType,
Date,
Expand All @@ -37,10 +36,16 @@
)
from polars.datatypes_constructor import (
numpy_type_to_constructor,
numpy_values_and_dtype,
polars_type_to_constructor,
py_type_to_constructor,
)
from polars.dependencies import _NUMPY_AVAILABLE, _PANDAS_TYPE, _PYARROW_AVAILABLE
from polars.dependencies import (
_NUMPY_AVAILABLE,
_NUMPY_TYPE,
_PANDAS_TYPE,
_PYARROW_AVAILABLE,
)
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
Expand Down Expand Up @@ -146,32 +151,18 @@ def numpy_to_pyseries(
values = np.array(values)

if len(values.shape) == 1:
dtype = values.dtype.type
if dtype == np.float16:
values = values.astype(np.float32)
dtype = values.dtype.type
elif (
dtype == np.datetime64
and np.datetime_data(values.dtype)[0] not in DTYPE_TEMPORAL_UNITS
):
dtype = object

values, dtype = numpy_values_and_dtype(values)
constructor = numpy_type_to_constructor(dtype)

if dtype == np.float32 or dtype == np.float64:
return constructor(name, values, nan_to_null)
elif dtype == np.datetime64:
return constructor(name, values.astype(np.int64), strict)
else:
return constructor(name, values, strict)
return constructor(
name, values, nan_to_null if dtype in (np.float32, np.float64) else strict
)
elif len(values.shape) == 2:
pyseries_container = []
for row in range(values.shape[0]):
pyseries_container.append(
numpy_to_pyseries("", values[row, :], strict, nan_to_null)
)
return PySeries.new_series_list(name, pyseries_container, False)

else:
return PySeries.new_object(name, values, strict)

Expand Down Expand Up @@ -250,6 +241,7 @@ def sequence_to_pyseries(
dtype: PolarsDataType | None = None,
strict: bool = True,
dtype_if_empty: PolarsDataType | None = None,
nan_to_null: bool = False,
) -> PySeries:
"""Construct a PySeries from a sequence."""
python_dtype: type | None = None
Expand Down Expand Up @@ -383,8 +375,20 @@ def sequence_to_pyseries(
# Convert mixed sequences like `[[12], "foo", 9]`
return PySeries.new_object(name, values, strict)

elif (
_NUMPY_TYPE(value)
and isinstance(value, np.ndarray)
and len(value.shape) == 1
):
return PySeries.new_series_list(
name,
[numpy_to_pyseries("", v, strict, nan_to_null) for v in values],
strict,
)

elif python_dtype == pli.Series:
return PySeries.new_series_list(name, [v._s for v in values], strict)

elif python_dtype == PySeries:
return PySeries.new_series_list(name, values, strict)
else:
Expand Down
7 changes: 6 additions & 1 deletion py-polars/polars/internals/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,12 @@ def __init__(
)
elif isinstance(values, Sequence):
self._s = sequence_to_pyseries(
name, values, dtype=dtype, strict=strict, dtype_if_empty=dtype_if_empty
name,
values,
dtype=dtype,
strict=strict,
dtype_if_empty=dtype_if_empty,
nan_to_null=nan_to_null,
)
elif _PANDAS_TYPE(values) and isinstance(values, (pd.Series, pd.DatetimeIndex)):
self._s = pandas_to_pyseries(name, values)
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,7 @@ def read_sql(

Notes
-----
Make sure to install connectorx>=0.2.2. Read the documentation
Make sure to install connectorx>=0.3.1. Read the documentation
`here <https://sfu-db.github.io/connector-x/intro.html>`_.

Examples
Expand Down Expand Up @@ -1100,7 +1100,7 @@ def read_sql(
import connectorx as cx
except ImportError:
raise ImportError(
"connectorx is not installed. Please run `pip install connectorx>=0.2.2`."
"connectorx is not installed. Please run `pip install connectorx>=0.3.1`."
) from None

tbl = cx.read_sql(
Expand Down
34 changes: 29 additions & 5 deletions py-polars/tests/unit/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_cum_agg() -> None:


def test_init_inputs(monkeypatch: Any) -> None:
nan = float("nan")
for flag in [False, True]:
monkeypatch.setattr(pl.internals.construction, "_PYARROW_AVAILABLE", flag)
# Good inputs
Expand Down Expand Up @@ -71,11 +72,34 @@ def test_init_inputs(monkeypatch: Any) -> None:
)
assert pl.Series("a", [10000, 20000, 30000], dtype=pl.Time).dtype == pl.Time

# 2d numpy array
res = pl.Series(name="a", values=np.array([[1, 2], [3, 4]], dtype=np.int64))
assert res.dtype == pl.List(pl.Int64)
assert res[0].to_list() == [1, 2]
assert res[1].to_list() == [3, 4]
# 2d numpy array and/or list of 1d numpy arrays
for res in (
pl.Series(
name="a",
values=np.array([[1, 2], [3, nan]], dtype=np.float32),
nan_to_null=True,
),
pl.Series(
name="a",
values=[
np.array([1, 2], dtype=np.float32),
np.array([3, nan], dtype=np.float32),
],
nan_to_null=True,
),
pl.Series(
name="a",
values=(
np.ndarray((2,), np.float32, np.array([1, 2], dtype=np.float32)),
np.ndarray((2,), np.float32, np.array([3, nan], dtype=np.float32)),
),
nan_to_null=True,
),
):
assert res.dtype == pl.List(pl.Float32)
assert res[0].to_list() == [1.0, 2.0]
assert res[1].to_list() == [3.0, None]

assert pl.Series(
values=np.array([["foo", "bar"], ["foo2", "bar2"]])
).dtype == pl.List(pl.Utf8)
Expand Down