Skip to content

Commit

Permalink
feat(python): handle Series init from python sequence of numpy arrays
Browse files (browse the repository at this point in the history)
  • Loading branch information
alexander-beedie committed Dec 28, 2022
1 parent b0883e0 commit eaeb703
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 28 deletions.
17 changes: 17 additions & 0 deletions py-polars/polars/datatypes_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Any, Callable, Sequence

from polars.datatypes import (
DTYPE_TEMPORAL_UNITS,
Boolean,
Categorical,
Date,
Expand Down Expand Up @@ -92,6 +93,22 @@ def _set_numpy_to_constructor() -> None:
}


def numpy_values_and_dtype(
    values: np.ndarray[Any, Any]
) -> tuple[np.ndarray[Any, Any], type]:
    """
    Normalise a numpy array and report the scalar type to build it with.

    Parameters
    ----------
    values
        The numpy array to inspect.

    Returns
    -------
    tuple
        ``(values, dtype)`` where ``dtype`` starts out as ``values.dtype.type``
        and both elements are adjusted as follows:

        * ``float16`` input is cast to ``float32`` and the dtype of the cast
          array is reported.
        * ``datetime64`` input whose time unit is in ``DTYPE_TEMPORAL_UNITS``
          is cast to ``int64``; the reported dtype stays ``np.datetime64``.
        * ``datetime64`` input with any other unit is returned unchanged,
          with ``object`` as the reported dtype.
        * everything else is returned as-is.

    """
    scalar_type = values.dtype.type

    if scalar_type == np.float16:
        upcast = values.astype(np.float32)
        return upcast, upcast.dtype.type

    if scalar_type == np.datetime64:
        # first element of ``np.datetime_data`` is the unit string of the dtype
        time_unit = np.datetime_data(values.dtype)[0]
        if time_unit not in DTYPE_TEMPORAL_UNITS:
            return values, object
        # note: values become int64, but the reported dtype is still datetime64
        return values.astype(np.int64), scalar_type

    return values, scalar_type


def numpy_type_to_constructor(dtype: type[np.dtype[Any]]) -> Callable[..., PySeries]:
"""Get the right PySeries constructor for the given Polars dtype."""
if _NUMPY_TYPE_TO_CONSTRUCTOR is None:
Expand Down
44 changes: 24 additions & 20 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

from polars import internals as pli
from polars.datatypes import (
DTYPE_TEMPORAL_UNITS,
Categorical,
ColumnsType,
Date,
Expand All @@ -37,10 +36,16 @@
)
from polars.datatypes_constructor import (
numpy_type_to_constructor,
numpy_values_and_dtype,
polars_type_to_constructor,
py_type_to_constructor,
)
from polars.dependencies import _NUMPY_AVAILABLE, _PANDAS_TYPE, _PYARROW_AVAILABLE
from polars.dependencies import (
_NUMPY_AVAILABLE,
_NUMPY_TYPE,
_PANDAS_TYPE,
_PYARROW_AVAILABLE,
)
from polars.dependencies import numpy as np
from polars.dependencies import pandas as pd
from polars.dependencies import pyarrow as pa
Expand Down Expand Up @@ -146,32 +151,18 @@ def numpy_to_pyseries(
values = np.array(values)

if len(values.shape) == 1:
dtype = values.dtype.type
if dtype == np.float16:
values = values.astype(np.float32)
dtype = values.dtype.type
elif (
dtype == np.datetime64
and np.datetime_data(values.dtype)[0] not in DTYPE_TEMPORAL_UNITS
):
dtype = object

values, dtype = numpy_values_and_dtype(values)
constructor = numpy_type_to_constructor(dtype)

if dtype == np.float32 or dtype == np.float64:
return constructor(name, values, nan_to_null)
elif dtype == np.datetime64:
return constructor(name, values.astype(np.int64), strict)
else:
return constructor(name, values, strict)
return constructor(
name, values, nan_to_null if dtype in (np.float32, np.float64) else strict
)
elif len(values.shape) == 2:
pyseries_container = []
for row in range(values.shape[0]):
pyseries_container.append(
numpy_to_pyseries("", values[row, :], strict, nan_to_null)
)
return PySeries.new_series_list(name, pyseries_container, False)

else:
return PySeries.new_object(name, values, strict)

Expand Down Expand Up @@ -250,6 +241,7 @@ def sequence_to_pyseries(
dtype: PolarsDataType | None = None,
strict: bool = True,
dtype_if_empty: PolarsDataType | None = None,
nan_to_null: bool = False,
) -> PySeries:
"""Construct a PySeries from a sequence."""
python_dtype: type | None = None
Expand Down Expand Up @@ -383,8 +375,20 @@ def sequence_to_pyseries(
# Convert mixed sequences like `[[12], "foo", 9]`
return PySeries.new_object(name, values, strict)

elif (
_NUMPY_TYPE(value)
and isinstance(value, np.ndarray)
and len(value.shape) == 1
):
return PySeries.new_series_list(
name,
[numpy_to_pyseries("", v, strict, nan_to_null) for v in values],
strict,
)

elif python_dtype == pli.Series:
return PySeries.new_series_list(name, [v._s for v in values], strict)

elif python_dtype == PySeries:
return PySeries.new_series_list(name, values, strict)
else:
Expand Down
7 changes: 6 additions & 1 deletion py-polars/polars/internals/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,12 @@ def __init__(
)
elif isinstance(values, Sequence):
self._s = sequence_to_pyseries(
name, values, dtype=dtype, strict=strict, dtype_if_empty=dtype_if_empty
name,
values,
dtype=dtype,
strict=strict,
dtype_if_empty=dtype_if_empty,
nan_to_null=nan_to_null,
)
elif _PANDAS_TYPE(values) and isinstance(values, (pd.Series, pd.DatetimeIndex)):
self._s = pandas_to_pyseries(name, values)
Expand Down
4 changes: 2 additions & 2 deletions py-polars/polars/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,7 @@ def read_sql(
Notes
-----
Make sure to install connectorx>=0.2.2. Read the documentation
Make sure to install connectorx>=0.3.1. Read the documentation
`here <https://sfu-db.github.io/connector-x/intro.html>`_.
Examples
Expand Down Expand Up @@ -1100,7 +1100,7 @@ def read_sql(
import connectorx as cx
except ImportError:
raise ImportError(
"connectorx is not installed. Please run `pip install connectorx>=0.2.2`."
"connectorx is not installed. Please run `pip install connectorx>=0.3.1`."
) from None

tbl = cx.read_sql(
Expand Down
34 changes: 29 additions & 5 deletions py-polars/tests/unit/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_cum_agg() -> None:


def test_init_inputs(monkeypatch: Any) -> None:
nan = float("nan")
for flag in [False, True]:
monkeypatch.setattr(pl.internals.construction, "_PYARROW_AVAILABLE", flag)
# Good inputs
Expand Down Expand Up @@ -71,11 +72,34 @@ def test_init_inputs(monkeypatch: Any) -> None:
)
assert pl.Series("a", [10000, 20000, 30000], dtype=pl.Time).dtype == pl.Time

# 2d numpy array
res = pl.Series(name="a", values=np.array([[1, 2], [3, 4]], dtype=np.int64))
assert res.dtype == pl.List(pl.Int64)
assert res[0].to_list() == [1, 2]
assert res[1].to_list() == [3, 4]
# 2d numpy array and/or list of 1d numpy arrays
for res in (
pl.Series(
name="a",
values=np.array([[1, 2], [3, nan]], dtype=np.float32),
nan_to_null=True,
),
pl.Series(
name="a",
values=[
np.array([1, 2], dtype=np.float32),
np.array([3, nan], dtype=np.float32),
],
nan_to_null=True,
),
pl.Series(
name="a",
values=(
np.ndarray((2,), np.float32, np.array([1, 2], dtype=np.float32)),
np.ndarray((2,), np.float32, np.array([3, nan], dtype=np.float32)),
),
nan_to_null=True,
),
):
assert res.dtype == pl.List(pl.Float32)
assert res[0].to_list() == [1.0, 2.0]
assert res[1].to_list() == [3.0, None]

assert pl.Series(
values=np.array([["foo", "bar"], ["foo2", "bar2"]])
).dtype == pl.List(pl.Utf8)
Expand Down

0 comments on commit eaeb703

Please sign in to comment.