Skip to content

Commit

Permalink
feat[python]: support np.datetime64 init for Series (#4473)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Aug 21, 2022
1 parent 5eb0837 commit 82257cb
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 27 deletions.
1 change: 1 addition & 0 deletions py-polars/polars/datatypes_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def polars_type_to_constructor(
np.uint64: PySeries.new_u64,
np.str_: PySeries.new_str,
np.bool_: PySeries.new_bool,
np.datetime64: PySeries.new_i64,
}


Expand Down
10 changes: 10 additions & 0 deletions py-polars/polars/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from polars import internals as pli
from polars.datatypes import (
DTYPE_TEMPORAL_UNITS,
Categorical,
ColumnsType,
Date,
Expand Down Expand Up @@ -101,9 +102,18 @@ def numpy_to_pyseries(
if dtype == np.float16:
values = values.astype(np.float32)
dtype = values.dtype.type
elif (
dtype == np.datetime64
and np.datetime_data(values.dtype)[0] not in DTYPE_TEMPORAL_UNITS
):
dtype = object

constructor = numpy_type_to_constructor(dtype)

if dtype == np.float32 or dtype == np.float64:
return constructor(name, values, nan_to_null)
elif dtype == np.datetime64:
return constructor(name, values.astype(np.int64), strict)
else:
return constructor(name, values, strict)
else:
Expand Down
69 changes: 46 additions & 23 deletions py-polars/polars/internals/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Int64,
List,
Object,
PolarsDataType,
Time,
UInt8,
UInt16,
Expand Down Expand Up @@ -102,10 +103,25 @@
)


def _resolve_datetime_dtype(
dtype: PolarsDataType | None, ndtype: np.datetime64
) -> PolarsDataType | None:
"""Given polars/numpy datetime dtypes, resolve to an explicit unit"""
if dtype is None or (dtype == Datetime and not getattr(dtype, "tu", None)):
tu = getattr(dtype, "tu", np.datetime_data(ndtype)[0])
# explicit formulation is verbose, but keeps mypy happy
# (and avoids unsupported timeunits such as "s")
if tu == "ns":
dtype = Datetime("ns")
elif tu == "us":
dtype = Datetime("us")
elif tu == "ms":
dtype = Datetime("ms")
return dtype


def get_ffi_func(
name: str,
dtype: type[DataType],
obj: PySeries,
name: str, dtype: type[DataType], obj: PySeries
) -> Callable[..., Any] | None:
"""
Dynamically obtain the proper ffi function/ method.
Expand Down Expand Up @@ -161,10 +177,10 @@ class Series:
dtype : DataType, default None
Polars dtype of the Series data. If not specified, the dtype is inferred.
strict
Throw error on numeric overflow
Throw error on numeric overflow.
nan_to_null
In case a numpy arrow is used to create this Series, indicate how to deal with
np.nan
In case a numpy array is used to create this Series, indicate how to deal
with np.nan values.
Examples
--------
Expand Down Expand Up @@ -241,6 +257,17 @@ def __init__(
self._s = arrow_to_pyseries(name, values)
elif _NUMPY_AVAILABLE and isinstance(values, np.ndarray):
self._s = numpy_to_pyseries(name, values, strict, nan_to_null)
if values.dtype.type == np.datetime64:
# cast to appropriate dtype, handling NaT values
dtype = _resolve_datetime_dtype(dtype, values.dtype)
if dtype is not None:
self._s = (
self.cast(dtype)
.set_at_idx(np.argwhere(np.isnat(values)).flatten(), None)
._s
)
return

if dtype is not None:
self._s = self.cast(dtype, strict=True)._s
elif isinstance(values, Sequence):
Expand Down Expand Up @@ -269,10 +296,7 @@ def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = True) -> Serie

@classmethod
def _from_pandas(
cls,
name: str,
values: pd.Series | pd.DatetimeIndex,
nan_to_none: bool = True,
cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_none: bool = True
) -> Series:
"""Construct a Series from a pandas Series or DatetimeIndex."""
return cls._from_pyseries(
Expand Down Expand Up @@ -2350,11 +2374,7 @@ def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]:
return self.to_numpy().__array__()

def __array_ufunc__(
self,
ufunc: np.ufunc,
method: str,
*inputs: Any,
**kwargs: Any,
self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
) -> Series:
"""Numpy universal functions."""
if not _NUMPY_AVAILABLE:
Expand Down Expand Up @@ -2534,7 +2554,7 @@ def set(self, filter: Series, value: int | float | str) -> Series:

def set_at_idx(
self,
idx: Series | np.ndarray[Any, Any] | list[int] | tuple[int],
idx: Series | np.ndarray[Any, Any] | Sequence[int] | int,
value: int
| float
| str
Expand All @@ -2545,7 +2565,8 @@ def set_at_idx(
| Sequence[datetime]
| date
| datetime
| Series,
| Series
| None,
) -> Series:
"""
Set values at the index locations.
Expand All @@ -2566,9 +2587,14 @@ def set_at_idx(
the series mutated
"""
if isinstance(idx, int):
idx = [idx]
if len(idx) == 0:
return self

if self.is_numeric() or self.is_datelike():
idx = Series("", idx)
if isinstance(value, (int, float, bool)):
if isinstance(value, (int, float, bool)) or (value is None):
value = Series("", [value])

# if we need to set more than a single value, we extend it
Expand All @@ -2588,7 +2614,7 @@ def set_at_idx(
)
if isinstance(idx, Series):
# make sure the dtype matches
idx = idx.cast(UInt32)
idx = idx.cast(get_idx_type())
idx_array = idx.view()
elif _NUMPY_AVAILABLE and isinstance(idx, np.ndarray):
if not idx.data.c_contiguous:
Expand All @@ -2597,7 +2623,6 @@ def set_at_idx(
idx_array = idx
if idx_array.dtype != np.uint32:
idx_array = np.array(idx_array, np.uint32)

else:
if not _NUMPY_AVAILABLE:
raise ImportError("'numpy' is required for this functionality.")
Expand Down Expand Up @@ -3046,9 +3071,7 @@ def tanh(self) -> Series:
return self.to_frame().select(pli.col(self.name).tanh()).to_series()

def apply(
self,
func: Callable[[Any], Any],
return_dtype: type[DataType] | None = None,
self, func: Callable[[Any], Any], return_dtype: type[DataType] | None = None
) -> Series:
"""
Apply a function over elements in this Series and return a new Series.
Expand Down
28 changes: 25 additions & 3 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -921,17 +921,39 @@ def test_literal_series() -> None:
df = pl.DataFrame(
{
"a": np.array([21.7, 21.8, 21], dtype=np.float32),
"b": np.array([1, 3, 2], dtype=np.int64),
"b": np.array([1, 3, 2], dtype=np.int8),
"c": ["reg1", "reg2", "reg3"],
"d": np.array(
[datetime(2022, 8, 16), datetime(2022, 8, 17), datetime(2022, 8, 18)],
dtype="<M8[ns]",
),
}
)
out = (
df.lazy()
.with_column(pl.Series("e", [2, 1, 3])) # type: ignore[arg-type]
.with_column(pl.Series("e", [2, 1, 3], pl.Int32)) # type: ignore[arg-type]
.with_column(pl.col("e").cast(pl.Float32))
.collect()
)
assert out["e"] == [2, 1, 3]
expected_schema = {
"a": pl.Float32,
"b": pl.Int8,
"c": pl.Utf8,
"d": pl.Datetime("ns"),
"e": pl.Float32,
}
assert_frame_equal(
pl.DataFrame(
[
(21.7, 1, "reg1", datetime(2022, 8, 16, 0), 2),
(21.8, 3, "reg2", datetime(2022, 8, 17, 0), 1),
(21.0, 2, "reg3", datetime(2022, 8, 18, 0), 3),
],
columns=expected_schema, # type: ignore[arg-type]
),
out,
atol=0.00001,
)


def test_to_html(df: pl.DataFrame) -> None:
Expand Down
52 changes: 51 additions & 1 deletion py-polars/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pytest

import polars as pl
from polars.datatypes import Date, Float64, Int32, Int64, UInt32, UInt64
from polars.datatypes import Date, Datetime, Float64, Int32, Int64, UInt32, UInt64
from polars.testing import assert_series_equal, verify_series_and_expr_api


Expand Down Expand Up @@ -45,6 +45,7 @@ def test_init_inputs(monkeypatch: Any) -> None:
== pl.List
)
assert pl.Series("a", [10000, 20000, 30000], dtype=pl.Time).dtype == pl.Time

# 2d numpy array
res = pl.Series(name="a", values=np.array([[1, 2], [3, 4]]))
assert all(res[0] == np.array([1, 2]))
Expand All @@ -57,6 +58,21 @@ def test_init_inputs(monkeypatch: Any) -> None:
# lists
assert pl.Series("a", [[1, 2], [3, 4]]).dtype == pl.List

# datetime64: check timeunit (auto-detect, implicit/explicit) and NaT
d64 = pd.date_range(date(2021, 8, 1), date(2021, 8, 3)).values
d64[1] = None

expected = [datetime(2021, 8, 1, 0), None, datetime(2021, 8, 3, 0)]
for dtype in (None, Datetime, Datetime("ns")):
s = pl.Series("dates", d64, dtype)
assert s.to_list() == expected
assert Datetime == s.dtype
assert s.dtype.tu == "ns" # type: ignore[attr-defined]

s = pl.Series(values=d64.astype("<M8[ms]"))
assert s.dtype.tu == "ms" # type: ignore[attr-defined]
assert expected == s.to_list()

# pandas
assert pl.Series(pd.Series([1, 2])).dtype == pl.Int64

Expand Down Expand Up @@ -1794,3 +1810,37 @@ def test_mutable_borrowed_append_3915() -> None:
s = pl.Series("s", [1, 2, 3])
s.append(s)
assert s.to_list() == [1, 2, 3, 1, 2, 3]


def test_set_at_idx() -> None:
    """Check `set_at_idx` with empty, scalar and multi-element index input."""
    series = pl.Series("s", [1, 2, 3])

    # each of these empty index inputs must leave the data untouched
    empty_indexes = (
        (),
        [],
        pl.Series(),
        pl.Series(dtype=pl.Int8),
        np.array([]),
        np.ndarray(shape=(0, 0)),
    )
    for no_idx in empty_indexes:
        series.set_at_idx(no_idx, 8)  # type: ignore[arg-type]
        assert series.to_list() == [1, 2, 3]

    # a bare int index and a one-element list, applied one after the other
    series.set_at_idx(0, 8)
    series.set_at_idx([1], None)
    assert series.to_list() == [8, None, 3]

    # several positions nulled by a single call
    series.set_at_idx([0, 2], None)
    assert series.to_list() == [None, None, None]

    # same operation on a string series, with a tuple of indexes
    strings = pl.Series("s", ["a", "b", "c"])
    strings.set_at_idx((1, 2), "x")
    assert strings.to_list() == ["a", "x", "x"]

    # a float value is rejected for the string series
    with pytest.raises(TypeError):
        strings.set_at_idx([0, 2], 0.12345)

0 comments on commit 82257cb

Please sign in to comment.