feat[python]: support np.datetime64 init for Series (#4473)

pola-rs · Aug 21, 2022 · 82257cb · 82257cb
1 parent 5eb0837
commit 82257cb
Show file tree

Hide file tree

Showing 5 changed files with 133 additions and 27 deletions.
diff --git a/py-polars/polars/datatypes_constructor.py b/py-polars/polars/datatypes_constructor.py
@@ -91,6 +91,7 @@ def polars_type_to_constructor(
         np.uint64: PySeries.new_u64,
         np.str_: PySeries.new_str,
         np.bool_: PySeries.new_bool,
+        np.datetime64: PySeries.new_i64,
     }
 
 

diff --git a/py-polars/polars/internals/construction.py b/py-polars/polars/internals/construction.py
@@ -7,6 +7,7 @@
 
 from polars import internals as pli
 from polars.datatypes import (
+    DTYPE_TEMPORAL_UNITS,
     Categorical,
     ColumnsType,
     Date,
@@ -101,9 +102,18 @@ def numpy_to_pyseries(
         if dtype == np.float16:
             values = values.astype(np.float32)
             dtype = values.dtype.type
+        elif (
+            dtype == np.datetime64
+            and np.datetime_data(values.dtype)[0] not in DTYPE_TEMPORAL_UNITS
+        ):
+            dtype = object
+
         constructor = numpy_type_to_constructor(dtype)
+
         if dtype == np.float32 or dtype == np.float64:
             return constructor(name, values, nan_to_null)
+        elif dtype == np.datetime64:
+            return constructor(name, values.astype(np.int64), strict)
         else:
             return constructor(name, values, strict)
     else:

diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py
@@ -20,6 +20,7 @@
     Int64,
     List,
     Object,
+    PolarsDataType,
     Time,
     UInt8,
     UInt16,
@@ -102,10 +103,25 @@
     )
 
 
+def _resolve_datetime_dtype(
+    dtype: PolarsDataType | None, ndtype: np.datetime64
+) -> PolarsDataType | None:
+    """Given polars/numpy datetime dtypes, resolve to an explicit unit."""
+    if dtype is None or (dtype == Datetime and not getattr(dtype, "tu", None)):
+        tu = getattr(dtype, "tu", np.datetime_data(ndtype)[0])
+        # explicit formulation is verbose, but keeps mypy happy
+        # (and avoids unsupported time units such as "s")
+        if tu == "ns":
+            dtype = Datetime("ns")
+        elif tu == "us":
+            dtype = Datetime("us")
+        elif tu == "ms":
+            dtype = Datetime("ms")
+    return dtype
+
+
 def get_ffi_func(
-    name: str,
-    dtype: type[DataType],
-    obj: PySeries,
+    name: str, dtype: type[DataType], obj: PySeries
 ) -> Callable[..., Any] | None:
     """
     Dynamically obtain the proper ffi function/ method.
@@ -161,10 +177,10 @@ class Series:
     dtype : DataType, default None
         Polars dtype of the Series data. If not specified, the dtype is inferred.
     strict
-        Throw error on numeric overflow
+        Throw error on numeric overflow.
     nan_to_null
-        In case a numpy arrow is used to create this Series, indicate how to deal with
-        np.nan
+        In case a numpy array is used to create this Series, indicate how to deal
+        with np.nan values.
 
     Examples
     --------
@@ -241,6 +257,17 @@ def __init__(
             self._s = arrow_to_pyseries(name, values)
         elif _NUMPY_AVAILABLE and isinstance(values, np.ndarray):
             self._s = numpy_to_pyseries(name, values, strict, nan_to_null)
+            if values.dtype.type == np.datetime64:
+                # cast to appropriate dtype, handling NaT values
+                dtype = _resolve_datetime_dtype(dtype, values.dtype)
+                if dtype is not None:
+                    self._s = (
+                        self.cast(dtype)
+                        .set_at_idx(np.argwhere(np.isnat(values)).flatten(), None)
+                        ._s
+                    )
+                    return
+
             if dtype is not None:
                 self._s = self.cast(dtype, strict=True)._s
         elif isinstance(values, Sequence):
@@ -269,10 +296,7 @@ def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = True) -> Serie
 
     @classmethod
     def _from_pandas(
-        cls,
-        name: str,
-        values: pd.Series | pd.DatetimeIndex,
-        nan_to_none: bool = True,
+        cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_none: bool = True
     ) -> Series:
         """Construct a Series from a pandas Series or DatetimeIndex."""
         return cls._from_pyseries(
@@ -2350,11 +2374,7 @@ def __array__(self, dtype: Any = None) -> np.ndarray[Any, Any]:
             return self.to_numpy().__array__()
 
     def __array_ufunc__(
-        self,
-        ufunc: np.ufunc,
-        method: str,
-        *inputs: Any,
-        **kwargs: Any,
+        self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
     ) -> Series:
         """Numpy universal functions."""
         if not _NUMPY_AVAILABLE:
@@ -2534,7 +2554,7 @@ def set(self, filter: Series, value: int | float | str) -> Series:
 
     def set_at_idx(
         self,
-        idx: Series | np.ndarray[Any, Any] | list[int] | tuple[int],
+        idx: Series | np.ndarray[Any, Any] | Sequence[int] | int,
         value: int
         | float
         | str
@@ -2545,7 +2565,8 @@ def set_at_idx(
         | Sequence[datetime]
         | date
         | datetime
-        | Series,
+        | Series
+        | None,
     ) -> Series:
         """
         Set values at the index locations.
@@ -2566,9 +2587,14 @@ def set_at_idx(
         the series mutated
 
         """
+        if isinstance(idx, int):
+            idx = [idx]
+        if len(idx) == 0:
+            return self
+
         if self.is_numeric() or self.is_datelike():
             idx = Series("", idx)
-            if isinstance(value, (int, float, bool)):
+            if isinstance(value, (int, float, bool)) or (value is None):
                 value = Series("", [value])
 
                 # if we need to set more than a single value, we extend it
@@ -2588,7 +2614,7 @@ def set_at_idx(
             )
         if isinstance(idx, Series):
             # make sure the dtype matches
-            idx = idx.cast(UInt32)
+            idx = idx.cast(get_idx_type())
             idx_array = idx.view()
         elif _NUMPY_AVAILABLE and isinstance(idx, np.ndarray):
             if not idx.data.c_contiguous:
@@ -2597,7 +2623,6 @@ def set_at_idx(
                 idx_array = idx
                 if idx_array.dtype != np.uint32:
                     idx_array = np.array(idx_array, np.uint32)
-
         else:
             if not _NUMPY_AVAILABLE:
                 raise ImportError("'numpy' is required for this functionality.")
@@ -3046,9 +3071,7 @@ def tanh(self) -> Series:
         return self.to_frame().select(pli.col(self.name).tanh()).to_series()
 
     def apply(
-        self,
-        func: Callable[[Any], Any],
-        return_dtype: type[DataType] | None = None,
+        self, func: Callable[[Any], Any], return_dtype: type[DataType] | None = None
     ) -> Series:
         """
         Apply a function over elements in this Series and return a new Series.

diff --git a/py-polars/tests/test_df.py b/py-polars/tests/test_df.py
@@ -921,17 +921,39 @@ def test_literal_series() -> None:
     df = pl.DataFrame(
         {
             "a": np.array([21.7, 21.8, 21], dtype=np.float32),
-            "b": np.array([1, 3, 2], dtype=np.int64),
+            "b": np.array([1, 3, 2], dtype=np.int8),
             "c": ["reg1", "reg2", "reg3"],
+            "d": np.array(
+                [datetime(2022, 8, 16), datetime(2022, 8, 17), datetime(2022, 8, 18)],
+                dtype="<M8[ns]",
+            ),
         }
     )
     out = (
         df.lazy()
-        .with_column(pl.Series("e", [2, 1, 3]))  # type: ignore[arg-type]
+        .with_column(pl.Series("e", [2, 1, 3], pl.Int32))  # type: ignore[arg-type]
         .with_column(pl.col("e").cast(pl.Float32))
         .collect()
     )
-    assert out["e"] == [2, 1, 3]
+    expected_schema = {
+        "a": pl.Float32,
+        "b": pl.Int8,
+        "c": pl.Utf8,
+        "d": pl.Datetime("ns"),
+        "e": pl.Float32,
+    }
+    assert_frame_equal(
+        pl.DataFrame(
+            [
+                (21.7, 1, "reg1", datetime(2022, 8, 16, 0), 2),
+                (21.8, 3, "reg2", datetime(2022, 8, 17, 0), 1),
+                (21.0, 2, "reg3", datetime(2022, 8, 18, 0), 3),
+            ],
+            columns=expected_schema,  # type: ignore[arg-type]
+        ),
+        out,
+        atol=0.00001,
+    )
 
 
 def test_to_html(df: pl.DataFrame) -> None:

diff --git a/py-polars/tests/test_series.py b/py-polars/tests/test_series.py
@@ -10,7 +10,7 @@
 import pytest
 
 import polars as pl
-from polars.datatypes import Date, Float64, Int32, Int64, UInt32, UInt64
+from polars.datatypes import Date, Datetime, Float64, Int32, Int64, UInt32, UInt64
 from polars.testing import assert_series_equal, verify_series_and_expr_api
 
 
@@ -45,6 +45,7 @@ def test_init_inputs(monkeypatch: Any) -> None:
             == pl.List
         )
         assert pl.Series("a", [10000, 20000, 30000], dtype=pl.Time).dtype == pl.Time
+
         # 2d numpy array
         res = pl.Series(name="a", values=np.array([[1, 2], [3, 4]]))
         assert all(res[0] == np.array([1, 2]))
@@ -57,6 +58,21 @@ def test_init_inputs(monkeypatch: Any) -> None:
         # lists
         assert pl.Series("a", [[1, 2], [3, 4]]).dtype == pl.List
 
+    # datetime64: check timeunit (auto-detect, implicit/explicit) and NaT
+    d64 = pd.date_range(date(2021, 8, 1), date(2021, 8, 3)).values
+    d64[1] = None
+
+    expected = [datetime(2021, 8, 1, 0), None, datetime(2021, 8, 3, 0)]
+    for dtype in (None, Datetime, Datetime("ns")):
+        s = pl.Series("dates", d64, dtype)
+        assert s.to_list() == expected
+        assert Datetime == s.dtype
+        assert s.dtype.tu == "ns"  # type: ignore[attr-defined]
+
+    s = pl.Series(values=d64.astype("<M8[ms]"))
+    assert s.dtype.tu == "ms"  # type: ignore[attr-defined]
+    assert expected == s.to_list()
+
     # pandas
     assert pl.Series(pd.Series([1, 2])).dtype == pl.Int64
 
@@ -1794,3 +1810,37 @@ def test_mutable_borrowed_append_3915() -> None:
     s = pl.Series("s", [1, 2, 3])
     s.append(s)
     assert s.to_list() == [1, 2, 3, 1, 2, 3]
+
+
+def test_set_at_idx() -> None:
+    s = pl.Series("s", [1, 2, 3])
+
+    # no-op (empty sequences)
+    for x in (
+        (),
+        [],
+        pl.Series(),
+        pl.Series(dtype=pl.Int8),
+        np.array([]),
+        np.ndarray(shape=(0, 0)),
+    ):
+        s.set_at_idx(x, 8)  # type: ignore[arg-type]
+        assert s.to_list() == [1, 2, 3]
+
+    # set new values, one index at a time
+    s.set_at_idx(0, 8)
+    s.set_at_idx([1], None)
+    assert s.to_list() == [8, None, 3]
+
+    # set new value at multiple indexes in one go
+    s.set_at_idx([0, 2], None)
+    assert s.to_list() == [None, None, None]
+
+    # try with different series dtype
+    s = pl.Series("s", ["a", "b", "c"])
+    s.set_at_idx((1, 2), "x")
+    assert s.to_list() == ["a", "x", "x"]
+
+    # expected error condition
+    with pytest.raises(TypeError):
+        s.set_at_idx([0, 2], 0.12345)