Complete pythonic slice support (inc. negative indexing/stride) for DataFrame and Series (#3904)
pola-rs · Jul 5, 2022 · 214ac46 · 214ac46
1 parent f6ff3d3
commit 214ac46
Show file tree

Hide file tree

Showing 8 changed files with 244 additions and 57 deletions.
diff --git a/py-polars/build.requirements.txt b/py-polars/build.requirements.txt
@@ -13,7 +13,7 @@ types-pytz
 maturin==0.12.19
 pytest==7.1.2
 pytest-cov[toml]==3.0.0
-hypothesis==6.48
+hypothesis==6.49.1
 black==22.3.0
 blackdoc==0.3.4
 isort~=5.10.1

diff --git a/py-polars/polars/internals/frame.py b/py-polars/polars/internals/frame.py
@@ -34,6 +34,7 @@
     sequence_to_pydf,
     series_to_pydf,
 )
+from polars.internals.functions import PolarsSlice
 from polars.utils import (
     _prepare_row_count_args,
     _process_null_values,
@@ -1735,30 +1736,7 @@ def __getitem__(
 
         # df[:]
         if isinstance(item, slice):
-            # special case df[::-1]
-            if item.start is None and item.stop is None and item.step == -1:
-                return self.reverse()
-
-            if getattr(item, "end", False):
-                raise ValueError("A slice with steps larger than 1 is not supported.")
-            if item.start is None:
-                start = 0
-            else:
-                start = item.start
-            if item.stop is None:
-                stop = self.height
-            else:
-                stop = item.stop
-
-            length = stop - start
-            if item.step is None:
-                # df[start:stop]
-                return self.slice(start, length)
-            else:
-                # df[start:stop:step]
-                return self.select(
-                    pli.col("*").slice(start, length).take_every(item.step)
-                )
+            return PolarsSlice(self).apply(item)  # type: ignore
 
         # select rows by numpy mask or index
         # df[[1, 2, 3]]

diff --git a/py-polars/polars/internals/functions.py b/py-polars/polars/internals/functions.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from datetime import date, datetime, timedelta
-from typing import Sequence, overload
+from typing import Sequence, Union, overload
 
 from polars import internals as pli
 from polars.datatypes import Date
@@ -255,3 +255,107 @@ def date_range(
         dt_range = dt_range.cast(Date)
 
     return dt_range
+
+
+FrameOrSeries = Union["pli.DataFrame", "pli.Series"]
+
+# TODO:
+# class LazyPolarsSlice:
+
+
+class PolarsSlice:
+    """
+    Apply python slice object to Polars DataFrame or Series,
+    with full support for negative indexing and/or stride.
+    """
+
+    stop: int
+    start: int
+    stride: int
+    slice_length: int
+    is_unbounded: bool
+    _positive_indices: bool
+    obj: FrameOrSeries
+
+    def __init__(self, obj: FrameOrSeries):
+        self.obj = obj
+
+    @staticmethod
+    def _as_original(lazy: "pli.LazyFrame", obj: FrameOrSeries) -> FrameOrSeries:
+        """
+        Collect the lazy result, returning it as the same type as `obj`.
+        """
+        frame = lazy.collect()
+        return frame if isinstance(obj, pli.DataFrame) else frame.to_series()
+
+    @staticmethod
+    def _lazify(obj: FrameOrSeries) -> "pli.LazyFrame":
+        """
+        Make lazy to ensure efficient/consistent handling.
+        """
+        return obj.lazy() if isinstance(obj, pli.DataFrame) else obj.to_frame().lazy()
+
+    def _slice_positive(self, obj: "pli.LazyFrame") -> "pli.LazyFrame":
+        """
+        Logic for slices with positive stride.
+        """
+        return obj.slice(self.start, self.slice_length).take_every(self.stride)
+
+    def _slice_negative(self, obj: "pli.LazyFrame") -> "pli.LazyFrame":
+        """
+        Logic for slices with negative stride (take forward window, then reverse).
+        """
+        stride = abs(self.stride)
+        lazyslice = obj.slice(self.stop + 1, self.slice_length)
+        if self.slice_length == 1:
+            return lazyslice
+        lazyslice = lazyslice.reverse()
+        return lazyslice.take_every(stride) if (stride > 1) else lazyslice
+
+    def _slice_setup(self, s: slice) -> None:
+        """
+        Normalise slice bounds, identify unbounded and/or zero-length slices.
+        """
+        obj_len = len(self.obj)
+        start, stop, stride = s.indices(obj_len)
+        if stride >= 1:
+            self.is_unbounded = start <= 0 and stop >= obj_len
+        else:
+            # indices() never returns None; reverse slices reaching row 0 give stop=-1
+            self.is_unbounded = stop == -1 and start >= obj_len - 1
+        self._positive_indices = start >= 0 and stop >= 0
+        self.slice_length = (
+            0
+            if self.obj.is_empty()
+            or (
+                (start == stop)
+                or (stride > 0 and start > stop)
+                or (stride < 0 and start < stop)
+            )
+            else abs(stop - start)
+        )
+        self.start, self.stop, self.stride = start, stop, stride
+
+    def apply(self, s: slice) -> FrameOrSeries:
+        """
+        Apply a slice operation, taking advantage of any potential fast paths.
+        """
+        self._slice_setup(s)
+
+        # check for fast-paths / early-exit
+        if self.slice_length == 0:
+            return self.obj.cleared()
+
+        elif self.is_unbounded and self.stride in (-1, 1):
+            return self.obj.reverse() if (self.stride < 0) else self.obj.clone()
+
+        elif self._positive_indices and self.stride == 1:
+            return self.obj.slice(self.start, self.slice_length)
+
+        lazyobj = self._lazify(self.obj)
+        sliced = (
+            self._slice_positive(lazyobj)
+            if self.stride > 0
+            else self._slice_negative(lazyobj)
+        )
+        return self._as_original(sliced, self.obj)
diff --git a/py-polars/polars/internals/series.py b/py-polars/polars/internals/series.py
@@ -41,6 +41,7 @@
     sequence_to_pyseries,
     series_to_pyseries,
 )
+from polars.internals.functions import PolarsSlice
 from polars.utils import (
     _date_to_pl_date,
     _datetime_to_pl_timestamp,
@@ -479,12 +480,7 @@ def __getitem__(self, item: int | Series | range | slice) -> Any:
 
         # slice
         if isinstance(item, slice):
-            start, stop, stride = item.indices(self.len())
-            out = self.slice(start, stop - start)
-            if stride != 1:
-                return out.take_every(stride)
-            else:
-                return out
+            return PolarsSlice(self).apply(item)
 
         raise NotImplementedError
 

diff --git a/py-polars/tests/test_df.py b/py-polars/tests/test_df.py
@@ -587,15 +587,26 @@ def test_take_every() -> None:
 
 
 def test_slice() -> None:
-    df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
-    expected = pl.DataFrame({"a": [1, 3], "b": ["b", "c"]})
+    df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    expected = pl.DataFrame({"a": [2, 3], "b": ["b", "c"]})
     for slice_params in (
         [1, 10],  # slice > len(df)
         [1, 2],  # slice == len(df)
         [1],  # optional len
     ):
         assert df.slice(*slice_params).frame_equal(expected)
 
+    for py_slice in (
+        slice(1, 2),
+        slice(0, 2, 2),
+        slice(3, -3, -1),
+        slice(1, None, -2),
+        slice(-1, -3, -1),
+        slice(-3, None, -3),
+    ):
+        # confirm frame slice matches python slice
+        assert df[py_slice].rows() == df.rows()[py_slice]
+
 
 def test_head_tail_limit() -> None:
     df = pl.DataFrame({"a": range(10), "b": range(10)})

diff --git a/py-polars/tests/test_series.py b/py-polars/tests/test_series.py
@@ -219,17 +219,8 @@ def test_various() -> None:
     assert a.len() == 2
     assert len(a) == 2
 
-    for b in (
-        a.slice(1, 10),
-        a.slice(1, 1),
-        a.slice(1, None),
-        a.slice(1),
-    ):
-        assert b.len() == 1
-        assert b.series_equal(pl.Series("b", [2]))
-
-    a.append(b)
-    assert a.series_equal(pl.Series("b", [1, 2, 2]))
+    a.append(a.clone())
+    assert a.series_equal(pl.Series("b", [1, 2, 1, 2]))
 
     a = pl.Series("a", range(20))
     assert a.head(5).len() == 5
@@ -661,6 +652,29 @@ def test_is_in() -> None:
     assert df.select(pl.col("a").is_in(pl.col("b"))).to_series() == [True, False]
 
 
+def test_slice() -> None:
+    s = pl.Series(name="a", values=[0, 1, 2, 3, 4, 5], dtype=pl.UInt8)
+    for srs_slice, expected in (
+        [s.slice(2, 3), [2, 3, 4]],
+        [s.slice(4, 1), [4]],
+        [s.slice(4, None), [4, 5]],
+        [s.slice(3), [3, 4, 5]],
+        [s.slice(-2), [4, 5]],
+    ):
+        assert srs_slice.to_list() == expected  # type: ignore[attr-defined]
+
+    for py_slice in (
+        slice(1, 2),
+        slice(0, 2, 2),
+        slice(3, -3, -1),
+        slice(1, None, -2),
+        slice(-1, -3, -1),
+        slice(-3, None, -3),
+    ):
+        # confirm series slice matches python slice
+        assert s[py_slice].to_list() == s.to_list()[py_slice]
+
+
 def test_str_slice() -> None:
     df = pl.DataFrame({"a": ["foobar", "barfoo"]})
     assert df["a"].str.slice(-3) == ["bar", "foo"]

diff --git a/py-polars/tests_parametric/test_dataframe.py b/py-polars/tests_parametric/test_dataframe.py
@@ -1,10 +1,11 @@
 # -------------------------------------------------
 # Validate Series behaviour with parameteric tests
 # -------------------------------------------------
-from hypothesis import given
+from hypothesis import example, given, settings
+from hypothesis.strategies import integers
 
 import polars as pl
-from polars.testing import dataframes
+from polars.testing import column, dataframes
 
 
 @given(df=dataframes())
@@ -13,12 +14,75 @@ def test_repr(df: pl.DataFrame) -> None:
     # print(df)
 
 
-@given(df=dataframes(allowed_dtypes=[pl.Boolean, pl.UInt64, pl.Utf8, pl.Time]))
+@given(df=dataframes(min_size=1, min_cols=1, null_probability=0.25))
+@example(df=pl.DataFrame(columns=["x", "y", "z"]))
+@example(df=pl.DataFrame())
 def test_null_count(df: pl.DataFrame) -> None:
+    # note: the zero-row and zero-col cases are always passed as explicit examples
     null_count, ncols = df.null_count(), len(df.columns)
     if ncols == 0:
         assert null_count.shape == (0, 0)
     else:
         assert null_count.shape == (1, ncols)
         for idx, count in enumerate(null_count.rows()[0]):
             assert count == sum(v is None for v in df.select_at_idx(idx).to_list())
+    print(null_count.rows())
+
+
+@given(
+    df=dataframes(
+        max_size=20,
+        cols=[
+            column(
+                "start",
+                dtype=pl.Int8,
+                null_probability=0.15,
+                strategy=integers(min_value=-12, max_value=12),
+            ),
+            column(
+                "stop",
+                dtype=pl.Int8,
+                null_probability=0.15,
+                strategy=integers(min_value=-10, max_value=10),
+            ),
+            column(
+                "step",
+                dtype=pl.Int8,
+                null_probability=0.15,
+                strategy=integers(min_value=-8, max_value=8).filter(lambda x: x != 0),
+            ),
+            column("misc", dtype=pl.Int32),
+        ],
+    )
+    # generated dataframe example -
+    # ┌───────┬──────┬──────┬───────┐
+    # │ start ┆ stop ┆ step ┆ misc  │
+    # │ ---   ┆ ---  ┆ ---  ┆ ---   │
+    # │ i8    ┆ i8   ┆ i8   ┆ i32   │
+    # ╞═══════╪══════╪══════╪═══════╡
+    # │ 2     ┆ -1   ┆ null ┆ -55   │
+    # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    # │ -3    ┆ 0    ┆ -2   ┆ 61582 │
+    # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
+    # │ null  ┆ 1    ┆ 2    ┆ 5865  │
+    # └───────┴──────┴──────┴───────┘
+)
+@settings(max_examples=500)
+def test_frame_slice(df: pl.DataFrame) -> None:
+    # take strategy-generated integer values from the frame as slice bounds.
+    # use these bounds to slice the same frame, and then validate the result
+    # against a py-native slice of the same data using the same bounds.
+    #
+    # given the average number of rows in the frames, and the value of
+    # max_examples, this will result in close to 5000 test permutations,
+    # running in around ~3 secs (depending on hardware/etc).
+    py_data = df.rows()
+
+    for start, stop, step, _ in py_data:
+        s = slice(start, stop, step)
+        sliced_py_data = py_data[s]
+        sliced_df_data = df[s].rows()
+
+        assert (
+            sliced_py_data == sliced_df_data
+        ), f"slice [{start}:{stop}:{step}] failed on df w/len={len(df)}"