Skip to content

Commit

Permalink
Complete pythonic slice support (inc. negative indexing/stride) for D…
Browse files Browse the repository at this point in the history
…ataFrame and Series (#3904)
  • Loading branch information
alexander-beedie committed Jul 5, 2022
1 parent f6ff3d3 commit 214ac46
Show file tree
Hide file tree
Showing 8 changed files with 244 additions and 57 deletions.
2 changes: 1 addition & 1 deletion py-polars/build.requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ types-pytz
maturin==0.12.19
pytest==7.1.2
pytest-cov[toml]==3.0.0
hypothesis==6.48
hypothesis==6.49.1
black==22.3.0
blackdoc==0.3.4
isort~=5.10.1
Expand Down
26 changes: 2 additions & 24 deletions py-polars/polars/internals/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
sequence_to_pydf,
series_to_pydf,
)
from polars.internals.functions import PolarsSlice
from polars.utils import (
_prepare_row_count_args,
_process_null_values,
Expand Down Expand Up @@ -1735,30 +1736,7 @@ def __getitem__(

# df[:]
if isinstance(item, slice):
# special case df[::-1]
if item.start is None and item.stop is None and item.step == -1:
return self.reverse()

if getattr(item, "end", False):
raise ValueError("A slice with steps larger than 1 is not supported.")
if item.start is None:
start = 0
else:
start = item.start
if item.stop is None:
stop = self.height
else:
stop = item.stop

length = stop - start
if item.step is None:
# df[start:stop]
return self.slice(start, length)
else:
# df[start:stop:step]
return self.select(
pli.col("*").slice(start, length).take_every(item.step)
)
return PolarsSlice(self).apply(item) # type: ignore

# select rows by numpy mask or index
# df[[1, 2, 3]]
Expand Down
106 changes: 105 additions & 1 deletion py-polars/polars/internals/functions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from datetime import date, datetime, timedelta
from typing import Sequence, overload
from typing import Sequence, Union, overload

from polars import internals as pli
from polars.datatypes import Date
Expand Down Expand Up @@ -255,3 +255,107 @@ def date_range(
dt_range = dt_range.cast(Date)

return dt_range


# Type alias for the objects PolarsSlice operates on: a DataFrame or a Series.
FrameOrSeries = Union["pli.DataFrame", "pli.Series"]

# TODO:
# class LazyPolarsSlice:


class PolarsSlice:
    """
    Apply python slice object to Polars DataFrame or Series,
    with full support for negative indexing and/or stride.

    Usage: ``PolarsSlice(obj).apply(slice(start, stop, step))``. The result
    has the same type as ``obj``.
    """

    # normalised slice bounds / derived state; all populated by `_slice_setup`
    stop: int
    start: int
    stride: int
    slice_length: int
    is_unbounded: bool
    _positive_indices: bool
    # the frame or series being sliced
    obj: "FrameOrSeries"

    def __init__(self, obj: "FrameOrSeries"):
        self.obj = obj

    @staticmethod
    def _as_original(lazy: "pli.LazyFrame", obj: "FrameOrSeries") -> "FrameOrSeries":
        """
        Return lazy variant back to its original type.

        The lazy frame is collected; if ``obj`` is not a DataFrame the
        collected frame is converted back with ``to_series()``.
        """
        frame = lazy.collect()
        return frame if isinstance(obj, pli.DataFrame) else frame.to_series()

    @staticmethod
    def _lazify(obj: "FrameOrSeries") -> "pli.LazyFrame":
        """
        Make lazy to ensure efficient/consistent handling.

        A DataFrame is made lazy directly; anything else is first converted
        with ``to_frame()``.
        """
        return obj.lazy() if isinstance(obj, pli.DataFrame) else obj.to_frame().lazy()

    def _slice_positive(self, obj: "pli.LazyFrame") -> "pli.LazyFrame":
        """
        Logic for slices with positive stride.

        Takes the contiguous window ``[start, start + slice_length)`` and
        then keeps every ``stride``-th row of it.
        """
        return obj.slice(self.start, self.slice_length).take_every(self.stride)

    def _slice_negative(self, obj: "pli.LazyFrame") -> "pli.LazyFrame":
        """
        Logic for slices with negative stride.

        The window begins one row above the (exclusive) ``stop`` bound and
        spans ``slice_length`` rows; it is then reversed so that it reads from
        ``start`` downwards before the stride is applied.
        """
        stride = abs(self.stride)
        lazyslice = obj.slice(self.stop + 1, self.slice_length)
        if self.slice_length == 1:
            # a single row needs neither reversing nor striding
            return lazyslice
        lazyslice = lazyslice.reverse()
        return lazyslice.take_every(stride) if (stride > 1) else lazyslice

    def _slice_setup(self, s: slice) -> None:
        """
        Normalise slice bounds, identify unbounded and/or zero-length slices.

        Sets ``start``, ``stop``, ``stride``, ``slice_length``,
        ``is_unbounded`` and ``_positive_indices`` on the instance.
        """
        # the object size is known, so `slice.indices` can resolve None and
        # negative bounds into concrete integers (it never returns None)
        obj_len = len(self.obj)
        start, stop, stride = slice(s.start, s.stop, s.step).indices(obj_len)

        # does the slice cover the whole object?
        if stride >= 1:
            self.is_unbounded = (start <= 0) and (stop >= obj_len)
        else:
            # with a negative stride the normalised `stop` is -1 when the
            # slice runs down through index 0; testing `stop is None` here
            # could never succeed on normalised bounds
            self.is_unbounded = (stop == -1) and (start >= obj_len - 1)

        self._positive_indices = start >= 0 and stop >= 0

        # zero-length if the object is empty, or if the bounds do not span
        # any rows in the direction of travel
        is_zero_length = (
            self.obj.is_empty()
            or (start == stop)
            or (stride > 0 and start > stop)
            or (stride < 0 and start < stop)
        )
        self.slice_length = 0 if is_zero_length else abs(stop - start)
        self.start, self.stop, self.stride = start, stop, stride

    def apply(self, s: slice) -> "FrameOrSeries":
        """
        Apply a slice operation, taking advantage of any potential fast paths.

        Parameters
        ----------
        s
            Python slice object; bounds and step may be None or negative.

        Returns
        -------
        The sliced object, of the same type as the wrapped ``obj``.
        """
        self._slice_setup(s)

        # check for fast-paths / early-exit
        if self.slice_length == 0:
            return self.obj.cleared()

        elif self.is_unbounded and self.stride in (-1, 1):
            # whole-object slice: plain copy, or reversal for obj[::-1]
            return self.obj.reverse() if (self.stride < 0) else self.obj.clone()

        elif self._positive_indices and self.stride == 1:
            # simple contiguous window; no need to go via the lazy API
            return self.obj.slice(self.start, self.slice_length)

        # general case: handle stride and/or negative direction lazily
        lazyobj = self._lazify(self.obj)
        sliced = (
            self._slice_positive(lazyobj)
            if self.stride > 0
            else self._slice_negative(lazyobj)
        )
        return self._as_original(sliced, self.obj)
8 changes: 2 additions & 6 deletions py-polars/polars/internals/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
sequence_to_pyseries,
series_to_pyseries,
)
from polars.internals.functions import PolarsSlice
from polars.utils import (
_date_to_pl_date,
_datetime_to_pl_timestamp,
Expand Down Expand Up @@ -479,12 +480,7 @@ def __getitem__(self, item: int | Series | range | slice) -> Any:

# slice
if isinstance(item, slice):
start, stop, stride = item.indices(self.len())
out = self.slice(start, stop - start)
if stride != 1:
return out.take_every(stride)
else:
return out
return PolarsSlice(self).apply(item)

raise NotImplementedError

Expand Down
15 changes: 13 additions & 2 deletions py-polars/tests/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,15 +587,26 @@ def test_take_every() -> None:


def test_slice() -> None:
    """Check `DataFrame.slice` and python-slice indexing on a small frame."""
    df = pl.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    expected = pl.DataFrame({"a": [2, 3], "b": ["b", "c"]})

    # every (offset, length) variant below should yield the last two rows
    for slice_params in (
        [1, 10],  # slice > len(df)
        [1, 2],  # slice == len(df)
        [1],  # optional len
    ):
        assert df.slice(*slice_params).frame_equal(expected)

    for py_slice in (
        slice(1, 2),
        slice(0, 2, 2),
        slice(3, -3, -1),
        slice(1, None, -2),
        slice(-1, -3, -1),
        slice(-3, None, -3),
    ):
        # confirm frame slice matches python slice
        assert df[py_slice].rows() == df.rows()[py_slice]


def test_head_tail_limit() -> None:
df = pl.DataFrame({"a": range(10), "b": range(10)})
Expand Down
36 changes: 25 additions & 11 deletions py-polars/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,17 +219,8 @@ def test_various() -> None:
assert a.len() == 2
assert len(a) == 2

for b in (
a.slice(1, 10),
a.slice(1, 1),
a.slice(1, None),
a.slice(1),
):
assert b.len() == 1
assert b.series_equal(pl.Series("b", [2]))

a.append(b)
assert a.series_equal(pl.Series("b", [1, 2, 2]))
a.append(a.clone())
assert a.series_equal(pl.Series("b", [1, 2, 1, 2]))

a = pl.Series("a", range(20))
assert a.head(5).len() == 5
Expand Down Expand Up @@ -661,6 +652,29 @@ def test_is_in() -> None:
assert df.select(pl.col("a").is_in(pl.col("b"))).to_series() == [True, False]


def test_slice() -> None:
    """Check `Series.slice` and python-slice indexing against native lists."""
    srs = pl.Series(name="a", values=[0, 1, 2, 3, 4, 5], dtype=pl.UInt8)

    # (offset, length) style slicing via the explicit method
    method_cases = [
        (srs.slice(2, 3), [2, 3, 4]),
        (srs.slice(4, 1), [4]),
        (srs.slice(4, None), [4, 5]),
        (srs.slice(3), [3, 4, 5]),
        (srs.slice(-2), [4, 5]),
    ]
    for result, expected in method_cases:
        assert result.to_list() == expected

    # python slice objects, including negative bounds and strides
    native_values = srs.to_list()
    index_cases = (
        slice(1, 2),
        slice(0, 2, 2),
        slice(3, -3, -1),
        slice(1, None, -2),
        slice(-1, -3, -1),
        slice(-3, None, -3),
    )
    for py_slice in index_cases:
        # the series slice must agree with the equivalent list slice
        assert srs[py_slice].to_list() == native_values[py_slice]


def test_str_slice() -> None:
df = pl.DataFrame({"a": ["foobar", "barfoo"]})
assert df["a"].str.slice(-3) == ["bar", "foo"]
Expand Down
70 changes: 67 additions & 3 deletions py-polars/tests_parametric/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# -------------------------------------------------
# Validate DataFrame behaviour with parametric tests
# -------------------------------------------------
from hypothesis import given
from hypothesis import example, given, settings
from hypothesis.strategies import integers

import polars as pl
from polars.testing import dataframes
from polars.testing import column, dataframes


@given(df=dataframes())
Expand All @@ -13,12 +14,75 @@ def test_repr(df: pl.DataFrame) -> None:
# print(df)


@given(df=dataframes(min_size=1, min_cols=1, null_probability=0.25))
@example(df=pl.DataFrame(columns=["x", "y", "z"]))
@example(df=pl.DataFrame())
def test_null_count(df: pl.DataFrame) -> None:
    """Check `null_count` shape and per-column counts against a manual tally."""
    # note: the zero-row and zero-col cases are always passed as explicit examples
    null_count, ncols = df.null_count(), len(df.columns)
    if ncols == 0:
        assert null_count.shape == (0, 0)
    else:
        assert null_count.shape == (1, ncols)
        # compare each reported count with the number of None values
        # found in the corresponding column
        for idx, count in enumerate(null_count.rows()[0]):
            assert count == sum(v is None for v in df.select_at_idx(idx).to_list())


# generated dataframe example -
# ┌───────┬──────┬──────┬───────┐
# │ start ┆ stop ┆ step ┆ misc  │
# │ ---   ┆ ---  ┆ ---  ┆ ---   │
# │ i8    ┆ i8   ┆ i8   ┆ i32   │
# ╞═══════╪══════╪══════╪═══════╡
# │ 2     ┆ -1   ┆ null ┆ -55   │
# ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
# │ -3    ┆ 0    ┆ -2   ┆ 61582 │
# ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
# │ null  ┆ 1    ┆ 2    ┆ 5865  │
# └───────┴──────┴──────┴───────┘
@given(
    df=dataframes(
        max_size=20,
        cols=[
            column(
                "start",
                dtype=pl.Int8,
                null_probability=0.15,
                strategy=integers(min_value=-12, max_value=12),
            ),
            column(
                "stop",
                dtype=pl.Int8,
                null_probability=0.15,
                strategy=integers(min_value=-10, max_value=10),
            ),
            column(
                "step",
                dtype=pl.Int8,
                null_probability=0.15,
                # a zero step is not a valid python slice step
                strategy=integers(min_value=-8, max_value=8).filter(lambda x: x != 0),
            ),
            column("misc", dtype=pl.Int32),
        ],
    )
)
@settings(max_examples=500)
def test_frame_slice(df: pl.DataFrame) -> None:
    """
    Validate frame slicing against python-native slicing of the same rows.

    Each row of the generated frame supplies (start, stop, step) bounds; the
    frame is sliced with those bounds and the result compared with the same
    slice applied to the frame's rows as a plain python list.

    Given the average number of rows per frame and the value of max_examples
    this gives close to 5000 permutations, running in around ~3 secs
    (depending on hardware/etc).
    """
    native_rows = df.rows()

    for start, stop, step, _ in native_rows:
        bounds = slice(start, stop, step)
        expected_rows = native_rows[bounds]
        actual_rows = df[bounds].rows()

        assert (
            expected_rows == actual_rows
        ), f"slice [{start}:{stop}:{step}] failed on df w/len={len(df)}"

0 comments on commit 214ac46

Please sign in to comment.