additional negative indexing support, and frame-level "take_every" (#3888)
pola-rs · Jul 4, 2022 · 25b58e2 · 25b58e2
1 parent 585c10c
commit 25b58e2
Show file tree

Hide file tree

Showing 6 changed files with 109 additions and 6 deletions.
diff --git a/py-polars/docs/source/reference/dataframe.rst b/py-polars/docs/source/reference/dataframe.rst
@@ -125,6 +125,7 @@ Manipulation/ selection
     DataFrame.slice
     DataFrame.sort
     DataFrame.tail
+    DataFrame.take_every
     DataFrame.to_dummies
     DataFrame.to_series
     DataFrame.transpose

diff --git a/py-polars/docs/source/reference/lazyframe.rst b/py-polars/docs/source/reference/lazyframe.rst
@@ -75,6 +75,7 @@ Manipulation/ selection
     LazyFrame.slice
     LazyFrame.sort
     LazyFrame.tail
+    LazyFrame.take_every
     LazyFrame.unique
     LazyFrame.unnest
     LazyFrame.with_column

diff --git a/py-polars/polars/internals/frame.py b/py-polars/polars/internals/frame.py
@@ -1882,6 +1882,8 @@ def to_series(self, index: int = 0) -> pli.Series:
         ]
 
         """
+        if index < 0:
+            index = len(self.columns) + index
         return pli.wrap_s(self._df.select_at_idx(index))
 
     def reverse(self: DF) -> DF:
@@ -1974,6 +1976,8 @@ def insert_at_idx(self, index: int, series: pli.Series) -> None:
         └─────┴─────┴─────┘
 
         """
+        if index < 0:
+            index = len(self.columns) + index
         self._df.insert_at_idx(index, series._s)
 
     def filter(self: DF, predicate: pli.Expr) -> DF:
@@ -2275,6 +2279,8 @@ def replace_at_idx(self, index: int, series: pli.Series) -> None:
         └───────┴─────┴─────┘
 
         """
+        if index < 0:
+            index = len(self.columns) + index
         self._df.replace_at_idx(index, series._s)
 
     @overload
@@ -4079,6 +4085,8 @@ def select_at_idx(self, idx: int) -> pli.Series:
         ]
 
         """
+        if idx < 0:
+            idx = len(self.columns) + idx
         return pli.wrap_s(self._df.select_at_idx(idx))
 
     def cleared(self: DF) -> DF:
@@ -5462,6 +5470,27 @@ def shrink_to_fit(self: DF, in_place: bool = False) -> DF | None:
             df._df.shrink_to_fit()
             return df
 
+    def take_every(self: DF, n: int) -> DF:
+        """
+        Take every nth row in the DataFrame and return as a new DataFrame.
+
+        Examples
+        --------
+        >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
+        >>> s.take_every(2)
+        shape: (2, 2)
+        ┌─────┬─────┐
+        │ a   ┆ b   │
+        │ --- ┆ --- │
+        │ i64 ┆ i64 │
+        ╞═════╪═════╡
+        │ 1   ┆ 5   │
+        ├╌╌╌╌╌┼╌╌╌╌╌┤
+        │ 3   ┆ 7   │
+        └─────┴─────┘
+        """
+        return self.select(pli.col("*").take_every(n))
+
     def hash_rows(
         self, k0: int = 0, k1: int = 1, k2: int = 2, k3: int = 3
     ) -> pli.Series:

diff --git a/py-polars/polars/internals/lazy_frame.py b/py-polars/polars/internals/lazy_frame.py
@@ -1782,6 +1782,27 @@ def with_row_count(self: LDF, name: str = "row_nr", offset: int = 0) -> LDF:
         """
         return self._from_pyldf(self._ldf.with_row_count(name, offset))
 
+    def take_every(self: LDF, n: int) -> LDF:
+        """
+        Take every nth row in the LazyFrame and return as a new LazyFrame.
+
+        Examples
+        --------
+        >>> s = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}).lazy()
+        >>> s.take_every(2).collect()
+        shape: (2, 2)
+        ┌─────┬─────┐
+        │ a   ┆ b   │
+        │ --- ┆ --- │
+        │ i64 ┆ i64 │
+        ╞═════╪═════╡
+        │ 1   ┆ 5   │
+        ├╌╌╌╌╌┼╌╌╌╌╌┤
+        │ 3   ┆ 7   │
+        └─────┴─────┘
+        """
+        return self.select(pli.col("*").take_every(n))
+
     def fill_null(self: LDF, fill_value: int | str | pli.Expr) -> LDF:
         """
         Fill missing values with a literal or Expr.

diff --git a/py-polars/tests/test_df.py b/py-polars/tests/test_df.py
@@ -14,7 +14,7 @@
 from hypothesis import given
 
 import polars as pl
-from polars.testing import assert_series_equal, columns, dataframes
+from polars.testing import assert_frame_equal, assert_series_equal, columns, dataframes
 
 if sys.version_info >= (3, 8):
     from typing import Literal
@@ -32,8 +32,7 @@ def test_repr(df: pl.DataFrame) -> None:
     # print(df)
 
 
-# note: *temporarily* constraining dtypes this test until #3843 and a windows-specific
-# fixfor a related date bug is merged (tblocking the PR to merge hypothesis code).
+# note: temporarily constraining dtypes for this test (possible windows-specific date bug)
 @given(df=dataframes(allowed_dtypes=[pl.Boolean, pl.UInt64, pl.Utf8]))
 def test_null_count(df: pl.DataFrame) -> None:
     null_count, ncols = df.null_count(), len(df.columns)
@@ -548,6 +547,54 @@ def test_assignment() -> None:
     assert df["foo"].to_list() == [1, 9, 9]
 
 
+def test_select_at_idx() -> None:
+    df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
+    for idx in range(len(df.columns)):
+        assert_series_equal(
+            df.select_at_idx(idx),  # regular positive indexing
+            df.select_at_idx(idx - len(df.columns)),  # equivalent negative index
+        )
+
+
+def test_insert_at_idx() -> None:
+    df = pl.DataFrame({"z": [3, 4, 5]})
+    df.insert_at_idx(0, pl.Series("x", [1, 2, 3]))
+    df.insert_at_idx(-1, pl.Series("y", [2, 3, 4]))
+
+    expected_df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
+    assert_frame_equal(expected_df, df)
+
+
+def test_replace_at_idx() -> None:
+    df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
+    df.replace_at_idx(0, pl.Series("a", [4, 5, 6]))
+    df.replace_at_idx(-2, pl.Series("b", [5, 6, 7]))
+    df.replace_at_idx(-1, pl.Series("c", [6, 7, 8]))
+
+    expected_df = pl.DataFrame({"a": [4, 5, 6], "b": [5, 6, 7], "c": [6, 7, 8]})
+    assert_frame_equal(expected_df, df)
+
+
+def test_to_series() -> None:
+    df = pl.DataFrame({"x": [1, 2, 3], "y": [2, 3, 4], "z": [3, 4, 5]})
+
+    assert_series_equal(df.to_series(), df["x"])
+    assert_series_equal(df.to_series(0), df["x"])
+    assert_series_equal(df.to_series(-3), df["x"])
+
+    assert_series_equal(df.to_series(1), df["y"])
+    assert_series_equal(df.to_series(-2), df["y"])
+
+    assert_series_equal(df.to_series(2), df["z"])
+    assert_series_equal(df.to_series(-1), df["z"])
+
+
+def test_take_every() -> None:
+    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]})
+    expected_df = pl.DataFrame({"a": [1, 3], "b": ["w", "y"]})
+    assert_frame_equal(expected_df, df.take_every(2))
+
+
 def test_slice() -> None:
     df = pl.DataFrame({"a": [2, 1, 3], "b": ["a", "b", "c"]})
     expected = pl.DataFrame({"a": [1, 3], "b": ["b", "c"]})
@@ -1219,9 +1266,6 @@ def test_lazy_functions() -> None:
     expected = 3
     assert np.isclose(out.select_at_idx(9), expected)
     assert np.isclose(pl.last(df["b"]), expected)
-    expected = 3
-    assert np.isclose(out.select_at_idx(9), expected)
-    assert np.isclose(pl.last(df["b"]), expected)
 
 
 def test_multiple_column_sort() -> None:

diff --git a/py-polars/tests/test_lazy.py b/py-polars/tests/test_lazy.py
@@ -4,6 +4,7 @@
 
 import polars as pl
 from polars import col, lit, map_binary, when
+from polars.testing import assert_frame_equal
 
 
 def test_lazy() -> None:
@@ -50,6 +51,12 @@ def test_set_null() -> None:
     assert s[2] is None
 
 
+def test_take_every() -> None:
+    df = pl.DataFrame({"a": [1, 2, 3, 4], "b": ["w", "x", "y", "z"]}).lazy()
+    expected_df = pl.DataFrame({"a": [1, 3], "b": ["w", "y"]})
+    assert_frame_equal(expected_df, df.take_every(2).collect())
+
+
 def test_agg() -> None:
     df = pl.DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0]})
     ldf = df.lazy().min()