From 0c97b76f02e7d62ceb8abdd7cef5a2edc504eea7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 5 May 2024 11:02:52 +0100 Subject: [PATCH] split out rolling_by --- .../reference/expressions/computation.rst | 8 + py-polars/polars/expr/expr.py | 1343 ++++++++++++++++- .../tests/parametric/test_groupby_rolling.py | 4 +- .../tests/unit/datatypes/test_temporal.py | 2 +- .../unit/operations/rolling/test_rolling.py | 50 +- 5 files changed, 1369 insertions(+), 38 deletions(-) diff --git a/py-polars/docs/source/reference/expressions/computation.rst b/py-polars/docs/source/reference/expressions/computation.rst index 663f04df0185..34fb8aa38341 100644 --- a/py-polars/docs/source/reference/expressions/computation.rst +++ b/py-polars/docs/source/reference/expressions/computation.rst @@ -56,14 +56,22 @@ Computation Expr.rolling_apply Expr.rolling_map Expr.rolling_max + Expr.rolling_max_by Expr.rolling_mean + Expr.rolling_mean_by Expr.rolling_median + Expr.rolling_median_by Expr.rolling_min + Expr.rolling_min_by Expr.rolling_quantile + Expr.rolling_quantile_by Expr.rolling_skew Expr.rolling_std + Expr.rolling_std_by Expr.rolling_sum + Expr.rolling_sum_by Expr.rolling_var + Expr.rolling_var_by Expr.search_sorted Expr.sign Expr.sin diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index cd3cc21c8f77..5e4bd9bca3c9 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -6122,6 +6122,1239 @@ def interpolate(self, method: InterpolationMethod = "linear") -> Self: """ return self._from_pyexpr(self._pyexpr.interpolate(method)) + @unstable() + def rolling_min_by( + self, + by: str, + window_size: timedelta | str, + *, + min_periods: int = 1, + closed: ClosedInterval = "right", + warn_if_unsorted: bool = True, + ) -> Self: + """ + Apply a rolling min based on another column. + + .. warning:: + This functionality is considered **unstable**. 
It may be changed + at any point without it being considered a breaking change. + + Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + by + This column must be of dtype Datetime or Date. + + .. warning:: + The column must be sorted in ascending order. Otherwise, + results will not be correct. + window_size + The length of the window. Can be a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + min_periods + The number of values in the window that should be non-null before computing + a result. + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive), + defaults to `'right'`. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ...
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling min with the temporal windows closed on the right (default) + + >>> df_temporal.with_columns( + ... rolling_row_min=pl.col("index").rolling_min_by("date", window_size="2h") + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └───────┴─────────────────────┴─────────────────┘ + """ + window_size = deprecate_saturating(window_size) + window_size, min_periods = _prepare_rolling_window_args( + window_size, min_periods + ) + return self._from_pyexpr( + self._pyexpr.rolling_min( + window_size, None, min_periods, False, by, closed, warn_if_unsorted + ) + ) + + @unstable() + def rolling_max_by( + self, + by: str, + window_size: timedelta | str, + *, + min_periods: int = 1, + closed: ClosedInterval = "right", + warn_if_unsorted: bool = True, + ) -> Self: + """ + Apply a rolling max based on another column. + + .. warning:: + This functionality is considered **unstable**. 
It may be changed + at any point without it being considered a breaking change. + + Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + by + This column must be of dtype Datetime or Date. + + .. warning:: + The column must be sorted in ascending order. Otherwise, + results will not be correct. + window_size + The length of the window. Can be a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + min_periods + The number of values in the window that should be non-null before computing + a result. + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive), + defaults to `'right'`. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ...
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling max with the temporal windows closed on the right (default) + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max_by("date", window_size="2h") + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 4 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 20 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling max with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_max=pl.col("index").rolling_max_by( + ... "date", window_size="2h", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 4 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 20 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └───────┴─────────────────────┴─────────────────┘ + """ + window_size = deprecate_saturating(window_size) + window_size, min_periods = _prepare_rolling_window_args( + window_size, min_periods + ) + return self._from_pyexpr( + self._pyexpr.rolling_max( + window_size, None, min_periods, False, by, closed, warn_if_unsorted + ) + ) + + @unstable() + def rolling_mean_by( + self, + by: str, + window_size: timedelta | str, + *, + min_periods: int = 1, + closed: ClosedInterval = "right", + warn_if_unsorted: bool = True, + ) -> Self: + """ + Apply a rolling mean based on another column. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Given a `by` column ``, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + by + This column must be of dtype Datetime or Date. + + .. warning:: + The column must be sorted in ascending order. Otherwise, + results will not be correct. + window_size + The length of the window. 
Can be a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + min_periods + The number of values in the window that should be non-null before computing + a result. + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive), + defaults to `'right'`. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling mean with the temporal windows closed on the right (default) + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean_by( + ... "date", window_size="2h" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.5 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.5 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.5 │ + └───────┴─────────────────────┴──────────────────┘ + + Compute the rolling mean with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_mean=pl.col("index").rolling_mean_by( + ... "date", window_size="2h", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └───────┴─────────────────────┴──────────────────┘ + """ + window_size = deprecate_saturating(window_size) + window_size, min_periods = _prepare_rolling_window_args( + window_size, min_periods + ) + return self._from_pyexpr( + self._pyexpr.rolling_mean( + window_size, + None, + min_periods, + False, + by, + closed, + warn_if_unsorted, + ) + ) + + @unstable() + def rolling_sum_by( + self, + by: str, + window_size: timedelta | str, + *, + min_periods: int = 1, + closed: ClosedInterval = "right", + warn_if_unsorted: bool = True, + ) -> Self: + """ + Apply a rolling sum based on another column. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Given a `by` column ``, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + window_size + The length of the window. 
Can be a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + min_periods + The number of values in the window that should be non-null before computing + a result. + by + This column must be of dtype `{Date, Datetime}` + + .. warning:: + The column must be sorted in ascending order. Otherwise, + results will not be correct. + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive), + defaults to `'right'`. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ...
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling sum with the temporal windows closed on the right (default) + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("index").rolling_sum_by("date", window_size="2h") + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 5 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 7 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 39 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 41 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 43 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 45 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 47 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling sum with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_sum=pl.col("index").rolling_sum_by( + ... "date", window_size="2h", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 9 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 57 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └───────┴─────────────────────┴─────────────────┘ + """ + window_size = deprecate_saturating(window_size) + window_size, min_periods = _prepare_rolling_window_args( + window_size, min_periods + ) + return self._from_pyexpr( + self._pyexpr.rolling_sum( + window_size, None, min_periods, False, by, closed, warn_if_unsorted + ) + ) + + @unstable() + def rolling_std_by( + self, + by: str, + window_size: timedelta | str, + *, + min_periods: int = 1, + closed: ClosedInterval = "right", + ddof: int = 1, + warn_if_unsorted: bool = True, + ) -> Self: + """ + Compute a rolling standard deviation based on another column. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Given a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + + Parameters + ---------- + by + This column must be of dtype Datetime or Date. + + .. warning:: + The column must be sorted in ascending order. Otherwise, + results will not be correct. + window_size + The length of the window. 
Can be a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + min_periods + The number of values in the window that should be non-null before computing + a result. + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive), + defaults to `'right'`. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling std with the temporal windows closed on the right (default) + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std_by("date", window_size="2h") + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.707107 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling std with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_std=pl.col("index").rolling_std_by( + ... "date", window_size="2h", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ + """ + window_size = deprecate_saturating(window_size) + window_size, min_periods = _prepare_rolling_window_args( + window_size, min_periods + ) + return self._from_pyexpr( + self._pyexpr.rolling_std( + window_size, + None, + min_periods, + False, + by, + closed, + ddof, + warn_if_unsorted, + ) + ) + + @unstable() + def rolling_var_by( + self, + by: str, + window_size: timedelta | str, + *, + min_periods: int = 1, + closed: ClosedInterval = "right", + ddof: int = 1, + warn_if_unsorted: bool = True, + ) -> Self: + """ + Compute a rolling variance based on another column. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Given a `by` column ``, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + by + This column must be of dtype Datetime or Date. + + .. warning:: + The column must be sorted in ascending order. Otherwise, + results will not be correct. + window_size + The length of the window. 
Can be a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + min_periods + The number of values in the window that should be non-null before computing + a result. + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive), + defaults to `'right'`. + ddof + "Delta Degrees of Freedom": The divisor for a length N window is N - ddof + warn_if_unsorted + Warn if data is not known to be sorted by `by` column. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling var with the temporal windows closed on the right (default) + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("index").rolling_var_by("date", window_size="2h") + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 0.5 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └───────┴─────────────────────┴─────────────────┘ + + Compute the rolling var with the closure of windows on both sides + + >>> df_temporal.with_columns( + ... rolling_row_var=pl.col("index").rolling_var_by( + ... "date", window_size="2h", closed="both" + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 1.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ + """ + window_size = deprecate_saturating(window_size) + window_size, min_periods = _prepare_rolling_window_args( + window_size, min_periods + ) + return self._from_pyexpr( + self._pyexpr.rolling_var( + window_size, + None, + min_periods, + False, + by, + closed, + ddof, + warn_if_unsorted, + ) + ) + + @unstable() + def rolling_median_by( + self, + by: str, + window_size: timedelta | str, + *, + min_periods: int = 1, + closed: ClosedInterval = "right", + warn_if_unsorted: bool = True, + ) -> Self: + """ + Compute a rolling median based on another column. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Given a `by` column ``, then `closed="left"` means + the windows will be: + + - [t_0 - window_size, t_0) + - [t_1 - window_size, t_1) + - ... + - [t_n - window_size, t_n) + + Parameters + ---------- + by + This column must be of dtype Datetime or Date. + + .. warning:: + The column must be sorted in ascending order. Otherwise, + results will not be correct. + window_size + The length of the window. 
Can be a dynamic temporal + size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + min_periods + The number of values in the window that should be non-null before computing + a result. + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive), + defaults to `'right'`. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. + + Examples + -------- + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... 
).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling median with the temporal windows closed on the right: + + >>> df_temporal.with_columns( + ... rolling_row_median=pl.col("index").rolling_median_by( + ... "date", window_size="2h" + ... ) + ... ) + shape: (25, 3) + ┌───────┬─────────────────────┬────────────────────┐ + │ index ┆ date ┆ rolling_row_median │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.5 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.5 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.5 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.5 │ + └───────┴─────────────────────┴────────────────────┘ + """ + window_size = deprecate_saturating(window_size) + window_size, min_periods = _prepare_rolling_window_args( + window_size, min_periods + ) + return self._from_pyexpr( + self._pyexpr.rolling_median( + window_size, None, min_periods, False, by, closed, warn_if_unsorted + ) + ) + + @unstable() + def rolling_quantile_by( + self, + by: str, + window_size: timedelta | str, + *, + quantile: float, + interpolation: RollingInterpolationMethod = "nearest", + min_periods: int = 1, + closed: ClosedInterval = "right", + warn_if_unsorted: bool = True, + ) -> Self: + """ + Compute a 
rolling quantile based on another column. + + .. warning:: + This functionality is considered **unstable**. It may be changed + at any point without it being considered a breaking change. + + Given a `by` column `<t_0, t_1, ..., t_n>`, then `closed="right"` + (the default) means the windows will be: + + - (t_0 - window_size, t_0] + - (t_1 - window_size, t_1] + - ... + - (t_n - window_size, t_n] + + Parameters + ---------- + by + This column must be of dtype Datetime or Date. + + .. warning:: + The column must be sorted in ascending order. Otherwise, + results will not be correct. + quantile + Quantile between 0.0 and 1.0. + interpolation : {'nearest', 'higher', 'lower', 'midpoint', 'linear'} + Interpolation method. + window_size + The length of the window. Can be a dynamic + temporal size indicated by a timedelta or the following string language: + + - 1ns (1 nanosecond) + - 1us (1 microsecond) + - 1ms (1 millisecond) + - 1s (1 second) + - 1m (1 minute) + - 1h (1 hour) + - 1d (1 calendar day) + - 1w (1 calendar week) + - 1mo (1 calendar month) + - 1q (1 calendar quarter) + - 1y (1 calendar year) + + By "calendar day", we mean the corresponding time on the next day + (which may not be 24 hours, due to daylight savings). Similarly for + "calendar week", "calendar month", "calendar quarter", and + "calendar year". + min_periods + The number of values in the window that should be non-null before computing + a result. + closed : {'left', 'right', 'both', 'none'} + Define which sides of the temporal interval are closed (inclusive), + defaults to `'right'`. + warn_if_unsorted + Warn if data is not known to be sorted by `by` column. + + Notes + ----- + If you want to compute multiple aggregation statistics over the same dynamic + window, consider using `rolling` - this method can cache the window size + computation. 
+ + Examples + -------- + Create a DataFrame with a datetime column and a row number column + + >>> from datetime import timedelta, datetime + >>> start = datetime(2001, 1, 1) + >>> stop = datetime(2001, 1, 2) + >>> df_temporal = pl.DataFrame( + ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} + ... ).with_row_index() + >>> df_temporal + shape: (25, 2) + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ 4 ┆ 2001-01-01 04:00:00 │ + │ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ + + Compute the rolling quantile with the temporal windows closed on the right: + + >>> df_temporal.with_columns( + ... rolling_row_quantile=pl.col("index").rolling_quantile_by( + ... "date", window_size="2h", quantile=0.3 + ... ) + ... 
) + shape: (25, 3) + ┌───────┬─────────────────────┬──────────────────────┐ + │ index ┆ date ┆ rolling_row_quantile │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ 4 ┆ 2001-01-01 04:00:00 ┆ 3.0 │ + │ … ┆ … ┆ … │ + │ 20 ┆ 2001-01-01 20:00:00 ┆ 19.0 │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └───────┴─────────────────────┴──────────────────────┘ + """ + window_size = deprecate_saturating(window_size) + window_size, min_periods = _prepare_rolling_window_args( + window_size, min_periods + ) + return self._from_pyexpr( + self._pyexpr.rolling_quantile( + quantile, + interpolation, + window_size, + None, + min_periods, + False, + by, + closed, + warn_if_unsorted, + ) + ) + @unstable() def rolling_min( self, @@ -6201,6 +7434,10 @@ def rolling_min( .. warning:: If passed, the column must be sorted in ascending order. Otherwise, results will not be correct. + + .. deprecated:: 0.20.24 + Passing `by` to `rolling_min` is deprecated - please use + :meth:`.rolling_min_by` instead. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive); only applicable if `by` has been set (in which case, it defaults to `'right'`). @@ -6305,7 +7542,7 @@ def rolling_min( >>> df_temporal.with_columns( ... rolling_row_min=pl.col("index").rolling_min(window_size="2h", by="date") - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_min │ @@ -6329,6 +7566,12 @@ def rolling_min( window_size, min_periods = _prepare_rolling_window_args( window_size, min_periods ) + if by is not None: + issue_deprecation_warning( + "Passing `by` to `rolling_min` is deprecated. 
Instead of " + "`rolling_min(..., by='foo')`, please use `rolling_min_by('foo', ...)`.", + version="0.20.24", + ) return self._from_pyexpr( self._pyexpr.rolling_min( window_size, weights, min_periods, center, by, closed, warn_if_unsorted @@ -6410,6 +7653,14 @@ def rolling_max( If the `window_size` is temporal, for instance `"5h"` or `"3s"`, you must set the column that will be used to determine the windows. This column must be of dtype Datetime or Date. + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + + .. deprecated:: 0.20.24 + Passing `by` to `rolling_max` is deprecated - please use + :meth:`.rolling_max_by` instead. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive); only applicable if `by` has been set (in which case, it defaults to `'right'`). @@ -6514,7 +7765,7 @@ def rolling_max( >>> df_temporal.with_columns( ... rolling_row_max=pl.col("index").rolling_max(window_size="2h", by="date") - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_max │ @@ -6540,7 +7791,7 @@ def rolling_max( ... rolling_row_max=pl.col("index").rolling_max( ... window_size="2h", by="date", closed="both" ... ) - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_max │ @@ -6564,6 +7815,12 @@ def rolling_max( window_size, min_periods = _prepare_rolling_window_args( window_size, min_periods ) + if by is not None: + issue_deprecation_warning( + "Passing `by` to `rolling_max` is deprecated. Instead of " + "`rolling_max(..., by='foo')`, please use `rolling_max_by('foo', ...)`.", + version="0.20.24", + ) return self._from_pyexpr( self._pyexpr.rolling_max( window_size, weights, min_periods, center, by, closed, warn_if_unsorted @@ -6649,6 +7906,10 @@ def rolling_mean( .. 
warning:: If passed, the column must be sorted in ascending order. Otherwise, results will not be correct. + + .. deprecated:: 0.20.24 + Passing `by` to `rolling_mean` is deprecated - please use + :meth:`.rolling_mean_by` instead. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive); only applicable if `by` has been set (in which case, it defaults to `'right'`). @@ -6755,7 +8016,7 @@ def rolling_mean( ... rolling_row_mean=pl.col("index").rolling_mean( ... window_size="2h", by="date" ... ) - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬──────────────────┐ │ index ┆ date ┆ rolling_row_mean │ @@ -6781,7 +8042,7 @@ def rolling_mean( ... rolling_row_mean=pl.col("index").rolling_mean( ... window_size="2h", by="date", closed="both" ... ) - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬──────────────────┐ │ index ┆ date ┆ rolling_row_mean │ @@ -6805,6 +8066,12 @@ def rolling_mean( window_size, min_periods = _prepare_rolling_window_args( window_size, min_periods ) + if by is not None: + issue_deprecation_warning( + "Passing `by` to `rolling_mean` is deprecated. Instead of " + "`rolling_mean(..., by='foo')`, please use `rolling_mean_by('foo', ...)`.", + version="0.20.24", + ) return self._from_pyexpr( self._pyexpr.rolling_mean( window_size, @@ -6892,6 +8159,14 @@ def rolling_sum( If the `window_size` is temporal for instance `"5h"` or `"3s"`, you must set the column that will be used to determine the windows. This column must of dtype `{Date, Datetime}` + + .. warning:: + If passed, the column must be sorted in ascending order. Otherwise, + results will not be correct. + + .. deprecated:: 0.20.24 + Passing `by` to `rolling_sum` is deprecated - please use + :meth:`.rolling_sum_by` instead. 
closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive); only applicable if `by` has been set (in which case, it defaults to `'right'`). @@ -6996,7 +8271,7 @@ def rolling_sum( >>> df_temporal.with_columns( ... rolling_row_sum=pl.col("index").rolling_sum(window_size="2h", by="date") - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_sum │ @@ -7022,7 +8297,7 @@ def rolling_sum( ... rolling_row_sum=pl.col("index").rolling_sum( ... window_size="2h", by="date", closed="both" ... ) - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_sum │ @@ -7046,6 +8321,12 @@ def rolling_sum( window_size, min_periods = _prepare_rolling_window_args( window_size, min_periods ) + if by is not None: + issue_deprecation_warning( + "Passing `by` to `rolling_sum` is deprecated. Instead of " + "`rolling_sum(..., by='foo')`, please use `rolling_sum_by('foo', ...)`.", + version="0.20.24", + ) return self._from_pyexpr( self._pyexpr.rolling_sum( window_size, weights, min_periods, center, by, closed, warn_if_unsorted @@ -7129,6 +8410,10 @@ def rolling_std( .. warning:: If passed, the column must be sorted in ascending order. Otherwise, results will not be correct. + + .. deprecated:: 0.20.24 + Passing `by` to `rolling_std` is deprecated - please use + :meth:`.rolling_std_by` instead. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive); only applicable if `by` has been set (in which case, it defaults to `'right'`). @@ -7235,7 +8520,7 @@ def rolling_std( >>> df_temporal.with_columns( ... rolling_row_std=pl.col("index").rolling_std(window_size="2h", by="date") - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_std │ @@ -7261,7 +8546,7 @@ def rolling_std( ... 
rolling_row_std=pl.col("index").rolling_std( ... window_size="2h", by="date", closed="both" ... ) - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_std │ @@ -7285,6 +8570,12 @@ def rolling_std( window_size, min_periods = _prepare_rolling_window_args( window_size, min_periods ) + if by is not None: + issue_deprecation_warning( + "Passing `by` to `rolling_std` is deprecated. Instead of " + "`rolling_std(..., by='foo')`, please use `rolling_std_by('foo', ...)`.", + version="0.20.24", + ) return self._from_pyexpr( self._pyexpr.rolling_std( window_size, @@ -7374,6 +8665,10 @@ def rolling_var( .. warning:: If passed, the column must be sorted in ascending order. Otherwise, results will not be correct. + + .. deprecated:: 0.20.24 + Passing `by` to `rolling_var` is deprecated - please use + :meth:`.rolling_var_by` instead. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive); only applicable if `by` has been set (in which case, it defaults to `'right'`). @@ -7480,7 +8775,7 @@ def rolling_var( >>> df_temporal.with_columns( ... rolling_row_var=pl.col("index").rolling_var(window_size="2h", by="date") - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_var │ @@ -7506,7 +8801,7 @@ def rolling_var( ... rolling_row_var=pl.col("index").rolling_var( ... window_size="2h", by="date", closed="both" ... ) - ... ) + ... ) # doctest:+SKIP shape: (25, 3) ┌───────┬─────────────────────┬─────────────────┐ │ index ┆ date ┆ rolling_row_var │ @@ -7530,6 +8825,12 @@ def rolling_var( window_size, min_periods = _prepare_rolling_window_args( window_size, min_periods ) + if by is not None: + issue_deprecation_warning( + "Passing `by` to `rolling_var` is deprecated. 
Instead of " + "`rolling_var(..., by='foo')`, please use `rolling_var_by('foo', ...)`.", + version="0.20.24", + ) return self._from_pyexpr( self._pyexpr.rolling_var( window_size, @@ -7619,6 +8920,10 @@ def rolling_median( .. warning:: If passed, the column must be sorted in ascending order. Otherwise, results will not be correct. + + .. deprecated:: 0.20.24 + Passing `by` to `rolling_median` is deprecated - please use + :meth:`.rolling_median_by` instead. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive); only applicable if `by` has been set (in which case, it defaults to `'right'`). @@ -7695,6 +9000,12 @@ def rolling_median( window_size, min_periods = _prepare_rolling_window_args( window_size, min_periods ) + if by is not None: + issue_deprecation_warning( + "Passing `by` to `rolling_median` is deprecated. Instead of " + "`rolling_median(..., by='foo')`, please use `rolling_median_by('foo', ...)`.", + version="0.20.24", + ) return self._from_pyexpr( self._pyexpr.rolling_median( window_size, weights, min_periods, center, by, closed, warn_if_unsorted @@ -7782,6 +9093,10 @@ def rolling_quantile( .. warning:: If passed, the column must be sorted in ascending order. Otherwise, results will not be correct. + + .. deprecated:: 0.20.24 + Passing `by` to `rolling_quantile` is deprecated - please use + :meth:`.rolling_quantile_by` instead. closed : {'left', 'right', 'both', 'none'} Define which sides of the temporal interval are closed (inclusive); only applicable if `by` has been set (in which case, it defaults to `'right'`). @@ -7886,6 +9201,12 @@ def rolling_quantile( window_size, min_periods = _prepare_rolling_window_args( window_size, min_periods ) + if by is not None: + issue_deprecation_warning( + "Passing `by` to `rolling_quantile` is deprecated. 
Instead of " + "`rolling_quantile(..., by='foo')`, please use `rolling_quantile_by('foo', ...)`.", + version="0.20.24", + ) return self._from_pyexpr( self._pyexpr.rolling_quantile( quantile, diff --git a/py-polars/tests/parametric/test_groupby_rolling.py b/py-polars/tests/parametric/test_groupby_rolling.py index 39836c388014..9b1eb75e20c6 100644 --- a/py-polars/tests/parametric/test_groupby_rolling.py +++ b/py-polars/tests/parametric/test_groupby_rolling.py @@ -134,9 +134,9 @@ def test_rolling_aggs( ) ) df = dataframe.sort("ts") - func = f"rolling_{aggregation}" + func = f"rolling_{aggregation}_by" result = df.with_columns( - getattr(pl.col("value"), func)(window_size=window_size, by="ts", closed=closed) + getattr(pl.col("value"), func)("ts", window_size=window_size, closed=closed) ) expected_dict: dict[str, list[object]] = {"ts": [], "value": []} diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index 06c0de8d339f..79ef41e9a8cf 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -2965,7 +2965,7 @@ def test_rolling_duplicates() -> None: "value": [0, 1], } ) - assert df.sort("ts").with_columns(pl.col("value").rolling_max("1d", by="ts"))[ + assert df.sort("ts").with_columns(pl.col("value").rolling_max_by("ts", "1d"))[ "value" ].to_list() == [1, 1] diff --git a/py-polars/tests/unit/operations/rolling/test_rolling.py b/py-polars/tests/unit/operations/rolling/test_rolling.py index 58dde79db827..bc69f1d5ca0b 100644 --- a/py-polars/tests/unit/operations/rolling/test_rolling.py +++ b/py-polars/tests/unit/operations/rolling/test_rolling.py @@ -46,12 +46,12 @@ def test_rolling_kernels_and_rolling( # null here # where the sum aggregation of an empty set is 0 pl.col("values") - .rolling_sum(period, by="dt", closed=closed) + .rolling_sum_by("dt", period, closed=closed) .fill_null(0) .alias("sum"), - pl.col("values").rolling_var(period, by="dt", 
closed=closed).alias("var"), - pl.col("values").rolling_mean(period, by="dt", closed=closed).alias("mean"), - pl.col("values").rolling_std(period, by="dt", closed=closed).alias("std"), + pl.col("values").rolling_var_by("dt", period, closed=closed).alias("var"), + pl.col("values").rolling_mean_by("dt", period, closed=closed).alias("mean"), + pl.col("values").rolling_std_by("dt", period, closed=closed).alias("std"), ] ) out2 = ( @@ -208,9 +208,10 @@ def test_rolling_crossing_dst( datetime(2021, 11, 5), datetime(2021, 11, 10), "1d", time_zone="UTC", eager=True ).dt.replace_time_zone(time_zone) df = pl.DataFrame({"ts": ts, "value": [1, 2, 3, 4, 5, 6]}) - result = df.with_columns( - getattr(pl.col("value"), rolling_fn)("1d", by="ts", closed="left") - ) + with pytest.deprecated_call(match=f"{rolling_fn}_by"): + result = df.with_columns( + getattr(pl.col("value"), rolling_fn)("1d", by="ts", closed="left") + ) expected = pl.DataFrame( {"ts": ts, "value": expected_values}, schema_overrides={"value": expected_dtype} ) @@ -221,11 +222,11 @@ def test_rolling_by_invalid() -> None: df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).sort("a") msg = "in `rolling_min` operation, `by` argument of dtype `i64` is not supported" with pytest.raises(InvalidOperationError, match=msg): - df.select(pl.col("b").rolling_min(2, by="a")) + df.select(pl.col("b").rolling_min_by("a", 2)) # type: ignore[arg-type] df = pl.DataFrame({"a": [1, 2, 3], "b": [date(2020, 1, 1)] * 3}).sort("b") msg = "if `by` argument is passed, then `window_size` must be a temporal window" with pytest.raises(InvalidOperationError, match=msg): - df.select(pl.col("a").rolling_min(2, by="b")) + df.select(pl.col("a").rolling_min_by("b", 2)) # type: ignore[arg-type] def test_rolling_infinity() -> None: @@ -249,7 +250,7 @@ def test_rolling_by_non_temporal_window_size() -> None: ).sort("a", "b") msg = "if `by` argument is passed, then `window_size` must be a temporal window" with pytest.raises(InvalidOperationError, match=msg): 
- df.with_columns(pl.col("a").rolling_sum(2, by="b", closed="left")) + df.with_columns(pl.col("a").rolling_sum_by("b", 2, closed="left")) # type: ignore[arg-type] def test_rolling_by_weights() -> None: @@ -257,8 +258,9 @@ def test_rolling_by_weights() -> None: {"a": [4, 5, 6], "b": [date(2020, 1, 1), date(2020, 1, 2), date(2020, 1, 3)]} ).sort("b") msg = r"`weights` is not supported in 'rolling_\*\(..., by=...\)' expression" - with pytest.raises(InvalidOperationError, match=msg): - df.with_columns(pl.col("a").rolling_sum("2d", by="b", weights=[1, 2])) + with pytest.raises(InvalidOperationError, match=msg): # noqa: SIM117 + with pytest.deprecated_call(match="rolling_sum_by"): + df.with_columns(pl.col("a").rolling_sum("2d", by="b", weights=[1, 2])) def test_rolling_extrema() -> None: @@ -566,11 +568,11 @@ def test_rolling_negative_period() -> None: ).collect() with pytest.raises(ComputeError, match="window size should be strictly positive"): df.select( - pl.col("value").rolling_min(by="ts", window_size="-1d", closed="left") + pl.col("value").rolling_min_by("ts", window_size="-1d", closed="left") ) with pytest.raises(ComputeError, match="window size should be strictly positive"): df.lazy().select( - pl.col("value").rolling_min(by="ts", window_size="-1d", closed="left") + pl.col("value").rolling_min_by("ts", window_size="-1d", closed="left") ).collect() @@ -612,7 +614,7 @@ def test_rolling_empty_window_9406(time_unit: TimeUnit) -> None: df.select( [ pl.col("d"), - pl.col("x").rolling_max(by="d", window_size="3d", closed="left"), + pl.col("x").rolling_max_by("d", window_size="3d", closed="left"), ] ), ) @@ -621,7 +623,7 @@ def test_rolling_empty_window_9406(time_unit: TimeUnit) -> None: df.select( [ pl.col("d"), - pl.col("x").rolling_min(by="d", window_size="3d", closed="left"), + pl.col("x").rolling_min_by("d", window_size="3d", closed="left"), ] ), ) @@ -660,7 +662,7 @@ def test_rolling_aggregations_unsorted_raise_10991() -> None: with pytest.warns( UserWarning, 
match="Series is not known to be sorted by `by` column." ): - df.with_columns(roll=pl.col("val").rolling_sum("2d", by="dt")) + df.with_columns(roll=pl.col("val").rolling_sum_by("dt", "2d")) def test_rolling_aggregations_with_over_11225() -> None: @@ -677,9 +679,9 @@ def test_rolling_aggregations_with_over_11225() -> None: result = df_temporal.with_columns( rolling_row_mean=pl.col("index") - .rolling_mean( - window_size="2d", + .rolling_mean_by( by="date", + window_size="2d", closed="left", warn_if_unsorted=False, ) @@ -773,7 +775,7 @@ def test_rolling_by_date() -> None: } ).sort("dt") - result = df.with_columns(roll=pl.col("val").rolling_sum("2d", by="dt")) + result = df.with_columns(roll=pl.col("val").rolling_sum_by("dt", "2d")) expected = df.with_columns(roll=pl.Series([1, 3, 5])) assert_frame_equal(result, expected) @@ -790,7 +792,7 @@ def test_rolling_nanoseconds_11003() -> None: } ) df = df.with_columns(pl.col("dt").str.to_datetime(time_unit="ns")).set_sorted("dt") - result = df.with_columns(pl.col("val").rolling_sum("500ns", by="dt")) + result = df.with_columns(pl.col("val").rolling_sum_by("dt", "500ns")) expected = df.with_columns(val=pl.Series([1, 3, 6])) assert_frame_equal(result, expected) @@ -933,8 +935,8 @@ def test_rolling_min_periods( ) -> None: df = pl.DataFrame({"date": dates, "value": [1, 2, 3]}).sort("date") result = df.select( - pl.col("value").rolling_sum( - window_size="2d", by="date", min_periods=2, closed=closed + pl.col("value").rolling_sum_by( + "date", window_size="2d", min_periods=2, closed=closed ) )["value"] assert_series_equal(result, pl.Series("value", expected, pl.Int64)) @@ -948,7 +950,7 @@ def test_rolling_returns_scalar_15656() -> None: "c": [1, 2, 3], } ) - result = df.group_by("c").agg(pl.col("b").rolling_mean("2d", by="a")).sort("c") + result = df.group_by("c").agg(pl.col("b").rolling_mean_by("a", "2d")).sort("c") expected = pl.DataFrame({"c": [1, 2, 3], "b": [[4.0], [5.0], [6.0]]}) assert_frame_equal(result, expected)