diff --git a/NEWS.md b/NEWS.md index 518b3dd53..195c86366 100644 --- a/NEWS.md +++ b/NEWS.md @@ -31,14 +31,17 @@ - New methods `$write_json()` and `$write_ndjson()` for DataFrame (#502). - Removed argument `name` in `pl$date_range()`, which was deprecated for a while (#503). -- New private method `.pr$DataFrame$drop_all_in_place(df)` to drop `DataFrame` in-place, - to release memory without invoking gc(). However, if there are other strong references to any of - the underlying Series or arrow arrays, that memory will specifically not be released. This method - is aimed for r-polars extensions, and will be kept stable as much as possible (#504). +- New private method `.pr$DataFrame$drop_all_in_place(df)` to drop `DataFrame` + in-place, to release memory without invoking gc(). However, if there are other + strong references to any of the underlying Series or arrow arrays, that memory + will specifically not be released. This method is aimed for r-polars extensions, + and will be kept stable as much as possible (#504). - New functions `pl$min_horizontal()`, `pl$max_horizontal()`, `pl$sum_horizontal()`, `pl$all_horizontal()`, `pl$any_horizontal()` (#508). -- New generic functions `as_polars_df()` and `as_polars_lf()` to create polars DataFrames - and LazyFrames (#519). +- New generic functions `as_polars_df()` and `as_polars_lf()` to create polars + DataFrames and LazyFrames (#519). +- New method `$rolling()` to apply an Expr over a rolling window based on + date/datetime/numeric indices (#470). # polars 0.10.1 diff --git a/R/expr__expr.R b/R/expr__expr.R index f4c592a39..dd932fa25 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -3358,3 +3358,93 @@ Expr_peak_min = function() { Expr_peak_max = function() { .pr$Expr$peak_max(self) } + +#' Create rolling groups based on a time or numeric column +#' +#' @description +#' If you have a time series ``, then by default the windows +#' created will be: +#' * (t_0 - period, t_0] +#' * (t_1 - period, t_1] +#' * … +#' * (t_n - period, t_n] +#' +#' whereas if you pass a non-default offset, then the windows will be: +#' * (t_0 + offset, t_0 + offset + period] +#' * (t_1 + offset, t_1 + offset + period] +#' * … +#' * (t_n + offset, t_n + offset + period] +#' +#' @param index_column Column used to group based on the time window. Often of +#' type Date/Datetime. This column must be sorted in ascending order. If this +#' column represents an index, it has to be either Int32 or Int64. Note that +#' Int32 gets temporarily cast to Int64, so if performance matters use an Int64 +#' column. +#' @param period Length of the window, must be non-negative. +#' @param offset Offset of the window. Default is `-period`. +#' @param closed Define which sides of the temporal interval are closed +#' (inclusive). This can be either `"left"`, `"right"`, `"both"` or `"none"`. +#' @param check_sorted Check whether data is actually sorted. Checking it is +#' expensive so if you are sure the data within the `index_column` is sorted, you +#' can set this to `FALSE` but note that if the data actually is unsorted, it +#' will lead to incorrect output. +#' +#' @details +#' The period and offset arguments are created either from a timedelta, or by +#' using the following string language: +#' * 1ns (1 nanosecond) +#' * 1us (1 microsecond) +#' * 1ms (1 millisecond) +#' * 1s (1 second) +#' * 1m (1 minute) +#' * 1h (1 hour) +#' * 1d (1 calendar day) +#' * 1w (1 calendar week) +#' * 1mo (1 calendar month) +#' * 1q (1 calendar quarter) +#' * 1y (1 calendar year) +#' * 1i (1 index count) +#' +#' Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds +#' +#' By "calendar day", we mean the corresponding time on the next day (which may +#' not be 24 hours, due to daylight savings). Similarly for "calendar week", +#' "calendar month", "calendar quarter", and "calendar year". +#' +#' In case of a rolling operation on an integer column, the windows are defined +#' by: +#' * "1i" # length 1 +#' * "10i" # length 10 +#' +#' @return Expr +#' +#' @examples +#' # create a DataFrame with a Datetime column and an f64 column +#' dates = c("2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", +#' "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43") +#' +#' df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ +#' with_columns( +#' pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() +#' ) +#' +#' df$with_columns( +#' sum_a=pl$sum("a")$rolling(index_column="dt", period="2d"), +#' min_a=pl$min("a")$rolling(index_column="dt", period="2d"), +#' max_a=pl$max("a")$rolling(index_column="dt", period="2d") +#' ) +#' +#' # we can use "offset" to change the start of the window period. Here, with +#' # offset = "1d", we start the window one day after the value in "dt", and +#' # then we add a 2-day window relative to the window start. +#' df$with_columns( +#' sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = "2d", offset = "1d") +#' ) +Expr_rolling = function(index_column, period, offset = NULL, + closed = "right", check_sorted = TRUE) { + if (is.null(offset)) { + offset = paste0("-", period) + } + .pr$Expr$rolling(self, index_column, period, offset, closed, check_sorted) |> + unwrap("in $rolling():") +} diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 0d09cdecf..4e5941c70 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -541,6 +541,8 @@ Expr$rolling_quantile <- function(quantile, interpolation, window_size, weights, Expr$rolling_skew <- function(window_size_f, bias) .Call(wrap__Expr__rolling_skew, self, window_size_f, bias) +Expr$rolling <- function(index_column, period, offset, closed, check_sorted) .Call(wrap__Expr__rolling, self, index_column, period, offset, closed, check_sorted) + Expr$abs <- function() .Call(wrap__Expr__abs, self) Expr$rank <- function(method, descending) .Call(wrap__Expr__rank, self, method, descending) diff --git a/man/Expr_rolling.Rd b/man/Expr_rolling.Rd new file mode 100644 index 000000000..ef0206d1b --- /dev/null +++ b/man/Expr_rolling.Rd @@ -0,0 +1,108 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expr__expr.R +\name{Expr_rolling} +\alias{Expr_rolling} +\title{Create rolling groups based on a time or numeric column} +\usage{ +Expr_rolling( + index_column, + period, + offset = NULL, + closed = "right", + check_sorted = TRUE +) +} +\arguments{ +\item{index_column}{Column used to group based on the time window. Often of +type Date/Datetime. This column must be sorted in ascending order. If this +column represents an index, it has to be either Int32 or Int64. Note that +Int32 gets temporarily cast to Int64, so if performance matters use an Int64 +column.} + +\item{period}{Length of the window, must be non-negative.} + +\item{offset}{Offset of the window. Default is \code{-period}.} + +\item{closed}{Define which sides of the temporal interval are closed +(inclusive). This can be either \code{"left"}, \code{"right"}, \code{"both"} or \code{"none"}.} + +\item{check_sorted}{Check whether data is actually sorted. Checking it is +expensive so if you are sure the data within the \code{index_column} is sorted, you +can set this to \code{FALSE} but note that if the data actually is unsorted, it +will lead to incorrect output.} +} +\value{ +Expr +} +\description{ +If you have a time series \verb{}, then by default the windows +created will be: +\itemize{ +\item (t_0 - period, t_0] +\item (t_1 - period, t_1] +\item … +\item (t_n - period, t_n] +} + +whereas if you pass a non-default offset, then the windows will be: +\itemize{ +\item (t_0 + offset, t_0 + offset + period] +\item (t_1 + offset, t_1 + offset + period] +\item … +\item (t_n + offset, t_n + offset + period] +} +} +\details{ +The period and offset arguments are created either from a timedelta, or by +using the following string language: +\itemize{ +\item 1ns (1 nanosecond) +\item 1us (1 microsecond) +\item 1ms (1 millisecond) +\item 1s (1 second) +\item 1m (1 minute) +\item 1h (1 hour) +\item 1d (1 calendar day) +\item 1w (1 calendar week) +\item 1mo (1 calendar month) +\item 1q (1 calendar quarter) +\item 1y (1 calendar year) +\item 1i (1 index count) +} + +Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + +By "calendar day", we mean the corresponding time on the next day (which may +not be 24 hours, due to daylight savings). Similarly for "calendar week", +"calendar month", "calendar quarter", and "calendar year". + +In case of a rolling operation on an integer column, the windows are defined +by: +\itemize{ +\item "1i" # length 1 +\item "10i" # length 10 +} +} +\examples{ +# create a DataFrame with a Datetime column and an f64 column +dates = c("2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", + "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43") + +df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ + with_columns( + pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "\%Y-\%m-\%d \%H:\%M:\%S")$set_sorted() + ) + +df$with_columns( + sum_a=pl$sum("a")$rolling(index_column="dt", period="2d"), + min_a=pl$min("a")$rolling(index_column="dt", period="2d"), + max_a=pl$max("a")$rolling(index_column="dt", period="2d") +) + +# we can use "offset" to change the start of the window period. Here, with +# offset = "1d", we start the window one day after the value in "dt", and +# then we add a 2-day window relative to the window start. +df$with_columns( + sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = "2d", offset = "1d") +) +} diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 10d3b2339..197ea38c5 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -1,10 +1,8 @@ use crate::concurrent::RFnSignature; -use crate::rdatatype::literal_to_any_value; -use crate::rdatatype::new_null_behavior; -use crate::rdatatype::new_rank_method; -use crate::rdatatype::new_rolling_cov_options; -use crate::rdatatype::robj_to_timeunit; -use crate::rdatatype::{DataTypeVector, RPolarsDataType}; +use crate::rdatatype::{ + literal_to_any_value, new_null_behavior, new_rank_method, + new_rolling_cov_options, robj_to_timeunit, DataTypeVector, RPolarsDataType, +}; use crate::robj_to; use crate::rpolarserr::polars_to_rpolars_err; use crate::rpolarserr::{rerr, rpolars_to_polars_err, RResult, Rctx, WithRctx}; @@ -18,7 +16,10 @@ use crate::utils::{try_f64_into_i64, try_f64_into_u32, try_f64_into_usize}; use crate::CONFIG; use extendr_api::{extendr, prelude::*, rprintln, Deref, DerefMut, Rinternals}; use pl::PolarsError as pl_error; -use pl::{BinaryNameSpaceImpl, DurationMethods, IntoSeries, TemporalMethods, Utf8NameSpaceImpl}; +use pl::{ + BinaryNameSpaceImpl, Duration, DurationMethods, IntoSeries, RollingGroupOptions, + TemporalMethods, Utf8NameSpaceImpl, +}; use polars::lazy::dsl; use polars::prelude as pl; use polars::prelude::SortOptions; @@ -2370,6 +2371,31 @@ impl Expr { ) .into()) } + + pub fn rolling( + &self, + index_column: Robj, + period: Robj, + offset: Robj, + closed: Robj, + check_sorted: Robj, + ) -> RResult { + let index_column = robj_to!(String, index_column)?.into(); + let period = Duration::parse(robj_to!(str, period)?); + let offset = Duration::parse(robj_to!(str, offset)?); + let closed_window = robj_to!(ClosedWindow, closed)?; + let check_sorted = robj_to!(bool, check_sorted)?; + + let options = RollingGroupOptions { + index_column, + period, + offset, + closed_window, + check_sorted, + }; + + Ok(self.0.clone().rolling(options).into()) + } } // handle varition in implementation if not full_features diff --git a/tests/testthat/test-expr.R b/tests/testthat/test-expr.R index 0b46cee1f..de84ba582 100644 --- a/tests/testthat/test-expr.R +++ b/tests/testthat/test-expr.R @@ -2444,3 +2444,107 @@ test_that("pl$min_horizontal works", { list(min = c(1, 2, 2, -Inf)) ) }) + +test_that("rolling, basic", { + dates = c("2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", + "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43") + + df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ + with_columns( + pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() + ) + + out = df$with_columns( + sum_a = pl$sum("a")$rolling(index_column = "dt", period = "2d"), + min_a = pl$min("a")$rolling(index_column = "dt", period = "2d"), + max_a = pl$max("a")$rolling(index_column = "dt", period = "2d"), + mean_a = pl$mean("a")$rolling(index_column = "dt", period = "2d") + )$select("sum_a", "min_a", "max_a", "mean_a")$to_data_frame() + + expect_identical( + out, + data.frame( + sum_a = c(3, 10, 15, 24, 11, 1), + min_a = c(3, 3, 3, 3, 2, 1), + max_a = c(3, 7, 7, 9, 9, 1), + mean_a = c(3, 5, 5, 6, 5.5, 1) + ) + ) +}) + +test_that("rolling, arg closed", { + dates = c("2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", + "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43") + + df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ + with_columns( + pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() + ) + + out = df$with_columns( + sum_a_left = pl$sum("a")$rolling(index_column = "dt", period = "2d", closed = "left"), + sum_a_both = pl$sum("a")$rolling(index_column = "dt", period = "2d", closed = "both"), + sum_a_none = pl$sum("a")$rolling(index_column = "dt", period = "2d", closed = "none"), + sum_a_right = pl$sum("a")$rolling(index_column = "dt", period = "2d", closed = "right") + )$select("sum_a_left", "sum_a_both", "sum_a_none", "sum_a_right")$to_data_frame() + + expect_identical( + out, + data.frame( + sum_a_left = c(0, 3, 10, 15, 9, 0), + sum_a_both = c(3, 10, 15, 24, 11, 1), + sum_a_none = c(0, 3, 10, 15, 9, 0), + sum_a_right = c(3, 10, 15, 24, 11, 1) + ) + ) +}) + +test_that("rolling, arg offset", { + dates = c("2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09", + "2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43") + + df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ + with_columns( + pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S")$set_sorted() + ) + + # with offset = "1d", we start the window at one or two days after the value + # in "dt", and then we add a 2-day window relative to the window start. + out = df$with_columns( + sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = "2d", offset = "1d"), + sum_a_offset2 = pl$sum("a")$rolling(index_column = "dt", period = "2d", offset = "2d") + )$select("sum_a_offset1", "sum_a_offset2")$to_data_frame() + + expect_identical( + out, + data.frame( + sum_a_offset1 = c(11, 11, 11, 2, NA, NA), + sum_a_offset2 = c(2, 2, 2, NA, NA, NA) + ) + ) +}) + +test_that("rolling, arg check_sorted", { + dates = c("2020-01-02 18:12:48", "2020-01-03 19:45:32", "2020-01-08 23:16:43", + "2020-01-01 13:45:48", "2020-01-01 16:42:13", "2020-01-01 16:45:09") + + df = pl$DataFrame(dt = dates, a = c(3, 7, 5, 9, 2, 1))$ + with_columns( + pl$col("dt")$str$strptime(pl$Datetime(tu = "us"), format = "%Y-%m-%d %H:%M:%S") + ) + + expect_error( + df$with_columns( + sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = "2d") + ), + "is not explicitly sorted" + ) + + # no error message but wrong output + expect_no_error( + df$with_columns(pl$col("dt")$set_sorted())$with_columns( + sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = "2d", + check_sorted = FALSE) + ) + ) +})