From e7c6ab342f22fffc68c046604ae93a6dfe0e177e Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 11 Jan 2024 13:28:55 +0100 Subject: [PATCH 01/14] init --- R/extendr-wrappers.R | 2 + R/lazyframe__lazy.R | 34 +++++++++++ man/LazyFrame_group_by_dynamic.Rd | 93 +++++++++++++++++++++++++++++++ src/rust/src/lazy/dataframe.rs | 38 +++++++++++++ src/rust/src/rdatatype.rs | 30 ++++++++++ src/rust/src/utils/mod.rs | 6 ++ 6 files changed, 203 insertions(+) create mode 100644 man/LazyFrame_group_by_dynamic.Rd diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 7740fd58a..4788a0c7f 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1117,6 +1117,8 @@ RPolarsLazyFrame$with_context <- function(contexts) .Call(wrap__RPolarsLazyFrame RPolarsLazyFrame$rolling <- function(index_column, period, offset, closed, by, check_sorted) .Call(wrap__RPolarsLazyFrame__rolling, self, index_column, period, offset, closed, by, check_sorted) +RPolarsLazyFrame$group_by_dynamic <- function(index_column, every, period, offset, label, include_boundaries, closed, by, start_by, check_sorted) .Call(wrap__RPolarsLazyFrame__group_by_dynamic, self, index_column, every, period, offset, label, include_boundaries, closed, by, start_by, check_sorted) + #' @export `$.RPolarsLazyFrame` <- function (self, name) { func <- RPolarsLazyFrame[[name]]; environment(func) <- environment(); func } diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index ede534787..cc5ffe7e4 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1654,3 +1654,37 @@ LazyFrame_rolling = function(index_column, period, offset = NULL, closed = "righ ) |> unwrap("in $rolling():") } + + +#' Group based on a date/time or integer column +#' +#' @inherit LazyFrame_rolling description details params +#' +#' +#' @return A [LazyGroupBy][LazyGroupBy_class] object +#' +#' @examples +LazyFrame_group_by_dynamic = function( + index_column, + every, + period = NULL, + offset = NULL, + include_boundaries = FALSE, + closed = "left", + label = "left", + by = NULL, + start_by = "window", + check_sorted = TRUE +) { + if (is.null(offset)) { + offset = paste0("-", every) + } + if (is.null(period)) { + period = every + } + .pr$LazyFrame$group_by_dynamic( + self, index_column, every, period, offset, label, include_boundaries, closed, + wrap_elist_result(by, str_to_lit = FALSE), start_by, check_sorted + ) |> + unwrap("in $group_by_dynamic():") +} diff --git a/man/LazyFrame_group_by_dynamic.Rd b/man/LazyFrame_group_by_dynamic.Rd new file mode 100644 index 000000000..093d3fc65 --- /dev/null +++ b/man/LazyFrame_group_by_dynamic.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe__lazy.R +\name{LazyFrame_group_by_dynamic} +\alias{LazyFrame_group_by_dynamic} +\title{Group based on a date/time or integer column} +\usage{ +LazyFrame_group_by_dynamic( + index_column, + every, + period = NULL, + offset = NULL, + include_boundaries = FALSE, + closed = "left", + label = "left", + by = NULL, + start_by = "window", + check_sorted = TRUE +) +} +\arguments{ +\item{index_column}{Column used to group based on the time window. Often of +type Date/Datetime. This column must be sorted in ascending order (or, if \code{by} +is specified, then it must be sorted in ascending order within each group). In +case of a rolling group by on indices, dtype needs to be either Int32 or Int64. +Note that Int32 gets temporarily cast to Int64, so if performance matters use +an Int64 column.} + +\item{period}{Length of the window, must be non-negative.} + +\item{offset}{Offset of the window. Default is \code{-period}.} + +\item{closed}{Define which sides of the temporal interval are closed +(inclusive). This can be either \code{"left"}, \code{"right"}, \code{"both"} or \code{"none"}.} + +\item{by}{Also group by this column/these columns.} + +\item{check_sorted}{Check whether data is actually sorted. Checking it is +expensive so if you are sure the data within the \code{index_column} is sorted, you +can set this to \code{FALSE} but note that if the data actually is unsorted, it +will lead to incorrect output.} +} +\value{ +A \link[=LazyGroupBy_class]{LazyGroupBy} object +} +\description{ +If you have a time series \verb{}, then by default the windows +created will be: +\itemize{ +\item (t_0 - period, t_0] +\item (t_1 - period, t_1] +\item … +\item (t_n - period, t_n] +} + +whereas if you pass a non-default offset, then the windows will be: +\itemize{ +\item (t_0 + offset, t_0 + offset + period] +\item (t_1 + offset, t_1 + offset + period] +\item … +\item (t_n + offset, t_n + offset + period] +} +} +\details{ +The period and offset arguments are created either from a timedelta, or by +using the following string language: +\itemize{ +\item 1ns (1 nanosecond) +\item 1us (1 microsecond) +\item 1ms (1 millisecond) +\item 1s (1 second) +\item 1m (1 minute) +\item 1h (1 hour) +\item 1d (1 calendar day) +\item 1w (1 calendar week) +\item 1mo (1 calendar month) +\item 1q (1 calendar quarter) +\item 1y (1 calendar year) +\item 1i (1 index count) +} + +Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + +By "calendar day", we mean the corresponding time on the next day (which may +not be 24 hours, due to daylight savings). Similarly for "calendar week", +"calendar month", "calendar quarter", and "calendar year". + +In case of a rolling operation on an integer column, the windows are defined +by: +\itemize{ +\item "1i" # length 1 +\item "10i" # length 10 +} +} diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index eec512077..159182483 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -641,6 +641,44 @@ impl RPolarsLazyFrame { opt_state: self.0.get_current_optimizations(), }) } + + pub fn group_by_dynamic( + &self, + index_column: Robj, + every: Robj, + period: Robj, + offset: Robj, + label: Robj, + include_boundaries: Robj, + closed: Robj, + by: Robj, + start_by: Robj, + check_sorted: Robj, + ) -> RResult { + let closed_window = robj_to!(ClosedWindow, closed)?; + let by = robj_to!(VecPLExprCol, by)?; + let ldf = self.0.clone(); + let lazy_gb = ldf.group_by_dynamic( + robj_to!(PLExprCol, index_column)?, + by, + pl::DynamicGroupOptions { + every: robj_to!(pl_duration, every)?, + period: robj_to!(pl_duration, period)?, + offset: robj_to!(pl_duration, offset)?, + label: robj_to!(Label, label)?, + include_boundaries: robj_to!(bool, include_boundaries)?, + closed_window, + start_by: robj_to!(StartBy, start_by)?, + check_sorted: robj_to!(bool, check_sorted)?, + ..Default::default() + }, + ); + + Ok(RPolarsLazyGroupBy { + lgb: lazy_gb, + opt_state: self.0.get_current_optimizations(), + }) + } } #[derive(Clone)] diff --git a/src/rust/src/rdatatype.rs b/src/rust/src/rdatatype.rs index b4a7f91d8..47f332b7c 100644 --- a/src/rust/src/rdatatype.rs +++ b/src/rust/src/rdatatype.rs @@ -499,6 +499,36 @@ pub fn robj_to_closed_window(robj: Robj) -> RResult { } } +pub fn robj_to_label(robj: Robj) -> RResult { + use pl::Label; + match robj_to_rchoice(robj)?.as_str() { + "left" => Ok(Label::Left), + "right" => Ok(Label::Right), + "datapoint" => Ok(Label::DataPoint), + s => rerr().bad_val(format!( + "Label choice ['{s}'] should be one of 'left', 'right', 'datapoint'" + )), + } +} + +pub fn robj_to_start_by(robj: Robj) -> RResult { + use pl::StartBy as SB; + match robj_to_rchoice(robj)?.as_str() { + "window" => Ok(SB::WindowBound), + "datapoint" => Ok(SB::DataPoint), + "monday" => Ok(SB::Monday), + "tuesday" => Ok(SB::Tuesday), + "wednesday" => Ok(SB::Wednesday), + "thursday" => Ok(SB::Thursday), + "friday" => Ok(SB::Friday), + "saturday" => Ok(SB::Saturday), + "sunday" => Ok(SB::Sunday), + s => rerr().bad_val(format!( + "StartBy choice ['{s}'] should be one of 'window', 'datapoint', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'" + )), + } +} + pub fn robj_to_parallel_strategy(robj: extendr_api::Robj) -> RResult { use pl::ParallelStrategy as PS; match robj_to_rchoice(robj)?.to_lowercase().as_str() { diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index f77cac379..3df08d5f4 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -944,6 +944,12 @@ macro_rules! robj_to_inner { (ClosedWindow, $a:ident) => { $crate::rdatatype::robj_to_closed_window($a) }; + (Label, $a:ident) => { + $crate::rdatatype::robj_to_label($a) + }; + (StartBy, $a:ident) => { + $crate::rdatatype::robj_to_start_by($a) + }; (new_quantile_interpolation_option, $a:ident) => { $crate::rdatatype::new_quantile_interpolation_option($a) }; From afde2eb3fb93609b3419d3c134f53a8c5d041e72 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 11 Jan 2024 13:54:06 +0100 Subject: [PATCH 02/14] document --- R/lazyframe__lazy.R | 99 +++++++++++++++++++++++++++---- man/LazyFrame_group_by_dynamic.Rd | 88 +++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 11 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 72c95386a..a2ece9e18 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1658,22 +1658,99 @@ LazyFrame_rolling = function(index_column, period, offset = NULL, closed = "righ #' #' @inherit LazyFrame_rolling description details params #' +#' @param every Interval of the window. +#' @param include_boundaries Add two columns `"_lower_boundary"` and +#' `"_upper_boundary"` columns that show the boundaries of the window. This will +#' impact performance because it’s harder to parallelize. +#' @param label Define which label to use for the window: +#' * `"left"`: lower boundary of the window +#' * `"right"`: upper boundary of the window +#' * `"datapoint"`: the first value of the index column in the given window. If +#' you don’t need the label to be at one of the boundaries, choose this option +#' for maximum performance. +#' @param start_by The strategy to determine the start of the first window by: +#' * `"window"`: start by taking the earliest timestamp, truncating it with `every`, +#' and then adding `offset`. Note that weekly windows start on Monday. +#' * `"datapoint"`: start from the first encountered data point. +#' * a day of the week (only takes effect if `every` contains `"w"`): `"monday"` +#' starts the window on the Monday before the first data point, etc. #' #' @return A [LazyGroupBy][LazyGroupBy_class] object #' #' @examples +#' lf = pl$LazyFrame( +#' time = pl$date_range( +#' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' interval = "30m", +#' eager = TRUE, +#' ), +#' n = 0:6 +#' ) +#' lf$collect() +#' +#' # get the sum in the following hour relative to the "time" column +#' lf$group_by_dynamic("time", every = "1h")$agg( +#' vals = pl$col("n"), +#' sum = pl$col("n")$sum() +#' )$collect() +#' +#' # using "include_boundaries = TRUE" is helpful to see the period considered +#' lf$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( +#' vals = pl$col("n") +#' )$collect() +#' +#' # in the example above, the values didn't include the one *exactly* 1h after +#' # the start because "closed = 'left'" by default. +#' # Changing it to "right" includes values that are exactly 1h after. Note that +#' # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], +#' # even if this interval wasn't there originally +#' lf$group_by_dynamic("time", every = "1h", closed = "right")$agg( +#' vals = pl$col("n") +#' )$collect() +#' # To keep both boundaries, we use "closed = 'both'". Some values now belong to +#' # several groups: +#' lf$group_by_dynamic("time", every = "1h", closed = "both")$agg( +#' vals = pl$col("n") +#' )$collect() +#' +#' # Dynamic group bys can also be combined with grouping on normal keys +#' lf = lf$with_columns(groups = pl$Series(c("a", "a", "a", "b", "b", "a", "a"))) +#' lf$collect() +#' +#' lf$group_by_dynamic( +#' "time", +#' every = "1h", +#' closed = "both", +#' by = "groups", +#' include_boundaries = TRUE +#' )$agg(pl$col("n"))$collect() +#' +#' # We can also create a dynamic group by based on an index column +#' lf = pl$LazyFrame( +#' idx = 0:5, +#' A = c("A", "A", "B", "B", "B", "C") +#' )$with_columns(pl$col("idx")$set_sorted()) +#' lf$collect() +#' +#' lf$group_by_dynamic( +#' "idx", +#' every = "2i", +#' period = "3i", +#' include_boundaries = TRUE, +#' closed = "right" +#' )$agg(A_agg_list = pl$col("A"))$collect() LazyFrame_group_by_dynamic = function( - index_column, - every, - period = NULL, - offset = NULL, - include_boundaries = FALSE, - closed = "left", - label = "left", - by = NULL, - start_by = "window", - check_sorted = TRUE -) { + index_column, + every, + period = NULL, + offset = NULL, + include_boundaries = FALSE, + closed = "left", + label = "left", + by = NULL, + start_by = "window", + check_sorted = TRUE) { if (is.null(offset)) { offset = paste0("-", every) } diff --git a/man/LazyFrame_group_by_dynamic.Rd b/man/LazyFrame_group_by_dynamic.Rd index 093d3fc65..d3039b4d5 100644 --- a/man/LazyFrame_group_by_dynamic.Rd +++ b/man/LazyFrame_group_by_dynamic.Rd @@ -25,15 +25,39 @@ case of a rolling group by on indices, dtype needs to be either Int32 or Int64. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column.} +\item{every}{Interval of the window.} + \item{period}{Length of the window, must be non-negative.} \item{offset}{Offset of the window. Default is \code{-period}.} +\item{include_boundaries}{Add two columns \code{"_lower_boundary"} and +\code{"_upper_boundary"} columns that show the boundaries of the window. This will +impact performance because it’s harder to parallelize.} + \item{closed}{Define which sides of the temporal interval are closed (inclusive). This can be either \code{"left"}, \code{"right"}, \code{"both"} or \code{"none"}.} +\item{label}{Define which label to use for the window: +\itemize{ +\item \code{"left"}: lower boundary of the window +\item \code{"right"}: upper boundary of the window +\item \code{"datapoint"}: the first value of the index column in the given window. If +you don’t need the label to be at one of the boundaries, choose this option +for maximum performance. +}} + \item{by}{Also group by this column/these columns.} +\item{start_by}{The strategy to determine the start of the first window by: +\itemize{ +\item \code{"window"}: start by taking the earliest timestamp, truncating it with \code{every}, +and then adding \code{offset}. Note that weekly windows start on Monday. +\item \code{"datapoint"}: start from the first encountered data point. +\item a day of the week (only takes effect if \code{every} contains \code{"w"}): \code{"monday"} +starts the window on the Monday before the first data point, etc. +}} + \item{check_sorted}{Check whether data is actually sorted. Checking it is expensive so if you are sure the data within the \code{index_column} is sorted, you can set this to \code{FALSE} but note that if the data actually is unsorted, it @@ -91,3 +115,67 @@ by: \item "10i" # length 10 } } +\examples{ +lf = pl$LazyFrame( + time = pl$date_range( + start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + interval = "30m", + eager = TRUE, + ), + n = 0:6 +) +lf$collect() + +# get the sum in the following hour relative to the "time" column +lf$group_by_dynamic("time", every = "1h")$agg( + vals = pl$col("n"), + sum = pl$col("n")$sum() +)$collect() + +# using "include_boundaries = TRUE" is helpful to see the period considered +lf$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( + vals = pl$col("n") +)$collect() + +# in the example above, the values didn't include the one *exactly* 1h after +# the start because "closed = 'left'" by default. +# Changing it to "right" includes values that are exactly 1h after. Note that +# the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], +# even if this interval wasn't there originally +lf$group_by_dynamic("time", every = "1h", closed = "right")$agg( + vals = pl$col("n") +)$collect() +# To keep both boundaries, we use "closed = 'both'". Some values now belong to +# several groups: +lf$group_by_dynamic("time", every = "1h", closed = "both")$agg( + vals = pl$col("n") +)$collect() + +# Dynamic group bys can also be combined with grouping on normal keys +lf = lf$with_columns(groups = pl$Series(c("a", "a", "a", "b", "b", "a", "a"))) +lf$collect() + +lf$group_by_dynamic( + "time", + every = "1h", + closed = "both", + by = "groups", + include_boundaries = TRUE +)$agg(pl$col("n"))$collect() + +# We can also create a dynamic group by based on an index column +lf = pl$LazyFrame( + idx = 0:5, + A = c("A", "A", "B", "B", "B", "C") +)$with_columns(pl$col("idx")$set_sorted()) +lf$collect() + +lf$group_by_dynamic( + "idx", + every = "2i", + period = "3i", + include_boundaries = TRUE, + closed = "right" +)$agg(A_agg_list = pl$col("A"))$collect() +} From 40ea71fe3b328c442a323b34e3f22a9f1a0c1ad5 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 11 Jan 2024 20:01:24 +0100 Subject: [PATCH 03/14] document, test --- R/dataframe__frame.R | 85 ++++++++ R/group_by.R | 6 +- man/DataFrame_group_by_dynamic.Rd | 180 ++++++++++++++++ tests/testthat/test-groupby.R | 336 ++++++++++++++++++++++++++++-- 4 files changed, 592 insertions(+), 15 deletions(-) create mode 100644 man/DataFrame_group_by_dynamic.Rd diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 76e5fbba4..048e724b3 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1833,3 +1833,88 @@ DataFrame_rolling = function(index_column, period, offset = NULL, closed = "righ class(out) = "RPolarsGroupBy" out } + +#' @inherit LazyFrame_group_by_dynamic title description details params +#' @return A [GroupBy][GroupBy_class] object +#' +#' @examples +#' df = pl$DataFrame( +#' time = pl$date_range( +#' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' interval = "30m", +#' eager = TRUE, +#' ), +#' n = 0:6 +#' ) +#' +#' # get the sum in the following hour relative to the "time" column +#' df$group_by_dynamic("time", every = "1h")$agg( +#' vals = pl$col("n"), +#' sum = pl$col("n")$sum() +#' ) +#' +#' # using "include_boundaries = TRUE" is helpful to see the period considered +#' df$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( +#' vals = pl$col("n") +#' ) +#' +#' # in the example above, the values didn't include the one *exactly* 1h after +#' # the start because "closed = 'left'" by default. +#' # Changing it to "right" includes values that are exactly 1h after. Note that +#' # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], +#' # even if this interval wasn't there originally +#' df$group_by_dynamic("time", every = "1h", closed = "right")$agg( +#' vals = pl$col("n") +#' ) +#' # To keep both boundaries, we use "closed = 'both'". Some values now belong to +#' # several groups: +#' df$group_by_dynamic("time", every = "1h", closed = "both")$agg( +#' vals = pl$col("n") +#' ) +#' +#' # Dynamic group bys can also be combined with grouping on normal keys +#' df = df$with_columns(groups = pl$Series(c("a", "a", "a", "b", "b", "a", "a"))) +#' df +#' +#' df$group_by_dynamic( +#' "time", +#' every = "1h", +#' closed = "both", +#' by = "groups", +#' include_boundaries = TRUE +#' )$agg(pl$col("n")) +#' +#' # We can also create a dynamic group by based on an index column +#' df = pl$LazyFrame( +#' idx = 0:5, +#' A = c("A", "A", "B", "B", "B", "C") +#' )$with_columns(pl$col("idx")$set_sorted()) +#' df +#' +#' df$group_by_dynamic( +#' "idx", +#' every = "2i", +#' period = "3i", +#' include_boundaries = TRUE, +#' closed = "right" +#' )$agg(A_agg_list = pl$col("A")) +DataFrame_group_by_dynamic = function( + index_column, + every, + period = NULL, + offset = NULL, + include_boundaries = FALSE, + closed = "left", + label = "left", + by = NULL, + start_by = "window", + check_sorted = TRUE) { + out = self$lazy()$group_by_dynamic( + index_column, every, period, offset, include_boundaries, closed, label, by, + start_by, check_sorted + ) + attr(out, "is_dynamic_group_by") = TRUE + class(out) = "RPolarsGroupBy" + out +} diff --git a/R/group_by.R b/R/group_by.R index 33e4803df..41950b819 100644 --- a/R/group_by.R +++ b/R/group_by.R @@ -86,7 +86,8 @@ print.RPolarsGroupBy = function(x, ...) { #' pl$col("bar")$mean()$alias("bar_tail_sum") #' ) GroupBy_agg = function(...) { - if (isTRUE(attributes(self)[["is_rolling_group_by"]])) { + if (isTRUE(attributes(self)[["is_rolling_group_by"]]) || + isTRUE(attributes(self)[["is_dynamic_group_by"]])) { class(self) = "RPolarsLazyGroupBy" self$agg(unpack_list(..., .context = "in $agg():"))$collect(no_optimization = TRUE) } else { @@ -300,7 +301,8 @@ GroupBy_null_count = function() { #' #' gb$ungroup() GroupBy_ungroup = function() { - if (isTRUE(attributes(self)[["is_rolling_group_by"]])) { + if (isTRUE(attributes(self)[["is_rolling_group_by"]]) || + isTRUE(attributes(self)[["is_dynamic_group_by"]])) { class(self) = "RPolarsLazyGroupBy" self = self$ungroup()$collect(no_optimization = TRUE) } else { diff --git a/man/DataFrame_group_by_dynamic.Rd b/man/DataFrame_group_by_dynamic.Rd new file mode 100644 index 000000000..ac240d21b --- /dev/null +++ b/man/DataFrame_group_by_dynamic.Rd @@ -0,0 +1,180 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R +\name{DataFrame_group_by_dynamic} +\alias{DataFrame_group_by_dynamic} +\title{Group based on a date/time or integer column} +\usage{ +DataFrame_group_by_dynamic( + index_column, + every, + period = NULL, + offset = NULL, + include_boundaries = FALSE, + closed = "left", + label = "left", + by = NULL, + start_by = "window", + check_sorted = TRUE +) +} +\arguments{ +\item{index_column}{Column used to group based on the time window. Often of +type Date/Datetime. This column must be sorted in ascending order (or, if \code{by} +is specified, then it must be sorted in ascending order within each group). In +case of a rolling group by on indices, dtype needs to be either Int32 or Int64. +Note that Int32 gets temporarily cast to Int64, so if performance matters use +an Int64 column.} + +\item{every}{Interval of the window.} + +\item{period}{Length of the window, must be non-negative.} + +\item{offset}{Offset of the window. Default is \code{-period}.} + +\item{include_boundaries}{Add two columns \code{"_lower_boundary"} and +\code{"_upper_boundary"} columns that show the boundaries of the window. This will +impact performance because it’s harder to parallelize.} + +\item{closed}{Define which sides of the temporal interval are closed +(inclusive). This can be either \code{"left"}, \code{"right"}, \code{"both"} or \code{"none"}.} + +\item{label}{Define which label to use for the window: +\itemize{ +\item \code{"left"}: lower boundary of the window +\item \code{"right"}: upper boundary of the window +\item \code{"datapoint"}: the first value of the index column in the given window. If +you don’t need the label to be at one of the boundaries, choose this option +for maximum performance. +}} + +\item{by}{Also group by this column/these columns.} + +\item{start_by}{The strategy to determine the start of the first window by: +\itemize{ +\item \code{"window"}: start by taking the earliest timestamp, truncating it with \code{every}, +and then adding \code{offset}. Note that weekly windows start on Monday. +\item \code{"datapoint"}: start from the first encountered data point. +\item a day of the week (only takes effect if \code{every} contains \code{"w"}): \code{"monday"} +starts the window on the Monday before the first data point, etc. +}} + +\item{check_sorted}{Check whether data is actually sorted. Checking it is +expensive so if you are sure the data within the \code{index_column} is sorted, you +can set this to \code{FALSE} but note that if the data actually is unsorted, it +will lead to incorrect output.} +} +\value{ +A \link[=GroupBy_class]{GroupBy} object +} +\description{ +If you have a time series \verb{}, then by default the windows +created will be: +\itemize{ +\item (t_0 - period, t_0] +\item (t_1 - period, t_1] +\item … +\item (t_n - period, t_n] +} + +whereas if you pass a non-default offset, then the windows will be: +\itemize{ +\item (t_0 + offset, t_0 + offset + period] +\item (t_1 + offset, t_1 + offset + period] +\item … +\item (t_n + offset, t_n + offset + period] +} +} +\details{ +The period and offset arguments are created either from a timedelta, or by +using the following string language: +\itemize{ +\item 1ns (1 nanosecond) +\item 1us (1 microsecond) +\item 1ms (1 millisecond) +\item 1s (1 second) +\item 1m (1 minute) +\item 1h (1 hour) +\item 1d (1 calendar day) +\item 1w (1 calendar week) +\item 1mo (1 calendar month) +\item 1q (1 calendar quarter) +\item 1y (1 calendar year) +\item 1i (1 index count) +} + +Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds + +By "calendar day", we mean the corresponding time on the next day (which may +not be 24 hours, due to daylight savings). Similarly for "calendar week", +"calendar month", "calendar quarter", and "calendar year". + +In case of a rolling operation on an integer column, the windows are defined +by: +\itemize{ +\item "1i" # length 1 +\item "10i" # length 10 +} +} +\examples{ +df = pl$DataFrame( + time = pl$date_range( + start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + interval = "30m", + eager = TRUE, + ), + n = 0:6 +) + +# get the sum in the following hour relative to the "time" column +df$group_by_dynamic("time", every = "1h")$agg( + vals = pl$col("n"), + sum = pl$col("n")$sum() +) + +# using "include_boundaries = TRUE" is helpful to see the period considered +df$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( + vals = pl$col("n") +) + +# in the example above, the values didn't include the one *exactly* 1h after +# the start because "closed = 'left'" by default. +# Changing it to "right" includes values that are exactly 1h after. Note that +# the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], +# even if this interval wasn't there originally +df$group_by_dynamic("time", every = "1h", closed = "right")$agg( + vals = pl$col("n") +) +# To keep both boundaries, we use "closed = 'both'". Some values now belong to +# several groups: +df$group_by_dynamic("time", every = "1h", closed = "both")$agg( + vals = pl$col("n") +) + +# Dynamic group bys can also be combined with grouping on normal keys +df = df$with_columns(groups = pl$Series(c("a", "a", "a", "b", "b", "a", "a"))) +df + +df$group_by_dynamic( + "time", + every = "1h", + closed = "both", + by = "groups", + include_boundaries = TRUE +)$agg(pl$col("n")) + +# We can also create a dynamic group by based on an index column +df = pl$LazyFrame( + idx = 0:5, + A = c("A", "A", "B", "B", "B", "C") +)$with_columns(pl$col("idx")$set_sorted()) +df + +df$group_by_dynamic( + "idx", + every = "2i", + period = "3i", + include_boundaries = TRUE, + closed = "right" +)$agg(A_agg_list = pl$col("A")) +} diff --git a/tests/testthat/test-groupby.R b/tests/testthat/test-groupby.R index e99bf55cc..1f6bc8147 100644 --- a/tests/testthat/test-groupby.R +++ b/tests/testthat/test-groupby.R @@ -1,3 +1,5 @@ +### group_by ------------------------------------------------ + df = pl$DataFrame( list( foo = c("one", "two", "two", "one", "two"), @@ -7,6 +9,19 @@ df = pl$DataFrame( gb = df$group_by("foo", maintain_order = TRUE) +test_that("groupby", { + df2 = gb$agg( + pl$col("bar")$sum()$alias("bar_sum"), + pl$col("bar")$mean()$alias("bar_tail_sum") + )$to_data_frame() + + expect_equal( + df2, + data.frame(foo = c("one", "two"), bar_sum = c(9, 6), bar_tail_sum = c(4.5, 2)) + ) +}) + + patrick::with_parameters_test_that("groupby print", { .env_var = .value @@ -21,19 +36,6 @@ test_that("groupby print when several groups", { expect_snapshot(df) }) -test_that("groupby", { - df2 = gb$agg( - pl$col("bar")$sum()$alias("bar_sum"), - pl$col("bar")$mean()$alias("bar_tail_sum") - )$to_data_frame() - - expect_equal( - df2, - data.frame(foo = c("one", "two"), bar_sum = c(9, 6), bar_tail_sum = c(4.5, 2)) - ) -}) - - make_cases = function() { tibble::tribble( ~.test_name, ~pola, ~base, @@ -162,3 +164,311 @@ test_that("LazyGroupBy clone", { expect_true(mem_address(lgb) != mem_address(lgb_clone)) expect_true(mem_address(lgb) == mem_address(lgb_copy)) }) + + + + + + +### group_by_dynamic ------------------------------------------------ + +test_that("group_by_dynamic for DataFrame calls the LazyFrame method", { + df = pl$DataFrame( + dt = as.Date(as.Date("2021-12-16"):as.Date("2021-12-22")), + n = 0:6 + )$with_columns( + pl$col("dt")$set_sorted() + ) + + actual = df$group_by_dynamic(index_column = "dt", every = "2d")$agg( + pl$col("n")$mean() + )$to_data_frame() + + expect_equal( + actual[, "n"], + c(0, 1.5, 3.5, 5.5) + ) +}) + +test_that("group_by_dynamic for LazyFrame: date variable", { + df = pl$LazyFrame( + dt = as.Date(as.Date("2021-12-16"):as.Date("2021-12-22")), + n = 0:6 + )$with_columns( + pl$col("dt")$set_sorted() + ) + + actual = df$group_by_dynamic(index_column = "dt", every = "2d")$agg( + pl$col("n")$mean() + )$collect()$to_data_frame() + + expect_equal( + actual[, "n"], + c(0, 1.5, 3.5, 5.5) + ) +}) + +test_that("group_by_dynamic for LazyFrame: datetime variable", { + df = pl$LazyFrame( + dt = c( + "2021-12-16 00:00:00", "2021-12-16 00:30:00", "2021-12-16 01:00:00", + "2021-12-16 01:30:00", "2021-12-16 02:00:00", "2021-12-16 02:30:00", + "2021-12-16 03:00:00" + ), + n = 0:6 + )$with_columns( + pl$col("dt")$str$strptime(pl$Datetime("ms"), format = NULL)$set_sorted() + ) + + actual = df$group_by_dynamic(index_column = "dt", every = "1h")$agg( + pl$col("n")$mean() + )$collect()$to_data_frame() + + expect_equal( + actual[, "n"], + c(0.5, 2.5, 4.5, 6) + ) +}) + +test_that("group_by_dynamic for LazyFrame: integer variable", { + df = pl$LazyFrame( + idx = 0:5, + n = 0:5 + )$with_columns(pl$col("idx")$set_sorted()) + + actual = df$group_by_dynamic( + "idx", + every = "2i" + )$agg(pl$col("n")$mean())$collect()$to_data_frame() + + expect_equal( + actual[, "n"], + c(0.5, 2.5, 4.5) + ) +}) + +test_that("group_by_dynamic for LazyFrame: error if not explicitly sorted", { + df = pl$LazyFrame( + index = c(1L, 2L, 3L, 4L, 8L, 9L), + a = c(3, 7, 5, 9, 2, 1) + ) + expect_error( + df$group_by_dynamic(index_column = "index", every = "2i")$agg(pl$col("a"))$collect(), + "not explicitly sorted" + ) +}) + +test_that("group_by_dynamic for LazyFrame: arg 'closed' works", { + df = pl$LazyFrame( + dt = c( + "2021-12-16 00:00:00", "2021-12-16 00:30:00", "2021-12-16 01:00:00", + "2021-12-16 01:30:00", "2021-12-16 02:00:00", "2021-12-16 02:30:00", + "2021-12-16 03:00:00" + ), + n = 0:6 + )$with_columns( + pl$col("dt")$str$strptime(pl$Datetime("ms"), format = NULL)$set_sorted() + ) + + actual = df$group_by_dynamic(index_column = "dt", closed = "right", every = "1h")$agg( + pl$col("n")$mean() + )$collect()$to_data_frame() + + expect_equal( + actual[, "n"], + c(0, 1.5, 3.5, 5.5) + ) + + expect_error( + df$group_by_dynamic(index_column = "dt", closed = "foobar", every = "1h")$agg( + pl$col("n")$mean() + )$collect(), + "should be one of" + ) +}) + +test_that("group_by_dynamic for LazyFrame: arg 'label' works", { + df = pl$LazyFrame( + dt = c( + "2021-12-16 00:00:00", "2021-12-16 00:30:00", "2021-12-16 01:00:00", + "2021-12-16 01:30:00", "2021-12-16 02:00:00", "2021-12-16 02:30:00", + "2021-12-16 03:00:00" + ), + n = 0:6 + )$with_columns( + pl$col("dt")$str$strptime(pl$Datetime("ms"), format = NULL)$set_sorted() + ) + + actual = df$group_by_dynamic(index_column = "dt", label = "right", every = "1h")$agg( + pl$col("n")$mean() + )$collect()$to_data_frame() + + expect_equal( + actual[, "dt"], + as.POSIXct( + c("2021-12-16 02:00:00 CET", "2021-12-16 03:00:00 CET", "2021-12-16 04:00:00 CET", "2021-12-16 05:00:00 CET") + ) + ) + + expect_error( + df$group_by_dynamic(index_column = "dt", label = "foobar", every = "1h")$agg( + pl$col("n")$mean() + )$collect(), + "should be one of" + ) +}) + +test_that("group_by_dynamic for LazyFrame: arg 'start_by' works", { + df = pl$LazyFrame( + dt = c( + "2021-12-16 00:00:00", "2021-12-16 00:30:00", "2021-12-16 01:00:00", + "2021-12-16 01:30:00", "2021-12-16 02:00:00", "2021-12-16 02:30:00", + "2021-12-16 03:00:00" + ), + n = 0:6 + )$with_columns( + pl$col("dt")$str$strptime(pl$Datetime("ms"), format = NULL)$set_sorted() + ) + + # TODO: any weekday should return the same since it is ignored when there's no + # "w" in "every". + # https://github.com/pola-rs/polars/issues/13648 + actual = df$group_by_dynamic(index_column = "dt", start_by = "monday", every = "1h")$agg( + pl$col("n")$mean() + )$collect()$to_data_frame() + + expect_equal( + actual[, "dt"], + as.POSIXct( + c("2021-12-16 01:00:00 CET", "2021-12-16 02:00:00 CET", "2021-12-16 03:00:00 CET", "2021-12-16 04:00:00 CET") + ) + ) + + expect_error( + df$group_by_dynamic(index_column = "dt", start_by = "foobar", every = "1h")$agg( + pl$col("n")$mean() + )$collect(), + "should be one of" + ) +}) + +test_that("group_by_dynamic for LazyFrame: argument 'by' works", { + df = pl$LazyFrame( + dt = c( + "2021-12-16 00:00:00", "2021-12-16 00:30:00", "2021-12-16 01:00:00", + "2021-12-16 01:30:00", "2021-12-16 02:00:00", "2021-12-16 02:30:00", + "2021-12-16 03:00:00" + ), + n = 0:6, + grp = c("a", "a", "a", "b", "b", "a", "a") + )$with_columns( + pl$col("dt")$str$strptime(pl$Datetime("ms"), format = NULL)$set_sorted() + ) + + actual = df$group_by_dynamic(index_column = "dt", every = "2h", by = pl$col("grp"))$agg( + pl$col("n")$mean() + )$collect()$to_data_frame() + + expect_equal( + actual[, "n"], + c(1, 5.5, 3, 4) + ) + + # string is parsed as column name in "by" + expect_equal( + df$group_by_dynamic(index_column = "dt", every = "2h", by = pl$col("grp"))$agg( + pl$col("n")$mean() + )$collect()$to_data_frame(), + df$group_by_dynamic(index_column = "dt", every = "2h", by = "grp")$agg( + pl$col("n")$mean() + )$collect()$to_data_frame() + ) +}) + +test_that("group_by_dynamic for LazyFrame: argument 'check_sorted' works", { + df = pl$LazyFrame( + index = c(2L, 1L, 3L, 4L, 9L, 8L), # unsorted index + grp = c("a", "a", rep("b", 4)), + a = c(3, 7, 5, 9, 2, 1) + ) + expect_error( + df$group_by_dynamic(index_column = "index", every = "2i", by = "grp")$agg( + pl$sum("a")$alias("sum_a") + )$collect(), + "not sorted" + ) + expect_no_error( + df$group_by_dynamic(index_column = "index", every = "2i", by = "grp", check_sorted = FALSE)$agg( + pl$sum("a")$alias("sum_a") + )$collect() + ) +}) + +test_that("group_by_dynamic for LazyFrame: error if index not int or date/time", { + df = pl$LazyFrame( + index = c(1:5, 6.0), + a = c(3, 7, 5, 9, 2, 1) + )$with_columns(pl$col("index")$set_sorted()) + + expect_error( + df$group_by_dynamic(index_column = "index", every = "2i")$agg( + pl$sum("a")$alias("sum_a") + )$collect() + ) +}) + +test_that("group_by_dynamic for LazyFrame: arg 'offset' works", { + df = pl$LazyFrame( + dt = c( + "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-02", "2020-01-03", "2020-01-08" + ), + n = c(3, 7, 5, 9, 2, 1) + )$with_columns( + pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() + ) + + # checked with python-polars but unclear on how "offset" works + actual = df$group_by_dynamic(index_column = "dt", every = "2d", offset = "1d")$agg( + pl$col("n")$mean() + )$collect()$to_data_frame() + + expect_equal( + actual[, "n"], + c(5.5, 1) + ) +}) + +test_that("group_by_dynamic for LazyFrame: arg 'include_boundaries' works", { + df = pl$LazyFrame( + dt = c( + "2020-01-01", "2020-01-01", "2020-01-01", + "2020-01-02", "2020-01-03", "2020-01-08" + ), + n = c(3, 7, 5, 9, 2, 1) + )$with_columns( + pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() + ) + + actual = df$group_by_dynamic( + index_column = "dt", every = "2d", offset = "1d", + include_boundaries = TRUE)$ + agg( + pl$col("n") + ) + + expect_named(actual, c("_lower_boundary", "_upper_boundary", "dt", "n")) +}) + +test_that("group_by_dynamic for LazyFrame: can be ungrouped", { + df = pl$LazyFrame( + index = c(1:5, 6.0), + a = c(3, 7, 5, 9, 2, 1) + )$with_columns(pl$col("index")$set_sorted()) + + actual = df$group_by_dynamic(index_column = "dt", every = "2i")$ + ungroup()$ + collect()$ + to_data_frame() + expect_equal(actual, df$collect()$to_data_frame()) +}) From 384679c716f223c7ed76cd0a2728b5a8b1d158d4 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 11 Jan 2024 20:02:09 +0100 Subject: [PATCH 04/14] bump news --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index a7211d68e..2c2f7e449 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,7 +5,8 @@ ### What's changed - New method `$rolling()` for `DataFrame` and `LazyFrame` (#682). -- New method `$sink_ndjson()` for LazyFrame (#681). +- New method `$sink_ndjson()` for `LazyFrame` (#681). +- New method `$group_by_dynamic()` for `DataFrame` and `LazyFrame` (#691). ## polars 0.12.2 From 4dfc3a91cde5a562f2ac596411758d8813895720 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 11 Jan 2024 20:09:03 +0100 Subject: [PATCH 05/14] update snapshots --- tests/testthat/_snaps/after-wrappers.md | 110 ++++++++++++------------ 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 3ea08a45f..19b5e65d6 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -71,23 +71,23 @@ Code ls(.pr$env[[class_name]]) Output - [1] "clone" "columns" "describe" "drop" - [5] "drop_in_place" "drop_nulls" "dtype_strings" "dtypes" - [9] "equals" "estimated_size" "explode" "fill_nan" - [13] "fill_null" "filter" "first" "get_column" - [17] "get_columns" "glimpse" "group_by" "head" - [21] "height" "join" "join_asof" "last" - [25] "lazy" "limit" "max" "mean" - [29] "median" "melt" "min" "n_chunks" - [33] "null_count" "pivot" "print" "quantile" - [37] "rechunk" "rename" "reverse" "rolling" - [41] "sample" "schema" "select" "shape" - [45] "shift" "shift_and_fill" "slice" "sort" - [49] "std" "sum" "tail" "to_data_frame" - [53] "to_list" "to_series" "to_struct" "transpose" - [57] "unique" "unnest" "var" "width" - [61] "with_columns" "with_row_count" "write_csv" "write_json" - [65] "write_ndjson" + [1] "clone" "columns" "describe" "drop" + [5] "drop_in_place" "drop_nulls" "dtype_strings" "dtypes" + [9] "equals" "estimated_size" "explode" "fill_nan" + [13] "fill_null" "filter" "first" "get_column" + [17] "get_columns" "glimpse" "group_by" "group_by_dynamic" + [21] "head" "height" "join" "join_asof" + [25] "last" "lazy" "limit" "max" + [29] "mean" "median" "melt" "min" + [33] "n_chunks" "null_count" "pivot" "print" + [37] "quantile" "rechunk" "rename" "reverse" + [41] "rolling" "sample" "schema" "select" + [45] "shape" "shift" "shift_and_fill" "slice" + [49] "sort" "std" "sum" "tail" + [53] "to_data_frame" "to_list" "to_series" "to_struct" + [57] "transpose" "unique" "unnest" "var" + [61] "width" "with_columns" "with_row_count" "write_csv" + [65] "write_json" "write_ndjson" --- @@ -139,25 +139,26 @@ [11] "fetch" "fill_nan" [13] "fill_null" "filter" [15] "first" "get_optimization_toggle" - [17] "group_by" "head" - [19] "join" "join_asof" - [21] "last" "limit" - [23] "max" "mean" - [25] "median" "melt" - [27] "min" "print" - [29] "profile" "quantile" - [31] "rename" "reverse" - [33] "rolling" "schema" - [35] "select" "set_optimization_toggle" - [37] "shift" "shift_and_fill" - [39] "sink_csv" "sink_ipc" - [41] "sink_ndjson" "sink_parquet" - [43] "slice" "sort" - [45] "std" "sum" - [47] "tail" "unique" - [49] "unnest" "var" - [51] "width" "with_columns" - [53] "with_context" "with_row_count" + [17] "group_by" "group_by_dynamic" + [19] "head" "join" + [21] "join_asof" "last" + [23] "limit" "max" + [25] "mean" "median" + [27] "melt" "min" + [29] "print" "profile" + [31] "quantile" "rename" + [33] "reverse" "rolling" + [35] "schema" "select" + [37] "set_optimization_toggle" "shift" + [39] "shift_and_fill" "sink_csv" + [41] "sink_ipc" "sink_ndjson" + [43] "sink_parquet" "slice" + [45] "sort" "std" + [47] "sum" "tail" + [49] "unique" "unnest" + [51] "var" "width" + [53] "with_columns" "with_context" + [55] "with_row_count" --- @@ -172,24 +173,25 @@ [11] "fill_nan" "fill_null" [13] "filter" "first" [15] "get_optimization_toggle" "group_by" - [17] "join" "join_asof" - [19] "last" "limit" - [21] "max" "mean" - [23] "median" "melt" - [25] "min" "print" - [27] "profile" "quantile" - [29] "rename" "reverse" - [31] "rolling" "schema" - [33] "select" "select_str_as_lit" - [35] "set_optimization_toggle" "shift" - [37] "shift_and_fill" "sink_csv" - [39] "sink_ipc" "sink_json" - [41] "sink_parquet" "slice" - [43] "sort_by_exprs" "std" - [45] "sum" "tail" - [47] "unique" "unnest" - [49] "var" "with_columns" - [51] "with_context" "with_row_count" + [17] "group_by_dynamic" "join" + [19] "join_asof" "last" + [21] "limit" "max" + [23] "mean" "median" + [25] "melt" "min" + [27] "print" "profile" + [29] "quantile" "rename" + [31] "reverse" "rolling" + [33] "schema" "select" + [35] "select_str_as_lit" "set_optimization_toggle" + [37] "shift" "shift_and_fill" + [39] "sink_csv" "sink_ipc" + [41] "sink_json" "sink_parquet" + [43] "slice" "sort_by_exprs" + [45] "std" "sum" + [47] "tail" "unique" + [49] "unnest" "var" + [51] "with_columns" "with_context" + [53] "with_row_count" # public and private methods of each class Expr From 192a43363e64128975d4ae7bfc7230527b30cbfc Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 11 Jan 2024 21:57:37 +0100 Subject: [PATCH 06/14] fix timezone issue --- tests/testthat/test-groupby.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-groupby.R b/tests/testthat/test-groupby.R index 1f6bc8147..9e5305182 100644 --- a/tests/testthat/test-groupby.R +++ b/tests/testthat/test-groupby.R @@ -296,7 +296,7 @@ test_that("group_by_dynamic for LazyFrame: arg 'label' works", { ), n = 0:6 )$with_columns( - pl$col("dt")$str$strptime(pl$Datetime("ms"), format = NULL)$set_sorted() + pl$col("dt")$str$strptime(pl$Datetime("ms"), format = NULL)$set_sorted()$dt$replace_time_zone("UTC") ) actual = df$group_by_dynamic(index_column = "dt", label = "right", every = "1h")$agg( @@ -306,7 +306,8 @@ test_that("group_by_dynamic for LazyFrame: arg 'label' works", { expect_equal( actual[, "dt"], as.POSIXct( - c("2021-12-16 02:00:00 CET", "2021-12-16 03:00:00 CET", "2021-12-16 04:00:00 CET", "2021-12-16 05:00:00 CET") + c("2021-12-16 01:00:00", "2021-12-16 02:00:00", "2021-12-16 03:00:00", "2021-12-16 04:00:00"), + tz = "UTC" ) ) From 1803541eb6aa0850be1933ccc92123952fb593b2 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Fri, 12 Jan 2024 15:29:39 +0100 Subject: [PATCH 07/14] dirty hack about groups --- R/group_by.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/group_by.R b/R/group_by.R index 41950b819..e620495ff 100644 --- a/R/group_by.R +++ b/R/group_by.R @@ -89,16 +89,19 @@ GroupBy_agg = function(...) { if (isTRUE(attributes(self)[["is_rolling_group_by"]]) || isTRUE(attributes(self)[["is_dynamic_group_by"]])) { class(self) = "RPolarsLazyGroupBy" - self$agg(unpack_list(..., .context = "in $agg():"))$collect(no_optimization = TRUE) + out = self$agg(unpack_list(..., .context = "in $agg():"))$collect(no_optimization = TRUE) + class(self) = "RPolarsGroupBy" } else { class(self) = "RPolarsDataFrame" - self$lazy()$group_by( + out = self$lazy()$clone()$group_by( attr(self, "private")$groupby_input, maintain_order = attr(self, "private")$maintain_order )$ agg(...)$ collect(no_optimization = TRUE) + class(self) = "RPolarsGroupBy" } + out } From d816aad16af3a9f67cce9cefd69610acb8e32fbd Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 13 Jan 2024 16:53:44 +0100 Subject: [PATCH 08/14] create DynamicGroupBy class --- DESCRIPTION | 1 + NAMESPACE | 4 + R/group_by.R | 32 -------- R/group_by_dynamic.R | 133 ++++++++++++++++++++++++++++++++++ man/DynamicGroupBy_agg.Rd | 33 +++++++++ man/DynamicGroupBy_class.Rd | 11 +++ man/DynamicGroupBy_ungroup.Rd | 25 +++++++ 7 files changed, 207 insertions(+), 32 deletions(-) create mode 100644 R/group_by_dynamic.R create mode 100644 man/DynamicGroupBy_agg.Rd create mode 100644 man/DynamicGroupBy_class.Rd create mode 100644 man/DynamicGroupBy_ungroup.Rd diff --git a/DESCRIPTION b/DESCRIPTION index d22cba48b..40f5face0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -91,6 +91,7 @@ Collate: 'functions__lazy.R' 'functions__whenthen.R' 'group_by.R' + 'group_by_dynamic.R' 'group_by_rolling.R' 'info.R' 'ipc.R' diff --git a/NAMESPACE b/NAMESPACE index 20145cd38..dadb9dd14 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ S3method("$",RPolarsChainedWhen) S3method("$",RPolarsDataFrame) S3method("$",RPolarsDataType) S3method("$",RPolarsDataTypeVector) +S3method("$",RPolarsDynamicGroupBy) S3method("$",RPolarsErr) S3method("$",RPolarsExpr) S3method("$",RPolarsExprBinNameSpace) @@ -69,6 +70,7 @@ S3method("[[",RPolarsChainedWhen) S3method("[[",RPolarsDataFrame) S3method("[[",RPolarsDataType) S3method("[[",RPolarsDataTypeVector) +S3method("[[",RPolarsDynamicGroupBy) S3method("[[",RPolarsErr) S3method("[[",RPolarsExpr) S3method("[[",RPolarsGroupBy) @@ -90,6 +92,7 @@ S3method("|",RPolarsExpr) S3method(.DollarNames,RPolarsChainedThen) S3method(.DollarNames,RPolarsChainedWhen) S3method(.DollarNames,RPolarsDataFrame) +S3method(.DollarNames,RPolarsDynamicGroupBy) S3method(.DollarNames,RPolarsErr) S3method(.DollarNames,RPolarsExpr) S3method(.DollarNames,RPolarsGroupBy) @@ -162,6 +165,7 @@ S3method(print,RPolarsChainedThen) S3method(print,RPolarsChainedWhen) S3method(print,RPolarsDataFrame) S3method(print,RPolarsDataType) +S3method(print,RPolarsDynamicGroupBy) S3method(print,RPolarsErr) S3method(print,RPolarsExpr) S3method(print,RPolarsGroupBy) diff --git a/R/group_by.R b/R/group_by.R index 9d1c95d08..8d3ded52f 100644 --- a/R/group_by.R +++ b/R/group_by.R @@ -87,24 +87,6 @@ print.RPolarsGroupBy = function(x, ...) { #' pl$col("bar")$mean()$alias("bar_tail_sum") #' ) GroupBy_agg = function(...) { -<<<<<<< HEAD - if (isTRUE(attributes(self)[["is_rolling_group_by"]]) || - isTRUE(attributes(self)[["is_dynamic_group_by"]])) { - class(self) = "RPolarsLazyGroupBy" - out = self$agg(unpack_list(..., .context = "in $agg():"))$collect(no_optimization = TRUE) - class(self) = "RPolarsGroupBy" - } else { - class(self) = "RPolarsDataFrame" - out = self$lazy()$clone()$group_by( - attr(self, "private")$groupby_input, - maintain_order = attr(self, "private")$maintain_order - )$ - agg(...)$ - collect(no_optimization = TRUE) - class(self) = "RPolarsGroupBy" - } - out -======= prv = attr(self, "private") prv$dat$lazy()$group_by( prv$groupby_input, @@ -112,7 +94,6 @@ GroupBy_agg = function(...) { )$ agg(...)$ collect(no_optimization = TRUE) ->>>>>>> main } @@ -315,19 +296,6 @@ GroupBy_null_count = function() { #' #' gb$ungroup() GroupBy_ungroup = function() { -<<<<<<< HEAD - if (isTRUE(attributes(self)[["is_rolling_group_by"]]) || - isTRUE(attributes(self)[["is_dynamic_group_by"]])) { - class(self) = "RPolarsLazyGroupBy" - self = self$ungroup()$collect(no_optimization = TRUE) - } else { - self = .pr$DataFrame$clone_in_rust(self) - class(self) = "RPolarsDataFrame" - attr(self, "private") = NULL - } - self -======= prv = attr(self, "private") prv$dat ->>>>>>> main } diff --git a/R/group_by_dynamic.R b/R/group_by_dynamic.R new file mode 100644 index 000000000..38b393e37 --- /dev/null +++ b/R/group_by_dynamic.R @@ -0,0 +1,133 @@ +#' Operations on Polars DataFrame grouped on time or integer values +#' +#' @return not applicable +#' @name DynamicGroupBy_class +NULL + +RPolarsDynamicGroupBy = new.env(parent = emptyenv()) + +#' @export +`$.RPolarsDynamicGroupBy` = function(self, name) { + func = RPolarsDynamicGroupBy[[name]] + environment(func) = environment() + func +} + +#' @export +`[[.RPolarsDynamicGroupBy` = `$.RPolarsDynamicGroupBy` + +#' @export +#' @noRd +.DollarNames.RPolarsDynamicGroupBy = function(x, pattern = "") { + paste0(ls(RPolarsDynamicGroupBy, pattern = pattern), "()") +} + +#' The internal DynamicGroupBy constructor +#' @return The input as grouped DataFrame +#' @noRd +construct_rolling_group_by = function(df, index_column, period, offset, closed, by, check_sorted) { + if (!inherits(df, "RPolarsDataFrame")) { + stop("internal error: construct_group called not on DataFrame") + } + # Make an empty object. Store everything (including data) in attributes, so + # that we can keep the RPolarsDataFrame class on the data but still return + # a RPolarsDynamicGroupBy object here. + out = c(" ") + attr(out, "private") = list( + dat = df$clone(), + index_column = index_column, + period = period, + offset = offset, + closed = closed, + by = by, + check_sorted = check_sorted + ) + class(out) = "RPolarsDynamicGroupBy" + out +} + +#' print DynamicGroupBy +#' +#' @param x DataFrame +#' @param ... not used +#' @noRd +#' @return self +#' @export +#' +#' @examples +#' df = pl$DataFrame( +#' dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), +#' a = c(3, 7, 5, 9, 2, 1) +#' )$with_columns( +#' pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() +#' ) +#' +#' df$rolling(index_column = "dt", period = "2d") +print.RPolarsDynamicGroupBy = function(x, ...) { + prv = attr(x, "private") + .pr$DataFrame$print(prv$dat) + cat(paste("index column:", prv$index)) + cat(paste("\nother groups:", toString(prv$by))) + cat(paste("\nperiod:", prv$period)) + cat(paste("\noffset:", prv$offset)) + cat(paste("\nclosed:", prv$closed)) +} + + +#' Aggregate over a DynamicGroupBy +#' +#' Aggregate a DataFrame over a rolling window created with `$rolling()`. +#' +#' @param ... Exprs to aggregate over. Those can also be passed wrapped in a +#' list, e.g `$agg(list(e1,e2,e3))`. +#' +#' @return An aggregated [DataFrame][DataFrame_class] +#' @examples +#' df = pl$DataFrame( +#' dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), +#' a = c(3, 7, 5, 9, 2, 1) +#' )$with_columns( +#' pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() +#' ) +#' +#' df$rolling(index_column = "dt", period = "2d")$agg( +#' pl$col("a"), +#' pl$sum("a")$alias("sum_a"), +#' pl$min("a")$alias("min_a"), +#' pl$max("a")$alias("max_a") +#' ) +DynamicGroupBy_agg = function(...) { + prv = attr(self, "private") + prv$dat$ + lazy()$ + rolling( + index_column = prv$index, + period = prv$period, + offset = prv$offset, + closed = prv$closed, + by = prv$by, + check_sorted = prv$check_sorted + )$ + agg(unpack_list(..., .context = "in $agg():"))$ + collect(no_optimization = TRUE) +} + +#' Ungroup a DynamicGroupBy object +#' +#' Revert the `$rolling()` operation. Doing `$rolling(...)$ungroup()` +#' returns the original `DataFrame`. +#' +#' @return [DataFrame][DataFrame_class] +#' @examples +#' df = pl$DataFrame( +#' dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), +#' a = c(3, 7, 5, 9, 2, 1) +#' )$with_columns( +#' pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() +#' ) +#' +#' df$rolling(index_column = "dt", period = "2d")$ungroup() +DynamicGroupBy_ungroup = function() { + prv = attr(self, "private") + prv$dat +} diff --git a/man/DynamicGroupBy_agg.Rd b/man/DynamicGroupBy_agg.Rd new file mode 100644 index 000000000..2a3b0e9cd --- /dev/null +++ b/man/DynamicGroupBy_agg.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/group_by_dynamic.R +\name{DynamicGroupBy_agg} +\alias{DynamicGroupBy_agg} +\title{Aggregate over a DynamicGroupBy} +\usage{ +DynamicGroupBy_agg(...) +} +\arguments{ +\item{...}{Exprs to aggregate over. Those can also be passed wrapped in a +list, e.g \verb{$agg(list(e1,e2,e3))}.} +} +\value{ +An aggregated \link[=DataFrame_class]{DataFrame} +} +\description{ +Aggregate a DataFrame over a rolling window created with \verb{$rolling()}. +} +\examples{ +df = pl$DataFrame( + dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), + a = c(3, 7, 5, 9, 2, 1) +)$with_columns( + pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() +) + +df$rolling(index_column = "dt", period = "2d")$agg( + pl$col("a"), + pl$sum("a")$alias("sum_a"), + pl$min("a")$alias("min_a"), + pl$max("a")$alias("max_a") +) +} diff --git a/man/DynamicGroupBy_class.Rd b/man/DynamicGroupBy_class.Rd new file mode 100644 index 000000000..52442ea1b --- /dev/null +++ b/man/DynamicGroupBy_class.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/group_by_dynamic.R +\name{DynamicGroupBy_class} +\alias{DynamicGroupBy_class} +\title{Operations on Polars DataFrame grouped on time or integer values} +\value{ +not applicable +} +\description{ +Operations on Polars DataFrame grouped on time or integer values +} diff --git a/man/DynamicGroupBy_ungroup.Rd b/man/DynamicGroupBy_ungroup.Rd new file mode 100644 index 000000000..91ff7f0d0 --- /dev/null +++ b/man/DynamicGroupBy_ungroup.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/group_by_dynamic.R +\name{DynamicGroupBy_ungroup} +\alias{DynamicGroupBy_ungroup} +\title{Ungroup a DynamicGroupBy object} +\usage{ +DynamicGroupBy_ungroup() +} +\value{ +\link[=DataFrame_class]{DataFrame} +} +\description{ +Revert the \verb{$rolling()} operation. Doing \verb{$rolling(...)$ungroup()} +returns the original \code{DataFrame}. +} +\examples{ +df = pl$DataFrame( + dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), + a = c(3, 7, 5, 9, 2, 1) +)$with_columns( + pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() +) + +df$rolling(index_column = "dt", period = "2d")$ungroup() +} From cead51d5b1433af4b06693a7b19dc096830f83a0 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 13 Jan 2024 17:12:58 +0100 Subject: [PATCH 09/14] redoc --- R/dataframe__frame.R | 15 +++--- R/group_by_dynamic.R | 75 +++++++++++++++--------------- R/group_by_rolling.R | 8 +--- R/zzz.R | 5 +- man/DynamicGroupBy_agg.Rd | 69 +++++++++++++++++++++++---- man/DynamicGroupBy_ungroup.Rd | 18 ++++--- tests/testthat/_snaps/dataframe.md | 24 ---------- tests/testthat/test-dataframe.R | 9 ---- 8 files changed, 122 insertions(+), 101 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 523d1e082..7d99c0a65 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1910,11 +1910,14 @@ DataFrame_group_by_dynamic = function( by = NULL, start_by = "window", check_sorted = TRUE) { - out = self$lazy()$group_by_dynamic( - index_column, every, period, offset, include_boundaries, closed, label, by, - start_by, check_sorted + if (is.null(offset)) { + offset = paste0("-", every) + } + if (is.null(period)) { + period = every + } + construct_group_by_dynamic( + self, index_column, every, period, offset, include_boundaries, closed, label, + by, start_by, check_sorted ) - attr(out, "is_dynamic_group_by") = TRUE - class(out) = "RPolarsGroupBy" - out } diff --git a/R/group_by_dynamic.R b/R/group_by_dynamic.R index 38b393e37..a4335b73e 100644 --- a/R/group_by_dynamic.R +++ b/R/group_by_dynamic.R @@ -25,7 +25,10 @@ RPolarsDynamicGroupBy = new.env(parent = emptyenv()) #' The internal DynamicGroupBy constructor #' @return The input as grouped DataFrame #' @noRd -construct_rolling_group_by = function(df, index_column, period, offset, closed, by, check_sorted) { +construct_group_by_dynamic = function( + df, index_column, every, period, offset, include_boundaries, closed, label, + by, start_by, check_sorted +) { if (!inherits(df, "RPolarsDataFrame")) { stop("internal error: construct_group called not on DataFrame") } @@ -36,10 +39,14 @@ construct_rolling_group_by = function(df, index_column, period, offset, closed, attr(out, "private") = list( dat = df$clone(), index_column = index_column, + every = every, period = period, offset = offset, + include_boundaries = include_boundaries, closed = closed, + label = label, by = by, + start_by = start_by, check_sorted = check_sorted ) class(out) = "RPolarsDynamicGroupBy" @@ -56,56 +63,46 @@ construct_rolling_group_by = function(df, index_column, period, offset, closed, #' #' @examples #' df = pl$DataFrame( -#' dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), -#' a = c(3, 7, 5, 9, 2, 1) -#' )$with_columns( -#' pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() +#' time = pl$date_range( +#' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' interval = "30m", +#' eager = TRUE, +#' ), +#' n = 0:6 #' ) #' -#' df$rolling(index_column = "dt", period = "2d") +#' # get the sum in the following hour relative to the "time" column +#' df$group_by_dynamic("time", every = "1h") print.RPolarsDynamicGroupBy = function(x, ...) { - prv = attr(x, "private") - .pr$DataFrame$print(prv$dat) - cat(paste("index column:", prv$index)) - cat(paste("\nother groups:", toString(prv$by))) - cat(paste("\nperiod:", prv$period)) - cat(paste("\noffset:", prv$offset)) - cat(paste("\nclosed:", prv$closed)) + .pr$DataFrame$print(attr(x, "private")$dat) } #' Aggregate over a DynamicGroupBy #' -#' Aggregate a DataFrame over a rolling window created with `$rolling()`. +#' Aggregate a DataFrame over a time or integer window created with +#' `$group_by_dynamic()`. #' #' @param ... Exprs to aggregate over. Those can also be passed wrapped in a #' list, e.g `$agg(list(e1,e2,e3))`. #' #' @return An aggregated [DataFrame][DataFrame_class] -#' @examples -#' df = pl$DataFrame( -#' dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), -#' a = c(3, 7, 5, 9, 2, 1) -#' )$with_columns( -#' pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() -#' ) -#' -#' df$rolling(index_column = "dt", period = "2d")$agg( -#' pl$col("a"), -#' pl$sum("a")$alias("sum_a"), -#' pl$min("a")$alias("min_a"), -#' pl$max("a")$alias("max_a") -#' ) +#' @inherit DataFrame_group_by_dynamic examples DynamicGroupBy_agg = function(...) { prv = attr(self, "private") prv$dat$ lazy()$ - rolling( - index_column = prv$index, + group_by_dynamic( + index_column = prv$index_column, + every = prv$every, period = prv$period, offset = prv$offset, + include_boundaries = prv$include_boundaries, closed = prv$closed, + label = prv$label, by = prv$by, + start_by = prv$start_by, check_sorted = prv$check_sorted )$ agg(unpack_list(..., .context = "in $agg():"))$ @@ -114,19 +111,23 @@ DynamicGroupBy_agg = function(...) { #' Ungroup a DynamicGroupBy object #' -#' Revert the `$rolling()` operation. Doing `$rolling(...)$ungroup()` -#' returns the original `DataFrame`. +#' Revert the `$group_by_dynamic()` operation. Doing +#' `$group_by_dynamic(...)$ungroup()` returns the original `DataFrame`. #' #' @return [DataFrame][DataFrame_class] #' @examples #' df = pl$DataFrame( -#' dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), -#' a = c(3, 7, 5, 9, 2, 1) -#' )$with_columns( -#' pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() +#' time = pl$date_range( +#' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), +#' interval = "30m", +#' eager = TRUE, +#' ), +#' n = 0:6 #' ) +#' df #' -#' df$rolling(index_column = "dt", period = "2d")$ungroup() +#' df$group_by_dynamic("time", every = "1h")$ungroup() DynamicGroupBy_ungroup = function() { prv = attr(self, "private") prv$dat diff --git a/R/group_by_rolling.R b/R/group_by_rolling.R index 29d2d1738..ef81d2f37 100644 --- a/R/group_by_rolling.R +++ b/R/group_by_rolling.R @@ -64,13 +64,7 @@ construct_rolling_group_by = function(df, index_column, period, offset, closed, #' #' df$rolling(index_column = "dt", period = "2d") print.RPolarsRollingGroupBy = function(x, ...) { - prv = attr(x, "private") - .pr$DataFrame$print(prv$dat) - cat(paste("index column:", prv$index)) - cat(paste("\nother groups:", toString(prv$by))) - cat(paste("\nperiod:", prv$period)) - cat(paste("\noffset:", prv$offset)) - cat(paste("\nclosed:", prv$closed)) + .pr$DataFrame$print(attr(x, "private")$dat) } diff --git a/R/zzz.R b/R/zzz.R index 4413b3279..c07ea05a9 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -22,9 +22,12 @@ replace_private_with_pub_methods(RPolarsLazyFrame, "^LazyFrame_") # LazyGroupBy replace_private_with_pub_methods(RPolarsLazyGroupBy, "^LazyGroupBy_") -# LazyGroupBy +# RollingGroupBy replace_private_with_pub_methods(RPolarsRollingGroupBy, "^RollingGroupBy_") +# DynamicGroupBy +replace_private_with_pub_methods(RPolarsDynamicGroupBy, "^DynamicGroupBy_") + # Expr replace_private_with_pub_methods(RPolarsExpr, "^Expr_") diff --git a/man/DynamicGroupBy_agg.Rd b/man/DynamicGroupBy_agg.Rd index 2a3b0e9cd..3d6a9cc47 100644 --- a/man/DynamicGroupBy_agg.Rd +++ b/man/DynamicGroupBy_agg.Rd @@ -14,20 +14,69 @@ list, e.g \verb{$agg(list(e1,e2,e3))}.} An aggregated \link[=DataFrame_class]{DataFrame} } \description{ -Aggregate a DataFrame over a rolling window created with \verb{$rolling()}. +Aggregate a DataFrame over a time or integer window created with +\verb{$group_by_dynamic()}. } \examples{ df = pl$DataFrame( - dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), - a = c(3, 7, 5, 9, 2, 1) -)$with_columns( - pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() + time = pl$date_range( + start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + interval = "30m", + eager = TRUE, + ), + n = 0:6 ) -df$rolling(index_column = "dt", period = "2d")$agg( - pl$col("a"), - pl$sum("a")$alias("sum_a"), - pl$min("a")$alias("min_a"), - pl$max("a")$alias("max_a") +# get the sum in the following hour relative to the "time" column +df$group_by_dynamic("time", every = "1h")$agg( + vals = pl$col("n"), + sum = pl$col("n")$sum() ) + +# using "include_boundaries = TRUE" is helpful to see the period considered +df$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg( + vals = pl$col("n") +) + +# in the example above, the values didn't include the one *exactly* 1h after +# the start because "closed = 'left'" by default. +# Changing it to "right" includes values that are exactly 1h after. Note that +# the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00], +# even if this interval wasn't there originally +df$group_by_dynamic("time", every = "1h", closed = "right")$agg( + vals = pl$col("n") +) +# To keep both boundaries, we use "closed = 'both'". Some values now belong to +# several groups: +df$group_by_dynamic("time", every = "1h", closed = "both")$agg( + vals = pl$col("n") +) + +# Dynamic group bys can also be combined with grouping on normal keys +df = df$with_columns(groups = pl$Series(c("a", "a", "a", "b", "b", "a", "a"))) +df + +df$group_by_dynamic( + "time", + every = "1h", + closed = "both", + by = "groups", + include_boundaries = TRUE +)$agg(pl$col("n")) + +# We can also create a dynamic group by based on an index column +df = pl$LazyFrame( + idx = 0:5, + A = c("A", "A", "B", "B", "B", "C") +)$with_columns(pl$col("idx")$set_sorted()) +df + +df$group_by_dynamic( + "idx", + every = "2i", + period = "3i", + include_boundaries = TRUE, + closed = "right" +)$agg(A_agg_list = pl$col("A")) } diff --git a/man/DynamicGroupBy_ungroup.Rd b/man/DynamicGroupBy_ungroup.Rd index 91ff7f0d0..ccb7e605a 100644 --- a/man/DynamicGroupBy_ungroup.Rd +++ b/man/DynamicGroupBy_ungroup.Rd @@ -10,16 +10,20 @@ DynamicGroupBy_ungroup() \link[=DataFrame_class]{DataFrame} } \description{ -Revert the \verb{$rolling()} operation. Doing \verb{$rolling(...)$ungroup()} -returns the original \code{DataFrame}. +Revert the \verb{$group_by_dynamic()} operation. Doing +\verb{$group_by_dynamic(...)$ungroup()} returns the original \code{DataFrame}. } \examples{ df = pl$DataFrame( - dt = c("2020-01-01", "2020-01-01", "2020-01-01", "2020-01-02", "2020-01-03", "2020-01-08"), - a = c(3, 7, 5, 9, 2, 1) -)$with_columns( - pl$col("dt")$str$strptime(pl$Date, format = NULL)$set_sorted() + time = pl$date_range( + start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), + interval = "30m", + eager = TRUE, + ), + n = 0:6 ) +df -df$rolling(index_column = "dt", period = "2d")$ungroup() +df$group_by_dynamic("time", every = "1h")$ungroup() } diff --git a/tests/testthat/_snaps/dataframe.md b/tests/testthat/_snaps/dataframe.md index 74740ff95..c67b2d0d5 100644 --- a/tests/testthat/_snaps/dataframe.md +++ b/tests/testthat/_snaps/dataframe.md @@ -38,27 +38,3 @@ & carb 4, 4, 1, 1, 2, 1, 4, 2, 2, 4 & literal 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 -# rolling for DataFrame: prints all info - - Code - df$rolling(index_column = "dt", period = "2i") - Output - shape: (6, 2) - ┌───────┬─────┐ - │ index ┆ a │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞═══════╪═════╡ - │ 1.0 ┆ 3.0 │ - │ 2.0 ┆ 7.0 │ - │ 3.0 ┆ 5.0 │ - │ 4.0 ┆ 9.0 │ - │ 5.0 ┆ 2.0 │ - │ 6.0 ┆ 1.0 │ - └───────┴─────┘ - index column: dt - other groups: - period: 2i - offset: -2i - closed: right - diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index dd060a2b2..aa063dd57 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -1267,15 +1267,6 @@ test_that("rolling for DataFrame: basic example", { ) }) -test_that("rolling for DataFrame: prints all info", { - df = pl$DataFrame( - index = c(1:5, 6.0), - a = c(3, 7, 5, 9, 2, 1) - )$with_columns(pl$col("index")$set_sorted()) - - expect_snapshot(df$rolling(index_column = "dt", period = "2i")) -}) - test_that("rolling for DataFrame: can be ungrouped", { df = pl$DataFrame( index = c(1:5, 6.0), From 462357d76cbbab97bea665fccf0ada4112dce427 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 13 Jan 2024 17:19:02 +0100 Subject: [PATCH 10/14] add as_polars_df method --- NAMESPACE | 1 + R/as_polars.R | 4 ++++ man/as_polars_df.Rd | 3 +++ tests/testthat/test-as_polars.R | 2 ++ 4 files changed, 10 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index dadb9dd14..11b611a3a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -118,6 +118,7 @@ S3method(as.matrix,RPolarsLazyFrame) S3method(as.vector,RPolarsSeries) S3method(as_polars_df,ArrowTabular) S3method(as_polars_df,RPolarsDataFrame) +S3method(as_polars_df,RPolarsDynamicGroupBy) S3method(as_polars_df,RPolarsGroupBy) S3method(as_polars_df,RPolarsLazyFrame) S3method(as_polars_df,RPolarsLazyGroupBy) diff --git a/R/as_polars.R b/R/as_polars.R index 7d127279e..9fbcedecc 100644 --- a/R/as_polars.R +++ b/R/as_polars.R @@ -106,6 +106,10 @@ as_polars_df.RPolarsGroupBy = function(x, ...) { #' @export as_polars_df.RPolarsRollingGroupBy = as_polars_df.RPolarsGroupBy +#' @rdname as_polars_df +#' @export +as_polars_df.RPolarsDynamicGroupBy = as_polars_df.RPolarsGroupBy + #' @rdname as_polars_df #' @export as_polars_df.RPolarsSeries = function(x, ...) { diff --git a/man/as_polars_df.Rd b/man/as_polars_df.Rd index 3333e91b4..250e998ce 100644 --- a/man/as_polars_df.Rd +++ b/man/as_polars_df.Rd @@ -7,6 +7,7 @@ \alias{as_polars_df.RPolarsDataFrame} \alias{as_polars_df.RPolarsGroupBy} \alias{as_polars_df.RPolarsRollingGroupBy} +\alias{as_polars_df.RPolarsDynamicGroupBy} \alias{as_polars_df.RPolarsSeries} \alias{as_polars_df.RPolarsLazyFrame} \alias{as_polars_df.RPolarsLazyGroupBy} @@ -25,6 +26,8 @@ as_polars_df(x, ...) \method{as_polars_df}{RPolarsRollingGroupBy}(x, ...) +\method{as_polars_df}{RPolarsDynamicGroupBy}(x, ...) + \method{as_polars_df}{RPolarsSeries}(x, ...) \method{as_polars_df}{RPolarsLazyFrame}( diff --git a/tests/testthat/test-as_polars.R b/tests/testthat/test-as_polars.R index a1eb5e67d..2d0bc4923 100644 --- a/tests/testthat/test-as_polars.R +++ b/tests/testthat/test-as_polars.R @@ -14,6 +14,8 @@ make_as_polars_df_cases = function() { "polars_lazy_group_by", pl$LazyFrame(test_df)$group_by("col_int"), "polars_rolling_group_by", pl$DataFrame(test_df)$rolling("col_int", period = "1i"), "polars_lazy_rolling_group_by", pl$LazyFrame(test_df)$rolling("col_int", period = "1i"), + "polars_group_by_dynamic", pl$DataFrame(test_df)$group_by_dynamic("col_int", every = "1i"), + "polars_lazy_group_by_dynamic", pl$LazyFrame(test_df)$group_by_dynamic("col_int", every = "1i"), "arrow Table", arrow::as_arrow_table(test_df) ) } From e9b2a98ffeb75511aef8ff291348dd477a268e8c Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 13 Jan 2024 17:21:31 +0100 Subject: [PATCH 11/14] fix --- tests/testthat/test-groupby.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-groupby.R b/tests/testthat/test-groupby.R index 9e5305182..5f501ddde 100644 --- a/tests/testthat/test-groupby.R +++ b/tests/testthat/test-groupby.R @@ -328,7 +328,7 @@ test_that("group_by_dynamic for LazyFrame: arg 'start_by' works", { ), n = 0:6 )$with_columns( - pl$col("dt")$str$strptime(pl$Datetime("ms"), format = NULL)$set_sorted() + pl$col("dt")$str$strptime(pl$Datetime("ms", tz = "UTC"), format = NULL)$set_sorted() ) # TODO: any weekday should return the same since it is ignored when there's no @@ -341,7 +341,8 @@ test_that("group_by_dynamic for LazyFrame: arg 'start_by' works", { expect_equal( actual[, "dt"], as.POSIXct( - c("2021-12-16 01:00:00 CET", "2021-12-16 02:00:00 CET", "2021-12-16 03:00:00 CET", "2021-12-16 04:00:00 CET") + c("2021-12-16 00:00:00 UTC", "2021-12-16 01:00:00 UTC", "2021-12-16 02:00:00 UTC", "2021-12-16 03:00:00 UTC"), + tz = "UTC" ) ) From 777d29ff6666143518f4f5458626d9aed8c9942c Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 13 Jan 2024 17:34:05 +0100 Subject: [PATCH 12/14] fix --- tests/testthat/test-groupby.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-groupby.R b/tests/testthat/test-groupby.R index 5f501ddde..9d67bff9a 100644 --- a/tests/testthat/test-groupby.R +++ b/tests/testthat/test-groupby.R @@ -192,7 +192,7 @@ test_that("group_by_dynamic for DataFrame calls the LazyFrame method", { test_that("group_by_dynamic for LazyFrame: date variable", { df = pl$LazyFrame( - dt = as.Date(as.Date("2021-12-16"):as.Date("2021-12-22")), + dt = as.Date(as.Date("2021-12-16"):as.Date("2021-12-22"), origin = "1970-01-01"), n = 0:6 )$with_columns( pl$col("dt")$set_sorted() From f2354367ea6d50d386b2b755229b2903743d3ebc Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 13 Jan 2024 17:44:01 +0100 Subject: [PATCH 13/14] fix --- tests/testthat/test-groupby.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-groupby.R b/tests/testthat/test-groupby.R index 9d67bff9a..4dd0b40b8 100644 --- a/tests/testthat/test-groupby.R +++ b/tests/testthat/test-groupby.R @@ -174,7 +174,7 @@ test_that("LazyGroupBy clone", { test_that("group_by_dynamic for DataFrame calls the LazyFrame method", { df = pl$DataFrame( - dt = as.Date(as.Date("2021-12-16"):as.Date("2021-12-22")), + dt = as.Date(as.Date("2021-12-16"):as.Date("2021-12-22"), origin = "1970-01-01"), n = 0:6 )$with_columns( pl$col("dt")$set_sorted() From cb927331efc08bbf749379236f3e062fe99d39b9 Mon Sep 17 00:00:00 2001 From: eitsupi Date: Sun, 14 Jan 2024 04:32:41 +0000 Subject: [PATCH 14/14] fix: add `...` to enforce named arguments for window functions and auto formatting --- R/expr__expr.R | 8 ++++++-- R/group_by_dynamic.R | 25 ++++++++++++------------- R/lazyframe__lazy.R | 4 +++- man/Expr_rolling.Rd | 3 +++ man/LazyFrame_group_by_dynamic.Rd | 3 +++ man/LazyFrame_rolling.Rd | 3 +++ tests/testthat/test-groupby.R | 7 ++++--- 7 files changed, 34 insertions(+), 19 deletions(-) diff --git a/R/expr__expr.R b/R/expr__expr.R index 36f7f07d3..ed2918a3b 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -3507,6 +3507,7 @@ Expr_peak_max = function() { #' column represents an index, it has to be either Int32 or Int64. Note that #' Int32 gets temporarily cast to Int64, so if performance matters use an Int64 #' column. +#' @param ... Ignored. #' @param period Length of the window, must be non-negative. #' @param offset Offset of the window. Default is `-period`. #' @param closed Define which sides of the temporal interval are closed @@ -3569,8 +3570,11 @@ Expr_peak_max = function() { #' df$with_columns( #' sum_a_offset1 = pl$sum("a")$rolling(index_column = "dt", period = "2d", offset = "1d") #' ) -Expr_rolling = function(index_column, period, offset = NULL, - closed = "right", check_sorted = TRUE) { +Expr_rolling = function( + index_column, + ..., + period, offset = NULL, + closed = "right", check_sorted = TRUE) { if (is.null(offset)) { offset = paste0("-", period) } diff --git a/R/group_by_dynamic.R b/R/group_by_dynamic.R index a4335b73e..104b1ff63 100644 --- a/R/group_by_dynamic.R +++ b/R/group_by_dynamic.R @@ -27,8 +27,7 @@ RPolarsDynamicGroupBy = new.env(parent = emptyenv()) #' @noRd construct_group_by_dynamic = function( df, index_column, every, period, offset, include_boundaries, closed, label, - by, start_by, check_sorted -) { + by, start_by, check_sorted) { if (!inherits(df, "RPolarsDataFrame")) { stop("internal error: construct_group called not on DataFrame") } @@ -94,17 +93,17 @@ DynamicGroupBy_agg = function(...) { prv$dat$ lazy()$ group_by_dynamic( - index_column = prv$index_column, - every = prv$every, - period = prv$period, - offset = prv$offset, - include_boundaries = prv$include_boundaries, - closed = prv$closed, - label = prv$label, - by = prv$by, - start_by = prv$start_by, - check_sorted = prv$check_sorted - )$ + index_column = prv$index_column, + every = prv$every, + period = prv$period, + offset = prv$offset, + include_boundaries = prv$include_boundaries, + closed = prv$closed, + label = prv$label, + by = prv$by, + start_by = prv$start_by, + check_sorted = prv$check_sorted + )$ agg(unpack_list(..., .context = "in $agg():"))$ collect(no_optimization = TRUE) } diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 903e87dfd..c7569ceaf 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1710,7 +1710,8 @@ LazyFrame_with_context = function(other) { #' pl$min("a")$alias("min_a"), #' pl$max("a")$alias("max_a") #' )$collect() -LazyFrame_rolling = function(index_column, period, offset = NULL, closed = "right", by = NULL, check_sorted = TRUE) { +LazyFrame_rolling = function( + index_column, ..., period, offset = NULL, closed = "right", by = NULL, check_sorted = TRUE) { if (is.null(offset)) { offset = paste0("-", period) } @@ -1810,6 +1811,7 @@ LazyFrame_rolling = function(index_column, period, offset = NULL, closed = "righ #' )$agg(A_agg_list = pl$col("A"))$collect() LazyFrame_group_by_dynamic = function( index_column, + ..., every, period = NULL, offset = NULL, diff --git a/man/Expr_rolling.Rd b/man/Expr_rolling.Rd index d5307a82a..0f106ab95 100644 --- a/man/Expr_rolling.Rd +++ b/man/Expr_rolling.Rd @@ -6,6 +6,7 @@ \usage{ Expr_rolling( index_column, + ..., period, offset = NULL, closed = "right", @@ -19,6 +20,8 @@ column represents an index, it has to be either Int32 or Int64. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column.} +\item{...}{Ignored.} + \item{period}{Length of the window, must be non-negative.} \item{offset}{Offset of the window. Default is \code{-period}.} diff --git a/man/LazyFrame_group_by_dynamic.Rd b/man/LazyFrame_group_by_dynamic.Rd index d3039b4d5..c24614559 100644 --- a/man/LazyFrame_group_by_dynamic.Rd +++ b/man/LazyFrame_group_by_dynamic.Rd @@ -6,6 +6,7 @@ \usage{ LazyFrame_group_by_dynamic( index_column, + ..., every, period = NULL, offset = NULL, @@ -25,6 +26,8 @@ case of a rolling group by on indices, dtype needs to be either Int32 or Int64. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column.} +\item{...}{Ignored.} + \item{every}{Interval of the window.} \item{period}{Length of the window, must be non-negative.} diff --git a/man/LazyFrame_rolling.Rd b/man/LazyFrame_rolling.Rd index 657d0a997..f4ab90dcb 100644 --- a/man/LazyFrame_rolling.Rd +++ b/man/LazyFrame_rolling.Rd @@ -6,6 +6,7 @@ \usage{ LazyFrame_rolling( index_column, + ..., period, offset = NULL, closed = "right", @@ -21,6 +22,8 @@ case of a rolling group by on indices, dtype needs to be either Int32 or Int64. Note that Int32 gets temporarily cast to Int64, so if performance matters use an Int64 column.} +\item{...}{Ignored.} + \item{period}{Length of the window, must be non-negative.} \item{offset}{Offset of the window. Default is \code{-period}.} diff --git a/tests/testthat/test-groupby.R b/tests/testthat/test-groupby.R index 4dd0b40b8..e992d4485 100644 --- a/tests/testthat/test-groupby.R +++ b/tests/testthat/test-groupby.R @@ -454,10 +454,11 @@ test_that("group_by_dynamic for LazyFrame: arg 'include_boundaries' works", { actual = df$group_by_dynamic( index_column = "dt", every = "2d", offset = "1d", - include_boundaries = TRUE)$ + include_boundaries = TRUE + )$ agg( - pl$col("n") - ) + pl$col("n") + ) expect_named(actual, c("_lower_boundary", "_upper_boundary", "dt", "n")) })