Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement group_by_dynamic() for DataFrame and LazyFrame #691

Merged
merged 18 commits into from
Jan 14, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
### What's changed

- New method `$rolling()` for `DataFrame` and `LazyFrame` (#682).
- New method `$sink_ndjson()` for LazyFrame (#681).
- New method `$sink_ndjson()` for `LazyFrame` (#681).
- New method `$group_by_dynamic()` for `DataFrame` and `LazyFrame` (#691).

## polars 0.12.2

Expand Down
85 changes: 85 additions & 0 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1833,3 +1833,88 @@ DataFrame_rolling = function(index_column, period, offset = NULL, closed = "righ
class(out) = "RPolarsGroupBy"
out
}

#' @inherit LazyFrame_group_by_dynamic title description details params
#' @return A [GroupBy][GroupBy_class] object
#'
#' @examples
#' df = pl$DataFrame(
#' time = pl$date_range(
#' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
#' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
#' interval = "30m",
#' eager = TRUE,
#' ),
#' n = 0:6
#' )
#'
#' # get the sum in the following hour relative to the "time" column
#' df$group_by_dynamic("time", every = "1h")$agg(
#' vals = pl$col("n"),
#' sum = pl$col("n")$sum()
#' )
#'
#' # using "include_boundaries = TRUE" is helpful to see the period considered
#' df$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg(
#' vals = pl$col("n")
#' )
#'
#' # in the example above, the values didn't include the one *exactly* 1h after
#' # the start because "closed = 'left'" by default.
#' # Changing it to "right" includes values that are exactly 1h after. Note that
#' # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00],
#' # even if this interval wasn't there originally
#' df$group_by_dynamic("time", every = "1h", closed = "right")$agg(
#' vals = pl$col("n")
#' )
#' # To keep both boundaries, we use "closed = 'both'". Some values now belong to
#' # several groups:
#' df$group_by_dynamic("time", every = "1h", closed = "both")$agg(
#' vals = pl$col("n")
#' )
#'
#' # Dynamic group bys can also be combined with grouping on normal keys
#' df = df$with_columns(groups = pl$Series(c("a", "a", "a", "b", "b", "a", "a")))
#' df
#'
#' df$group_by_dynamic(
#' "time",
#' every = "1h",
#' closed = "both",
#' by = "groups",
#' include_boundaries = TRUE
#' )$agg(pl$col("n"))
#'
#' # We can also create a dynamic group by based on an index column
#' df = pl$LazyFrame(
#' idx = 0:5,
#' A = c("A", "A", "B", "B", "B", "C")
#' )$with_columns(pl$col("idx")$set_sorted())
#' df
#'
#' df$group_by_dynamic(
#' "idx",
#' every = "2i",
#' period = "3i",
#' include_boundaries = TRUE,
#' closed = "right"
#' )$agg(A_agg_list = pl$col("A"))
DataFrame_group_by_dynamic = function(
index_column,
every,
period = NULL,
offset = NULL,
include_boundaries = FALSE,
closed = "left",
label = "left",
by = NULL,
start_by = "window",
check_sorted = TRUE) {
out = self$lazy()$group_by_dynamic(
index_column, every, period, offset, include_boundaries, closed, label, by,
start_by, check_sorted
)
attr(out, "is_dynamic_group_by") = TRUE
class(out) = "RPolarsGroupBy"
out
}
2 changes: 2 additions & 0 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -1117,6 +1117,8 @@ RPolarsLazyFrame$with_context <- function(contexts) .Call(wrap__RPolarsLazyFrame

RPolarsLazyFrame$rolling <- function(index_column, period, offset, closed, by, check_sorted) .Call(wrap__RPolarsLazyFrame__rolling, self, index_column, period, offset, closed, by, check_sorted)

RPolarsLazyFrame$group_by_dynamic <- function(index_column, every, period, offset, label, include_boundaries, closed, by, start_by, check_sorted) .Call(wrap__RPolarsLazyFrame__group_by_dynamic, self, index_column, every, period, offset, label, include_boundaries, closed, by, start_by, check_sorted)

#' @export
`$.RPolarsLazyFrame` <- function (self, name) { func <- RPolarsLazyFrame[[name]]; environment(func) <- environment(); func }

Expand Down
6 changes: 4 additions & 2 deletions R/group_by.R
etiennebacher marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ print.RPolarsGroupBy = function(x, ...) {
#' pl$col("bar")$mean()$alias("bar_tail_sum")
#' )
GroupBy_agg = function(...) {
if (isTRUE(attributes(self)[["is_rolling_group_by"]])) {
if (isTRUE(attributes(self)[["is_rolling_group_by"]]) ||
isTRUE(attributes(self)[["is_dynamic_group_by"]])) {
class(self) = "RPolarsLazyGroupBy"
self$agg(unpack_list(..., .context = "in $agg():"))$collect(no_optimization = TRUE)
} else {
Expand Down Expand Up @@ -300,7 +301,8 @@ GroupBy_null_count = function() {
#'
#' gb$ungroup()
GroupBy_ungroup = function() {
if (isTRUE(attributes(self)[["is_rolling_group_by"]])) {
if (isTRUE(attributes(self)[["is_rolling_group_by"]]) ||
isTRUE(attributes(self)[["is_dynamic_group_by"]])) {
class(self) = "RPolarsLazyGroupBy"
self = self$ungroup()$collect(no_optimization = TRUE)
} else {
Expand Down
111 changes: 111 additions & 0 deletions R/lazyframe__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -1720,3 +1720,114 @@ LazyFrame_rolling = function(index_column, period, offset = NULL, closed = "righ
) |>
unwrap("in $rolling():")
}


#' Group based on a date/time or integer column
#'
#' @inherit LazyFrame_rolling description details params
#'
#' @param every Interval of the window.
#' @param include_boundaries Add two columns `"_lower_boundary"` and
#' `"_upper_boundary"` columns that show the boundaries of the window. This will
#' impact performance because it’s harder to parallelize.
#' @param label Define which label to use for the window:
#' * `"left"`: lower boundary of the window
#' * `"right"`: upper boundary of the window
#' * `"datapoint"`: the first value of the index column in the given window. If
#' you don’t need the label to be at one of the boundaries, choose this option
#' for maximum performance.
#' @param start_by The strategy to determine the start of the first window by:
#' * `"window"`: start by taking the earliest timestamp, truncating it with `every`,
#' and then adding `offset`. Note that weekly windows start on Monday.
#' * `"datapoint"`: start from the first encountered data point.
#' * a day of the week (only takes effect if `every` contains `"w"`): `"monday"`
#' starts the window on the Monday before the first data point, etc.
#'
#' @return A [LazyGroupBy][LazyGroupBy_class] object
#'
#' @examples
#' lf = pl$LazyFrame(
#' time = pl$date_range(
#' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
#' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
#' interval = "30m",
#' eager = TRUE,
#' ),
#' n = 0:6
#' )
#' lf$collect()
#'
#' # get the sum in the following hour relative to the "time" column
#' lf$group_by_dynamic("time", every = "1h")$agg(
#' vals = pl$col("n"),
#' sum = pl$col("n")$sum()
#' )$collect()
#'
#' # using "include_boundaries = TRUE" is helpful to see the period considered
#' lf$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg(
#' vals = pl$col("n")
#' )$collect()
#'
#' # in the example above, the values didn't include the one *exactly* 1h after
#' # the start because "closed = 'left'" by default.
#' # Changing it to "right" includes values that are exactly 1h after. Note that
#' # the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00],
#' # even if this interval wasn't there originally
#' lf$group_by_dynamic("time", every = "1h", closed = "right")$agg(
#' vals = pl$col("n")
#' )$collect()
#' # To keep both boundaries, we use "closed = 'both'". Some values now belong to
#' # several groups:
#' lf$group_by_dynamic("time", every = "1h", closed = "both")$agg(
#' vals = pl$col("n")
#' )$collect()
#'
#' # Dynamic group bys can also be combined with grouping on normal keys
#' lf = lf$with_columns(groups = pl$Series(c("a", "a", "a", "b", "b", "a", "a")))
#' lf$collect()
#'
#' lf$group_by_dynamic(
#' "time",
#' every = "1h",
#' closed = "both",
#' by = "groups",
#' include_boundaries = TRUE
#' )$agg(pl$col("n"))$collect()
#'
#' # We can also create a dynamic group by based on an index column
#' lf = pl$LazyFrame(
#' idx = 0:5,
#' A = c("A", "A", "B", "B", "B", "C")
#' )$with_columns(pl$col("idx")$set_sorted())
#' lf$collect()
#'
#' lf$group_by_dynamic(
#' "idx",
#' every = "2i",
#' period = "3i",
#' include_boundaries = TRUE,
#' closed = "right"
#' )$agg(A_agg_list = pl$col("A"))$collect()
LazyFrame_group_by_dynamic = function(
index_column,
every,
period = NULL,
offset = NULL,
include_boundaries = FALSE,
closed = "left",
label = "left",
by = NULL,
start_by = "window",
check_sorted = TRUE) {
eitsupi marked this conversation as resolved.
Show resolved Hide resolved
if (is.null(offset)) {
offset = paste0("-", every)
}
if (is.null(period)) {
period = every
}
.pr$LazyFrame$group_by_dynamic(
self, index_column, every, period, offset, label, include_boundaries, closed,
wrap_elist_result(by, str_to_lit = FALSE), start_by, check_sorted
) |>
unwrap("in $group_by_dynamic():")
}
Loading
Loading