Skip to content

Commit

Permalink
Rename $groupby() to $group_by() (#427)
Browse files Browse the repository at this point in the history
Co-authored-by: eitsupi <50911393+eitsupi@users.noreply.github.com>
  • Loading branch information
etiennebacher and eitsupi committed Oct 16, 2023
1 parent 6132f51 commit ca0b51b
Show file tree
Hide file tree
Showing 59 changed files with 165 additions and 164 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ Collate:
'functions__eager.R'
'functions__lazy.R'
'functions__whenthen.R'
'groupby.R'
'group_by.R'
'info.R'
'ipc.R'
'lazyframe__groupby.R'
'lazyframe__group_by.R'
'lazyframe__lazy.R'
'namespace.R'
'options.R'
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
`$with_columns()` instead (#402).
- Subnamespace `$arr` has been removed (it was deprecated since 0.8.1). Use `$list`
instead (#402).
- `$groupby()` is renamed to `$group_by()` (#427).

## What's changed

Expand Down
2 changes: 1 addition & 1 deletion R/after-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -295,5 +295,5 @@ pl_pub_class_env = as.environment(mget(pl_class_names, envir = pl_pub_env))
#'
#' # The single exception from the rule is class "GroupBy", where objects also have
#' # two private attributes "groupby_input" and "maintain_order".
#' str(pl$DataFrame(iris)$groupby("Species"))
#' str(pl$DataFrame(iris)$group_by("Species"))
NULL
8 changes: 4 additions & 4 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -836,24 +836,24 @@ DataFrame_filter = function(bool_expr) {
}

#' Group a DataFrame
#' @inherit LazyFrame_groupby description params
#' @inherit LazyFrame_group_by description params
#' @keywords DataFrame
#' @return GroupBy (a DataFrame with special groupby methods like `$agg()`)
#' @examples
#' gb = pl$DataFrame(
#' foo = c("one", "two", "two", "one", "two"),
#' bar = c(5, 3, 2, 4, 1)
#' )$groupby("foo", maintain_order = TRUE)
#' )$group_by("foo", maintain_order = TRUE)
#'
#' gb
#'
#' gb$agg(
#' pl$col("bar")$sum()$suffix("_sum"),
#' pl$col("bar")$mean()$alias("bar_tail_sum")
#' )
DataFrame_groupby = function(..., maintain_order = pl$options$maintain_order) {
DataFrame_group_by = function(..., maintain_order = pl$options$maintain_order) {
# clone the DataFrame, bundle args as attributes. Non fallible.
construct_groupby(self, groupby_input = unpack_list(...), maintain_order = maintain_order)
construct_group_by(self, groupby_input = unpack_list(...), maintain_order = maintain_order)
}


Expand Down
16 changes: 8 additions & 8 deletions R/expr__expr.R
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ Expr_gt_eq = function(other) {
#' group = c("one", "one", "one", "two", "two", "two"),
#' value = c(94, 95, 96, 97, 97, 99)
#' ))
#' df$groupby("group", maintain_order = TRUE)$agg(pl$col("value")$agg_groups())
#' df$group_by("group", maintain_order = TRUE)$agg(pl$col("value")$agg_groups())
Expr_agg_groups = "use_extendr_wrapper"


Expand Down Expand Up @@ -794,7 +794,7 @@ Expr_map = function(f, output_type = NULL, agg_list = FALSE, in_background = FAL
#' e_all = pl$all() # perform groupby agg on all columns otherwise e.g. pl$col("Sepal.Length")
#' e_sum = e_all$apply(\(s) sum(s$to_r()))$suffix("_sum")
#' e_head = e_all$apply(\(s) head(s$to_r(), 2))$suffix("_head")
#' pl$DataFrame(iris)$groupby("Species")$agg(e_sum, e_head)
#' pl$DataFrame(iris)$group_by("Species")$agg(e_sum, e_head)
#'
#'
#' # apply over single values (should be avoided as it takes ~2.5us overhead + R function exec time
Expand Down Expand Up @@ -847,7 +847,7 @@ Expr_map = function(f, output_type = NULL, agg_list = FALSE, in_background = FAL
#' # R parallel process example: use Sys.sleep() to imitate some CPU-expensive computation.
#'
#' # use apply over each Species-group in each column: this equals 12 sequential runs, ~1.2 sec.
#' pl$LazyFrame(iris)$groupby("Species")$agg(
#' pl$LazyFrame(iris)$group_by("Species")$agg(
#' pl$all()$apply(\(s) {
#' Sys.sleep(.1)
#' s$sum()
Expand All @@ -858,7 +858,7 @@ Expr_map = function(f, output_type = NULL, agg_list = FALSE, in_background = FAL
#' pl$set_options(rpool_cap = 0) # drop any previous processes, just to show start-up overhead here
#' pl$set_options(rpool_cap = 4) # set back to 4, the default
#' pl$options$rpool_cap
#' pl$LazyFrame(iris)$groupby("Species")$agg(
#' pl$LazyFrame(iris)$group_by("Species")$agg(
#' pl$all()$apply(\(s) {
#' Sys.sleep(.1)
#' s$sum()
Expand All @@ -867,7 +867,7 @@ Expr_map = function(f, output_type = NULL, agg_list = FALSE, in_background = FAL
#'
#' # map in parallel 2: Reuse R processes in "polars global_rpool".
#' pl$options$rpool_cap
#' pl$LazyFrame(iris)$groupby("Species")$agg(
#' pl$LazyFrame(iris)$group_by("Species")$agg(
#' pl$all()$apply(\(s) {
#' Sys.sleep(.1)
#' s$sum()
Expand Down Expand Up @@ -2368,7 +2368,7 @@ Expr_quantile = function(quantile, interpolation = "nearest") {
#' b = c(1, 2, 3)
#' ))
#'
#' df$groupby("group_col")$agg(
#' df$group_by("group_col")$agg(
#' pl$col("b")$filter(pl$col("b") < 2)$sum()$alias("lt"),
#' pl$col("b")$filter(pl$col("b") >= 2)$sum()$alias("gte")
#' )
Expand Down Expand Up @@ -2406,7 +2406,7 @@ Expr_where = Expr_filter
#' @examples
#' pl$DataFrame(list(a = letters))$select(pl$col("a")$explode()$take(0:5))
#'
#' listed_group_df = pl$DataFrame(iris[c(1:3, 51:53), ])$groupby("Species")$agg(pl$all())
#' listed_group_df = pl$DataFrame(iris[c(1:3, 51:53), ])$group_by("Species")$agg(pl$all())
#' print(listed_group_df)
#' vectors_df = listed_group_df$select(
#' pl$col(c("Sepal.Width", "Sepal.Length"))$explode()
Expand Down Expand Up @@ -4261,7 +4261,7 @@ Expr_shrink_dtype = "use_extendr_wrapper"
#' df_with_list = pl$DataFrame(
#' group = c(1, 1, 2, 2, 3),
#' value = c(1:5)
#' )$groupby(
#' )$group_by(
#' "group",
#' maintain_order = TRUE
#' )$agg(
Expand Down
2 changes: 1 addition & 1 deletion R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -1031,7 +1031,7 @@ LazyFrame$drop_nulls <- function(subset) .Call(wrap__LazyFrame__drop_nulls, self

LazyFrame$unique <- function(subset, keep, maintain_order) .Call(wrap__LazyFrame__unique, self, subset, keep, maintain_order)

LazyFrame$groupby <- function(exprs, maintain_order) .Call(wrap__LazyFrame__groupby, self, exprs, maintain_order)
LazyFrame$group_by <- function(exprs, maintain_order) .Call(wrap__LazyFrame__group_by, self, exprs, maintain_order)

LazyFrame$with_row_count <- function(name, offset) .Call(wrap__LazyFrame__with_row_count, self, name, offset)

Expand Down
2 changes: 1 addition & 1 deletion R/functions__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ pl$element = function() pl$col("")
#' df$select(pl$count())
#'
#'
#' df$groupby("c", maintain_order = TRUE)$agg(pl$count())
#' df$group_by("c", maintain_order = TRUE)$agg(pl$count())
pl$count = function(column = NULL) { # -> Expr | int:
if (is.null(column)) {
return(.pr$Expr$new_count())
Expand Down
30 changes: 15 additions & 15 deletions R/groupby.R → R/group_by.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ GroupBy = new.env(parent = emptyenv())
#' @keywords internal
#' @return The input as grouped DataFrame
#' @noRd
construct_groupby = function(df, groupby_input, maintain_order) {
construct_group_by = function(df, groupby_input, maintain_order) {
if (!inherits(df, "DataFrame")) stopf("internal error: construct_group called not on DataFrame")
df = df$clone()
attr(df, "private") = list(groupby_input = groupby_input, maintain_order = maintain_order)
Expand All @@ -59,7 +59,7 @@ construct_groupby = function(df, groupby_input, maintain_order) {
#' @return self
#' @export
#'
#' @examples pl$DataFrame(iris)$groupby("Species")
#' @examples pl$DataFrame(iris)$group_by("Species")
print.GroupBy = function(x, ...) {
.pr$DataFrame$print(x)
cat("groups: ")
Expand All @@ -81,7 +81,7 @@ print.GroupBy = function(x, ...) {
#' foo = c("one", "two", "two", "one", "two"),
#' bar = c(5, 3, 2, 4, 1)
#' )$
#' groupby("foo")$
#' group_by("foo")$
#' agg(
#' pl$col("bar")$sum()$suffix("_sum"),
#' pl$col("bar")$mean()$alias("bar_tail_sum")
Expand All @@ -108,7 +108,7 @@ GroupBy_agg = function(...) {
#' c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE),
#' d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
#' )
#' df$groupby("d", maintain_order = TRUE)$first()
#' df$group_by("d", maintain_order = TRUE)$first()
GroupBy_first = function() {
self$agg(pl$all()$first())
}
Expand All @@ -124,7 +124,7 @@ GroupBy_first = function() {
#' c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE),
#' d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
#' )
#' df$groupby("d", maintain_order = TRUE)$last()
#' df$group_by("d", maintain_order = TRUE)$last()
GroupBy_last = function() {
self$agg(pl$all()$last())
}
Expand All @@ -140,7 +140,7 @@ GroupBy_last = function() {
#' c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE),
#' d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
#' )
#' df$groupby("d", maintain_order = TRUE)$max()
#' df$group_by("d", maintain_order = TRUE)$max()
GroupBy_max = function() {
self$agg(pl$all()$max())
}
Expand All @@ -156,7 +156,7 @@ GroupBy_max = function() {
#' c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE),
#' d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
#' )
#' df$groupby("d", maintain_order = TRUE)$mean()
#' df$group_by("d", maintain_order = TRUE)$mean()
GroupBy_mean = function() {
self$agg(pl$all()$mean())
}
Expand All @@ -172,7 +172,7 @@ GroupBy_mean = function() {
#' c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE),
#' d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
#' )
#' df$groupby("d", maintain_order = TRUE)$median()
#' df$group_by("d", maintain_order = TRUE)$median()
GroupBy_median = function() {
self$agg(pl$all()$median())
}
Expand All @@ -188,7 +188,7 @@ GroupBy_median = function() {
#' c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE),
#' d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
#' )
#' df$groupby("d", maintain_order = TRUE)$min()
#' df$group_by("d", maintain_order = TRUE)$min()
GroupBy_min = function() {
self$agg(pl$all()$min())
}
Expand All @@ -204,7 +204,7 @@ GroupBy_min = function() {
#' c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE),
#' d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
#' )
#' df$groupby("d", maintain_order = TRUE)$sum()
#' df$group_by("d", maintain_order = TRUE)$sum()
GroupBy_sum = function() {
self$agg(pl$all()$sum())
}
Expand All @@ -220,7 +220,7 @@ GroupBy_sum = function() {
#' c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE),
#' d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
#' )
#' df$groupby("d", maintain_order = TRUE)$var()
#' df$group_by("d", maintain_order = TRUE)$var()
GroupBy_var = function() {
self$agg(pl$all()$var())
}
Expand All @@ -236,7 +236,7 @@ GroupBy_var = function() {
#' c = c(TRUE, TRUE, TRUE, FALSE, FALSE, TRUE),
#' d = c("Apple", "Orange", "Apple", "Apple", "Banana", "Banana")
#' )
#' df$groupby("d", maintain_order = TRUE)$std()
#' df$group_by("d", maintain_order = TRUE)$std()
GroupBy_std = function() {
self$agg(pl$all()$std())
}
Expand All @@ -257,7 +257,7 @@ GroupBy_quantile = function(quantile, interpolation = "nearest") {
#' @keywords GroupBy
#' @param periods integer Number of periods to shift (may be negative).
#' @return GroupBy
#' @examples pl$DataFrame(mtcars)$groupby("cyl")$shift(2)
#' @examples pl$DataFrame(mtcars)$group_by("cyl")$shift(2)
GroupBy_shift = function(periods = 1) {
self$agg(pl$all()$shift(periods))
}
Expand All @@ -268,7 +268,7 @@ GroupBy_shift = function(periods = 1) {
#' @param fill_value fill None values with the result of this expression.
#' @param periods integer Number of periods to shift (may be negative).
#' @return GroupBy
#' @examples pl$DataFrame(mtcars)$groupby("cyl")$shift_and_fill(99, 1)
#' @examples pl$DataFrame(mtcars)$group_by("cyl")$shift_and_fill(99, 1)
GroupBy_shift_and_fill = function(fill_value, periods = 1) {
self$agg(pl$all()$shift_and_fill(periods, fill_value))
}
Expand All @@ -280,7 +280,7 @@ GroupBy_shift_and_fill = function(fill_value, periods = 1) {
#' @examples
#' x = mtcars
#' x[1:10, 3:5] = NA
#' pl$DataFrame(x)$groupby("cyl")$null_count()
#' pl$DataFrame(x)$group_by("cyl")$null_count()
GroupBy_null_count = function() {
self$agg(pl$all()$null_count())
}
Expand Down
4 changes: 2 additions & 2 deletions R/lazyframe__groupby.R → R/lazyframe__group_by.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ print.LazyGroupBy = function(x, ...) {

#' @title LazyGroupBy_agg
#' @description
#' aggregate a polar_lazy_groupby
#' Aggregate a `LazyGroupBy` object.
#' @param ... exprs to aggregate over.
#' ... args can also be passed wrapped in a list `$agg(list(e1,e2,e3))`
#' @return A new `LazyFrame` object.
Expand All @@ -26,7 +26,7 @@ print.LazyGroupBy = function(x, ...) {
#' bar = c(5, 3, 2, 4, 1)
#' )$
#' lazy()$
#' groupby("foo")
#' group_by("foo")
#'
#' #
#' print(lgb)
Expand Down
12 changes: 6 additions & 6 deletions R/lazyframe__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -859,15 +859,15 @@ LazyFrame_unique = function(subset = NULL, keep = "first", maintain_order = FALS
#' foo = c("one", "two", "two", "one", "two"),
#' bar = c(5, 3, 2, 4, 1)
#' )$
#' groupby("foo")$
#' group_by("foo")$
#' agg(
#' pl$col("bar")$sum()$suffix("_sum"),
#' pl$col("bar")$mean()$alias("bar_tail_sum")
#' )$
#' collect()
LazyFrame_groupby = function(..., maintain_order = pl$options$maintain_order) {
.pr$LazyFrame$groupby(self, unpack_list(...), maintain_order) |>
unwrap("in $groupby():")
LazyFrame_group_by = function(..., maintain_order = pl$options$maintain_order) {
.pr$LazyFrame$group_by(self, unpack_list(...), maintain_order) |>
unwrap("in $group_by():")
}

#' Join LazyFrames
Expand Down Expand Up @@ -1294,7 +1294,7 @@ LazyFrame_fetch = function(
#' # -1- map each Species-group with native polars, takes ~120us only
#' pl$LazyFrame(iris)$
#' sort("Sepal.Length")$
#' groupby("Species", maintain_order = TRUE)$
#' group_by("Species", maintain_order = TRUE)$
#' agg(pl$col(pl$Float64)$first() + 5)$
#' profile()
#'
Expand All @@ -1308,7 +1308,7 @@ LazyFrame_fetch = function(
#'
#' pl$LazyFrame(iris)$
#' sort("Sepal.Length")$
#' groupby("Species", maintain_order = TRUE)$
#' group_by("Species", maintain_order = TRUE)$
#' agg(pl$col(pl$Float64)$apply(r_func))$
#' profile()
LazyFrame_profile = function() {
Expand Down
2 changes: 1 addition & 1 deletion R/options.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ polars_optreq$rpool_cap = list() # rust-side options already check args
#' general pro "immutable objects". Immutability is also classic in R. To mimic
#' the Python-polars API, set this to `FALSE`.
#' @param maintain_order Default for all `maintain_order` options (present in
#' `$groupby()` or `$unique()` for example).
#' `$group_by()` or `$unique()` for example).
#' @param do_not_repeat_call Do not print the call causing the error in error
#' messages. The default (`FALSE`) is to show them.
#' @param debug_polars Print additional information to debug Polars.
Expand Down
4 changes: 2 additions & 2 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -193,11 +193,11 @@ dat[1:4, c("mpg", "qsec", "hp")]

However, the true power of Polars is unlocked by using *methods*, which are
encapsulated in the `DataFrame` object itself. For example, we can chain the
`$groupby()` and the `$mean()` methods to compute group-wise means for each
`$group_by()` and the `$mean()` methods to compute group-wise means for each
column of the dataset:

```{r}
dat$groupby("cyl", maintain_order = TRUE)$mean()
dat$group_by("cyl", maintain_order = TRUE)$mean()
```

Note that we use `maintain_order = TRUE` so that `polars` always keeps the groups
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,11 +206,11 @@ dat[1:4, c("mpg", "qsec", "hp")]

However, the true power of Polars is unlocked by using *methods*, which
are encapsulated in the `DataFrame` object itself. For example, we can
chain the `$groupby()` and the `$mean()` methods to compute group-wise
chain the `$group_by()` and the `$mean()` methods to compute group-wise
means for each column of the dataset:

``` r
dat$groupby("cyl", maintain_order = TRUE)$mean()
dat$group_by("cyl", maintain_order = TRUE)$mean()
#> shape: (3, 11)
#> ┌─────┬───────────┬────────────┬────────────┬───┬──────────┬──────────┬──────────┬──────────┐
#> │ cyl ┆ mpg ┆ disp ┆ hp ┆ … ┆ vs ┆ am ┆ gear ┆ carb │
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/reference_home.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ they are returned in a list, and only the new columns or the grouping columns
are returned.

```{r}
test$groupby(pl$col("cyl"))$agg(
test$group_by(pl$col("cyl"))$agg(
pl$col("mpg"), # varying number of values
pl$col("mpg")$slice(0, 2)$suffix("_sliced"), # two values
# aggregated to one value and implicitly unpacks list
Expand Down
Loading

0 comments on commit ca0b51b

Please sign in to comment.