Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More lazy functions #196

Merged
merged 12 commits into from
May 12, 2023
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,11 @@ Collate:
'expr__meta.R'
'expr__string.R'
'expr__struct.R'
'functions.R'
'functions__eager.R'
'functions__lazy.R'
'functions__whenthen.R'
'groupby.R'
'ipc.R'
'lazy_functions.R'
'lazyframe__background.R'
'lazyframe__groupby.R'
'lazyframe__lazy.R'
Expand All @@ -77,7 +78,6 @@ Collate:
'series__series.R'
'translation.R'
'vctrs.R'
'whenthen.R'
'zzz.R'
Config/rextendr/version: 0.2.0.9000
VignetteBuilder: knitr
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# polars (development version)

## What's changed
- lazy functions translated: `pl$implode`, `pl$explode`, `pl$unique`, `pl$approx_unique`, `pl$head`, `pl$tail` (#196)
- `pl$list` is deprecated, use `pl$implode` instead (#196)

# polars 0.6.0

## BREAKING CHANGES
Expand Down
43 changes: 32 additions & 11 deletions R/expr__expr.R
Original file line number Diff line number Diff line change
Expand Up @@ -1942,7 +1942,16 @@ Expr_product = "use_extendr_wrapper"
#' pl$DataFrame(iris)$select(pl$col("Species")$n_unique())
Expr_n_unique = "use_extendr_wrapper"


#' Approx count unique values
#' @keywords Expr
#' @description
#' This is done using the HyperLogLog++ algorithm for cardinality estimation.
#' @aliases approx_unique
#' @return Expr
sorhawell marked this conversation as resolved.
Show resolved Hide resolved
#' @docType NULL
#' @examples
#' pl$DataFrame(iris)$select(pl$col("Species")$approx_unique())
Expr_approx_unique = "use_extendr_wrapper"

#' Count `Nulls`
#' @keywords Expr
Expand Down Expand Up @@ -2232,9 +2241,8 @@ Expr_take_every = function(n) {
#' @examples
#' #get 3 first elements
#' pl$DataFrame(list(x=1:11))$select(pl$col("x")$head(3))
Expr_head = function(n=10) {
if(!is.numeric(n)) stopf("n must be numeric")
unwrap(.pr$Expr$head(self,n=n))
Expr_head = function(n = 10) {
  # Forward `n` unchanged to the low-level Expr method; no R-side type check
  # is done here.
  head_result = .pr$Expr$head(self, n = n)
  # unwrap() receives "in $head():" as context for reporting a failure.
  unwrap(head_result, "in $head():")
}

#' Tail
Expand All @@ -2248,9 +2256,8 @@ Expr_head = function(n=10) {
#' @examples
#' #get 3 last elements
#' pl$DataFrame(list(x=1:11))$select(pl$col("x")$tail(3))
Expr_tail = function(n=10) {
if(!is.numeric(n)) stopf("n must be numeric")
unwrap(.pr$Expr$tail(self,n=n))
Expr_tail = function(n = 10) {
  # Forward `n` unchanged to the low-level Expr method; no R-side type check
  # is done here.
  tail_result = .pr$Expr$tail(self, n = n)
  # unwrap() receives "in $tail():" as context for reporting a failure.
  unwrap(tail_result, "in $tail():")
}


Expand Down Expand Up @@ -3952,16 +3959,30 @@ Expr_set_sorted = function(reverse = FALSE) {


#' Wrap column in list
#' @description Aggregate to list.
#' @description Aggregate values into a list.
#' @keywords Expr
#' @return Expr
#' @aliases list
#' @name Expr_list
#' @details use to_struct to wrap a DataFrame
#' @details use to_struct to wrap a DataFrame. Note that implode() was previously
#' called list().
#' @format a method
#' @examples
#' pl$select(pl$lit(1:4)$list(), pl$lit(c("a")))
Expr_list = "use_extendr_wrapper"
#' df = pl$DataFrame(
#' a = 1:3,
#' b = 4:6
#' )
#' df$select(pl$all()$implode())
Expr_implode = "use_extendr_wrapper"

##TODO REMOVE AT A BREAKING CHANGE
Expr_list = function() {
  # Deprecated alias of $implode(). The flag stored in `runtime_state` makes
  # the deprecation warning fire only the first time it is reached.
  not_warned_yet = is.null(runtime_state$warned_deprecate_list)
  if (not_warned_yet) {
    runtime_state$warned_deprecate_list = TRUE
    warning("polars pl$list and <Expr>$list are deprecated, use $implode instead.")
  }
  self$implode()
}



Expand Down
6 changes: 5 additions & 1 deletion R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ Expr$entropy <- function(base, normalize) .Call(wrap__Expr__entropy, self, base,

Expr$cumulative_eval <- function(expr, min_periods, parallel) .Call(wrap__Expr__cumulative_eval, self, expr, min_periods, parallel)

Expr$list <- function() .Call(wrap__Expr__list, self)
Expr$implode <- function() .Call(wrap__Expr__implode, self)

Expr$shrink_dtype <- function() .Call(wrap__Expr__shrink_dtype, self)

Expand Down Expand Up @@ -641,6 +641,8 @@ Expr$map <- function(lambda, output_type, agg_list) .Call(wrap__Expr__map, self,

Expr$is_unique <- function() .Call(wrap__Expr__is_unique, self)

Expr$approx_unique <- function() .Call(wrap__Expr__approx_unique, self)

Expr$is_first <- function() .Call(wrap__Expr__is_first, self)

Expr$map_alias <- function(lambda) .Call(wrap__Expr__map_alias, self, lambda)
Expand Down Expand Up @@ -925,6 +927,8 @@ Series$rename_mut <- function(name) invisible(.Call(wrap__Series__rename_mut, se

Series$dtype <- function() .Call(wrap__Series__dtype, self)

Series$n_unique <- function() .Call(wrap__Series__n_unique, self)

Series$name <- function() .Call(wrap__Series__name, self)

Series$sort_mut <- function(reverse) .Call(wrap__Series__sort_mut, self, reverse)
Expand Down
File renamed without changes.
164 changes: 161 additions & 3 deletions R/lazy_functions.R → R/functions__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ pl$col = function(name="", ...) {

#preconvert Series into char name(s)
if(inherits(name,"Series")) name = name$to_vector()

name_add = list(...)
if (length(name_add) > 0) {
if (is_string(name) && all(sapply(name_add, is_string))) {
Expand Down Expand Up @@ -146,6 +146,27 @@ pl$count = function(column = NULL) { # -> Expr | int:
unwrap(result(pl$col(column)$count()), "in pl$count():")
}

#' Aggregate all column values into a list.
#' @name pl_implode
#' @param name Name of the column(s) that should be imploded, passed to pl$col()
#' @keywords Expr
#' @return Expr
#' @examples
#' pl$DataFrame(iris)$select(pl$implode("Species"))
pl$implode = function(name) { # -> Expr
  # Build the column expression inside result() so a failure in pl$col()
  # is carried along instead of being raised here.
  col_result = result(pl$col(name))
  # Apply the low-level implode method to the wrapped expression.
  imploded = map(col_result, .pr$Expr$implode)
  # unwrap() receives "in pl$implode():" as context for reporting a failure.
  unwrap(imploded, "in pl$implode():")
}

##TODO REMOVE AT A BREAKING CHANGE
pl$list = function(name) {
  # Deprecated alias of pl$implode(). The flag stored in `runtime_state` makes
  # the deprecation warning fire only the first time it is reached.
  not_warned_yet = is.null(runtime_state$warned_deprecate_list)
  if (not_warned_yet) {
    runtime_state$warned_deprecate_list = TRUE
    warning("polars pl$list and <Expr>$list are deprecated, use $implode instead.")
  }
  pl$implode(name)
}

#' pl$first
#' @name pl_first
Expand Down Expand Up @@ -232,6 +253,75 @@ pl$last = function(column = NULL) {#-> Expr | Any:
}


#' Get the first `n` rows.
#' @name pl_head
#' @param column if dtype is:
#' - Series: Take head value in `Series`
#' - str or int: syntactic sugar for `pl.col(..).head()`
#' @param n Number of rows to take
#' @keywords Expr_new
#' @return Expr or head value of input Series
#' @examples
#' df = pl$DataFrame(
sorhawell marked this conversation as resolved.
Show resolved Hide resolved
#' a = c(1, 8, 3),
#' b = c(4, 5, 2),
#' c = c("foo", "bar", "foo")
#' )
#'
#' expr_head = pl$head("a")
#' print(expr_head)
#' df$select(expr_head)
#'
#' df$select(pl$head("a",2))
#' pl$head(df$get_column("a"),2)
pl$head = function(column, n = 10) {#-> Expr | Any:
# Dispatch on the class of `column`. Each branch wraps its call in result();
# the final unwrap() is given "in pl$head():" as context.
# NOTE(review): pcase() presumably evaluates the condition/value pairs in order
# and falls back to `or_else` when none match - confirm against its definition.
pcase(
# Series: call head(n) through the Series' `expr` accessor
inherits(column,"Series"), result(column$expr$head(n)),
# character: treat as column name(s), i.e. pl$col(column)$head(n)
is.character(column), result(pl$col(column)$head(n)),
# Expr: call head(n) on the expression directly
inherits(column,"Expr"), result(column$head(n)),
# anything else: an Err describing the received argument
or_else = Err(paste0(
"param [column] type is neither Series, charvec nor Expr, but ",
str_string(column)
))
) |>
unwrap("in pl$head():")
}


#' Get the last `n` rows.
#' @name pl_tail
#' @param column if dtype is:
#' - Series: Take tail value in `Series`
#' - str or int: syntactic sugar for `pl.col(..).tail()`
#' @param n Number of rows to take
#' @return Expr or tail value of input Series
#' @examples
#' df = pl$DataFrame(
sorhawell marked this conversation as resolved.
Show resolved Hide resolved
#' a = c(1, 8, 3),
#' b = c(4, 5, 2),
#' c = c("foo", "bar", "foo")
#' )
#'
#' expr_tail = pl$tail("a")
#' print(expr_tail)
#' df$select(expr_tail)
#'
#' df$select(pl$tail("a",2))
#'
#' pl$tail(df$get_column("a"),2)
pl$tail = function(column, n = 10) {#-> Expr | Any:
# Dispatch on the class of `column`. Each branch wraps its call in result();
# the final unwrap() is given "in pl$tail():" as context.
# NOTE(review): pcase() presumably evaluates the condition/value pairs in order
# and falls back to `or_else` when none match - confirm against its definition.
pcase(
# Series: call tail(n) through the Series' `expr` accessor
inherits(column,"Series"), result(column$expr$tail(n)),
# character: treat as column name(s), i.e. pl$col(column)$tail(n)
is.character(column), result(pl$col(column)$tail(n)),
# Expr: call tail(n) on the expression directly
inherits(column,"Expr"), result(column$tail(n)),
# anything else: an Err describing the received argument
or_else = Err(paste0(
"param [column] type is neither Series, charvec nor Expr, but ",
str_string(column)
))
) |>
unwrap("in pl$tail():")
}

#' pl$mean
#' @name pl_mean
#' @description Depending on the input type this function does different things:
Expand Down Expand Up @@ -321,10 +411,78 @@ pl$median = function(...) { #-> Expr | Any:
unwrap("in pl$median():")
}

#' Count `n` unique values
#' @name pl_n_unique
#' @description Depending on the input type this function does different things:
#' @param column if dtype is:
#' - Series: call method n_unique() to return the number of unique values.
#' - String: syntactic sugar for `pl$col(column)$n_unique()`, returns Expr
#' - Expr: syntactic sugar for `column$n_unique()`, returns Expr
#'
#' @keywords Expr_new
#'
#' @return Expr or value
#'
#' @examples
#' #column as Series
#' pl$n_unique(pl$Series(1:4)) == 4
#'
#' #column as String
#' expr = pl$n_unique("bob")
#' print(expr)
#' pl$DataFrame(bob = 1:4)$select(expr)
#'
#' #column as Expr
#' pl$DataFrame(bob = 1:4)$select(pl$n_unique(pl$col("bob")))
pl$n_unique = function(column) { #-> int or Expr
# Dispatch on the type of `column`. Each branch wraps its call in result();
# the final unwrap() is given "in pl$n_unique():" as context.
# NOTE(review): pcase() presumably evaluates the condition/value pairs in order
# and falls back to `or_else` when none match - confirm against its definition.
pcase(
# Series or Expr: call the object's own n_unique() method
inherits(column, c("Series","Expr")), result(column$n_unique()),
# is_string (presumably a single-string check - confirm): use as column name
is_string(column), result(pl$col(column)$n_unique()),
# anything else: an Err describing the received argument
or_else = Err(paste("arg [column] is neither Series, Expr or String, but", str_string(column)))
) |>
unwrap("in pl$n_unique():")
}

#' Approximate count of unique values.
#' @name pl_approx_unique
#' @description This is done using the HyperLogLog++ algorithm for cardinality estimation.
sorhawell marked this conversation as resolved.
Show resolved Hide resolved
#' @param column if dtype is:
#' - String: syntactic sugar for `pl$col(column)$approx_unique()`, returns Expr
#' - Expr: syntactic sugar for `column$approx_unique()`, returns Expr
#'
#' @keywords Expr_new
#'
#' @return Expr
#'
#' @details The approx_unique is likely only warranted for large columns. See example.
#' It appears approx_unique scales better than n_unique, such that the relative performance
#' difference increases with column size.
#'
#' @examples
#' #column as literal Expr
#' pl$approx_unique(pl$lit(1:4)) == 4
#'
#' #column as String
#' expr = pl$approx_unique("bob")
#' print(expr)
#' pl$DataFrame(bob = 1:80)$select(expr)
#'
#' #column as Expr
#' pl$DataFrame(bob = 1:4)$select(pl$approx_unique(pl$col("bob")))
#'
#' # comparison with n_unique for 3 million integers. (try changing the example to 30 million ints)
#' lit_series = pl$lit(c(1:1E6,1E6:1,1:1E6))
#' system.time(pl$approx_unique(lit_series)$lit_to_s()$print())
#' system.time(pl$n_unique(lit_series)$lit_to_s()$print())
pl$approx_unique = function(column) { #-> Expr
# Dispatch on the type of `column`. Unlike pl$n_unique() there is no Series
# branch: both accepted inputs end in an Expr$approx_unique() call.
# NOTE(review): pcase() presumably evaluates the condition/value pairs in order
# and falls back to `or_else` when none match - confirm against its definition.
pcase(
# Expr: call approx_unique() on the expression directly
inherits(column, "Expr"), result(column$approx_unique()),
# is_string (presumably a single-string check - confirm): use as column name
is_string(column), result(pl$col(column)$approx_unique()),
# anything else: an Err describing the received argument
or_else = Err(paste("arg [column] is neither Expr or String, but", str_string(column)))
) |>
unwrap("in pl$approx_unique():")
}

#TODO contribute polars, python pl.sum(list) states uses lambda, however it is folds expressions in rust
#docs should reflect that

#' sum across expressions / literals / Series
#' @description syntactic sugar for starting a expression with sum
Expand Down
File renamed without changes.
18 changes: 14 additions & 4 deletions R/series__series.R
Original file line number Diff line number Diff line change
Expand Up @@ -333,12 +333,12 @@ Series_shape = method_as_property(function() {
#' #make nested Series_list of Series_list of Series_Int32
#' #using Expr syntax because currently more complete translated
#' series_list = pl$DataFrame(list(a=c(1:5,NA_integer_)))$select(
#' pl$col("a")$list()$list()$append(
#' pl$col("a")$implode()$implode()$append(
#' (
#' pl$col("a")$head(2)$list()$append(
#' pl$col("a")$tail(1)$list()
#' pl$col("a")$head(2)$implode()$append(
#' pl$col("a")$tail(1)$implode()
#' )
#' )$list()
#' )$implode()
#' )
#' )$get_column("a") # get series from DataFrame
#'
Expand Down Expand Up @@ -1046,3 +1046,13 @@ Series_expr = method_as_property(function() {
Series_to_lit = function() {
pl$lit(self)
}

#' Count unique values in Series
#' @description Return count of unique values in Series
#' @keywords Series
#' @return A numeric value, the number of unique values in the Series
#' @examples
#' pl$Series(1:4)$n_unique()
Series_n_unique = function() {
  # Ask the low-level Series method for the count of unique values.
  n_unique_result = .pr$Series$n_unique(self)
  # unwrap() receives "in $n_unique():" as context for reporting a failure.
  unwrap(n_unique_result, "in $n_unique():")
}
22 changes: 22 additions & 0 deletions man/Expr_approx_unique.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading