pola-rs · etiennebacher · May 12, 2023 · May 4, 2023 · May 4, 2023 · May 4, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -59,10 +59,11 @@ Collate:
     'expr__meta.R'
     'expr__string.R'
     'expr__struct.R'
-    'functions.R'
+    'functions__eager.R'
+    'functions__lazy.R'
+    'functions__whenthen.R'
     'groupby.R'
     'ipc.R'
-    'lazy_functions.R'
     'lazyframe__background.R'
     'lazyframe__groupby.R'
     'lazyframe__lazy.R'
@@ -77,7 +78,6 @@ Collate:
     'series__series.R'
     'translation.R'
     'vctrs.R'
-    'whenthen.R'
     'zzz.R'
 Config/rextendr/version: 0.2.0.9000
 VignetteBuilder: knitr
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,8 @@
 # polars (development version)
 
+## What's changed
+ - lazy functions translated: `pl$implode`, `pl$explode`, `pl$unique`, `pl$approx_unique`, `pl$head`, `pl$tail`. Deprecated `pl$list` (use `pl$implode`) (#196)
+
 # polars 0.6.0
 
 ## BREAKING CHANGES

diff --git a/R/expr__expr.R b/R/expr__expr.R
@@ -1942,7 +1942,15 @@ Expr_product = "use_extendr_wrapper"
 #' pl$DataFrame(iris)$select(pl$col("Species")$n_unique())
 Expr_n_unique = "use_extendr_wrapper"
 
-
+#' Approximate count of unique values
+#' @keywords Expr
+#' @description
+#' This is done using the HyperLogLog++ algorithm for cardinality estimation.
+#' @aliases approx_unique
+#' @return Expr
+#' @examples
+#' pl$DataFrame(iris)$select(pl$col("Species")$approx_unique())
+Expr_approx_unique = "use_extendr_wrapper"
 
 #' Count `Nulls`
 #' @keywords Expr
@@ -2232,9 +2240,8 @@ Expr_take_every = function(n) {
 #' @examples
 #' #get 3 first elements
 #' pl$DataFrame(list(x=1:11))$select(pl$col("x")$head(3))
-Expr_head = function(n=10) {
-  if(!is.numeric(n)) stopf("n must be numeric")
-  unwrap(.pr$Expr$head(self,n=n))
+Expr_head = function(n = 10) {
+  unwrap(.pr$Expr$head(self, n = n), "in $head():")
 }
 
 #' Tail
@@ -2248,9 +2255,8 @@ Expr_head = function(n=10) {
 #' @examples
 #' #get 3 last elements
 #' pl$DataFrame(list(x=1:11))$select(pl$col("x")$tail(3))
-Expr_tail = function(n=10) {
-  if(!is.numeric(n)) stopf("n must be numeric")
-  unwrap(.pr$Expr$tail(self,n=n))
+Expr_tail = function(n = 10) {
+  unwrap(.pr$Expr$tail(self, n = n), "in $tail():")
 }
 
 
@@ -3952,16 +3958,30 @@ Expr_set_sorted = function(reverse = FALSE) {
 
 
 #' Wrap column in list
-#' @description  Aggregate to list.
+#' @description  Aggregate values into a list.
 #' @keywords Expr
 #' @return Expr
 #' @aliases list
 #' @name Expr_list
-#' @details use to_struct to wrap a DataFrame
+#' @details use to_struct to wrap a DataFrame. Note that implode() is sometimes
+#' referred to as list().
 #' @format a method
 #' @examples
-#' pl$select(pl$lit(1:4)$list(), pl$lit(c("a")))
-Expr_list = "use_extendr_wrapper"
+#' df = pl$DataFrame(
+#'   a = 1:3,
+#'   b = 4:6
+#' )
+#' df$select(pl$all()$implode())
+Expr_implode = "use_extendr_wrapper"
+
+##TODO REMOVE AT A BREAKING CHANGE
+Expr_list = function() {
+  if ( is.null(runtime_state$warned_deprecate_list)) {
+    runtime_state$warned_deprecate_list = TRUE
+    warning("polars pl$list and <Expr>$list are deprecated, use $implode instead.")
+  }
+  self$implode()
+}
 
 
 

diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R
@@ -427,7 +427,7 @@ Expr$entropy <- function(base, normalize) .Call(wrap__Expr__entropy, self, base,
 
 Expr$cumulative_eval <- function(expr, min_periods, parallel) .Call(wrap__Expr__cumulative_eval, self, expr, min_periods, parallel)
 
-Expr$list <- function() .Call(wrap__Expr__list, self)
+Expr$implode <- function() .Call(wrap__Expr__implode, self)
 
 Expr$shrink_dtype <- function() .Call(wrap__Expr__shrink_dtype, self)
 
@@ -641,6 +641,8 @@ Expr$map <- function(lambda, output_type, agg_list) .Call(wrap__Expr__map, self,
 
 Expr$is_unique <- function() .Call(wrap__Expr__is_unique, self)
 
+Expr$approx_unique <- function() .Call(wrap__Expr__approx_unique, self)
+
 Expr$is_first <- function() .Call(wrap__Expr__is_first, self)
 
 Expr$map_alias <- function(lambda) .Call(wrap__Expr__map_alias, self, lambda)
@@ -925,6 +927,8 @@ Series$rename_mut <- function(name) invisible(.Call(wrap__Series__rename_mut, se
 
 Series$dtype <- function() .Call(wrap__Series__dtype, self)
 
+Series$n_unique <- function() .Call(wrap__Series__n_unique, self)
+
 Series$name <- function() .Call(wrap__Series__name, self)
 
 Series$sort_mut <- function(reverse) .Call(wrap__Series__sort_mut, self, reverse)

diff --git a/R/functions.R → R/functions__eager.R b/R/functions.R → R/functions__eager.R
diff --git a/R/lazy_functions.R → R/functions__lazy.R b/R/lazy_functions.R → R/functions__lazy.R
@@ -71,7 +71,7 @@ pl$col = function(name="", ...) {
 
   #preconvert Series into char name(s)
   if(inherits(name,"Series")) name = name$to_vector()
-  
+
   name_add = list(...)
   if (length(name_add) > 0) {
     if (is_string(name) && all(sapply(name_add, is_string))) {
@@ -146,6 +146,27 @@ pl$count = function(column = NULL)  { # -> Expr | int:
   unwrap(result(pl$col(column)$count()), "in pl$count():")
 }
 
+#' Aggregate all column values into a list.
+#' @name pl_implode
+#' @param name Name of the column(s) that should be imploded, passed to pl$col()
+#' @keywords Expr
+#' @return Expr
+#' @examples
+#' pl$DataFrame(iris)$select(pl$implode("Species"))
+pl$implode = function(name) { # -> Expr
+  result(pl$col(name)) |>
+    map(.pr$Expr$implode) |>
+    unwrap("in pl$implode():")
+}
+
+##TODO REMOVE AT A BREAKING CHANGE
+pl$list = function(name) {
+  if ( is.null(runtime_state$warned_deprecate_list)) {
+    runtime_state$warned_deprecate_list = TRUE
+    warning("polars pl$list and <Expr>$list are deprecated, use $implode instead.")
+  }
+  pl$implode(name)
+}
 
 #' pl$first
 #' @name pl_first
@@ -232,6 +253,70 @@ pl$last = function(column = NULL) {#-> Expr | Any:
 }
 
 
+#' Get the first `n` rows.
+#' @name pl_head
+#' @param column if dtype is:
+#' - Series: Take head value in `Series`
+#' - character vector or Expr: syntactic sugar for `pl$col(column)$head(n)` or `column$head(n)`
+#' @param n number of rows to take, default 10
+#' @keywords Expr_new
+#' @return Expr or head value of input Series
+#' @examples
+#' df = pl$DataFrame(
+#'   a = c(1, 8, 3),
+#'   b = c(4, 5, 2),
+#'   c = c("foo", "bar", "foo")
+#' )
+#' df$select(pl$head("a"))
+#'
+#' df$select(pl$head("a",2))
+#'
+#' pl$head(df$get_column("a"),2)
+pl$head = function(column, n = 10) {#-> Expr | Any:
+  pcase(
+    inherits(column,"Series"), result(column$expr$head(n)),
+    is.character(column), result(pl$col(column)$head(n)),
+    inherits(column,"Expr"), result(column$head(n)),
+    or_else = Err(paste0(
+      "param [column] type is neither Series, charvec nor Expr, but ",
+      str_string(column)
+    ))
+  ) |>
+    unwrap("in pl$head():")
+}
+
+
+#' Get the last `n` rows.
+#' @name pl_tail
+#' @param column if dtype is:
+#' - Series: Take tail value in `Series`
+#' - character vector or Expr: syntactic sugar for `pl$col(column)$tail(n)` or `column$tail(n)`
+#' @param n number of rows to take, default 10
+#' @return Expr or tail value of input Series
+#' @examples
+#' df = pl$DataFrame(
+#'   a = c(1, 8, 3),
+#'   b = c(4, 5, 2),
+#'   c = c("foo", "bar", "foo")
+#' )
+#' df$select(pl$tail("a"))
+#'
+#' df$select(pl$tail("a",2))
+#'
+#' pl$tail(df$get_column("a"),2)
+pl$tail = function(column, n = 10) {#-> Expr | Any:
+  pcase(
+    inherits(column,"Series"), result(column$expr$tail(n)),
+    is.character(column), result(pl$col(column)$tail(n)),
+    inherits(column,"Expr"), result(column$tail(n)),
+    or_else = Err(paste0(
+      "param [column] type is neither Series, charvec nor Expr, but ",
+      str_string(column)
+    ))
+  ) |>
+    unwrap("in pl$tail():")
+}
+
 #' pl$mean
 #' @name pl_mean
 #' @description Depending on the input type this function does different things:
@@ -321,10 +406,69 @@ pl$median = function(...) { #-> Expr | Any:
   unwrap("in pl$median():")
 }
 
+#' Count the number of unique values
+#' @name pl_n_unique
+#' @description Depending on the input type this function does different things:
+#' @param column if dtype is:
+#' - Series: call method n_unique() to return the number of unique values.
+#' - String: syntactic sugar for `pl$col(column)$n_unique()`, returns Expr
+#' - Expr: syntactic sugar for `column$n_unique()`, returns Expr
+#'
+#' @keywords Expr_new
+#'
+#' @return Expr or value
+#'
+#' @examples
+#' #column as Series
+#' pl$n_unique(pl$Series(1:4)) == 4
+#'
+#' #column as String
+#' expr = pl$n_unique("bob")
+#' print(expr)
+#' pl$DataFrame(bob = 1:4)$select(expr)
+#'
+#' #column as Expr
+#' pl$DataFrame(bob = 1:4)$select(pl$n_unique(pl$col("bob")))
+pl$n_unique = function(column) { #-> int or Expr
+  pcase(
+    inherits(column, c("Series","Expr")), result(column$n_unique()),
+    is_string(column), result(pl$col(column)$n_unique()),
+    or_else = Err(paste("arg [column] is neither Series, Expr or String, but", str_string(column)))
+  ) |>
+    unwrap("in pl$n_unique():")
+}
 
+#' Approx count unique values.
+#' @name pl_approx_unique
+#' @description This is done using the HyperLogLog++ algorithm for cardinality estimation.
+#' @param column if dtype is:
+#' - String: syntactic sugar for `pl$col(column)$approx_unique()`, returns Expr
+#' - Expr: syntactic sugar for `column$approx_unique()`, returns Expr
+#'
+#' @keywords Expr_new
+#'
+#' @return Expr
+#'
+#' @examples
+#' #column as literal Expr
+#' pl$approx_unique(pl$lit(1:4)) == 4
+#'
+#' #column as String
+#' expr = pl$approx_unique("bob")
+#' print(expr)
+#' pl$DataFrame(bob = 1:80)$select(expr)
+#'
+#' #column as Expr
+#' pl$DataFrame(bob = 1:4)$select(pl$approx_unique(pl$col("bob")))
+pl$approx_unique = function(column) { #-> int or Expr
+  pcase(
+    inherits(column, "Expr"), result(column$approx_unique()),
+    is_string(column), result(pl$col(column)$approx_unique()),
+    or_else = Err(paste("arg [column] is neither Expr or String, but", str_string(column)))
+  ) |>
+    unwrap("in pl$approx_unique():")
+}
 
-#TODO contribute polars, python pl.sum(list) states uses lambda, however it is folds expressions in rust
-#docs should reflect that
 
 #' sum across expressions / literals / Series
 #' @description  syntactic sugar for starting a expression with sum

diff --git a/R/whenthen.R → R/functions__whenthen.R b/R/whenthen.R → R/functions__whenthen.R
diff --git a/R/series__series.R b/R/series__series.R
@@ -333,12 +333,12 @@ Series_shape = method_as_property(function() {
 #' #make nested Series_list of Series_list of Series_Int32
 #' #using Expr syntax because currently more complete translated
 #' series_list = pl$DataFrame(list(a=c(1:5,NA_integer_)))$select(
-#'   pl$col("a")$list()$list()$append(
+#'   pl$col("a")$implode()$implode()$append(
 #'     (
-#'       pl$col("a")$head(2)$list()$append(
-#'         pl$col("a")$tail(1)$list()
+#'       pl$col("a")$head(2)$implode()$append(
+#'         pl$col("a")$tail(1)$implode()
 #'       )
-#'     )$list()
+#'     )$implode()
 #'   )
 #' )$get_column("a") # get series from DataFrame
 #'
@@ -1046,3 +1046,13 @@ Series_expr = method_as_property(function() {
 Series_to_lit = function() {
   pl$lit(self)
 }
+
+#' Series n_unique
+#' @description return count of unique values in Series
+#' @keywords Series
+#' @return count of unique values in the Series (a value, not an Expr)
+#' @examples
+#' pl$Series(1:4)$n_unique()
+Series_n_unique = function() {
+  unwrap(.pr$Series$n_unique(self), "in $n_unique():")
+}
diff --git a/man/Expr_approx_unique.Rd b/man/Expr_approx_unique.Rd