diff --git a/NEWS.md b/NEWS.md index 091b1ae4b..203b4041a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,21 +2,62 @@ ## Polars R Package (development version) +Updated rust-polars to 0.41.2 (#1147). + ### Breaking changes - In `$n_chunks()`, the default value of `strategy` now is `"first"` (#1137). --`$sample()` for Expr and DataFrame (#1136): +- `$sample()` for Expr and DataFrame (#1136): - the argument `frac` is renamed `fraction`; - all the arguments except `n` must be named; - for the Expr method only, the first argument is now `n` (it was already the case for the DataFrame method); - for the Expr method only, the default value for `with_replacement` is now `FALSE` (it was already the case for the DataFrame method). +- `$melt()` had several changes (#1147): + - `melt()` is renamed `$unpivot()`. + - Some arguments were renamed: `id_vars` is now `index`, `value_vars` is now + `on`. + - The order of arguments has changed: `on` is now first, then `index`. The + order of the other arguments hasn't changed. Note that `on` can be unnamed + but all the other arguments must be named. +- `pivot()` had several changes (#1147): + - The argument `columns` is renamed `on`. + - The order of arguments has changed: `on` is now first, then `index` and + `values`. The order of the other arguments hasn't changed. Note that `on` + can be unnamed but all the other arguments must be named. +- In `$write_parquet()` and `$sink_parquet()`, the default value of argument + `statistics` is now `TRUE` and can take other values than `TRUE/FALSE` (#1147). +- In `$dt$truncate()` and `$dt$round()`, the argument `offset` has been removed. + Use `$dt$offset_by()` after those functions instead (#1147). +- In `$top_k()` and `$bottom_k()` for `Expr`, the arguments `nulls_last`, + `maintain_order` and `multithreaded` have been removed. If any `null` values + are in the top/bottom `k` values, they will always be positioned last (#1147). 
+- `$replace()` has been split in two functions depending on the desired + behaviour (#1147): + - `$replace()` recodes some values in the column, leaving all other values + unchanged. Compared to the previous version, it doesn't use the arguments + `default` and `return_dtype` anymore. + - `$replace_strict()` replaces all values by different values. If a value + doesn't have a specific mapping, it is replaced by the `default` value. +- `$str$concat()` is deprecated, use `$str$join()` (with the same arguments) + instead (#1147). +- In `pl$date_range()` and `pl$date_ranges()`, the arguments `time_unit` and + `time_zone` have been removed. They were deprecated in previous versions + (#1147). +- In `$join()`, when `how = "cross"`, `on`, `left_on` and `right_on` must be + `NULL` (#1147). + ### New features - New method `$has_nulls()` (#1133). - New method `$list$explode()` (#1139). +- `$over()` gains a new argument `order_by` to specify the order of values + within each group. This is useful when the operation depends on the order of + values, such as `$shift()` (#1147). +- `$value_counts()` gains an argument `normalize` to give relative frequencies + of unique values instead of their count (#1147). 
## Polars R Package 0.17.0 diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 819a47efe..57b7c71a9 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1070,7 +1070,7 @@ DataFrame_to_list = function(unnest_structs = TRUE, ..., int64_conversion = pola DataFrame_join = function( other, on = NULL, - how = c("inner", "left", "full", "semi", "anti", "cross"), + how = "inner", ..., left_on = NULL, right_on = NULL, @@ -1490,7 +1490,7 @@ DataFrame_join_asof = function( -#' @inherit LazyFrame_melt +#' @inherit LazyFrame_unpivot #' @keywords DataFrame #' #' @return A new `DataFrame` @@ -1502,25 +1502,26 @@ DataFrame_join_asof = function( #' c = c(2, 4, 6), #' d = c(7, 8, 9) #' ) -#' df$melt(id_vars = "a", value_vars = c("b", "c", "d")) -DataFrame_melt = function( - id_vars = NULL, - value_vars = NULL, +#' df$unpivot(index = "a", on = c("b", "c", "d")) +DataFrame_unpivot = function( + on = NULL, + ..., + index = NULL, variable_name = NULL, value_name = NULL) { - .pr$DataFrame$melt( - self, id_vars %||% character(), value_vars %||% character(), + .pr$DataFrame$unpivot( + self, on %||% character(), index %||% character(), value_name, variable_name - ) |> unwrap("in $melt( ): ") + ) |> unwrap("in $unpivot( ): ") } #' Pivot data from long to wide #' @param values Column values to aggregate. Can be multiple columns if the -#' `columns` arguments contains multiple columns as well. +#' `on` arguments contains multiple columns as well. #' @param index One or multiple keys to group by. -#' @param columns Name of the column(s) whose values will be used as the header +#' @param on Name of the column(s) whose values will be used as the header #' of the output DataFrame. #' @param ... Not used. 
#' @param aggregate_function One of: @@ -1544,7 +1545,7 @@ DataFrame_melt = function( #' df #' #' df$pivot( -#' values = "baz", index = "foo", columns = "bar" +#' values = "baz", index = "foo", on = "bar" #' ) #' #' # Run an expression as aggregation function @@ -1557,15 +1558,15 @@ DataFrame_melt = function( #' #' df$pivot( #' index = "col1", -#' columns = "col2", +#' on = "col2", #' values = "col3", #' aggregate_function = pl$element()$tanh()$mean() #' ) DataFrame_pivot = function( - values, - index, - columns, + on, ..., + index, + values, aggregate_function = NULL, maintain_order = TRUE, sort_columns = FALSE, @@ -1586,7 +1587,7 @@ DataFrame_pivot = function( )) |> # run pivot when valid aggregate_expr and_then(\(aggregate_expr) .pr$DataFrame$pivot_expr( - self, index, columns, values, maintain_order, sort_columns, aggregate_expr, separator + self, on, index, values, maintain_order, sort_columns, aggregate_expr, separator )) |> # unwrap and add method context name unwrap("in $pivot():") @@ -1736,7 +1737,7 @@ DataFrame_describe = function(percentiles = c(.25, .75), interpolation = "neares )$ unnest("fields")$ drop("column")$ - pivot(index = "statistic", columns = "variable", values = "column_0")$ + pivot(index = "statistic", on = "variable", values = "column_0")$ with_columns(statistic = pl$lit(metrics)) }) |> uw() @@ -2031,9 +2032,11 @@ DataFrame_write_parquet = function( ..., compression = "zstd", compression_level = 3, - statistics = FALSE, + statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL) { + statistics = translate_statistics(statistics) |> + unwrap("in $write_parquet():") .pr$DataFrame$write_parquet( self, file, diff --git a/R/expr__datetime.R b/R/expr__datetime.R index b2e04cfad..6d429c141 100644 --- a/R/expr__datetime.R +++ b/R/expr__datetime.R @@ -2,8 +2,8 @@ #' @description Divide the date/datetime range into buckets. #' Each date/datetime is mapped to the start of its bucket. 
#' -#' @param every string encoding duration see details. -#' @param offset optional string encoding duration see details. +#' @param every Either an Expr or a string indicating a column name or a +#' duration (see Details). #' #' @details The ``every`` and ``offset`` argument are created with the #' the following string language: @@ -20,22 +20,18 @@ #' These strings can be combined: #' - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds #' @return Date/Datetime expr -#' @keywords ExprDT -#' @aliases (Expr)$dt$truncate #' @examples #' t1 = as.POSIXct("3040-01-01", tz = "GMT") #' t2 = t1 + as.difftime(25, units = "secs") -#' s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") +#' s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") #' -#' # use a dt namespace function #' df = pl$DataFrame(datetime = s)$with_columns( -#' pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), -#' pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") +#' pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s") #' ) #' df -ExprDT_truncate = function(every, offset = NULL) { - offset = parse_as_polars_duration_string(offset, default = "0ns") - .pr$Expr$dt_truncate(self, every, offset) |> +ExprDT_truncate = function(every) { + every = parse_as_polars_duration_string(every, default = "0ns") + .pr$Expr$dt_truncate(self, every) |> unwrap("in $dt$truncate()") } @@ -46,46 +42,20 @@ ExprDT_truncate = function(every, offset = NULL) { #' Each date/datetime in the second half of the interval #' is mapped to the end of its bucket. #' +#' @inherit ExprDT_truncate params details return #' -#' @param every string encoding duration see details. -#' @param offset optional string encoding duration see details. 
-#' -#' @details The ``every`` and ``offset`` arguments are created with the -#' following string language: -#' - 1ns # 1 nanosecond -#' - 1us # 1 microsecond -#' - 1ms # 1 millisecond -#' - 1s # 1 second -#' - 1m # 1 minute -#' - 1h # 1 hour -#' - 1d # 1 day -#' - 1w # 1 calendar week -#' - 1mo # 1 calendar month -#' - 1y # 1 calendar year -#' These strings can be combined: -#' - 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds -#' -#' This functionality is currently experimental and may -#' change without it being considered a breaking change. -#' -#' @return Date/Datetime expr -#' @keywords ExprDT -#' @aliases (Expr)$dt$round #' @examples #' t1 = as.POSIXct("3040-01-01", tz = "GMT") #' t2 = t1 + as.difftime(25, units = "secs") -#' s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") +#' s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") #' -#' # use a dt namespace function #' df = pl$DataFrame(datetime = s)$with_columns( -#' pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), -#' pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") +#' pl$col("datetime")$dt$round("4s")$alias("rounded_4s") #' ) #' df -ExprDT_round = function(every, offset = NULL) { +ExprDT_round = function(every) { every = parse_as_polars_duration_string(every, default = "0ns") - offset = parse_as_polars_duration_string(offset, default = "0ns") - .pr$Expr$dt_round(self, every, offset) |> + .pr$Expr$dt_round(self, every) |> unwrap("in $dt$round()") } @@ -370,7 +340,7 @@ ExprDT_ordinal_day = function() { #' @aliases (Expr)$dt$hour #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d2h", @@ -395,7 +365,7 @@ ExprDT_hour = function() { #' @aliases (Expr)$dt$minute #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' as.Date("2020-12-25"), #' as.Date("2021-1-05"), #' interval = "1d5s", @@ 
-556,7 +526,7 @@ ExprDT_epoch = function(tu = c("us", "ns", "ms", "s", "d")) { #' @aliases (Expr)$dt$timestamp #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' start = as.Date("2001-1-1"), #' end = as.Date("2001-1-3"), #' interval = "1d1s" @@ -585,7 +555,7 @@ ExprDT_timestamp = function(tu = c("ns", "us", "ms")) { #' @aliases (Expr)$dt$with_time_unit #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' start = as.Date("2001-1-1"), #' end = as.Date("2001-1-3"), #' interval = "1d1s" @@ -615,7 +585,7 @@ ExprDT_with_time_unit = function(tu = c("ns", "us", "ms")) { #' @aliases (Expr)$dt$cast_time_unit #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' start = as.Date("2001-1-1"), #' end = as.Date("2001-1-3"), #' interval = "1d1s" @@ -641,10 +611,10 @@ ExprDT_cast_time_unit = function(tu = c("ns", "us", "ms")) { #' @return Expr of i64 #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' as.POSIXct("2020-03-01", tz = "UTC"), #' as.POSIXct("2020-05-01", tz = "UTC"), -#' "1mo" +#' "1mo1s" #' ) #' ) #' @@ -681,10 +651,10 @@ ExprDT_convert_time_zone = function(time_zone) { #' @aliases (Expr)$dt$replace_time_zone #' @examples #' df1 = pl$DataFrame( -#' london_timezone = pl$date_range( +#' london_timezone = pl$datetime_range( #' as.POSIXct("2020-03-01", tz = "UTC"), #' as.POSIXct("2020-07-01", tz = "UTC"), -#' "1mo" +#' "1mo1s" #' )$dt$convert_time_zone("Europe/London") #' ) #' @@ -729,10 +699,10 @@ ExprDT_replace_time_zone = function( #' @return Expr of i64 #' @examples #' df = pl$DataFrame( -#' date = pl$date_range( +#' date = pl$datetime_range( #' start = as.Date("2020-3-1"), #' end = as.Date("2020-5-1"), -#' interval = "1mo" +#' interval = "1mo1s" #' ) #' ) #' df$select( @@ -791,7 +761,7 @@ ExprDT_total_minutes = function() { #' #' @return Expr of i64 #' @examples -#' df = pl$DataFrame(date = pl$date_range( 
+#' df = pl$DataFrame(date = pl$datetime_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), #' interval = "1m" @@ -810,7 +780,7 @@ ExprDT_total_seconds = function() { #' #' @return Expr of i64 #' @examples -#' df = pl$DataFrame(date = pl$date_range( +#' df = pl$DataFrame(date = pl$datetime_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), #' interval = "1ms" @@ -829,7 +799,7 @@ ExprDT_total_milliseconds = function() { #' #' @return Expr of i64 #' @examples -#' df = pl$DataFrame(date = pl$date_range( +#' df = pl$DataFrame(date = pl$datetime_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), #' interval = "1ms" @@ -848,7 +818,7 @@ ExprDT_total_microseconds = function() { #' #' @return Expr of i64 #' @examples -#' df = pl$DataFrame(date = pl$date_range( +#' df = pl$DataFrame(date = pl$datetime_range( #' start = as.POSIXct("2020-1-1", tz = "GMT"), #' end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), #' interval = "1ms" @@ -907,7 +877,7 @@ ExprDT_total_nanoseconds = function() { #' #' # the "by" argument also accepts expressions #' df = pl$DataFrame( -#' dates = pl$date_range( +#' dates = pl$datetime_range( #' as.POSIXct("2022-01-01", tz = "GMT"), #' as.POSIXct("2022-01-02", tz = "GMT"), #' interval = "6h", time_unit = "ms", time_zone = "GMT" @@ -932,7 +902,7 @@ ExprDT_offset_by = function(by) { #' #' #' @examples -#' df = pl$DataFrame(dates = pl$date_range( +#' df = pl$DataFrame(dates = pl$datetime_range( #' as.Date("2000-1-1"), #' as.Date("2000-1-2"), #' "1h" diff --git a/R/expr__expr.R b/R/expr__expr.R index 2389b95ba..c0d4a44d1 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -1163,11 +1163,7 @@ Expr_is_not_nan = use_extendr_wrapper #' full data. 
#' #' @return Expr -#' @aliases slice -#' @name Expr_slice -#' @format NULL #' @examples -#' #' # as head #' pl$DataFrame(list(a = 0:100))$select( #' pl$all()$slice(0, 6) @@ -1185,7 +1181,8 @@ Expr_is_not_nan = use_extendr_wrapper #' # recycling #' pl$DataFrame(mtcars)$with_columns(pl$col("mpg")$slice(0, 1)) Expr_slice = function(offset, length = NULL) { - .pr$Expr$slice(self, wrap_e(offset), wrap_e(length)) + .pr$Expr$slice(self, offset, wrap_e(length)) |> + unwrap("in $slice():") } @@ -1406,19 +1403,14 @@ Expr_sort = function(..., descending = FALSE, nulls_last = FALSE) { #' Return the `k` largest elements. This has time complexity: \eqn{ O(n + k #' \\log{}n - \frac{k}{2}) } #' -#' @param k Number of top values to get -#' @param ... Ignored. -#' @param nulls_last Place null values last. -#' @param maintain_order Whether the order should be maintained if elements are -#' equal. -#' @param multithreaded Sort using multiple threads. +#' @param k Number of top values to get. #' #' @return Expr #' @examples #' pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$select(pl$col("a")$top_k(5)) -Expr_top_k = function(k, ..., nulls_last = FALSE, maintain_order = FALSE, multithreaded = TRUE) { +Expr_top_k = function(k) { if (!is.numeric(k) || k < 0) stop("k must be numeric and positive, prefereably integerish") - .pr$Expr$top_k(self, k, nulls_last = nulls_last, maintain_order = maintain_order, multithreaded = multithreaded) |> + .pr$Expr$top_k(self, k) |> unwrap("in $top_k():") } @@ -1430,9 +1422,9 @@ Expr_top_k = function(k, ..., nulls_last = FALSE, maintain_order = FALSE, multit #' @inherit Expr_top_k params return #' @examples #' pl$DataFrame(a = c(6, 1, 0, NA, Inf, NaN))$select(pl$col("a")$bottom_k(5)) -Expr_bottom_k = function(k, ..., nulls_last = FALSE, maintain_order = FALSE, multithreaded = TRUE) { +Expr_bottom_k = function(k) { if (!is.numeric(k) || k < 0) stop("k must be numeric and positive, prefereably integerish") - .pr$Expr$bottom_k(self, k, nulls_last = nulls_last, 
maintain_order = maintain_order, multithreaded = multithreaded) |> + .pr$Expr$bottom_k(self, k) |> unwrap("in $bottom_k():") } @@ -1844,6 +1836,9 @@ Expr_last = use_extendr_wrapper #' #' @param ... Column(s) to group by. Accepts expression input. #' Characters are parsed as column names. +#' @param order_by Order the window functions/aggregations with the partitioned +#' groups by the result of the expression passed to `order_by`. Can be an Expr. +#' Strings are parsed as column names. #' @param mapping_strategy One of the following: #' * `"group_to_rows"` (default): if the aggregation results in multiple values, #' assign them back to their position in the DataFrame. This can only be done @@ -1889,7 +1884,21 @@ Expr_last = use_extendr_wrapper #' df$with_columns( #' top_2 = pl$col("c")$top_k(2)$over("a", mapping_strategy = "join") #' ) -Expr_over = function(..., mapping_strategy = "group_to_rows") { +#' +#' # order_by specifies how values are sorted within a group, which is +#' # essential when the operation depends on the order of values +#' df = pl$DataFrame( +#' g = c(1, 1, 1, 1, 2, 2, 2, 2), +#' t = c(1, 2, 3, 4, 4, 1, 2, 3), +#' x = c(10, 20, 30, 40, 10, 20, 30, 40) +#' ) +#' +#' # without order_by, the first and second values in the second group would +#' # be inverted, which would be wrong +#' df$with_columns( +#' x_lag = pl$col("x")$shift(1)$over("g", order_by = "t") +#' ) +Expr_over = function(..., order_by = NULL, mapping_strategy = "group_to_rows") { list_of_exprs = list2(...) 
|> lapply(\(x) { if (is.character(x)) { @@ -1907,7 +1916,7 @@ Expr_over = function(..., mapping_strategy = "group_to_rows") { } }) - .pr$Expr$over(self, list_of_exprs, mapping_strategy) |> + .pr$Expr$over(self, list_of_exprs, order_by, order_by_descending = FALSE, order_by_nulls_last = FALSE, mapping_strategy) |> unwrap("in $over():") } @@ -3307,16 +3316,25 @@ Expr_to_r = function(df = NULL, i = 0, ..., int64_conversion = polars_options()$ #' @param sort Ensure the output is sorted from most values to least. #' @param parallel Better to turn this off in the aggregation context, as it can #' lead to contention. -#' @param name Give the resulting count field a specific name, defaults to -#' `"count"`. -#' @format NULL -#' @examples -#' df = pl$DataFrame(iris)$select(pl$col("Species")$value_counts()) -#' df +#' @param name Give the resulting count column a specific name. The default is +#' `"count"` if `normalize = FALSE` and `"proportion"` if `normalize = TRUE`. +#' @param normalize If `TRUE`, it gives relative frequencies of the unique +#' values instead of their count. #' -#' df$unnest()$to_data_frame() -Expr_value_counts = function(..., sort = FALSE, parallel = FALSE, name = "count") { - .pr$Expr$value_counts(self, sort, parallel, name) +#' @examples +#' df = pl$DataFrame(iris) +#' df$select(pl$col("Species")$value_counts())$unnest() +#' df$select(pl$col("Species")$value_counts(normalize = TRUE))$unnest() +Expr_value_counts = function(..., sort = FALSE, parallel = FALSE, name, normalize = FALSE) { + if (missing(name)) { + if (isTRUE(normalize)) { + name = "proportion" + } else { + name = "count" + } + } + + .pr$Expr$value_counts(self, sort, parallel, name, normalize) } #' Count unique values @@ -3558,9 +3576,11 @@ Expr_rolling = function( unwrap("in $rolling():") } -#' Replace values by different values +#' Replace the given values by different values of the same data type. #' -#' This allows one to recode values in a column. 
+#' This allows one to recode values in a column, leaving all other values +#' unchanged. See [`$replace_strict()`][Expr_replace_strict] to give a default +#' value to all other values and to specify the output datatype. #' #' @param old Can be several things: #' * a vector indicating the values to recode; @@ -3570,11 +3590,6 @@ Expr_rolling = function( #' * an Expr #' @param new Either a vector of length 1, a vector of same length as `old` or #' an Expr. If missing, `old` must be a named list. -#' @param default The default replacement if the value is not in `old`. Can be -#' an Expr. If `NULL` (default), then the value doesn't change. -#' @param return_dtype The data type of the resulting expression. If set to -#' `NULL` (default), the data type is determined automatically based on the -#' other inputs. #' #' @return Expr #' @examples @@ -3587,33 +3602,88 @@ Expr_rolling = function( #' # "old" can be a named list where names are values to replace, and values are #' # the replacements #' mapping = list(`2` = 100, `3` = 200) -#' df$with_columns(replaced = pl$col("a")$replace(mapping, default = -1)) +#' df$with_columns(replaced = pl$col("a")$replace(mapping)) #' #' df = pl$DataFrame(a = c("x", "y", "z")) #' mapping = list(x = 1, y = 2, z = 3) #' df$with_columns(replaced = pl$col("a")$replace(mapping)) #' -#' # one can specify the data type to return instead of automatically inferring it -#' df$with_columns(replaced = pl$col("a")$replace(mapping, return_dtype = pl$Int8)) +#' # "old" and "new" can take Expr +#' df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) +#' df$with_columns( +#' replaced = pl$col("a")$replace( +#' old = pl$col("a")$max(), +#' new = pl$col("b")$sum() +#' ) +#' ) +Expr_replace = function(old, new) { + if (missing(new) && is.list(old)) { + new = unlist(old, use.names = FALSE) + old = names(old) + } + .pr$Expr$replace(self, old, new) |> + unwrap("in $replace():") +} + + +#' Replace all values by different values. 
+#' +#' This changes all the values in a column, either using a specific replacement +#' or a default one. See [`$replace()`][Expr_replace] to replace only a subset +#' of values. +#' +#' @inheritParams Expr_replace +#' @param default The default replacement if the value is not in `old`. Can be +#' an Expr. If `NULL` (default), then the value doesn't change. +#' @param return_dtype The data type of the resulting expression. If set to +#' `NULL` (default), the data type is determined automatically based on the +#' other inputs. +#' +#' @return Expr +#' @examples +#' df = pl$DataFrame(a = c(1, 2, 2, 3)) +#' +#' # "old" and "new" can take vectors of length 1 or of same length +#' df$with_columns(replaced = pl$col("a")$replace_strict(2, 100, default = 1)) +#' df$with_columns( +#' replaced = pl$col("a")$replace_strict(c(2, 3), c(100, 200), default = 1) +#' ) +#' +#' # "old" can be a named list where names are values to replace, and values are +#' # the replacements +#' mapping = list(`2` = 100, `3` = 200) +#' df$with_columns(replaced = pl$col("a")$replace_strict(mapping, default = -1)) +#' +#' # one can specify the data type to return instead of automatically +#' # inferring it +#' df$with_columns( +#' replaced = pl$col("a")$replace_strict(mapping, default = 1, return_dtype = pl$Int32) +#' ) #' #' # "old", "new", and "default" can take Expr #' df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) #' df$with_columns( -#' replaced = pl$col("a")$replace( +#' replaced = pl$col("a")$replace_strict( #' old = pl$col("a")$max(), #' new = pl$col("b")$sum(), #' default = pl$col("b"), #' ) #' ) -Expr_replace = function(old, new, default = NULL, return_dtype = NULL) { +Expr_replace_strict = function(old, new, default = NULL, return_dtype = NULL) { if (missing(new) && is.list(old)) { new = unlist(old, use.names = FALSE) old = names(old) } - .pr$Expr$replace(self, old, new, default, return_dtype) |> - unwrap("in $replace():") + # return_dtype = pl$foo is silently passed 
otherwise + if (!missing(return_dtype) && !is_polars_dtype(return_dtype)) { + Err_plain("`return_dtype` must be a valid dtype.") |> + unwrap("in $replace_strict():") + } + .pr$Expr$replace_strict(self, old, new, default, return_dtype) |> + unwrap("in $replace_strict():") } + #' Get the lengths of runs of identical values #' #' @return Expr diff --git a/R/expr__list.R b/R/expr__list.R index b37f081da..c02d7e438 100644 --- a/R/expr__list.R +++ b/R/expr__list.R @@ -491,7 +491,7 @@ ExprList_to_struct = function( #' #' # concat strings in each list #' df$select( -#' pl$col("b")$list$eval(pl$element()$str$concat(" "))$list$first() +#' pl$col("b")$list$eval(pl$element()$str$join(" "))$list$first() #' ) ExprList_eval = function(expr, parallel = FALSE) { .pr$Expr$list_eval(self, expr, parallel) diff --git a/R/expr__string.R b/R/expr__string.R index 2d858d9a5..1cb764373 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -236,15 +236,24 @@ ExprStr_len_chars = function() { #' # concatenate a Series of strings to a single string #' df = pl$DataFrame(foo = c(1, NA, 2)) #' -#' df$select(pl$col("foo")$str$concat("-")) +#' df$select(pl$col("foo")$str$join("-")) #' -#' df$select(pl$col("foo")$str$concat("-", ignore_nulls = FALSE)) +#' df$select(pl$col("foo")$str$join("-", ignore_nulls = FALSE)) +ExprStr_join = function( + delimiter = "", + ..., + ignore_nulls = TRUE) { + .pr$Expr$str_join(self, delimiter, ignore_nulls) |> + unwrap("in $str$join():") +} + ExprStr_concat = function( delimiter = "", ..., ignore_nulls = TRUE) { - .pr$Expr$str_concat(self, delimiter, ignore_nulls) |> - unwrap("in $concat():") + warning("$str$concat() is deprecated as of 0.18.0. 
Use $str$join() instead.") + .pr$Expr$str_join(self, delimiter, ignore_nulls) |> + unwrap("in $str$concat():") } #' Convert a string to uppercase diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index eeae33d8b..dda668b79 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -48,9 +48,9 @@ fold <- function(acc, lambda, exprs) .Call(wrap__fold, acc, lambda, exprs) reduce <- function(lambda, exprs) .Call(wrap__reduce, lambda, exprs) -date_range <- function(start, end, interval, closed, time_unit, time_zone) .Call(wrap__date_range, start, end, interval, closed, time_unit, time_zone) +date_range <- function(start, end, interval, closed) .Call(wrap__date_range, start, end, interval, closed) -date_ranges <- function(start, end, interval, closed, time_unit, time_zone) .Call(wrap__date_ranges, start, end, interval, closed, time_unit, time_zone) +date_ranges <- function(start, end, interval, closed) .Call(wrap__date_ranges, start, end, interval, closed) datetime_range <- function(start, end, interval, closed, time_unit, time_zone) .Call(wrap__datetime_range, start, end, interval, closed, time_unit, time_zone) @@ -208,9 +208,9 @@ RPolarsDataFrame$estimated_size <- function() .Call(wrap__RPolarsDataFrame__esti RPolarsDataFrame$null_count <- function() .Call(wrap__RPolarsDataFrame__null_count, self) -RPolarsDataFrame$melt <- function(id_vars, value_vars, value_name, variable_name) .Call(wrap__RPolarsDataFrame__melt, self, id_vars, value_vars, value_name, variable_name) +RPolarsDataFrame$unpivot <- function(on, index, value_name, variable_name) .Call(wrap__RPolarsDataFrame__unpivot, self, on, index, value_name, variable_name) -RPolarsDataFrame$pivot_expr <- function(index, columns, values, maintain_order, sort_columns, aggregate_expr, separator) .Call(wrap__RPolarsDataFrame__pivot_expr, self, index, columns, values, maintain_order, sort_columns, aggregate_expr, separator) +RPolarsDataFrame$pivot_expr <- function(on, index, values, maintain_order, sort_columns, 
aggregate_expr, separator) .Call(wrap__RPolarsDataFrame__pivot_expr, self, on, index, values, maintain_order, sort_columns, aggregate_expr, separator) RPolarsDataFrame$sample_n <- function(n, with_replacement, shuffle, seed) .Call(wrap__RPolarsDataFrame__sample_n, self, n, with_replacement, shuffle, seed) @@ -530,9 +530,9 @@ RPolarsExpr$sort_with <- function(descending, nulls_last) .Call(wrap__RPolarsExp RPolarsExpr$arg_sort <- function(descending, nulls_last) .Call(wrap__RPolarsExpr__arg_sort, self, descending, nulls_last) -RPolarsExpr$top_k <- function(k, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsExpr__top_k, self, k, nulls_last, maintain_order, multithreaded) +RPolarsExpr$top_k <- function(k) .Call(wrap__RPolarsExpr__top_k, self, k) -RPolarsExpr$bottom_k <- function(k, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsExpr__bottom_k, self, k, nulls_last, maintain_order, multithreaded) +RPolarsExpr$bottom_k <- function(k) .Call(wrap__RPolarsExpr__bottom_k, self, k) RPolarsExpr$arg_max <- function() .Call(wrap__RPolarsExpr__arg_max, self) @@ -702,7 +702,7 @@ RPolarsExpr$extend_constant <- function(value, n) .Call(wrap__RPolarsExpr__exten RPolarsExpr$rep <- function(n, rechunk) .Call(wrap__RPolarsExpr__rep, self, n, rechunk) -RPolarsExpr$value_counts <- function(sort, parallel, name) .Call(wrap__RPolarsExpr__value_counts, self, sort, parallel, name) +RPolarsExpr$value_counts <- function(sort, parallel, name, normalize) .Call(wrap__RPolarsExpr__value_counts, self, sort, parallel, name, normalize) RPolarsExpr$unique_counts <- function() .Call(wrap__RPolarsExpr__unique_counts, self) @@ -718,7 +718,9 @@ RPolarsExpr$peak_min <- function() .Call(wrap__RPolarsExpr__peak_min, self) RPolarsExpr$peak_max <- function() .Call(wrap__RPolarsExpr__peak_max, self) -RPolarsExpr$replace <- function(old, new, default, return_dtype) .Call(wrap__RPolarsExpr__replace, self, old, new, default, return_dtype) +RPolarsExpr$replace <- function(old, new) 
.Call(wrap__RPolarsExpr__replace, self, old, new) + +RPolarsExpr$replace_strict <- function(old, new, default, return_dtype) .Call(wrap__RPolarsExpr__replace_strict, self, old, new, default, return_dtype) RPolarsExpr$rle <- function() .Call(wrap__RPolarsExpr__rle, self) @@ -812,9 +814,9 @@ RPolarsExpr$arr_to_struct <- function(fields) .Call(wrap__RPolarsExpr__arr_to_st RPolarsExpr$arr_shift <- function(n) .Call(wrap__RPolarsExpr__arr_shift, self, n) -RPolarsExpr$dt_truncate <- function(every, offset) .Call(wrap__RPolarsExpr__dt_truncate, self, every, offset) +RPolarsExpr$dt_truncate <- function(every) .Call(wrap__RPolarsExpr__dt_truncate, self, every) -RPolarsExpr$dt_round <- function(every, offset) .Call(wrap__RPolarsExpr__dt_round, self, every, offset) +RPolarsExpr$dt_round <- function(every) .Call(wrap__RPolarsExpr__dt_round, self, every) RPolarsExpr$dt_time <- function() .Call(wrap__RPolarsExpr__dt_time, self) @@ -992,7 +994,7 @@ RPolarsExpr$qcut <- function(probs, labels, left_closed, allow_duplicates, inclu RPolarsExpr$qcut_uniform <- function(n_bins, labels, left_closed, allow_duplicates, include_breaks) .Call(wrap__RPolarsExpr__qcut_uniform, self, n_bins, labels, left_closed, allow_duplicates, include_breaks) -RPolarsExpr$over <- function(partition_by, mapping) .Call(wrap__RPolarsExpr__over, self, partition_by, mapping) +RPolarsExpr$over <- function(partition_by, order_by, order_by_descending, order_by_nulls_last, mapping) .Call(wrap__RPolarsExpr__over, self, partition_by, order_by, order_by_descending, order_by_nulls_last, mapping) RPolarsExpr$print <- function() invisible(.Call(wrap__RPolarsExpr__print, self)) @@ -1024,7 +1026,7 @@ RPolarsExpr$str_len_bytes <- function() .Call(wrap__RPolarsExpr__str_len_bytes, RPolarsExpr$str_len_chars <- function() .Call(wrap__RPolarsExpr__str_len_chars, self) -RPolarsExpr$str_concat <- function(delimiter, ignore_nulls) .Call(wrap__RPolarsExpr__str_concat, self, delimiter, ignore_nulls) +RPolarsExpr$str_join <- 
function(delimiter, ignore_nulls) .Call(wrap__RPolarsExpr__str_join, self, delimiter, ignore_nulls) RPolarsExpr$str_to_uppercase <- function() .Call(wrap__RPolarsExpr__str_to_uppercase, self) @@ -1252,7 +1254,7 @@ RPolarsLazyFrame$join <- function(other, left_on, right_on, how, validate, join_ RPolarsLazyFrame$sort_by_exprs <- function(by, dotdotdot, descending, nulls_last, maintain_order, multithreaded) .Call(wrap__RPolarsLazyFrame__sort_by_exprs, self, by, dotdotdot, descending, nulls_last, maintain_order, multithreaded) -RPolarsLazyFrame$melt <- function(id_vars, value_vars, value_name, variable_name, streamable) .Call(wrap__RPolarsLazyFrame__melt, self, id_vars, value_vars, value_name, variable_name, streamable) +RPolarsLazyFrame$unpivot <- function(on, index, value_name, variable_name, streamable) .Call(wrap__RPolarsLazyFrame__unpivot, self, on, index, value_name, variable_name, streamable) RPolarsLazyFrame$rename <- function(existing, new) .Call(wrap__RPolarsLazyFrame__rename, self, existing, new) @@ -1326,7 +1328,7 @@ RPolarsSeries$name <- function() .Call(wrap__RPolarsSeries__name, self) RPolarsSeries$sort <- function(descending, nulls_last, multithreaded) .Call(wrap__RPolarsSeries__sort, self, descending, nulls_last, multithreaded) -RPolarsSeries$value_counts <- function(sort, parallel, name) .Call(wrap__RPolarsSeries__value_counts, self, sort, parallel, name) +RPolarsSeries$value_counts <- function(sort, parallel, name, normalize) .Call(wrap__RPolarsSeries__value_counts, self, sort, parallel, name, normalize) RPolarsSeries$arg_min <- function() .Call(wrap__RPolarsSeries__arg_min, self) diff --git a/R/functions__eager.R b/R/functions__eager.R index fc264e0d5..39f4c2a21 100644 --- a/R/functions__eager.R +++ b/R/functions__eager.R @@ -181,12 +181,6 @@ pl_concat = function( #' @param ... Ignored. #' @param closed Define which sides of the range are closed (inclusive). #' One of the followings: `"both"` (default), `"left"`, `"right"`, `"none"`. 
-#' @param time_unit Time unit of the resulting the [Datetime][DataType_Datetime] -#' data type. One of `"ns"`, `"us"`, `"ms"` or `NULL`. Only takes effect if the -#' output column is of type [Datetime][DataType_Datetime] (deprecated usage). -#' @param time_zone Time zone of the resulting [Datetime][DataType_Datetime] data -#' type. Only takes effect if the output column is of type [Datetime][DataType_Datetime] -#' (deprecated usage). #' @return An [Expr][Expr_class] of data type Date or [Datetime][DataType_Datetime] #' #' @inheritSection polars_duration_string Polars duration string language @@ -211,46 +205,12 @@ pl_date_range = function( end, interval = "1d", ..., - closed = "both", - time_unit = NULL, - time_zone = NULL) { - .warn_for_deprecated_date_range_use(start, end, interval, time_unit, time_zone) - + closed = "both") { interval = parse_as_polars_duration_string(interval) - date_range(start, end, interval, closed, time_unit, time_zone) |> + date_range(start, end, interval, closed) |> unwrap("in pl$date_range():") } - -.warn_for_deprecated_date_range_use = function( - start, - end, - interval, - time_unit = NULL, - time_zone = NULL) { - if ( - inherits(start, "POSIXt") || - inherits(end, "POSIXt") || - !is.null(time_unit) || - !is.null(time_zone) || - ( - is.character(interval) && - length(interval) == 1L && - (grepl("h", interval) || grepl("m", gsub("mo", "", interval)) || grepl("s", gsub("saturating", "", interval))) - ) - ) { - warning( - "Creating Datetime ranges using `pl$date_range()` is deprecated.", - "Use `pl$datetime_range()` instead.", - call. 
= FALSE - ) - } - - invisible(NULL) -} - - - # TODO: link to the Date type docs #' Generate a list containing a date range #' @@ -289,13 +249,9 @@ pl_date_ranges = function( end, interval = "1d", ..., - closed = "both", - time_unit = NULL, - time_zone = NULL) { - .warn_for_deprecated_date_range_use(start, end, interval, time_unit, time_zone) - + closed = "both") { interval = parse_as_polars_duration_string(interval) - date_ranges(start, end, interval, closed, time_unit, time_zone) |> + date_ranges(start, end, interval, closed) |> unwrap("in pl$date_ranges():") } diff --git a/R/group_by_dynamic.R b/R/group_by_dynamic.R index 54cd85cef..3a02e957a 100644 --- a/R/group_by_dynamic.R +++ b/R/group_by_dynamic.R @@ -5,7 +5,7 @@ #' @aliases RPolarsDynamicGroupBy #' @examples #' df = pl$DataFrame( -#' time = pl$date_range( +#' time = pl$datetime_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' interval = "30m" @@ -109,7 +109,7 @@ DynamicGroupBy_agg = function(...) { #' @return [DataFrame][DataFrame_class] #' @examples #' df = pl$DataFrame( -#' time = pl$date_range( +#' time = pl$datetime_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' interval = "30m" diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 4a5cf474b..39fe36eca 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -629,8 +629,14 @@ LazyFrame_collect_in_background = function() { #' * "gzip": min-level: 0, max-level: 10. #' * "brotli": min-level: 0, max-level: 11. #' * "zstd": min-level: 1, max-level: 22. -#' @param statistics Logical. Whether compute and write column statistics. -#' This requires extra compute. +#' @param statistics Whether statistics should be written to the Parquet +#' headers. 
Possible values: +#' * `TRUE`: enable default set of statistics (default) +#' * `FALSE`: disable all statistics +#' * `"full"`: calculate and write all available statistics. +#' * A named list where all values must be `TRUE` or `FALSE`, e.g. +#' `list(min = TRUE, max = FALSE)`. Statistics available are `"min"`, `"max"`, +#' `"distinct_count"`, `"null_count"`. #' @param row_group_size `NULL` or Integer. Size of the row groups in number of #' rows. If `NULL` (default), the chunks of the DataFrame are used. Writing in #' smaller chunks may reduce memory pressure and improve writing speeds. @@ -661,7 +667,7 @@ LazyFrame_sink_parquet = function( ..., compression = "zstd", compression_level = 3, - statistics = FALSE, + statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL, maintain_order = TRUE, @@ -693,6 +699,9 @@ LazyFrame_sink_parquet = function( ) |> unwrap("in $sink_parquet()") } + statistics = translate_statistics(statistics) |> + unwrap("in $sink_parquet():") + lf |> .pr$LazyFrame$sink_parquet( path, @@ -703,7 +712,7 @@ LazyFrame_sink_parquet = function( data_pagesize_limit, maintain_order ) |> - unwrap("in $sink_parquet()") + unwrap("in $sink_parquet():") invisible(self) } @@ -1311,7 +1320,7 @@ LazyFrame_group_by = function(..., maintain_order = polars_options()$maintain_or LazyFrame_join = function( other, on = NULL, - how = c("inner", "left", "full", "semi", "anti", "cross"), + how = "inner", ..., left_on = NULL, right_on = NULL, @@ -1327,16 +1336,21 @@ LazyFrame_join = function( Err_plain("`other` must be a LazyFrame.") |> uw() } - if (!is.null(on)) { - rexprs_right = rexprs_left = as.list(on) - } else if ((!is.null(left_on) && !is.null(right_on))) { - rexprs_left = as.list(left_on) - rexprs_right = as.list(right_on) - } else if (how != "cross") { - Err_plain("must specify either `on`, or `left_on` and `right_on`.") |> uw() + if (how == "cross") { + if (!is.null(on) || !is.null(left_on) || !is.null(right_on)) { + Err_plain("cross join should not 
pass join keys.") |> uw()
+    }
+    rexprs_left = as.list(NULL)
+    rexprs_right = as.list(NULL)
   } else {
-    rexprs_left = as.list(self$columns)
-    rexprs_right = as.list(other$columns)
+    if (!is.null(on)) {
+      rexprs_right = rexprs_left = as.list(on)
+    } else if ((!is.null(left_on) && !is.null(right_on))) {
+      rexprs_left = as.list(left_on)
+      rexprs_right = as.list(right_on)
+    } else {
+      Err_plain("must specify either `on`, or `left_on` and `right_on`.") |> uw()
+    }
   }
 
   .pr$LazyFrame$join(
@@ -1489,14 +1503,14 @@ LazyFrame_join_asof = function(
 
 #' Unpivot a Frame from wide to long format
 #'
-#' @param id_vars Columns to use as identifier variables.
-#' @param value_vars Values to use as identifier variables. If `value_vars` is
-#' empty all columns that are not in `id_vars` will be used.
+#' @param on Values to use as identifier variables. If `on` is
+#' empty all columns that are not in `index` will be used.
+#' @param ... Not used.
+#' @param index Columns to use as identifier variables.
 #' @param variable_name Name to give to the new column containing the names of
 #' the melted columns. Defaults to "variable".
 #' @param value_name Name to give to the new column containing the values of
 #' the melted columns. Defaults to "value"
-#' @param ... Not used.
 #' @param streamable Allow this node to run in the streaming engine. If this
 #' runs in streaming, the output of the melt operation will not have a stable
 #' ordering.
@@ -1519,18 +1533,18 @@ LazyFrame_join_asof = function( #' b = c(1, 3, 5), #' c = c(2, 4, 6) #' ) -#' lf$melt(id_vars = "a", value_vars = c("b", "c"))$collect() -LazyFrame_melt = function( - id_vars = NULL, - value_vars = NULL, +#' lf$unpivot(index = "a", on = c("b", "c"))$collect() +LazyFrame_unpivot = function( + on = NULL, + ..., + index = NULL, variable_name = NULL, value_name = NULL, - ..., streamable = TRUE) { - .pr$LazyFrame$melt( - self, id_vars %||% character(), value_vars %||% character(), + .pr$LazyFrame$unpivot( + self, on %||% character(), index %||% character(), value_name, variable_name, streamable - ) |> unwrap("in $melt( ): ") + ) |> unwrap("in $unpivot( ): ") } #' Rename column names of a LazyFrame @@ -1990,7 +2004,7 @@ LazyFrame_rolling = function( #' - [`$rolling()`][LazyFrame_rolling] #' @examples #' lf = pl$LazyFrame( -#' time = pl$date_range( +#' time = pl$datetime_range( #' start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"), #' interval = "30m" diff --git a/R/series__series.R b/R/series__series.R index 7723919a9..20ad15e6d 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -110,7 +110,7 @@ #' #' # use Expr method in subnamespaces #' as_polars_series(list(3:1, 1:2, NULL))$list$first() -#' as_polars_series(c(1, NA, 2))$str$concat("-") +#' as_polars_series(c(1, NA, 2))$str$join("-") #' #' s = pl$date_range( #' as.Date("2024-02-18"), as.Date("2024-02-24"), @@ -371,7 +371,8 @@ Series_print = function() { #' #' as_polars_series("a")$add("-z") Series_add = function(other) { - .pr$Series$add(self, as_polars_series(other)) + .pr$Series$add(self, as_polars_series(other)) |> + unwrap("in $add():") } @@ -390,7 +391,8 @@ Series_add = function(other) { #' 1L - as_polars_series(1:3) #' as_polars_series(1:3) - 1L Series_sub = function(other) { - .pr$Series$sub(self, as_polars_series(other)) + .pr$Series$sub(self, 
as_polars_series(other)) |> + unwrap("in $sub():") } @@ -405,7 +407,8 @@ Series_sub = function(other) { #' as_polars_series(1:3)$div(as_polars_series(11:13)) #' as_polars_series(1:3)$div(1L) Series_div = function(other) { - .pr$Series$div(self, as_polars_series(other)) + .pr$Series$div(self, as_polars_series(other)) |> + unwrap("in $div():") } @@ -435,7 +438,8 @@ Series_floor_div = function(other) { #' as_polars_series(1:3)$mul(as_polars_series(11:13)) #' as_polars_series(1:3)$mul(1L) Series_mul = function(other) { - .pr$Series$mul(self, as_polars_series(other)) + .pr$Series$mul(self, as_polars_series(other)) |> + unwrap("in $mul():") } @@ -450,7 +454,8 @@ Series_mul = function(other) { #' as_polars_series(1:3)$mod(as_polars_series(11:13)) #' as_polars_series(1:3)$mod(1L) Series_mod = function(other) { - .pr$Series$rem(self, as_polars_series(other)) + .pr$Series$rem(self, as_polars_series(other)) |> + unwrap("in $mod():") } @@ -578,8 +583,9 @@ Series_to_list = \(int64_conversion = polars_options()$int64_conversion) { #' @return DataFrame #' @examples #' as_polars_series(iris$Species, name = "flower species")$value_counts() -Series_value_counts = function(..., sort = TRUE, parallel = FALSE, name = "count") { - unwrap(.pr$Series$value_counts(self, sort, parallel, name), "in $value_counts():") +Series_value_counts = function(..., sort = TRUE, parallel = FALSE, name = "count", normalize = FALSE) { + .pr$Series$value_counts(self, sort, parallel, name, normalize) |> + unwrap("in $value_counts():") } #' Apply every value with an R fun diff --git a/R/utils.R b/R/utils.R index 45c246fb3..46141eef7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -678,3 +678,55 @@ is_named = function(x) { } TRUE } + +# Used in parquet write/sink +translate_statistics = function(statistics) { + if (length(statistics) != 1 && !is.list(statistics)) { + return(Err_plain("`statistics` must be of length 1.")) + } + if (is.logical(statistics)) { + if (isTRUE(statistics)) { + statistics = list( + min = 
TRUE, + max = TRUE, + distinct_count = FALSE, + null_count = TRUE + ) + } else { + statistics = list( + min = FALSE, + max = FALSE, + distinct_count = FALSE, + null_count = FALSE + ) + } + } else if (is.character(statistics)) { + if (statistics == "full") { + statistics = list( + min = TRUE, + max = TRUE, + distinct_count = TRUE, + null_count = TRUE + ) + } else { + return(Err_plain("`statistics` must be TRUE/FALSE, 'full', or a named list.")) + } + } else if (is.list(statistics)) { + default = list( + min = TRUE, + max = TRUE, + distinct_count = FALSE, + null_count = TRUE + ) + statistics = utils::modifyList(default, statistics) + nms = names(statistics) + invalid = nms[!nms %in% c("min", "max", "distinct_count", "null_count")] + if (length(invalid) > 0) { + msg = paste0("`", invalid, "`", collapse = ", ") + return( + Err_plain("In `statistics`,", msg, "are not valid keys.") + ) + } + } + result(statistics) +} diff --git a/man/DataFrame_join.Rd b/man/DataFrame_join.Rd index d84357cba..1a8519583 100644 --- a/man/DataFrame_join.Rd +++ b/man/DataFrame_join.Rd @@ -7,7 +7,7 @@ DataFrame_join( other, on = NULL, - how = c("inner", "left", "full", "semi", "anti", "cross"), + how = "inner", ..., left_on = NULL, right_on = NULL, diff --git a/man/DataFrame_pivot.Rd b/man/DataFrame_pivot.Rd index 2c9634b75..31143411c 100644 --- a/man/DataFrame_pivot.Rd +++ b/man/DataFrame_pivot.Rd @@ -5,10 +5,10 @@ \title{Pivot data from long to wide} \usage{ DataFrame_pivot( - values, - index, - columns, + on, ..., + index, + values, aggregate_function = NULL, maintain_order = TRUE, sort_columns = FALSE, @@ -16,16 +16,16 @@ DataFrame_pivot( ) } \arguments{ -\item{values}{Column values to aggregate. 
Can be multiple columns if the -\code{columns} arguments contains multiple columns as well.} - -\item{index}{One or multiple keys to group by.} - -\item{columns}{Name of the column(s) whose values will be used as the header +\item{on}{Name of the column(s) whose values will be used as the header of the output DataFrame.} \item{...}{Not used.} +\item{index}{One or multiple keys to group by.} + +\item{values}{Column values to aggregate. Can be multiple columns if the +\code{on} arguments contains multiple columns as well.} + \item{aggregate_function}{One of: \itemize{ \item string indicating the expressions to aggregate with, such as 'first', @@ -56,7 +56,7 @@ df = pl$DataFrame( df df$pivot( - values = "baz", index = "foo", columns = "bar" + values = "baz", index = "foo", on = "bar" ) # Run an expression as aggregation function @@ -69,7 +69,7 @@ df df$pivot( index = "col1", - columns = "col2", + on = "col2", values = "col3", aggregate_function = pl$element()$tanh()$mean() ) diff --git a/man/DataFrame_melt.Rd b/man/DataFrame_unpivot.Rd similarity index 77% rename from man/DataFrame_melt.Rd rename to man/DataFrame_unpivot.Rd index bf3098504..34e0f813c 100644 --- a/man/DataFrame_melt.Rd +++ b/man/DataFrame_unpivot.Rd @@ -1,22 +1,25 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/dataframe__frame.R -\name{DataFrame_melt} -\alias{DataFrame_melt} +\name{DataFrame_unpivot} +\alias{DataFrame_unpivot} \title{Unpivot a Frame from wide to long format} \usage{ -DataFrame_melt( - id_vars = NULL, - value_vars = NULL, +DataFrame_unpivot( + on = NULL, + ..., + index = NULL, variable_name = NULL, value_name = NULL ) } \arguments{ -\item{id_vars}{Columns to use as identifier variables.} - -\item{value_vars}{Values to use as identifier variables. If \code{value_vars} is +\item{on}{Values to use as identifier variables. 
If \code{value_vars} is empty all columns that are not in \code{id_vars} will be used.} +\item{...}{Not used.} + +\item{index}{Columns to use as identifier variables.} + \item{variable_name}{Name to give to the new column containing the names of the melted columns. Defaults to "variable".} @@ -44,6 +47,6 @@ df = pl$DataFrame( c = c(2, 4, 6), d = c(7, 8, 9) ) -df$melt(id_vars = "a", value_vars = c("b", "c", "d")) +df$unpivot(index = "a", on = c("b", "c", "d")) } \keyword{DataFrame} diff --git a/man/DynamicGroupBy_class.Rd b/man/DynamicGroupBy_class.Rd index 66fbf6939..0c1f73a69 100644 --- a/man/DynamicGroupBy_class.Rd +++ b/man/DynamicGroupBy_class.Rd @@ -9,7 +9,7 @@ This class comes from \code{\link[=DataFrame_group_by_dynamic]{$group } \examples{ df = pl$DataFrame( - time = pl$date_range( + time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), interval = "30m" diff --git a/man/DynamicGroupBy_ungroup.Rd b/man/DynamicGroupBy_ungroup.Rd index 3e042c876..f7c9663ca 100644 --- a/man/DynamicGroupBy_ungroup.Rd +++ b/man/DynamicGroupBy_ungroup.Rd @@ -15,7 +15,7 @@ Revert the \verb{$group_by_dynamic()} operation. Doing } \examples{ df = pl$DataFrame( - time = pl$date_range( + time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), interval = "30m" diff --git a/man/ExprDT_cast_time_unit.Rd b/man/ExprDT_cast_time_unit.Rd index 8532d0a9e..d423401d9 100644 --- a/man/ExprDT_cast_time_unit.Rd +++ b/man/ExprDT_cast_time_unit.Rd @@ -19,7 +19,7 @@ The corresponding global timepoint will stay unchanged +/- precision. 
} \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" diff --git a/man/ExprDT_convert_time_zone.Rd b/man/ExprDT_convert_time_zone.Rd index 3914e8690..7b914b3e0 100644 --- a/man/ExprDT_convert_time_zone.Rd +++ b/man/ExprDT_convert_time_zone.Rd @@ -19,10 +19,10 @@ regardless of your system’s time zone. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( as.POSIXct("2020-03-01", tz = "UTC"), as.POSIXct("2020-05-01", tz = "UTC"), - "1mo" + "1mo1s" ) ) diff --git a/man/ExprDT_hour.Rd b/man/ExprDT_hour.Rd index f16784847..2c94e1232 100644 --- a/man/ExprDT_hour.Rd +++ b/man/ExprDT_hour.Rd @@ -17,7 +17,7 @@ Returns the hour number from 0 to 23. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d2h", diff --git a/man/ExprDT_minute.Rd b/man/ExprDT_minute.Rd index 0d6ef7da8..e5a903c5e 100644 --- a/man/ExprDT_minute.Rd +++ b/man/ExprDT_minute.Rd @@ -17,7 +17,7 @@ Returns the minute number from 0 to 59. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( as.Date("2020-12-25"), as.Date("2021-1-05"), interval = "1d5s", diff --git a/man/ExprDT_offset_by.Rd b/man/ExprDT_offset_by.Rd index a5e44a488..844f0db76 100644 --- a/man/ExprDT_offset_by.Rd +++ b/man/ExprDT_offset_by.Rd @@ -55,7 +55,7 @@ df$select( # the "by" argument also accepts expressions df = pl$DataFrame( - dates = pl$date_range( + dates = pl$datetime_range( as.POSIXct("2022-01-01", tz = "GMT"), as.POSIXct("2022-01-02", tz = "GMT"), interval = "6h", time_unit = "ms", time_zone = "GMT" diff --git a/man/ExprDT_replace_time_zone.Rd b/man/ExprDT_replace_time_zone.Rd index 449ac3fd4..6d0377c04 100644 --- a/man/ExprDT_replace_time_zone.Rd +++ b/man/ExprDT_replace_time_zone.Rd @@ -42,10 +42,10 @@ change the corresponding global timepoint. 
}
 \examples{
 df1 = pl$DataFrame(
-  london_timezone = pl$date_range(
+  london_timezone = pl$datetime_range(
     as.POSIXct("2020-03-01", tz = "UTC"),
     as.POSIXct("2020-07-01", tz = "UTC"),
-    "1mo"
+    "1mo1s"
   )$dt$convert_time_zone("Europe/London")
 )
 
diff --git a/man/ExprDT_round.Rd b/man/ExprDT_round.Rd
index 196acdc82..601e133d3 100644
--- a/man/ExprDT_round.Rd
+++ b/man/ExprDT_round.Rd
@@ -2,15 +2,13 @@
 % Please edit documentation in R/expr__datetime.R
 \name{ExprDT_round}
 \alias{ExprDT_round}
-\alias{(Expr)$dt$round}
 \title{Round datetime}
 \usage{
-ExprDT_round(every, offset = NULL)
+ExprDT_round(every)
 }
 \arguments{
-\item{every}{string encoding duration see details.}
-
-\item{offset}{optional string encoding duration see details.}
+\item{every}{Either an Expr or a string indicating a column name or a
+duration (see Details).}
 }
 \value{
 Date/Datetime expr
@@ -23,8 +21,8 @@ Each date/datetime in the second half of the interval is mapped to the end of
 its bucket.
 }
 \details{
-The \code{every} and \code{offset} arguments are created with the
-following string language:
+The \code{every} argument is created with the
+following string language:
 \itemize{
 \item 1ns # 1 nanosecond
 \item 1us # 1 microsecond
@@ -41,20 +39,14 @@ These strings can be combined:
 \item 3d12h4m25s # 3 days, 12 hours, 4 minutes, and 25 seconds
 }
 }
-
} \examples{ t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") -s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") +s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") -# use a dt namespace function df = pl$DataFrame(datetime = s)$with_columns( - pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), - pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") + pl$col("datetime")$dt$round("4s")$alias("rounded_4s") ) df } -\keyword{ExprDT} diff --git a/man/ExprDT_time.Rd b/man/ExprDT_time.Rd index 6aebb4ceb..8604e6a2a 100644 --- a/man/ExprDT_time.Rd +++ b/man/ExprDT_time.Rd @@ -13,7 +13,7 @@ A Time Expr This only works on Datetime Series, it will error on Date Series. } \examples{ -df = pl$DataFrame(dates = pl$date_range( +df = pl$DataFrame(dates = pl$datetime_range( as.Date("2000-1-1"), as.Date("2000-1-2"), "1h" diff --git a/man/ExprDT_timestamp.Rd b/man/ExprDT_timestamp.Rd index 6892e5aa7..74ddad68b 100644 --- a/man/ExprDT_timestamp.Rd +++ b/man/ExprDT_timestamp.Rd @@ -18,7 +18,7 @@ Return a timestamp in the given time unit. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" diff --git a/man/ExprDT_total_days.Rd b/man/ExprDT_total_days.Rd index b7dc388a0..f896d40ce 100644 --- a/man/ExprDT_total_days.Rd +++ b/man/ExprDT_total_days.Rd @@ -14,10 +14,10 @@ Extract the days from a Duration type. } \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( start = as.Date("2020-3-1"), end = as.Date("2020-5-1"), - interval = "1mo" + interval = "1mo1s" ) ) df$select( diff --git a/man/ExprDT_total_microseconds.Rd b/man/ExprDT_total_microseconds.Rd index 0e69df198..2338c85d6 100644 --- a/man/ExprDT_total_microseconds.Rd +++ b/man/ExprDT_total_microseconds.Rd @@ -13,7 +13,7 @@ Expr of i64 Extract the microseconds from a Duration type. 
} \examples{ -df = pl$DataFrame(date = pl$date_range( +df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" diff --git a/man/ExprDT_total_milliseconds.Rd b/man/ExprDT_total_milliseconds.Rd index 9cf2b3f54..3919edd50 100644 --- a/man/ExprDT_total_milliseconds.Rd +++ b/man/ExprDT_total_milliseconds.Rd @@ -13,7 +13,7 @@ Expr of i64 Extract the milliseconds from a Duration type. } \examples{ -df = pl$DataFrame(date = pl$date_range( +df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" diff --git a/man/ExprDT_total_nanoseconds.Rd b/man/ExprDT_total_nanoseconds.Rd index dda7bfc41..e0489ece7 100644 --- a/man/ExprDT_total_nanoseconds.Rd +++ b/man/ExprDT_total_nanoseconds.Rd @@ -13,7 +13,7 @@ Expr of i64 Extract the nanoseconds from a Duration type. } \examples{ -df = pl$DataFrame(date = pl$date_range( +df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:00:01", tz = "GMT"), interval = "1ms" diff --git a/man/ExprDT_total_seconds.Rd b/man/ExprDT_total_seconds.Rd index 9bdabdff5..31b33c68b 100644 --- a/man/ExprDT_total_seconds.Rd +++ b/man/ExprDT_total_seconds.Rd @@ -13,7 +13,7 @@ Expr of i64 Extract the seconds from a Duration type. 
} \examples{ -df = pl$DataFrame(date = pl$date_range( +df = pl$DataFrame(date = pl$datetime_range( start = as.POSIXct("2020-1-1", tz = "GMT"), end = as.POSIXct("2020-1-1 00:04:00", tz = "GMT"), interval = "1m" diff --git a/man/ExprDT_truncate.Rd b/man/ExprDT_truncate.Rd index 273c7efff..a8a0140a8 100644 --- a/man/ExprDT_truncate.Rd +++ b/man/ExprDT_truncate.Rd @@ -2,15 +2,13 @@ % Please edit documentation in R/expr__datetime.R \name{ExprDT_truncate} \alias{ExprDT_truncate} -\alias{(Expr)$dt$truncate} \title{Truncate datetime} \usage{ -ExprDT_truncate(every, offset = NULL) +ExprDT_truncate(every) } \arguments{ -\item{every}{string encoding duration see details.} - -\item{offset}{optional string encoding duration see details.} +\item{every}{Either an Expr or a string indicating a column name or a +duration (see Details).} } \value{ Date/Datetime expr @@ -42,13 +40,10 @@ These strings can be combined: \examples{ t1 = as.POSIXct("3040-01-01", tz = "GMT") t2 = t1 + as.difftime(25, units = "secs") -s = pl$date_range(t1, t2, interval = "2s", time_unit = "ms") +s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") -# use a dt namespace function df = pl$DataFrame(datetime = s)$with_columns( - pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), - pl$col("datetime")$dt$truncate("4s", offset("3s"))$alias("truncated_4s_offset_2s") + pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s") ) df } -\keyword{ExprDT} diff --git a/man/ExprDT_with_time_unit.Rd b/man/ExprDT_with_time_unit.Rd index 65d85a261..dc6c5e9e7 100644 --- a/man/ExprDT_with_time_unit.Rd +++ b/man/ExprDT_with_time_unit.Rd @@ -20,7 +20,7 @@ The corresponding global timepoint will change. 
} \examples{ df = pl$DataFrame( - date = pl$date_range( + date = pl$datetime_range( start = as.Date("2001-1-1"), end = as.Date("2001-1-3"), interval = "1d1s" diff --git a/man/ExprList_eval.Rd b/man/ExprList_eval.Rd index 1161c5f86..6f04b96d5 100644 --- a/man/ExprList_eval.Rd +++ b/man/ExprList_eval.Rd @@ -47,6 +47,6 @@ df$select( # concat strings in each list df$select( - pl$col("b")$list$eval(pl$element()$str$concat(" "))$list$first() + pl$col("b")$list$eval(pl$element()$str$join(" "))$list$first() ) } diff --git a/man/ExprStr_concat.Rd b/man/ExprStr_join.Rd similarity index 77% rename from man/ExprStr_concat.Rd rename to man/ExprStr_join.Rd index 1079dcd08..1c533d390 100644 --- a/man/ExprStr_concat.Rd +++ b/man/ExprStr_join.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/expr__string.R -\name{ExprStr_concat} -\alias{ExprStr_concat} +\name{ExprStr_join} +\alias{ExprStr_join} \title{Vertically concatenate the string values in the column to a single string value.} \usage{ -ExprStr_concat(delimiter = "", ..., ignore_nulls = TRUE) +ExprStr_join(delimiter = "", ..., ignore_nulls = TRUE) } \arguments{ \item{delimiter}{The delimiter to insert between consecutive string values.} @@ -24,7 +24,7 @@ Vertically concatenate the string values in the column to a single string value. 
# concatenate a Series of strings to a single string df = pl$DataFrame(foo = c(1, NA, 2)) -df$select(pl$col("foo")$str$concat("-")) +df$select(pl$col("foo")$str$join("-")) -df$select(pl$col("foo")$str$concat("-", ignore_nulls = FALSE)) +df$select(pl$col("foo")$str$join("-", ignore_nulls = FALSE)) } diff --git a/man/Expr_bottom_k.Rd b/man/Expr_bottom_k.Rd index 614baf8b9..16b803bf9 100644 --- a/man/Expr_bottom_k.Rd +++ b/man/Expr_bottom_k.Rd @@ -4,25 +4,10 @@ \alias{Expr_bottom_k} \title{Bottom k values} \usage{ -Expr_bottom_k( - k, - ..., - nulls_last = FALSE, - maintain_order = FALSE, - multithreaded = TRUE -) +Expr_bottom_k(k) } \arguments{ -\item{k}{Number of top values to get} - -\item{...}{Ignored.} - -\item{nulls_last}{Place null values last.} - -\item{maintain_order}{Whether the order should be maintained if elements are -equal.} - -\item{multithreaded}{Sort using multiple threads.} +\item{k}{Number of top values to get.} } \value{ Expr diff --git a/man/Expr_over.Rd b/man/Expr_over.Rd index 84bcc9256..bfa3ba9b2 100644 --- a/man/Expr_over.Rd +++ b/man/Expr_over.Rd @@ -4,12 +4,16 @@ \alias{Expr_over} \title{Compute expressions over the given groups} \usage{ -Expr_over(..., mapping_strategy = "group_to_rows") +Expr_over(..., order_by = NULL, mapping_strategy = "group_to_rows") } \arguments{ \item{...}{Column(s) to group by. Accepts expression input. Characters are parsed as column names.} +\item{order_by}{Order the window functions/aggregations with the partitioned +groups by the result of the expression passed to \code{order_by}. Can be an Expr. 
+Strings are parsed as column names.} + \item{mapping_strategy}{One of the following: \itemize{ \item \code{"group_to_rows"} (default): if the aggregation results in multiple values, @@ -66,4 +70,18 @@ df$with_columns( df$with_columns( top_2 = pl$col("c")$top_k(2)$over("a", mapping_strategy = "join") ) + +# order_by specifies how values are sorted within a group, which is +# essential when the operation depends on the order of values +df = pl$DataFrame( + g = c(1, 1, 1, 1, 2, 2, 2, 2), + t = c(1, 2, 3, 4, 4, 1, 2, 3), + x = c(10, 20, 30, 40, 10, 20, 30, 40) +) + +# without order_by, the first and second values in the second group would +# be inverted, which would be wrong +df$with_columns( + x_lag = pl$col("x")$shift(1)$over("g", order_by = "t") +) } diff --git a/man/Expr_replace.Rd b/man/Expr_replace.Rd index c4931ecad..c50e7b348 100644 --- a/man/Expr_replace.Rd +++ b/man/Expr_replace.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/expr__expr.R \name{Expr_replace} \alias{Expr_replace} -\title{Replace values by different values} +\title{Replace the given values by different values of the same data type.} \usage{ -Expr_replace(old, new, default = NULL, return_dtype = NULL) +Expr_replace(old, new) } \arguments{ \item{old}{Can be several things: @@ -18,19 +18,14 @@ if old values are numeric, the names must be wrapped in backticks; \item{new}{Either a vector of length 1, a vector of same length as \code{old} or an Expr. If missing, \code{old} must be a named list.} - -\item{default}{The default replacement if the value is not in \code{old}. Can be -an Expr. If \code{NULL} (default), then the value doesn't change.} - -\item{return_dtype}{The data type of the resulting expression. If set to -\code{NULL} (default), the data type is determined automatically based on the -other inputs.} } \value{ Expr } \description{ -This allows one to recode values in a column. +This allows one to recode values in a column, leaving all other values +unchanged. 
See \code{\link[=Expr_replace_strict]{$replace_strict()}} to give a default +value to all other values and to specify the output datatype. } \examples{ df = pl$DataFrame(a = c(1, 2, 2, 3)) @@ -42,22 +37,18 @@ df$with_columns(replaced = pl$col("a")$replace(c(2, 3), c(100, 200))) # "old" can be a named list where names are values to replace, and values are # the replacements mapping = list(`2` = 100, `3` = 200) -df$with_columns(replaced = pl$col("a")$replace(mapping, default = -1)) +df$with_columns(replaced = pl$col("a")$replace(mapping)) df = pl$DataFrame(a = c("x", "y", "z")) mapping = list(x = 1, y = 2, z = 3) df$with_columns(replaced = pl$col("a")$replace(mapping)) -# one can specify the data type to return instead of automatically inferring it -df$with_columns(replaced = pl$col("a")$replace(mapping, return_dtype = pl$Int8)) - -# "old", "new", and "default" can take Expr +# "old" and "new" can take Expr df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) df$with_columns( replaced = pl$col("a")$replace( old = pl$col("a")$max(), - new = pl$col("b")$sum(), - default = pl$col("b"), + new = pl$col("b")$sum() ) ) } diff --git a/man/Expr_replace_strict.Rd b/man/Expr_replace_strict.Rd new file mode 100644 index 000000000..4a93659fd --- /dev/null +++ b/man/Expr_replace_strict.Rd @@ -0,0 +1,66 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expr__expr.R +\name{Expr_replace_strict} +\alias{Expr_replace_strict} +\title{Replace all values by different values.} +\usage{ +Expr_replace_strict(old, new, default = NULL, return_dtype = NULL) +} +\arguments{ +\item{old}{Can be several things: +\itemize{ +\item a vector indicating the values to recode; +\item if \code{new} is missing, this can be a named list e.g \code{list(old = "new")} where +the names are the old values and the values are the replacements. 
Note that +if old values are numeric, the names must be wrapped in backticks; +\item an Expr +}} + +\item{new}{Either a vector of length 1, a vector of same length as \code{old} or +an Expr. If missing, \code{old} must be a named list.} + +\item{default}{The default replacement if the value is not in \code{old}. Can be +an Expr. If \code{NULL} (default), then the value doesn't change.} + +\item{return_dtype}{The data type of the resulting expression. If set to +\code{NULL} (default), the data type is determined automatically based on the +other inputs.} +} +\value{ +Expr +} +\description{ +This changes all the values in a column, either using a specific replacement +or a default one. See \code{\link[=Expr_replace]{$replace()}} to replace only a subset +of values. +} +\examples{ +df = pl$DataFrame(a = c(1, 2, 2, 3)) + +# "old" and "new" can take vectors of length 1 or of same length +df$with_columns(replaced = pl$col("a")$replace_strict(2, 100, default = 1)) +df$with_columns( + replaced = pl$col("a")$replace_strict(c(2, 3), c(100, 200), default = 1) +) + +# "old" can be a named list where names are values to replace, and values are +# the replacements +mapping = list(`2` = 100, `3` = 200) +df$with_columns(replaced = pl$col("a")$replace_strict(mapping, default = -1)) + +# one can specify the data type to return instead of automatically +# inferring it +df$with_columns( + replaced = pl$col("a")$replace_strict(mapping, default = 1, return_dtype = pl$Int32) +) + +# "old", "new", and "default" can take Expr +df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) +df$with_columns( + replaced = pl$col("a")$replace_strict( + old = pl$col("a")$max(), + new = pl$col("b")$sum(), + default = pl$col("b"), + ) +) +} diff --git a/man/Expr_slice.Rd b/man/Expr_slice.Rd index 96420196b..28fa0a8a1 100644 --- a/man/Expr_slice.Rd +++ b/man/Expr_slice.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/expr__expr.R \name{Expr_slice} \alias{Expr_slice} -\alias{slice} \title{Get a 
slice of an Expr} \usage{ Expr_slice(offset, length = NULL) @@ -23,7 +22,6 @@ in those columns but will not change the number of rows in the data. See examples. } \examples{ - # as head pl$DataFrame(list(a = 0:100))$select( pl$all()$slice(0, 6) diff --git a/man/Expr_top_k.Rd b/man/Expr_top_k.Rd index 7601ba956..8ab61738d 100644 --- a/man/Expr_top_k.Rd +++ b/man/Expr_top_k.Rd @@ -4,25 +4,10 @@ \alias{Expr_top_k} \title{Top k values} \usage{ -Expr_top_k( - k, - ..., - nulls_last = FALSE, - maintain_order = FALSE, - multithreaded = TRUE -) +Expr_top_k(k) } \arguments{ -\item{k}{Number of top values to get} - -\item{...}{Ignored.} - -\item{nulls_last}{Place null values last.} - -\item{maintain_order}{Whether the order should be maintained if elements are -equal.} - -\item{multithreaded}{Sort using multiple threads.} +\item{k}{Number of top values to get.} } \value{ Expr diff --git a/man/Expr_value_counts.Rd b/man/Expr_value_counts.Rd index 7bbab1c40..2e3d7f7f9 100644 --- a/man/Expr_value_counts.Rd +++ b/man/Expr_value_counts.Rd @@ -4,7 +4,7 @@ \alias{Expr_value_counts} \title{Value counts} \usage{ -Expr_value_counts(..., sort = FALSE, parallel = FALSE, name = "count") +Expr_value_counts(..., sort = FALSE, parallel = FALSE, name, normalize = FALSE) } \arguments{ \item{...}{Ignored.} @@ -14,8 +14,11 @@ Expr_value_counts(..., sort = FALSE, parallel = FALSE, name = "count") \item{parallel}{Better to turn this off in the aggregation context, as it can lead to contention.} -\item{name}{Give the resulting count field a specific name, defaults to -\code{"count"}.} +\item{name}{Give the resulting count column a specific name. The default is +\code{"count"} if \code{normalize = FALSE} and \code{"proportion"} if \code{normalize = TRUE}.} + +\item{normalize}{If \code{TRUE}, it gives relative frequencies of the unique +values instead of their count.} } \value{ Expr @@ -24,8 +27,7 @@ Expr Count all unique values and create a struct mapping value to count. 
} \examples{ -df = pl$DataFrame(iris)$select(pl$col("Species")$value_counts()) -df - -df$unnest()$to_data_frame() +df = pl$DataFrame(iris) +df$select(pl$col("Species")$value_counts())$unnest() +df$select(pl$col("Species")$value_counts(normalize = TRUE))$unnest() } diff --git a/man/IO_sink_parquet.Rd b/man/IO_sink_parquet.Rd index 7108c04a7..19e558164 100644 --- a/man/IO_sink_parquet.Rd +++ b/man/IO_sink_parquet.Rd @@ -9,7 +9,7 @@ LazyFrame_sink_parquet( ..., compression = "zstd", compression_level = 3, - statistics = FALSE, + statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL, maintain_order = TRUE, @@ -48,8 +48,16 @@ means smaller files on disk: \item "zstd": min-level: 1, max-level: 22. }} -\item{statistics}{Logical. Whether compute and write column statistics. -This requires extra compute.} +\item{statistics}{Whether statistics should be written to the Parquet +headers. Possible values: +\itemize{ +\item \code{TRUE}: enable default set of statistics (default) +\item \code{FALSE}: disable all statistics +\item \code{"full"}: calculate and write all available statistics. +\item A named list where all values must be \code{TRUE} or \code{FALSE}, e.g. +\code{list(min = TRUE, max = FALSE)}. Statistics available are \code{"min"}, \code{"max"}, +\code{"distinct_count"}, \code{"null_count"}. +}} \item{row_group_size}{\code{NULL} or Integer. Size of the row groups in number of rows. If \code{NULL} (default), the chunks of the DataFrame are used. Writing in diff --git a/man/IO_write_parquet.Rd b/man/IO_write_parquet.Rd index d29039a03..ef01ef9cf 100644 --- a/man/IO_write_parquet.Rd +++ b/man/IO_write_parquet.Rd @@ -9,7 +9,7 @@ DataFrame_write_parquet( ..., compression = "zstd", compression_level = 3, - statistics = FALSE, + statistics = TRUE, row_group_size = NULL, data_pagesize_limit = NULL ) @@ -40,8 +40,16 @@ means smaller files on disk: \item "zstd": min-level: 1, max-level: 22. }} -\item{statistics}{Logical. 
Whether compute and write column statistics. -This requires extra compute.} +\item{statistics}{Whether statistics should be written to the Parquet +headers. Possible values: +\itemize{ +\item \code{TRUE}: enable default set of statistics (default) +\item \code{FALSE}: disable all statistics +\item \code{"full"}: calculate and write all available statistics. +\item A named list where all values must be \code{TRUE} or \code{FALSE}, e.g. +\code{list(min = TRUE, max = FALSE)}. Statistics available are \code{"min"}, \code{"max"}, +\code{"distinct_count"}, \code{"null_count"}. +}} \item{row_group_size}{\code{NULL} or Integer. Size of the row groups in number of rows. If \code{NULL} (default), the chunks of the DataFrame are used. Writing in diff --git a/man/LazyFrame_group_by_dynamic.Rd b/man/LazyFrame_group_by_dynamic.Rd index f0177d440..1fe869e29 100644 --- a/man/LazyFrame_group_by_dynamic.Rd +++ b/man/LazyFrame_group_by_dynamic.Rd @@ -95,7 +95,7 @@ by: } \examples{ lf = pl$LazyFrame( - time = pl$date_range( + time = pl$datetime_range( start = strptime("2021-12-16 00:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), end = strptime("2021-12-16 03:00:00", format = "\%Y-\%m-\%d \%H:\%M:\%S", tz = "UTC"), interval = "30m" diff --git a/man/LazyFrame_join.Rd b/man/LazyFrame_join.Rd index 382cbff76..dc6b2bb57 100644 --- a/man/LazyFrame_join.Rd +++ b/man/LazyFrame_join.Rd @@ -7,7 +7,7 @@ LazyFrame_join( other, on = NULL, - how = c("inner", "left", "full", "semi", "anti", "cross"), + how = "inner", ..., left_on = NULL, right_on = NULL, diff --git a/man/LazyFrame_melt.Rd b/man/LazyFrame_unpivot.Rd similarity index 80% rename from man/LazyFrame_melt.Rd rename to man/LazyFrame_unpivot.Rd index 057b75fdf..09fdd57cd 100644 --- a/man/LazyFrame_melt.Rd +++ b/man/LazyFrame_unpivot.Rd @@ -1,32 +1,32 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/lazyframe__lazy.R -\name{LazyFrame_melt} -\alias{LazyFrame_melt} +\name{LazyFrame_unpivot} 
+\alias{LazyFrame_unpivot} \title{Unpivot a Frame from wide to long format} \usage{ -LazyFrame_melt( - id_vars = NULL, - value_vars = NULL, +LazyFrame_unpivot( + on = NULL, + ..., + index = NULL, variable_name = NULL, value_name = NULL, - ..., streamable = TRUE ) } \arguments{ -\item{id_vars}{Columns to use as identifier variables.} - -\item{value_vars}{Values to use as identifier variables. If \code{value_vars} is -empty all columns that are not in \code{id_vars} will be used.} +\item{on}{Values to use as identifier variables. If \code{on} is +empty all columns that are not in \code{index} will be used.} +\item{...}{Not used.} + +\item{index}{Columns to use as identifier variables.} + \item{variable_name}{Name to give to the new column containing the names of the melted columns. Defaults to "variable".} \item{value_name}{Name to give to the new column containing the values of the melted columns. Defaults to "value"} -\item{...}{Not used.} - \item{streamable}{Allow this node to run in the streaming engine. If this runs in streaming, the output of the melt operation will not have a stable ordering.} @@ -51,6 +51,6 @@ lf = pl$LazyFrame( b = c(1, 3, 5), c = c(2, 4, 6) ) -lf$melt(id_vars = "a", value_vars = c("b", "c"))$collect() +lf$unpivot(index = "a", on = c("b", "c"))$collect() } \keyword{LazyFrame} diff --git a/man/Series_class.Rd b/man/Series_class.Rd index 105c8e03a..8907e8cb6 100644 --- a/man/Series_class.Rd +++ b/man/Series_class.Rd @@ -172,7 +172,7 @@ s$cos() # use Expr method in subnamespaces as_polars_series(list(3:1, 1:2, NULL))$list$first() -as_polars_series(c(1, NA, 2))$str$concat("-") +as_polars_series(c(1, NA, 2))$str$join("-") s = pl$date_range( as.Date("2024-02-18"), as.Date("2024-02-24"), diff --git a/man/Series_value_counts.Rd b/man/Series_value_counts.Rd index 88725a6d8..ba711ec3f 100644 --- a/man/Series_value_counts.Rd +++ b/man/Series_value_counts.Rd @@ -4,7 +4,13 @@ \alias{Series_value_counts} \title{Count the occurrences of unique values} \usage{ -Series_value_counts(..., sort = TRUE, parallel = 
FALSE, name = "count") +Series_value_counts( + ..., + sort = TRUE, + parallel = FALSE, + name = "count", + normalize = FALSE +) } \arguments{ \item{...}{Ignored.} @@ -14,8 +20,11 @@ Series_value_counts(..., sort = TRUE, parallel = FALSE, name = "count") \item{parallel}{Better to turn this off in the aggregation context, as it can lead to contention.} -\item{name}{Give the resulting count field a specific name, defaults to -\code{"count"}.} +\item{name}{Give the resulting count column a specific name. The default is +\code{"count"} if \code{normalize = FALSE} and \code{"proportion"} if \code{normalize = TRUE}.} + +\item{normalize}{If \code{TRUE}, it gives relative frequencies of the unique +values instead of their count.} } \value{ DataFrame diff --git a/man/pl_date_range.Rd b/man/pl_date_range.Rd index 5fda132b7..08d6377c5 100644 --- a/man/pl_date_range.Rd +++ b/man/pl_date_range.Rd @@ -4,15 +4,7 @@ \alias{pl_date_range} \title{Generate a date range} \usage{ -pl_date_range( - start, - end, - interval = "1d", - ..., - closed = "both", - time_unit = NULL, - time_zone = NULL -) +pl_date_range(start, end, interval = "1d", ..., closed = "both") } \arguments{ \item{start}{Lower bound of the date range. Something that can be coerced to @@ -29,14 +21,6 @@ See the \verb{Polars duration string language} section for details.} \item{closed}{Define which sides of the range are closed (inclusive). One of the followings: \code{"both"} (default), \code{"left"}, \code{"right"}, \code{"none"}.} - -\item{time_unit}{Time unit of the resulting the \link[=DataType_Datetime]{Datetime} -data type. One of \code{"ns"}, \code{"us"}, \code{"ms"} or \code{NULL}. Only takes effect if the -output column is of type \link[=DataType_Datetime]{Datetime} (deprecated usage).} - -\item{time_zone}{Time zone of the resulting \link[=DataType_Datetime]{Datetime} data -type. 
Only takes effect if the output column is of type \link[=DataType_Datetime]{Datetime} -(deprecated usage).} } \value{ An \link[=Expr_class]{Expr} of data type Date or \link[=DataType_Datetime]{Datetime} diff --git a/man/pl_date_ranges.Rd b/man/pl_date_ranges.Rd index 8c31e3365..4c336fc3e 100644 --- a/man/pl_date_ranges.Rd +++ b/man/pl_date_ranges.Rd @@ -4,15 +4,7 @@ \alias{pl_date_ranges} \title{Generate a list containing a date range} \usage{ -pl_date_ranges( - start, - end, - interval = "1d", - ..., - closed = "both", - time_unit = NULL, - time_zone = NULL -) +pl_date_ranges(start, end, interval = "1d", ..., closed = "both") } \arguments{ \item{start}{Lower bound of the date range. Something that can be coerced to @@ -29,14 +21,6 @@ See the \verb{Polars duration string language} section for details.} \item{closed}{Define which sides of the range are closed (inclusive). One of the followings: \code{"both"} (default), \code{"left"}, \code{"right"}, \code{"none"}.} - -\item{time_unit}{Time unit of the resulting the \link[=DataType_Datetime]{Datetime} -data type. One of \code{"ns"}, \code{"us"}, \code{"ms"} or \code{NULL}. Only takes effect if the -output column is of type \link[=DataType_Datetime]{Datetime} (deprecated usage).} - -\item{time_zone}{Time zone of the resulting \link[=DataType_Datetime]{Datetime} data -type. 
Only takes effect if the output column is of type \link[=DataType_Datetime]{Datetime} -(deprecated usage).} } \value{ An \link[=Expr_class]{Expr} of data type List(Date) or diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index fdcfbc15e..5d8db30e8 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -96,6 +96,18 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" +[[package]] +name = "arrayref" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" + +[[package]] +name = "arrayvec" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" + [[package]] name = "async-stream" version = "0.3.5" @@ -144,6 +156,12 @@ version = "0.15.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ae037714f313c1353189ead58ef9eec30a8e8dc101b2622d461418fd59e28a9" +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.1.0" @@ -215,6 +233,19 @@ dependencies = [ "serde", ] +[[package]] +name = "blake3" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30cca6d3674597c30ddf2c587bf8d9d65c9a84d2326d941cc79c9842dfe0ef52" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -348,6 +379,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "constant_time_eq" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" + [[package]] name = "core-foundation" version = "0.9.4" @@ -630,6 +667,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs4" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" +dependencies = [ + "rustix", + "windows-sys 0.52.0", +] + [[package]] name = "futures" version = "0.3.30" @@ -778,7 +825,26 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.12", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "h2" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http 1.1.0", "indexmap", "slab", "tokio", @@ -855,6 +921,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http-body" version = "0.4.6" @@ -862,7 +939,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", - "http", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" +dependencies = [ + "bytes", + "http 1.1.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", "pin-project-lite", ] @@ -894,9 +994,9 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", - "http", - "http-body", + "h2 0.3.26", + "http 0.2.12", + "http-body 0.4.6", "httparse", "httpdate", "itoa", @@ -908,18 +1008,62 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe575dd17d0862a9a33781c8c4696a55c320909004a67a00fb286ba8b1bc496d" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2 0.4.5", + "http 1.1.0", + "http-body 1.0.0", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + [[package]] name = "hyper-rustls" -version = "0.24.2" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" dependencies = [ "futures-util", - "http", - "hyper", + "http 1.1.0", + "hyper 1.3.1", + "hyper-util", "rustls", + "rustls-native-certs", + "rustls-pki-types", "tokio", "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b875924a60b96e5d7b9ae7b066540b1dd1cbd90d1828f54c92e02a283351c56" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "hyper 1.3.1", + "pin-project-lite", + "socket2", + "tokio", + "tower", + "tower-service", + "tracing", ] [[package]] @@ -1444,26 +1588,26 @@ dependencies = [ [[package]] name = "object_store" -version = "0.9.1" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b8718f8b65fdf67a45108d1548347d4af7d71fb81ce727bbf9e3b2535e079db3" +checksum = "fbebfd32c213ba1907fa7a9c9138015a8de2b43e30c5aa45b18f7deb46786ad6" dependencies = [ "async-trait", - "base64 0.21.7", + "base64 0.22.0", "bytes", "chrono", "futures", "humantime", - "hyper", + "hyper 1.3.1", "itertools", "md-5", "parking_lot", "percent-encoding", "quick-xml", "rand", - "reqwest", + "reqwest 0.12.5", "ring", - "rustls-pemfile 2.1.2", + "rustls-pemfile", "serde", "serde_json", "snafu", @@ -1583,6 +1727,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.50", +] + [[package]] name = "pin-project-lite" version = "0.2.13" @@ -1612,8 +1776,8 @@ dependencies = [ [[package]] name = "polars" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "getrandom", "polars-arrow", @@ -1632,8 +1796,8 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "atoi", @@ -1679,8 +1843,8 @@ 
dependencies = [ [[package]] name = "polars-compute" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "bytemuck", "either", @@ -1694,8 +1858,8 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1728,8 +1892,8 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "avro-schema", "object_store", @@ -1741,8 +1905,8 @@ dependencies = [ [[package]] name = "polars-expr" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bitflags 2.4.2", @@ -1760,17 +1924,19 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = 
"git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "async-trait", "atoi_simd", + "blake3", "bytes", "chrono", "chrono-tz", "fast-float", "flate2", + "fs4", "futures", "home", "itoa", @@ -1789,7 +1955,7 @@ dependencies = [ "polars-utils", "rayon", "regex", - "reqwest", + "reqwest 0.11.27", "ryu", "serde", "serde_json", @@ -1804,11 +1970,12 @@ dependencies = [ [[package]] name = "polars-json" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "chrono", + "chrono-tz", "fallible-streaming-iterator", "hashbrown 0.14.3", "indexmap", @@ -1824,19 +1991,21 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bitflags 2.4.2", "futures", "glob", + "memchr", "once_cell", "polars-arrow", "polars-core", "polars-expr", "polars-io", "polars-json", + "polars-mem-engine", "polars-ops", "polars-pipe", "polars-plan", @@ -1848,10 +2017,30 @@ dependencies = [ "version_check", ] +[[package]] +name = "polars-mem-engine" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" +dependencies = [ + "futures", + "polars-arrow", + "polars-core", + "polars-error", + "polars-expr", + "polars-io", + "polars-json", + "polars-ops", + "polars-plan", + "polars-time", 
+ "polars-utils", + "rayon", + "tokio", +] + [[package]] name = "polars-ops" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "aho-corasick", @@ -1886,8 +2075,8 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "async-stream", @@ -1900,9 +2089,10 @@ dependencies = [ "num-traits", "parquet-format-safe", "polars-arrow", + "polars-compute", "polars-error", "polars-utils", - "seq-macro", + "serde", "simdutf8", "snap", "streaming-decompression", @@ -1911,8 +2101,8 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -1938,12 +2128,11 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bytemuck", - "chrono", "chrono-tz", 
"either", "futures", @@ -1969,8 +2158,8 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "bytemuck", "polars-arrow", @@ -1980,8 +2169,8 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "hex", "once_cell", @@ -1991,6 +2180,7 @@ dependencies = [ "polars-lazy", "polars-ops", "polars-plan", + "polars-time", "rand", "serde", "serde_json", @@ -1999,8 +2189,8 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "atoi", "bytemuck", @@ -2020,8 +2210,8 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.40.0" -source = "git+https://github.com/pola-rs/polars.git?rev=318ec405632410a41f634de7aeff46e89a25eab9#318ec405632410a41f634de7aeff46e89a25eab9" +version = "0.41.2" +source = "git+https://github.com/pola-rs/polars.git?rev=f73937ab5213a44eaaba8cfc799d8f837600f179#f73937ab5213a44eaaba8cfc799d8f837600f179" dependencies = [ "ahash", "bytemuck", @@ -2072,6 +2262,53 @@ dependencies = [ "serde", ] +[[package]] +name = "quinn" +version = "0.11.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4ceeeeabace7857413798eb1ffa1e9c905a9946a57d81fb69b4b71c4d8eb3ad" +dependencies = [ + "bytes", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "thiserror", + "tokio", + "tracing", +] + +[[package]] +name = "quinn-proto" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe" +dependencies = [ + "bytes", + "rand", + "ring", + "rustc-hash", + "rustls", + "slab", + "thiserror", + "tinyvec", + "tracing", +] + +[[package]] +name = "quinn-udp" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9096629c45860fc7fb143e125eb826b5e721e10be3263160c7d60ca832cf8c46" +dependencies = [ + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.52.0", +] + [[package]] name = "quote" version = "1.0.35" @@ -2083,7 +2320,7 @@ dependencies = [ [[package]] name = "r-polars" -version = "0.40.1" +version = "0.41.0" dependencies = [ "either", "extendr-api", @@ -2283,11 +2520,48 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", - "http", - "http-body", - "hyper", + "h2 0.3.26", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.28", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper 0.1.2", + "system-configuration", + "tokio", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "winreg 0.50.0", +] + +[[package]] +name = "reqwest" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +dependencies = [ + "base64 0.22.0", + "bytes", + "futures-core", + "futures-util", + "h2 0.4.5", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 
1.3.1", "hyper-rustls", + "hyper-util", "ipnet", "js-sys", "log", @@ -2295,14 +2569,15 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", + "quinn", "rustls", "rustls-native-certs", - "rustls-pemfile 1.0.4", + "rustls-pemfile", + "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", - "sync_wrapper", - "system-configuration", + "sync_wrapper 1.0.1", "tokio", "tokio-rustls", "tokio-util", @@ -2312,7 +2587,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "winreg", + "winreg 0.52.0", ] [[package]] @@ -2342,6 +2617,12 @@ version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.38.31" @@ -2357,37 +2638,31 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.11" +version = "0.23.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" +checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" dependencies = [ - "log", + "once_cell", "ring", + "rustls-pki-types", "rustls-webpki", - "sct", + "subtle", + "zeroize", ] [[package]] name = "rustls-native-certs" -version = "0.6.3" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +checksum = "8f1fb85efa936c42c6d5fc28d2629bb51e4b2f4b8a5211e297d599cc5a093792" dependencies = [ "openssl-probe", - "rustls-pemfile 1.0.4", + "rustls-pemfile", + "rustls-pki-types", "schannel", "security-framework", ] -[[package]] -name = "rustls-pemfile" -version = "1.0.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" -dependencies = [ - "base64 0.21.7", -] - [[package]] name = "rustls-pemfile" version = "2.1.2" @@ -2406,11 +2681,12 @@ checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" [[package]] name = "rustls-webpki" -version = "0.101.7" +version = "0.102.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +checksum = "f3bce581c0dd41bce533ce695a1437fa16a7ab5ac3ccfa99fe1a620a7885eabf" dependencies = [ "ring", + "rustls-pki-types", "untrusted", ] @@ -2456,16 +2732,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "sct" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "security-framework" version = "2.10.0" @@ -2489,12 +2755,6 @@ dependencies = [ "libc", ] -[[package]] -name = "seq-macro" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" - [[package]] name = "serde" version = "1.0.203" @@ -2654,9 +2914,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.39.0" +version = "0.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "743b4dc2cbde11890ccb254a8fc9d537fa41b36da00de2a1c5e9848c9bc42bd7" +checksum = "295e9930cd7a97e58ca2a070541a3ca502b17f5d1fa7157376d0fabd85324f25" dependencies = [ "log", ] @@ -2742,6 +3002,12 @@ dependencies = [ "syn 2.0.50", ] +[[package]] +name = "subtle" +version = "2.6.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d0208408ba0c3df17ed26eb06992cb1a1268d41b2c0e12e65203fbe3972cee5" + [[package]] name = "syn" version = "1.0.109" @@ -2770,6 +3036,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" +[[package]] +name = "sync_wrapper" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + [[package]] name = "sysinfo" version = "0.30.5" @@ -2898,11 +3170,12 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.24.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ "rustls", + "rustls-pki-types", "tokio", ] @@ -2920,6 +3193,27 @@ dependencies = [ "tracing", ] +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + [[package]] name = "tower-service" version = "0.3.2" @@ -3406,6 +3700,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = 
"xxhash-rust" version = "0.8.10" @@ -3432,6 +3736,12 @@ dependencies = [ "syn 2.0.50", ] +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + [[package]] name = "zstd" version = "0.13.0" diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index a042f4fab..9bfabf136 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "r-polars" -version = "0.40.1" +version = "0.41.0" edition = "2021" -rust-version = "1.77.0" +rust-version = "1.79.0" publish = false [lib] @@ -50,8 +50,8 @@ serde_json = "*" smartstring = "1.0.1" state = "0.6.0" thiserror = "1.0.61" -polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "318ec405632410a41f634de7aeff46e89a25eab9", default-features = false } -polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "318ec405632410a41f634de7aeff46e89a25eab9", default-features = false } +polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "f73937ab5213a44eaaba8cfc799d8f837600f179", default-features = false } +polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "f73937ab5213a44eaaba8cfc799d8f837600f179", default-features = false } either = "1" [dependencies.polars] @@ -76,7 +76,6 @@ features = [ "cumulative_eval", "cutqcut", "dataframe_arithmetic", - "date_offset", "decompress-fast", "diagonal_concat", "diff", @@ -116,8 +115,11 @@ features = [ "meta", "mode", "moment", + "month_start", + "month_end", "ndarray", "object", + "offset_by", "parquet", "partition_by", "pct_change", @@ -157,4 +159,4 @@ features = [ "zip_with", ] git = "https://github.com/pola-rs/polars.git" -rev = "318ec405632410a41f634de7aeff46e89a25eab9" +rev = "f73937ab5213a44eaaba8cfc799d8f837600f179" diff --git a/src/rust/src/conversion_s_to_r.rs b/src/rust/src/conversion_s_to_r.rs index 09c936ea8..8199148e1 100644 --- 
a/src/rust/src/conversion_s_to_r.rs +++ b/src/rust/src/conversion_s_to_r.rs @@ -120,22 +120,17 @@ pub fn pl_series_to_list( let mut v: Vec = Vec::with_capacity(s.len()); let ca = s.list().unwrap(); - // Safty:amortized_iter() The returned should never be cloned or taken longer than a single iteration, - // as every call on next of the iterator will change the contents of that Series. - unsafe { - for opt_s in ca.amortized_iter() { - match opt_s { - Some(s) => { - let s_ref = s.as_ref(); - // is safe because s is read to generate new Robj, then discarded. - let inner_val = - to_list_recursive(s_ref, tag_structs, int64_conversion)?; - v.push(inner_val); - } + for opt_s in ca.amortized_iter() { + match opt_s { + Some(s) => { + let s_ref = s.as_ref(); + let inner_val = + to_list_recursive(s_ref, tag_structs, int64_conversion)?; + v.push(inner_val); + } - None => { - v.push(r!(extendr_api::NULL)); - } + None => { + v.push(r!(extendr_api::NULL)); } } } diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 53a6d901b..41f21363e 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -13,7 +13,7 @@ use crate::utils::{r_result_list, try_f64_into_usize}; use extendr_api::prelude::*; use pl::{AsOfOptions, Duration, RollingGroupOptions}; use polars::chunked_array::ops::SortMultipleOptions; -use polars::frame::explode::MeltArgs; +use polars::frame::explode::UnpivotArgs; use polars::prelude as pl; use polars::prelude::{JoinCoalesce, SerializeOptions}; @@ -108,14 +108,14 @@ impl RPolarsLazyFrame { ) -> RResult<()> { let pqwo = polars::prelude::ParquetWriteOptions { compression: new_parquet_compression(compression_method, compression_level)?, - statistics: robj_to!(bool, statistics)?, + statistics: robj_to!(StatisticsOptions, statistics)?, row_group_size: robj_to!(Option, usize, row_group_size)?, data_pagesize_limit: robj_to!(Option, usize, data_pagesize_limit)?, maintain_order: robj_to!(bool, maintain_order)?, }; self.0 
.clone() - .sink_parquet(robj_to!(String, path)?.into(), pqwo) + .sink_parquet(robj_to!(String, path)?, pqwo) .map_err(polars_to_rpolars_err) } @@ -126,7 +126,7 @@ impl RPolarsLazyFrame { }; self.0 .clone() - .sink_ipc(robj_to!(String, path)?.into(), ipcwo) + .sink_ipc(robj_to!(String, path)?, ipcwo) .map_err(polars_to_rpolars_err) } @@ -167,6 +167,7 @@ impl RPolarsLazyFrame { date_format, time_format, datetime_format, + float_scientific: None, float_precision, separator, quote_char: quote, @@ -185,7 +186,7 @@ impl RPolarsLazyFrame { self.0 .clone() - .sink_csv(robj_to!(String, path)?.into(), options) + .sink_csv(robj_to!(String, path)?, options) .map_err(polars_to_rpolars_err) } @@ -194,7 +195,7 @@ impl RPolarsLazyFrame { let options = pl::JsonWriterOptions { maintain_order }; self.0 .clone() - .sink_json(robj_to!(String, path)?.into(), options) + .sink_json(robj_to!(String, path)?, options) .map_err(polars_to_rpolars_err) } @@ -499,22 +500,22 @@ impl RPolarsLazyFrame { .into()) } - fn melt( + fn unpivot( &self, - id_vars: Robj, - value_vars: Robj, + on: Robj, + index: Robj, value_name: Robj, variable_name: Robj, streamable: Robj, ) -> RResult { - let args = MeltArgs { - id_vars: strings_to_smartstrings(robj_to!(Vec, String, id_vars)?), - value_vars: strings_to_smartstrings(robj_to!(Vec, String, value_vars)?), + let args = UnpivotArgs { + on: strings_to_smartstrings(robj_to!(Vec, String, on)?), + index: strings_to_smartstrings(robj_to!(Vec, String, index)?), value_name: robj_to!(Option, String, value_name)?.map(|s| s.into()), variable_name: robj_to!(Option, String, variable_name)?.map(|s| s.into()), streamable: robj_to!(bool, streamable)?, }; - Ok(self.0.clone().melt(args).into()) + Ok(self.0.clone().unpivot(args).into()) } fn rename(&self, existing: Robj, new: Robj) -> RResult { @@ -590,6 +591,7 @@ impl RPolarsLazyFrame { fast_projection: _, row_estimate: _, eager, + new_streaming: _, } = self.0.get_current_optimizations(); list!( type_coercion = type_coercion, 
diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 129f1398e..88d203d2e 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -261,50 +261,12 @@ impl RPolarsExpr { .into() } - pub fn top_k( - &self, - k: Robj, - nulls_last: Robj, - maintain_order: Robj, - multithreaded: Robj, - ) -> RResult { - let nulls_last = robj_to!(bool, nulls_last)?; - let multithreaded = robj_to!(bool, multithreaded)?; - let maintain_order = robj_to!(bool, maintain_order)?; - Ok(self - .0 - .clone() - .top_k( - robj_to!(PLExpr, k)?, - SortOptions::default() - .with_nulls_last(nulls_last) - .with_maintain_order(maintain_order) - .with_multithreaded(multithreaded), - ) - .into()) + pub fn top_k(&self, k: Robj) -> RResult { + Ok(self.0.clone().top_k(robj_to!(PLExpr, k)?).into()) } - pub fn bottom_k( - &self, - k: Robj, - nulls_last: Robj, - maintain_order: Robj, - multithreaded: Robj, - ) -> RResult { - let nulls_last = robj_to!(bool, nulls_last)?; - let multithreaded = robj_to!(bool, multithreaded)?; - let maintain_order = robj_to!(bool, maintain_order)?; - Ok(self - .0 - .clone() - .bottom_k( - robj_to!(PLExpr, k)?, - SortOptions::default() - .with_nulls_last(nulls_last) - .with_maintain_order(maintain_order) - .with_multithreaded(multithreaded), - ) - .into()) + pub fn bottom_k(&self, k: Robj) -> RResult { + Ok(self.0.clone().bottom_k(robj_to!(PLExpr, k)?).into()) } pub fn arg_max(&self) -> Self { @@ -1081,8 +1043,11 @@ impl RPolarsExpr { } } - pub fn value_counts(&self, sort: bool, parallel: bool, name: String) -> Self { - self.0.clone().value_counts(sort, parallel, name).into() + pub fn value_counts(&self, sort: bool, parallel: bool, name: String, normalize: bool) -> Self { + self.0 + .clone() + .value_counts(sort, parallel, name, normalize) + .into() } pub fn unique_counts(&self) -> Self { @@ -1122,7 +1087,15 @@ impl RPolarsExpr { self.0.clone().peak_max().into() } - pub fn replace( + pub fn replace(&self, old: Robj, new: Robj) -> RResult { + Ok(self 
+ .0 + .clone() + .replace(robj_to!(PLExpr, old)?, robj_to!(PLExpr, new)?) + .into()) + } + + pub fn replace_strict( &self, old: Robj, new: Robj, @@ -1132,7 +1105,7 @@ impl RPolarsExpr { Ok(self .0 .clone() - .replace( + .replace_strict( robj_to!(PLExpr, old)?, robj_to!(PLExpr, new)?, robj_to!(Option, PLExpr, default)?, @@ -1460,22 +1433,17 @@ impl RPolarsExpr { // datetime methods - pub fn dt_truncate(&self, every: Robj, offset: String) -> RResult { + pub fn dt_truncate(&self, every: Robj) -> RResult { Ok(self .0 .clone() .dt() - .truncate(robj_to!(PLExpr, every)?, offset) + .truncate(robj_to!(PLExpr, every)?) .into()) } - pub fn dt_round(&self, every: Robj, offset: &str) -> RResult { - Ok(self - .0 - .clone() - .dt() - .round(robj_to!(PLExpr, every)?, offset) - .into()) + pub fn dt_round(&self, every: Robj) -> RResult { + Ok(self.0.clone().dt().round(robj_to!(PLExpr, every)?).into()) } pub fn dt_time(&self) -> RResult { @@ -1822,12 +1790,17 @@ impl RPolarsExpr { self.0.clone().len().into() } - pub fn slice(&self, offset: &RPolarsExpr, length: Nullable<&RPolarsExpr>) -> Self { + pub fn slice(&self, offset: Robj, length: Nullable<&RPolarsExpr>) -> RResult { + let offset = robj_to!(PLExpr, offset)?; let length = match null_to_opt(length) { - Some(i) => i.0.clone(), + Some(i) => dsl::cast(i.0.clone(), pl::DataType::Int64), None => dsl::lit(i64::MAX), }; - self.0.clone().slice(offset.0.clone(), length).into() + Ok(self + .0 + .clone() + .slice(dsl::cast(offset, pl::DataType::Int64), length) + .into()) } pub fn append(&self, other: &RPolarsExpr, upcast: bool) -> Self { @@ -1935,14 +1908,33 @@ impl RPolarsExpr { .into()) } - pub fn over(&self, partition_by: Robj, mapping: Robj) -> RResult { + pub fn over( + &self, + partition_by: Robj, + order_by: Robj, + order_by_descending: bool, + order_by_nulls_last: bool, + mapping: Robj, + ) -> RResult { + let partition_by = robj_to!(Vec, PLExpr, partition_by)?; + + let order_by = robj_to!(Option, Vec, PLExprCol, 
order_by)?.map(|order_by| { + ( + order_by, + SortOptions { + descending: order_by_descending, + nulls_last: order_by_nulls_last, + maintain_order: false, + ..Default::default() + }, + ) + }); + + let mapping = robj_to!(WindowMapping, mapping)?; Ok(self .0 .clone() - .over_with_options( - robj_to!(Vec, PLExpr, partition_by)?, - robj_to!(WindowMapping, mapping)?, - ) + .over_with_options(partition_by, order_by, mapping) .into()) } @@ -1964,8 +1956,8 @@ impl RPolarsExpr { // set expected type of output from R function let ot = robj_to!(Option, PLPolarsDataType, output_type)?; let output_map = pl::GetOutput::map_field(move |fld| match ot { - Some(ref dt) => pl::Field::new(fld.name(), dt.clone()), - None => fld.clone(), + Some(ref dt) => Ok(pl::Field::new(fld.name(), dt.clone())), + None => Ok(fld.clone()), }); robj_to!(bool, agg_list) @@ -1998,8 +1990,8 @@ impl RPolarsExpr { let ot = robj_to!(Option, PLPolarsDataType, output_type)?; let output_map = pl::GetOutput::map_field(move |fld| match ot { - Some(ref dt) => pl::Field::new(fld.name(), dt.clone()), - None => fld.clone(), + Some(ref dt) => Ok(pl::Field::new(fld.name(), dt.clone())), + None => Ok(fld.clone()), }); robj_to!(bool, agg_list) @@ -2031,8 +2023,8 @@ impl RPolarsExpr { let ot = null_to_opt(output_type).map(|rdt| rdt.0.clone()); let output_map = pl::GetOutput::map_field(move |fld| match ot { - Some(ref dt) => pl::Field::new(fld.name(), dt.clone()), - None => fld.clone(), + Some(ref dt) => Ok(pl::Field::new(fld.name(), dt.clone())), + None => Ok(fld.clone()), }); self.0.clone().apply(rbgfunc, output_map).into() @@ -2131,12 +2123,12 @@ impl RPolarsExpr { self.clone().0.str().len_chars().into() } - pub fn str_concat(&self, delimiter: Robj, ignore_nulls: Robj) -> RResult { + pub fn str_join(&self, delimiter: Robj, ignore_nulls: Robj) -> RResult { Ok(self .0 .clone() .str() - .concat(robj_to!(str, delimiter)?, robj_to!(bool, ignore_nulls)?) + .join(robj_to!(str, delimiter)?, robj_to!(bool, ignore_nulls)?) 
.into()) } diff --git a/src/rust/src/rdataframe/mod.rs b/src/rust/src/rdataframe/mod.rs index d6d8452e8..925ad4428 100644 --- a/src/rust/src/rdataframe/mod.rs +++ b/src/rust/src/rdataframe/mod.rs @@ -25,7 +25,7 @@ use polars_core::utils::arrow; use crate::utils::{collect_hinted_result, r_result_list}; use crate::conversion::strings_to_smartstrings; -use polars::frame::explode::MeltArgs; +use polars::frame::explode::UnpivotArgs; use polars::prelude::pivot::{pivot, pivot_stable}; pub struct OwnedDataFrameIterator { @@ -382,23 +382,23 @@ impl RPolarsDataFrame { self.0.clone().null_count().into() } - fn melt( + fn unpivot( &self, - id_vars: Robj, - value_vars: Robj, + on: Robj, + index: Robj, value_name: Robj, variable_name: Robj, ) -> RResult { - let args = MeltArgs { - id_vars: strings_to_smartstrings(robj_to!(Vec, String, id_vars)?), - value_vars: strings_to_smartstrings(robj_to!(Vec, String, value_vars)?), + let args = UnpivotArgs { + on: strings_to_smartstrings(robj_to!(Vec, String, on)?), + index: strings_to_smartstrings(robj_to!(Vec, String, index)?), value_name: robj_to!(Option, String, value_name)?.map(|s| s.into()), variable_name: robj_to!(Option, String, variable_name)?.map(|s| s.into()), streamable: false, }; self.0 - .melt2(args) + .unpivot2(args) .map_err(polars_to_rpolars_err) .map(RPolarsDataFrame) } @@ -406,8 +406,8 @@ impl RPolarsDataFrame { #[allow(clippy::too_many_arguments)] pub fn pivot_expr( &self, + on: Robj, index: Robj, - columns: Robj, values: Robj, maintain_order: Robj, sort_columns: Robj, @@ -422,8 +422,8 @@ impl RPolarsDataFrame { fun( &self.0, - robj_to!(Vec, String, index)?, - robj_to!(Vec, String, columns)?, + robj_to!(Vec, String, on)?, + robj_to!(Option, Vec, String, index)?, robj_to!(Option, Vec, String, values)?, robj_to!(bool, sort_columns)?, robj_to!(Option, PLExpr, aggregate_expr)?, @@ -574,7 +574,7 @@ impl RPolarsDataFrame { compression_method, compression_level, )?) - .with_statistics(robj_to!(bool, statistics)?) 
+ .with_statistics(robj_to!(StatisticsOptions, statistics)?) .with_row_group_size(robj_to!(Option, usize, row_group_size)?) .with_data_page_size(robj_to!(Option, usize, data_pagesize_limit)?) .set_parallel(true) diff --git a/src/rust/src/rdataframe/read_ndjson.rs b/src/rust/src/rdataframe/read_ndjson.rs index a2d51d1a3..6e3091d86 100644 --- a/src/rust/src/rdataframe/read_ndjson.rs +++ b/src/rust/src/rdataframe/read_ndjson.rs @@ -38,7 +38,7 @@ pub fn new_from_ndjson( }?; linereader - .with_infer_schema_length(robj_to!(Option, usize, infer_schema_length)?) + .with_infer_schema_length(robj_to!(Option, nonzero_usize, infer_schema_length)?) .with_batch_size(robj_to!(Option, nonzero_usize, batch_size)?) .with_n_rows(robj_to!(Option, usize, n_rows)?) .low_memory(robj_to!(bool, low_memory)?) diff --git a/src/rust/src/rdataframe/read_parquet.rs b/src/rust/src/rdataframe/read_parquet.rs index 07ac9578f..af7bfc662 100644 --- a/src/rust/src/rdataframe/read_parquet.rs +++ b/src/rust/src/rdataframe/read_parquet.rs @@ -42,8 +42,10 @@ pub fn new_from_parquet( cloud_options, use_statistics: robj_to!(bool, use_statistics)?, hive_options: polars::io::HiveOptions { - enabled: robj_to!(bool, hive_partitioning)?, - schema: None, // TODO: implement a option to set this + enabled: robj_to!(Option, bool, hive_partitioning)?, + hive_start_idx: 0, // TODO: is it actually 0? 
+ schema: None, // TODO: implement a option to set this + try_parse_dates: true, }, glob: robj_to!(bool, glob)?, }; diff --git a/src/rust/src/rdatatype.rs b/src/rust/src/rdatatype.rs index 8b25d9f6f..17249fcc4 100644 --- a/src/rust/src/rdatatype.rs +++ b/src/rust/src/rdatatype.rs @@ -720,6 +720,23 @@ pub fn robj_new_null_behavior(robj: Robj) -> RResult RResult { + use pl::StatisticsOptions as SO; + let hm = robj + .as_list() + .unwrap() + .into_hashmap() + .into_iter() + .map(|xi| (xi.0, xi.1.as_bool().unwrap())) + .collect::>(); + let mut out = SO::default(); + out.min_value = *hm.get(&"min").unwrap(); + out.max_value = *hm.get(&"max").unwrap(); + out.distinct_count = *hm.get(&"distinct_count").unwrap(); + out.null_count = *hm.get(&"null_count").unwrap(); + Ok(out) +} + pub fn parse_fill_null_strategy( strategy: &str, limit: Option, diff --git a/src/rust/src/rlib.rs b/src/rust/src/rlib.rs index 001b1b3a6..10df4c0e7 100644 --- a/src/rust/src/rlib.rs +++ b/src/rust/src/rlib.rs @@ -86,40 +86,22 @@ fn concat_str(dotdotdot: Robj, separator: Robj, ignore_nulls: Robj) -> RResult RResult { +fn date_range(start: Robj, end: Robj, interval: &str, closed: Robj) -> RResult { Ok(RPolarsExpr(polars::lazy::prelude::date_range( robj_to!(PLExprCol, start)?, robj_to!(PLExprCol, end)?, pl::Duration::parse(interval), robj_to!(ClosedWindow, closed)?, - robj_to!(Option, timeunit, time_unit)?, - robj_to!(Option, String, time_zone)?, ))) } #[extendr] -fn date_ranges( - start: Robj, - end: Robj, - interval: &str, - closed: Robj, - time_unit: Robj, - time_zone: Robj, -) -> RResult { +fn date_ranges(start: Robj, end: Robj, interval: &str, closed: Robj) -> RResult { Ok(RPolarsExpr(polars::lazy::prelude::date_ranges( robj_to!(PLExprCol, start)?, robj_to!(PLExprCol, end)?, pl::Duration::parse(interval), robj_to!(ClosedWindow, closed)?, - robj_to!(Option, timeunit, time_unit)?, - robj_to!(Option, String, time_zone)?, ))) } diff --git a/src/rust/src/series.rs b/src/rust/src/series.rs index 
85fc0ddae..13184609c 100644 --- a/src/rust/src/series.rs +++ b/src/rust/src/series.rs @@ -166,9 +166,10 @@ impl RPolarsSeries { sort: bool, parallel: bool, name: String, + normalize: bool, ) -> std::result::Result { self.0 - .value_counts(sort, parallel, name) + .value_counts(sort, parallel, name, normalize) .map(RPolarsDataFrame) .map_err(|err| format!("in value_counts: {:?}", err)) } @@ -365,24 +366,34 @@ impl RPolarsSeries { ) } - pub fn add(&self, other: &RPolarsSeries) -> Self { - (&self.0 + &other.0).into() + pub fn add(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 + &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) } - pub fn sub(&self, other: &RPolarsSeries) -> Self { - (&self.0 - &other.0).into() + pub fn sub(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 - &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) } - pub fn mul(&self, other: &RPolarsSeries) -> Self { - (&self.0 * &other.0).into() + pub fn mul(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 * &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) } - pub fn div(&self, other: &RPolarsSeries) -> Self { - (&self.0 / &other.0).into() + pub fn div(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 / &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) } - pub fn rem(&self, other: &RPolarsSeries) -> Self { - (&self.0 % &other.0).into() + pub fn rem(&self, other: &RPolarsSeries) -> RResult { + Ok((&self.0 % &other.0) + .map(Into::into) + .map_err(polars_to_rpolars_err)?) } pub fn map_elements( diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index 78359950b..e72b0cb28 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -1087,6 +1087,10 @@ macro_rules! 
robj_to_inner { (WindowMapping, $a:ident) => { $crate::rdatatype::robj_to_window_mapping($a) }; + + (StatisticsOptions, $a:ident) => { + $crate::rdatatype::robj_to_statistics_options($a) + }; } //convert any Robj to appropriate rust type with informative error Strings diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 08a79d57f..29f35f8c3 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -85,15 +85,15 @@ [21] "group_by" "group_by_dynamic" "head" "height" [25] "item" "join" "join_asof" "last" [29] "lazy" "limit" "max" "mean" - [33] "median" "melt" "min" "n_chunks" - [37] "null_count" "partition_by" "pivot" "print" - [41] "quantile" "rechunk" "rename" "reverse" - [45] "rolling" "sample" "schema" "select" - [49] "select_seq" "shape" "shift" "shift_and_fill" - [53] "slice" "sort" "sql" "std" - [57] "sum" "tail" "to_data_frame" "to_list" - [61] "to_raw_ipc" "to_series" "to_struct" "transpose" - [65] "unique" "unnest" "var" "width" + [33] "median" "min" "n_chunks" "null_count" + [37] "partition_by" "pivot" "print" "quantile" + [41] "rechunk" "rename" "reverse" "rolling" + [45] "sample" "schema" "select" "select_seq" + [49] "shape" "shift" "shift_and_fill" "slice" + [53] "sort" "sql" "std" "sum" + [57] "tail" "to_data_frame" "to_list" "to_raw_ipc" + [61] "to_series" "to_struct" "transpose" "unique" + [65] "unnest" "unpivot" "var" "width" [69] "with_columns" "with_columns_seq" "with_row_index" "write_csv" [73] "write_ipc" "write_json" "write_ndjson" "write_parquet" @@ -110,19 +110,19 @@ [11] "export_stream" "from_arrow_record_batches" [13] "from_raw_ipc" "get_column" [15] "get_columns" "lazy" - [17] "melt" "n_chunks" - [19] "new_with_capacity" "null_count" - [21] "partition_by" "pivot_expr" - [23] "print" "rechunk" - [25] "sample_frac" "sample_n" - [27] "schema" "select" - [29] "select_at_idx" "select_seq" - [31] "set_column_from_robj" "set_column_from_series" - [33] 
"set_column_names_mut" "shape" - [35] "to_list" "to_list_tag_structs" - [37] "to_list_unwind" "to_raw_ipc" - [39] "to_struct" "transpose" - [41] "unnest" "with_columns" + [17] "n_chunks" "new_with_capacity" + [19] "null_count" "partition_by" + [21] "pivot_expr" "print" + [23] "rechunk" "sample_frac" + [25] "sample_n" "schema" + [27] "select" "select_at_idx" + [29] "select_seq" "set_column_from_robj" + [31] "set_column_from_series" "set_column_names_mut" + [33] "shape" "to_list" + [35] "to_list_tag_structs" "to_list_unwind" + [37] "to_raw_ipc" "to_struct" + [39] "transpose" "unnest" + [41] "unpivot" "with_columns" [43] "with_columns_seq" "with_row_index" [45] "write_csv" "write_ipc" [47] "write_json" "write_ndjson" @@ -156,21 +156,21 @@ [21] "join" "join_asof" [23] "last" "limit" [25] "max" "mean" - [27] "median" "melt" - [29] "min" "print" - [31] "profile" "quantile" - [33] "rename" "reverse" - [35] "rolling" "schema" - [37] "select" "select_seq" - [39] "serialize" "set_optimization_toggle" - [41] "shift" "shift_and_fill" - [43] "sink_csv" "sink_ipc" - [45] "sink_ndjson" "sink_parquet" - [47] "slice" "sort" - [49] "sql" "std" - [51] "sum" "tail" - [53] "to_dot" "unique" - [55] "unnest" "var" + [27] "median" "min" + [29] "print" "profile" + [31] "quantile" "rename" + [33] "reverse" "rolling" + [35] "schema" "select" + [37] "select_seq" "serialize" + [39] "set_optimization_toggle" "shift" + [41] "shift_and_fill" "sink_csv" + [43] "sink_ipc" "sink_ndjson" + [45] "sink_parquet" "slice" + [47] "sort" "sql" + [49] "std" "sum" + [51] "tail" "to_dot" + [53] "unique" "unnest" + [55] "unpivot" "var" [57] "width" "with_columns" [59] "with_columns_seq" "with_context" [61] "with_row_index" @@ -192,20 +192,20 @@ [19] "join" "join_asof" [21] "last" "max" [23] "mean" "median" - [25] "melt" "min" - [27] "print" "profile" - [29] "quantile" "rename" - [31] "reverse" "rolling" - [33] "schema" "select" - [35] "select_seq" "serialize" - [37] "set_optimization_toggle" "shift" - [39] 
"shift_and_fill" "sink_csv" - [41] "sink_ipc" "sink_json" - [43] "sink_parquet" "slice" - [45] "sort_by_exprs" "std" - [47] "sum" "tail" - [49] "to_dot" "unique" - [51] "unnest" "var" + [25] "min" "print" + [27] "profile" "quantile" + [29] "rename" "reverse" + [31] "rolling" "schema" + [33] "select" "select_seq" + [35] "serialize" "set_optimization_toggle" + [37] "shift" "shift_and_fill" + [39] "sink_csv" "sink_ipc" + [41] "sink_json" "sink_parquet" + [43] "slice" "sort_by_exprs" + [45] "std" "sum" + [47] "tail" "to_dot" + [49] "unique" "unnest" + [51] "unpivot" "var" [53] "with_columns" "with_columns_seq" [55] "with_context" "with_row_index" @@ -257,25 +257,25 @@ [121] "product" "qcut" "quantile" [124] "rank" "rechunk" "reinterpret" [127] "rep" "repeat_by" "replace" - [130] "reshape" "reverse" "rle" - [133] "rle_id" "rolling" "rolling_max" - [136] "rolling_max_by" "rolling_mean" "rolling_mean_by" - [139] "rolling_median" "rolling_median_by" "rolling_min" - [142] "rolling_min_by" "rolling_quantile" "rolling_quantile_by" - [145] "rolling_skew" "rolling_std" "rolling_std_by" - [148] "rolling_sum" "rolling_sum_by" "rolling_var" - [151] "rolling_var_by" "round" "sample" - [154] "search_sorted" "set_sorted" "shift" - [157] "shift_and_fill" "shrink_dtype" "shuffle" - [160] "sign" "sin" "sinh" - [163] "skew" "slice" "sort" - [166] "sort_by" "sqrt" "std" - [169] "str" "struct" "sub" - [172] "sum" "tail" "tan" - [175] "tanh" "to_physical" "to_r" - [178] "to_series" "top_k" "unique" - [181] "unique_counts" "upper_bound" "value_counts" - [184] "var" "xor" + [130] "replace_strict" "reshape" "reverse" + [133] "rle" "rle_id" "rolling" + [136] "rolling_max" "rolling_max_by" "rolling_mean" + [139] "rolling_mean_by" "rolling_median" "rolling_median_by" + [142] "rolling_min" "rolling_min_by" "rolling_quantile" + [145] "rolling_quantile_by" "rolling_skew" "rolling_std" + [148] "rolling_std_by" "rolling_sum" "rolling_sum_by" + [151] "rolling_var" "rolling_var_by" "round" + [154] 
"sample" "search_sorted" "set_sorted" + [157] "shift" "shift_and_fill" "shrink_dtype" + [160] "shuffle" "sign" "sin" + [163] "sinh" "skew" "slice" + [166] "sort" "sort_by" "sqrt" + [169] "std" "str" "struct" + [172] "sub" "sum" "tail" + [175] "tan" "tanh" "to_physical" + [178] "to_r" "to_series" "top_k" + [181] "unique" "unique_counts" "upper_bound" + [184] "value_counts" "var" "xor" --- @@ -399,56 +399,56 @@ [229] "rank" "rechunk" [231] "reinterpret" "rem" [233] "rep" "repeat_by" - [235] "replace" "reshape" - [237] "reverse" "rle" - [239] "rle_id" "rolling" - [241] "rolling_corr" "rolling_cov" - [243] "rolling_max" "rolling_max_by" - [245] "rolling_mean" "rolling_mean_by" - [247] "rolling_median" "rolling_median_by" - [249] "rolling_min" "rolling_min_by" - [251] "rolling_quantile" "rolling_quantile_by" - [253] "rolling_skew" "rolling_std" - [255] "rolling_std_by" "rolling_sum" - [257] "rolling_sum_by" "rolling_var" - [259] "rolling_var_by" "round" - [261] "sample_frac" "sample_n" - [263] "search_sorted" "shift" - [265] "shift_and_fill" "shrink_dtype" - [267] "shuffle" "sign" - [269] "sin" "sinh" - [271] "skew" "slice" - [273] "sort_by" "sort_with" - [275] "std" "str_base64_decode" - [277] "str_base64_encode" "str_concat" + [235] "replace" "replace_strict" + [237] "reshape" "reverse" + [239] "rle" "rle_id" + [241] "rolling" "rolling_corr" + [243] "rolling_cov" "rolling_max" + [245] "rolling_max_by" "rolling_mean" + [247] "rolling_mean_by" "rolling_median" + [249] "rolling_median_by" "rolling_min" + [251] "rolling_min_by" "rolling_quantile" + [253] "rolling_quantile_by" "rolling_skew" + [255] "rolling_std" "rolling_std_by" + [257] "rolling_sum" "rolling_sum_by" + [259] "rolling_var" "rolling_var_by" + [261] "round" "sample_frac" + [263] "sample_n" "search_sorted" + [265] "shift" "shift_and_fill" + [267] "shrink_dtype" "shuffle" + [269] "sign" "sin" + [271] "sinh" "skew" + [273] "slice" "sort_by" + [275] "sort_with" "std" + [277] "str_base64_decode" 
"str_base64_encode" [279] "str_contains" "str_contains_any" [281] "str_count_matches" "str_ends_with" [283] "str_extract" "str_extract_all" [285] "str_extract_groups" "str_find" [287] "str_head" "str_hex_decode" - [289] "str_hex_encode" "str_json_decode" - [291] "str_json_path_match" "str_len_bytes" - [293] "str_len_chars" "str_pad_end" - [295] "str_pad_start" "str_replace" - [297] "str_replace_all" "str_replace_many" - [299] "str_reverse" "str_slice" - [301] "str_split" "str_split_exact" - [303] "str_splitn" "str_starts_with" - [305] "str_strip_chars" "str_strip_chars_end" - [307] "str_strip_chars_start" "str_tail" - [309] "str_to_date" "str_to_datetime" - [311] "str_to_integer" "str_to_lowercase" - [313] "str_to_time" "str_to_titlecase" - [315] "str_to_uppercase" "str_zfill" - [317] "struct_field_by_name" "struct_rename_fields" - [319] "struct_with_fields" "sub" - [321] "sum" "tail" - [323] "tan" "tanh" - [325] "to_physical" "top_k" - [327] "unique" "unique_counts" - [329] "unique_stable" "upper_bound" - [331] "value_counts" "var" - [333] "xor" + [289] "str_hex_encode" "str_join" + [291] "str_json_decode" "str_json_path_match" + [293] "str_len_bytes" "str_len_chars" + [295] "str_pad_end" "str_pad_start" + [297] "str_replace" "str_replace_all" + [299] "str_replace_many" "str_reverse" + [301] "str_slice" "str_split" + [303] "str_split_exact" "str_splitn" + [305] "str_starts_with" "str_strip_chars" + [307] "str_strip_chars_end" "str_strip_chars_start" + [309] "str_tail" "str_to_date" + [311] "str_to_datetime" "str_to_integer" + [313] "str_to_lowercase" "str_to_time" + [315] "str_to_titlecase" "str_to_uppercase" + [317] "str_zfill" "struct_field_by_name" + [319] "struct_rename_fields" "struct_with_fields" + [321] "sub" "sum" + [323] "tail" "tan" + [325] "tanh" "to_physical" + [327] "top_k" "unique" + [329] "unique_counts" "unique_stable" + [331] "upper_bound" "value_counts" + [333] "var" "xor" # public and private methods of each class When @@ -512,26 +512,26 @@ 
[121] "print" "product" "qcut" [124] "quantile" "rank" "rechunk" [127] "reinterpret" "rep" "repeat_by" - [130] "replace" "reshape" "reverse" - [133] "rle" "rle_id" "rolling" - [136] "rolling_max" "rolling_max_by" "rolling_mean" - [139] "rolling_mean_by" "rolling_median" "rolling_median_by" - [142] "rolling_min" "rolling_min_by" "rolling_quantile" - [145] "rolling_quantile_by" "rolling_skew" "rolling_std" - [148] "rolling_std_by" "rolling_sum" "rolling_sum_by" - [151] "rolling_var" "rolling_var_by" "round" - [154] "sample" "search_sorted" "set_sorted" - [157] "shift" "shift_and_fill" "shrink_dtype" - [160] "shuffle" "sign" "sin" - [163] "sinh" "skew" "slice" - [166] "sort" "sort_by" "sqrt" - [169] "std" "str" "struct" - [172] "sub" "sum" "tail" - [175] "tan" "tanh" "to_physical" - [178] "to_r" "to_series" "top_k" - [181] "unique" "unique_counts" "upper_bound" - [184] "value_counts" "var" "when" - [187] "xor" + [130] "replace" "replace_strict" "reshape" + [133] "reverse" "rle" "rle_id" + [136] "rolling" "rolling_max" "rolling_max_by" + [139] "rolling_mean" "rolling_mean_by" "rolling_median" + [142] "rolling_median_by" "rolling_min" "rolling_min_by" + [145] "rolling_quantile" "rolling_quantile_by" "rolling_skew" + [148] "rolling_std" "rolling_std_by" "rolling_sum" + [151] "rolling_sum_by" "rolling_var" "rolling_var_by" + [154] "round" "sample" "search_sorted" + [157] "set_sorted" "shift" "shift_and_fill" + [160] "shrink_dtype" "shuffle" "sign" + [163] "sin" "sinh" "skew" + [166] "slice" "sort" "sort_by" + [169] "sqrt" "std" "str" + [172] "struct" "sub" "sum" + [175] "tail" "tan" "tanh" + [178] "to_physical" "to_r" "to_series" + [181] "top_k" "unique" "unique_counts" + [184] "upper_bound" "value_counts" "var" + [187] "when" "xor" --- @@ -602,26 +602,26 @@ [121] "print" "product" "qcut" [124] "quantile" "rank" "rechunk" [127] "reinterpret" "rep" "repeat_by" - [130] "replace" "reshape" "reverse" - [133] "rle" "rle_id" "rolling" - [136] "rolling_max" "rolling_max_by" 
"rolling_mean" - [139] "rolling_mean_by" "rolling_median" "rolling_median_by" - [142] "rolling_min" "rolling_min_by" "rolling_quantile" - [145] "rolling_quantile_by" "rolling_skew" "rolling_std" - [148] "rolling_std_by" "rolling_sum" "rolling_sum_by" - [151] "rolling_var" "rolling_var_by" "round" - [154] "sample" "search_sorted" "set_sorted" - [157] "shift" "shift_and_fill" "shrink_dtype" - [160] "shuffle" "sign" "sin" - [163] "sinh" "skew" "slice" - [166] "sort" "sort_by" "sqrt" - [169] "std" "str" "struct" - [172] "sub" "sum" "tail" - [175] "tan" "tanh" "to_physical" - [178] "to_r" "to_series" "top_k" - [181] "unique" "unique_counts" "upper_bound" - [184] "value_counts" "var" "when" - [187] "xor" + [130] "replace" "replace_strict" "reshape" + [133] "reverse" "rle" "rle_id" + [136] "rolling" "rolling_max" "rolling_max_by" + [139] "rolling_mean" "rolling_mean_by" "rolling_median" + [142] "rolling_median_by" "rolling_min" "rolling_min_by" + [145] "rolling_quantile" "rolling_quantile_by" "rolling_skew" + [148] "rolling_std" "rolling_std_by" "rolling_sum" + [151] "rolling_sum_by" "rolling_var" "rolling_var_by" + [154] "round" "sample" "search_sorted" + [157] "set_sorted" "shift" "shift_and_fill" + [160] "shrink_dtype" "shuffle" "sign" + [163] "sin" "sinh" "skew" + [166] "slice" "sort" "sort_by" + [169] "sqrt" "std" "str" + [172] "struct" "sub" "sum" + [175] "tail" "tan" "tanh" + [178] "to_physical" "to_r" "to_series" + [181] "top_k" "unique" "unique_counts" + [184] "upper_bound" "value_counts" "var" + [187] "when" "xor" --- @@ -695,26 +695,27 @@ [127] "product" "qcut" "quantile" [130] "rank" "rechunk" "reinterpret" [133] "rename" "rep" "repeat_by" - [136] "replace" "reshape" "reverse" - [139] "rle" "rle_id" "rolling_max" - [142] "rolling_max_by" "rolling_mean" "rolling_mean_by" - [145] "rolling_median" "rolling_median_by" "rolling_min" - [148] "rolling_min_by" "rolling_quantile" "rolling_quantile_by" - [151] "rolling_skew" "rolling_std" "rolling_std_by" - [154] 
"rolling_sum" "rolling_sum_by" "rolling_var" - [157] "rolling_var_by" "round" "sample" - [160] "search_sorted" "set_sorted" "shape" - [163] "shift" "shift_and_fill" "shrink_dtype" - [166] "shuffle" "sign" "sin" - [169] "sinh" "skew" "slice" - [172] "sort" "sort_by" "sqrt" - [175] "std" "str" "struct" - [178] "sub" "sum" "tail" - [181] "tan" "tanh" "to_frame" - [184] "to_list" "to_lit" "to_physical" - [187] "to_r" "to_vector" "top_k" - [190] "unique" "unique_counts" "upper_bound" - [193] "value_counts" "var" "xor" + [136] "replace" "replace_strict" "reshape" + [139] "reverse" "rle" "rle_id" + [142] "rolling_max" "rolling_max_by" "rolling_mean" + [145] "rolling_mean_by" "rolling_median" "rolling_median_by" + [148] "rolling_min" "rolling_min_by" "rolling_quantile" + [151] "rolling_quantile_by" "rolling_skew" "rolling_std" + [154] "rolling_std_by" "rolling_sum" "rolling_sum_by" + [157] "rolling_var" "rolling_var_by" "round" + [160] "sample" "search_sorted" "set_sorted" + [163] "shape" "shift" "shift_and_fill" + [166] "shrink_dtype" "shuffle" "sign" + [169] "sin" "sinh" "skew" + [172] "slice" "sort" "sort_by" + [175] "sqrt" "std" "str" + [178] "struct" "sub" "sum" + [181] "tail" "tan" "tanh" + [184] "to_frame" "to_list" "to_lit" + [187] "to_physical" "to_r" "to_vector" + [190] "top_k" "unique" "unique_counts" + [193] "upper_bound" "value_counts" "var" + [196] "xor" --- diff --git a/tests/testthat/_snaps/lazy.md b/tests/testthat/_snaps/lazy.md index 97df90723..292a4bc47 100644 --- a/tests/testthat/_snaps/lazy.md +++ b/tests/testthat/_snaps/lazy.md @@ -57,8 +57,7 @@ } }, "output_schema": null, - "projection": null, - "selection": null + "filter": null } }, "predicate": { @@ -78,7 +77,8 @@ }, "options": { "run_parallel": true, - "duplicate_check": true + "duplicate_check": true, + "should_broadcast": true } } } diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index b191f3aba..46bba75a1 100644 --- a/tests/testthat/test-dataframe.R +++ 
b/tests/testthat/test-dataframe.R @@ -969,7 +969,7 @@ test_that("pivot examples", { expect_identical( df$pivot( - values = "baz", index = "foo", columns = "bar", aggregate_function = "first" + values = "baz", index = "foo", on = "bar", aggregate_function = "first" )$to_list(), list(foo = c("one", "two"), A = c(1, 4), B = c(2, 5), C = c(3, 6)) ) @@ -985,7 +985,7 @@ test_that("pivot examples", { expect_equal( df$pivot( index = "col1", - columns = "col2", + on = "col2", values = "col3", aggregate_function = pl$element()$tanh()$mean() )$to_list(), @@ -1006,7 +1006,7 @@ test_that("pivot args works", { jaz = 6:1 ) expect_identical( - df$pivot("foo", "bar", "baz")$to_list(), + df$pivot("baz", index = "bar", values = "foo")$to_list(), list(bar = c("A", "B", "C"), `1.0` = c("one", NA, NA), `2.0` = c( NA, "one", NA @@ -1021,39 +1021,39 @@ test_that("pivot args works", { # aggr functions expect_identical( - df$pivot("cat", "ann", "bob", aggregate_function = "mean")$to_list(), + df$pivot("bob", index = "ann", values = "cat", aggregate_function = "mean")$to_list(), list(ann = c("one", "two"), A = c(2, 5), B = c(2, 5)) ) expect_identical( - df$pivot("cat", "ann", "bob", aggregate_function = pl$element()$mean())$to_list(), - df$pivot("cat", "ann", "bob", aggregate_function = "mean")$to_list() + df$pivot("bob", index = "ann", values = "cat", aggregate_function = pl$element()$mean())$to_list(), + df$pivot("bob", index = "ann", values = "cat", aggregate_function = "mean")$to_list() ) expect_grepl_error( - df$pivot("ann", "bob", "cat", aggregate_function = 42), - c("pivot", "param", "aggregate_function", "42") + df$pivot("cat", index = "bob", values = "ann", aggregate_function = 42), + "is neither a string, NULL or an Expr" ) expect_grepl_error( - df$pivot("ann", "bob", "cat", aggregate_function = "dummy"), - c("pivot", "dummy is not a method") + df$pivot("cat", index = "bob", values = "ann", aggregate_function = "dummy"), + "dummy is not a method" ) # maintain_order sort_columns 
expect_grepl_error( - df$pivot("ann", "bob", "cat", aggregate_function = "mean", maintain_order = 42), - c("pivot", "maintain_order", "bool") + df$pivot("cat", index = "bob", values = "ann", aggregate_function = "mean", maintain_order = 42), + "Expected a value of type \\[bool\\]" ) expect_grepl_error( - df$pivot("ann", "bob", "cat", aggregate_function = "mean", sort_columns = 42), - c("pivot", "sort_columns", "bool") + df$pivot("cat", index = "bob", values = "ann", aggregate_function = "mean", sort_columns = 42), + "Expected a value of type \\[bool\\]" ) # separator expect_named( - df$pivot(c("ann", "bob"), "ann", "cat", aggregate_function = "mean", separator = "."), + df$pivot("cat", index = "ann", values = c("ann", "bob"), aggregate_function = "mean", separator = "."), c( - "ann", "ann.cat.1.0", "ann.cat.2.0", "ann.cat.3.0", "ann.cat.4.0", - "ann.cat.5.0", "ann.cat.6.0", "bob.cat.1.0", "bob.cat.2.0", "bob.cat.3.0", - "bob.cat.4.0", "bob.cat.5.0", "bob.cat.6.0" + "ann", "ann.1.0", "ann.2.0", "ann.3.0", "ann.4.0", + "ann.5.0", "ann.6.0", "bob.1.0", "bob.2.0", "bob.3.0", + "bob.4.0", "bob.5.0", "bob.6.0" ) ) }) diff --git a/tests/testthat/test-expr_datetime.R b/tests/testthat/test-expr_datetime.R index 718875ec0..f79c7f37c 100644 --- a/tests/testthat/test-expr_datetime.R +++ b/tests/testthat/test-expr_datetime.R @@ -90,14 +90,6 @@ test_that("pl$date_range", { )$to_series()$to_vector(), seq(as.Date("2022-01-01"), as.Date("2022-03-01"), by = "1 month") ) - - # Deprecated usage - expect_identical( - suppressWarnings(pl$date_range( - as.POSIXct("2022-01-01 12:00", "UTC"), as.POSIXct("2022-01-03", "UTC"), "1d" - )$to_series()$to_vector()), - as.POSIXct(c("2022-01-01 12:00", "2022-01-02 12:00"), "UTC") - ) }) test_that("dt$truncate", { @@ -108,8 +100,7 @@ test_that("dt$truncate", { # use a dt namespace function df = pl$DataFrame(datetime = s)$with_columns( - pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s"), - pl$col("datetime")$dt$truncate("4s", 
offset("3s"))$alias("truncated_4s_offset_2s") + pl$col("datetime")$dt$truncate("4s")$alias("truncated_4s") ) l_actual = df$to_list() @@ -117,15 +108,9 @@ test_that("dt$truncate", { lapply(l_actual, \(x) diff(x) |> as.numeric()), list( datetime = rep(2, 12), - truncated_4s = rep(c(0, 4), 6), - truncated_4s_offset_2s = rep(c(0, 4), 6) + truncated_4s = rep(c(0, 4), 6) ) ) - - expect_identical( - as.numeric(l_actual$truncated_4s_offset_2s - l_actual$truncated_4s), - rep(3, 13) - ) }) @@ -136,11 +121,8 @@ test_that("dt$round", { s = pl$datetime_range(t1, t2, interval = "2s", time_unit = "ms") # use a dt namespace function - ## TODO contribute POLARS, offset makes little sense, it should be implemented - ## before round not after. df = pl$DataFrame(datetime = s)$with_columns( - pl$col("datetime")$dt$round("8s")$alias("truncated_4s"), - pl$col("datetime")$dt$round("8s", offset("4s1ms"))$alias("truncated_4s_offset_2s") + pl$col("datetime")$dt$round("8s")$alias("truncated_4s") ) l_actual = df$to_list() @@ -148,8 +130,7 @@ test_that("dt$round", { lapply(l_actual, \(x) diff(x) |> as.numeric()), list( datetime = rep(2, 12), - truncated_4s = rep(c(0, 8, 0, 0), 3), - truncated_4s_offset_2s = rep(c(0, 8, 0, 0), 3) + truncated_4s = rep(c(0, 8, 0, 0), 3) ) ) @@ -161,10 +142,6 @@ test_that("dt$round", { pl$col("datetime")$dt$round(c("2s", "1h")), "`every` must be a single non-NA character or difftime" ) - expect_grepl_error( - pl$col("datetime")$dt$round("1s", 42), - "`offset` must be a single non-NA character or difftime" - ) }) test_that("dt$combine", { diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index 7ccc2026e..4b46b81f2 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -324,11 +324,26 @@ test_that("$over() with mapping_strategy", { expect_identical( df$select(pl$col("val")$top_k(2)$over("a", mapping_strategy = "join"))$to_list(), list( - val = list(c(5L, 2L), c(5L, 2L), c(4L, 3L), c(4L, 3L), c(5L, 2L)) + val 
= list(c(5L, 2L), c(5L, 2L), c(3L, 4L), c(3L, 4L), c(5L, 2L)) ) ) }) +test_that("arg 'order_by' in $over() works", { + df = pl$DataFrame( + g = c(1, 1, 1, 1, 2, 2, 2, 2), + t = c(1, 2, 3, 4, 4, 1, 2, 3), + x = c(10, 20, 30, 40, 10, 20, 30, 40) + ) + + expect_equal( + df$select( + x_lag = pl$col("x")$shift(1)$over("g", order_by = "t") + )$to_list(), + list(x_lag = c(NA, 10, 20, 30, 40, NA, 20, 30)) + ) +}) + test_that("col DataType + col(s) + col regex", { # one Datatype expect_equal( @@ -896,14 +911,14 @@ test_that("Expr_sort", { }) -test_that("Expr_k_top", { +test_that("$top_k() works", { l = list(a = c(6, 1, 0, NA, Inf, -Inf, NaN)) l_actual = pl$DataFrame(l)$select( pl$col("a")$top_k(3)$alias("k_top"), pl$col("a")$bottom_k(3)$alias("k_bot") ) - known = structure(list(k_top = c(NaN, Inf, 6), k_bot = c(NA, -Inf, 0)), + known = structure(list(k_top = c(NaN, Inf, 6), k_bot = c(-Inf, 0, 1)), row.names = c(NA, -3L), class = "data.frame" ) expect_equal(l_actual$to_data_frame(), known) @@ -2381,6 +2396,18 @@ test_that("$value_counts", { count = rep(50, 3) ) ) + + # arg "normalize" + expect_equal( + df$select(pl$col("Species")$value_counts(normalize = TRUE))$ + unnest()$ + sort("Species")$ + to_data_frame(), + data.frame( + Species = factor(c("setosa", "versicolor", "virginica")), + proportion = rep(0.33333333, 3) + ) + ) }) @@ -2814,8 +2841,8 @@ test_that("replace works", { # the replacements mapping = list(`2` = 100, `3` = 200) expect_equal( - df$select(replaced = pl$col("a")$replace(mapping, default = -1))$to_list(), - list(replaced = c(-1, 100, 100, 200)) + df$select(replaced = pl$col("a")$replace(mapping))$to_list(), + list(replaced = c(1, 100, 100, 200)) ) df = pl$DataFrame(a = c("x", "y", "z")) @@ -2825,10 +2852,59 @@ test_that("replace works", { list(replaced = c("1.0", "2.0", "3.0")) ) + # "old", "new", and "default" can take Expr + df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) + expect_equal( + df$select( + replaced = pl$col("a")$replace( + old = 
pl$col("a")$max(), + new = pl$col("b")$sum() + ) + )$to_list(), + list(replaced = c(1, 2, 2, 10)) + ) +}) + +test_that("replace_strict works", { + df = pl$DataFrame(a = c(1, 2, 2, 3)) + + # replace_strict requires a default value + expect_error( + df$select(replaced = pl$col("a")$replace_strict(2, 100, return_dtype = pl$Float32))$to_list(), + "incomplete mapping specified for `replace_strict`" + ) + expect_equal( + df$select(replaced = pl$col("a")$replace_strict(c(2, 3), 999, default = 1))$to_list(), + list(replaced = c(1, 999, 999, 999)) + ) + expect_equal( + df$select(replaced = pl$col("a")$replace_strict(c(2, 3), c(100, 200), default = 1))$to_list(), + list(replaced = c(1, 100, 100, 200)) + ) + + # "old" can be a named list where names are values to replace, and values are + # the replacements + mapping = list(`2` = 100, `3` = 200) + expect_equal( + df$select(replaced = pl$col("a")$replace_strict(mapping, default = -1))$to_list(), + list(replaced = c(-1, 100, 100, 200)) + ) + + df = pl$DataFrame(a = c("x", "y", "z")) + mapping = list(x = 1, y = 2, z = 3) + expect_equal( + df$select(replaced = pl$col("a")$replace_strict(mapping, return_dtype = pl$String))$to_list(), + list(replaced = c("1.0", "2.0", "3.0")) + ) + expect_error( + df$select(pl$col("a")$replace_strict(mapping, return_dtype = pl$foo)), + "must be a valid dtype" + ) + # one can specify the data type to return instead of automatically inferring it expect_equal( df$ - select(replaced = pl$col("a")$replace(mapping, return_dtype = pl$Int8))$ + select(replaced = pl$col("a")$replace_strict(mapping, return_dtype = pl$Int32))$ to_list(), list(replaced = 1:3) ) @@ -2837,7 +2913,7 @@ test_that("replace works", { df = pl$DataFrame(a = c(1, 2, 2, 3), b = c(1.5, 2.5, 5, 1)) expect_equal( df$select( - replaced = pl$col("a")$replace( + replaced = pl$col("a")$replace_strict( old = pl$col("a")$max(), new = pl$col("b")$sum(), default = pl$col("b"), @@ -2852,8 +2928,8 @@ test_that("rle works", { expect_equal( 
df$select(pl$col("s")$rle())$unnest("s")$to_data_frame(), data.frame( - lengths = c(2, 1, 1, 1, 1, 2), - values = c(1, 2, 1, NA, 1, 3) + len = c(2, 1, 1, 1, 1, 2), + value = c(1, 2, 1, NA, 1, 3) ) ) }) @@ -2891,8 +2967,8 @@ test_that("cut works", { cut = pl$col("foo")$cut(c(-1, 1), include_breaks = TRUE) )$unnest("cut")$to_list(), list( - brk = c(-1, -1, 1, 1, Inf), - foo_bin = factor(c("(-inf, -1]", "(-inf, -1]", "(-1, 1]", "(-1, 1]", "(1, inf]")) + breakpoint = c(-1, -1, 1, 1, Inf), + category = factor(c("(-inf, -1]", "(-inf, -1]", "(-1, 1]", "(-1, 1]", "(1, inf]")) ) ) @@ -2901,8 +2977,8 @@ test_that("cut works", { cut = pl$col("foo")$cut(c(-1, 1), include_breaks = TRUE, left_closed = TRUE) )$unnest("cut")$to_list(), list( - brk = c(-1, 1, 1, Inf, Inf), - foo_bin = factor(c("[-inf, -1)", "[-1, 1)", "[-1, 1)", "[1, inf)", "[1, inf)")) + breakpoint = c(-1, 1, 1, Inf, Inf), + category = factor(c("[-inf, -1)", "[-1, 1)", "[-1, 1)", "[1, inf)", "[1, inf)")) ) ) }) @@ -2921,7 +2997,7 @@ test_that("qcut works", { df$select( qcut = pl$col("foo")$qcut(c(0.25, 0.75), labels = c("a", "b", "c"), include_breaks = TRUE) )$unnest("qcut")$to_list(), - list(brk = c(-1, -1, 1, 1, Inf), foo_bin = factor(c("a", "a", "b", "b", "c"))) + list(breakpoint = c(-1, -1, 1, 1, Inf), category = factor(c("a", "a", "b", "b", "c"))) ) expect_equal( diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index c4dec7167..c8dee7327 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -156,22 +156,30 @@ test_that("str$concat", { # concatenate a Series of strings to a single string df = pl$DataFrame(foo = c("1", "a", NA)) expect_identical( - df$select(pl$col("foo")$str$concat())$to_list()[[1]], + df$select(pl$col("foo")$str$join())$to_list()[[1]], "1a" ) expect_identical( - df$select(pl$col("foo")$str$concat("-"))$to_list()[[1]], + df$select(pl$col("foo")$str$join("-"))$to_list()[[1]], "1-a" ) expect_identical( - 
df$select(pl$col("foo")$str$concat(ignore_nulls = FALSE))$to_list()[[1]], + df$select(pl$col("foo")$str$join(ignore_nulls = FALSE))$to_list()[[1]], NA_character_ ) + # deprecated + expect_warning( + expect_identical( + df$select(pl$col("foo")$str$concat("-"))$to_list()[[1]], + "1-a" + ), + "deprecated" + ) # Series list of strings to Series of concatenated strings df = pl$DataFrame(list(bar = list(c("a", "b", "c"), c("1", "2", "æ")))) expect_identical( - df$select(pl$col("bar")$list$eval(pl$element()$str$concat())$list$first())$to_list()$bar, + df$select(pl$col("bar")$list$eval(pl$element()$str$join())$list$first())$to_list()$bar, sapply(df$to_list()[[1]], paste, collapse = "") ) }) diff --git a/tests/testthat/test-joins.R b/tests/testthat/test-joins.R index 8e3953468..c1f8cc9cf 100644 --- a/tests/testthat/test-joins.R +++ b/tests/testthat/test-joins.R @@ -111,6 +111,16 @@ test_that("cross join, DataFrame", { ) ) + expect_grepl_error( + dat$join(dat2, how = "cross", on = "foo"), + "cross join should not pass join keys" + ) + + expect_grepl_error( + dat$join(dat2, how = "cross", left_on = "foo", right_on = "foo2"), + "cross join should not pass join keys" + ) + # one empty dataframe dat_empty = pl$DataFrame(y = character()) expect_identical( @@ -146,37 +156,37 @@ test_that("argument 'validate' works", { # eager 1:1 expect_grepl_error( df1$join(df2, on = "x", validate = "1:1"), - "join keys did not fulfil 1:1 validation" + "join keys did not fulfill 1:1 validation" ) # lazy 1:1 expect_grepl_error( df1$lazy()$join(df2$lazy(), on = "x", validate = "1:1")$collect(), - "join keys did not fulfil 1:1 validation" + "join keys did not fulfill 1:1 validation" ) # eager m:1 expect_grepl_error( df1$join(df2, on = "x", validate = "m:1"), - "join keys did not fulfil m:1 validation" + "join keys did not fulfill m:1 validation" ) # lazy m:1 expect_grepl_error( df1$lazy()$join(df2$lazy(), on = "x", validate = "m:1")$collect(), - "join keys did not fulfil m:1 validation" + "join keys 
did not fulfill m:1 validation" ) # eager 1:m expect_grepl_error( df2$join(df1, on = "x", validate = "1:m"), - "join keys did not fulfil 1:m validation" + "join keys did not fulfill 1:m validation" ) # lazy 1:m expect_grepl_error( df2$lazy()$join(df1$lazy(), on = "x", validate = "1:m")$collect(), - "join keys did not fulfil 1:m validation" + "join keys did not fulfill 1:m validation" ) # eager error on unknown validate choice diff --git a/tests/testthat/test-melt.R b/tests/testthat/test-melt.R index 9449a01d3..cf112f374 100644 --- a/tests/testthat/test-melt.R +++ b/tests/testthat/test-melt.R @@ -1,4 +1,4 @@ -patrick::with_parameters_test_that("melt example", +patrick::with_parameters_test_that("unpivot example", { df_1 = pl[[create_func]]( a = c("x", "y", "z"), @@ -9,7 +9,7 @@ patrick::with_parameters_test_that("melt example", expect_true(is_func(df_1)) expect_identical( - df_1$melt(id_vars = "a", value_vars = c("b", "c")) |> as.data.frame(), + df_1$unpivot(index = "a", on = c("b", "c")) |> as.data.frame(), data.frame( a = c("x", "y", "z", "x", "y", "z"), variable = c("b", "b", "b", "c", "c", "c"), @@ -17,7 +17,7 @@ patrick::with_parameters_test_that("melt example", ) ) expect_identical( - df_1$melt(id_vars = c("c", "b"), value_vars = "a") |> as.data.frame(), + df_1$unpivot(index = c("c", "b"), value_vars = "a") |> as.data.frame(), data.frame( c = c(2, 4, 6), b = c(1, 3, 5), @@ -26,7 +26,7 @@ patrick::with_parameters_test_that("melt example", ) ) expect_identical( - df_1$melt(id_vars = c("a", "b"), value_vars = "c") |> as.data.frame(), + df_1$unpivot(index = c("a", "b"), value_vars = "c") |> as.data.frame(), data.frame( a = c("x", "y", "z"), b = c(1, 3, 5), @@ -36,8 +36,8 @@ patrick::with_parameters_test_that("melt example", ) expect_identical( - df_1$melt( - id_vars = c("a", "b"), + df_1$unpivot( + index = c("a", "b"), value_vars = c("c"), value_name = "alice", variable_name = "bob" diff --git a/tests/testthat/test-parquet.R b/tests/testthat/test-parquet.R index 
4aa8ce658..26b2869ff 100644 --- a/tests/testthat/test-parquet.R +++ b/tests/testthat/test-parquet.R @@ -94,3 +94,29 @@ test_that("write_parquet returns the input data", { x = dat$write_parquet(tmpf) expect_identical(x$to_list(), dat$to_list()) }) + +test_that("write_parquet: argument 'statistics'", { + dat = pl$DataFrame(mtcars) + tmpf = tempfile() + on.exit(unlink(tmpf)) + + expect_silent(dat$write_parquet(tmpf, statistics = TRUE)) + expect_silent(dat$write_parquet(tmpf, statistics = FALSE)) + expect_silent(dat$write_parquet(tmpf, statistics = "full")) + expect_grepl_error( + dat$write_parquet(tmpf, statistics = list(null_count = FALSE)), + "File out of specification: null count of a page is required" + ) + expect_grepl_error( + dat$write_parquet(tmpf, statistics = list(foo = TRUE, foo2 = FALSE)), + "In `statistics`, `foo`, `foo2` are not valid keys" + ) + expect_grepl_error( + dat$write_parquet(tmpf, statistics = "foo"), + "`statistics` must be TRUE/FALSE, 'full', or a named list." + ) + expect_grepl_error( + dat$write_parquet(tmpf, statistics = c(max = TRUE, min = FALSE)), + "`statistics` must be of length 1." 
+ ) +}) diff --git a/tests/testthat/test-sink_stream.R b/tests/testthat/test-sink_stream.R index 57010df61..5d676c6df 100644 --- a/tests/testthat/test-sink_stream.R +++ b/tests/testthat/test-sink_stream.R @@ -13,6 +13,37 @@ test_that("Test sinking data to parquet file", { expect_identical(x$collect()$to_list(), lf$collect()$to_list()) }) +test_that("sink_parquet: argument 'statistics'", { + tmpf = tempfile() + on.exit(unlink(tmpf)) + + expect_silent(lf$sink_parquet(tmpf, statistics = TRUE)) + expect_silent(lf$sink_parquet(tmpf, statistics = FALSE)) + expect_silent(lf$sink_parquet(tmpf, statistics = "full")) + # TODO: uncomment when https://github.com/pola-rs/polars/issues/17306 is fixed + # expect_silent(lf$sink_parquet( + # tmpf, + # statistics = list( + # min = TRUE, + # max = FALSE, + # distinct_count = TRUE, + # null_count = FALSE + # ) + # )) + expect_grepl_error( + lf$sink_parquet(tmpf, statistics = list(foo = TRUE, foo2 = FALSE)), + "In `statistics`, `foo`, `foo2` are not valid keys" + ) + expect_grepl_error( + lf$sink_parquet(tmpf, statistics = "foo"), + "`statistics` must be TRUE/FALSE, 'full', or a named list." + ) + expect_grepl_error( + lf$sink_parquet(tmpf, statistics = c(max = TRUE, min = FALSE)), + "`statistics` must be of length 1." + ) +}) + test_that("Test sinking data to IPC file", { tmpf = tempfile() on.exit(unlink(tmpf)) diff --git a/vignettes/polars.Rmd b/vignettes/polars.Rmd index 2f5d1f3b9..1d4d550d6 100644 --- a/vignettes/polars.Rmd +++ b/vignettes/polars.Rmd @@ -333,7 +333,7 @@ To go from long to wide, we use the `$pivot()` method. Here we pivot the data so that every subject takes its own column. ```{r} -indo_wide = indo$pivot(values = "conc", index = "time", columns = "Subject") +indo_wide = indo$pivot(values = "conc", index = "time", on = "Subject") indo_wide ``` @@ -341,7 +341,7 @@ To go from wide to long, we use the `$melt()` method. 
```{r} # indo_wide$melt(id_vars = "time") # default column names are "variable" and "value" -indo_wide$melt(id_vars = "time", variable_name = "subject", value_name = "conc") +indo_wide$unpivot(index = "time", variable_name = "subject", value_name = "conc") ``` Basic functionality aside, it should be noted that `$pivot()` can perform @@ -356,7 +356,7 @@ different combinations of transmission type (`am`) and engine shape (`vs`)? dat$pivot( values = "mpg", index = c("am", "vs"), - columns = "cyl", + on = "cyl", aggregate_function = "median" # aggregating function ) ``` diff --git a/vignettes/userguide.Rmd b/vignettes/userguide.Rmd index 0e7857b82..e2663d4b8 100755 --- a/vignettes/userguide.Rmd +++ b/vignettes/userguide.Rmd @@ -16,7 +16,7 @@ options(rmarkdown.html_vignette.check_title = FALSE) ``` -````{comment} +```{.r} These functions/methods are either missing, broken, or Vincent can't figure out how to use them. * `Series_shift` @@ -33,7 +33,7 @@ Requires new Polars version: * `df$sample()` * `df$describe()` -```` +``` [The Polars User Guide](https://pola-rs.github.io/polars-book/user-guide/) is a detailed tutorial about the Polars DataFrame library. Its goal is to introduce you to Polars by going through examples and comparing it to other solutions. Some design choices are introduced there. The guide also introduces you to optimal usage of Polars. 
The Polars User Guide is available at this link: @@ -271,8 +271,7 @@ dataset$ collect() ``` -````{comment} -```{r} +```{.r} compute_age = function() 2021 - pl$col("birthday")$dt$year() avg_birthday = function(gender) { @@ -341,13 +340,13 @@ q$collect() # ) # q$collect() ``` -```` + ## Folds -```{comment} +```{.r} df = pl$DataFrame( "a" = c(1, 2, 3), "b" = c(10, 20, 30) @@ -397,7 +396,7 @@ df = pl$read_csv( ) ``` -```{comment} +```{.r} df$select( "Type 1", "Type 2", @@ -408,7 +407,7 @@ df$select( ``` -```{r} +```{.r} filtered = df$ filter(pl$col("Type 2") == "Psychic")$ select(c("Name", "Type 1", "Speed")) @@ -450,9 +449,7 @@ df$sort("Type 1")$select( # List context and row wise computations -````{comment} - -``` +```{.r} grades = pl$DataFrame( "student" = c("bas", "laura", "tim", "jenny"), "arithmetic" = c(10, 5, 6, 8), @@ -480,7 +477,7 @@ grades$with_columns( # Custom functions -``` +```{.r} df = pl$DataFrame( "keys" = c("a", "a", "b"), "values" = c(10, 7, 1) @@ -520,7 +517,7 @@ out = df$select( ) print(out) ``` -```` + # R examples @@ -624,7 +621,7 @@ df$group_by("fruits")$ ``` -```{comment} +```{.r} # We can explode the list column "cars" to a new row for each element in the list df$ # sort("cars")$