Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DataFrame.describe + auto-convert String-Err to RPolarsErr #268

Merged
merged 18 commits into from
Jul 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@ Collate:
'dataframe__frame.R'
'datatype.R'
'docs.R'
'error__rpolarserr.R'
'error__string.R'
'error__trait.R'
'error_conversion.R'
'error_trait.R'
'expr__binary.R'
'expr__categorical.R'
'expr__datetime.R'
Expand All @@ -78,11 +80,9 @@ Collate:
'pkg-knitr.R'
'pkg-nanoarrow.R'
'rlang.R'
'rpolarserr.R'
'rust_result.R'
's3_methods.R'
'series__series.R'
'string_error.R'
'translation.R'
'vctrs.R'
'zzz.R'
Expand Down
132 changes: 100 additions & 32 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -708,19 +708,20 @@ DataFrame_sort = function(
#' (pl$col("Sepal.Length") + 2)$alias("add_2_SL")
#' )
DataFrame_select = function(...) {
args = list2(...)
exprs = do.call(construct_ProtoExprArray, args)
df = unwrap(.pr$DataFrame$select(self, exprs))

expr_names = names(args)
if (!is.null(expr_names)) {
old_names = df$columns
new_names = old_names
has_expr_name = nchar(expr_names) >= 1L
new_names[has_expr_name] = expr_names[has_expr_name]
df$columns = new_names
}
df
args = unpack_list(...)
.pr$DataFrame$select(self, args) |>
and_then(\(df) result(msg = "internal error while renaming columns", {
expr_names = names(args)
if (!is.null(expr_names)) {
old_names = df$columns
new_names = old_names
has_expr_name = nchar(expr_names) >= 1L
new_names[has_expr_name] = expr_names[has_expr_name]
df$columns = new_names
}
df
})) |>
unwrap("in $select()")
}

#' Drop in place
Expand Down Expand Up @@ -1389,7 +1390,72 @@ DataFrame_rename = function(...) {
self$lazy()$rename(...)$collect()
}

#' @title Summary statistics for a DataFrame
#' @param percentiles One or more percentiles to include in the summary statistics.
#' All values must be in the range `[0; 1]`.
#' @keywords DataFrame
#' @return DataFrame
#' @examples
#' pl$DataFrame(iris)$describe()
DataFrame_describe = function(percentiles = c(.25, .75)) {
perc = percentiles

# guard input
# styler: off
pcase(
is.null(perc), Ok(numeric()),
!is.numeric(perc), Err(bad_robj(perc)$mistyped("numeric")),
isFALSE(all(perc > 0) && all(perc < 1)), {
Err(bad_robj(perc)$misvalued("has all vector elements within 0 and 1"))
},
or_else = Ok(perc)
# styler: on
) |>
map_err(
\(err) err$bad_arg("percentiles")
) |>
and_then(
\(perc) {
# this polars query should always succeed else flag as ...
result(msg = "internal error", {
# make percentile expressions
perc_exprs = lapply(
perc, \(x) pl$all()$quantile(x)$prefix(paste0(as.character(x * 100), "pct:"))
)

# bundle all expressions
largs = c(
list(
pl$all()$count()$prefix("count:"),
pl$all()$null_count()$prefix("null_count:"),
pl$all()$mean()$prefix("mean:"),
pl$all()$std()$prefix("std:"),
pl$all()$min()$prefix("min:"),
pl$all()$max()$prefix("max:"),
pl$all()$median()$prefix("median:")
),
perc_exprs
)

# compute aggregates
df_aggs = do.call(self$select, largs)
e_col_row_names = pl$lit(df_aggs$columns)$str$split(":")

# pivotize
df_pivot = pl$select(
e_col_row_names$arr$first()$alias("rowname"),
e_col_row_names$arr$last()$alias("colname"),
pl$lit(unlist(as.data.frame(df_aggs)))$alias("value")
)$pivot(
values = "value", index = "rowname", columns = "colname"
)
df_pivot$columns[1] = "describe"
df_pivot
})
}
) |>
unwrap("in $describe():")
}

#' @title Glimpse values in a DataFrame
#' @keywords DataFrame
Expand Down Expand Up @@ -1428,25 +1494,27 @@ DataFrame_glimpse = function(..., return_as_string = FALSE) {
}

# construct print, flag any error as internal
output = result({
schema = self$schema
data = lapply(seq_along(schema), \(i) parse_column_(names(schema)[i], schema[[i]]))
max_col_name = max(sapply(data, \(x) nchar(x$col_name)))
max_col_dtyp = max(sapply(data, \(x) nchar(x$dtype)))
max_col_vals = 100 - max_col_name - max_col_dtyp - 3

sapply(data, \(x) {
name_filler = paste(rep(" ", max_col_name - nchar(x$col_name)), collapse = "")
dtyp_filler = paste(rep(" ", max_col_dtyp - nchar(x$dtype_str)), collapse = "")
vals_filler = paste(rep(" ", max_col_dtyp - nchar(x$dtype_str)), collapse = "")
paste0(
"& ", x$col_name, name_filler, x$dtype_str, dtyp_filler, " ",
substr(x$val_str, 1, max_col_vals), "\n"
)
}) |>
paste0(collapse = "")

}, msg = "internal error") |>
output = result(
{
schema = self$schema
data = lapply(seq_along(schema), \(i) parse_column_(names(schema)[i], schema[[i]]))
max_col_name = max(sapply(data, \(x) nchar(x$col_name)))
max_col_dtyp = max(sapply(data, \(x) nchar(x$dtype)))
max_col_vals = 100 - max_col_name - max_col_dtyp - 3

sapply(data, \(x) {
name_filler = paste(rep(" ", max_col_name - nchar(x$col_name)), collapse = "")
dtyp_filler = paste(rep(" ", max_col_dtyp - nchar(x$dtype_str)), collapse = "")
vals_filler = paste(rep(" ", max_col_dtyp - nchar(x$dtype_str)), collapse = "")
paste0(
"& ", x$col_name, name_filler, x$dtype_str, dtyp_filler, " ",
substr(x$val_str, 1, max_col_vals), "\n"
)
}) |>
paste0(collapse = "")
},
msg = "internal error"
) |>
unwrap("in $glimpse() :")

# chose return type
Expand Down
16 changes: 15 additions & 1 deletion R/rpolarserr.R → R/error__rpolarserr.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,5 +34,19 @@ to_condition.RPolarsErr = function(err) {
)
}
plain.RPolarsErr = function(err, msg) {
err$value$plain(msg)
err$plain(msg)
}

upgrade_err.RPolarsErr = function(err) { # already RPolarsErr pass through
err
}




#### ---- rpolarserr utils

# short hand for starting new error with a bad robj input
bad_robj = function(r) {
.pr$RPolarsErr$new()$bad_robj(r)
}
3 changes: 3 additions & 0 deletions R/string_error.R → R/error__string.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,6 @@ to_condition.character = function(err) {
plain.character = function(err, msg) {
NextMethod("plain", err)
}
upgrade_err.character = function(err) {
.pr$RPolarsErr$new()$plain(err)
}
27 changes: 26 additions & 1 deletion R/error_trait.R → R/error__trait.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# ANY NEW ERROR MUST IMPLEMENT THESE S3 METHODS, these are the "trait" of a polars error
# ALSO MUST IMPLEMENT BASE THESE METHODS: print
# ALSO MUST IMPLEMENT THESE BASE METHODS: print

#' Internal generic method to add call to error
#' @param err any type which impl as.character
Expand Down Expand Up @@ -86,3 +86,28 @@ plain = function(err, msg) {
plain.default = function(err, msg) {
paste0(msg, ": ", err)
}


## TODO refactor upgrade_err into as.RPolarsErr
#' Internal generic method to add plain text to error message
#' @details
#' polars converts any other error types to RPolarsErr.
#' An error type can choose to implement this to improve the translation.
#' As fall back the error will be deparsed into a string with rust Debug, see rdbg()
#' @param err some error type object
#' @param msg string to add
#' @keywords internal
#' @return condition
upgrade_err = function(err) {
UseMethod("upgrade_err", err)
}
upgrade_err.default = function(err) {
err # no upgrade found pass as is
}

# call upgrade error from internalsnamespace
# error_trait methods are internal and do not work correctly
# when called directly by user e.g. polars:::upgrade_err(polars:::RPolarsErr$new())
# calling R from rust via R! but it is a "user" call in .GlobalEnv
# by calling a package function the parent env is the internal pacakge env.
upgrade_err_internal_ns = \(x) upgrade_err(x)
20 changes: 17 additions & 3 deletions R/error_conversion.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,22 @@ unwrap_err = function(result) {
result = function(expr, msg = NULL) {
tryCatch(
Ok(expr),
error = \(cond) cond$value %||% cond$message |>
plain(msg) |>
Err()
error = \(cond) {
cond$value %||% cond$message |>
upgrade_err() |>
plain(msg) |>
Err()
}
)
}


raw_result = function(expr) {
tryCatch(
Ok(expr),
error = \(cond) {
cond$value %||% cond$message |>
Err()
}
)
}
7 changes: 4 additions & 3 deletions R/expr__expr.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,9 @@ wrap_e_result = function(e, str_to_lit = TRUE, argname = NULL) {
expr_result
}

#' wrap_elist_result
#' @description make sure all elementsof a list is wrapped as Expr
#' internal wrap_elist_result
#' @description make sure all elements of a list is wrapped as Expr
#' DEPRECATED: prefer robj_to!(VecPlExpr) on rust side
#' Capture any conversion error in the result
#' @param elist a list Expr or any R object Into<Expr> (passable to pl$lit)
#' @details
Expand All @@ -144,7 +145,7 @@ wrap_elist_result = function(elist, str_to_lit = TRUE) {
msg = if (element_i >= 1L) {
paste0("element [[", element_i, "]] of sequence not convertable into an Expr, error in:\n")
} else {
"not convertable into a list of Expr, error in:\n"
paste0(str_string(elist), " was not convertable into a list of Expr, error in:\n")
}
)
}
Expand Down
12 changes: 6 additions & 6 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ Expr$search_sorted <- function(element) .Call(wrap__Expr__search_sorted, self, e

Expr$take <- function(idx) .Call(wrap__Expr__take, self, idx)

Expr$sort_by <- function(by, descending) .Call(wrap__Expr__sort_by, self, by, descending)
Expr$sort_by <- function(by, reverse) .Call(wrap__Expr__sort_by, self, by, reverse)

Expr$backward_fill <- function(limit) .Call(wrap__Expr__backward_fill, self, limit)

Expand Down Expand Up @@ -401,7 +401,7 @@ Expr$rolling_skew <- function(window_size_f, bias) .Call(wrap__Expr__rolling_ske

Expr$abs <- function() .Call(wrap__Expr__abs, self)

Expr$rank <- function(method, descending) .Call(wrap__Expr__rank, self, method, descending)
Expr$rank <- function(method, reverse) .Call(wrap__Expr__rank, self, method, reverse)

Expr$diff <- function(n_float, null_behavior) .Call(wrap__Expr__diff, self, n_float, null_behavior)

Expand Down Expand Up @@ -489,7 +489,7 @@ Expr$lst_sum <- function() .Call(wrap__Expr__lst_sum, self)

Expr$lst_mean <- function() .Call(wrap__Expr__lst_mean, self)

Expr$lst_sort <- function(descending) .Call(wrap__Expr__lst_sort, self, descending)
Expr$lst_sort <- function(reverse) .Call(wrap__Expr__lst_sort, self, reverse)

Expr$lst_reverse <- function() .Call(wrap__Expr__lst_reverse, self)

Expand Down Expand Up @@ -985,7 +985,7 @@ Series$n_unique <- function() .Call(wrap__Series__n_unique, self)

Series$name <- function() .Call(wrap__Series__name, self)

Series$sort_mut <- function(descending) .Call(wrap__Series__sort_mut, self, descending)
Series$sort_mut <- function(reverse) .Call(wrap__Series__sort_mut, self, reverse)

Series$value_counts <- function(multithreaded, sorted) .Call(wrap__Series__value_counts, self, multithreaded, sorted)

Expand All @@ -997,7 +997,7 @@ Series$is_sorted_flag <- function() .Call(wrap__Series__is_sorted_flag, self)

Series$is_sorted_reverse_flag <- function() .Call(wrap__Series__is_sorted_reverse_flag, self)

Series$is_sorted <- function(descending, nulls_last) .Call(wrap__Series__is_sorted, self, descending, nulls_last)
Series$is_sorted <- function(reverse, nulls_last) .Call(wrap__Series__is_sorted, self, reverse, nulls_last)

Series$series_equal <- function(other, null_equal, strict) .Call(wrap__Series__series_equal, self, other, null_equal, strict)

Expand Down Expand Up @@ -1061,7 +1061,7 @@ Series$cumsum <- function(reverse) .Call(wrap__Series__cumsum, self, reverse)

Series$to_frame <- function() .Call(wrap__Series__to_frame, self)

Series$set_sorted_mut <- function(descending) invisible(.Call(wrap__Series__set_sorted_mut, self, descending))
Series$set_sorted_mut <- function(reverse) invisible(.Call(wrap__Series__set_sorted_mut, self, reverse))

Series$from_arrow <- function(name, array) .Call(wrap__Series__from_arrow, name, array)

Expand Down
2 changes: 1 addition & 1 deletion R/functions__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ pl$col = function(name = "", ...) {
}
}
# TODO implement series, DataType
stopf("not supported implement input")
stopf(paste("cannot make a column expression from:", str_string(name)))
}

#' an element in 'eval'-expr
Expand Down
7 changes: 4 additions & 3 deletions R/lazyframe__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -215,8 +215,9 @@ LazyFrame_describe_plan = "use_extendr_wrapper"
#' @param ... any single Expr or string naming a column
#' @return A new `LazyFrame` object with applied filter.
LazyFrame_select = function(...) {
pra = construct_ProtoExprArray(...)
.pr$LazyFrame$select(self, pra)
args = unpack_list(...)
.pr$LazyFrame$select(self, args) |>
unwrap("in $select()")
}

#' @title Lazy with columns
Expand Down Expand Up @@ -255,7 +256,7 @@ LazyFrame_filter = "use_extendr_wrapper"
#' @return collected `DataFrame`
#' @examples pl$DataFrame(iris)$lazy()$filter(pl$col("Species") == "setosa")$collect()
LazyFrame_collect = function() {
unwrap(.pr$LazyFrame$collect(self), "in $collect():")
unwrap(.pr$LazyFrame$collect_handled(self), "in $collect():")
}

#' @title New DataFrame from LazyFrame_object$collect()
Expand Down
2 changes: 0 additions & 2 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,6 @@ unpack_list = function(...) {
}
}



#' Simple SQL CASE WHEN implementation for R
#'
#' @description Inspired by data.table::fcase + dplyr::case_when.
Expand Down
Loading