From 3bcce4d4bc0b5aadbdeb99ceffce4000f40e4291 Mon Sep 17 00:00:00 2001 From: eitsupi <50911393+eitsupi@users.noreply.github.com> Date: Sat, 18 Nov 2023 19:46:06 +0900 Subject: [PATCH] feat!: add `as_polars_df` and `as_polars_lf` generic functions (#519) --- DESCRIPTION | 1 + NAMESPACE | 12 +++ NEWS.md | 2 + R/as_polars.R | 156 ++++++++++++++++++++++++++++++++ R/convert.R | 8 +- R/lazyframe__lazy.R | 3 + man/LazyFrame_collect.Rd | 3 + man/LazyFrame_fetch.Rd | 3 + man/as_polars_df.Rd | 124 +++++++++++++++++++++++++ man/as_polars_lf.Rd | 27 ++++++ man/pl_from_arrow.Rd | 2 + src/rust/src/utils/mod.rs | 47 ++-------- tests/testthat/test-as_polars.R | 35 +++++++ tests/testthat/test-concat.R | 2 +- 14 files changed, 382 insertions(+), 43 deletions(-) create mode 100644 R/as_polars.R create mode 100644 man/as_polars_df.Rd create mode 100644 man/as_polars_lf.Rd create mode 100644 tests/testthat/test-as_polars.R diff --git a/DESCRIPTION b/DESCRIPTION index 877bd7248..10f6db44d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -66,6 +66,7 @@ Collate: 'after-wrappers.R' 'Field.R' 'PTime.R' + 'as_polars.R' 'autocompletion.R' 'construction.R' 'convert.R' diff --git a/NAMESPACE b/NAMESPACE index a1f39a062..63bf32a95 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -107,6 +107,16 @@ S3method(as.list,rpolars_raw_list) S3method(as.matrix,DataFrame) S3method(as.matrix,LazyFrame) S3method(as.vector,Series) +S3method(as_polars_df,ArrowTabular) +S3method(as_polars_df,DataFrame) +S3method(as_polars_df,GroupBy) +S3method(as_polars_df,LazyFrame) +S3method(as_polars_df,RecordBatchReader) +S3method(as_polars_df,Series) +S3method(as_polars_df,data.frame) +S3method(as_polars_df,default) +S3method(as_polars_lf,LazyFrame) +S3method(as_polars_lf,default) S3method(as_polars_series,POSIXlt) S3method(as_polars_series,default) S3method(as_polars_series,vctrs_rcrd) @@ -161,6 +171,8 @@ S3method(tail,LazyFrame) S3method(unique,DataFrame) S3method(unique,LazyFrame) export(.pr) +export(as_polars_df) +export(as_polars_lf) export(as_polars_series) export(knit_print.DataFrame) export(pl) diff --git a/NEWS.md b/NEWS.md index 51f4c3533..518b3dd53 100644 --- a/NEWS.md +++ b/NEWS.md @@ -37,6 +37,8 @@ is aimed for r-polars extensions, and will be kept stable as much as possible (#504). - New functions `pl$min_horizontal()`, `pl$max_horizontal()`, `pl$sum_horizontal()`, `pl$all_horizontal()`, `pl$any_horizontal()` (#508). +- New generic functions `as_polars_df()` and `as_polars_lf()` to create polars DataFrames + and LazyFrames (#519). # polars 0.10.1 diff --git a/R/as_polars.R b/R/as_polars.R new file mode 100644 index 000000000..67bb6ad7b --- /dev/null +++ b/R/as_polars.R @@ -0,0 +1,156 @@ +#' To polars DataFrame +#' +#' [as_polars_df()] is a generic function that converts an R object to a +#' polars DataFrame. It is basically a wrapper for [pl$DataFrame()][pl_DataFrame], +#' but has special implementations for Apache Arrow-based objects such as +#' polars [LazyFrame][LazyFrame_class] and [arrow::Table]. +#' +#' For [LazyFrame][LazyFrame_class] objects, this function is a shortcut for +#' [$collect()][LazyFrame_collect] or [$fetch()][LazyFrame_fetch], depending on +#' whether the number of rows to fetch is infinite or not. +#' @rdname as_polars_df +#' @param x Object to convert to a polars DataFrame. +#' @param ... Additional arguments passed to methods. +#' @examplesIf requireNamespace("arrow", quietly = TRUE) +#' at = arrow::as_arrow_table(mtcars) +#' +#' # Convert an arrow Table to a polars LazyFrame +#' lf = as_polars_df(at)$lazy() +#' +#' # Collect all rows +#' as_polars_df(lf) +#' +#' # Fetch 5 rows +#' as_polars_df(lf, 5) +#' @export +as_polars_df = function(x, ...) { + UseMethod("as_polars_df") +} + + +#' @rdname as_polars_df +#' @export +as_polars_df.default = function(x, ...) { + as_polars_df(as.data.frame(x, stringsAsFactors = FALSE), ...) +} + + +#' @rdname as_polars_df +#' @export +as_polars_df.data.frame = function(x, ...) { + pl$DataFrame(x) +} + + +#' @rdname as_polars_df +#' @export +as_polars_df.DataFrame = function(x, ...) { + x +} + + +#' @rdname as_polars_df +#' @export +as_polars_df.GroupBy = function(x, ...) { + x$to_data_frame() +} + + +#' @rdname as_polars_df +#' @export +as_polars_df.Series = function(x, ...) { + pl$DataFrame(x) +} + + +#' @rdname as_polars_df +#' @param n_rows Number of rows to fetch. Defaults to `Inf`, meaning all rows. +#' @inheritParams LazyFrame_collect +#' @export +as_polars_df.LazyFrame = function( + x, + n_rows = Inf, + ..., + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE, + no_optimization = FALSE, + inherit_optimization = FALSE, + collect_in_background = FALSE) { + # capture all args and modify some to match lower level function + args = as.list(environment()) + args$... = list(...) + + if (is.infinite(args$n_rows)) { + args$n_rows = NULL + .fn = x$collect + } else { + args$collect_in_background = NULL + .fn = x$fetch + } + + args$x = NULL + check_no_missing_args(.fn, args) + do.call(.fn, args) +} + + +#' @rdname as_polars_df +#' @inheritParams pl_from_arrow +#' @export +as_polars_df.ArrowTabular = function( + x, + ..., + rechunk = TRUE, + schema = NULL, + schema_overrides = NULL) { + pl$from_arrow( + x, + ..., + rechunk = rechunk, + schema = schema, + schema_overrides = schema_overrides + ) +} + + +#' @rdname as_polars_df +#' @export +as_polars_df.RecordBatchReader = as_polars_df.ArrowTabular + + +# TODO: as_polars_df.nanoarrow_array_stream + + +#' To polars LazyFrame +#' +#' [as_polars_lf()] is a generic function that converts an R object to a +#' polars LazyFrame. It is basically a shortcut for [as_polars_df(x, ...)][as_polars_df] with the +#' [$lazy()][DataFrame_lazy] method. +#' @rdname as_polars_lf +#' @inheritParams as_polars_df +#' @examples +#' as_polars_lf(mtcars) +#' @export +as_polars_lf = function(x, ...) { + UseMethod("as_polars_lf") +} + + +#' @rdname as_polars_lf +#' @export +as_polars_lf.default = function(x, ...) { + as_polars_df(x, ...)$lazy() +} + + +#' @rdname as_polars_lf +#' @export +as_polars_lf.LazyFrame = function(x, ...) { + x +} diff --git a/R/convert.R b/R/convert.R index 7f6a82bdf..d7177964a 100644 --- a/R/convert.R +++ b/R/convert.R @@ -2,6 +2,7 @@ #' @description import Arrow Table or Array #' @name pl_from_arrow #' @param data arrow Table or Array or ChunkedArray +#' @param ... Ignored. #' @param rechunk bool rewrite in one array per column, Implemented for ChunkedArray #' Array is already contiguous. Not implemented for Table. C #' @param schema named list of DataTypes or char vec of names. Same length as arrow table. @@ -24,7 +25,12 @@ #' data = arrow::arrow_table(iris), #' schema = char_schema #' ) -pl$from_arrow = function(data, rechunk = TRUE, schema = NULL, schema_overrides = NULL) { +pl$from_arrow = function( + data, + ..., + rechunk = TRUE, + schema = NULL, + schema_overrides = NULL) { if (!requireNamespace("arrow", quietly = TRUE)) { stop("in pl$from_arrow: cannot import from arrow without R package arrow installed") } diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index bb26ee0a7..13a697dac 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -338,6 +338,7 @@ LazyFrame_set_optimization_toggle = function( #' @description `$collect()` performs the query on the LazyFrame. It returns a #' DataFrame #' @inheritParams LazyFrame_set_optimization_toggle +#' @param ... Ignored. #' @param no_optimization Boolean. Sets the following parameters to `FALSE`: #' `predicate_pushdown`, `projection_pushdown`, `slice_pushdown`, #' `comm_subplan_elim`, `comm_subexpr_elim`. @@ -364,6 +365,7 @@ LazyFrame_set_optimization_toggle = function( #' - [`$sink_ipc()`][LazyFrame_sink_ipc()] streams query to a arrow file. LazyFrame_collect = function( + ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, @@ -1311,6 +1313,7 @@ LazyFrame_dtypes = method_as_property(function() { #' fetch(3) LazyFrame_fetch = function( n_rows = 500, + ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, diff --git a/man/LazyFrame_collect.Rd b/man/LazyFrame_collect.Rd index 69b8e4a82..bbff0f7bd 100644 --- a/man/LazyFrame_collect.Rd +++ b/man/LazyFrame_collect.Rd @@ -5,6 +5,7 @@ \title{Collect a query into a DataFrame} \usage{ LazyFrame_collect( + ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, @@ -19,6 +20,8 @@ LazyFrame_collect( ) } \arguments{ +\item{...}{Ignored.} + \item{type_coercion}{Boolean. Coerce types such that operations succeed and run on minimal required memory.} diff --git a/man/LazyFrame_fetch.Rd b/man/LazyFrame_fetch.Rd index 88c5102ba..ffa5dc407 100644 --- a/man/LazyFrame_fetch.Rd +++ b/man/LazyFrame_fetch.Rd @@ -6,6 +6,7 @@ \usage{ LazyFrame_fetch( n_rows = 500, + ..., type_coercion = TRUE, predicate_pushdown = TRUE, projection_pushdown = TRUE, @@ -21,6 +22,8 @@ LazyFrame_fetch( \arguments{ \item{n_rows}{Integer. Maximum number of rows to fetch.} +\item{...}{Ignored.} + \item{type_coercion}{Boolean. Coerce types such that operations succeed and run on minimal required memory.} diff --git a/man/as_polars_df.Rd b/man/as_polars_df.Rd new file mode 100644 index 000000000..0ac178492 --- /dev/null +++ b/man/as_polars_df.Rd @@ -0,0 +1,124 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_polars.R +\name{as_polars_df} +\alias{as_polars_df} +\alias{as_polars_df.default} +\alias{as_polars_df.data.frame} +\alias{as_polars_df.DataFrame} +\alias{as_polars_df.GroupBy} +\alias{as_polars_df.Series} +\alias{as_polars_df.LazyFrame} +\alias{as_polars_df.ArrowTabular} +\alias{as_polars_df.RecordBatchReader} +\title{To polars DataFrame} +\usage{ +as_polars_df(x, ...) + +\method{as_polars_df}{default}(x, ...) + +\method{as_polars_df}{data.frame}(x, ...) + +\method{as_polars_df}{DataFrame}(x, ...) + +\method{as_polars_df}{GroupBy}(x, ...) + +\method{as_polars_df}{Series}(x, ...) + +\method{as_polars_df}{LazyFrame}( + x, + n_rows = Inf, + ..., + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE, + no_optimization = FALSE, + inherit_optimization = FALSE, + collect_in_background = FALSE +) + +\method{as_polars_df}{ArrowTabular}(x, ..., rechunk = TRUE, schema = NULL, schema_overrides = NULL) + +\method{as_polars_df}{RecordBatchReader}(x, ..., rechunk = TRUE, schema = NULL, schema_overrides = NULL) +} +\arguments{ +\item{x}{Object to convert to a polars DataFrame.} + +\item{...}{Additional arguments passed to methods.} + +\item{n_rows}{Number of rows to fetch. Defaults to \code{Inf}, meaning all rows.} + +\item{type_coercion}{Boolean. Coerce types such that operations succeed and +run on minimal required memory.} + +\item{predicate_pushdown}{Boolean. Applies filters as early as possible at +scan level.} + +\item{projection_pushdown}{Boolean. Select only the columns that are needed +at the scan level.} + +\item{simplify_expression}{Boolean. Various optimizations, such as constant +folding and replacing expensive operations with faster alternatives.} + +\item{slice_pushdown}{Boolean. Only load the required slice from the scan +level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} + +\item{comm_subplan_elim}{Boolean. Will try to cache branching subplans that +occur on self-joins or unions.} + +\item{comm_subexpr_elim}{Boolean. Common subexpressions will be cached and +reused.} + +\item{streaming}{Boolean. Run parts of the query in a streaming fashion +(this is in an alpha state).} + +\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: +\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, +\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} + +\item{inherit_optimization}{Boolean. Use existing optimization settings +regardless the settings specified in this function call.} + +\item{collect_in_background}{Boolean. Detach this query from R session. +Computation will start in background. Get a handle which later can be converted +into the resulting DataFrame. Useful in interactive mode to not lock R session.} + +\item{rechunk}{bool rewrite in one array per column, Implemented for ChunkedArray +Array is already contiguous. Not implemented for Table. C} + +\item{schema}{named list of DataTypes or char vec of names. Same length as arrow table. +If schema names or types do not match arrow table, the columns will be renamed/recast. +NULL default is to import columns as is. Takes no effect for Array or ChunkedArray} + +\item{schema_overrides}{named list of DataTypes. Name some columns to recast by the DataType. +Takes not effect for Array or ChunkedArray} +} +\description{ +\code{\link[=as_polars_df]{as_polars_df()}} is a generic function that converts an R object to a +polars DataFrame. It is basically a wrapper for \link[=pl_DataFrame]{pl$DataFrame()}, +but has special implementations for Apache Arrow-based objects such as +polars \link[=LazyFrame_class]{LazyFrame} and \link[arrow:Table-class]{arrow::Table}. +} +\details{ +For \link[=LazyFrame_class]{LazyFrame} objects, this function is a shortcut for +\link[=LazyFrame_collect]{$collect()} or \link[=LazyFrame_fetch]{$fetch()}, depending on +whether the number of rows to fetch is infinite or not. +} +\examples{ +\dontshow{if (requireNamespace("arrow", quietly = TRUE)) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +at = arrow::as_arrow_table(mtcars) + +# Convert an arrow Table to a polars LazyFrame +lf = as_polars_df(at)$lazy() + +# Collect all rows +as_polars_df(lf) + +# Fetch 5 rows +as_polars_df(lf, 5) +\dontshow{\}) # examplesIf} +} diff --git a/man/as_polars_lf.Rd b/man/as_polars_lf.Rd new file mode 100644 index 000000000..f79a9ba12 --- /dev/null +++ b/man/as_polars_lf.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_polars.R +\name{as_polars_lf} +\alias{as_polars_lf} +\alias{as_polars_lf.default} +\alias{as_polars_lf.LazyFrame} +\title{To polars LazyFrame} +\usage{ +as_polars_lf(x, ...) + +\method{as_polars_lf}{default}(x, ...) + +\method{as_polars_lf}{LazyFrame}(x, ...) +} +\arguments{ +\item{x}{Object to convert to a polars DataFrame.} + +\item{...}{Additional arguments passed to methods.} +} +\description{ +\code{\link[=as_polars_lf]{as_polars_lf()}} is a generic function that converts an R object to a +polars LazyFrame. It is basically a shortcut for \link[=as_polars_df]{as_polars_df(x, ...)} with the +\link[=DataFrame_lazy]{$lazy()} method. +} +\examples{ +as_polars_lf(mtcars) +} diff --git a/man/pl_from_arrow.Rd b/man/pl_from_arrow.Rd index 1a86f3729..ef4b98e1c 100644 --- a/man/pl_from_arrow.Rd +++ b/man/pl_from_arrow.Rd @@ -7,6 +7,8 @@ \arguments{ \item{data}{arrow Table or Array or ChunkedArray} +\item{...}{Ignored.} + \item{rechunk}{bool rewrite in one array per column, Implemented for ChunkedArray Array is already contiguous. Not implemented for Table. C} diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index 4e9cbb9a2..70ed0976d 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -812,28 +812,9 @@ pub fn robj_to_lazyframe(robj: extendr_api::Robj) -> RResult { // closure to allow ?-convert extendr::Result to RResult let res = || -> RResult { - match () { - // allow input as a DataFrame - _ if robj.inherits("DataFrame") => { - let extptr_df: ExternalPtr = robj.try_into()?; - Ok(extptr_df.lazy()) - } - _ if robj.inherits("LazyFrame") => { - let lf: ExternalPtr = robj.try_into()?; - let lf = LazyFrame(lf.0.clone()); - Ok(lf) - } - _ if robj.inherits("data.frame") => { - let df = unpack_r_eval(R!("polars:::result(polars::pl$DataFrame({{robj}}))"))?; - let extptr_df: ExternalPtr = df.try_into()?; - Ok(extptr_df.lazy()) - } - _ => Ok(DataFrame::new_with_capacity(1) - .lazy() - .0 - .select(&[robj_to_rexpr(robj, true)?.0])) - .map(LazyFrame), - } + let lf: ExternalPtr = + (unpack_r_eval(R!("polars:::result(polars::as_polars_lf({{robj}}))"))?).try_into()?; + Ok(LazyFrame(lf.0.clone())) }(); res.bad_val(rv).mistyped(tn::()) @@ -845,25 +826,9 @@ pub fn robj_to_dataframe(robj: extendr_api::Robj) -> RResult { // closure to allow ?-convert extendr::Result to RResult let res = || -> RResult { - match () { - // allow input as a DataFrame - _ if robj.inherits("DataFrame") => { - let extptr_df: ExternalPtr = robj.try_into()?; - Ok(extptr_df.0.clone()) - } - _ if robj.inherits("data.frame") => { - let df = unpack_r_eval(R!("polars:::result(polars::pl$DataFrame({{robj}}))"))?; - let extptr_df: ExternalPtr = df.try_into()?; - Ok(extptr_df.0.clone()) - } - _ => DataFrame::new_with_capacity(1) - .lazy() - .0 - .select(&[robj_to_rexpr(robj, true)?.0]) - .collect(), - } - .map(DataFrame) - .map_err(polars_to_rpolars_err) + let df: ExternalPtr = + (unpack_r_eval(R!("polars:::result(polars::as_polars_df({{robj}}))"))?).try_into()?; + Ok(DataFrame(df.0.clone())) }(); res.bad_val(rdbg(robj_clone)) diff --git a/tests/testthat/test-as_polars.R b/tests/testthat/test-as_polars.R new file mode 100644 index 000000000..63a2dad5e --- /dev/null +++ b/tests/testthat/test-as_polars.R @@ -0,0 +1,35 @@ +test_df = data.frame( + "col_int" = 1L:10L, + "col_dbl" = (1:10) / 10, + "col_chr" = letters[1:10], + "col_lgl" = rep_len(c(TRUE, FALSE, NA), 10) +) + +make_cases = function() { + tibble::tribble( + ~.test_name, ~x, + "data.frame", test_df, + "plsf", pl$LazyFrame(test_df), + "plgroupby", pl$DataFrame(test_df)$group_by("col_int"), + "arrow Table", arrow::as_arrow_table(test_df) + ) +} + +patrick::with_parameters_test_that("as_polars_df S3 methods", + { + skip_if_not_installed("arrow") + + actual = as.data.frame(as_polars_df(x)) + expected = as.data.frame(pl$DataFrame(test_df)) + + expect_equal(actual, expected) + }, + .cases = make_cases() +) + + +test_that("as_polars_lf S3 method", { + skip_if_not_installed("arrow") + at = arrow::as_arrow_table(test_df) + expect_s3_class(as_polars_lf(at), "LazyFrame") +}) diff --git a/tests/testthat/test-concat.R b/tests/testthat/test-concat.R index 90e028d89..18fe57c00 100644 --- a/tests/testthat/test-concat.R +++ b/tests/testthat/test-concat.R @@ -89,7 +89,7 @@ test_that("concat dataframe", { # can concat Series expect_identical( pl$concat(1:5, pl$Series(5:1, "b"), how = "horizontal")$to_list(), - list(1:5, b = 5:1) + list(x = 1:5, b = 5:1) )