From 0679e05933d6ebe3984ea4daaa7f7f9b272ba4a6 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Mon, 10 Jul 2023 16:22:56 +0200 Subject: [PATCH 01/13] fetch init --- R/dataframe__frame.R | 10 ++++++++++ R/extendr-wrappers.R | 2 ++ R/lazyframe__lazy.R | 13 +++++++++++++ man/DataFrame_fetch.Rd | 21 +++++++++++++++++++++ man/LazyFrame_fetch.Rd | 21 +++++++++++++++++++++ src/rust/Cargo.lock | 6 +++--- src/rust/src/lazy/dataframe.rs | 14 ++++++++++++-- 7 files changed, 82 insertions(+), 5 deletions(-) create mode 100644 man/DataFrame_fetch.Rd create mode 100644 man/LazyFrame_fetch.Rd diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 9b4e67dbd..d0ed4a252 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1527,3 +1527,13 @@ DataFrame_glimpse = function(..., return_as_string = FALSE) { # chose return type if (return_as_string) output else invisible(cat(output)) } + + +#' @title Fetch limited number of rows of DataFrame +#' @keywords DataFrame +#' @inherit LazyFrame_fetch +#' @examples +#' pl$DataFrame(iris)$fetch(2) +DataFrame_fetch = function(n_rows) { + self$lazy()$fetch(n_rows) +} diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index cde638634..b71e57175 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -945,6 +945,8 @@ LazyFrame$rename <- function(existing, new) .Call(wrap__LazyFrame__rename, self, LazyFrame$schema <- function() .Call(wrap__LazyFrame__schema, self) +LazyFrame$fetch <- function(n_rows) .Call(wrap__LazyFrame__fetch, self, n_rows) + #' @export `$.LazyFrame` <- function (self, name) { func <- LazyFrame[[name]]; environment(func) <- environment(); func } diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index e82f8466b..7d49f49ff 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -911,3 +911,16 @@ LazyFrame_dtypes = method_as_property(function() { result() |> unwrap("in $dtypes()") }) + + +#' @title Dtypes +#' @description Get rows +#' @keywords LazyFrame +#' @param n_rows number of rows to fetch at maximum. +#' @return A DataFrame of maximum n_rows +#' @examples +#' pl$LazyFrame(irirs)$fetch(3) +LazyFrame_fetch = function(n_rows = 500) { + .pr$LazyFrame$fetch(self, n_rows) |> + unwrap("in $fetch()") +} diff --git a/man/DataFrame_fetch.Rd b/man/DataFrame_fetch.Rd new file mode 100644 index 000000000..3b733e0c7 --- /dev/null +++ b/man/DataFrame_fetch.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dataframe__frame.R +\name{DataFrame_fetch} +\alias{DataFrame_fetch} +\title{Fetch limited number of rows of DataFrame} +\usage{ +DataFrame_fetch(n_rows) +} +\arguments{ +\item{n_rows}{number of rows to fetch at maximum.} +} +\value{ +A DataFrame of maximum n_rows +} +\description{ +Get rows +} +\examples{ +pl$DataFrame(iris)$fetch(2) +} +\keyword{DataFrame} diff --git a/man/LazyFrame_fetch.Rd b/man/LazyFrame_fetch.Rd new file mode 100644 index 000000000..36d1e85f3 --- /dev/null +++ b/man/LazyFrame_fetch.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe__lazy.R +\name{LazyFrame_fetch} +\alias{LazyFrame_fetch} +\title{Dtypes} +\usage{ +LazyFrame_fetch(n_rows = 500) +} +\arguments{ +\item{n_rows}{number of rows to fetch at maximum.} +} +\value{ +A DataFrame of maximum n_rows +} +\description{ +Get rows +} +\examples{ +pl$LazyFrame(irirs)$fetch(3) +} +\keyword{LazyFrame} diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 34bc2b9e5..2c8c0dee3 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -497,7 +497,7 @@ checksum = "0198b9d0078e0f30dedc7acbb21c974e838fc8fae3ee170128658a98cb2c1c04" [[package]] name = "extendr-api" version = "0.4.0" -source = "git+https://github.com/extendr/extendr?rev=refs/pull/581/head#ee8d3754aba4ffc0e6740d1abbd257f7b62285cf" +source = "git+https://github.com/rpolars/extendr?branch=pl0.7.0rc#ee8d3754aba4ffc0e6740d1abbd257f7b62285cf" dependencies = [ "extendr-engine", "extendr-macros", @@ -509,7 +509,7 @@ dependencies = [ [[package]] name = "extendr-engine" version = "0.4.0" -source = "git+https://github.com/extendr/extendr?rev=refs/pull/581/head#ee8d3754aba4ffc0e6740d1abbd257f7b62285cf" +source = "git+https://github.com/rpolars/extendr?branch=pl0.7.0rc#ee8d3754aba4ffc0e6740d1abbd257f7b62285cf" dependencies = [ "libR-sys", ] @@ -517,7 +517,7 @@ dependencies = [ [[package]] name = "extendr-macros" version = "0.4.0" -source = "git+https://github.com/extendr/extendr?rev=refs/pull/581/head#ee8d3754aba4ffc0e6740d1abbd257f7b62285cf" +source = "git+https://github.com/rpolars/extendr?branch=pl0.7.0rc#ee8d3754aba4ffc0e6740d1abbd257f7b62285cf" dependencies = [ "proc-macro2", "quote", diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index c8a622788..d0e0812e5 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -1,6 +1,7 @@ use crate::concurrent::{handle_thread_r_requests, PolarsBackgroundHandle}; use crate::conversion::strings_to_smartstrings; use crate::lazy::dsl::*; +use crate::rdataframe::DataFrame; use crate::rdatatype::new_join_type; use crate::rdatatype::new_quantile_interpolation_option; use crate::rdatatype::new_unique_keep_strategy; @@ -65,7 +66,7 @@ impl LazyFrame { PolarsBackgroundHandle::new(self) } - pub fn collect(&self) -> Result { + pub fn collect(&self) -> Result { handle_thread_r_requests(self.clone().0).map_err(|err| { //improve err messages let err_string = match err { @@ -79,7 +80,7 @@ impl LazyFrame { }) } - pub fn collect_handled(&self) -> crate::rpolarserr::RResult { + pub fn collect_handled(&self) -> RResult { use crate::rpolarserr::WithRctx; handle_thread_r_requests(self.clone().0).when("calling $collect() on LazyFrame") } @@ -377,6 +378,15 @@ impl LazyFrame { pairs.map(|(name, ty)| (name, RPolarsDataType(ty.clone()))), )) } + + fn fetch(&self, n_rows: Robj) -> RResult { + Ok(self + .0 + .clone() + .fetch(robj_to!(usize, n_rows)?) + .map_err(crate::rpolarserr::polars_to_rpolars_err)? + .into()) + } } #[derive(Clone)] From 48b953d4a174f457ad8b581483e4920460620f2a Mon Sep 17 00:00:00 2001 From: sorhawell Date: Tue, 8 Aug 2023 15:38:58 +0200 Subject: [PATCH 02/13] solve conflicts + drop DataFrame_fetch --- R/dataframe__frame.R | 10 ---------- R/extendr-wrappers.R | 4 +--- R/lazyframe__lazy.R | 5 ++--- src/rust/src/lazy/dataframe.rs | 2 +- 4 files changed, 4 insertions(+), 17 deletions(-) diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index 87a7cbe0c..46f5e3e0a 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -1548,15 +1548,6 @@ DataFrame_glimpse = function(..., return_as_string = FALSE) { } -<<<<<<< HEAD -#' @title Fetch limited number of rows of DataFrame -#' @keywords DataFrame -#' @inherit LazyFrame_fetch -#' @examples -#' pl$DataFrame(iris)$fetch(2) -DataFrame_fetch = function(n_rows) { - self$lazy()$fetch(n_rows) -======= #' @inherit LazyFrame_explode title params #' #' @keywords DataFrame @@ -1571,5 +1562,4 @@ DataFrame_fetch = function(n_rows) { #' df$explode("numbers") DataFrame_explode = function(columns, ...) { self$lazy()$explode(columns, ...)$collect() ->>>>>>> main } diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 5965c0251..ea2c107a1 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -965,13 +965,11 @@ LazyFrame$rename <- function(existing, new) .Call(wrap__LazyFrame__rename, self, LazyFrame$schema <- function() .Call(wrap__LazyFrame__schema, self) -<<<<<<< HEAD LazyFrame$fetch <- function(n_rows) .Call(wrap__LazyFrame__fetch, self, n_rows) -======= + LazyFrame$explode <- function(columns, dotdotdot_args) .Call(wrap__LazyFrame__explode, self, columns, dotdotdot_args) LazyFrame$clone_see_me_macro <- function() .Call(wrap__LazyFrame__clone_see_me_macro, self) ->>>>>>> main #' @export `$.LazyFrame` <- function (self, name) { func <- LazyFrame[[name]]; environment(func) <- environment(); func } diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index e6abc2e81..a701153c6 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -926,7 +926,6 @@ LazyFrame_dtypes = method_as_property(function() { unwrap("in $dtypes()") }) -<<<<<<< HEAD #' @title Dtypes #' @description Get rows @@ -938,7 +937,8 @@ LazyFrame_dtypes = method_as_property(function() { LazyFrame_fetch = function(n_rows = 500) { .pr$LazyFrame$fetch(self, n_rows) |> unwrap("in $fetch()") -======= +} + #' @title Explode the DataFrame to long format by exploding the given columns #' @keywords LazyFrame #' @@ -970,5 +970,4 @@ LazyFrame_explode = function(columns = list(), ...) { LazyFrame_clone = function() { .pr$LazyFrame$clone_see_me_macro(self) ->>>>>>> main } diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 5fe7b335c..07a088a51 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -393,7 +393,6 @@ impl LazyFrame { )) } - fn fetch(&self, n_rows: Robj) -> RResult { Ok(self .0 @@ -401,6 +400,7 @@ impl LazyFrame { .fetch(robj_to!(usize, n_rows)?) .map_err(crate::rpolarserr::polars_to_rpolars_err)? .into()) + } fn explode(&self, columns: Robj, dotdotdot_args: Robj) -> RResult { let mut columns: Vec = robj_to!(Vec, PLExprCol, columns)?; From 33fdb3054206d92cdc20a57cdabf0b04c3c8d0b6 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Tue, 8 Aug 2023 15:43:25 +0200 Subject: [PATCH 03/13] unit-test + roxygen --- man/DataFrame_fetch.Rd | 21 --------------------- man/pl_corr.Rd | 2 +- man/pl_cov.Rd | 2 +- man/pl_rolling_corr.Rd | 2 +- man/pl_rolling_cov.Rd | 2 +- tests/testthat/test-lazy.R | 10 ++++++++++ 6 files changed, 14 insertions(+), 25 deletions(-) delete mode 100644 man/DataFrame_fetch.Rd diff --git a/man/DataFrame_fetch.Rd b/man/DataFrame_fetch.Rd deleted file mode 100644 index 3b733e0c7..000000000 --- a/man/DataFrame_fetch.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dataframe__frame.R -\name{DataFrame_fetch} -\alias{DataFrame_fetch} -\title{Fetch limited number of rows of DataFrame} -\usage{ -DataFrame_fetch(n_rows) -} -\arguments{ -\item{n_rows}{number of rows to fetch at maximum.} -} -\value{ -A DataFrame of maximum n_rows -} -\description{ -Get rows -} -\examples{ -pl$DataFrame(iris)$fetch(2) -} -\keyword{DataFrame} diff --git a/man/pl_corr.Rd b/man/pl_corr.Rd index 48ca21163..8c3a011fa 100644 --- a/man/pl_corr.Rd +++ b/man/pl_corr.Rd @@ -23,6 +23,6 @@ Expr for the computed correlation Calculates the correlation between two columns } \examples{ -lf <- pl$LazyFrame(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) +lf = pl$LazyFrame(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$corr("a", "b", method = "spearman"))$collect() } diff --git a/man/pl_cov.Rd b/man/pl_cov.Rd index b7ba29486..402c7c324 100644 --- a/man/pl_cov.Rd +++ b/man/pl_cov.Rd @@ -15,7 +15,7 @@ Expr for the computed covariance Calculates the covariance between two columns / expressions. } \examples{ -lf <- pl$LazyFrame(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) +lf = pl$LazyFrame(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$cov("a", "b"))$collect() pl$cov(c(1, 8, 3), c(4, 5, 2))$to_r() } diff --git a/man/pl_rolling_corr.Rd b/man/pl_rolling_corr.Rd index 0a76e3248..79ceec9f8 100644 --- a/man/pl_rolling_corr.Rd +++ b/man/pl_rolling_corr.Rd @@ -22,6 +22,6 @@ Expr for the computed rolling correlation Calculates the rolling correlation between two columns } \examples{ -lf <- pl$LazyFrame(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) +lf = pl$LazyFrame(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$rolling_corr("a", "b", window_size = 2))$collect() } diff --git a/man/pl_rolling_cov.Rd b/man/pl_rolling_cov.Rd index ac7b5d520..98de07f68 100644 --- a/man/pl_rolling_cov.Rd +++ b/man/pl_rolling_cov.Rd @@ -22,6 +22,6 @@ Expr for the computed rolling covariance Calculates the rolling covariance between two columns } \examples{ -lf <- pl$LazyFrame(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) +lf = pl$LazyFrame(data.frame(a = c(1, 8, 3), b = c(4, 5, 2))) lf$select(pl$rolling_cov("a", "b", window_size = 2))$collect() } diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 6bd0ac78a..692f98918 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -655,3 +655,13 @@ test_that("cloning", { expect_identical(pf$collect()$to_data_frame(), pf2$collect()$to_data_frame()) expect_different(pl$mem_address(pf), pl$mem_address(pf2)) }) + + + +test_that("fetch", { + lf = pl$LazyFrame(a = 1:10, b = letters[10:1]) + expect_identical( + lf$fetch(5)$to_list(), + lf$slice(0,5)$collect()$to_list() + ) +}) From cc128c142baa196834126313acf5723ba415f95c Mon Sep 17 00:00:00 2001 From: sorhawell Date: Thu, 10 Aug 2023 10:52:10 +0200 Subject: [PATCH 04/13] support fetch with R + extra unit tests + docs --- R/lazyframe__lazy.R | 41 +++++++++++++++++++++++++++++----- man/LazyFrame_collect.Rd | 9 ++++++++ man/LazyFrame_fetch.Rd | 30 +++++++++++++++++++++---- man/LazyFrame_profile.Rd | 11 ++++++++- src/rust/src/concurrent.rs | 16 +++++++++++++ src/rust/src/lazy/dataframe.rs | 11 ++++----- tests/testthat/test-lazy.R | 28 +++++++++++++++++++++++ 7 files changed, 129 insertions(+), 17 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 3783f16ed..6c5788819 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -293,6 +293,12 @@ LazyFrame_filter = "use_extendr_wrapper" #' @keywords LazyFrame DataFrame_new #' @return A `DataFrame` #' @examples pl$LazyFrame(iris)$filter(pl$col("Species") == "setosa")$collect() +#' @seealso +#' - [`$fetch()`][LazyFrame_fetch] - fast limited query check +#' - [`$profile()`][LazyFrame_profile] - returns as `$collect()` but also table with each operation +#' profiled. +#' - [`$collect_in_background()`][LazyFrame_collect_in_background] - non-blocking collect returns +#' a future handle. Can also just be used via `$collect(collect_in_background = TRUE)`. LazyFrame_collect = function( type_coercion = TRUE, predicate_pushdown = TRUE, @@ -303,6 +309,7 @@ LazyFrame_collect = function( no_optimization = FALSE, streaming = FALSE, collect_in_background = FALSE) { + if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE @@ -1019,13 +1026,31 @@ LazyFrame_dtypes = method_as_property(function() { }) -#' @title Dtypes -#' @description Get rows +#' @title Fetch +#' @description limit number of rows at scan level for fast trying a query #' @keywords LazyFrame -#' @param n_rows number of rows to fetch at maximum. +#' @details +#' Collect a small number of rows for debugging purposes. +#' Fetch is like the [`$collect()`][LazyFrame_collect] operation, but it overwrites the number of +#' rows read by every scan operation. This is a utility that helps debug a query on a smaller number +#' of rows. Note that the fetch does not guarantee the final number of rows in the DataFrame. Filter +#' , join operations and a lower number of rows available in the scanned file influence the final +#' number of rows. +#' @param n_rows number (`Into`) of rows to fetch at maximum. #' @return A DataFrame of maximum n_rows +#' @seealso +#' - [`$collect()`][LazyFrame_collect] - regular collect. +#' - [`$profile()`][LazyFrame_profile] - returns as `$collect()` but also table with each operation +#' profiled. +#' - [`$collect_in_background()`][LazyFrame_collect_in_background] - non-blocking collect returns +#' a future handle. Can also just be used via `$collect(collect_in_background = TRUE)`. #' @examples -#' pl$LazyFrame(irirs)$fetch(3) +#' +#' # fetch 3 +#' pl$LazyFrame(iris)$fetch(3) +#' +#' # this fetch-query returns 4 and not 3 entries, see details. +#' pl$LazyFrame(iris)$select(pl$col("Species")$append("flora gigantica, alien"))$fetch(3) LazyFrame_fetch = function(n_rows = 500) { .pr$LazyFrame$fetch(self, n_rows) |> unwrap("in $fetch()") @@ -1037,7 +1062,13 @@ LazyFrame_fetch = function(n_rows = 500) { #' @details The units of the timings are microseconds. #' #' @keywords LazyFrame -#' @return List of two `DataFrame`s: one with the collected result, the other with the timings of each step. +#' @return List of two `DataFrame`s: one with the collected result, the other with the timings of +#' each step. +#' @seealso +#' - [`$collect()`][LazyFrame_collect] - regular collect. +#' - [`$fetch()`][LazyFrame_fetch] - fast limited query check +#' - [`$collect_in_background()`][LazyFrame_collect_in_background] - non-blocking collect returns +#' a future handle. Can also just be used via `$collect(collect_in_background = TRUE)`. #' @examples #' #' ## Simplest use case diff --git a/man/LazyFrame_collect.Rd b/man/LazyFrame_collect.Rd index 1ef627a86..c28d5db45 100644 --- a/man/LazyFrame_collect.Rd +++ b/man/LazyFrame_collect.Rd @@ -61,5 +61,14 @@ This can be a huge time saver in debugging queries. \examples{ pl$LazyFrame(iris)$filter(pl$col("Species") == "setosa")$collect() } +\seealso{ +\itemize{ +\item \code{\link[=LazyFrame_fetch]{$fetch()}} - fast limited query check +\item \code{\link[=LazyFrame_profile]{$profile()}} - returns as \verb{$collect()} but also table with each operation +profiled. +\item \code{\link[=LazyFrame_collect_in_background]{$collect_in_background()}} - non-blocking collect returns +a future handle. Can also just be used via \verb{$collect(collect_in_background = TRUE)}. +} +} \keyword{DataFrame_new} \keyword{LazyFrame} diff --git a/man/LazyFrame_fetch.Rd b/man/LazyFrame_fetch.Rd index 36d1e85f3..029fb512d 100644 --- a/man/LazyFrame_fetch.Rd +++ b/man/LazyFrame_fetch.Rd @@ -2,20 +2,42 @@ % Please edit documentation in R/lazyframe__lazy.R \name{LazyFrame_fetch} \alias{LazyFrame_fetch} -\title{Dtypes} +\title{Fetch} \usage{ LazyFrame_fetch(n_rows = 500) } \arguments{ -\item{n_rows}{number of rows to fetch at maximum.} +\item{n_rows}{number (\verb{Into}) of rows to fetch at maximum.} } \value{ A DataFrame of maximum n_rows } \description{ -Get rows +limit number of rows at scan level for fast trying a query +} +\details{ +Collect a small number of rows for debugging purposes. +Fetch is like the \code{\link[=LazyFrame_collect]{$collect()}} operation, but it overwrites the number of +rows read by every scan operation. This is a utility that helps debug a query on a smaller number +of rows. Note that the fetch does not guarantee the final number of rows in the DataFrame. Filter +, join operations and a lower number of rows available in the scanned file influence the final +number of rows. } \examples{ -pl$LazyFrame(irirs)$fetch(3) + +# fetch 3 +pl$LazyFrame(iris)$fetch(3) + +# this fetch-query returns 4 and not 3 entries, see details. +pl$LazyFrame(iris)$select(pl$col("Species")$append("flora gigantica, alien"))$fetch(3) +} +\seealso{ +\itemize{ +\item \code{\link[=LazyFrame_collect]{$collect()}} - regular collect. +\item \code{\link[=LazyFrame_profile]{$profile()}} - returns as \verb{$collect()} but also table with each operation +profiled. +\item \code{\link[=LazyFrame_collect_in_background]{$collect_in_background()}} - non-blocking collect returns +a future handle. Can also just be used via \verb{$collect(collect_in_background = TRUE)}. +} } \keyword{LazyFrame} diff --git a/man/LazyFrame_profile.Rd b/man/LazyFrame_profile.Rd index ae4c6a0aa..7dc3176e1 100644 --- a/man/LazyFrame_profile.Rd +++ b/man/LazyFrame_profile.Rd @@ -7,7 +7,8 @@ LazyFrame_profile() } \value{ -List of two \code{DataFrame}s: one with the collected result, the other with the timings of each step. +List of two \code{DataFrame}s: one with the collected result, the other with the timings of +each step. } \description{ This will run the query and return a list containing the materialized DataFrame and @@ -44,5 +45,13 @@ pl$LazyFrame(iris)$ agg(pl$col(pl$Float64)$apply(r_func))$ profile() +} +\seealso{ +\itemize{ +\item \code{\link[=LazyFrame_collect]{$collect()}} - regular collect. +\item \code{\link[=LazyFrame_fetch]{$fetch()}} - fast limited query check +\item \code{\link[=LazyFrame_collect_in_background]{$collect_in_background()}} - non-blocking collect returns +a future handle. Can also just be used via \verb{$collect(collect_in_background = TRUE)}. +} } \keyword{LazyFrame} diff --git a/src/rust/src/concurrent.rs b/src/rust/src/concurrent.rs index e4b324186..12223e1de 100644 --- a/src/rust/src/concurrent.rs +++ b/src/rust/src/concurrent.rs @@ -75,3 +75,19 @@ pub fn profile_with_r_func_support(lazy_df: pl::LazyFrame) -> RResult<(DataFrame .map_err(polars_to_rpolars_err) .map(|(result_df, profile_df)| (DataFrame(result_df), DataFrame(profile_df))) } + +pub fn fetch_with_r_func_support(lazy_df: pl::LazyFrame, n_rows: usize) -> RResult { + concurrent_handler( + move |tc| { + let retval = lazy_df.fetch(n_rows); + ThreadCom::kill_global(&CONFIG); + drop(tc); + retval + }, + serve_r, + &CONFIG, + ) + .map_err(|err| RPolarsErr::new().plain(err.to_string()))? + .map_err(polars_to_rpolars_err) + .map(DataFrame) +} diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index 92b6ccef6..cb2ee5ec9 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -1,7 +1,9 @@ use crate::conversion::strings_to_smartstrings; use crate::rdataframe::DataFrame as RDataFrame; -use crate::concurrent::{collect_with_r_func_support, profile_with_r_func_support}; +use crate::concurrent::{ + collect_with_r_func_support, fetch_with_r_func_support, profile_with_r_func_support, +}; use crate::lazy::dsl::*; use crate::rdataframe::DataFrame as RDF; @@ -389,12 +391,7 @@ impl LazyFrame { } fn fetch(&self, n_rows: Robj) -> RResult { - Ok(self - .0 - .clone() - .fetch(robj_to!(usize, n_rows)?) - .map_err(crate::rpolarserr::polars_to_rpolars_err)? - .into()) + fetch_with_r_func_support(self.0.clone(), robj_to!(usize, n_rows)?) } #[allow(clippy::too_many_arguments)] diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 99939e5de..a674631a0 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -659,9 +659,37 @@ test_that("cloning", { test_that("fetch", { + + #simple example lf = pl$LazyFrame(a = 1:10, b = letters[10:1]) expect_identical( lf$fetch(5)$to_list(), lf$slice(0,5)$collect()$to_list() ) + + # supports use of R functions in fetch + expect_identical( + lf$select(pl$col("a")$map(\(s) s * 2L))$fetch(5)$to_list(), + lf$select(pl$col("a") * 2L)$fetch(5)$to_list() + ) + + #usize input can be char + expect_identical( + lf$select(pl$col("a") * 2L)$fetch("5")$to_list(), + lf$select(pl$col("a") * 2L)$fetch(5)$to_list() + ) + + # uszie input can be bit64 + skip_if_not_installed("bit64") + expect_identical( + lf$select(pl$col("a") * 2L)$fetch(bit64::as.integer64(5))$to_list(), + lf$select(pl$col("a") * 2L)$fetch(5)$to_list() + ) + + # usize cannot be negative + expect_identical( + result(lf$select(pl$col("a") * 2L)$fetch(-5)$to_list())$err$contexts(), + list(BadArgument = "n_rows", ValueOutOfScope = "cannot be less than zero", BadValue = "-5") + ) + }) From ab8ddf6a7784292656f5bb7232b8c1d32dfbcd8f Mon Sep 17 00:00:00 2001 From: sorhawell Date: Thu, 10 Aug 2023 15:03:42 +0200 Subject: [PATCH 05/13] fmt + opt args + docs --- R/lazyframe__lazy.R | 55 +++++++++++++++++++++++++++++++++++--- man/LazyFrame_fetch.Rd | 39 ++++++++++++++++++++++++++- tests/testthat/test-lazy.R | 17 +++++++++--- 3 files changed, 103 insertions(+), 8 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 6c5788819..28fbedbf7 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -309,7 +309,6 @@ LazyFrame_collect = function( no_optimization = FALSE, streaming = FALSE, collect_in_background = FALSE) { - if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE @@ -1037,6 +1036,25 @@ LazyFrame_dtypes = method_as_property(function() { #' , join operations and a lower number of rows available in the scanned file influence the final #' number of rows. #' @param n_rows number (`Into`) of rows to fetch at maximum. +#' @param type_coercion Boolean. Coerce types such that operations succeed and +#' run on minimal required memory. +#' @param predicate_pushdown Boolean. Applies filters as early as possible / at +#' scan level. +#' @param projection_pushdown Boolean. Applies filters as early as possible / at +#' scan level. +#' @param simplify_expression Boolean. Cache subtrees/file scans that are used +#' by multiple subtrees in the query plan. +#' @param slice_pushdown Boolean. Only load the required slice from the scan +#' level. Don't materialize sliced outputs (e.g. `join$head(10)`). +#' @param common_subplan_elimination Boolean. Cache subtrees/file scans that +#' are used by multiple subtrees in the query plan. +#' @param no_optimization Boolean. Turn off the following optimizations: +#' predicate_pushdown = FALSE +#' projection_pushdown = FALSE +#' slice_pushdown = FALSE +#' common_subplan_elimination = FALSE +#' @param streaming Boolean. Run parts of the query in a streaming fashion +#' (this is in an alpha state). #' @return A DataFrame of maximum n_rows #' @seealso #' - [`$collect()`][LazyFrame_collect] - regular collect. @@ -1051,8 +1069,39 @@ LazyFrame_dtypes = method_as_property(function() { #' #' # this fetch-query returns 4 and not 3 entries, see details. #' pl$LazyFrame(iris)$select(pl$col("Species")$append("flora gigantica, alien"))$fetch(3) -LazyFrame_fetch = function(n_rows = 500) { - .pr$LazyFrame$fetch(self, n_rows) |> +LazyFrame_fetch = function( + n_rows = 500, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + common_subplan_elimination = TRUE, + no_optimization = FALSE, + streaming = FALSE) { + if (isTRUE(no_optimization)) { + predicate_pushdown = FALSE + projection_pushdown = FALSE + slice_pushdown = FALSE + common_subplan_elimination = FALSE + } + + if (isTRUE(streaming)) { + common_subplan_elimination = FALSE + } + + + self |> + .pr$LazyFrame$optimization_toggle( + type_coercion, + predicate_pushdown, + projection_pushdown, + simplify_expression, + slice_pushdown, + common_subplan_elimination, + streaming + ) |> + and_then(\(self) .pr$LazyFrame$fetch(self, n_rows)) |> unwrap("in $fetch()") } diff --git a/man/LazyFrame_fetch.Rd b/man/LazyFrame_fetch.Rd index 029fb512d..d83e2d52a 100644 --- a/man/LazyFrame_fetch.Rd +++ b/man/LazyFrame_fetch.Rd @@ -4,10 +4,47 @@ \alias{LazyFrame_fetch} \title{Fetch} \usage{ -LazyFrame_fetch(n_rows = 500) +LazyFrame_fetch( + n_rows = 500, + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + common_subplan_elimination = TRUE, + no_optimization = FALSE, + streaming = FALSE +) } \arguments{ \item{n_rows}{number (\verb{Into}) of rows to fetch at maximum.} + +\item{type_coercion}{Boolean. Coerce types such that operations succeed and +run on minimal required memory.} + +\item{predicate_pushdown}{Boolean. Applies filters as early as possible / at +scan level.} + +\item{projection_pushdown}{Boolean. Applies filters as early as possible / at +scan level.} + +\item{simplify_expression}{Boolean. Cache subtrees/file scans that are used +by multiple subtrees in the query plan.} + +\item{slice_pushdown}{Boolean. Only load the required slice from the scan +level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} + +\item{common_subplan_elimination}{Boolean. Cache subtrees/file scans that +are used by multiple subtrees in the query plan.} + +\item{no_optimization}{Boolean. Turn off the following optimizations: +predicate_pushdown = FALSE +projection_pushdown = FALSE +slice_pushdown = FALSE +common_subplan_elimination = FALSE} + +\item{streaming}{Boolean. Run parts of the query in a streaming fashion +(this is in an alpha state).} } \value{ A DataFrame of maximum n_rows diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index a674631a0..2144b4152 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -659,12 +659,11 @@ test_that("cloning", { test_that("fetch", { - - #simple example + # simple example lf = pl$LazyFrame(a = 1:10, b = letters[10:1]) expect_identical( lf$fetch(5)$to_list(), - lf$slice(0,5)$collect()$to_list() + lf$slice(0, 5)$collect()$to_list() ) # supports use of R functions in fetch @@ -673,7 +672,7 @@ test_that("fetch", { lf$select(pl$col("a") * 2L)$fetch(5)$to_list() ) - #usize input can be char + # usize input can be char expect_identical( lf$select(pl$col("a") * 2L)$fetch("5")$to_list(), lf$select(pl$col("a") * 2L)$fetch(5)$to_list() @@ -692,4 +691,14 @@ test_that("fetch", { list(BadArgument = "n_rows", ValueOutOfScope = "cannot be less than zero", BadValue = "-5") ) + + # bad opt profile arg streaming + expect_identical( + result(lf$select(pl$lit(2L) * 2L)$lazy()$fetch(-5, streaming = 42)$to_list())$err$contexts(), + list( + BadArgument = "streaming", + TypeMismatch = "bool", + BadValue = "Rvalue: 42.0, Rsexp: Doubles, Rclass: [\"numeric\"]" + ) + ) }) From 6f8a67240c4cc734dd8550e9987d12142d275d0d Mon Sep 17 00:00:00 2001 From: sorhawell Date: Fri, 11 Aug 2023 10:53:37 +0200 Subject: [PATCH 06/13] tniy bug in a utest --- tests/testthat/test-lazy.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index 2144b4152..e2b5fcb3a 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -694,7 +694,7 @@ test_that("fetch", { # bad opt profile arg streaming expect_identical( - result(lf$select(pl$lit(2L) * 2L)$lazy()$fetch(-5, streaming = 42)$to_list())$err$contexts(), + result(pl$select(pl$lit(2L) * 2L)$lazy()$fetch(-5, streaming = 42)$to_list())$err$contexts(), list( BadArgument = "streaming", TypeMismatch = "bool", From 1a52263e1ecf10e8270b756cb170d3623ed738db Mon Sep 17 00:00:00 2001 From: sorhawell Date: Mon, 21 Aug 2023 21:15:05 +0200 Subject: [PATCH 07/13] merge main --- R/info.R | 20 +++++++++++++++++++- tests/testthat/_snaps/info.md | 3 ++- tests/testthat/test-info.R | 1 + 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/R/info.R b/R/info.R index 3081b7529..c15ad6933 100644 --- a/R/info.R +++ b/R/info.R @@ -1,3 +1,18 @@ +# get version of rust-polars version from Cargo.lock at package build time +# from polars 0.31.1 this can be migrated to rust side see +# https://github.com/pola-rs/polars/pull/9660 +RUST_POLARS_VERSION = (\() { + Cargo.lock = readLines("./src/rust/Cargo.lock") + polars.idx = which(Cargo.lock == r"{name = "polars"}")[1] + this_line = Cargo.lock[polars.idx + 1] + if (isTRUE(substr(this_line, 1, 7) == "version")) { + return(substr(this_line, 12, nchar(this_line) - 1)) + } + warning("failed to find RUST_POLARS_VERSION version") + "unknown" +})() + + #' Report information of the package #' #' @return A list with information of the package @@ -8,6 +23,7 @@ pl$polars_info = function() { # Similar to arrow::arrow_info() out = list( version = utils::packageVersion("polars"), + rust_polars = RUST_POLARS_VERSION, features = FeatureInfo$new()$to_r() ) @@ -27,6 +43,8 @@ print.polars_info = function(x, ...) { cat("\n") } - cat("R Polars package version: ", format(x$version), "\n\n", sep = "") + cat("r-polars package version : ", format(x$version), "\n", sep = "") + cat("rust-polars crate version: ", format(x$rust_polars), "\n", sep = "") + cat("\n") print_key_values("Features", unlist(x$features)) } diff --git a/tests/testthat/_snaps/info.md b/tests/testthat/_snaps/info.md index e2dade0e6..36db030f9 100644 --- a/tests/testthat/_snaps/info.md +++ b/tests/testthat/_snaps/info.md @@ -3,7 +3,8 @@ Code info Output - R Polars package version: 999.999.999 + r-polars package version : 999.999.999 + rust-polars crate version: 999.999.999 Features: simd FALSE diff --git a/tests/testthat/test-info.R b/tests/testthat/test-info.R index 7450c34df..30242a942 100644 --- a/tests/testthat/test-info.R +++ b/tests/testthat/test-info.R @@ -11,6 +11,7 @@ test_that("print pl$polars_info()", { # Ensure static version for snapshot test info$version <- package_version("999.999.999") + info$rust_polars <- package_version("999.999.999") # Ensure all features are FALSE for snapshot test for (feature in names(info$features)) { From ed341d4a939ddb263bb58d2f9d48fd32c0407df4 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Mon, 28 Aug 2023 14:07:17 +0200 Subject: [PATCH 08/13] fix error__trait.R docs error --- R/error__trait.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/R/error__trait.R b/R/error__trait.R index 033cfdddb..075605231 100644 --- a/R/error__trait.R +++ b/R/error__trait.R @@ -4,6 +4,7 @@ #' Internal generic method to add call to error #' @param err any type which impl as.character #' @param call calling context +#' @noRd #' @details #' Additional details... #' @@ -25,9 +26,11 @@ when_calling.default = function(err, call) { call_to_string = function(call) paste(capture.output(print(call)), collapse = "\n") # NB collapse is needed to ensure no invalid multi-line error strings -#' Internal generic method to point to which public method the user got wrong + +#' where in (lexically) error happened +#' @description Internal generic method to point to which public method the user got wrong #' @param err any type which impl as.character -#' @param call calling context +#' @param context calling context #' @keywords internal #' @return err as string #' @examples @@ -52,8 +55,8 @@ where_in.default = function(err, context) { #' Internal generic method to convert an error_type to condition. #' @param err any type which impl as.character -#' @param call calling context #' @keywords internal +#' @noRd #' @details #' this method is needed to preserve state of err without upcasting to a string message #' an implementation will describe how to store the error in the condition @@ -75,6 +78,7 @@ to_condition.default = function(err) { #' Internal generic method to add plain text to error message #' @param err some error type object #' @param msg string to add +#' @noRd #' @keywords internal #' @return condition plain = function(err, msg) { @@ -95,7 +99,7 @@ plain.default = function(err, msg) { #' An error type can choose to implement this to improve the translation. #' As fall back the error will be deparsed into a string with rust Debug, see rdbg() #' @param err some error type object -#' @param msg string to add +#' @noRd #' @keywords internal #' @return condition upgrade_err = function(err) { From f6c73fb881e7947f5d2c7efc6936e82bbc6a3027 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Mon, 28 Aug 2023 14:10:18 +0200 Subject: [PATCH 09/13] add news --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 194168056..32a1b18bf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,7 +9,8 @@ - `pl$scan_arrow_ipc` is now called `pl$scan_ipc` (#343). ## What's changed -- Stream query to file with `pl$sink_ipc()` and `pl$sink_parquet()` (#343) +- New method `$fetch()` for `LazyFrame` (#319). +- Stream query to file with `pl$sink_ipc()` and `pl$sink_parquet()` (#343). - New method `$explode()` for `DataFrame` and `LazyFrame` (#314). - New method `$clone()` for `LazyFrame` (#347). - New methods `$optimization_toggle()` and `$profile()` for `LazyFrame` (#323). From 0897b8913b817c899d66a0f1fd4f0d10c8bc4005 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Mon, 28 Aug 2023 15:35:30 +0200 Subject: [PATCH 10/13] upadate docs --- R/error__trait.R | 1 + man/plain.Rd | 20 -------------------- man/to_condition.Rd | 24 ------------------------ man/upgrade_err.Rd | 25 ------------------------- man/when_calling.Rd | 26 -------------------------- man/where_in.Rd | 4 ++-- 6 files changed, 3 insertions(+), 97 deletions(-) delete mode 100644 man/plain.Rd delete mode 100644 man/to_condition.Rd delete mode 100644 man/upgrade_err.Rd delete mode 100644 man/when_calling.Rd diff --git a/R/error__trait.R b/R/error__trait.R index 075605231..2b72d9f94 100644 --- a/R/error__trait.R +++ b/R/error__trait.R @@ -32,6 +32,7 @@ call_to_string = function(call) paste(capture.output(print(call)), collapse = "\ #' @param err any type which impl as.character #' @param context calling context #' @keywords internal +#' @noRd #' @return err as string #' @examples #' # diff --git a/man/plain.Rd b/man/plain.Rd deleted file mode 100644 index 91ccb95ae..000000000 --- a/man/plain.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/error__trait.R -\name{plain} -\alias{plain} -\title{Internal generic method to add plain text to error message} -\usage{ -plain(err, msg) -} -\arguments{ -\item{err}{some error type object} - -\item{msg}{string to add} -} -\value{ -condition -} -\description{ -Internal generic method to add plain text to error message -} -\keyword{internal} diff --git a/man/to_condition.Rd b/man/to_condition.Rd deleted file mode 100644 index 78dc4fbb1..000000000 --- a/man/to_condition.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/error__trait.R -\name{to_condition} -\alias{to_condition} -\title{Internal generic method to convert an error_type to condition.} -\usage{ -to_condition(err) -} -\arguments{ -\item{err}{any type which impl as.character} - -\item{call}{calling context} -} -\value{ -condition -} -\description{ -Internal generic method to convert an error_type to condition. -} -\details{ -this method is needed to preserve state of err without upcasting to a string message -an implementation will describe how to store the error in the condition -} -\keyword{internal} diff --git a/man/upgrade_err.Rd b/man/upgrade_err.Rd deleted file mode 100644 index bdae6c6b4..000000000 --- a/man/upgrade_err.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/error__trait.R -\name{upgrade_err} -\alias{upgrade_err} -\title{Internal generic method to add plain text to error message} -\usage{ -upgrade_err(err) -} -\arguments{ -\item{err}{some error type object} - -\item{msg}{string to add} -} -\value{ -condition -} -\description{ -Internal generic method to add plain text to error message -} -\details{ -polars converts any other error types to RPolarsErr. -An error type can choose to implement this to improve the translation. -As fall back the error will be deparsed into a string with rust Debug, see rdbg() -} -\keyword{internal} diff --git a/man/when_calling.Rd b/man/when_calling.Rd deleted file mode 100644 index 29fff3b6a..000000000 --- a/man/when_calling.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/error__trait.R -\name{when_calling} -\alias{when_calling} -\title{Internal generic method to add call to error} -\usage{ -when_calling(err, call) -} -\arguments{ -\item{err}{any type which impl as.character} - -\item{call}{calling context} -} -\value{ -err as string -} -\description{ -Internal generic method to add call to error -} -\details{ -Additional details... -} -\examples{ -# -} -\keyword{internal} diff --git a/man/where_in.Rd b/man/where_in.Rd index 327cd21a0..4cefdabdb 100644 --- a/man/where_in.Rd +++ b/man/where_in.Rd @@ -2,14 +2,14 @@ % Please edit documentation in R/error__trait.R \name{where_in} \alias{where_in} -\title{Internal generic method to point to which public method the user got wrong} +\title{where in (lexically) error happened} \usage{ where_in(err, context) } \arguments{ \item{err}{any type which impl as.character} -\item{call}{calling context} +\item{context}{calling context} } \value{ err as string From 083245b252bd2c238724436598090e755aa137f7 Mon Sep 17 00:00:00 2001 From: sorhawell Date: Mon, 28 Aug 2023 15:41:54 +0200 Subject: [PATCH 11/13] with last --- man/where_in.Rd | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 man/where_in.Rd diff --git a/man/where_in.Rd b/man/where_in.Rd deleted file mode 100644 index 4cefdabdb..000000000 --- a/man/where_in.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/error__trait.R -\name{where_in} -\alias{where_in} -\title{where in (lexically) error happened} -\usage{ -where_in(err, context) -} -\arguments{ -\item{err}{any type which impl as.character} - -\item{context}{calling context} -} -\value{ -err as string -} -\description{ -Internal generic method to point to which public method the user got wrong -} -\examples{ -# -} -\keyword{internal} From e43121cb35610a069794acc1c48ad47223a214b9 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 30 Aug 2023 07:57:43 +0200 Subject: [PATCH 12/13] fix arguments comm_sub* args --- R/lazyframe__lazy.R | 59 +++++++++++++++++++++++----------------- man/LazyFrame_collect.Rd | 15 +++++----- man/LazyFrame_fetch.Rd | 32 ++++++++++++---------- 3 files changed, 60 insertions(+), 46 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index ff0754b27..090a62f89 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -269,16 +269,17 @@ LazyFrame_filter = "use_extendr_wrapper" #' run on minimal required memory. #' @param predicate_pushdown Boolean. Applies filters as early as possible at #' scan level. -#' @param projection_pushdown Boolean. Select only the columns that are needed at the scan level. -#' @param simplify_expression Boolean. Various optimizations, such as constant folding -#' and replacing expensive operations with faster alternatives. +#' @param projection_pushdown Boolean. Select only the columns that are needed +#' at the scan level. +#' @param simplify_expression Boolean. Various optimizations, such as constant +#' folding and replacing expensive operations with faster alternatives. #' @param slice_pushdown Boolean. Only load the required slice from the scan #' Don't materialize sliced outputs #' level. Don't materialize sliced outputs (e.g. `join$head(10)`). -#' @param comm_subplan_elim Boolean. Will try to cache branching subplans that occur on self-joins -#' or unions. -#' @param comm_subexpr_elim Boolean. Common subexpressions will be cached and reused. -#' or unions. +#' @param comm_subplan_elim Boolean. Will try to cache branching subplans that +#' occur on self-joins or unions. +#' @param comm_subexpr_elim Boolean. Common subexpressions will be cached and +#' reused. #' @param no_optimization Boolean. Turn off the following optimizations: #' predicate_pushdown = FALSE #' projection_pushdown = FALSE @@ -1189,17 +1190,19 @@ LazyFrame_dtypes = method_as_property(function() { }) -#' @title Fetch -#' @description limit number of rows at scan level for fast trying a query +#' Fetch `n` rows of a LazyFrame +#' +#' This is similar to `$collect()` but limit the number of rows to collect. It +#' is mostly useful to check that a query works as expected. +#' #' @keywords LazyFrame #' @details -#' Collect a small number of rows for debugging purposes. -#' Fetch is like the [`$collect()`][LazyFrame_collect] operation, but it overwrites the number of -#' rows read by every scan operation. This is a utility that helps debug a query on a smaller number -#' of rows. Note that the fetch does not guarantee the final number of rows in the DataFrame. Filter -#' , join operations and a lower number of rows available in the scanned file influence the final -#' number of rows. -#' @param n_rows number (`Into`) of rows to fetch at maximum. +#' `$fetch()` does not guarantee the final number of rows in the DataFrame output. +#' It only guarantees that `n` rows are used at the beginning of the query. +#' Filters, join operations and a lower number of rows available in the scanned +#' file influence the final number of rows. +#' +#' @param n_rows Integer. Maximum number of rows to fetch. #' @param type_coercion Boolean. Coerce types such that operations succeed and #' run on minimal required memory. #' @param predicate_pushdown Boolean. Applies filters as early as possible / at @@ -1210,8 +1213,10 @@ LazyFrame_dtypes = method_as_property(function() { #' by multiple subtrees in the query plan. #' @param slice_pushdown Boolean. Only load the required slice from the scan #' level. Don't materialize sliced outputs (e.g. `join$head(10)`). -#' @param common_subplan_elimination Boolean. Cache subtrees/file scans that -#' are used by multiple subtrees in the query plan. +#' @param comm_subplan_elim Boolean. Will try to cache branching subplans that +#' occur on self-joins or unions. +#' @param comm_subexpr_elim Boolean. Common subexpressions will be cached and +#' reused. #' @param no_optimization Boolean. Turn off the following optimizations: #' predicate_pushdown = FALSE #' projection_pushdown = FALSE @@ -1228,10 +1233,11 @@ LazyFrame_dtypes = method_as_property(function() { #' a future handle. Can also just be used via `$collect(collect_in_background = TRUE)`. #' @examples #' -#' # fetch 3 +#' # fetch 3 rows #' pl$LazyFrame(iris)$fetch(3) #' -#' # this fetch-query returns 4 and not 3 entries, see details. +#' # this fetch-query returns 4 rows, because we started with 3 and appended one +#' # row in the query (see section 'Details') #' pl$LazyFrame(iris)$select(pl$col("Species")$append("flora gigantica, alien"))$fetch(3) LazyFrame_fetch = function( n_rows = 500, @@ -1240,21 +1246,23 @@ LazyFrame_fetch = function( projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, - common_subplan_elimination = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, no_optimization = FALSE, streaming = FALSE) { + if (isTRUE(no_optimization)) { predicate_pushdown = FALSE projection_pushdown = FALSE slice_pushdown = FALSE - common_subplan_elimination = FALSE + comm_subplan_elim = FALSE + comm_subexpr_elim = FALSE } if (isTRUE(streaming)) { - common_subplan_elimination = FALSE + comm_subplan_elim = FALSE } - self |> .pr$LazyFrame$optimization_toggle( type_coercion, @@ -1262,7 +1270,8 @@ LazyFrame_fetch = function( projection_pushdown, simplify_expression, slice_pushdown, - common_subplan_elimination, + comm_subplan_elim, + comm_subexpr_elim, streaming ) |> and_then(\(self) .pr$LazyFrame$fetch(self, n_rows)) |> diff --git a/man/LazyFrame_collect.Rd b/man/LazyFrame_collect.Rd index 0191e02ab..17e35a267 100644 --- a/man/LazyFrame_collect.Rd +++ b/man/LazyFrame_collect.Rd @@ -24,20 +24,21 @@ run on minimal required memory.} \item{predicate_pushdown}{Boolean. Applies filters as early as possible at scan level.} -\item{projection_pushdown}{Boolean. Select only the columns that are needed at the scan level.} +\item{projection_pushdown}{Boolean. Select only the columns that are needed +at the scan level.} -\item{simplify_expression}{Boolean. Various optimizations, such as constant folding -and replacing expensive operations with faster alternatives.} +\item{simplify_expression}{Boolean. Various optimizations, such as constant +folding and replacing expensive operations with faster alternatives.} \item{slice_pushdown}{Boolean. Only load the required slice from the scan Don't materialize sliced outputs level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} -\item{comm_subplan_elim}{Boolean. Will try to cache branching subplans that occur on self-joins -or unions.} +\item{comm_subplan_elim}{Boolean. Will try to cache branching subplans that +occur on self-joins or unions.} -\item{comm_subexpr_elim}{Boolean. Common subexpressions will be cached and reused. -or unions.} +\item{comm_subexpr_elim}{Boolean. Common subexpressions will be cached and +reused.} \item{no_optimization}{Boolean. Turn off the following optimizations: predicate_pushdown = FALSE diff --git a/man/LazyFrame_fetch.Rd b/man/LazyFrame_fetch.Rd index d83e2d52a..c233deff1 100644 --- a/man/LazyFrame_fetch.Rd +++ b/man/LazyFrame_fetch.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/lazyframe__lazy.R \name{LazyFrame_fetch} \alias{LazyFrame_fetch} -\title{Fetch} +\title{Fetch \code{n} rows of a LazyFrame} \usage{ LazyFrame_fetch( n_rows = 500, @@ -11,13 +11,14 @@ LazyFrame_fetch( projection_pushdown = TRUE, simplify_expression = TRUE, slice_pushdown = TRUE, - common_subplan_elimination = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, no_optimization = FALSE, streaming = FALSE ) } \arguments{ -\item{n_rows}{number (\verb{Into}) of rows to fetch at maximum.} +\item{n_rows}{Integer. Maximum number of rows to fetch.} \item{type_coercion}{Boolean. Coerce types such that operations succeed and run on minimal required memory.} @@ -34,8 +35,11 @@ by multiple subtrees in the query plan.} \item{slice_pushdown}{Boolean. Only load the required slice from the scan level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} -\item{common_subplan_elimination}{Boolean. Cache subtrees/file scans that -are used by multiple subtrees in the query plan.} +\item{comm_subplan_elim}{Boolean. Will try to cache branching subplans that +occur on self-joins or unions.} + +\item{comm_subexpr_elim}{Boolean. Common subexpressions will be cached and +reused.} \item{no_optimization}{Boolean. Turn off the following optimizations: predicate_pushdown = FALSE @@ -50,22 +54,22 @@ common_subplan_elimination = FALSE} A DataFrame of maximum n_rows } \description{ -limit number of rows at scan level for fast trying a query +This is similar to \verb{$collect()} but limit the number of rows to collect. It +is mostly useful to check that a query works as expected. } \details{ -Collect a small number of rows for debugging purposes. -Fetch is like the \code{\link[=LazyFrame_collect]{$collect()}} operation, but it overwrites the number of -rows read by every scan operation. This is a utility that helps debug a query on a smaller number -of rows. Note that the fetch does not guarantee the final number of rows in the DataFrame. Filter -, join operations and a lower number of rows available in the scanned file influence the final -number of rows. +\verb{$fetch()} does not guarantee the final number of rows in the DataFrame output. +It only guarantees that \code{n} rows are used at the beginning of the query. +Filters, join operations and a lower number of rows available in the scanned +file influence the final number of rows. } \examples{ -# fetch 3 +# fetch 3 rows pl$LazyFrame(iris)$fetch(3) -# this fetch-query returns 4 and not 3 entries, see details. +# this fetch-query returns 4 rows, because we started with 3 and appended one +# row in the query (see section 'Details') pl$LazyFrame(iris)$select(pl$col("Species")$append("flora gigantica, alien"))$fetch(3) } \seealso{ From e13a6c8210c40ebf021a6186a4e14190458e8eab Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 30 Aug 2023 08:27:52 +0200 Subject: [PATCH 13/13] remove Rd file for `where_in` [skip_ci] --- man/where_in.Rd | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 man/where_in.Rd diff --git a/man/where_in.Rd b/man/where_in.Rd deleted file mode 100644 index 4cefdabdb..000000000 --- a/man/where_in.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/error__trait.R -\name{where_in} -\alias{where_in} -\title{where in (lexically) error happened} -\usage{ -where_in(err, context) -} -\arguments{ -\item{err}{any type which impl as.character} - -\item{context}{calling context} -} -\value{ -err as string -} -\description{ -Internal generic method to point to which public method the user got wrong -} -\examples{ -# -} -\keyword{internal}