From ca7238eacbd778fed52faeca2113221f19be334e Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 8 Nov 2023 14:23:07 +0100 Subject: [PATCH] implement `$with_context()` --- NEWS.md | 2 ++ R/extendr-wrappers.R | 2 ++ R/lazyframe__lazy.R | 34 +++++++++++++++++++++++++++++ man/LazyFrame_with_context.Rd | 39 ++++++++++++++++++++++++++++++++++ src/rust/src/lazy/dataframe.rs | 8 +++++++ tests/testthat/test-lazy.R | 26 +++++++++++++++++++++++ 6 files changed, 111 insertions(+) create mode 100644 man/LazyFrame_with_context.Rd diff --git a/NEWS.md b/NEWS.md index a8df546ff..f0efbd0c2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,6 +38,8 @@ - New methods `$peak_min()` and `$peak_max()` to find local minima and maxima in an Expr (#462). - New methods `$read_ndjson()` and `$scan_ndjson()` (#471). +- New method `$with_context()` for `LazyFrame` to have access to columns from + other Data/LazyFrames during the computation. # polars 0.9.0 diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index ac71b1087..a000bf675 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1075,6 +1075,8 @@ LazyFrame$explode <- function(dotdotdot) .Call(wrap__LazyFrame__explode, self, d LazyFrame$clone_see_me_macro <- function() .Call(wrap__LazyFrame__clone_see_me_macro, self) +LazyFrame$with_context <- function(other) .Call(wrap__LazyFrame__with_context, self, other) + #' @export `$.LazyFrame` <- function (self, name) { func <- LazyFrame[[name]]; environment(func) <- environment(); func } diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 193525a5a..2b70b43d1 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1547,3 +1547,37 @@ LazyFrame_unnest = function(names = NULL) { } unwrap(.pr$LazyFrame$unnest(self, names), "in $unnest():") } + +#' Add an external context to the computation graph +#' +#' This allows expressions to also access columns from DataFrames or LazyFrames +#' that are not part of this one. 
+#' +#' @param other Data/LazyFrame to have access to. This can be a list of DataFrames +#' and LazyFrames. +#' @return A LazyFrame +#' +#' @examples +#' lf = pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA)) +#' lf_other = pl$LazyFrame(c = c("foo", "ham")) +#' +#' lf$with_context(lf_other)$select( +#' pl$col("b") + pl$col("c")$first() +#' )$collect() +#' +#' # Fill nulls with the median from another lazyframe: +#' train_lf = pl$LazyFrame( +#' feature_0 = c(-1.0, 0, 1), feature_1 = c(-1.0, 0, 1) +#' ) +#' test_lf = pl$LazyFrame( +#' feature_0 = c(-1.0, NA, 1), feature_1 = c(-1.0, 0, 1) +#' ) +#' +#' test_lf$with_context(train_lf$select(pl$all()$name$suffix("_train")))$select( +#' pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median()) +#' )$collect() + +LazyFrame_with_context = function(other) { + .pr$LazyFrame$with_context(self, other) |> + unwrap("in with_context():") +} diff --git a/man/LazyFrame_with_context.Rd b/man/LazyFrame_with_context.Rd new file mode 100644 index 000000000..b5e964bf9 --- /dev/null +++ b/man/LazyFrame_with_context.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lazyframe__lazy.R +\name{LazyFrame_with_context} +\alias{LazyFrame_with_context} +\title{Add an external context to the computation graph} +\usage{ +LazyFrame_with_context(other) +} +\arguments{ +\item{other}{Data/LazyFrame to have access to. This can be a list of DataFrames +and LazyFrames.} +} +\value{ +A LazyFrame +} +\description{ +This allows expressions to also access columns from DataFrames or LazyFrames +that are not part of this one. 
+} +\examples{ +lf = pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA)) +lf_other = pl$LazyFrame(c = c("foo", "ham")) + +lf$with_context(lf_other)$select( + pl$col("b") + pl$col("c")$first() +)$collect() + +# Fill nulls with the median from another lazyframe: +train_lf = pl$LazyFrame( + feature_0 = c(-1.0, 0, 1), feature_1 = c(-1.0, 0, 1) +) +test_lf = pl$LazyFrame( + feature_0 = c(-1.0, NA, 1), feature_1 = c(-1.0, 0, 1) +) + +test_lf$with_context(train_lf$select(pl$all()$name$suffix("_train")))$select( + pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median()) +)$collect() +} diff --git a/src/rust/src/lazy/dataframe.rs b/src/rust/src/lazy/dataframe.rs index cb60a4025..1fd0136f0 100644 --- a/src/rust/src/lazy/dataframe.rs +++ b/src/rust/src/lazy/dataframe.rs @@ -566,6 +566,14 @@ impl LazyFrame { pub fn clone_see_me_macro(&self) -> LazyFrame { self.clone() } + + pub fn with_context(&self, contexts: Robj) -> RResult<Self> { + let contexts = robj_to!(Vec, LazyFrame, contexts)? + .into_iter() + .map(|ldf| ldf.0) + .collect::<Vec<_>>(); + Ok(self.0.clone().with_context(contexts).into()) + } } #[derive(Clone)] diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index f70b53bc4..a219d3c49 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -845,3 +845,29 @@ test_that("opt_toggles", { lf_new_opts$sink_ipc(tmpf, inherit_optimization = TRUE) expect_identical(pl$scan_ipc(tmpf, memmap = FALSE)$collect()$to_data_frame(), df_defaults) }) + +test_that("with_context works", { + lf = pl$LazyFrame(a = c(1, 2, 3), b = c("a", "c", NA)) + lf_other = pl$LazyFrame(c = c("foo", "ham")) + + expect_identical( + lf$with_context(lf_other)$select( + pl$col("b") + pl$col("c")$first() + )$collect()$to_data_frame(), + data.frame(b = c("afoo", "cfoo", NA)) + ) + + train_lf = pl$LazyFrame( + feature_0 = c(-1.0, 0, 1), feature_1 = c(-1.0, 0, 1) + ) + test_lf = pl$LazyFrame( + feature_0 = c(-1.0, NA, 1), feature_1 = c(-1.0, 0, 1) + ) + + expect_identical( + 
test_lf$with_context(train_lf$select(pl$all()$name$suffix("_train")))$select( + pl$col("feature_0")$fill_null(pl$col("feature_0_train")$median()) + )$collect()$to_data_frame(), + data.frame(feature_0 = c(-1, 0, 1)) + ) +})