From 798ba9bf53c922d7a4dd75507b4d603727197e42 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 18 Oct 2023 13:06:37 +0200 Subject: [PATCH 01/13] add arguments to `$profile()` --- R/lazyframe__lazy.R | 89 +++++++++++++++++++++++++++++++++++++++- man/LazyFrame_profile.Rd | 53 +++++++++++++++++++++++- 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index b30573304..f1e03d6a6 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1271,6 +1271,10 @@ LazyFrame_fetch = function( #' @description This will run the query and return a list containing the #' materialized DataFrame and a DataFrame that contains profiling information #' of each node that is executed. +#' +#' @inheritParams LazyFrame_collect +#' @param show_plot Show a Gantt chart of the profiling result +#' #' @details The units of the timings are microseconds. #' #' @keywords LazyFrame @@ -1311,8 +1315,89 @@ LazyFrame_fetch = function( #' group_by("Species", maintain_order = TRUE)$ #' agg(pl$col(pl$Float64)$apply(r_func))$ #' profile() -LazyFrame_profile = function() { - .pr$LazyFrame$profile(self) |> unwrap("in $profile()") +LazyFrame_profile = function( + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE, + no_optimization = FALSE, + inherit_optimization = FALSE, + collect_in_background = FALSE, + show_plot = FALSE) { + + if (isTRUE(no_optimization)) { + predicate_pushdown = FALSE + projection_pushdown = FALSE + slice_pushdown = FALSE + comm_subplan_elim = FALSE + comm_subexpr_elim = FALSE + } + + if (isTRUE(streaming)) { + comm_subplan_elim = FALSE + } + + lf = self + + if (isFALSE(inherit_optimization)) { + lf = self$set_optimization_toggle( + type_coercion, + predicate_pushdown, + projection_pushdown, + simplify_expression, + slice_pushdown, + comm_subplan_elim, + comm_subexpr_elim, + streaming + ) |> unwrap("in $profile():") + } + + out = lf |> + .pr$LazyFrame$profile() |> + unwrap("in $profile()") + + if (isTRUE(show_plot)) { + timings = out$profile$to_data_frame() + timings$node = factor(timings$node, levels = unique(timings$node)) + total_timing = max(timings$end) + if (total_timing > 10000000) { + unit = "s" + total_timing = paste0(total_timing/1000000, "s") + timings$start <- timings$start / 1000000 + timings$end <- timings$end / 1000000 + } else if (total_timing > 10000) { + unit = "ms" + total_timing = paste0(total_timing/1000, "ms") + timings$start <- timings$start / 1000 + timings$end <- timings$end / 1000 + } else { + unit = "µs" + total_timing = paste0(total_timing, "µs") + } + + if (!"ggplot2" %in% rownames(installed.packages())) { + unwrap("in $profile(): argument `show_plot` requires the package 'ggplot2'.") + } + + plot = ggplot2::ggplot(timings, ggplot2::aes(x = start, xend = end, y = node, yend = node)) + + ggplot2::geom_segment(size = 6) + + ggplot2::xlab( + paste0("Node duration in ", unit, ". Total duration: ", total_timing) + ) + + ggplot2::ylab(NULL) + + ggplot2::scale_y_discrete(limits = rev) + + ggplot2::theme( + axis.text = ggplot2::element_text(size = 12) + ) + + print(plot) + } + + out } #' @title Explode columns containing a list of values diff --git a/man/LazyFrame_profile.Rd b/man/LazyFrame_profile.Rd index 30705cb1e..ed33d3828 100644 --- a/man/LazyFrame_profile.Rd +++ b/man/LazyFrame_profile.Rd @@ -4,7 +4,58 @@ \alias{LazyFrame_profile} \title{Collect and profile a lazy query.} \usage{ -LazyFrame_profile() +LazyFrame_profile( + type_coercion = TRUE, + predicate_pushdown = TRUE, + projection_pushdown = TRUE, + simplify_expression = TRUE, + slice_pushdown = TRUE, + comm_subplan_elim = TRUE, + comm_subexpr_elim = TRUE, + streaming = FALSE, + no_optimization = FALSE, + inherit_optimization = FALSE, + collect_in_background = FALSE, + show_plot = FALSE +) +} +\arguments{ +\item{type_coercion}{Boolean. Coerce types such that operations succeed and +run on minimal required memory.} + +\item{predicate_pushdown}{Boolean. Applies filters as early as possible at +scan level.} + +\item{projection_pushdown}{Boolean. Select only the columns that are needed +at the scan level.} + +\item{simplify_expression}{Boolean. Various optimizations, such as constant +folding and replacing expensive operations with faster alternatives.} + +\item{slice_pushdown}{Boolean. Only load the required slice from the scan +level. Don't materialize sliced outputs (e.g. \code{join$head(10)}).} + +\item{comm_subplan_elim}{Boolean. Will try to cache branching subplans that +occur on self-joins or unions.} + +\item{comm_subexpr_elim}{Boolean. Common subexpressions will be cached and +reused.} + +\item{streaming}{Boolean. Run parts of the query in a streaming fashion +(this is in an alpha state).} + +\item{no_optimization}{Boolean. Sets the following parameters to \code{FALSE}: +\code{predicate_pushdown}, \code{projection_pushdown}, \code{slice_pushdown}, +\code{comm_subplan_elim}, \code{comm_subexpr_elim}.} + +\item{inherit_optimization}{Boolean. Use existing optimization settings +regardless the settings specified in this function call.} + +\item{collect_in_background}{Boolean. Detach this query from R session. +Computation will start in background. Get a handle which later can be converted +into the resulting DataFrame. Useful in interactive mode to not lock R session.} + +\item{show_plot}{Show a Gantt chart of the profiling result} } \value{ List of two \code{DataFrame}s: one with the collected result, the other From 7533fee046e35cfe8cce9219fd5d942a5860b500 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 18 Oct 2023 13:08:58 +0200 Subject: [PATCH 02/13] ggplot2 in suggests --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 1ba7c8d94..7d28ff35b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,6 +26,7 @@ Suggests: bit64, callr, data.table, + ggplot2, knitr, lubridate, nanoarrow, From 980ca2b3924fa89eeca580239db3bd4361d84be5 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 18 Oct 2023 13:27:51 +0200 Subject: [PATCH 03/13] better check for presence of ggplot2 --- R/lazyframe__lazy.R | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index f1e03d6a6..f71658137 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1360,29 +1360,25 @@ LazyFrame_profile = function( .pr$LazyFrame$profile() |> unwrap("in $profile()") - if (isTRUE(show_plot)) { + if (isTRUE(show_plot) && requireNamespace("ggplot2", quietly = TRUE)) { timings = out$profile$to_data_frame() timings$node = factor(timings$node, levels = unique(timings$node)) total_timing = max(timings$end) if (total_timing > 10000000) { unit = "s" total_timing = paste0(total_timing/1000000, "s") - timings$start <- timings$start / 1000000 - timings$end <- timings$end / 1000000 + timings$start = timings$start / 1000000 + timings$end = timings$end / 1000000 } else if (total_timing > 10000) { unit = "ms" total_timing = paste0(total_timing/1000, "ms") - timings$start <- timings$start / 1000 - timings$end <- timings$end / 1000 + timings$start = timings$start / 1000 + timings$end = timings$end / 1000 } else { unit = "µs" total_timing = paste0(total_timing, "µs") } - if (!"ggplot2" %in% rownames(installed.packages())) { - unwrap("in $profile(): argument `show_plot` requires the package 'ggplot2'.") - } - plot = ggplot2::ggplot(timings, ggplot2::aes(x = start, xend = end, y = node, yend = node)) + ggplot2::geom_segment(size = 6) + ggplot2::xlab( From 001c902c220a5bb0b67ebf451a821b1b22fca34f Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 18 Oct 2023 13:43:54 +0200 Subject: [PATCH 04/13] use rlang::.data --- DESCRIPTION | 1 + R/lazyframe__lazy.R | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7d28ff35b..672803348 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,6 +33,7 @@ Suggests: nycflights13, patrick, pillar, + rlang, rmarkdown, testthat (>= 3.0.0), tibble, diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index f71658137..13af1fdbf 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1379,7 +1379,10 @@ LazyFrame_profile = function( total_timing = paste0(total_timing, "µs") } - plot = ggplot2::ggplot(timings, ggplot2::aes(x = start, xend = end, y = node, yend = node)) + + plot = ggplot2::ggplot( + timings, + ggplot2::aes(x = rlang::.data$start, xend = rlang::.data$end, + y = rlang::.data$node, yend = rlang::.data$node)) + ggplot2::geom_segment(size = 6) + ggplot2::xlab( paste0("Node duration in ", unit, ". Total duration: ", total_timing) From 85347377b7e28f5543639a2b1679b61cd98eb929 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 18 Oct 2023 14:09:30 +0200 Subject: [PATCH 05/13] fix some non ascii character, fix rlang issue --- R/lazyframe__lazy.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 13af1fdbf..3f9e9f58d 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1375,15 +1375,18 @@ LazyFrame_profile = function( timings$start = timings$start / 1000 timings$end = timings$end / 1000 } else { - unit = "µs" - total_timing = paste0(total_timing, "µs") + unit = "\U00B5s" + total_timing = paste0(total_timing, "\U00B5s") } + # for some reason, there's an error if I use rlang::.data directly in aes() + .data <- rlang::.data + plot = ggplot2::ggplot( timings, - ggplot2::aes(x = rlang::.data$start, xend = rlang::.data$end, - y = rlang::.data$node, yend = rlang::.data$node)) + - ggplot2::geom_segment(size = 6) + + ggplot2::aes(x = .data[["start"]], xend = .data[["end"]], + y = .data[["node"]], yend = .data[["node"]])) + + ggplot2::geom_segment(linewidth = 6) + ggplot2::xlab( paste0("Node duration in ", unit, ". Total duration: ", total_timing) ) + From 14ae3ef8d15db86dac987aa0dea76a4d315abaa9 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Wed, 18 Oct 2023 14:10:37 +0200 Subject: [PATCH 06/13] drop a few more non-ASCII chars --- R/lazyframe__lazy.R | 6 +++--- man/DataFrame_join_asof.Rd | 6 +++--- man/LazyFrame_join_asof.Rd | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 3f9e9f58d..c850dbca6 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -977,11 +977,11 @@ LazyFrame_sort = function( #' table. They must have the same length. #' @param strategy Strategy for where to find match: #' * "backward" (default): search for the last row in the right table whose `on` -#' key is less than or equal to the left’s key. +#' key is less than or equal to the left key. #' * "forward": search for the first row in the right table whose `on` key is -#' greater than or equal to the left’s key. +#' greater than or equal to the left key. #' * "nearest": search for the last row in the right table whose value is nearest -#' to the left’s key. String keys are not currently supported for a nearest +#' to the left key. String keys are not currently supported for a nearest #' search. #' @param tolerance #' Numeric tolerance. By setting this the join will only be done if the near diff --git a/man/DataFrame_join_asof.Rd b/man/DataFrame_join_asof.Rd index 12486bc36..0338e41eb 100644 --- a/man/DataFrame_join_asof.Rd +++ b/man/DataFrame_join_asof.Rd @@ -43,11 +43,11 @@ tables.} \item{strategy}{Strategy for where to find match: \itemize{ \item "backward" (default): search for the last row in the right table whose \code{on} -key is less than or equal to the left’s key. +key is less than or equal to the left key. \item "forward": search for the first row in the right table whose \code{on} key is -greater than or equal to the left’s key. +greater than or equal to the left key. \item "nearest": search for the last row in the right table whose value is nearest -to the left’s key. String keys are not currently supported for a nearest +to the left key. String keys are not currently supported for a nearest search. }} diff --git a/man/LazyFrame_join_asof.Rd b/man/LazyFrame_join_asof.Rd index ad352e49b..9281fa324 100644 --- a/man/LazyFrame_join_asof.Rd +++ b/man/LazyFrame_join_asof.Rd @@ -43,11 +43,11 @@ tables.} \item{strategy}{Strategy for where to find match: \itemize{ \item "backward" (default): search for the last row in the right table whose \code{on} -key is less than or equal to the left’s key. +key is less than or equal to the left key. \item "forward": search for the first row in the right table whose \code{on} key is -greater than or equal to the left’s key. +greater than or equal to the left key. \item "nearest": search for the last row in the right table whose value is nearest -to the left’s key. String keys are not currently supported for a nearest +to the left key. String keys are not currently supported for a nearest search. }} From 3b6c35df9283051ffae782050c43e6adef4a298f Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 19 Oct 2023 18:30:18 +0200 Subject: [PATCH 07/13] add arg `truncate_nodes` --- R/lazyframe__lazy.R | 19 +++++++++++++++++-- man/LazyFrame_profile.Rd | 6 +++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index c850dbca6..9f45dec9d 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1274,6 +1274,8 @@ LazyFrame_fetch = function( #' #' @inheritParams LazyFrame_collect #' @param show_plot Show a Gantt chart of the profiling result +#' @param truncate_nodes Truncate the label lengths in the Gantt chart to this +#' number of characters. If `0` (default), do not truncate. #' #' @details The units of the timings are microseconds. #' @@ -1327,7 +1329,8 @@ LazyFrame_profile = function( no_optimization = FALSE, inherit_optimization = FALSE, collect_in_background = FALSE, - show_plot = FALSE) { + show_plot = FALSE, + truncate_nodes = 0) { if (isTRUE(no_optimization)) { predicate_pushdown = FALSE @@ -1391,11 +1394,23 @@ LazyFrame_profile = function( paste0("Node duration in ", unit, ". Total duration: ", total_timing) ) + ggplot2::ylab(NULL) + - ggplot2::scale_y_discrete(limits = rev) + ggplot2::theme( axis.text = ggplot2::element_text(size = 12) ) + if (truncate_nodes > 0) { + plot = plot + + ggplot2::scale_y_discrete( + labels = paste0(strtrim(timings$node, truncate_nodes), "..."), + limits = rev + ) + } else { + plot = plot + + ggplot2::scale_y_discrete( + limits = rev + ) + } + print(plot) } diff --git a/man/LazyFrame_profile.Rd b/man/LazyFrame_profile.Rd index ed33d3828..ca494f13d 100644 --- a/man/LazyFrame_profile.Rd +++ b/man/LazyFrame_profile.Rd @@ -16,7 +16,8 @@ LazyFrame_profile( no_optimization = FALSE, inherit_optimization = FALSE, collect_in_background = FALSE, - show_plot = FALSE + show_plot = FALSE, + truncate_nodes = 0 ) } \arguments{ @@ -56,6 +57,9 @@ Computation will start in background. Get a handle which later can be converted into the resulting DataFrame. Useful in interactive mode to not lock R session.} \item{show_plot}{Show a Gantt chart of the profiling result} + +\item{truncate_nodes}{Truncate the label lengths in the Gantt chart to this +number of characters. If \code{0} (default), do not truncate.} } \value{ List of two \code{DataFrame}s: one with the collected result, the other From f065c8f18c44b1f08c1e91ee9c16472f419b0001 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 19 Oct 2023 18:30:25 +0200 Subject: [PATCH 08/13] bump news --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 067ce302a..77018baed 100644 --- a/NEWS.md +++ b/NEWS.md @@ -45,6 +45,7 @@ encoded as `NULL` to aid conversion to polars binary Series. Support back and forth conversion from polars binary literal and Series to R raw (#417). - New method `$dt$time()` to extract the time from a `datetime` variable (#428). +- Method `$profile()` gains optimization arguments and plot-related arguments (#429). # polars 0.8.1 From aaf8809da1da707c9a89498fcbf95c5d61a99819 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Thu, 19 Oct 2023 18:38:52 +0200 Subject: [PATCH 09/13] use = --- R/lazyframe__lazy.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index e8209d309..4eee0e934 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1469,7 +1469,7 @@ LazyFrame_profile = function( } # for some reason, there's an error if I use rlang::.data directly in aes() - .data <- rlang::.data + .data = rlang::.data plot = ggplot2::ggplot( timings, @@ -1487,7 +1487,7 @@ LazyFrame_profile = function( if (truncate_nodes > 0) { plot = plot + ggplot2::scale_y_discrete( - labels = paste0(strtrim(timings$node, truncate_nodes), "..."), + labels = rev(paste0(strtrim(timings$node, truncate_nodes), "...")), limits = rev ) } else { From 6085a84ed311677c11e97e960fad0a733c7687ef Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Fri, 20 Oct 2023 14:14:25 +0200 Subject: [PATCH 10/13] error message if ggplot2 not installed, also return plot object --- R/lazyframe__lazy.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 4eee0e934..24195c84e 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1449,7 +1449,10 @@ LazyFrame_profile = function( .pr$LazyFrame$profile() |> unwrap("in $profile()") - if (isTRUE(show_plot) && requireNamespace("ggplot2", quietly = TRUE)) { + if (isTRUE(show_plot)) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + stop('The package "ggplot2" is required.') + } timings = out$profile$to_data_frame() timings$node = factor(timings$node, levels = unique(timings$node)) total_timing = max(timings$end) @@ -1498,6 +1501,7 @@ LazyFrame_profile = function( } print(plot) + out[["plot"]] = plot } out From b5b4f4313441997d63c070042f1b625905a4eda7 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Fri, 20 Oct 2023 18:31:14 +0200 Subject: [PATCH 11/13] add details on returned value --- R/lazyframe__lazy.R | 3 ++- man/LazyFrame_profile.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 24195c84e..c2982d63c 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1367,7 +1367,8 @@ LazyFrame_fetch = function( #' #' @keywords LazyFrame #' @return List of two `DataFrame`s: one with the collected result, the other -#' with the timings of each step. +#' with the timings of each step. If `show_graph = TRUE`, then the plot is +#' also stored in the list. #' @seealso #' - [`$collect()`][LazyFrame_collect] - regular collect. #' - [`$fetch()`][LazyFrame_fetch] - fast limited query check diff --git a/man/LazyFrame_profile.Rd b/man/LazyFrame_profile.Rd index ca494f13d..908d04ac4 100644 --- a/man/LazyFrame_profile.Rd +++ b/man/LazyFrame_profile.Rd @@ -63,7 +63,8 @@ number of characters. If \code{0} (default), do not truncate.} } \value{ List of two \code{DataFrame}s: one with the collected result, the other -with the timings of each step. +with the timings of each step. If \code{show_graph = TRUE}, then the plot is +also stored in the list. } \description{ This will run the query and return a list containing the From d9a760eb9144bc417508f77fb5f3d02ec7666848 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Tue, 24 Oct 2023 14:55:22 +0200 Subject: [PATCH 12/13] move plotting code in separate function --- R/lazyframe__lazy.R | 55 +++--------------------------------------- R/utils.R | 58 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 52 deletions(-) diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index c2982d63c..516eae55f 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -1451,58 +1451,9 @@ LazyFrame_profile = function( unwrap("in $profile()") if (isTRUE(show_plot)) { - if (!requireNamespace("ggplot2", quietly = TRUE)) { - stop('The package "ggplot2" is required.') - } - timings = out$profile$to_data_frame() - timings$node = factor(timings$node, levels = unique(timings$node)) - total_timing = max(timings$end) - if (total_timing > 10000000) { - unit = "s" - total_timing = paste0(total_timing/1000000, "s") - timings$start = timings$start / 1000000 - timings$end = timings$end / 1000000 - } else if (total_timing > 10000) { - unit = "ms" - total_timing = paste0(total_timing/1000, "ms") - timings$start = timings$start / 1000 - timings$end = timings$end / 1000 - } else { - unit = "\U00B5s" - total_timing = paste0(total_timing, "\U00B5s") - } - - # for some reason, there's an error if I use rlang::.data directly in aes() - .data = rlang::.data - - plot = ggplot2::ggplot( - timings, - ggplot2::aes(x = .data[["start"]], xend = .data[["end"]], - y = .data[["node"]], yend = .data[["node"]])) + - ggplot2::geom_segment(linewidth = 6) + - ggplot2::xlab( - paste0("Node duration in ", unit, ". Total duration: ", total_timing) - ) + - ggplot2::ylab(NULL) + - ggplot2::theme( - axis.text = ggplot2::element_text(size = 12) - ) - - if (truncate_nodes > 0) { - plot = plot + - ggplot2::scale_y_discrete( - labels = rev(paste0(strtrim(timings$node, truncate_nodes), "...")), - limits = rev - ) - } else { - plot = plot + - ggplot2::scale_y_discrete( - limits = rev - ) - } - - print(plot) - out[["plot"]] = plot + out[["plot"]] = make_profile_plot(out, truncate_nodes) |> + result() |> + unwrap("in $profile()") } out diff --git a/R/utils.R b/R/utils.R index 46b3c1dd1..f4beb14e4 100644 --- a/R/utils.R +++ b/R/utils.R @@ -620,3 +620,61 @@ is_bool = function(x) { dtypes_are_struct = function(dtypes) { sapply(dtypes, \(dt) pl$same_outer_dt(dt, pl$Struct())) } + +make_profile_plot = function(data, truncate_nodes) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + stop('The package "ggplot2" is required.') + } + timings = data$profile$to_data_frame() + timings$node = factor(timings$node, levels = unique(timings$node)) + total_timing = max(timings$end) + if (total_timing > 10000000) { + unit = "s" + total_timing = paste0(total_timing/1000000, "s") + timings$start = timings$start / 1000000 + timings$end = timings$end / 1000000 + } else if (total_timing > 10000) { + unit = "ms" + total_timing = paste0(total_timing/1000, "ms") + timings$start = timings$start / 1000 + timings$end = timings$end / 1000 + } else { + unit = "\U00B5s" + total_timing = paste0(total_timing, "\U00B5s") + } + + # for some reason, there's an error if I use rlang::.data directly in aes() + .data = rlang::.data + + plot = ggplot2::ggplot( + timings, + ggplot2::aes(x = .data[["start"]], xend = .data[["end"]], + y = .data[["node"]], yend = .data[["node"]])) + + ggplot2::geom_segment(linewidth = 6) + + ggplot2::xlab( + paste0("Node duration in ", unit, ". Total duration: ", total_timing) + ) + + ggplot2::ylab(NULL) + + ggplot2::theme( + axis.text = ggplot2::element_text(size = 12) + ) + + if (truncate_nodes > 0) { + plot = plot + + ggplot2::scale_y_discrete( + labels = rev(paste0(strtrim(timings$node, truncate_nodes), "...")), + limits = rev + ) + } else { + plot = plot + + ggplot2::scale_y_discrete( + limits = rev + ) + } + + # do not show the plot if we're running testthat + if (!identical(Sys.getenv("TESTTHAT"), "true")) { + print(plot) + } + plot +} From c06d7762d7159dc5d56240596a19c93c81e3a017 Mon Sep 17 00:00:00 2001 From: etiennebacher Date: Tue, 24 Oct 2023 14:55:31 +0200 Subject: [PATCH 13/13] add test --- tests/testthat/test-lazy_profile.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/testthat/test-lazy_profile.R b/tests/testthat/test-lazy_profile.R index 842baf7c1..71cee4dd6 100644 --- a/tests/testthat/test-lazy_profile.R +++ b/tests/testthat/test-lazy_profile.R @@ -35,3 +35,15 @@ test_that("$profile", { p1$result$as_data_frame() ) }) + + +test_that("profile: show_plot returns a plot in the list of outputs", { + skip_if_not_installed("ggplot2") + p1 = pl$LazyFrame(iris)$ + sort("Sepal.Length")$ + group_by("Species", maintain_order = TRUE)$ + agg(pl$col(pl$Float64)$first()$add(5)$suffix("_apply"))$ + profile(show_plot = TRUE) + + expect_length(p1, 3) +})