Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

melt + pivot #232

Merged
merged 12 commits into from
Jun 7, 2023
99 changes: 99 additions & 0 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1270,3 +1270,102 @@ DataFrame_join_asof = function(
tolerance = tolerance
)$collect()
}




#' @inherit LazyFrame_melt
#' @keywords DataFrame
#'
#' @return A new `DataFrame`
#'
#' @examples
#' df = pl$DataFrame(
#' a = c("x", "y", "z"),
#' b = c(1, 3, 5),
#' c = c(2, 4, 6)
#' )
#' df$melt(id_vars = "a", value_vars = c("b", "c"))
DataFrame_melt = function(
    id_vars = NULL,
    value_vars = NULL,
    variable_name = NULL,
    value_name = NULL) {
  # `id_vars` / `value_vars` fall back to `character()` through `%||%`.
  # Note the internal binding expects `value_name` before `variable_name`.
  melted = .pr$DataFrame$melt(
    self,
    id_vars %||% character(),
    value_vars %||% character(),
    value_name,
    variable_name
  )

  # unwrap the returned result, tagging any error with the method name
  unwrap(melted, "in $melt( ): ")
}



#' Create a spreadsheet-style pivot table as a DataFrame.
#' @param values Column values to aggregate. Can be multiple columns if the `columns`
#' argument contains multiple columns as well.
#' @param index One or multiple keys to group by.
#' @param columns Name of the column(s) whose values will be used as the header of the output
#' DataFrame.
#' @param aggregate_function
#' String naming Expr to aggregate with, or an Expr e.g. `pl$element()$sum()`,
#' examples of strings: 'first', 'sum', 'max', 'min', 'mean', 'median', 'last', 'count'
#' @param maintain_order Sort the grouped keys so that the output order is predictable.
#' @param sort_columns Sort the transposed columns by name. Default is by order of discovery.
#' @param separator Used as separator/delimiter in generated column names.
#'
#' @return DataFrame
#' @keywords DataFrame
#' @examples
#' df = pl$DataFrame(
#' foo = c("one", "one", "one", "two", "two", "two"),
#' bar = c("A", "B", "C", "A", "B", "C"),
#' baz = c(1, 2, 3, 4, 5, 6)
#' )
#' df$pivot(
#' values = "baz", index = "foo", columns = "bar", aggregate_function = "first"
#' )
#'
#'
#' # Run an expression as aggregation function
#' df = pl$DataFrame(
#' col1 = c("a", "a", "a", "b", "b", "b"),
#' col2 = c("x", "x", "x", "x", "y", "y"),
#' col3 = c(6, 7, 3, 2, 5, 7)
#' )
#' df$pivot(
#' index = "col1",
#' columns = "col2",
#' values = "col3",
#' aggregate_function = pl$element()$tanh()$mean()
#' )
DataFrame_pivot = function(
    values,
    index,
    columns,
    aggregate_function = NULL,
    maintain_order = TRUE,
    sort_columns = FALSE,
    separator = "_") {
  # Step 1: turn `aggregate_function` into a Result holding an Expr or NULL.
  aggregate_result = pcase(
    # a string is looked up as an Expr method on pl$element() and called;
    # result() captures any error raised on the way
    is_string(aggregate_function),
    result(`$.Expr`(pl$element(), aggregate_function)()),

    # NULL or an Expr is forwarded untouched
    is.null(aggregate_function) || inherits(aggregate_function, "Expr"),
    Ok(aggregate_function),

    # every other kind of input is rejected
    or_else = Err(" is neither a string, NULL or an Expr")
  )

  # Step 2: prefix an error with the parameter name and the offending value.
  with_param_context = map_err(aggregate_result, function(err_msg) {
    paste("param [aggregate_function] being ", str_string(aggregate_function), err_msg)
  })

  # Step 3: the pivot itself only runs when the aggregation input was valid.
  pivot_result = and_then(with_param_context, function(aggregate_expr) {
    .pr$DataFrame$pivot_expr(
      self, values, index, columns, maintain_order, sort_columns, aggregate_expr, separator
    )
  })

  # Step 4: unwrap and attach the method name as error context.
  unwrap(pivot_result, "in $pivot():")
}


6 changes: 6 additions & 0 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ DataFrame$estimated_size <- function() .Call(wrap__DataFrame__estimated_size, se

DataFrame$null_count <- function() .Call(wrap__DataFrame__null_count, self)

# Thin binding: forwards `self` and the arguments, in order, to the native
# routine `wrap__DataFrame__melt` via `.Call()`.
DataFrame$melt <- function(id_vars, value_vars, value_name, variable_name) {
  .Call(
    wrap__DataFrame__melt,
    self, id_vars, value_vars, value_name, variable_name
  )
}

# Thin binding: forwards `self` and the arguments, in order, to the native
# routine `wrap__DataFrame__pivot_expr` via `.Call()`.
DataFrame$pivot_expr <- function(values, index, columns, maintain_order, sort_columns, aggregate_expr, separator) {
  .Call(
    wrap__DataFrame__pivot_expr,
    self, values, index, columns, maintain_order, sort_columns, aggregate_expr, separator
  )
}

# `$` accessor for DataFrame objects: fetches the entry called `name` from the
# `DataFrame` method table, replaces that function's environment with this
# call's evaluation frame (which holds `self` and `name`), and returns it.
# The rebinding is what lets the method bodies refer to `self` as a free variable.
#' @export
`$.DataFrame` <- function (self, name) { func <- DataFrame[[name]]; environment(func) <- environment(); func }

Expand Down Expand Up @@ -887,6 +891,8 @@ LazyFrame$join <- function(other, left_on, right_on, how, suffix, allow_parallel

LazyFrame$sort_by_exprs <- function(by, descending, nulls_last) .Call(wrap__LazyFrame__sort_by_exprs, self, by, descending, nulls_last)

# Thin binding: forwards `self` and the arguments, in order, to the native
# routine `wrap__LazyFrame__melt` via `.Call()`.
LazyFrame$melt <- function(id_vars, value_vars, value_name, variable_name, streamable) {
  .Call(
    wrap__LazyFrame__melt,
    self, id_vars, value_vars, value_name, variable_name, streamable
  )
}

# `$` accessor for LazyFrame objects: fetches the entry called `name` from the
# `LazyFrame` method table, replaces that function's environment with this
# call's evaluation frame (which holds `self` and `name`), and returns it.
# The rebinding is what lets the method bodies refer to `self` as a free variable.
#' @export
`$.LazyFrame` <- function (self, name) { func <- LazyFrame[[name]]; environment(func) <- environment(); func }

Expand Down
50 changes: 50 additions & 0 deletions R/lazyframe__lazy.R
Original file line number Diff line number Diff line change
Expand Up @@ -752,3 +752,53 @@ LazyFrame_join_asof = function(
) |>
unwrap("in join_asof( ):")
}


#' Unpivot a Frame from wide to long format
#'
#' @param id_vars char vec, columns to use as identifier variables.
#' @param value_vars char vec, columns to use as value (measured) variables, i.e. the columns to unpivot.
#' If `value_vars` is empty all columns that are not in `id_vars` will be used.
#' @param variable_name string, Name to give to the `variable` column. Defaults to "variable"
#' @param value_name string, Name to give to the `value` column. Defaults to "value"
#' @param ... not used; forces `streamable` to be passed as a named argument
#' @param streamable Allow this node to run in the streaming engine.
#' If this runs in streaming, the output of the melt operation
#' will not have a stable ordering.
#'
#' @details
#' Optionally leaves identifiers set.
#'
#' This function is useful to massage a DataFrame into a format where one or more
#' columns are identifier variables (id_vars), while all other columns, considered
#' measured variables (value_vars), are "unpivoted" to the row axis, leaving just
#' two non-identifier columns, 'variable' and 'value'.
#'
#' @keywords LazyFrame
#'
#' @return A new `LazyFrame`
#'
#' @examples
#' lf = pl$DataFrame(
#' a = c("x", "y", "z"),
#' b = c(1, 3, 5),
#' c = c(2, 4, 6)
#' )$lazy()
#' lf$melt(id_vars = "a", value_vars = c("b", "c"))$collect()
#'
LazyFrame_melt = function(
    id_vars = NULL,
    value_vars = NULL,
    variable_name = NULL,
    value_name = NULL,
    ...,
    streamable = TRUE) {
  # `...` is accepted but never forwarded; only the named arguments reach the binding.
  # `id_vars` / `value_vars` fall back to `character()` through `%||%`.
  # Note the internal binding expects `value_name` before `variable_name`.
  melted = .pr$LazyFrame$melt(
    self,
    id_vars %||% character(),
    value_vars %||% character(),
    value_name,
    variable_name,
    streamable
  )

  # unwrap the returned result, tagging any error with the method name
  unwrap(melted, "in $melt( ): ")
}




46 changes: 46 additions & 0 deletions man/DataFrame_melt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

65 changes: 65 additions & 0 deletions man/DataFrame_pivot.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

55 changes: 55 additions & 0 deletions man/LazyFrame_melt.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions src/rust/src/conversion.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
use smartstring::alias::String as SmartString;
pub(crate) fn strings_to_smartstrings<I, S>(container: I) -> Vec<SmartString>
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
container.into_iter().map(|s| s.as_ref().into()).collect()
}
20 changes: 20 additions & 0 deletions src/rust/src/lazy/dataframe.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::concurrent::{handle_thread_r_requests, PolarsBackgroundHandle};
use crate::conversion::strings_to_smartstrings;
use crate::lazy::dsl::*;
use crate::rdatatype::new_asof_strategy;
use crate::rdatatype::new_join_type;
Expand All @@ -9,6 +10,7 @@ use crate::utils::wrappers::null_to_opt;
use crate::utils::{r_result_list, try_f64_into_usize};
use extendr_api::prelude::*;
use polars::chunked_array::object::AsOfOptions;
use polars::frame::explode::MeltArgs;
use polars::frame::hash_join::JoinType;
use polars::prelude as pl;

Expand Down Expand Up @@ -335,6 +337,24 @@ impl LazyFrame {
let nulls_last = robj_to!(bool, nulls_last)?;
Ok(ldf.sort_by_exprs(exprs, descending, nulls_last).into())
}

// Builds `MeltArgs` from the R-side arguments and applies `melt` to a clone of
// the wrapped lazy frame. Any argument that fails conversion short-circuits
// with its error through `?`.
fn melt(
    &self,
    id_vars: Robj,
    value_vars: Robj,
    value_name: Robj,
    variable_name: Robj,
    streamable: Robj,
) -> Result<Self, String> {
    // Conversions run in the same order as before, so the first failing
    // argument is still the one that gets reported.
    let id_vars = strings_to_smartstrings(robj_to!(Vec, String, id_vars)?);
    let value_vars = strings_to_smartstrings(robj_to!(Vec, String, value_vars)?);
    let value_name = robj_to!(Option, String, value_name)?.map(Into::into);
    let variable_name = robj_to!(Option, String, variable_name)?.map(Into::into);
    let streamable = robj_to!(bool, streamable)?;

    let melt_args = MeltArgs {
        id_vars,
        value_vars,
        value_name,
        variable_name,
        streamable,
    };

    // `self` is only borrowed, so work on a clone of the inner frame.
    let frame = self.0.clone();
    Ok(frame.melt(melt_args).into())
}
}

#[derive(Clone)]
Expand Down
Loading