Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement <DataFrame>$transpose() #440

Merged
merged 10 commits into from
Oct 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
- Method `$profile()` gains optimization arguments and plot-related arguments (#429).
- New method `pl$read_parquet()` that is a shortcut for `pl$scan_parquet()$collect()` (#434).
- Rename `$str$str_explode()` to `$str$explode()` (#436).
- New method `$transpose()` for `DataFrame` (#440).
- New argument `eager` of `LazyFrame$set_optimization_toggle()` (#439).

# polars 0.8.1
Expand Down
36 changes: 36 additions & 0 deletions R/dataframe__frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1682,6 +1682,42 @@ DataFrame_sample = function(
}


#' Transpose a DataFrame over the diagonal.
#'
#' @param include_header If `TRUE`, the column names will be added as first column.
#' Any value other than a single `TRUE` is treated as `FALSE`.
#' @param header_name If `include_header` is `TRUE`, this determines the name of the column
#' that will be inserted.
#' @param column_names Character vector indicating the new column names. If `NULL` (default),
#' the columns will be named as "column_0", "column_1", etc. The length of this vector must match
#' the number of rows of the original input.
#'
#' @details
#' This is a very expensive operation.
#'
#' Transpose may be the fastest option to perform non foldable (see `fold()` or `reduce()`)
#' row operations like median.
#'
#' Polars transpose is currently eager only, likely because it is not trivial to deduce the schema.
#'
#' @keywords DataFrame
#' @return DataFrame
#' @examples
#'
#' # simple use-case
#' pl$DataFrame(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars))
#'
#' # All rows must have one shared supertype, recast Categorical to Utf8 which is a supertype
#' # of f64, and then dataset "Iris" can be transposed
#' pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$Utf8))$transpose()
#'
DataFrame_transpose = function(
    include_header = FALSE,
    header_name = "column",
    column_names = NULL) {
  # The Rust side takes a single optional name: NULL means "no header column",
  # a string means "add a header column with this name".
  keep_names_as = if (isTRUE(include_header)) header_name else NULL
  .pr$DataFrame$transpose(self, keep_names_as, column_names) |>
    unwrap("in $transpose():")
}

#' Write to comma-separated values (CSV) file
#'
Expand Down
2 changes: 2 additions & 0 deletions R/extendr-wrappers.R
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ DataFrame$sample_n <- function(n, with_replacement, shuffle, seed) .Call(wrap__D

# Thin wrapper: forwards `self` plus its four arguments to the compiled routine
# `wrap__DataFrame__sample_frac` via `.Call()`. `self` is not a parameter here;
# NOTE(review): presumably it is bound by the method-dispatch environment - confirm.
DataFrame$sample_frac <- function(frac, with_replacement, shuffle, seed) .Call(wrap__DataFrame__sample_frac, self, frac, with_replacement, shuffle, seed)

# Thin wrapper: forwards `self`, `keep_names_as` and `new_col_names` to the compiled
# routine `wrap__DataFrame__transpose` via `.Call()`.
DataFrame$transpose <- function(keep_names_as, new_col_names) .Call(wrap__DataFrame__transpose, self, keep_names_as, new_col_names)

# Thin wrapper: forwards `self` and all CSV-writing options, in the order listed,
# to the compiled routine `wrap__DataFrame__write_csv` via `.Call()`.
DataFrame$write_csv <- function(path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style) .Call(wrap__DataFrame__write_csv, self, path, has_header, separator, line_terminator, quote, batch_size, datetime_format, date_format, time_format, float_precision, null_value, quote_style)

#' @export
Expand Down
47 changes: 47 additions & 0 deletions man/DataFrame_transpose.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/rust/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ state = "0.6.0"
thiserror = "1.0.40"
polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false }
polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "7f8cd7dbd6bc09a21a99c13020fbac8cfdd4aa90", default-features = false }
either = "1"
#features copied from node-polars

[dependencies.polars]
Expand Down
2 changes: 1 addition & 1 deletion src/rust/src/lazy/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ impl LazyFrame {
comm_subplan_elim,
comm_subexpr_elim,
streaming,
fast_projection,
fast_projection: _,
eager,
} = self.0.get_current_optimizations();
list!(
Expand Down
12 changes: 11 additions & 1 deletion src/rust/src/rdataframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::rdatatype;
use crate::rdatatype::RPolarsDataType;
use crate::robj_to;
use crate::rpolarserr::*;

use either::Either;
pub use lazy::dataframe::*;

use crate::conversion_s_to_r::pl_series_to_list;
Expand Down Expand Up @@ -443,6 +443,16 @@ impl DataFrame {
.map(DataFrame)
}

/// Transpose the wrapped polars `DataFrame` and return the result as a new `DataFrame`.
///
/// * `keep_names_as` - R value read as an optional string and handed to polars as its
///   first `transpose` argument.
/// * `new_col_names` - R value read as an optional vector of strings; when present it is
///   handed to polars as the explicit list of new column names.
///
/// # Errors
/// Returns an error if either argument cannot be converted from its R value, or if the
/// underlying polars `transpose` call fails.
pub fn transpose(&self, keep_names_as: Robj, new_col_names: Robj) -> RResult<Self> {
    let opt_s = robj_to!(Option, str, keep_names_as)?;
    let opt_vec_s = robj_to!(Option, Vec, String, new_col_names)?;
    // polars takes the names as an `Either`; an explicit vector of names goes in `Right`.
    let opt_either_vec_s = opt_vec_s.map(Either::Right);
    self.0
        .transpose(opt_s, opt_either_vec_s)
        .map_err(polars_to_rpolars_err)
        .map(DataFrame)
}

pub fn write_csv(
&self,
path: Robj,
Expand Down
62 changes: 62 additions & 0 deletions tests/testthat/test-dataframe.R
Original file line number Diff line number Diff line change
Expand Up @@ -1128,3 +1128,65 @@ test_that("sample", {
df$sample(fraction = 0.1, seed = "123")$to_data_frame()
)
})

test_that("transpose", {
  # Base-R reference implementation that mimics polars transpose.
  # Row names of the transposed frame are dropped; when `include_header` is TRUE
  # they are first copied into a leading column named `header_name`.
  R_t_df = \(df, include_header = FALSE, header_name = "column") {
    tdf = as.data.frame(t(df))

    if (include_header) {
      header_name_df = data.frame(column = rownames(tdf))
      colnames(header_name_df) = header_name
      tdf = cbind(header_name_df, tdf)
    }
    rownames(tdf) = NULL
    tdf
  }


  # include_header + custom header column name + column names
  expect_identical(
    pl$DataFrame(mtcars)$
      transpose(include_header = TRUE, header_name = "alice", column_names = rownames(mtcars))$
      to_data_frame(),
    R_t_df(mtcars, include_header = TRUE, header_name = "alice")
  )

  # same but default column name
  expect_identical(
    pl$DataFrame(mtcars)$
      transpose(include_header = TRUE, column_names = rownames(mtcars))$
      to_data_frame(),
    R_t_df(mtcars, include_header = TRUE)
  )

  # no header column
  expect_identical(
    pl$DataFrame(mtcars)$
      transpose(include_header = FALSE, column_names = rownames(mtcars))$
      to_data_frame(),
    R_t_df(mtcars, include_header = FALSE)
  )

  # use default column names (polars numbers them from 0: column_0, column_1, ...)
  df_expected = R_t_df(mtcars, include_header = FALSE)
  colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L)
  expect_identical(
    pl$DataFrame(mtcars)$
      transpose(include_header = FALSE, column_names = NULL)$
      to_data_frame(),
    df_expected
  )

  # transpose mixed types with a shared supertype
  df_expected = R_t_df(iris, include_header = FALSE)
  colnames(df_expected) = paste0("column_", seq_len(ncol(df_expected)) - 1L)
  expect_identical(
    pl$DataFrame(iris)$
      with_columns(pl$col("Species")$
      cast(pl$Utf8))$
      transpose(FALSE)$
      to_data_frame(),
    df_expected
  )
})