diff --git a/DESCRIPTION b/DESCRIPTION index 65e1531c8..88ab9c6ba 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -113,5 +113,5 @@ Collate: 'zzz.R' Config/rextendr/version: 0.3.1 VignetteBuilder: knitr -Config/polars/LibVersion: 0.35.2 +Config/polars/LibVersion: 0.36.0 Config/polars/RustToolchainVersion: nightly-2023-12-23 diff --git a/NEWS.md b/NEWS.md index d7157507d..bdcfceda2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,17 +2,25 @@ ## polars (development version) +### Rust-polars update + +- rust-polars is updated to 0.36.2 (#659). Most of the changes were covered + in 0.12.0. The main change is that `pl$Utf8` is replaced by `pl$String`. + `pl$Utf8` is an alias and will keep working, but `pl$String` is now preferred + in the documentation and in new code. + ### What's changed - New methods `$str$reverse()`, `$str$contains_any()`, and `$str$replace_many()` (#641). - New methods `$rle()` and `$rle_id()` (#648). - New functions `is_polars_df()`, `is_polars_lf()`, `is_polars_series()` (#658). +- `$gather()` now accepts negative indexing (#659). ### Miscellaneous -- Remeve the `Makefile` in favor of `Taskfile.yml`. - Please use `task` instaed of `make` as a task runner (#654). +- Remove the `Makefile` in favor of `Taskfile.yml`. + Please use `task` instead of `make` as a task runner (#654). ## polars 0.12.0 diff --git a/R/Field.R b/R/Field.R index 5c2b9d639..62080223e 100644 --- a/R/Field.R +++ b/R/Field.R @@ -13,7 +13,7 @@ #' @return A object of with DataType `"RField"` containing its name and its #' DataType. 
#' @examples -#' pl$Field("city_names", pl$Utf8) +#' pl$Field("city_names", pl$String) pl_Field = function(name, datatype) { .pr$RField$new(name, datatype) } @@ -71,7 +71,7 @@ RField.property_setters = new.env(parent = emptyenv()) #' #' @rdname RField_name #' @examples -#' field = pl$Field("Cities", pl$Utf8) +#' field = pl$Field("Cities", pl$String) #' field$name #' #' field$name = "CityPoPulations" #<- is fine too @@ -90,7 +90,7 @@ RField.property_setters$name = function(self, value) { #' #' @keywords DataFrame #' @examples -#' field = pl$Field("Cities", pl$Utf8) +#' field = pl$Field("Cities", pl$String) #' field$datatype #' #' field$datatype = pl$Categorical #<- is fine too diff --git a/R/convert.R b/R/convert.R index 3a4136ad2..2e0c7c68d 100644 --- a/R/convert.R +++ b/R/convert.R @@ -15,7 +15,7 @@ #' @examples #' pl$from_arrow( #' data = arrow::arrow_table(iris), -#' schema_overrides = list(Sepal.Length = pl$Float32, Species = pl$Utf8) +#' schema_overrides = list(Sepal.Length = pl$Float32, Species = pl$String) #' ) #' #' char_schema = names(iris) diff --git a/R/csv.R b/R/csv.R index 3ff539943..0b444e098 100644 --- a/R/csv.R +++ b/R/csv.R @@ -25,7 +25,7 @@ #' * "Float64" or "float64" for DataType::Float64, #' * "Int32" or "integer" for DataType::Int32, #' * "Int64" or "integer64" for DataType::Int64, -#' * "Utf8" or "character" for DataType::Utf8, +#' * "String" or "character" for DataType::String, #' @param null_values Values to interpret as `NA` values. Can be: #' * a character vector: all values that match one of the values in this vector #' will be `NA`; @@ -55,7 +55,7 @@ #' the name is set). #' @param try_parse_dates Try to automatically parse dates. Most ISO8601-like #' formats can be inferred, as well as a handful of others. If this does not -#' succeed, the column remains of data type `pl$Utf8`. +#' succeed, the column remains of data type `pl$String`. #' @param eol_char Single byte end of line character (default: `\n`). 
When #' encountering a file with Windows line endings (`\r\n`), one can go with the #' default `\n`. The extra `\r` will be removed when processed. diff --git a/R/dataframe__frame.R b/R/dataframe__frame.R index d826173dd..fc431e6db 100644 --- a/R/dataframe__frame.R +++ b/R/dataframe__frame.R @@ -141,7 +141,7 @@ NULL #' pl$DataFrame(mtcars) #' #' # custom schema -#' pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8)) +#' pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$String)) pl_DataFrame = function(..., make_names_unique = TRUE, schema = NULL) { uw = \(res) unwrap(res, "in $DataFrame():") @@ -1669,9 +1669,9 @@ DataFrame_sample = function( #' # simple use-case #' pl$DataFrame(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars)) #' -#' # All rows must have one shared supertype, recast Categorical to Utf8 which is a supertype +#' # All rows must have one shared supertype, recast Categorical to String which is a supertype #' # of f64, and then dataset "Iris" can be transposed -#' pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$Utf8))$transpose() +#' pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$String))$transpose() #' DataFrame_transpose = function( include_header = FALSE, diff --git a/R/datatype.R b/R/datatype.R index 48b6e2a31..9416bd767 100644 --- a/R/datatype.R +++ b/R/datatype.R @@ -48,15 +48,15 @@ wrap_proto_schema = function(x) { #' @examples #' print(ls(pl$dtypes)) #' pl$dtypes$Float64 -#' pl$dtypes$Utf8 +#' pl$dtypes$String #' #' pl$List(pl$List(pl$UInt64)) #' -#' pl$Struct(pl$Field("CityNames", pl$Utf8)) +#' pl$Struct(pl$Field("CityNames", pl$String)) #' -#' # The function changes type from Integer(Int32)[Integers] to char(Utf8)[Strings] -#' # specifying the output DataType: Utf8 solves the problem -#' pl$Series(1:4)$map_elements(\(x) letters[x], datatype = pl$dtypes$Utf8) +#' # The function changes type from Int32 to String +#' # Specifying the output DataType: 
String solves the problem +#' pl$Series(1:4)$map_elements(\(x) letters[x], datatype = pl$dtypes$String) #' NULL diff --git a/R/expr__expr.R b/R/expr__expr.R index 9e06054d3..6741762dc 100644 --- a/R/expr__expr.R +++ b/R/expr__expr.R @@ -672,7 +672,7 @@ construct_ProtoExprArray = function(...) { #' select( #' pl$col("Sepal.Length")$map_batches(\(x) { #' paste("cheese", as.character(x$to_vector())) -#' }, pl$dtypes$Utf8) +#' }, pl$dtypes$String) #' ) #' #' # R parallel process example, use Sys.sleep() to imitate some CPU expensive @@ -799,7 +799,7 @@ Expr_map = function(f, output_type = NULL, agg_list = FALSE, in_background = FAL #' #' e_letter = my_selection$map_elements(\(x) { #' letters[ceiling(x)] -#' }, return_type = pl$dtypes$Utf8)$name$suffix("_letter") +#' }, return_type = pl$dtypes$String)$name$suffix("_letter") #' pl$DataFrame(iris)$select(e_add10, e_letter) #' #' @@ -1575,10 +1575,12 @@ Expr_sort_by = function(by, descending = FALSE) { #' Gather values by index #' #' @param indices R scalar/vector or Series, or Expr that leads to a Series of -#' dtype UInt32. +#' dtype Int64. (0-indexed) #' @return Expr #' @examples -#' pl$DataFrame(a = c(1, 2, 4, 5, 8))$select(pl$col("a")$gather(c(0, 2, 4))) +#' df = pl$DataFrame(a = 1:10) +#' +#' df$select(pl$col("a")$gather(c(0, 2, 4, -1))) Expr_gather = function(indices) { .pr$Expr$gather(self, pl$lit(indices)) |> unwrap("in $gather():") @@ -2034,7 +2036,7 @@ Expr_filter = function(predicate) { Expr_where = Expr_filter -#' Explode a list or Utf8 Series +#' Explode a list or String Series #' #' This means that every item is expanded to a new row. #' diff --git a/R/expr__list.R b/R/expr__list.R index 03aa3328c..54346dd53 100644 --- a/R/expr__list.R +++ b/R/expr__list.R @@ -231,11 +231,11 @@ ExprList_contains = function(item) .pr$Expr$list_contains(self, wrap_e(item)) #' #' @description #' Join all string items in a sublist and place a separator between them. -#' This errors if inner type of list `!= Utf8`. 
+#' This errors if inner type of list `!= String`. #' @param separator String to separate the items with. Can be an Expr. #' @keywords ExprList #' @format function -#' @return Series of dtype Utf8 +#' @return Series of dtype String #' @aliases list_join #' @examples #' df = pl$DataFrame(list(s = list(c("a", "b", "c"), c("x", "y")))) diff --git a/R/expr__string.R b/R/expr__string.R index c7330dd07..860437682 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -4,7 +4,7 @@ # expr_str_make_sub_ns = macro_new_subnamespace("^ExprStr_", "RPolarsExprStrNameSpace") -#' Convert a Utf8 column into a Date/Datetime/Time column. +#' Convert a String column into a Date/Datetime/Time column. #' #' #' @param datatype The data type to convert into. Can be either Date, Datetime, @@ -96,7 +96,7 @@ ExprStr_strptime = function( unwrap("in str$strptime():") } -#' Convert a Utf8 column into a Date column +#' Convert a String column into a Date column #' #' @param format Format to use for conversion. See `?strptime` for possible #' values. Example: "%Y-%m-%d". If `NULL` (default), the format is @@ -120,7 +120,7 @@ ExprStr_to_date = function(format = NULL, strict = TRUE, exact = TRUE, cache = T unwrap("in $str$to_date():") } -#' Convert a Utf8 column into a Time column +#' Convert a String column into a Time column #' #' @param format Format to use for conversion. See `?strptime` for possible #' values. Example: "%H:%M:%S". If `NULL` (default), the format is @@ -142,7 +142,7 @@ ExprStr_to_time = function(format = NULL, strict = TRUE, cache = TRUE) { unwrap("in $str$to_time():") } -#' Convert a Utf8 column into a Datetime column +#' Convert a String column into a Datetime column #' #' @param format Format to use for conversion. See `?strptime` for possible #' values. Example: "%Y-%m-%d %H:%M:%S". If `NULL` (default), the format is @@ -223,7 +223,7 @@ ExprStr_len_chars = function() { #' @param ignore_nulls Ignore null values. 
If `FALSE`, null values will be #' propagated: if the column contains any null values, the output is null. #' @keywords ExprStr -#' @return Expr of Utf8 concatenated +#' @return Expr of String concatenated #' @examples #' # concatenate a Series of strings to a single string #' df = pl$DataFrame(foo = c("1", NA, 2)) @@ -242,7 +242,7 @@ ExprStr_concat = function(delimiter = "-", ignore_nulls = TRUE) { #' #' @description Transform to uppercase variant. #' @keywords ExprStr -#' @return Expr of Utf8 uppercase chars +#' @return Expr of String uppercase chars #' @examples #' pl$lit(c("A", "b", "c", "1", NA))$str$to_uppercase()$to_series() ExprStr_to_uppercase = function() { @@ -253,7 +253,7 @@ ExprStr_to_uppercase = function() { #' #' @description Transform to lowercase variant. #' @keywords ExprStr -#' @return Expr of Utf8 lowercase chars +#' @return Expr of String lowercase chars #' @examples #' pl$lit(c("A", "b", "c", "1", NA))$str$to_lowercase()$to_series() ExprStr_to_lowercase = function() { @@ -264,7 +264,7 @@ ExprStr_to_lowercase = function() { #' #' @description Transform to titlecase variant. #' @keywords ExprStr -#' @return Expr of Utf8 titlecase chars +#' @return Expr of String titlecase chars #' @details #' This method is only available with the feature flag "simd" which can #' be set via envvar "RPOLARS_FULL_FEATURES" and it requires @@ -296,7 +296,7 @@ ExprStr_to_titlecase = function() { #' `strip_chars()` removes characters at the beginning and the end of the string. #' Use `strip_chars_start()` and `strip_chars_end()` to remove characters only #' from left and right respectively. -#' @return Expr of Utf8 lowercase chars +#' @return Expr of String lowercase chars #' @examples #' df = pl$DataFrame(foo = c(" hello", "\tworld")) #' df$select(pl$col("foo")$str$strip_chars()) @@ -321,7 +321,7 @@ ExprStr_strip_chars = function(matches = NULL) { #' `strip_chars_start()` removes characters at the beginning of the string only. 
#' Use `strip_chars()` and `strip_chars_end()` to remove characters from the left #' and right or only from the right respectively. -#' @return Expr of Utf8 lowercase chars +#' @return Expr of String lowercase chars #' @examples #' df = pl$DataFrame(foo = c(" hello", "\tworld")) #' df$select(pl$col("foo")$str$strip_chars_start(" hel rld")) @@ -345,7 +345,7 @@ ExprStr_strip_chars_start = function(matches = NULL) { #' `strip_chars_end()` removes characters at the end of the string only. #' Use `strip_chars()` and `strip_chars_start()` to remove characters from the left #' and right or only from the left respectively. -#' @return Expr of Utf8 lowercase chars +#' @return Expr of String lowercase chars #' @examples #' df = pl$DataFrame(foo = c(" hello", "\tworld")) #' df$select(pl$col("foo")$str$strip_chars_end(" hel\trld")) @@ -375,12 +375,12 @@ ExprStr_strip_chars_end = function(matches = NULL) { #' @examples #' some_floats_expr = pl$lit(c(0, 10, -5, 5)) #' -#' # cast to Utf8 and ljust alignment = 5, and view as R char vector -#' some_floats_expr$cast(pl$Utf8)$str$zfill(5)$to_r() +#' # cast to String and ljust alignment = 5, and view as R char vector +#' some_floats_expr$cast(pl$String)$str$zfill(5)$to_r() #' #' # cast to int and the to utf8 and then ljust alignment = 5, and view as R #' # char vector -#' some_floats_expr$cast(pl$Int64)$cast(pl$Utf8)$str$zfill(5)$to_r() +#' some_floats_expr$cast(pl$Int64)$cast(pl$String)$str$zfill(5)$to_r() ExprStr_zfill = function(alignment) { .pr$Expr$str_zfill(self, alignment) |> unwrap("in str$zfill():") @@ -395,7 +395,7 @@ ExprStr_zfill = function(alignment) { #' @param fillchar Fill with this ASCII character. #' @details Padding is done using the specified `fillchar`. The original string #' is returned if `width` is less than or equal to `len(s)`. 
-#' @return Expr of Utf8 +#' @return Expr of String #' @examples #' df = pl$DataFrame(a = c("cow", "monkey", NA, "hippopotamus")) #' df$select(pl$col("a")$str$pad_end(8, "*")) @@ -508,10 +508,10 @@ ExprStr_json_decode = function(dtype, infer_schema_length = 100) { #' @param json_path A valid JSON path query string. #' @details #' Throw errors if encounter invalid JSON strings. All return value will be -#' cast to Utf8 regardless of the original value. +#' cast to String regardless of the original value. #' #' Documentation on JSONPath standard can be found here: . -#' @return Utf8 array. Contain null if original value is null or the json_path +#' @return String array. Contain null if original value is null or the json_path #' return nothing. #' @examples #' df = pl$DataFrame( @@ -532,7 +532,7 @@ ExprStr_json_path_match = function(json_path) { #' @param strict If `TRUE` (default), raise an error if the underlying value #' cannot be decoded. Otherwise, replace it with a null value. #' -#' @return Utf8 array with values decoded using provided encoding +#' @return String array with values decoded using provided encoding #' #' @examples #' df = pl$DataFrame(strings = c("foo", "bar", NA)) @@ -541,14 +541,16 @@ ExprStr_json_path_match = function(json_path) { #' pl$col("strings")$str$encode("base64")$alias("base64"), # notice DataType is not encoded #' pl$col("strings")$str$encode("hex")$alias("hex") # ... 
and must restored with cast #' )$with_columns( -#' pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$Utf8), -#' pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$Utf8) +#' pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$String), +#' pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$String) #' ) ExprStr_decode = function(encoding, ..., strict = TRUE) { + uw = \(res) unwrap(res, "in $str$decode():") + pcase( !is_string(encoding), stop("encoding must be a string, it was: ", encoding), - encoding == "hex", .pr$Expr$str_hex_decode(self, strict), - encoding == "base64", .pr$Expr$str_base64_decode(self, strict), + encoding == "hex", uw(.pr$Expr$str_hex_decode(self, strict)), + encoding == "base64", uw(.pr$Expr$str_base64_decode(self, strict)), or_else = stop("encoding must be one of 'hex' or 'base64', got ", encoding) ) } @@ -557,7 +559,7 @@ ExprStr_decode = function(encoding, ..., strict = TRUE) { #' #' @keywords ExprStr #' @param encoding Either 'hex' or 'base64'. -#' @return Utf8 array with values encoded using provided encoding +#' @return String array with values encoded using provided encoding #' #' @examples #' df = pl$DataFrame(strings = c("foo", "bar", NA)) @@ -566,14 +568,16 @@ ExprStr_decode = function(encoding, ..., strict = TRUE) { #' pl$col("strings")$str$encode("base64")$alias("base64"), # notice DataType is not encoded #' pl$col("strings")$str$encode("hex")$alias("hex") # ... 
and must restored with cast #' )$with_columns( -#' pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$Utf8), -#' pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$Utf8) +#' pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$String), +#' pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$String) #' ) ExprStr_encode = function(encoding) { + uw = \(res) unwrap(res, "in $str$encode():") + pcase( !is_string(encoding), stop("encoding must be a string, it was: ", encoding), - encoding == "hex", .pr$Expr$str_hex_encode(self), - encoding == "base64", .pr$Expr$str_base64_encode(self), + encoding == "hex", uw(.pr$Expr$str_hex_encode(self)), + encoding == "base64", uw(.pr$Expr$str_base64_encode(self)), or_else = stop("encoding must be one of 'hex' or 'base64', got ", encoding) ) } @@ -587,7 +591,7 @@ ExprStr_encode = function(encoding) { #' pattern, first group begin at index 1 (default). #' #' @return -#' Utf8 array. Contains null if original value is null or regex capture nothing. +#' String array. Contains null if original value is null or regex capture nothing. #' #' @examples #' df = pl$DataFrame( @@ -614,7 +618,7 @@ ExprStr_extract = function(pattern, group_index) { #' @param pattern A valid regex pattern #' #' @return -#' `List[Utf8]` array. Contain null if original value is null or regex capture +#' `List[String]` array. Contain null if original value is null or regex capture #' nothing. #' #' @examples @@ -661,7 +665,7 @@ ExprStr_count_matches = function(pattern, literal = FALSE) { #' @param inclusive If `TRUE`, include the split character/string in the results. #' #' @return -#' List of Utf8 type +#' List of String type #' #' @examples #' df = pl$DataFrame(s = c("foo bar", "foo-bar", "foo bar baz")) @@ -690,7 +694,7 @@ ExprStr_split = function(by, inclusive = FALSE) { #' @param n Number of splits to make. #' @param inclusive If `TRUE`, include the split character/string in the results. 
#' -#' @return Struct where each of n+1 fields is of Utf8 type +#' @return Struct where each of n+1 fields is of String type #' #' @examples #' df = pl$DataFrame(s = c("a_1", NA, "c", "d_4")) @@ -714,7 +718,7 @@ ExprStr_split_exact = function(by, n, inclusive = FALSE) { #' @param n Number of splits to make. #' #' @return -#' Struct where each of `n` fields is of Utf8 type +#' Struct where each of `n` fields is of String type #' #' @examples #' df = pl$DataFrame(s = c("a_1", NA, "c", "d_4")) @@ -733,7 +737,7 @@ ExprStr_splitn = function(by, n) { #' @param value Replacement, can be an Expr. #' @param literal Treat pattern as a literal string. #' -#' @return Expr of Utf8 Series +#' @return Expr of String Series #' #' @seealso `$str$replace_all()`: Replace all matching regex/literal substrings. #' @@ -756,7 +760,7 @@ ExprStr_replace = function(pattern, value, literal = FALSE) { #' @param value Replacement, can be an Expr. #' @param literal Treat pattern as a literal string. #' -#' @return Expr of Utf8 Series +#' @return Expr of String Series #' #' @seealso `$str$replace()`: Replace first matching regex/literal substring. #' @@ -771,14 +775,14 @@ ExprStr_replace_all = function(pattern, value, literal = FALSE) { } -#' Create subslices of the string values of a Utf8 Series +#' Create subslices of the string values of a String Series #' #' @keywords ExprStr #' @param offset Start index. Negative indexing is supported. #' @param length Length of the slice. If `NULL` (default), the slice is taken to #' the end of the string. #' -#' @return Expr: Series of dtype Utf8. +#' @return Expr: Series of dtype String. #' #' @examples #' df = pl$DataFrame(s = c("pear", NA, "papaya", "dragonfruit")) @@ -793,7 +797,7 @@ ExprStr_slice = function(offset, length = NULL) { #' Returns a column with a separate row for every string character #' #' @keywords ExprStr -#' @return Expr: Series of dtype Utf8. +#' @return Expr: Series of dtype String. 
#' @examples #' df = pl$DataFrame(a = c("foo", "bar")) #' df$select(pl$col("a")$str$explode()) diff --git a/R/functions__lazy.R b/R/functions__lazy.R index 9c1f1d06c..21479a610 100644 --- a/R/functions__lazy.R +++ b/R/functions__lazy.R @@ -62,7 +62,7 @@ pl_all = function(name = NULL) { #' df$select(pl$col(pl$dtypes$Float64)) #' #' # ... or an R list of DataTypes, select any column of any such DataType -#' df$select(pl$col(list(pl$dtypes$Float64, pl$dtypes$Utf8))) +#' df$select(pl$col(list(pl$dtypes$Float64, pl$dtypes$String))) #' #' # from Series of names #' df$select(pl$col(pl$Series(c("bar", "foobar")))) @@ -691,13 +691,13 @@ pl_concat_list = function(exprs) { #' # wrap two columns in a struct and provide a schema to set all or some DataTypes by name #' e1 = pl$struct( #' pl$col(c("int", "str")), -#' schema = list(int = pl$Int64, str = pl$Utf8) +#' schema = list(int = pl$Int64, str = pl$String) #' )$alias("my_struct") #' # same result as e.g. wrapping the columns in a struct and casting afterwards #' e2 = pl$struct( #' list(pl$col("int"), pl$col("str")) #' )$cast( -#' pl$Struct(int = pl$Int64, str = pl$Utf8) +#' pl$Struct(int = pl$Int64, str = pl$String) #' )$alias("my_struct") #' #' df = pl$DataFrame( @@ -741,7 +741,7 @@ pl_struct = function( #' #' @param ... Columns to concatenate into a single string column. Accepts #' expressions. Strings are parsed as column names, other non-expression inputs -#' are parsed as literals. Non-Utf8 columns are cast to Utf8. +#' are parsed as literals. Non-String columns are cast to String. #' @param separator String that will be used to separate the values of each #' column. 
#' @return Expr diff --git a/R/lazyframe__lazy.R b/R/lazyframe__lazy.R index 621f20706..bc95f678e 100644 --- a/R/lazyframe__lazy.R +++ b/R/lazyframe__lazy.R @@ -144,7 +144,7 @@ NULL #' # custom schema #' pl$LazyFrame( #' iris, -#' schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8) +#' schema = list(Sepal.Length = pl$Float32, Species = pl$String) #' )$collect() pl_LazyFrame = function(...) { pl$DataFrame(...)$lazy() @@ -1502,7 +1502,7 @@ LazyFrame_profile = function( #' `"name"` is implicitly converted to `pl$col("name")`. #' #' @details -#' Only columns of DataType `List` or `Utf8` can be exploded. +#' Only columns of DataType `List` or `String` can be exploded. #' #' Named expressions like `$explode(a = pl$col("b"))` will not implicitly trigger #' `$alias("a")` here, due to only variant `Expr::Column` is supported in diff --git a/R/series__series.R b/R/series__series.R index 8b3bff913..e26f61f4e 100644 --- a/R/series__series.R +++ b/R/series__series.R @@ -251,11 +251,10 @@ Series_shape = method_as_property(function() { #' #' @examples #' -#' # make polars Series_Utf8 #' series_vec = pl$Series(letters[1:3]) #' #' # Series_non_list -#' series_vec$to_r() # as vector because Series DataType is not list (is Utf8) +#' series_vec$to_r() # as vector because Series DataType is not list (is String) #' series_vec$to_r_list() # implicit call as.list(), convert to list #' series_vec$to_vector() # implicit call unlist(), same as to_r() as already vector #' @@ -354,7 +353,7 @@ Series_value_counts = function(sort = TRUE, parallel = FALSE) { #' @examples #' s = pl$Series(letters[1:5], "ltrs") #' f = \(x) paste(x, ":", as.integer(charToRaw(x))) -#' s$map_elements(f, pl$Utf8) +#' s$map_elements(f, pl$String) #' #' # same as #' pl$Series(sapply(s$to_r(), f), s$name) diff --git a/R/zzz.R b/R/zzz.R index 388d116e8..e81a8ad04 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -111,7 +111,7 @@ move_env_elements(RPolarsExpr, pl, c("lit"), remove = FALSE) .onLoad = function(libname, pkgname) { 
# instanciate one of each DataType (it's just an enum) - all_types = .pr$DataType$get_all_simple_type_names() + all_types = c(.pr$DataType$get_all_simple_type_names(), "Utf8") # Allow "Utf8" as an alias of "String" names(all_types) = all_types pl$dtypes = c( lapply(all_types, DataType_new), # instanciate all simple flag-like types diff --git a/man/DataFrame_transpose.Rd b/man/DataFrame_transpose.Rd index b5c5b2f7a..1196a56c1 100644 --- a/man/DataFrame_transpose.Rd +++ b/man/DataFrame_transpose.Rd @@ -39,9 +39,9 @@ Polars transpose is currently eager only, likely because it is not trivial to de # simple use-case pl$DataFrame(mtcars)$transpose(include_header = TRUE, column_names = rownames(mtcars)) -# All rows must have one shared supertype, recast Categorical to Utf8 which is a supertype +# All rows must have one shared supertype, recast Categorical to String which is a supertype # of f64, and then dataset "Iris" can be transposed -pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$Utf8))$transpose() +pl$DataFrame(iris)$with_columns(pl$col("Species")$cast(pl$String))$transpose() } \keyword{DataFrame} diff --git a/man/ExprList_join.Rd b/man/ExprList_join.Rd index 61f560f92..d515b5bc5 100644 --- a/man/ExprList_join.Rd +++ b/man/ExprList_join.Rd @@ -14,11 +14,11 @@ ExprList_join(separator) \item{separator}{String to separate the items with. Can be an Expr.} } \value{ -Series of dtype Utf8 +Series of dtype String } \description{ Join all string items in a sublist and place a separator between them. -This errors if inner type of list \verb{!= Utf8}. +This errors if inner type of list \verb{!= String}. 
} \examples{ df = pl$DataFrame(list(s = list(c("a", "b", "c"), c("x", "y")))) diff --git a/man/ExprStr_concat.Rd b/man/ExprStr_concat.Rd index b29a572c9..45f2e4b6d 100644 --- a/man/ExprStr_concat.Rd +++ b/man/ExprStr_concat.Rd @@ -13,7 +13,7 @@ ExprStr_concat(delimiter = "-", ignore_nulls = TRUE) propagated: if the column contains any null values, the output is null.} } \value{ -Expr of Utf8 concatenated +Expr of String concatenated } \description{ Vertically concatenate the values in the Series to a single diff --git a/man/ExprStr_decode.Rd b/man/ExprStr_decode.Rd index e84ded69b..4b3763885 100644 --- a/man/ExprStr_decode.Rd +++ b/man/ExprStr_decode.Rd @@ -15,7 +15,7 @@ ExprStr_decode(encoding, ..., strict = TRUE) cannot be decoded. Otherwise, replace it with a null value.} } \value{ -Utf8 array with values decoded using provided encoding +String array with values decoded using provided encoding } \description{ Decode a value using the provided encoding @@ -27,8 +27,8 @@ df$with_columns( pl$col("strings")$str$encode("base64")$alias("base64"), # notice DataType is not encoded pl$col("strings")$str$encode("hex")$alias("hex") # ... 
and must restored with cast )$with_columns( - pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$Utf8), - pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$Utf8) + pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$String), + pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$String) ) } \keyword{ExprStr} diff --git a/man/ExprStr_encode.Rd b/man/ExprStr_encode.Rd index 91272edd6..ca3692b11 100644 --- a/man/ExprStr_encode.Rd +++ b/man/ExprStr_encode.Rd @@ -10,7 +10,7 @@ ExprStr_encode(encoding) \item{encoding}{Either 'hex' or 'base64'.} } \value{ -Utf8 array with values encoded using provided encoding +String array with values encoded using provided encoding } \description{ Encode a value using the provided encoding @@ -22,8 +22,8 @@ df$with_columns( pl$col("strings")$str$encode("base64")$alias("base64"), # notice DataType is not encoded pl$col("strings")$str$encode("hex")$alias("hex") # ... and must restored with cast )$with_columns( - pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$Utf8), - pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$Utf8) + pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$String), + pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$String) ) } \keyword{ExprStr} diff --git a/man/ExprStr_explode.Rd b/man/ExprStr_explode.Rd index 2f419c241..18880c287 100644 --- a/man/ExprStr_explode.Rd +++ b/man/ExprStr_explode.Rd @@ -7,7 +7,7 @@ ExprStr_explode() } \value{ -Expr: Series of dtype Utf8. +Expr: Series of dtype String. } \description{ Returns a column with a separate row for every string character diff --git a/man/ExprStr_extract.Rd b/man/ExprStr_extract.Rd index 7fc2502a8..a27009bb6 100644 --- a/man/ExprStr_extract.Rd +++ b/man/ExprStr_extract.Rd @@ -13,7 +13,7 @@ ExprStr_extract(pattern, group_index) pattern, first group begin at index 1 (default).} } \value{ -Utf8 array. 
Contains null if original value is null or regex capture nothing. +String array. Contains null if original value is null or regex capture nothing. } \description{ Extract the target capture group from provided patterns diff --git a/man/ExprStr_extract_all.Rd b/man/ExprStr_extract_all.Rd index fa31066d7..653800915 100644 --- a/man/ExprStr_extract_all.Rd +++ b/man/ExprStr_extract_all.Rd @@ -10,7 +10,7 @@ ExprStr_extract_all(pattern) \item{pattern}{A valid regex pattern} } \value{ -\code{List[Utf8]} array. Contain null if original value is null or regex capture +\code{List[String]} array. Contain null if original value is null or regex capture nothing. } \description{ diff --git a/man/ExprStr_json_path_match.Rd b/man/ExprStr_json_path_match.Rd index 880311430..b56c7c3be 100644 --- a/man/ExprStr_json_path_match.Rd +++ b/man/ExprStr_json_path_match.Rd @@ -10,7 +10,7 @@ ExprStr_json_path_match(json_path) \item{json_path}{A valid JSON path query string.} } \value{ -Utf8 array. Contain null if original value is null or the json_path +String array. Contain null if original value is null or the json_path return nothing. } \description{ @@ -18,7 +18,7 @@ Extract the first match of JSON string with the provided JSONPath expression } \details{ Throw errors if encounter invalid JSON strings. All return value will be -cast to Utf8 regardless of the original value. +cast to String regardless of the original value. Documentation on JSONPath standard can be found here: \url{https://goessner.net/articles/JsonPath/}. } diff --git a/man/ExprStr_pad_end.Rd b/man/ExprStr_pad_end.Rd index 447e8f8a0..a61abeac4 100644 --- a/man/ExprStr_pad_end.Rd +++ b/man/ExprStr_pad_end.Rd @@ -12,7 +12,7 @@ ExprStr_pad_end(width, fillchar = " ") \item{fillchar}{Fill with this ASCII character.} } \value{ -Expr of Utf8 +Expr of String } \description{ Return the string left justified in a string of length \code{width}. 
diff --git a/man/ExprStr_pad_start.Rd b/man/ExprStr_pad_start.Rd index 5fae7980c..8176d5059 100644 --- a/man/ExprStr_pad_start.Rd +++ b/man/ExprStr_pad_start.Rd @@ -12,7 +12,7 @@ ExprStr_pad_start(width, fillchar = " ") \item{fillchar}{Fill with this ASCII character.} } \value{ -Expr of Utf8 +Expr of String } \description{ Return the string right justified in a string of length \code{width}. diff --git a/man/ExprStr_replace.Rd b/man/ExprStr_replace.Rd index d4ec1d8f7..d458f2ae6 100644 --- a/man/ExprStr_replace.Rd +++ b/man/ExprStr_replace.Rd @@ -14,7 +14,7 @@ ExprStr_replace(pattern, value, literal = FALSE) \item{literal}{Treat pattern as a literal string.} } \value{ -Expr of Utf8 Series +Expr of String Series } \description{ Replace first matching regex/literal substring with a new string value diff --git a/man/ExprStr_replace_all.Rd b/man/ExprStr_replace_all.Rd index 56d0a1d41..36e48d7e0 100644 --- a/man/ExprStr_replace_all.Rd +++ b/man/ExprStr_replace_all.Rd @@ -14,7 +14,7 @@ ExprStr_replace_all(pattern, value, literal = FALSE) \item{literal}{Treat pattern as a literal string.} } \value{ -Expr of Utf8 Series +Expr of String Series } \description{ Replace all matching regex/literal substrings with a new string value diff --git a/man/ExprStr_slice.Rd b/man/ExprStr_slice.Rd index 4d6bc3a4a..a8a1ef2ab 100644 --- a/man/ExprStr_slice.Rd +++ b/man/ExprStr_slice.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/expr__string.R \name{ExprStr_slice} \alias{ExprStr_slice} -\title{Create subslices of the string values of a Utf8 Series} +\title{Create subslices of the string values of a String Series} \usage{ ExprStr_slice(offset, length = NULL) } @@ -13,10 +13,10 @@ ExprStr_slice(offset, length = NULL) the end of the string.} } \value{ -Expr: Series of dtype Utf8. +Expr: Series of dtype String. 
} \description{ -Create subslices of the string values of a Utf8 Series +Create subslices of the string values of a String Series } \examples{ df = pl$DataFrame(s = c("pear", NA, "papaya", "dragonfruit")) diff --git a/man/ExprStr_split.Rd b/man/ExprStr_split.Rd index bdbac79dc..9cb6123d0 100644 --- a/man/ExprStr_split.Rd +++ b/man/ExprStr_split.Rd @@ -13,7 +13,7 @@ used to split the string.} \item{inclusive}{If \code{TRUE}, include the split character/string in the results.} } \value{ -List of Utf8 type +List of String type } \description{ Split the string by a substring diff --git a/man/ExprStr_split_exact.Rd b/man/ExprStr_split_exact.Rd index 2d7df2417..c6f3405a6 100644 --- a/man/ExprStr_split_exact.Rd +++ b/man/ExprStr_split_exact.Rd @@ -14,7 +14,7 @@ ExprStr_split_exact(by, n, inclusive = FALSE) \item{inclusive}{If \code{TRUE}, include the split character/string in the results.} } \value{ -Struct where each of n+1 fields is of Utf8 type +Struct where each of n+1 fields is of String type } \description{ This results in a struct of \code{n+1} fields. If it cannot make \code{n} diff --git a/man/ExprStr_splitn.Rd b/man/ExprStr_splitn.Rd index a8b9c80e8..9f504e491 100644 --- a/man/ExprStr_splitn.Rd +++ b/man/ExprStr_splitn.Rd @@ -12,7 +12,7 @@ ExprStr_splitn(by, n) \item{n}{Number of splits to make.} } \value{ -Struct where each of \code{n} fields is of Utf8 type +Struct where each of \code{n} fields is of String type } \description{ If the number of possible splits is less than \code{n-1}, the remaining field diff --git a/man/ExprStr_strip_chars.Rd b/man/ExprStr_strip_chars.Rd index f988f515f..8884e95c1 100644 --- a/man/ExprStr_strip_chars.Rd +++ b/man/ExprStr_strip_chars.Rd @@ -13,7 +13,7 @@ set of characters will be stripped. If \code{NULL} (default), all whitespace is removed instead. This can be an Expr.} } \value{ -Expr of Utf8 lowercase chars +Expr of String lowercase chars } \description{ Remove leading and trailing characters. 
diff --git a/man/ExprStr_strip_chars_end.Rd b/man/ExprStr_strip_chars_end.Rd index 690fe1d05..bf7b38a07 100644 --- a/man/ExprStr_strip_chars_end.Rd +++ b/man/ExprStr_strip_chars_end.Rd @@ -13,7 +13,7 @@ set of characters will be stripped. If \code{NULL} (default), all whitespace is removed instead. This can be an Expr.} } \value{ -Expr of Utf8 lowercase chars +Expr of String lowercase chars } \description{ Remove trailing characters. diff --git a/man/ExprStr_strip_chars_start.Rd b/man/ExprStr_strip_chars_start.Rd index d9e472020..5b87bd1ab 100644 --- a/man/ExprStr_strip_chars_start.Rd +++ b/man/ExprStr_strip_chars_start.Rd @@ -13,7 +13,7 @@ set of characters will be stripped. If \code{NULL} (default), all whitespace is removed instead. This can be an Expr.} } \value{ -Expr of Utf8 lowercase chars +Expr of String lowercase chars } \description{ Remove leading characters. diff --git a/man/ExprStr_strptime.Rd b/man/ExprStr_strptime.Rd index 299be8a0c..9603aebc1 100644 --- a/man/ExprStr_strptime.Rd +++ b/man/ExprStr_strptime.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/expr__string.R \name{ExprStr_strptime} \alias{ExprStr_strptime} -\title{Convert a Utf8 column into a Date/Datetime/Time column.} +\title{Convert a String column into a Date/Datetime/Time column.} \usage{ ExprStr_strptime( datatype, @@ -42,7 +42,7 @@ conversion.} Expr of a Date, Datetime or Time Series } \description{ -Convert a Utf8 column into a Date/Datetime/Time column. +Convert a String column into a Date/Datetime/Time column. 
} \details{ When parsing a Datetime the column precision will be inferred from the format diff --git a/man/ExprStr_to_date.Rd b/man/ExprStr_to_date.Rd index fb6e8fc47..d1ec18061 100644 --- a/man/ExprStr_to_date.Rd +++ b/man/ExprStr_to_date.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/expr__string.R \name{ExprStr_to_date} \alias{ExprStr_to_date} -\title{Convert a Utf8 column into a Date column} +\title{Convert a String column into a Date column} \usage{ ExprStr_to_date(format = NULL, strict = TRUE, exact = TRUE, cache = TRUE) } @@ -25,7 +25,7 @@ conversion.} Expr } \description{ -Convert a Utf8 column into a Date column +Convert a String column into a Date column } \examples{ pl$DataFrame(str_date = c("2009-01-02", "2009-01-03", "2009-1-4", "2009 05 01"))$ diff --git a/man/ExprStr_to_datetime.Rd b/man/ExprStr_to_datetime.Rd index 0a2027ec5..452aed605 100644 --- a/man/ExprStr_to_datetime.Rd +++ b/man/ExprStr_to_datetime.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/expr__string.R \name{ExprStr_to_datetime} \alias{ExprStr_to_datetime} -\title{Convert a Utf8 column into a Datetime column} +\title{Convert a String column into a Datetime column} \usage{ ExprStr_to_datetime( format = NULL, @@ -45,7 +45,7 @@ conversion.} Expr } \description{ -Convert a Utf8 column into a Datetime column +Convert a String column into a Datetime column } \examples{ pl$DataFrame(str_date = c("2009-01-02 01:00", "2009-01-03 02:00", "2009-1-4 3:00"))$ diff --git a/man/ExprStr_to_lowercase.Rd b/man/ExprStr_to_lowercase.Rd index b66050884..a359c8614 100644 --- a/man/ExprStr_to_lowercase.Rd +++ b/man/ExprStr_to_lowercase.Rd @@ -7,7 +7,7 @@ ExprStr_to_lowercase() } \value{ -Expr of Utf8 lowercase chars +Expr of String lowercase chars } \description{ Transform to lowercase variant. 
diff --git a/man/ExprStr_to_time.Rd b/man/ExprStr_to_time.Rd index 351f73a5a..0b63c5eae 100644 --- a/man/ExprStr_to_time.Rd +++ b/man/ExprStr_to_time.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/expr__string.R \name{ExprStr_to_time} \alias{ExprStr_to_time} -\title{Convert a Utf8 column into a Time column} +\title{Convert a String column into a Time column} \usage{ ExprStr_to_time(format = NULL, strict = TRUE, cache = TRUE) } @@ -22,7 +22,7 @@ conversion.} Expr } \description{ -Convert a Utf8 column into a Time column +Convert a String column into a Time column } \examples{ pl$DataFrame(str_time = c("01:20:01", "28:00:02", "03:00:02"))$ diff --git a/man/ExprStr_to_titlecase.Rd b/man/ExprStr_to_titlecase.Rd index d6dd51ca8..4a53880f5 100644 --- a/man/ExprStr_to_titlecase.Rd +++ b/man/ExprStr_to_titlecase.Rd @@ -7,7 +7,7 @@ ExprStr_to_titlecase() } \value{ -Expr of Utf8 titlecase chars +Expr of String titlecase chars } \description{ Transform to titlecase variant. diff --git a/man/ExprStr_to_uppercase.Rd b/man/ExprStr_to_uppercase.Rd index fbc8c1df2..c51324cf5 100644 --- a/man/ExprStr_to_uppercase.Rd +++ b/man/ExprStr_to_uppercase.Rd @@ -7,7 +7,7 @@ ExprStr_to_uppercase() } \value{ -Expr of Utf8 uppercase chars +Expr of String uppercase chars } \description{ Transform to uppercase variant. diff --git a/man/ExprStr_zfill.Rd b/man/ExprStr_zfill.Rd index 457d1bdb8..477a28621 100644 --- a/man/ExprStr_zfill.Rd +++ b/man/ExprStr_zfill.Rd @@ -28,11 +28,11 @@ less than or equal to \code{len(s)}. 
\examples{ some_floats_expr = pl$lit(c(0, 10, -5, 5)) -# cast to Utf8 and ljust alignment = 5, and view as R char vector -some_floats_expr$cast(pl$Utf8)$str$zfill(5)$to_r() +# cast to String and ljust alignment = 5, and view as R char vector +some_floats_expr$cast(pl$String)$str$zfill(5)$to_r() # cast to int and the to utf8 and then ljust alignment = 5, and view as R # char vector -some_floats_expr$cast(pl$Int64)$cast(pl$Utf8)$str$zfill(5)$to_r() +some_floats_expr$cast(pl$Int64)$cast(pl$String)$str$zfill(5)$to_r() } \keyword{ExprStr} diff --git a/man/Expr_explode.Rd b/man/Expr_explode.Rd index 374e784a7..1e7873dd9 100644 --- a/man/Expr_explode.Rd +++ b/man/Expr_explode.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/expr__expr.R \name{Expr_explode} \alias{Expr_explode} -\title{Explode a list or Utf8 Series} +\title{Explode a list or String Series} \usage{ Expr_explode } diff --git a/man/Expr_flatten.Rd b/man/Expr_flatten.Rd index 736be03d5..bc88ccd36 100644 --- a/man/Expr_flatten.Rd +++ b/man/Expr_flatten.Rd @@ -3,7 +3,7 @@ \docType{data} \name{Expr_flatten} \alias{Expr_flatten} -\title{Explode a list or Utf8 Series} +\title{Explode a list or String Series} \format{ An object of class \code{character} of length 1. } diff --git a/man/Expr_gather.Rd b/man/Expr_gather.Rd index c307d6c52..01d45faf4 100644 --- a/man/Expr_gather.Rd +++ b/man/Expr_gather.Rd @@ -8,7 +8,7 @@ Expr_gather(indices) } \arguments{ \item{indices}{R scalar/vector or Series, or Expr that leads to a Series of -dtype UInt32.} +dtype Int64. 
(0-indexed)} } \value{ Expr @@ -17,5 +17,7 @@ Expr Gather values by index } \examples{ -pl$DataFrame(a = c(1, 2, 4, 5, 8))$select(pl$col("a")$gather(c(0, 2, 4))) +df = pl$DataFrame(a = 1:10) + +df$select(pl$col("a")$gather(c(0, 2, 4, -1))) } diff --git a/man/Expr_map_batches.Rd b/man/Expr_map_batches.Rd index 01c138205..592b035ab 100644 --- a/man/Expr_map_batches.Rd +++ b/man/Expr_map_batches.Rd @@ -56,7 +56,7 @@ pl$DataFrame(iris)$ select( pl$col("Sepal.Length")$map_batches(\(x) { paste("cheese", as.character(x$to_vector())) - }, pl$dtypes$Utf8) + }, pl$dtypes$String) ) # R parallel process example, use Sys.sleep() to imitate some CPU expensive diff --git a/man/Expr_map_elements.Rd b/man/Expr_map_elements.Rd index f1194329c..eb135efed 100644 --- a/man/Expr_map_elements.Rd +++ b/man/Expr_map_elements.Rd @@ -92,7 +92,7 @@ e_add10 = my_selection$map_elements(\(x) { e_letter = my_selection$map_elements(\(x) { letters[ceiling(x)] -}, return_type = pl$dtypes$Utf8)$name$suffix("_letter") +}, return_type = pl$dtypes$String)$name$suffix("_letter") pl$DataFrame(iris)$select(e_add10, e_letter) diff --git a/man/IO_read_csv.Rd b/man/IO_read_csv.Rd index 8b8910f71..703dbd017 100644 --- a/man/IO_read_csv.Rd +++ b/man/IO_read_csv.Rd @@ -59,7 +59,7 @@ list is used while reading to overwrite dtypes. Supported types so far are: \item "Float64" or "float64" for DataType::Float64, \item "Int32" or "integer" for DataType::Int32, \item "Int64" or "integer64" for DataType::Int64, -\item "Utf8" or "character" for DataType::Utf8, +\item "String" or "character" for DataType::String, }} \item{null_values}{Values to interpret as \code{NA} values. Can be: @@ -100,7 +100,7 @@ the name is set).} \item{try_parse_dates}{Try to automatically parse dates. Most ISO8601-like formats can be inferred, as well as a handful of others. 
If this does not -succeed, the column remains of data type \code{pl$Utf8}.} +succeed, the column remains of data type \code{pl$String}.} \item{eol_char}{Single byte end of line character (default: \verb{\\n}). When encountering a file with Windows line endings (\verb{\\r\\n}), one can go with the diff --git a/man/IO_scan_csv.Rd b/man/IO_scan_csv.Rd index 51ec750e6..10e80dc4a 100644 --- a/man/IO_scan_csv.Rd +++ b/man/IO_scan_csv.Rd @@ -59,7 +59,7 @@ list is used while reading to overwrite dtypes. Supported types so far are: \item "Float64" or "float64" for DataType::Float64, \item "Int32" or "integer" for DataType::Int32, \item "Int64" or "integer64" for DataType::Int64, -\item "Utf8" or "character" for DataType::Utf8, +\item "String" or "character" for DataType::String, }} \item{null_values}{Values to interpret as \code{NA} values. Can be: @@ -100,7 +100,7 @@ the name is set).} \item{try_parse_dates}{Try to automatically parse dates. Most ISO8601-like formats can be inferred, as well as a handful of others. If this does not -succeed, the column remains of data type \code{pl$Utf8}.} +succeed, the column remains of data type \code{pl$String}.} \item{eol_char}{Single byte end of line character (default: \verb{\\n}). When encountering a file with Windows line endings (\verb{\\r\\n}), one can go with the diff --git a/man/LazyFrame_explode.Rd b/man/LazyFrame_explode.Rd index 9ae4b732e..786e875cb 100644 --- a/man/LazyFrame_explode.Rd +++ b/man/LazyFrame_explode.Rd @@ -21,7 +21,7 @@ This will take every element of a list column and add it on an additional row. } \details{ -Only columns of DataType \code{List} or \code{Utf8} can be exploded. +Only columns of DataType \code{List} or \code{String} can be exploded. 
Named expressions like \verb{$explode(a = pl$col("b"))} will not implicitly trigger \verb{$alias("a")} here, due to only variant \code{Expr::Column} is supported in diff --git a/man/RField_class.Rd b/man/RField_class.Rd index 037210929..a1ca78468 100644 --- a/man/RField_class.Rd +++ b/man/RField_class.Rd @@ -22,5 +22,5 @@ datatypes and Schemas to represent everything of the Series/Column except the raw values. } \examples{ -pl$Field("city_names", pl$Utf8) +pl$Field("city_names", pl$String) } diff --git a/man/RField_datatype.Rd b/man/RField_datatype.Rd index c7c80f1cf..d7c73ae1d 100644 --- a/man/RField_datatype.Rd +++ b/man/RField_datatype.Rd @@ -10,7 +10,7 @@ RField_datatype() Get/set Field datatype } \examples{ -field = pl$Field("Cities", pl$Utf8) +field = pl$Field("Cities", pl$String) field$datatype field$datatype = pl$Categorical #<- is fine too diff --git a/man/RField_name.Rd b/man/RField_name.Rd index 4753528be..2f4f931f8 100644 --- a/man/RField_name.Rd +++ b/man/RField_name.Rd @@ -10,7 +10,7 @@ RField_name() Get/set Field name } \examples{ -field = pl$Field("Cities", pl$Utf8) +field = pl$Field("Cities", pl$String) field$name field$name = "CityPoPulations" #<- is fine too diff --git a/man/Series_map_elements.Rd b/man/Series_map_elements.Rd index 8573358a1..08d960751 100644 --- a/man/Series_map_elements.Rd +++ b/man/Series_map_elements.Rd @@ -30,7 +30,7 @@ About as slow as regular non-vectorized R. 
Similar to using R sapply on a vector \examples{ s = pl$Series(letters[1:5], "ltrs") f = \(x) paste(x, ":", as.integer(charToRaw(x))) -s$map_elements(f, pl$Utf8) +s$map_elements(f, pl$String) # same as pl$Series(sapply(s$to_r(), f), s$name) diff --git a/man/Series_to_r.Rd b/man/Series_to_r.Rd index 2072eae90..2a8ce2fd5 100644 --- a/man/Series_to_r.Rd +++ b/man/Series_to_r.Rd @@ -32,11 +32,10 @@ Thus every leaf(non list type) will be placed on the same depth of the tree, and } \examples{ -# make polars Series_Utf8 series_vec = pl$Series(letters[1:3]) # Series_non_list -series_vec$to_r() # as vector because Series DataType is not list (is Utf8) +series_vec$to_r() # as vector because Series DataType is not list (is String) series_vec$to_r_list() # implicit call as.list(), convert to list series_vec$to_vector() # implicit call unlist(), same as to_r() as already vector diff --git a/man/pl_DataFrame.Rd b/man/pl_DataFrame.Rd index 9c581ae90..727e0fa46 100644 --- a/man/pl_DataFrame.Rd +++ b/man/pl_DataFrame.Rd @@ -49,6 +49,6 @@ pl$DataFrame(list( pl$DataFrame(mtcars) # custom schema -pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8)) +pl$DataFrame(iris, schema = list(Sepal.Length = pl$Float32, Species = pl$String)) } \keyword{DataFrame_new} diff --git a/man/pl_LazyFrame.Rd b/man/pl_LazyFrame.Rd index 83b0edf61..fcce955cd 100644 --- a/man/pl_LazyFrame.Rd +++ b/man/pl_LazyFrame.Rd @@ -36,7 +36,7 @@ pl$LazyFrame(list( # custom schema pl$LazyFrame( iris, - schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8) + schema = list(Sepal.Length = pl$Float32, Species = pl$String) )$collect() } \keyword{LazyFrame_new} diff --git a/man/pl_col.Rd b/man/pl_col.Rd index 4b52a6ee7..184e61ed1 100644 --- a/man/pl_col.Rd +++ b/man/pl_col.Rd @@ -49,7 +49,7 @@ df$select(pl$col("^foo.*$")) df$select(pl$col(pl$dtypes$Float64)) # ... 
or an R list of DataTypes, select any column of any such DataType -df$select(pl$col(list(pl$dtypes$Float64, pl$dtypes$Utf8))) +df$select(pl$col(list(pl$dtypes$Float64, pl$dtypes$String))) # from Series of names df$select(pl$col(pl$Series(c("bar", "foobar")))) diff --git a/man/pl_concat_str.Rd b/man/pl_concat_str.Rd index 1362f7a63..2053bd44d 100644 --- a/man/pl_concat_str.Rd +++ b/man/pl_concat_str.Rd @@ -9,7 +9,7 @@ pl_concat_str(..., separator = "") \arguments{ \item{...}{Columns to concatenate into a single string column. Accepts expressions. Strings are parsed as column names, other non-expression inputs -are parsed as literals. Non-Utf8 columns are cast to Utf8.} +are parsed as literals. Non-String columns are cast to String} \item{separator}{String that will be used to separate the values of each column.} diff --git a/man/pl_dtypes.Rd b/man/pl_dtypes.Rd index c12619594..9eff6173c 100644 --- a/man/pl_dtypes.Rd +++ b/man/pl_dtypes.Rd @@ -12,14 +12,14 @@ not applicable \examples{ print(ls(pl$dtypes)) pl$dtypes$Float64 -pl$dtypes$Utf8 +pl$dtypes$String pl$List(pl$List(pl$UInt64)) -pl$Struct(pl$Field("CityNames", pl$Utf8)) +pl$Struct(pl$Field("CityNames", pl$String)) -# The function changes type from Integer(Int32)[Integers] to char(Utf8)[Strings] -# specifying the output DataType: Utf8 solves the problem -pl$Series(1:4)$map_elements(\(x) letters[x], datatype = pl$dtypes$Utf8) +# The function changes type from Int32 to String +# Specifying the output DataType: String solves the problem +pl$Series(1:4)$map_elements(\(x) letters[x], datatype = pl$dtypes$String) } diff --git a/man/pl_from_arrow.Rd b/man/pl_from_arrow.Rd index ca4434913..a49f4a96a 100644 --- a/man/pl_from_arrow.Rd +++ b/man/pl_from_arrow.Rd @@ -37,7 +37,7 @@ import Arrow Table or Array \examples{ pl$from_arrow( data = arrow::arrow_table(iris), - schema_overrides = list(Sepal.Length = pl$Float32, Species = pl$Utf8) + schema_overrides = list(Sepal.Length = pl$Float32, Species = pl$String) ) char_schema 
= names(iris) diff --git a/man/pl_pl.Rd b/man/pl_pl.Rd index 0314de1ab..a485ff4b7 100644 --- a/man/pl_pl.Rd +++ b/man/pl_pl.Rd @@ -6,7 +6,7 @@ \alias{pl} \title{The complete polars public API.} \format{ -An object of class \code{pl_polars_env} (inherits from \code{environment}) of length 95. +An object of class \code{pl_polars_env} (inherits from \code{environment}) of length 96. } \usage{ pl diff --git a/man/pl_struct.Rd b/man/pl_struct.Rd index 91b392428..b66f1fc54 100644 --- a/man/pl_struct.Rd +++ b/man/pl_struct.Rd @@ -54,13 +54,13 @@ print(df$schema) # returns a schema, a named list containing one element a Struc # wrap two columns in a struct and provide a schema to set all or some DataTypes by name e1 = pl$struct( pl$col(c("int", "str")), - schema = list(int = pl$Int64, str = pl$Utf8) + schema = list(int = pl$Int64, str = pl$String) )$alias("my_struct") # same result as e.g. wrapping the columns in a struct and casting afterwards e2 = pl$struct( list(pl$col("int"), pl$col("str")) )$cast( - pl$Struct(int = pl$Int64, str = pl$Utf8) + pl$Struct(int = pl$Int64, str = pl$String) )$alias("my_struct") df = pl$DataFrame( diff --git a/src/rust/Cargo.lock b/src/rust/Cargo.lock index 01c8b4e25..3500f9f33 100644 --- a/src/rust/Cargo.lock +++ b/src/rust/Cargo.lock @@ -1382,8 +1382,8 @@ dependencies = [ [[package]] name = "polars" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "getrandom", "polars-core", @@ -1398,8 +1398,8 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = 
"git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "ahash", "arrow-format", @@ -1434,8 +1434,8 @@ dependencies = [ [[package]] name = "polars-compute" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "bytemuck", "num-traits", @@ -1446,8 +1446,8 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "ahash", "bitflags 2.4.1", @@ -1481,8 +1481,8 @@ dependencies = [ [[package]] name = "polars-error" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "arrow-format", "avro-schema", @@ -1493,8 +1493,8 @@ dependencies = [ [[package]] name = "polars-io" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "ahash", "async-trait", @@ -1534,8 +1534,8 @@ dependencies = [ [[package]] name = "polars-json" -version 
= "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "ahash", "chrono", @@ -1554,8 +1554,8 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "ahash", "bitflags 2.4.1", @@ -1577,8 +1577,8 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "ahash", "aho-corasick", @@ -1613,8 +1613,8 @@ dependencies = [ [[package]] name = "polars-parquet" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "ahash", "async-stream", @@ -1638,8 +1638,8 @@ dependencies = [ [[package]] name = "polars-pipe" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = 
"git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "crossbeam-channel", "crossbeam-queue", @@ -1661,8 +1661,8 @@ dependencies = [ [[package]] name = "polars-plan" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "ahash", "bytemuck", @@ -1688,8 +1688,8 @@ dependencies = [ [[package]] name = "polars-row" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "polars-arrow", "polars-error", @@ -1698,8 +1698,8 @@ dependencies = [ [[package]] name = "polars-sql" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "polars-arrow", "polars-core", @@ -1714,8 +1714,8 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "atoi", "chrono", @@ -1734,8 +1734,8 @@ dependencies = [ [[package]] name = "polars-utils" 
-version = "0.35.4" -source = "git+https://github.com/pola-rs/polars.git?rev=4046c732dec0c9311294a2589590b4d017c5a02a#4046c732dec0c9311294a2589590b4d017c5a02a" +version = "0.36.2" +source = "git+https://github.com/pola-rs/polars.git?rev=fa59ffc1685043b44476dcb2a3f3804460ead5c5#fa59ffc1685043b44476dcb2a3f3804460ead5c5" dependencies = [ "ahash", "bytemuck", @@ -1776,7 +1776,7 @@ dependencies = [ [[package]] name = "r-polars" -version = "0.35.2" +version = "0.36.0" dependencies = [ "either", "extendr-api", diff --git a/src/rust/Cargo.toml b/src/rust/Cargo.toml index 24149d8f5..09d2f2063 100644 --- a/src/rust/Cargo.toml +++ b/src/rust/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "r-polars" -version = "0.35.2" +version = "0.36.0" edition = "2021" rust-version = "1.73" publish = false @@ -49,8 +49,8 @@ serde_json = "*" smartstring = "1.0.1" state = "0.6.0" thiserror = "1.0.56" -polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "4046c732dec0c9311294a2589590b4d017c5a02a", default-features = false } -polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "4046c732dec0c9311294a2589590b4d017c5a02a", default-features = false } +polars-core = { git = "https://github.com/pola-rs/polars.git", rev = "fa59ffc1685043b44476dcb2a3f3804460ead5c5", default-features = false } +polars-lazy = { git = "https://github.com/pola-rs/polars.git", rev = "fa59ffc1685043b44476dcb2a3f3804460ead5c5", default-features = false } either = "1" #features copied from node-polars @@ -145,4 +145,4 @@ features = [ "zip_with", ] git = "https://github.com/pola-rs/polars.git" -rev = "4046c732dec0c9311294a2589590b4d017c5a02a" +rev = "fa59ffc1685043b44476dcb2a3f3804460ead5c5" diff --git a/src/rust/src/conversion_s_to_r.rs b/src/rust/src/conversion_s_to_r.rs index ecf8f1887..291be3aad 100644 --- a/src/rust/src/conversion_s_to_r.rs +++ b/src/rust/src/conversion_s_to_r.rs @@ -94,7 +94,7 @@ pub fn pl_series_to_list( .map(|opt| opt.map(|val| val as f64)) .collect_robj() }), - Utf8 => 
s.utf8().map(|ca| ca.into_iter().collect_robj()), + String => s.str().map(|ca| ca.into_iter().collect_robj()), Boolean => s.bool().map(|ca| ca.into_iter().collect_robj()), Binary => s.binary().map(|ca| { diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index f1cb66fcc..98754df7c 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -19,7 +19,7 @@ use extendr_api::{extendr, prelude::*, rprintln, Deref, DerefMut, Rinternals}; use pl::PolarsError as pl_error; use pl::{ BinaryNameSpaceImpl, Duration, DurationMethods, IntoSeries, RollingGroupOptions, - TemporalMethods, Utf8NameSpaceImpl, + StringNameSpaceImpl, TemporalMethods, }; use polars::lazy::dsl; use polars::prelude as pl; @@ -133,7 +133,7 @@ impl RPolarsExpr { } (Rtype::Strings, 1) => { if robj.is_na() { - Ok(dsl::lit(pl::NULL).cast(pl::DataType::Utf8)) + Ok(dsl::lit(pl::NULL).cast(pl::DataType::String)) } else { Ok(dsl::lit(robj.as_str().unwrap())) } @@ -299,7 +299,11 @@ impl RPolarsExpr { } pub fn gather(&self, idx: Robj) -> RResult { - Ok(self.clone().0.gather(robj_to!(PLExpr, idx)?).into()) + Ok(self + .clone() + .0 + .gather(robj_to!(PLExpr, idx)?.cast(pl::DataType::Int64)) + .into()) } pub fn sort_by(&self, by: Robj, descending: Robj) -> RResult { @@ -919,7 +923,7 @@ impl RPolarsExpr { move |s| { //swap owned inline string to str as only supported and if swapped here life time is long enough let av = match &av { - pl::AnyValue::Utf8Owned(x) => pl::AnyValue::Utf8(x.as_str()), + pl::AnyValue::StringOwned(x) => pl::AnyValue::String(x.as_str()), x => x.clone(), }; s.extend_constant(av, n).map(Some) @@ -1833,7 +1837,7 @@ impl RPolarsExpr { pub fn str_len_bytes(&self) -> Self { use pl::*; let function = |s: pl::Series| { - let ca = s.utf8()?; + let ca = s.str()?; Ok(Some(ca.str_len_bytes().into_series())) }; self.clone() @@ -1845,7 +1849,7 @@ impl RPolarsExpr { pub fn str_len_chars(&self) -> Self { let function = |s: pl::Series| { - let ca = s.utf8()?; + let ca = s.str()?; 
Ok(Some(ca.str_len_chars().into_series())) }; self.clone() @@ -1950,7 +1954,7 @@ impl RPolarsExpr { use pl::*; let pat: String = robj_to!(String, pat, "in str$json_path_match: {}")?; let function = move |s: Series| { - let ca = s.utf8()?; + let ca = s.str()?; match ca.json_path_match(&pat) { Ok(ca) => Ok(Some(ca.into_series())), Err(e) => Err(pl::PolarsError::ComputeError(format!("{e:?}").into())), @@ -1959,7 +1963,7 @@ impl RPolarsExpr { Ok(RPolarsExpr( self.0 .clone() - .map(function, pl::GetOutput::from_type(pl::DataType::Utf8)) + .map(function, pl::GetOutput::from_type(pl::DataType::String)) .with_fmt("str.json_path_match"), )) }(); @@ -1977,65 +1981,38 @@ impl RPolarsExpr { .into()) } - pub fn str_hex_encode(&self) -> Self { - use pl::*; - self.clone() - .0 - .map( - move |s| s.utf8().map(|s| Some(s.hex_encode().into_series())), - pl::GetOutput::same_type(), - ) - .with_fmt("str.hex_encode") - .into() + pub fn str_hex_encode(&self) -> RResult { + Ok(self.0.clone().str().hex_encode().into()) } - pub fn str_hex_decode(&self, strict: bool) -> Self { - use pl::*; - self.clone() + pub fn str_hex_decode(&self, strict: Robj) -> RResult { + Ok(self .0 - .map( - move |s| s.utf8()?.hex_decode(strict).map(|s| Some(s.into_series())), - pl::GetOutput::same_type(), - ) - .with_fmt("str.hex_decode") - .into() + .clone() + .str() + .hex_decode(robj_to!(bool, strict)?) + .into()) } - pub fn str_base64_encode(&self) -> Self { - use pl::*; - self.clone() - .0 - .map( - move |s| s.utf8().map(|s| Some(s.base64_encode().into_series())), - pl::GetOutput::same_type(), - ) - .with_fmt("str.base64_encode") - .into() + pub fn str_base64_encode(&self) -> RResult { + Ok(self.0.clone().str().base64_encode().into()) } - pub fn str_base64_decode(&self, strict: bool) -> Self { - use pl::*; - self.clone() + pub fn str_base64_decode(&self, strict: Robj) -> RResult { + Ok(self .0 - .map( - move |s| { - s.utf8()? 
- .base64_decode(strict) - .map(|s| Some(s.into_series())) - }, - pl::GetOutput::same_type(), - ) - .with_fmt("str.base64_decode") - .into() + .clone() + .str() + .base64_decode(robj_to!(bool, strict)?) + .into()) } - pub fn str_extract(&self, pattern: Robj, group_index: Robj) -> List { - let res = || -> Result { - let pat = robj_to!(String, pattern)?; - let gi = robj_to!(usize, group_index)?; - Ok(self.0.clone().str().extract(pat.as_str(), gi).into()) - }() - .map_err(|err| format!("in str$extract: {}", err)); - r_result_list(res) + pub fn str_extract(&self, pattern: Robj, group_index: Robj) -> RResult { + Ok(self + .0 + .clone() + .str() + .extract(robj_to!(str, pattern)?, robj_to!(usize, group_index)?) + .into()) } pub fn str_extract_all(&self, pattern: &RPolarsExpr) -> Self { diff --git a/src/rust/src/rdatatype.rs b/src/rust/src/rdatatype.rs index 3344665d1..b4a7f91d8 100644 --- a/src/rust/src/rdatatype.rs +++ b/src/rust/src/rdatatype.rs @@ -73,7 +73,7 @@ impl RPolarsDataType { "Float32" | "float32" | "double" => pl::DataType::Float32, "Float64" | "float64" => pl::DataType::Float64, - "Utf8" | "character" => pl::DataType::Utf8, + "Utf8" | "String" | "character" => pl::DataType::String, "Binary" | "binary" => pl::DataType::Binary, "Date" | "date" => pl::DataType::Date, "Time" | "time" => pl::DataType::Time, @@ -145,7 +145,7 @@ impl RPolarsDataType { "Int64".into(), "Float32".into(), "Float64".into(), - "Utf8".into(), + "String".into(), "Binary".into(), "Date".into(), "Time".into(), @@ -329,11 +329,11 @@ pub fn literal_to_any_value(litval: pl::LiteralValue) -> RResult Ok(av::UInt64(x)), lv::UInt8(x) => Ok(av::UInt8(x)), // lv::Utf8(x) => Ok(av::Utf8(x.as_str())), - lv::Utf8(x) => { + lv::String(x) => { let mut s = SString::new(); s.push_str(x.as_str()); - Ok(av::Utf8Owned(s)) + Ok(av::StringOwned(s)) } x => rerr().bad_val(format!("cannot convert LiteralValue {:?} to AnyValue", x)), } diff --git a/src/rust/src/series.rs b/src/rust/src/series.rs index 
67997bb2e..66c054d5d 100644 --- a/src/rust/src/series.rs +++ b/src/rust/src/series.rs @@ -166,7 +166,7 @@ impl RPolarsSeries { "{}", self.0.get(index.try_into().expect("usize>u32")).unwrap() ); - if let DataType::Utf8 | DataType::Categorical(_, _) = self.0.dtype() { + if let DataType::String | DataType::Categorical(_, _) = self.0.dtype() { let v_trunc = &val[..val .char_indices() .take(str_length.try_into().expect("usize>u32")) @@ -222,7 +222,7 @@ impl RPolarsSeries { Int64 => comp!(self, other, i64, op), Float64 => comp!(self, other, f64, op), Boolean => comp!(self, other, bool, op), - Utf8 => comp!(self, other, utf8, op), + String => comp!(self, other, str, op), _ => Err(format!( "oups this type: {} is not supported yet, but easily could be", dtype @@ -365,7 +365,7 @@ impl RPolarsSeries { Int32 => apply_input!(self.0, i32, rfun, na_fun), Int16 => apply_input!(self.0, i16, rfun, na_fun), Int8 => apply_input!(self.0, i8, rfun, na_fun), - Utf8 => apply_input!(self.0, utf8, rfun, na_fun), + String => apply_input!(self.0, str, rfun, na_fun), Boolean => apply_input!(self.0, bool, rfun, na_fun), //List(..) => apply_input!(self.0, list, rfun, na_fun), List(..) => { @@ -392,7 +392,7 @@ impl RPolarsSeries { match out_type { Float64 => apply_output!(r_iter, strict, allow_fail_eval, Doubles, Float64Chunked), Int32 => apply_output!(r_iter, strict, allow_fail_eval, Integers, Int32Chunked), - Utf8 => apply_output!(r_iter, strict, allow_fail_eval, Strings, Utf8Chunked), + String => apply_output!(r_iter, strict, allow_fail_eval, Strings, StringChunked), Boolean => apply_output!(r_iter, strict, allow_fail_eval, Logicals, BooleanChunked), List(..) => { //ierate over R return values, opt if never run (no values), err if fail diff --git a/src/rust/src/utils/mod.rs b/src/rust/src/utils/mod.rs index 9da0308aa..f77cac379 100644 --- a/src/rust/src/utils/mod.rs +++ b/src/rust/src/utils/mod.rs @@ -53,7 +53,7 @@ macro_rules! 
make_r_na_fun { (i8 $rfun:expr) => {make_r_na_fun!(i32 $rfun)}; (f32 $rfun:expr) => {make_r_na_fun!(f64 $rfun)}; - (utf8 $rfun:expr) => { + (str $rfun:expr) => { R!("function(f) {function() f(NA_character_)}") .unwrap() .as_function() diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 3ae4c5e6c..5489a9b97 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -12,45 +12,45 @@ [13] "Int8" "LazyFrame" [15] "List" "Null" [17] "PTime" "SQLContext" - [19] "Series" "Struct" - [21] "Time" "UInt16" - [23] "UInt32" "UInt64" - [25] "UInt8" "Unknown" - [27] "Utf8" "all" - [29] "all_horizontal" "any_horizontal" - [31] "approx_n_unique" "class_names" - [33] "coalesce" "col" - [35] "concat" "concat_list" - [37] "concat_str" "corr" - [39] "count" "cov" - [41] "date_range" "disable_string_cache" - [43] "dtypes" "element" - [45] "enable_string_cache" "expr_to_r" - [47] "extra_auto_completion" "first" - [49] "fold" "from_arrow" - [51] "get_global_rpool_cap" "head" - [53] "implode" "is_schema" - [55] "last" "lit" - [57] "max" "max_horizontal" - [59] "mean" "median" - [61] "mem_address" "min" - [63] "min_horizontal" "n_unique" - [65] "numeric_dtypes" "options" - [67] "polars_info" "raw_list" - [69] "read_csv" "read_ndjson" - [71] "read_parquet" "reduce" - [73] "reset_options" "rolling_corr" - [75] "rolling_cov" "same_outer_dt" - [77] "scan_csv" "scan_ipc" - [79] "scan_ndjson" "scan_parquet" - [81] "select" "set_global_rpool_cap" - [83] "set_options" "show_all_public_functions" - [85] "show_all_public_methods" "std" - [87] "struct" "sum" - [89] "sum_horizontal" "tail" - [91] "threadpool_size" "using_string_cache" - [93] "var" "when" - [95] "with_string_cache" + [19] "Series" "String" + [21] "Struct" "Time" + [23] "UInt16" "UInt32" + [25] "UInt64" "UInt8" + [27] "Unknown" "Utf8" + [29] "all" "all_horizontal" + [31] "any_horizontal" "approx_n_unique" + [33] "class_names" "coalesce" + [35] 
"col" "concat" + [37] "concat_list" "concat_str" + [39] "corr" "count" + [41] "cov" "date_range" + [43] "disable_string_cache" "dtypes" + [45] "element" "enable_string_cache" + [47] "expr_to_r" "extra_auto_completion" + [49] "first" "fold" + [51] "from_arrow" "get_global_rpool_cap" + [53] "head" "implode" + [55] "is_schema" "last" + [57] "lit" "max" + [59] "max_horizontal" "mean" + [61] "median" "mem_address" + [63] "min" "min_horizontal" + [65] "n_unique" "numeric_dtypes" + [67] "options" "polars_info" + [69] "raw_list" "read_csv" + [71] "read_ndjson" "read_parquet" + [73] "reduce" "reset_options" + [75] "rolling_corr" "rolling_cov" + [77] "same_outer_dt" "scan_csv" + [79] "scan_ipc" "scan_ndjson" + [81] "scan_parquet" "select" + [83] "set_global_rpool_cap" "set_options" + [85] "show_all_public_functions" "show_all_public_methods" + [87] "std" "struct" + [89] "sum" "sum_horizontal" + [91] "tail" "threadpool_size" + [93] "using_string_cache" "var" + [95] "when" "with_string_cache" --- diff --git a/tests/testthat/test-dataframe.R b/tests/testthat/test-dataframe.R index 9a59e2c8e..b34c2da30 100644 --- a/tests/testthat/test-dataframe.R +++ b/tests/testthat/test-dataframe.R @@ -144,13 +144,13 @@ test_that("get set properties", { test_that("DataFrame, custom schema", { df = pl$DataFrame( iris, - schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8) + schema = list(Sepal.Length = pl$Float32, Species = pl$String) ) # dtypes from object are as expected expect_true( all(mapply( df$dtypes, - pl$dtypes[c("Float32", rep("Float64", 3), "Utf8")], + pl$dtypes[c("Float32", rep("Float64", 3), "String")], FUN = "==" )) ) @@ -1217,7 +1217,7 @@ test_that("transpose", { expect_identical( pl$DataFrame(iris)$ with_columns(pl$col("Species")$ - cast(pl$Utf8))$ + cast(pl$String))$ transpose(FALSE)$ to_data_frame(), df_expected diff --git a/tests/testthat/test-datatype.R b/tests/testthat/test-datatype.R index 8f3041ffc..85fa5b247 100644 --- a/tests/testthat/test-datatype.R +++ 
b/tests/testthat/test-datatype.R @@ -66,3 +66,9 @@ test_that("POSIXct data conversion", { # POSIXct is converted to datetime[ms], so sub-ms precision is lost expect_identical(pl$lit(x)$to_r(), as.POSIXct(c("2020-01-01 13:45:48.343", "2020-01-01 13:45:48.343"), tz = "UTC")) }) + +test_that("String and Utf8 are identical", { + string = pl$DataFrame(x = "a", schema = list(x = pl$String))$to_data_frame() + utf8 = pl$DataFrame(x = "a", schema = list(x = pl$Utf8))$to_data_frame() + expect_identical(string, utf8) +}) diff --git a/tests/testthat/test-expr_binary.R b/tests/testthat/test-expr_binary.R index 19789cabd..d1b56f93f 100644 --- a/tests/testthat/test-expr_binary.R +++ b/tests/testthat/test-expr_binary.R @@ -61,7 +61,7 @@ test_that("bin$encode and bin$decode", { c("hex_decoded") )$select( pl$lit( - pl$col("hex_decoded")$cast(pl$Utf8) + pl$col("hex_decoded")$cast(pl$String) ) )$to_list() @@ -71,7 +71,7 @@ test_that("bin$encode and bin$decode", { c("base64_decoded") )$select( pl$lit( - pl$col("base64_decoded")$cast(pl$Utf8) + pl$col("base64_decoded")$cast(pl$String) ) )$to_list() diff --git a/tests/testthat/test-expr_expr.R b/tests/testthat/test-expr_expr.R index 58e44f3be..fe4c15819 100644 --- a/tests/testthat/test-expr_expr.R +++ b/tests/testthat/test-expr_expr.R @@ -501,10 +501,10 @@ test_that("to_physical + cast", { df - # cast error raised for Utf8 to Boolean + # cast error raised for String to Boolean expect_error( pl$DataFrame(iris)$with_columns( - pl$col("Species")$cast(pl$dtypes$Utf8)$cast(pl$dtypes$Boolean) + pl$col("Species")$cast(pl$dtypes$String)$cast(pl$dtypes$Boolean) ) ) @@ -1054,12 +1054,18 @@ test_that("gather that", { c(1L, 3L, 5L, NA_integer_) ) + expect_identical( + pl$select(pl$lit(1:6)$gather(c(0, -1)))$to_list()[[1L]], + c(1L, 6L) + ) + expect_error( pl$select(pl$lit(0:10)$gather(11))$to_list()[[1L]] ) - expect_error( - pl$select(pl$lit(0:10)$gather(-5))$to_list()[[1L]] + expect_identical( + pl$select(pl$lit(0:10)$gather(-5))$to_list()[[1L]], + 
6L ) }) @@ -1522,7 +1528,7 @@ test_that("hash + reinterpret", { hash_values1 = unname(unlist(df$select(pl$col(c("Sepal.Width", "Species"))$unique()$hash()$implode())$to_list())) hash_values2 = unname(unlist(df$select(pl$col(c("Sepal.Width", "Species"))$unique()$hash(1, 2, 3, 4)$implode())$to_list())) - hash_values3 = unname((df$select(pl$col(c("Sepal.Width", "Species"))$unique()$hash(1, 2, 3, 4)$implode()$cast(pl$List(pl$Utf8)))$to_list())) + hash_values3 = unname((df$select(pl$col(c("Sepal.Width", "Species"))$unique()$hash(1, 2, 3, 4)$implode()$cast(pl$List(pl$String)))$to_list())) expect_true(!any(duplicated(hash_values1))) expect_true(!any(sapply(hash_values3, \(x) any(duplicated(x))))) @@ -2112,7 +2118,7 @@ test_that("ewm_", { test_that("extend_constant", { expect_identical( pl$lit(c("5", "Bob_is_not_a_number")) - $cast(pl$dtypes$Utf8, strict = FALSE) + $cast(pl$dtypes$String, strict = FALSE) $extend_constant("chuchu", 2)$to_r(), c("5", "Bob_is_not_a_number", "chuchu", "chuchu") ) @@ -2276,7 +2282,7 @@ test_that("shrink_dtype", { expect_true(all(mapply( df$dtypes, - pl$dtypes[c("Int8", "Int64", "Int32", "Int8", "Int16", "Utf8", "Float32", "Boolean")], + pl$dtypes[c("Int8", "Int64", "Int32", "Int8", "Int16", "String", "Float32", "Boolean")], FUN = function(actual, expected) actual == expected ))) }) diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index ba283ca8f..05270eb0d 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -400,15 +400,15 @@ test_that("encode decode", { pl$col("strings")$str$encode("base64")$alias("base64"), # notice DataType is not encoded pl$col("strings")$str$encode("hex")$alias("hex") # ... 
and must restored with cast )$with_columns( - pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$Utf8), - pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$Utf8) + pl$col("base64")$str$decode("base64")$alias("base64_decoded")$cast(pl$String), + pl$col("hex")$str$decode("hex")$alias("hex_decoded")$cast(pl$String) )$to_list() expect_identical(l$strings, l$base64_decoded) expect_identical(l$strings, l$hex_decoded) expect_identical( - pl$lit("?")$str$decode("base64", strict = FALSE)$cast(pl$Utf8)$to_r(), + pl$lit("?")$str$decode("base64", strict = FALSE)$cast(pl$String)$to_r(), NA_character_ ) @@ -442,7 +442,7 @@ test_that("str$extract", { expect_grepl_error( pl$lit("abc")$str$extract(42, 42), - "String" + "str" ) expect_true( diff --git a/tests/testthat/test-from_arrow.R b/tests/testthat/test-from_arrow.R index c83d6de04..1707242c7 100644 --- a/tests/testthat/test-from_arrow.R +++ b/tests/testthat/test-from_arrow.R @@ -72,7 +72,7 @@ test_that("from_arrow", { # use schema override df = pl$from_arrow( arrow::arrow_table(iris), - schema_overrides = list(Sepal.Length = pl$Float32, Species = pl$Utf8) + schema_overrides = list(Sepal.Length = pl$Float32, Species = pl$String) ) iris_str = iris iris_str$Species = as.character(iris_str$Species) diff --git a/tests/testthat/test-lazy.R b/tests/testthat/test-lazy.R index f421c88e9..c8423c983 100644 --- a/tests/testthat/test-lazy.R +++ b/tests/testthat/test-lazy.R @@ -33,14 +33,14 @@ test_that("create LazyFrame", { test_that("LazyFrame, custom schema", { df = pl$LazyFrame( iris, - schema = list(Sepal.Length = pl$Float32, Species = pl$Utf8) + schema = list(Sepal.Length = pl$Float32, Species = pl$String) )$collect() # dtypes from object are as expected expect_true( all(mapply( df$dtypes, - pl$dtypes[c("Float32", rep("Float64", 3), "Utf8")], + pl$dtypes[c("Float32", rep("Float64", 3), "String")], FUN = "==" )) ) diff --git a/tests/testthat/test-series.R b/tests/testthat/test-series.R index 
9f7c4d1b1..70a330338 100644 --- a/tests/testthat/test-series.R +++ b/tests/testthat/test-series.R @@ -457,7 +457,7 @@ test_that("Series list", { s = pl$Series(l) # check data_type - expect_true(s$dtype == with(pl, List(List(List(Utf8))))) + expect_true(s$dtype == with(pl, List(List(List(String))))) # flatten 3-levels and return to R # TODO CONTRIBUTE POLARS this is a bug, when flattening an empty list, it should not give a null @@ -532,5 +532,5 @@ patrick::with_parameters_test_that("mean, median, std, var", test_that("n_unique", { x = c(1:4, NA, NaN, 1) # 6 unique one repeated expect_identical(pl$Series(x)$n_unique(), 6) - expect_grepl_error(pl$Series(c())$n_unique(), "operation not supported for dtype") + expect_identical(pl$Series(c())$n_unique(), 0) })