From 3efe12d89f1122c3892f6e27bc1a140951cfcffe Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Fri, 29 Mar 2024 18:41:14 +0100 Subject: [PATCH 1/6] init [skip ci] --- NEWS.md | 1 + R/expr__string.R | 19 +++++++++++++++++ R/extendr-wrappers.R | 2 ++ src/rust/src/lazy/dsl.rs | 9 ++++++++ tests/testthat/test-expr_string.R | 35 +++++++++++++++++++++++++++++++ 5 files changed, 66 insertions(+) diff --git a/NEWS.md b/NEWS.md index 8aa7dbd1f..971157373 100644 --- a/NEWS.md +++ b/NEWS.md @@ -94,6 +94,7 @@ - Export the `Duration` datatype (#955). - New functions `pl$int_range()` and `pl$int_ranges()` (#968). - New string method `$str$extract_groups()` (#979). +- New string method `$str$find()` (#985). ### Bug fixes diff --git a/R/expr__string.R b/R/expr__string.R index 1db39d884..17dcf63bd 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -953,3 +953,22 @@ ExprStr_extract_groups = function(pattern) { .pr$Expr$str_extract_groups(self, pattern) |> unwrap("in str$extract_groups():") } + +#' Return the index position of the first substring matching a pattern +#' +#' @inheritParams ExprStr_count_matches +#' @param ... Not used. +#' @param strict Raise an error if the underlying pattern is not a valid regex, +#' otherwise mask out with a null value. +#' +#' @return An Expr of data type UInt32 +#' +#' @examples +#' pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( +#' default_match = pl$col("s")$str$find("Aa"), +#' insensitive_match = pl$col("s")$str$find("(?i)Aa") +#' ) +ExprStr_find = function(pattern, ..., literal = FALSE, strict = TRUE) { + .pr$Expr$str_find(self, pattern, literal, strict) |> + unwrap("in str$find():") +} diff --git a/R/extendr-wrappers.R b/R/extendr-wrappers.R index 252b84cb7..b5f429bdb 100644 --- a/R/extendr-wrappers.R +++ b/R/extendr-wrappers.R @@ -1024,6 +1024,8 @@ RPolarsExpr$str_contains_any <- function(patterns, ascii_case_insensitive) .Call RPolarsExpr$str_replace_many <- function(patterns, replace_with, ascii_case_insensitive) .Call(wrap__RPolarsExpr__str_replace_many, self, patterns, replace_with, ascii_case_insensitive) +RPolarsExpr$str_find <- function(pat, literal, strict) .Call(wrap__RPolarsExpr__str_find, self, pat, literal, strict) + RPolarsExpr$bin_contains <- function(lit) .Call(wrap__RPolarsExpr__bin_contains, self, lit) RPolarsExpr$bin_starts_with <- function(sub) .Call(wrap__RPolarsExpr__bin_starts_with, self, sub) diff --git a/src/rust/src/lazy/dsl.rs b/src/rust/src/lazy/dsl.rs index 21135ef36..28a965646 100644 --- a/src/rust/src/lazy/dsl.rs +++ b/src/rust/src/lazy/dsl.rs @@ -2345,6 +2345,15 @@ impl RPolarsExpr { .into()) } + pub fn str_find(&self, pat: Robj, literal: Robj, strict: Robj) -> RResult { + let pat = robj_to!(PLExpr, pat)?; + let literal = robj_to!(Option, bool, literal)?; + let strict = robj_to!(bool, strict)?; + match literal { + Some(true) => Ok(self.0.clone().str().find_literal(pat).into()), + _ => Ok(self.0.clone().str().find(pat, strict).into()), + } + } //binary methods pub fn bin_contains(&self, lit: Robj) -> RResult { Ok(self diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index 6a8146d9e..ebc5ae8aa 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -832,3 +832,38 @@ test_that("str$extract_groups() works", { list(url = NULL) ) }) + +test_that("str$find() works", { + test = pl$DataFrame(s = c("AAA", "aAa", "aaa", "(?i)Aa")) + + expect_identical( + test$select( + default = pl$col("s")$str$find("Aa"), + insensitive = pl$col("s")$str$find("(?i)Aa") + )$to_list(), + list(default = c(NA, 1, NA, 4), insensitive = c(0, 0, 0, 4)) + ) + + # arg "literal" works + expect_identical( + test$select( + lit = pl$col("s")$str$find("(?i)Aa", literal = TRUE) + )$to_list(), + list(lit = c(NA, NA, NA, 0)) + ) + + # arg "strict" works + expect_grepl_error( + test$select(lit = pl$col("s")$str$find("(?iAa")), + "unrecognized flag" + ) + + expect_silent( + test$select(lit = pl$col("s")$str$find("(?iAa", strict = FALSE)) + ) + + # combining "literal" and "strict" + expect_silent( + test$select(lit = pl$col("s")$str$find("(?iAa", literal = TRUE)) + ) +}) From 96d8fb9e89ec6a626bf00a13af164ace431a7aa2 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 30 Mar 2024 09:53:48 +0100 Subject: [PATCH 2/6] update docs --- R/expr__string.R | 34 ++++++++++++++++++++++----------- man/ExprStr_contains.Rd | 14 +++++++++----- man/ExprStr_find.Rd | 42 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 16 deletions(-) create mode 100644 man/ExprStr_find.Rd diff --git a/R/expr__string.R b/R/expr__string.R index a98395614..51ce377d6 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -429,13 +429,8 @@ ExprStr_pad_start = function(width, fillchar = " ") { } -# TODO: Add ExprStr_find to seealso #' Check if string contains a substring that matches a pattern #' -#' @details To modify regular expression behaviour (such as case-sensitivity) with flags, -#' use the inline `(?iLmsuxU)` syntax. See the regex crate’s section on -#' [grouping and flags](https://docs.rs/regex/latest/regex/#grouping-and-flags) -#' for additional information about the use of inline expression modifiers. #' @param pattern A character or something can be coerced to a string [Expr][Expr_class] #' of a valid regex pattern, compatible with the [regex crate](https://docs.rs/regex/latest/regex/). #' @param ... Ignored. @@ -443,10 +438,22 @@ ExprStr_pad_start = function(width, fillchar = " ") { #' not as a regular expression. #' @param strict Logical. If `TRUE` (default), raise an error if the underlying pattern is #' not a valid regex, otherwise mask out with a null value. +#' +#' @details To modify regular expression behaviour (such as case-sensitivity) +#' with flags, use the inline `(?iLmsuxU)` syntax. See the regex crate’s section +#' on [grouping and flags](https://docs.rs/regex/latest/regex/#grouping-and-flags) +#' for additional information about the use of inline expression modifiers. +#' #' @return [Expr][Expr_class] of Boolean data type #' @seealso -#' - [`$str$start_with()`][ExprStr_starts_with]: Check if string values start with a substring. -#' - [`$str$ends_with()`][ExprStr_ends_with]: Check if string values end with a substring. +#' - [`$str$start_with()`][ExprStr_starts_with]: Check if string values +#' start with a substring. +#' - [`$str$ends_with()`][ExprStr_ends_with]: Check if string values end +#' with a substring. +#' - [`$str$find()`][ExprStr_find]: Return the index position of the first +#' substring matching a pattern. +#' +#' #' @examples #' # The inline `(?i)` syntax example #' pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( @@ -969,13 +976,18 @@ ExprStr_extract_groups = function(pattern) { #' Return the index position of the first substring matching a pattern #' -#' @inheritParams ExprStr_count_matches -#' @param ... Not used. -#' @param strict Raise an error if the underlying pattern is not a valid regex, -#' otherwise mask out with a null value. +#' @inheritParams ExprStr_contains #' #' @return An Expr of data type UInt32 #' +#' @seealso +#' - [`$str$start_with()`][ExprStr_starts_with]: Check if string values +#' start with a substring. +#' - [`$str$ends_with()`][ExprStr_ends_with]: Check if string values end +#' with a substring. +#' - [`$str$contains()`][ExprStr_contains]: Check if string contains a substring +#' that matches a pattern. +#' #' @examples #' pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( #' default_match = pl$col("s")$str$find("Aa"), diff --git a/man/ExprStr_contains.Rd b/man/ExprStr_contains.Rd index ccbd56164..d656a9fa1 100644 --- a/man/ExprStr_contains.Rd +++ b/man/ExprStr_contains.Rd @@ -25,9 +25,9 @@ not a valid regex, otherwise mask out with a null value.} Check if string contains a substring that matches a pattern } \details{ -To modify regular expression behaviour (such as case-sensitivity) with flags, -use the inline \code{(?iLmsuxU)} syntax. See the regex crate’s section on -\href{https://docs.rs/regex/latest/regex/#grouping-and-flags}{grouping and flags} +To modify regular expression behaviour (such as case-sensitivity) +with flags, use the inline \code{(?iLmsuxU)} syntax. See the regex crate’s section +on \href{https://docs.rs/regex/latest/regex/#grouping-and-flags}{grouping and flags} for additional information about the use of inline expression modifiers. } \examples{ @@ -45,7 +45,11 @@ df$with_columns( } \seealso{ \itemize{ -\item \code{\link[=ExprStr_starts_with]{$str$start_with()}}: Check if string values start with a substring. -\item \code{\link[=ExprStr_ends_with]{$str$ends_with()}}: Check if string values end with a substring. +\item \code{\link[=ExprStr_starts_with]{$str$start_with()}}: Check if string values +start with a substring. +\item \code{\link[=ExprStr_ends_with]{$str$ends_with()}}: Check if string values end +with a substring. +\item \code{\link[=ExprStr_find]{$str$find()}}: Return the index position of the first +substring matching a pattern. } } diff --git a/man/ExprStr_find.Rd b/man/ExprStr_find.Rd new file mode 100644 index 000000000..6a7ede99d --- /dev/null +++ b/man/ExprStr_find.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/expr__string.R +\name{ExprStr_find} +\alias{ExprStr_find} +\title{Return the index position of the first substring matching a pattern} +\usage{ +ExprStr_find(pattern, ..., literal = FALSE, strict = TRUE) +} +\arguments{ +\item{pattern}{A character or something can be coerced to a string \link[=Expr_class]{Expr} +of a valid regex pattern, compatible with the \href{https://docs.rs/regex/latest/regex/}{regex crate}.} + +\item{...}{Ignored.} + +\item{literal}{Logical. If \code{TRUE} (default), treat \code{pattern} as a literal string, +not as a regular expression.} + +\item{strict}{Logical. If \code{TRUE} (default), raise an error if the underlying pattern is +not a valid regex, otherwise mask out with a null value.} +} +\value{ +An Expr of data type UInt32 +} +\description{ +Return the index position of the first substring matching a pattern +} +\examples{ +pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( + default_match = pl$col("s")$str$find("Aa"), + insensitive_match = pl$col("s")$str$find("(?i)Aa") +) +} +\seealso{ +\itemize{ +\item \code{\link[=ExprStr_starts_with]{$str$start_with()}}: Check if string values +start with a substring. +\item \code{\link[=ExprStr_ends_with]{$str$ends_with()}}: Check if string values end +with a substring. +\item \code{\link[=ExprStr_contains]{$str$contains()}}: Check if string contains a substring +that matches a pattern. +} +} From 43b8de0b19acc6124520186493832928ae6f718f Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 30 Mar 2024 09:54:18 +0100 Subject: [PATCH 3/6] snapshot --- tests/testthat/_snaps/after-wrappers.md | 47 +++++++++++++------------ 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/tests/testthat/_snaps/after-wrappers.md b/tests/testthat/_snaps/after-wrappers.md index 716c5dc1f..acc43c3bf 100644 --- a/tests/testthat/_snaps/after-wrappers.md +++ b/tests/testthat/_snaps/after-wrappers.md @@ -406,29 +406,30 @@ [267] "str_contains_any" "str_count_matches" [269] "str_ends_with" "str_explode" [271] "str_extract" "str_extract_all" - [273] "str_extract_groups" "str_hex_decode" - [275] "str_hex_encode" "str_json_decode" - [277] "str_json_path_match" "str_len_bytes" - [279] "str_len_chars" "str_pad_end" - [281] "str_pad_start" "str_parse_int" - [283] "str_replace" "str_replace_all" - [285] "str_replace_many" "str_reverse" - [287] "str_slice" "str_split" - [289] "str_split_exact" "str_splitn" - [291] "str_starts_with" "str_strip_chars" - [293] "str_strip_chars_end" "str_strip_chars_start" - [295] "str_to_date" "str_to_datetime" - [297] "str_to_lowercase" "str_to_time" - [299] "str_to_titlecase" "str_to_uppercase" - [301] "str_zfill" "struct_field_by_name" - [303] "struct_rename_fields" "sub" - [305] "sum" "tail" - [307] "tan" "tanh" - [309] "timestamp" "to_physical" - [311] "top_k" "unique" - [313] "unique_counts" "unique_stable" - [315] "upper_bound" "value_counts" - [317] "var" "xor" + [273] "str_extract_groups" "str_find" + [275] "str_hex_decode" "str_hex_encode" + [277] "str_json_decode" "str_json_path_match" + [279] "str_len_bytes" "str_len_chars" + [281] "str_pad_end" "str_pad_start" + [283] "str_parse_int" "str_replace" + [285] "str_replace_all" "str_replace_many" + [287] "str_reverse" "str_slice" + [289] "str_split" "str_split_exact" + [291] "str_splitn" "str_starts_with" + [293] "str_strip_chars" "str_strip_chars_end" + [295] "str_strip_chars_start" "str_to_date" + [297] "str_to_datetime" "str_to_lowercase" + [299] "str_to_time" "str_to_titlecase" + [301] "str_to_uppercase" "str_zfill" + [303] "struct_field_by_name" "struct_rename_fields" + [305] "sub" "sum" + [307] "tail" "tan" + [309] "tanh" "timestamp" + [311] "to_physical" "top_k" + [313] "unique" "unique_counts" + [315] "unique_stable" "upper_bound" + [317] "value_counts" "var" + [319] "xor" # public and private methods of each class When From f86800411bb9da692a368a4e080fde62b37c8456 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 30 Mar 2024 09:55:18 +0100 Subject: [PATCH 4/6] fix test --- tests/testthat/test-expr_string.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-expr_string.R b/tests/testthat/test-expr_string.R index ebc5ae8aa..8d470a5dd 100644 --- a/tests/testthat/test-expr_string.R +++ b/tests/testthat/test-expr_string.R @@ -864,6 +864,6 @@ test_that("str$find() works", { # combining "literal" and "strict" expect_silent( - test$select(lit = pl$col("s")$str$find("(?iAa", literal = TRUE)) + test$select(lit = pl$col("s")$str$find("(?iAa", strict = TRUE, literal = TRUE)) ) }) From 1a3fe50aa2032f63ef132cbb788528917222acf1 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 30 Mar 2024 10:12:28 +0100 Subject: [PATCH 5/6] also inherit details from $str$contains [skip ci] --- R/expr__string.R | 2 +- man/ExprStr_find.Rd | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/R/expr__string.R b/R/expr__string.R index 51ce377d6..a793bdae0 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -976,7 +976,7 @@ ExprStr_extract_groups = function(pattern) { #' Return the index position of the first substring matching a pattern #' -#' @inheritParams ExprStr_contains +#' @inherit ExprStr_contains params details #' #' @return An Expr of data type UInt32 #' diff --git a/man/ExprStr_find.Rd b/man/ExprStr_find.Rd index 6a7ede99d..00c4082f4 100644 --- a/man/ExprStr_find.Rd +++ b/man/ExprStr_find.Rd @@ -24,6 +24,12 @@ An Expr of data type UInt32 \description{ Return the index position of the first substring matching a pattern } +\details{ +To modify regular expression behaviour (such as case-sensitivity) +with flags, use the inline \code{(?iLmsuxU)} syntax. See the regex crate’s section +on \href{https://docs.rs/regex/latest/regex/#grouping-and-flags}{grouping and flags} +for additional information about the use of inline expression modifiers. +} \examples{ pl$DataFrame(s = c("AAA", "aAa", "aaa"))$with_columns( default_match = pl$col("s")$str$find("Aa"), From 683b0fd3c069d3bc2fd27509bf8d07e6e007c984 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 30 Mar 2024 10:28:36 +0100 Subject: [PATCH 6/6] revert position change --- R/expr__string.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/expr__string.R b/R/expr__string.R index a793bdae0..ce6bdae16 100644 --- a/R/expr__string.R +++ b/R/expr__string.R @@ -431,6 +431,11 @@ ExprStr_pad_start = function(width, fillchar = " ") { #' Check if string contains a substring that matches a pattern #' +#' @details To modify regular expression behaviour (such as case-sensitivity) +#' with flags, use the inline `(?iLmsuxU)` syntax. See the regex crate’s section +#' on [grouping and flags](https://docs.rs/regex/latest/regex/#grouping-and-flags) +#' for additional information about the use of inline expression modifiers. +#' #' @param pattern A character or something can be coerced to a string [Expr][Expr_class] #' of a valid regex pattern, compatible with the [regex crate](https://docs.rs/regex/latest/regex/). #' @param ... Ignored. @@ -439,11 +444,6 @@ ExprStr_pad_start = function(width, fillchar = " ") { #' @param strict Logical. If `TRUE` (default), raise an error if the underlying pattern is #' not a valid regex, otherwise mask out with a null value. #' -#' @details To modify regular expression behaviour (such as case-sensitivity) -#' with flags, use the inline `(?iLmsuxU)` syntax. See the regex crate’s section -#' on [grouping and flags](https://docs.rs/regex/latest/regex/#grouping-and-flags) -#' for additional information about the use of inline expression modifiers. -#' #' @return [Expr][Expr_class] of Boolean data type #' @seealso #' - [`$str$start_with()`][ExprStr_starts_with]: Check if string values