Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix regex quantifier check to include capture groups #11373

Merged
merged 17 commits into from
Aug 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ static reclass cclass_S(NCCLASS_S); // \S
static reclass cclass_D(NCCLASS_D); // \D

// Tables for analyzing quantifiers
const std::array<int, 6> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL, RBRA}};
const std::array<int, 5> valid_preceding_inst_types{{CHAR, CCLASS, NCCLASS, ANY, ANYNL}};
const std::array<char, 5> quantifiers{{'*', '?', '+', '{', '|'}};
// Valid regex characters that can be escaped and used as literals
const std::array<char, 33> escapable_chars{
Expand Down Expand Up @@ -459,16 +459,42 @@ class regex_parser {
}

// The quantifiers require at least one "real" previous item.
// We are throwing an error in these two if-checks for invalid quantifiers.
// We are throwing errors for invalid quantifiers.
// Another option is to just return CHAR silently here which effectively
// treats the chr character as a literal instead as a quantifier.
// This could lead to confusion where sometimes unescaped quantifier characters
// are treated as regex expressions and sometimes they are not.
if (_items.empty()) { CUDF_FAIL("invalid regex pattern: nothing to repeat at position 0"); }

// Check that the previous item can be used with quantifiers.
// If the previous item is a capture group, we need to check items inside the
// capture group can be used with quantifiers too.
// (Note that capture groups can be nested).
auto previous_type = _items.back().type;
if (previous_type == RBRA) { // previous item is a capture group
// look for matching LBRA
auto nested_count = 1;
auto lbra_itr =
std::find_if(_items.rbegin(), _items.rend(), [nested_count](auto const& item) mutable {
auto const is_closing = (item.type == RBRA);
auto const is_opening = (item.type == LBRA || item.type == LBRA_NC);
nested_count += is_closing - is_opening;
return is_opening && (nested_count == 0);
});
// search for the first valid item within the LBRA-RBRA range
auto first_valid = std::find_first_of(
_items.rbegin() + 1,
lbra_itr,
valid_preceding_inst_types.begin(),
valid_preceding_inst_types.end(),
[](auto const item, auto const valid_type) { return item.type == valid_type; });
// set previous_type to be checked in next if-statement
previous_type = (first_valid == lbra_itr) ? (--lbra_itr)->type : first_valid->type;
}

if (std::find(valid_preceding_inst_types.begin(),
valid_preceding_inst_types.end(),
_items.back().type) == valid_preceding_inst_types.end()) {
previous_type) == valid_preceding_inst_types.end()) {
CUDF_FAIL("invalid regex pattern: nothing to repeat at position " +
std::to_string(_expr_ptr - _pattern_begin - 1));
}
Expand Down
19 changes: 19 additions & 0 deletions cpp/tests/strings/contains_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,25 @@ TEST_F(StringsContainsTests, FixedQuantifier)
}
}

TEST_F(StringsContainsTests, QuantifierErrors)
{
auto input = cudf::test::strings_column_wrapper({"a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa"});
auto sv = cudf::strings_column_view(input);

EXPECT_THROW(cudf::strings::contains_re(sv, "^+"), cudf::logic_error);
EXPECT_THROW(cudf::strings::count_re(sv, "$+"), cudf::logic_error);
EXPECT_THROW(cudf::strings::count_re(sv, "(^)+"), cudf::logic_error);
EXPECT_THROW(cudf::strings::contains_re(sv, "($)+"), cudf::logic_error);
EXPECT_THROW(cudf::strings::count_re(sv, "\\A+"), cudf::logic_error);
EXPECT_THROW(cudf::strings::count_re(sv, "\\Z+"), cudf::logic_error);
EXPECT_THROW(cudf::strings::contains_re(sv, "(\\A)+"), cudf::logic_error);
EXPECT_THROW(cudf::strings::contains_re(sv, "(\\Z)+"), cudf::logic_error);

EXPECT_THROW(cudf::strings::contains_re(sv, "(^($))+"), cudf::logic_error);
EXPECT_NO_THROW(cudf::strings::contains_re(sv, "(^a($))+"));
EXPECT_NO_THROW(cudf::strings::count_re(sv, "(^(a$))+"));
}

TEST_F(StringsContainsTests, OverlappedClasses)
{
auto input = cudf::test::strings_column_wrapper({"abcdefg", "defghí", "", "éééééé", "ghijkl"});
Expand Down