Skip to content

Commit

Permalink
merge revision(s) eccfc97: [Backport #19379]
Browse files Browse the repository at this point in the history
	Fix parsing of regexps that toggle extended mode on/off inside regexp

	This was broken in ec35422. That commit
	didn't handle cases where extended mode was turned on/off inside the
	regexp.  There are two ways to turn extended mode on/off:

	```
	/(?-x:#y)#z
	/x =~ '#y'

	/(?-x)#y(?x)#z
	/x =~ '#y'
	```

	These can be nested inside the same regexp:

	```
	/(?-x:(?x)#x
	(?-x)#y)#z
	/x =~ '#y'
	```

	As you can probably imagine, this makes handling these regexps
	somewhat complex. Due to the nesting inside portions of regexps,
	the unassign_nonascii function needs to be recursive.  In
	recursive mode, it needs to track both opening and closing
	parentheses, similar to how it already tracked opening and
	closing brackets for character classes.

	When scanning the regexp and coming to `(?` not followed by `#`,
	scan for options, and use `x` and `-` to determine whether to
	turn extended mode on or off.  For `:`, indicating that only the
	current regexp section should have the extended mode
	switched, recurse with the extended mode set or unset. For `)`,
	indicating that the remainder of the regexp (or the current regexp
	portion if already recursing) should turn extended mode on or off,
	just change the extended mode flag and keep scanning.

	While testing this, I noticed that `a`, `d`, and `u` are accepted
	as options, in addition to `i`, `m`, and `x`, but I can't see
	where those options are documented.  I'm not sure whether or not
	handling `a`, `d`, and `u` as options is a bug.

	Fixes [Bug #19379]
	---
	 re.c                     | 153 +++++++++++++++++++++++++++++++++++++----------
	 test/ruby/test_regexp.rb |  56 +++++++++++++++++
	 2 files changed, 176 insertions(+), 33 deletions(-)
  • Loading branch information
nurse committed Jan 31, 2023
1 parent 5a2b289 commit ca75332
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 34 deletions.
153 changes: 120 additions & 33 deletions re.c
Expand Up @@ -2801,14 +2801,18 @@ unescape_unicode_bmp(const char **pp, const char *end,
}

static int
unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
unescape_nonascii0(const char **pp, const char *end, rb_encoding *enc,
VALUE buf, rb_encoding **encp, int *has_property,
onig_errmsg_buffer err, int options)
onig_errmsg_buffer err, int options, int recurse)
{
const char *p = *pp;
unsigned char c;
char smallbuf[2];
int in_char_class = 0;
int parens = 1; /* ignored unless recurse is true */
int extended_mode = options & ONIG_OPTION_EXTEND;

begin_scan:
while (p < end) {
int chlen = rb_enc_precise_mbclen(p, end, enc);
if (!MBCLEN_CHARFOUND_P(chlen)) {
Expand Down Expand Up @@ -2920,7 +2924,7 @@ unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
break;

case '#':
if ((options & ONIG_OPTION_EXTEND) && !in_char_class) {
if (extended_mode && !in_char_class) {
/* consume and ignore comment in extended regexp */
while ((p < end) && ((c = *p++) != '\n'));
break;
Expand All @@ -2937,51 +2941,134 @@ unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
}
rb_str_buf_cat(buf, (char *)&c, 1);
break;
case ')':
rb_str_buf_cat(buf, (char *)&c, 1);
if (!in_char_class && recurse) {
if (--parens == 0) {
*pp = p;
return 0;
}
}
break;
case '(':
if (!in_char_class && p + 1 < end && *p == '?' && *(p+1) == '#') {
/* (?# is comment inside any regexp, and content inside should be ignored */
const char *orig_p = p;
int cont = 1;

while (cont && (p < end)) {
switch (c = *p++) {
default:
if (!(c & 0x80)) break;
--p;
/* fallthrough */
case '\\':
chlen = rb_enc_precise_mbclen(p, end, enc);
if (!MBCLEN_CHARFOUND_P(chlen)) {
goto invalid_multibyte;
if (!in_char_class && p + 1 < end && *p == '?') {
if (*(p+1) == '#') {
/* (?# is comment inside any regexp, and content inside should be ignored */
const char *orig_p = p;
int cont = 1;

while (cont && (p < end)) {
switch (c = *p++) {
default:
if (!(c & 0x80)) break;
--p;
/* fallthrough */
case '\\':
chlen = rb_enc_precise_mbclen(p, end, enc);
if (!MBCLEN_CHARFOUND_P(chlen)) {
goto invalid_multibyte;
}
p += MBCLEN_CHARFOUND_LEN(chlen);
break;
case ')':
cont = 0;
break;
}
p += MBCLEN_CHARFOUND_LEN(chlen);
break;
case ')':
cont = 0;
break;
}
}

if (cont) {
/* unterminated (?#, rewind so it is syntax error */
p = orig_p;
c = '(';
rb_str_buf_cat(buf, (char *)&c, 1);
if (cont) {
/* unterminated (?#, rewind so it is syntax error */
p = orig_p;
c = '(';
rb_str_buf_cat(buf, (char *)&c, 1);
}
break;
} else {
/* potential change of extended option */
int invert = 0;
int local_extend = 0;
const char *s;

if (recurse) {
parens++;
}

for(s = p+1; s < end; s++) {
switch(*s) {
case 'x':
local_extend = invert ? -1 : 1;
break;
case '-':
invert = 1;
break;
case ':':
case ')':
if (local_extend == 0 ||
(local_extend == -1 && !extended_mode) ||
(local_extend == 1 && extended_mode)) {
/* no changes to extended flag */
goto fallthrough;
}

if (*s == ':') {
/* change extended flag until ')' */
int local_options = options;
if (local_extend == 1) {
local_options |= ONIG_OPTION_EXTEND;
} else {
local_options &= ~ONIG_OPTION_EXTEND;
}

rb_str_buf_cat(buf, (char *)&c, 1);
int ret = unescape_nonascii0(&p, end, enc, buf, encp,
has_property, err,
local_options, 1);
if (ret < 0) return ret;
goto begin_scan;
} else {
/* change extended flag for rest of expression */
extended_mode = local_extend == 1;
goto fallthrough;
}
case 'i':
case 'm':
case 'a':
case 'd':
case 'u':
/* other option flags, ignored during scanning */
break;
default:
/* other character, no extended flag change*/
goto fallthrough;
}
}
}
} else if (!in_char_class && recurse) {
parens++;
}
else {
rb_str_buf_cat(buf, (char *)&c, 1);
}
break;
/* FALLTHROUGH */
default:
fallthrough:
rb_str_buf_cat(buf, (char *)&c, 1);
break;
}
}

if (recurse) {
*pp = p;
}
return 0;
}

/*
 * Top-level entry point for the unescape scan over [p, end).
 *
 * Delegates to unescape_nonascii0() with its `recurse` argument set to 0;
 * all other arguments are forwarded unchanged and the callee's return value
 * is passed straight back to the caller.
 */
static int
unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
VALUE buf, rb_encoding **encp, int *has_property,
onig_errmsg_buffer err, int options)
{
    /* The callee takes the scan position by address (const char **). */
    const char *cursor = p;
    /* 0 == not a recursive call made for a `(?x:` / `(?-x:` group. */
    const int recurse = 0;

    return unescape_nonascii0(&cursor, end, enc, buf, encp, has_property,
                              err, options, recurse);
}

static VALUE
rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
rb_encoding **fixed_enc, onig_errmsg_buffer err, int options)
Expand Down
56 changes: 56 additions & 0 deletions test/ruby/test_regexp.rb
Expand Up @@ -144,6 +144,62 @@ def test_extended_comment_invalid_escape_bug_18294
assert_raise(SyntaxError) {eval "/# \\users/"}
end

# Regression test for [Bug #19379]: `#` must start a comment only in the
# parts of a regexp where extended mode is in effect. Covers extended mode
# switched per group with `(?x:` / `(?-x:` and switched for the remainder
# with `(?x)` / `(?-x)`, combined with other option letters (i, m), nested
# inside each other, and with the outer /x or /i flag set.
# The final case interpolates a non-extended regexp containing a literal `#`
# into a regexp compiled with /x.
def test_nonextended_section_of_extended_regexp_bug_19379
# The assertions are handed to assert_separately as a heredoc; the line
# breaks inside the regexp literals are part of what is being tested.
assert_separately([], <<-'RUBY')
re = /(?-x:#)/x
assert_match(re, '#')
assert_not_match(re, '-')
re = /(?xi:#
y)/
assert_match(re, 'Y')
assert_not_match(re, '-')
re = /(?mix:#
y)/
assert_match(re, 'Y')
assert_not_match(re, '-')
re = /(?x-im:#
y)/i
assert_match(re, 'y')
assert_not_match(re, 'Y')
re = /(?-imx:(?xim:#
y))/x
assert_match(re, 'y')
assert_not_match(re, '-')
re = /(?x)#
y/
assert_match(re, 'y')
assert_not_match(re, 'Y')
re = /(?mx-i)#
y/i
assert_match(re, 'y')
assert_not_match(re, 'Y')
re = /(?-imx:(?xim:#
(?-x)y#))/x
assert_match(re, 'Y#')
assert_not_match(re, '-#')
re = /(?imx:#
(?-xim:#(?im)#(?x)#
)#
(?x)#
y)/
assert_match(re, '###Y')
assert_not_match(re, '###-')
re = %r{#c-\w+/comment/[\w-]+}
re = %r{https?://[^/]+#{re}}x
assert_match(re, 'http://foo#c-x/comment/bar')
assert_not_match(re, 'http://foo#cx/comment/bar')
RUBY
end

def test_union
assert_equal :ok, begin
Regexp.union(
Expand Down
2 changes: 1 addition & 1 deletion version.h
Expand Up @@ -11,7 +11,7 @@
# define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR
#define RUBY_VERSION_TEENY 0
#define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR
#define RUBY_PATCHLEVEL 23
#define RUBY_PATCHLEVEL 24

#include "ruby/version.h"
#include "ruby/internal/abi.h"
Expand Down

0 comments on commit ca75332

Please sign in to comment.