Skip to content

Commit eccfc97

Browse files
committed
Fix parsing of regexps that toggle extended mode on/off inside regexp
This was broken in ec35422. That commit didn't handle cases where extended mode was turned on/off inside the regexp. There are two ways to turn extended mode on/off: ``` /(?-x:#y)#z /x =~ '#y' /(?-x)#y(?x)#z /x =~ '#y' ``` These can be nested inside the same regexp: ``` /(?-x:(?x)#x (?-x)#y)#z /x =~ '#y' ``` As you can probably imagine, this makes handling these regexps somewhat complex. Due to the nesting inside portions of regexps, the unescape_nonascii function needs to be recursive. In recursive mode, it needs to track both opening and closing parentheses, similar to how it already tracked opening and closing brackets for character classes. When scanning the regexp and coming to `(?` not followed by `#`, scan for options, and use `x` and `-` to determine whether to turn on or off extended mode. For `:`, indicating that only the current regexp section should have the extended mode switched, recurse with the extended mode set or unset. For `)`, indicating that the remainder of the regexp (or current regexp portion if already recursing) should turn extended mode on or off, just change the extended mode flag and keep scanning. While testing this, I noticed that `a`, `d`, and `u` are accepted as options, in addition to `i`, `m`, and `x`, but I can't see where those options are documented. I'm not sure whether or not handling `a`, `d`, and `u` as options is a bug. Fixes [Bug #19379]
1 parent 3f54d09 commit eccfc97

File tree

2 files changed

+176
-33
lines changed

2 files changed

+176
-33
lines changed

re.c

Lines changed: 120 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2801,14 +2801,18 @@ unescape_unicode_bmp(const char **pp, const char *end,
28012801
}
28022802

28032803
static int
2804-
unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
2804+
unescape_nonascii0(const char **pp, const char *end, rb_encoding *enc,
28052805
VALUE buf, rb_encoding **encp, int *has_property,
2806-
onig_errmsg_buffer err, int options)
2806+
onig_errmsg_buffer err, int options, int recurse)
28072807
{
2808+
const char *p = *pp;
28082809
unsigned char c;
28092810
char smallbuf[2];
28102811
int in_char_class = 0;
2812+
int parens = 1; /* ignored unless recurse is true */
2813+
int extended_mode = options & ONIG_OPTION_EXTEND;
28112814

2815+
begin_scan:
28122816
while (p < end) {
28132817
int chlen = rb_enc_precise_mbclen(p, end, enc);
28142818
if (!MBCLEN_CHARFOUND_P(chlen)) {
@@ -2920,7 +2924,7 @@ unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
29202924
break;
29212925

29222926
case '#':
2923-
if ((options & ONIG_OPTION_EXTEND) && !in_char_class) {
2927+
if (extended_mode && !in_char_class) {
29242928
/* consume and ignore comment in extended regexp */
29252929
while ((p < end) && ((c = *p++) != '\n'));
29262930
break;
@@ -2937,51 +2941,134 @@ unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
29372941
}
29382942
rb_str_buf_cat(buf, (char *)&c, 1);
29392943
break;
2944+
case ')':
2945+
rb_str_buf_cat(buf, (char *)&c, 1);
2946+
if (!in_char_class && recurse) {
2947+
if (--parens == 0) {
2948+
*pp = p;
2949+
return 0;
2950+
}
2951+
}
2952+
break;
29402953
case '(':
2941-
if (!in_char_class && p + 1 < end && *p == '?' && *(p+1) == '#') {
2942-
/* (?# is comment inside any regexp, and content inside should be ignored */
2943-
const char *orig_p = p;
2944-
int cont = 1;
2945-
2946-
while (cont && (p < end)) {
2947-
switch (c = *p++) {
2948-
default:
2949-
if (!(c & 0x80)) break;
2950-
--p;
2951-
/* fallthrough */
2952-
case '\\':
2953-
chlen = rb_enc_precise_mbclen(p, end, enc);
2954-
if (!MBCLEN_CHARFOUND_P(chlen)) {
2955-
goto invalid_multibyte;
2954+
if (!in_char_class && p + 1 < end && *p == '?') {
2955+
if (*(p+1) == '#') {
2956+
/* (?# is comment inside any regexp, and content inside should be ignored */
2957+
const char *orig_p = p;
2958+
int cont = 1;
2959+
2960+
while (cont && (p < end)) {
2961+
switch (c = *p++) {
2962+
default:
2963+
if (!(c & 0x80)) break;
2964+
--p;
2965+
/* fallthrough */
2966+
case '\\':
2967+
chlen = rb_enc_precise_mbclen(p, end, enc);
2968+
if (!MBCLEN_CHARFOUND_P(chlen)) {
2969+
goto invalid_multibyte;
2970+
}
2971+
p += MBCLEN_CHARFOUND_LEN(chlen);
2972+
break;
2973+
case ')':
2974+
cont = 0;
2975+
break;
29562976
}
2957-
p += MBCLEN_CHARFOUND_LEN(chlen);
2958-
break;
2959-
case ')':
2960-
cont = 0;
2961-
break;
29622977
}
2963-
}
29642978

2965-
if (cont) {
2966-
/* unterminated (?#, rewind so it is syntax error */
2967-
p = orig_p;
2968-
c = '(';
2969-
rb_str_buf_cat(buf, (char *)&c, 1);
2979+
if (cont) {
2980+
/* unterminated (?#, rewind so it is syntax error */
2981+
p = orig_p;
2982+
c = '(';
2983+
rb_str_buf_cat(buf, (char *)&c, 1);
2984+
}
2985+
break;
2986+
} else {
2987+
/* potential change of extended option */
2988+
int invert = 0;
2989+
int local_extend = 0;
2990+
const char *s;
2991+
2992+
if (recurse) {
2993+
parens++;
2994+
}
2995+
2996+
for(s = p+1; s < end; s++) {
2997+
switch(*s) {
2998+
case 'x':
2999+
local_extend = invert ? -1 : 1;
3000+
break;
3001+
case '-':
3002+
invert = 1;
3003+
break;
3004+
case ':':
3005+
case ')':
3006+
if (local_extend == 0 ||
3007+
(local_extend == -1 && !extended_mode) ||
3008+
(local_extend == 1 && extended_mode)) {
3009+
/* no changes to extended flag */
3010+
goto fallthrough;
3011+
}
3012+
3013+
if (*s == ':') {
3014+
/* change extended flag until ')' */
3015+
int local_options = options;
3016+
if (local_extend == 1) {
3017+
local_options |= ONIG_OPTION_EXTEND;
3018+
} else {
3019+
local_options &= ~ONIG_OPTION_EXTEND;
3020+
}
3021+
3022+
rb_str_buf_cat(buf, (char *)&c, 1);
3023+
int ret = unescape_nonascii0(&p, end, enc, buf, encp,
3024+
has_property, err,
3025+
local_options, 1);
3026+
if (ret < 0) return ret;
3027+
goto begin_scan;
3028+
} else {
3029+
/* change extended flag for rest of expression */
3030+
extended_mode = local_extend == 1;
3031+
goto fallthrough;
3032+
}
3033+
case 'i':
3034+
case 'm':
3035+
case 'a':
3036+
case 'd':
3037+
case 'u':
3038+
/* other option flags, ignored during scanning */
3039+
break;
3040+
default:
3041+
/* other character, no extended flag change*/
3042+
goto fallthrough;
3043+
}
3044+
}
29703045
}
3046+
} else if (!in_char_class && recurse) {
3047+
parens++;
29713048
}
2972-
else {
2973-
rb_str_buf_cat(buf, (char *)&c, 1);
2974-
}
2975-
break;
3049+
/* FALLTHROUGH */
29763050
default:
3051+
fallthrough:
29773052
rb_str_buf_cat(buf, (char *)&c, 1);
29783053
break;
29793054
}
29803055
}
29813056

3057+
if (recurse) {
3058+
*pp = p;
3059+
}
29823060
return 0;
29833061
}
29843062

3063+
static int
3064+
unescape_nonascii(const char *p, const char *end, rb_encoding *enc,
3065+
VALUE buf, rb_encoding **encp, int *has_property,
3066+
onig_errmsg_buffer err, int options)
3067+
{
3068+
return unescape_nonascii0(&p, end, enc, buf, encp, has_property,
3069+
err, options, 0);
3070+
}
3071+
29853072
static VALUE
29863073
rb_reg_preprocess(const char *p, const char *end, rb_encoding *enc,
29873074
rb_encoding **fixed_enc, onig_errmsg_buffer err, int options)

test/ruby/test_regexp.rb

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,62 @@ def test_extended_comment_invalid_escape_bug_18294
144144
assert_raise(SyntaxError) {eval "/# \\users/"}
145145
end
146146

147+
def test_nonextended_section_of_extended_regexp_bug_19379
148+
assert_separately([], <<-'RUBY')
149+
re = /(?-x:#)/x
150+
assert_match(re, '#')
151+
assert_not_match(re, '-')
152+
153+
re = /(?xi:#
154+
y)/
155+
assert_match(re, 'Y')
156+
assert_not_match(re, '-')
157+
158+
re = /(?mix:#
159+
y)/
160+
assert_match(re, 'Y')
161+
assert_not_match(re, '-')
162+
163+
re = /(?x-im:#
164+
y)/i
165+
assert_match(re, 'y')
166+
assert_not_match(re, 'Y')
167+
168+
re = /(?-imx:(?xim:#
169+
y))/x
170+
assert_match(re, 'y')
171+
assert_not_match(re, '-')
172+
173+
re = /(?x)#
174+
y/
175+
assert_match(re, 'y')
176+
assert_not_match(re, 'Y')
177+
178+
re = /(?mx-i)#
179+
y/i
180+
assert_match(re, 'y')
181+
assert_not_match(re, 'Y')
182+
183+
re = /(?-imx:(?xim:#
184+
(?-x)y#))/x
185+
assert_match(re, 'Y#')
186+
assert_not_match(re, '-#')
187+
188+
re = /(?imx:#
189+
(?-xim:#(?im)#(?x)#
190+
)#
191+
(?x)#
192+
y)/
193+
assert_match(re, '###Y')
194+
assert_not_match(re, '###-')
195+
196+
re = %r{#c-\w+/comment/[\w-]+}
197+
re = %r{https?://[^/]+#{re}}x
198+
assert_match(re, 'http://foo#c-x/comment/bar')
199+
assert_not_match(re, 'http://foo#cx/comment/bar')
200+
RUBY
201+
end
202+
147203
def test_union
148204
assert_equal :ok, begin
149205
Regexp.union(

0 commit comments

Comments
 (0)