Skip to content

Commit

Permalink
[ruby/prism] Fix incorrect parsing when using invalid regexp options
Browse files Browse the repository at this point in the history
Fixes ruby/prism#2617.

There was an issue with the lexer as follows.
The following are valid regexp options:

```console
$ bundle exec ruby -Ilib -rprism -ve 'p Prism.lex("/x/io").value.map {|token| token[0].type }'
ruby 3.3.0 (2023-12-25 revision ruby/prism@5124f9ac75) [x86_64-darwin22]
[:REGEXP_BEGIN, :STRING_CONTENT, :REGEXP_END, :EOF]
```

The following are invalid regexp options. An unnecessary `IDENTIFIER` token appears:

```console
$ bundle exec ruby -Ilib -rprism -ve 'p Prism.lex("/x/az").value.map {|token| token[0].type }'
ruby 3.3.0 (2023-12-25 revision ruby/prism@5124f9ac75) [x86_64-darwin22]
[:REGEXP_BEGIN, :STRING_CONTENT, :REGEXP_END, :IDENTIFIER, :EOF]
```

In Ruby, the letters `A` to `Z` and `a` to `z` that follow the closing delimiter are read as regexp options, and unrecognized ones are reported as invalid regexp options, e.g.:

```console
$ ruby -e '/regexp/az'
-e:1: unknown regexp options - az
/regexp/az
-e: compile error (SyntaxError)
```

Thus, these trailing letters should probably not be lexed as an `IDENTIFIER` token.

Therefore, `pm_byte_table` has been adapted to accept those invalid regexp option values.
Whether it is a valid regexp option or not is checked by `pm_regular_expression_flags_create`.
For invalid regexp options, `PM_ERR_REGEXP_UNKNOWN_OPTIONS` is added to diagnostics.

ruby/prism@d2a6096fcf
  • Loading branch information
koic authored and matzbot committed Mar 25, 2024
1 parent 9b921f6 commit 56a2fad
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 10 deletions.
2 changes: 2 additions & 0 deletions lib/prism/translation/parser.rb
Expand Up @@ -173,6 +173,8 @@ def error_diagnostic(error, offset_cache)
Diagnostic.new(:error, :duplicate_argument, {}, diagnostic_location, [])
when :parameter_numbered_reserved
Diagnostic.new(:error, :reserved_for_numparam, { name: location.slice }, diagnostic_location, [])
when :regexp_unknown_options
Diagnostic.new(:error, :regexp_options, { options: location.slice[1..] }, diagnostic_location, [])
when :singleton_for_literals
Diagnostic.new(:error, :singleton_literal, {}, diagnostic_location, [])
when :string_literal_eof
Expand Down
1 change: 1 addition & 0 deletions prism/config.yml
Expand Up @@ -199,6 +199,7 @@ errors:
- REGEXP_INVALID_UNICODE_RANGE
- REGEXP_NON_ESCAPED_MBC
- REGEXP_TERM
- REGEXP_UNKNOWN_OPTIONS
- REGEXP_UTF8_CHAR_NON_UTF8_REGEXP
- RESCUE_EXPRESSION
- RESCUE_MODIFIER_VALUE
Expand Down
21 changes: 15 additions & 6 deletions prism/prism.c
Expand Up @@ -1214,10 +1214,12 @@ pm_node_flag_set_repeated_parameter(pm_node_t *node) {
* Parse out the options for a regular expression.
*/
static inline pm_node_flags_t
pm_regular_expression_flags_create(const pm_token_t *closing) {
pm_regular_expression_flags_create(pm_parser_t *parser, const pm_token_t *closing) {
pm_node_flags_t flags = 0;

if (closing->type == PM_TOKEN_REGEXP_END) {
pm_buffer_t unknown_flags = { 0 };

for (const uint8_t *flag = closing->start + 1; flag < closing->end; flag++) {
switch (*flag) {
case 'i': flags |= PM_REGULAR_EXPRESSION_FLAGS_IGNORE_CASE; break;
Expand All @@ -1230,9 +1232,16 @@ pm_regular_expression_flags_create(const pm_token_t *closing) {
case 's': flags = (pm_node_flags_t) (((pm_node_flags_t) (flags & PM_REGULAR_EXPRESSION_ENCODING_MASK)) | PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J); break;
case 'u': flags = (pm_node_flags_t) (((pm_node_flags_t) (flags & PM_REGULAR_EXPRESSION_ENCODING_MASK)) | PM_REGULAR_EXPRESSION_FLAGS_UTF_8); break;

default: assert(false && "unreachable");
default: pm_buffer_append_byte(&unknown_flags, *flag);
}
}

size_t unknown_flags_length = pm_buffer_length(&unknown_flags);
if (unknown_flags_length != 0) {
char *word = unknown_flags_length >= 2 ? "options" : "option";
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_REGEXP_UNKNOWN_OPTIONS, word, unknown_flags_length, pm_buffer_value(&unknown_flags));
}
pm_buffer_free(&unknown_flags);
}

return flags;
Expand Down Expand Up @@ -4297,10 +4306,10 @@ pm_interpolated_regular_expression_node_append(pm_interpolated_regular_expressio
}

static inline void
pm_interpolated_regular_expression_node_closing_set(pm_interpolated_regular_expression_node_t *node, const pm_token_t *closing) {
pm_interpolated_regular_expression_node_closing_set(pm_parser_t *parser, pm_interpolated_regular_expression_node_t *node, const pm_token_t *closing) {
node->closing_loc = PM_LOCATION_TOKEN_VALUE(closing);
node->base.location.end = closing->end;
pm_node_flag_set((pm_node_t *)node, pm_regular_expression_flags_create(closing));
pm_node_flag_set((pm_node_t *)node, pm_regular_expression_flags_create(parser, closing));
}

/**
Expand Down Expand Up @@ -5528,7 +5537,7 @@ pm_regular_expression_node_create_unescaped(pm_parser_t *parser, const pm_token_
*node = (pm_regular_expression_node_t) {
{
.type = PM_REGULAR_EXPRESSION_NODE,
.flags = pm_regular_expression_flags_create(closing) | PM_NODE_FLAG_STATIC_LITERAL,
.flags = pm_regular_expression_flags_create(parser, closing) | PM_NODE_FLAG_STATIC_LITERAL,
.location = {
.start = MIN(opening->start, closing->start),
.end = MAX(opening->end, closing->end)
Expand Down Expand Up @@ -17490,7 +17499,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
expect1(parser, PM_TOKEN_REGEXP_END, PM_ERR_REGEXP_TERM);
}

pm_interpolated_regular_expression_node_closing_set(interpolated, &closing);
pm_interpolated_regular_expression_node_closing_set(parser, interpolated, &closing);
return (pm_node_t *) interpolated;
}
case PM_TOKEN_BACKTICK:
Expand Down
1 change: 1 addition & 0 deletions prism/templates/src/diagnostic.c.erb
Expand Up @@ -277,6 +277,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
[PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_FATAL },
[PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_FATAL },
[PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_FATAL },
[PM_ERR_REGEXP_UNKNOWN_OPTIONS] = { "unknown regexp %s: %.*s", PM_ERROR_LEVEL_FATAL },
[PM_ERR_REGEXP_TERM] = { "expected a closing delimiter for the regular expression", PM_ERROR_LEVEL_FATAL },
[PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_FATAL },
[PM_ERR_RESCUE_EXPRESSION] = { "expected a rescued expression", PM_ERROR_LEVEL_FATAL },
Expand Down
8 changes: 4 additions & 4 deletions prism/util/pm_char.c
Expand Up @@ -19,10 +19,10 @@ static const uint8_t pm_byte_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5x
0, 0, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 4, 4, // 6x
0, 0, 0, 4, 0, 4, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, // 7x
0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 4x
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 5x
0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 6x
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 7x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
Expand Down
14 changes: 14 additions & 0 deletions test/prism/errors_test.rb
Expand Up @@ -2067,6 +2067,20 @@ def test_it_with_ordinary_parameter
assert_errors expression(source), source, errors, compare_ripper: false
end

def test_regular_expression_with_unknown_regexp_options
source = "/foo/AZaz"
errors = [["unknown regexp options: AZaz", 4..9]]

assert_errors expression(source), source, errors
end

def test_interpolated_regular_expression_with_unknown_regexp_options
source = "/\#{foo}/AZaz"
errors = [["unknown regexp options: AZaz", 7..12]]

assert_errors expression(source), source, errors
end

def test_singleton_method_for_literals
source = <<~'RUBY'
def (1).g; end
Expand Down
2 changes: 2 additions & 0 deletions test/prism/location_test.rb
Expand Up @@ -527,6 +527,7 @@ def test_InterpolatedMatchLastLineNode

def test_InterpolatedRegularExpressionNode
assert_location(InterpolatedRegularExpressionNode, "/\#{foo}/")
assert_location(InterpolatedRegularExpressionNode, "/\#{foo}/io")
end

def test_InterpolatedStringNode
Expand Down Expand Up @@ -730,6 +731,7 @@ def test_RedoNode

def test_RegularExpressionNode
assert_location(RegularExpressionNode, "/foo/")
assert_location(RegularExpressionNode, "/foo/io")
end

def test_RequiredKeywordParameterNode
Expand Down

0 comments on commit 56a2fad

Please sign in to comment.