Skip to content

Commit 11e0e20

Browse files
committed
Parse all regular expressions
1 parent 499ec1c commit 11e0e20

File tree

4 files changed

+93
-64
lines changed

4 files changed

+93
-64
lines changed

Gemfile

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,16 @@ source "https://rubygems.org"
44

55
gemspec
66

7+
gem "benchmark-ips"
78
gem "rake"
89
gem "rake-compiler"
910
gem "test-unit"
1011

11-
gem "benchmark-ips"
12-
gem "ffi"
13-
gem "onigmo", platform: %i[mri mswin mingw x64_mingw]
14-
gem "parser"
15-
gem "ruby_memcheck", platform: %i[mri mswin mingw x64_mingw]
16-
gem "ruby_parser"
12+
platforms :mri, :mswin, :mingw, :x64_mingw do
13+
gem "ffi"
14+
gem "parser"
15+
gem "ruby_memcheck"
16+
gem "ruby_parser"
17+
end
18+
19+
gem "onigmo", platforms: [:mri]

src/prism.c

Lines changed: 63 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -17402,6 +17402,51 @@ parse_yield(pm_parser_t *parser, const pm_node_t *node) {
1740217402
}
1740317403
}
1740417404

17405+
/**
17406+
* This struct is used to pass information between the regular expression parser
17407+
* and the error callback.
17408+
*/
17409+
typedef struct {
17410+
pm_parser_t *parser;
17411+
const uint8_t *start;
17412+
const uint8_t *end;
17413+
bool shared;
17414+
} parse_regular_expression_error_data_t;
17415+
17416+
/**
17417+
* This callback is called when the regular expression parser encounters a
17418+
* syntax error.
17419+
*/
17420+
static void
17421+
parse_regular_expression_error(const uint8_t *start, const uint8_t *end, const char *message, void *data) {
17422+
parse_regular_expression_error_data_t *callback_data = (parse_regular_expression_error_data_t *) data;
17423+
pm_location_t location;
17424+
17425+
if (callback_data->shared) {
17426+
location = (pm_location_t) { .start = start, .end = end };
17427+
} else {
17428+
location = (pm_location_t) { .start = callback_data->start, .end = callback_data->end };
17429+
}
17430+
17431+
PM_PARSER_ERR_FORMAT(callback_data->parser, location.start, location.end, PM_ERR_REGEXP_PARSE_ERROR, message);
17432+
}
17433+
17434+
/**
17435+
* Parse the errors for the regular expression and add them to the parser.
17436+
*/
17437+
static void
17438+
parse_regular_expression_errors(pm_parser_t *parser, pm_regular_expression_node_t *node) {
17439+
const pm_string_t *unescaped = &node->unescaped;
17440+
parse_regular_expression_error_data_t error_data = {
17441+
.parser = parser,
17442+
.start = node->base.location.start,
17443+
.end = node->base.location.end,
17444+
.shared = unescaped->type == PM_STRING_SHARED
17445+
};
17446+
17447+
pm_regexp_parse(parser, pm_string_source(unescaped), pm_string_length(unescaped), NULL, NULL, parse_regular_expression_error, &error_data);
17448+
}
17449+
1740517450
/**
1740617451
* Parse an expression that begins with the previous node that we just lexed.
1740717452
*/
@@ -19523,13 +19568,22 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
1952319568
bool ascii_only = parser->current_regular_expression_ascii_only;
1952419569
parser_lex(parser);
1952519570

19526-
// If we hit an end, then we can create a regular expression node
19527-
// without interpolation, which can be represented more succinctly and
19528-
// more easily compiled.
19571+
// If we hit an end, then we can create a regular expression
19572+
// node without interpolation, which can be represented more
19573+
// succinctly and more easily compiled.
1952919574
if (accept1(parser, PM_TOKEN_REGEXP_END)) {
19530-
pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
19531-
pm_node_flag_set(node, parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, node->flags));
19532-
return node;
19575+
pm_regular_expression_node_t *node = (pm_regular_expression_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
19576+
19577+
// If we're not immediately followed by a =~, then we want
19578+
// to parse all of the errors at this point. If it is
19579+
// followed by a =~, then it will get parsed higher up while
19580+
// parsing the named captures as well.
19581+
if (!match1(parser, PM_TOKEN_EQUAL_TILDE)) {
19582+
parse_regular_expression_errors(parser, node);
19583+
}
19584+
19585+
pm_node_flag_set((pm_node_t *) node, parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, node->base.flags));
19586+
return (pm_node_t *) node;
1953319587
}
1953419588

1953519589
// If we get here, then we have interpolation so we'll need to create
@@ -20095,38 +20149,6 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
2009520149
}
2009620150
}
2009720151

20098-
/**
20099-
* This struct is used to pass information between the regular expression parser
20100-
* and the error callback.
20101-
*/
20102-
typedef struct {
20103-
pm_parser_t *parser;
20104-
const pm_string_t *content;
20105-
const pm_call_node_t *call;
20106-
} parse_regular_expression_error_data_t;
20107-
20108-
/**
20109-
* This callback is called when the regular expression parser encounters a
20110-
* syntax error.
20111-
*/
20112-
static void
20113-
parse_regular_expression_error(const uint8_t *start, const uint8_t *end, const char *message, void *data) {
20114-
parse_regular_expression_error_data_t *callback_data = (parse_regular_expression_error_data_t *) data;
20115-
20116-
pm_parser_t *parser = callback_data->parser;
20117-
const pm_string_t *content = callback_data->content;
20118-
const pm_call_node_t *call = callback_data->call;
20119-
20120-
pm_location_t location;
20121-
if (content->type == PM_STRING_SHARED) {
20122-
location = (pm_location_t) { .start = start, .end = end };
20123-
} else {
20124-
location = call->receiver->location;
20125-
}
20126-
20127-
PM_PARSER_ERR_FORMAT(parser, location.start, location.end, PM_ERR_REGEXP_PARSE_ERROR, message);
20128-
}
20129-
2013020152
/**
2013120153
* Potentially change a =~ with a regular expression with named captures into a
2013220154
* match write node.
@@ -20142,8 +20164,9 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *
2014220164

2014320165
parse_regular_expression_error_data_t error_data = {
2014420166
.parser = parser,
20145-
.content = content,
20146-
.call = call
20167+
.start = call->receiver->location.start,
20168+
.end = call->receiver->location.end,
20169+
.shared = content->type == PM_STRING_SHARED
2014720170
};
2014820171

2014920172
pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data);

src/regexp.c

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -225,21 +225,24 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
225225
*/
226226
static bool
227227
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
228-
if (pm_regexp_char_is_eof(parser)) return true;
229-
230-
switch (*parser->cursor) {
231-
case '*':
232-
case '+':
233-
case '?':
234-
parser->cursor++;
235-
return true;
236-
case '{':
237-
parser->cursor++;
238-
return pm_regexp_parse_range_quantifier(parser);
239-
default:
240-
// In this case there is no quantifier.
241-
return true;
228+
while (!pm_regexp_char_is_eof(parser)) {
229+
switch (*parser->cursor) {
230+
case '*':
231+
case '+':
232+
case '?':
233+
parser->cursor++;
234+
break;
235+
case '{':
236+
parser->cursor++;
237+
if (!pm_regexp_parse_range_quantifier(parser)) return false;
238+
break;
239+
default:
240+
// In this case there is no quantifier.
241+
return true;
242+
}
242243
}
244+
245+
return true;
243246
}
244247

245248
/**
@@ -276,7 +279,7 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
276279
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
277280
switch (*parser->cursor++) {
278281
case '[':
279-
pm_regexp_parse_lbracket(parser, depth + 1);
282+
pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
280283
break;
281284
case '\\':
282285
if (!pm_regexp_char_is_eof(parser)) {
@@ -584,7 +587,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
584587

585588
// Now, parse the expressions within this group.
586589
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
587-
if (!pm_regexp_parse_expression(parser, depth + 1)) {
590+
if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
588591
return false;
589592
}
590593
pm_regexp_char_accept(parser, '|');
@@ -615,7 +618,7 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
615618
case '^':
616619
case '$':
617620
parser->cursor++;
618-
return true;
621+
return pm_regexp_parse_quantifier(parser);
619622
case '\\':
620623
parser->cursor++;
621624
if (!pm_regexp_char_is_eof(parser)) {

test/prism/onigmo_test.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def test_ONIGERR_UNMATCHED_CLOSE_PARENTHESIS
5454
private
5555

5656
def assert_error(source, message)
57-
result = Prism.parse(%Q{/#{source}/ =~ ""})
57+
result = Prism.parse("/#{source}/")
5858

5959
assert result.failure?, "Expected #{source.inspect} to error"
6060
assert_equal message, result.errors.first.message

0 commit comments

Comments
 (0)