Skip to content

Commit 2116770

Browse files
committed
Correctly handle line continuations in %w/i% interrupted by heredocs
See https://bugs.ruby-lang.org/issues/21756. Ripper fails to parse this, and prism doesn't handle it correctly either. When heredocs are used, even lowercase percent arrays can contain multiple `STRING_CONTENT` tokens, and we need to concatenate them. Luckily we don't need to handle as many cases as in uppercase arrays, where interpolation is allowed.
1 parent 434c4c5 commit 2116770

File tree

2 files changed

+95
-32
lines changed

2 files changed

+95
-32
lines changed

snapshots/spanning_heredoc.txt

Lines changed: 33 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -192,19 +192,24 @@
192192
│ │ │ └── unescaped: "i\n"
193193
│ │ └── @ ArrayNode (location: (28,9)-(31,2))
194194
│ │ ├── flags: ∅
195-
│ │ ├── elements: (length: 2)
196-
│ │ │ ├── @ StringNode (location: (28,12)-(28,14))
197-
│ │ │ │ ├── flags: ∅
198-
│ │ │ │ ├── opening_loc: ∅
199-
│ │ │ │ ├── content_loc: (28,12)-(28,14) = "j\\"
200-
│ │ │ │ ├── closing_loc: ∅
201-
│ │ │ │ └── unescaped: "j\n"
202-
│ │ │ └── @ StringNode (location: (31,0)-(31,1))
195+
│ │ ├── elements: (length: 1)
196+
│ │ │ └── @ InterpolatedStringNode (location: (28,12)-(31,1))
203197
│ │ │ ├── flags: ∅
204198
│ │ │ ├── opening_loc: ∅
205-
│ │ │ ├── content_loc: (31,0)-(31,1) = "j"
206-
│ │ │ ├── closing_loc: ∅
207-
│ │ │ └── unescaped: "j"
199+
│ │ │ ├── parts: (length: 2)
200+
│ │ │ │ ├── @ StringNode (location: (28,12)-(28,14))
201+
│ │ │ │ │ ├── flags: static_literal, frozen
202+
│ │ │ │ │ ├── opening_loc: ∅
203+
│ │ │ │ │ ├── content_loc: (28,12)-(28,14) = "j\\"
204+
│ │ │ │ │ ├── closing_loc: ∅
205+
│ │ │ │ │ └── unescaped: "j\n"
206+
│ │ │ │ └── @ StringNode (location: (31,0)-(31,1))
207+
│ │ │ │ ├── flags: static_literal, frozen
208+
│ │ │ │ ├── opening_loc: ∅
209+
│ │ │ │ ├── content_loc: (31,0)-(31,1) = "j"
210+
│ │ │ │ ├── closing_loc: ∅
211+
│ │ │ │ └── unescaped: "j"
212+
│ │ │ └── closing_loc: ∅
208213
│ │ ├── opening_loc: (28,9)-(28,12) = "%w["
209214
│ │ └── closing_loc: (31,1)-(31,2) = "]"
210215
│ ├── closing_loc: ∅
@@ -271,19 +276,24 @@
271276
│ │ │ └── unescaped: "m\n"
272277
│ │ └── @ ArrayNode (location: (41,9)-(44,2))
273278
│ │ ├── flags: static_literal
274-
│ │ ├── elements: (length: 2)
275-
│ │ │ ├── @ SymbolNode (location: (41,12)-(41,14))
276-
│ │ │ │ ├── flags: static_literal, forced_us_ascii_encoding
277-
│ │ │ │ ├── opening_loc: ∅
278-
│ │ │ │ ├── value_loc: (41,12)-(41,14) = "n\\"
279-
│ │ │ │ ├── closing_loc: ∅
280-
│ │ │ │ └── unescaped: "n\n"
281-
│ │ │ └── @ SymbolNode (location: (44,0)-(44,1))
282-
│ │ │ ├── flags: static_literal, forced_us_ascii_encoding
279+
│ │ ├── elements: (length: 1)
280+
│ │ │ └── @ InterpolatedSymbolNode (location: (41,12)-(41,14))
281+
│ │ │ ├── flags: static_literal
283282
│ │ │ ├── opening_loc: ∅
284-
│ │ │ ├── value_loc: (44,0)-(44,1) = "n"
285-
│ │ │ ├── closing_loc: ∅
286-
│ │ │ └── unescaped: "n"
283+
│ │ │ ├── parts: (length: 2)
284+
│ │ │ │ ├── @ StringNode (location: (41,12)-(41,14))
285+
│ │ │ │ │ ├── flags: static_literal, frozen
286+
│ │ │ │ │ ├── opening_loc: ∅
287+
│ │ │ │ │ ├── content_loc: (41,12)-(41,14) = "n\\"
288+
│ │ │ │ │ ├── closing_loc: ∅
289+
│ │ │ │ │ └── unescaped: "n\n"
290+
│ │ │ │ └── @ StringNode (location: (41,12)-(41,14))
291+
│ │ │ │ ├── flags: static_literal, frozen
292+
│ │ │ │ ├── opening_loc: ∅
293+
│ │ │ │ ├── content_loc: (41,12)-(41,14) = "n\\"
294+
│ │ │ │ ├── closing_loc: ∅
295+
│ │ │ │ └── unescaped: "n"
296+
│ │ │ └── closing_loc: ∅
287297
│ │ ├── opening_loc: (41,9)-(41,12) = "%i["
288298
│ │ └── closing_loc: (44,1)-(44,2) = "]"
289299
│ ├── closing_loc: ∅

src/prism.c

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19299,18 +19299,52 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
1929919299
parser_lex(parser);
1930019300
pm_token_t opening = parser->previous;
1930119301
pm_array_node_t *array = pm_array_node_create(parser, &opening);
19302+
pm_node_t *current = NULL;
1930219303

1930319304
while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
1930419305
accept1(parser, PM_TOKEN_WORDS_SEP);
1930519306
if (match1(parser, PM_TOKEN_STRING_END)) break;
1930619307

19307-
if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
19308+
// Interpolation is not possible but nested heredocs can still lead to
19309+
// consecutive (disjoint) string tokens when the final newline is escaped.
19310+
while (match1(parser, PM_TOKEN_STRING_CONTENT)) {
1930819311
pm_token_t opening = not_provided(parser);
1930919312
pm_token_t closing = not_provided(parser);
19310-
pm_array_node_elements_append(array, UP(pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing)));
19313+
19314+
// Record the string node, moving to interpolation if needed.
19315+
if (current == NULL) {
19316+
current = UP(pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing));
19317+
parser_lex(parser);
19318+
} else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) {
19319+
pm_node_t *string = UP(pm_string_node_create_current_string(parser, &opening, &parser->current, &closing));
19320+
parser_lex(parser);
19321+
pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, string);
19322+
} else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) {
19323+
pm_symbol_node_t *cast = (pm_symbol_node_t *) current;
19324+
pm_token_t bounds = not_provided(parser);
19325+
19326+
pm_token_t content = { .type = PM_TOKEN_STRING_CONTENT, .start = cast->value_loc.start, .end = cast->value_loc.end };
19327+
pm_node_t *first_string = UP(pm_string_node_create_unescaped(parser, &bounds, &content, &bounds, &cast->unescaped));
19328+
pm_node_t *second_string = UP(pm_string_node_create_current_string(parser, &opening, &parser->previous, &closing));
19329+
parser_lex(parser);
19330+
19331+
pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing);
19332+
pm_interpolated_symbol_node_append(interpolated, first_string);
19333+
pm_interpolated_symbol_node_append(interpolated, second_string);
19334+
19335+
xfree(current);
19336+
current = UP(interpolated);
19337+
} else {
19338+
assert(false && "unreachable");
19339+
}
1931119340
}
1931219341

19313-
expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT);
19342+
if (current) {
19343+
pm_array_node_elements_append(array, current);
19344+
current = NULL;
19345+
} else {
19346+
expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT);
19347+
}
1931419348
}
1931519349

1931619350
pm_token_t closing = parser->current;
@@ -19489,23 +19523,42 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
1948919523
parser_lex(parser);
1949019524
pm_token_t opening = parser->previous;
1949119525
pm_array_node_t *array = pm_array_node_create(parser, &opening);
19492-
19493-
// skip all leading whitespaces
19494-
accept1(parser, PM_TOKEN_WORDS_SEP);
19526+
pm_node_t *current = NULL;
1949519527

1949619528
while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
1949719529
accept1(parser, PM_TOKEN_WORDS_SEP);
1949819530
if (match1(parser, PM_TOKEN_STRING_END)) break;
1949919531

19500-
if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
19532+
// Interpolation is not possible but nested heredocs can still lead to
19533+
// consecutive (disjoint) string tokens when the final newline is escaped.
19534+
while (match1(parser, PM_TOKEN_STRING_CONTENT)) {
1950119535
pm_token_t opening = not_provided(parser);
1950219536
pm_token_t closing = not_provided(parser);
1950319537

1950419538
pm_node_t *string = UP(pm_string_node_create_current_string(parser, &opening, &parser->current, &closing));
19505-
pm_array_node_elements_append(array, string);
19539+
19540+
// Record the string node, moving to interpolation if needed.
19541+
if (current == NULL) {
19542+
current = string;
19543+
} else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) {
19544+
pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string);
19545+
} else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) {
19546+
pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
19547+
pm_interpolated_string_node_append(interpolated, current);
19548+
pm_interpolated_string_node_append(interpolated, string);
19549+
current = UP(interpolated);
19550+
} else {
19551+
assert(false && "unreachable");
19552+
}
19553+
parser_lex(parser);
1950619554
}
1950719555

19508-
expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT);
19556+
if (current) {
19557+
pm_array_node_elements_append(array, current);
19558+
current = NULL;
19559+
} else {
19560+
expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT);
19561+
}
1950919562
}
1951019563

1951119564
pm_token_t closing = parser->current;

0 commit comments

Comments
 (0)