Skip to content

Commit 58f839a

Browse files
committed
Fix string concat parsing
1 parent 6602b58 commit 58f839a

File tree

3 files changed

+131
-113
lines changed

3 files changed

+131
-113
lines changed

bin/lex

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,9 @@ yarp = YARP.lex_compat(source, filepath)
3232
if yarp.errors.any?
3333
puts "Errors lexing:"
3434
yarp.errors.map do |error|
35-
puts "\e[1;31m- #{error.message}\e[0m"
35+
print "- [#{error.location.start_line},#{error.location.start_column}-"
36+
print "#{error.location.end_line},#{error.location.end_column}] "
37+
puts "\e[1;31m#{error.message}\e[0m"
3638
end
3739
puts "\n"
3840
end

src/yarp.c

Lines changed: 123 additions & 107 deletions
Original file line number | Diff line number | Diff line change
@@ -12016,139 +12016,155 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
1201612016
return (yp_node_t *) node;
1201712017
}
1201812018
case YP_TOKEN_STRING_BEGIN: {
12019-
assert(parser->lex_modes.current->mode == YP_LEX_STRING);
12020-
bool lex_interpolation = parser->lex_modes.current->as.string.interpolation;
12019+
yp_node_t *result = NULL;
1202112020

12022-
yp_token_t opening = parser->current;
12023-
parser_lex(parser);
12024-
12025-
yp_node_t *node;
12021+
while (match_type_p(parser, YP_TOKEN_STRING_BEGIN)) {
12022+
assert(parser->lex_modes.current->mode == YP_LEX_STRING);
12023+
bool lex_interpolation = parser->lex_modes.current->as.string.interpolation;
1202612024

12027-
if (accept(parser, YP_TOKEN_STRING_END)) {
12028-
// If we get here, then we have an end immediately after a start. In
12029-
// that case we'll create an empty content token and return an
12030-
// uninterpolated string.
12031-
yp_token_t content = (yp_token_t) {
12032-
.type = YP_TOKEN_STRING_CONTENT,
12033-
.start = parser->previous.start,
12034-
.end = parser->previous.start
12035-
};
12025+
yp_node_t *node = NULL;
12026+
yp_token_t opening = parser->current;
12027+
parser_lex(parser);
1203612028

12037-
node = (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_NONE);
12038-
} else if (accept(parser, YP_TOKEN_LABEL_END)) {
12039-
// If we get here, then we have an end of a label immediately after a
12040-
// start. In that case we'll create an empty symbol node.
12041-
yp_token_t opening = not_provided(parser);
12042-
yp_token_t content = (yp_token_t) {
12043-
.type = YP_TOKEN_STRING_CONTENT,
12044-
.start = parser->previous.start,
12045-
.end = parser->previous.start
12046-
};
12029+
if (accept(parser, YP_TOKEN_STRING_END)) {
12030+
// If we get here, then we have an end immediately after a
12031+
// start. In that case we'll create an empty content token
12032+
// and return an uninterpolated string.
12033+
yp_token_t content = (yp_token_t) {
12034+
.type = YP_TOKEN_STRING_CONTENT,
12035+
.start = parser->previous.start,
12036+
.end = parser->previous.start
12037+
};
12038+
12039+
node = (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_NONE);
12040+
} else if (accept(parser, YP_TOKEN_LABEL_END)) {
12041+
// If we get here, then we have an end of a label
12042+
// immediately after a start. In that case we'll create an
12043+
// empty symbol node.
12044+
yp_token_t opening = not_provided(parser);
12045+
yp_token_t content = (yp_token_t) {
12046+
.type = YP_TOKEN_STRING_CONTENT,
12047+
.start = parser->previous.start,
12048+
.end = parser->previous.start
12049+
};
12050+
12051+
node = (yp_node_t *) yp_symbol_node_create(parser, &opening, &content, &parser->previous);
12052+
} else if (!lex_interpolation) {
12053+
// If we don't accept interpolation then we expect the
12054+
// string to start with a single string content node.
12055+
expect(parser, YP_TOKEN_STRING_CONTENT, "Expected string content after opening delimiter.");
12056+
yp_token_t content = parser->previous;
12057+
12058+
// It is unfortunately possible to have multiple string
12059+
// content nodes in a row in the case that there's heredoc
12060+
// content in the middle of the string, like this cursed
12061+
// example:
12062+
//
12063+
// <<-END+'b
12064+
// a
12065+
// END
12066+
// c'+'d'
12067+
//
12068+
// In that case we need to switch to an interpolated string
12069+
// to be able to contain all of the parts.
12070+
if (match_type_p(parser, YP_TOKEN_STRING_CONTENT)) {
12071+
yp_node_list_t parts = YP_EMPTY_NODE_LIST;
1204712072

12048-
return (yp_node_t *) yp_symbol_node_create(parser, &opening, &content, &parser->previous);
12049-
} else if (!lex_interpolation) {
12050-
// If we don't accept interpolation then we expect the string to start
12051-
// with a single string content node.
12052-
expect(parser, YP_TOKEN_STRING_CONTENT, "Expected string content after opening delimiter.");
12053-
yp_token_t content = parser->previous;
12073+
yp_token_t delimiters = not_provided(parser);
12074+
yp_node_t *part = (yp_node_t *) yp_string_node_create_and_unescape(parser, &delimiters, &content, &delimiters, YP_UNESCAPE_MINIMAL);
12075+
yp_node_list_append(&parts, part);
1205412076

12055-
// It is unfortunately possible to have multiple string content nodes in
12056-
// a row in the case that there's heredoc content in the middle of the
12057-
// string, like this cursed example:
12058-
//
12059-
// <<-END+'b
12060-
// a
12061-
// END
12062-
// c'+'d'
12063-
//
12064-
// In that case we need to switch to an interpolated string to be able
12065-
// to contain all of the parts.
12066-
if (match_type_p(parser, YP_TOKEN_STRING_CONTENT)) {
12067-
yp_node_list_t parts = YP_EMPTY_NODE_LIST;
12077+
while (accept(parser, YP_TOKEN_STRING_CONTENT)) {
12078+
part = (yp_node_t *) yp_string_node_create_and_unescape(parser, &delimiters, &parser->previous, &delimiters, YP_UNESCAPE_MINIMAL);
12079+
yp_node_list_append(&parts, part);
12080+
}
1206812081

12069-
yp_token_t delimiters = not_provided(parser);
12070-
yp_node_t *part = (yp_node_t *) yp_string_node_create_and_unescape(parser, &delimiters, &content, &delimiters, YP_UNESCAPE_MINIMAL);
12071-
yp_node_list_append(&parts, part);
12082+
expect(parser, YP_TOKEN_STRING_END, "Expected a closing delimiter for a string literal.");
12083+
node = (yp_node_t *) yp_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
12084+
} else if (accept(parser, YP_TOKEN_LABEL_END)) {
12085+
node = (yp_node_t *) yp_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_ALL);
12086+
} else {
12087+
expect(parser, YP_TOKEN_STRING_END, "Expected a closing delimiter for a string literal.");
12088+
node = (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_MINIMAL);
12089+
}
12090+
} else if (match_type_p(parser, YP_TOKEN_STRING_CONTENT)) {
12091+
// In this case we've hit string content so we know the string at
12092+
// least has something in it. We'll need to check if the following
12093+
// token is the end (in which case we can return a plain string) or if
12094+
// it's not then it has interpolation.
12095+
yp_token_t content = parser->current;
12096+
parser_lex(parser);
1207212097

12073-
while (accept(parser, YP_TOKEN_STRING_CONTENT)) {
12074-
part = (yp_node_t *) yp_string_node_create_and_unescape(parser, &delimiters, &parser->previous, &delimiters, YP_UNESCAPE_MINIMAL);
12098+
if (accept(parser, YP_TOKEN_STRING_END)) {
12099+
node = (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_ALL);
12100+
} else if (accept(parser, YP_TOKEN_LABEL_END)) {
12101+
node = (yp_node_t *) yp_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_ALL);
12102+
} else {
12103+
// If we get here, then we have interpolation so we'll need to create
12104+
// a string or symbol node with interpolation.
12105+
yp_node_list_t parts = YP_EMPTY_NODE_LIST;
12106+
yp_token_t string_opening = not_provided(parser);
12107+
yp_token_t string_closing = not_provided(parser);
12108+
yp_node_t *part = (yp_node_t *) yp_string_node_create_and_unescape(parser, &string_opening, &parser->previous, &string_closing, YP_UNESCAPE_ALL);
1207512109
yp_node_list_append(&parts, part);
12076-
}
12077-
12078-
expect(parser, YP_TOKEN_STRING_END, "Expected a closing delimiter for a string literal.");
12079-
return (yp_node_t *) yp_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
12080-
}
1208112110

12082-
if (accept(parser, YP_TOKEN_LABEL_END)) {
12083-
return (yp_node_t *) yp_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_ALL);
12084-
}
12085-
12086-
expect(parser, YP_TOKEN_STRING_END, "Expected a closing delimiter for a string literal.");
12087-
node = (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_MINIMAL);
12088-
} else if (match_type_p(parser, YP_TOKEN_STRING_CONTENT)) {
12089-
// In this case we've hit string content so we know the string at
12090-
// least has something in it. We'll need to check if the following
12091-
// token is the end (in which case we can return a plain string) or if
12092-
// it's not then it has interpolation.
12093-
yp_token_t content = parser->current;
12094-
parser_lex(parser);
12111+
while (!match_any_type_p(parser, 3, YP_TOKEN_STRING_END, YP_TOKEN_LABEL_END, YP_TOKEN_EOF)) {
12112+
yp_node_t *part = parse_string_part(parser);
12113+
if (part != NULL) yp_node_list_append(&parts, part);
12114+
}
1209512115

12096-
if (accept(parser, YP_TOKEN_STRING_END)) {
12097-
node = (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_ALL);
12098-
} else if (accept(parser, YP_TOKEN_LABEL_END)) {
12099-
return (yp_node_t *) yp_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, YP_UNESCAPE_ALL);
12116+
if (accept(parser, YP_TOKEN_LABEL_END)) {
12117+
node = (yp_node_t *) yp_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous);
12118+
} else {
12119+
expect(parser, YP_TOKEN_STRING_END, "Expected a closing delimiter for an interpolated string.");
12120+
node = (yp_node_t *) yp_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
12121+
}
12122+
}
1210012123
} else {
12101-
// If we get here, then we have interpolation so we'll need to create
12102-
// a string or symbol node with interpolation.
12124+
// If we get here, then the first part of the string is not plain string
12125+
// content, in which case we need to parse the string as an interpolated
12126+
// string.
1210312127
yp_node_list_t parts = YP_EMPTY_NODE_LIST;
12104-
yp_token_t string_opening = not_provided(parser);
12105-
yp_token_t string_closing = not_provided(parser);
12106-
yp_node_t *part = (yp_node_t *) yp_string_node_create_and_unescape(parser, &string_opening, &parser->previous, &string_closing, YP_UNESCAPE_ALL);
12107-
yp_node_list_append(&parts, part);
1210812128

1210912129
while (!match_any_type_p(parser, 3, YP_TOKEN_STRING_END, YP_TOKEN_LABEL_END, YP_TOKEN_EOF)) {
1211012130
yp_node_t *part = parse_string_part(parser);
1211112131
if (part != NULL) yp_node_list_append(&parts, part);
1211212132
}
1211312133

1211412134
if (accept(parser, YP_TOKEN_LABEL_END)) {
12115-
return (yp_node_t *) yp_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous);
12135+
node = (yp_node_t *) yp_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous);
12136+
} else {
12137+
expect(parser, YP_TOKEN_STRING_END, "Expected a closing delimiter for an interpolated string.");
12138+
node = (yp_node_t *) yp_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
1211612139
}
12117-
12118-
expect(parser, YP_TOKEN_STRING_END, "Expected a closing delimiter for an interpolated string.");
12119-
node = (yp_node_t *) yp_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
1212012140
}
12121-
} else {
12122-
// If we get here, then the first part of the string is not plain string
12123-
// content, in which case we need to parse the string as an interpolated
12124-
// string.
12125-
yp_node_list_t parts = YP_EMPTY_NODE_LIST;
1212612141

12127-
while (!match_any_type_p(parser, 3, YP_TOKEN_STRING_END, YP_TOKEN_LABEL_END, YP_TOKEN_EOF)) {
12128-
yp_node_t *part = parse_string_part(parser);
12129-
if (part != NULL) yp_node_list_append(&parts, part);
12130-
}
12142+
if (result == NULL) {
12143+
// If the node we just parsed is a symbol node, then we
12144+
// can't concatenate it with anything else, so we can now
12145+
// return that node.
12146+
if (YP_NODE_TYPE_P(node, YP_NODE_SYMBOL_NODE) || YP_NODE_TYPE_P(node, YP_NODE_INTERPOLATED_SYMBOL_NODE)) {
12147+
return node;
12148+
}
1213112149

12132-
if (accept(parser, YP_TOKEN_LABEL_END)) {
12133-
return (yp_node_t *) yp_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous);
12134-
}
12150+
// If we don't already have a node, then it's fine and we
12151+
// can just set the result to be the node we just parsed.
12152+
result = node;
12153+
} else {
12154+
// Otherwise we need to check the type of the node we just
12155+
// parsed. If it cannot be concatenated with the previous
12156+
// node, then we'll need to add a syntax error.
12157+
if (!YP_NODE_TYPE_P(node, YP_NODE_STRING_NODE) && !YP_NODE_TYPE_P(node, YP_NODE_INTERPOLATED_STRING_NODE)) {
12158+
yp_diagnostic_list_append(&parser->error_list, node->location.start, node->location.end, "Unexpected string concatenation.");
12159+
}
1213512160

12136-
expect(parser, YP_TOKEN_STRING_END, "Expected a closing delimiter for an interpolated string.");
12137-
node = (yp_node_t *) yp_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
12161+
// Either way we will create a concat node to hold the
12162+
// strings together.
12163+
result = (yp_node_t *) yp_string_concat_node_create(parser, result, node);
12164+
}
1213812165
}
1213912166

12140-
// If there's a string immediately following this string, then it's a
12141-
// concatenatation. In this case we'll parse the next string and create a
12142-
// node in the tree that concatenates the two strings.
12143-
if (parser->current.type == YP_TOKEN_STRING_BEGIN) {
12144-
return (yp_node_t *) yp_string_concat_node_create(
12145-
parser,
12146-
node,
12147-
parse_expression(parser, YP_BINDING_POWER_CALL, "Expected string on the right side of concatenation.")
12148-
);
12149-
} else {
12150-
return node;
12151-
}
12167+
return result;
1215212168
}
1215312169
case YP_TOKEN_SYMBOL_BEGIN: {
1215412170
yp_lex_mode_t lex_mode = *parser->lex_modes.current;

test/snapshots/unparser/corpus/semantic/dstr.txt

Lines changed: 5 additions & 5 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)