Skip to content

Commit

Permalink
[ruby/prism] Fix up tilde heredoc line continuations
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton authored and matzbot committed Mar 7, 2024
1 parent 18ee7c9 commit 76e1159
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 21 deletions.
3 changes: 3 additions & 0 deletions prism/parser.h
Expand Up @@ -234,6 +234,9 @@ typedef struct pm_lex_mode {
* a tilde heredoc.
*/
size_t common_whitespace;

/** True if the previous token ended with a line continuation. */
bool line_continuation;
} heredoc;
} as;

Expand Down
38 changes: 30 additions & 8 deletions prism/prism.c
Expand Up @@ -9450,7 +9450,8 @@ parser_lex(pm_parser_t *parser) {
.next_start = parser->current.end,
.quote = quote,
.indent = indent,
.common_whitespace = (size_t) -1
.common_whitespace = (size_t) -1,
.line_continuation = false
}
});

Expand Down Expand Up @@ -10719,6 +10720,9 @@ parser_lex(pm_parser_t *parser) {
// current lex mode.
pm_lex_mode_t *lex_mode = parser->lex_modes.current;

bool line_continuation = lex_mode->as.heredoc.line_continuation;
lex_mode->as.heredoc.line_continuation = false;

// We'll check if we're at the end of the file. If we are, then we
// will add an error (because we weren't able to find the
// terminator) but still continue parsing so that content after the
Expand All @@ -10736,7 +10740,7 @@ parser_lex(pm_parser_t *parser) {

// If we are immediately following a newline and we have hit the
// terminator, then we need to return the ending of the heredoc.
if (current_token_starts_line(parser)) {
if (!line_continuation && current_token_starts_line(parser)) {
const uint8_t *start = parser->current.start;
if (start + ident_length <= parser->end) {
const uint8_t *newline = next_newline(start, parser->end - start);
Expand Down Expand Up @@ -10808,7 +10812,7 @@ parser_lex(pm_parser_t *parser) {

const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
pm_token_buffer_t token_buffer = { { 0 }, 0 };
bool was_escaped_newline = false;
bool was_line_continuation = false;

while (breakpoint != NULL) {
switch (*breakpoint) {
Expand All @@ -10831,7 +10835,7 @@ parser_lex(pm_parser_t *parser) {
// some leading whitespace.
const uint8_t *start = breakpoint + 1;

if (!was_escaped_newline && (start + ident_length <= parser->end)) {
if (!was_line_continuation && (start + ident_length <= parser->end)) {
// We want to match the terminator starting from the end of the line in case
// there is whitespace in the ident such as <<-' DOC' or <<~' DOC'.
const uint8_t *newline = next_newline(start, parser->end - start);
Expand Down Expand Up @@ -10873,15 +10877,14 @@ parser_lex(pm_parser_t *parser) {
// heredoc here as string content. Then, the next time a
// token is lexed, it will match again and return the
// end of the heredoc.

if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
lex_mode->as.heredoc.common_whitespace = whitespace;
}

parser->current.end = breakpoint + 1;

if (!was_escaped_newline) {
if (!was_line_continuation) {
pm_token_buffer_flush(parser, &token_buffer);
LEX(PM_TOKEN_STRING_CONTENT);
}
Expand Down Expand Up @@ -10943,7 +10946,26 @@ parser_lex(pm_parser_t *parser) {
}
/* fallthrough */
case '\n':
was_escaped_newline = true;
// If we are in a tilde heredoc, we should
// break out of the loop and return the
// string content.
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
const uint8_t *end = parser->current.end;
pm_newline_list_append(&parser->newline_list, end);

// Here we want the buffer to only
// include up to the backslash.
parser->current.end = breakpoint;
pm_token_buffer_flush(parser, &token_buffer);

// Now we can advance the end of the
// token past the newline.
parser->current.end = end + 1;
lex_mode->as.heredoc.line_continuation = true;
LEX(PM_TOKEN_STRING_CONTENT);
}

was_line_continuation = true;
token_buffer.cursor = parser->current.end + 1;
breakpoint = parser->current.end;
continue;
Expand Down Expand Up @@ -10980,7 +11002,7 @@ parser_lex(pm_parser_t *parser) {
assert(false && "unreachable");
}

was_escaped_newline = false;
was_line_continuation = false;
}

if (parser->current.end > parser->current.start) {
Expand Down
2 changes: 2 additions & 0 deletions test/prism/ruby_parser_test.rb
Expand Up @@ -71,6 +71,7 @@ class RubyParserTest < TestCase
# https://github.com/seattlerb/ruby_parser/issues/344
failures = crlf | %w[
alias.txt
heredocs_with_ignored_newlines.txt
method_calls.txt
methods.txt
multi_write.txt
Expand All @@ -94,6 +95,7 @@ class RubyParserTest < TestCase
whitequark/lvar_injecting_match.txt
whitequark/not.txt
whitequark/op_asgn_cmd.txt
whitequark/parser_bug_640.txt
whitequark/parser_slash_slash_n_escaping_in_literals.txt
whitequark/pattern_matching_single_line_allowed_omission_of_parentheses.txt
whitequark/pattern_matching_single_line.txt
Expand Down
14 changes: 10 additions & 4 deletions test/prism/snapshots/heredocs_with_ignored_newlines.txt
Expand Up @@ -11,7 +11,7 @@
│ └── unescaped: ""
└── @ InterpolatedStringNode (location: (4,0)-(4,8))
├── opening_loc: (4,0)-(4,8) = "<<~THERE"
├── parts: (length: 8)
├── parts: (length: 9)
│ ├── @ StringNode (location: (5,0)-(6,0))
│ │ ├── flags: ∅
│ │ ├── opening_loc: ∅
Expand Down Expand Up @@ -42,12 +42,18 @@
│ │ ├── content_loc: (9,0)-(10,0) = "\n"
│ │ ├── closing_loc: ∅
│ │ └── unescaped: "\n"
│ ├── @ StringNode (location: (10,0)-(12,0))
│ ├── @ StringNode (location: (10,0)-(11,0))
│ │ ├── flags: ∅
│ │ ├── opening_loc: ∅
│ │ ├── content_loc: (10,0)-(12,0) = " <<~BUT\\\n but\n"
│ │ ├── content_loc: (10,0)-(11,0) = " <<~BUT\\\n"
│ │ ├── closing_loc: ∅
│ │ └── unescaped: "<<~BUT but\n"
│ │ └── unescaped: "<<~BUT"
│ ├── @ StringNode (location: (11,0)-(12,0))
│ │ ├── flags: ∅
│ │ ├── opening_loc: ∅
│ │ ├── content_loc: (11,0)-(12,0) = " but\n"
│ │ ├── closing_loc: ∅
│ │ └── unescaped: " but\n"
│ ├── @ StringNode (location: (12,0)-(13,0))
│ │ ├── flags: ∅
│ │ ├── opening_loc: ∅
Expand Down
20 changes: 15 additions & 5 deletions test/prism/snapshots/whitequark/parser_bug_640.txt
Expand Up @@ -3,9 +3,19 @@
└── statements:
@ StatementsNode (location: (1,0)-(1,6))
└── body: (length: 1)
└── @ StringNode (location: (1,0)-(1,6))
├── flags: ∅
└── @ InterpolatedStringNode (location: (1,0)-(1,6))
├── opening_loc: (1,0)-(1,6) = "<<~FOO"
├── content_loc: (2,0)-(4,0) = " baz\\\n qux\n"
├── closing_loc: (4,0)-(5,0) = "FOO\n"
└── unescaped: "baz qux\n"
├── parts: (length: 2)
│ ├── @ StringNode (location: (2,0)-(3,0))
│ │ ├── flags: ∅
│ │ ├── opening_loc: ∅
│ │ ├── content_loc: (2,0)-(3,0) = " baz\\\n"
│ │ ├── closing_loc: ∅
│ │ └── unescaped: "baz"
│ └── @ StringNode (location: (3,0)-(4,0))
│ ├── flags: ∅
│ ├── opening_loc: ∅
│ ├── content_loc: (3,0)-(4,0) = " qux\n"
│ ├── closing_loc: ∅
│ └── unescaped: "qux\n"
└── closing_loc: (4,0)-(5,0) = "FOO\n"
14 changes: 10 additions & 4 deletions test/prism/snapshots/whitequark/slash_newline_in_heredocs.txt
Expand Up @@ -11,13 +11,19 @@
│ └── unescaped: " 1 2\n 3\n"
└── @ InterpolatedStringNode (location: (8,0)-(8,4))
├── opening_loc: (8,0)-(8,4) = "<<~E"
├── parts: (length: 2)
│ ├── @ StringNode (location: (9,0)-(11,0))
├── parts: (length: 3)
│ ├── @ StringNode (location: (9,0)-(10,0))
│ │ ├── flags: ∅
│ │ ├── opening_loc: ∅
│ │ ├── content_loc: (9,0)-(11,0) = " 1 \\\n 2\n"
│ │ ├── content_loc: (9,0)-(10,0) = " 1 \\\n"
│ │ ├── closing_loc: ∅
│ │ └── unescaped: "1 2\n"
│ │ └── unescaped: "1 "
│ ├── @ StringNode (location: (10,0)-(11,0))
│ │ ├── flags: ∅
│ │ ├── opening_loc: ∅
│ │ ├── content_loc: (10,0)-(11,0) = " 2\n"
│ │ ├── closing_loc: ∅
│ │ └── unescaped: "2\n"
│ └── @ StringNode (location: (11,0)-(12,0))
│ ├── flags: ∅
│ ├── opening_loc: ∅
Expand Down
2 changes: 2 additions & 0 deletions test/prism/unescape_test.rb
Expand Up @@ -230,6 +230,8 @@ def assert_unescape(context, escape)
else
assert_equal expected.bytes, actual.bytes, message
end
rescue Exception
binding.irb
end
end
end

0 comments on commit 76e1159

Please sign in to comment.