From 76e11595e28e258f4a4187a6d3eaccc9ca752e10 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Thu, 7 Mar 2024 14:45:32 -0500 Subject: [PATCH] [ruby/prism] Fix up tilde heredoc line continuations https://github.com/ruby/prism/commit/15e74b2f65 --- prism/parser.h | 3 ++ prism/prism.c | 38 +++++++++++++++---- test/prism/ruby_parser_test.rb | 2 + .../heredocs_with_ignored_newlines.txt | 14 +++++-- .../snapshots/whitequark/parser_bug_640.txt | 20 +++++++--- .../whitequark/slash_newline_in_heredocs.txt | 14 +++++-- test/prism/unescape_test.rb | 2 + 7 files changed, 72 insertions(+), 21 deletions(-) diff --git a/prism/parser.h b/prism/parser.h index 80521e4ad943af..02f60192d559c6 100644 --- a/prism/parser.h +++ b/prism/parser.h @@ -234,6 +234,9 @@ typedef struct pm_lex_mode { * a tilde heredoc. */ size_t common_whitespace; + + /** True if the previous token ended with a line continuation. */ + bool line_continuation; } heredoc; } as; diff --git a/prism/prism.c b/prism/prism.c index 6717488882edec..d7ee5ac7db7bb4 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -9450,7 +9450,8 @@ parser_lex(pm_parser_t *parser) { .next_start = parser->current.end, .quote = quote, .indent = indent, - .common_whitespace = (size_t) -1 + .common_whitespace = (size_t) -1, + .line_continuation = false } }); @@ -10719,6 +10720,9 @@ parser_lex(pm_parser_t *parser) { // current lex mode. pm_lex_mode_t *lex_mode = parser->lex_modes.current; + bool line_continuation = lex_mode->as.heredoc.line_continuation; + lex_mode->as.heredoc.line_continuation = false; + // We'll check if we're at the end of the file. If we are, then we // will add an error (because we weren't able to find the // terminator) but still continue parsing so that content after the @@ -10736,7 +10740,7 @@ parser_lex(pm_parser_t *parser) { // If we are immediately following a newline and we have hit the // terminator, then we need to return the ending of the heredoc. - if (current_token_starts_line(parser)) { + if (!line_continuation && current_token_starts_line(parser)) { const uint8_t *start = parser->current.start; if (start + ident_length <= parser->end) { const uint8_t *newline = next_newline(start, parser->end - start); @@ -10808,7 +10812,7 @@ parser_lex(pm_parser_t *parser) { const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); pm_token_buffer_t token_buffer = { { 0 }, 0 }; - bool was_escaped_newline = false; + bool was_line_continuation = false; while (breakpoint != NULL) { switch (*breakpoint) { @@ -10831,7 +10835,7 @@ parser_lex(pm_parser_t *parser) { // some leading whitespace. const uint8_t *start = breakpoint + 1; - if (!was_escaped_newline && (start + ident_length <= parser->end)) { + if (!was_line_continuation && (start + ident_length <= parser->end)) { // We want to match the terminator starting from the end of the line in case // there is whitespace in the ident such as <<-' DOC' or <<~' DOC'. const uint8_t *newline = next_newline(start, parser->end - start); @@ -10873,7 +10877,6 @@ parser_lex(pm_parser_t *parser) { // heredoc here as string content. Then, the next time a // token is lexed, it will match again and return the // end of the heredoc. - if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) { if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') { lex_mode->as.heredoc.common_whitespace = whitespace; @@ -10881,7 +10884,7 @@ parser_lex(pm_parser_t *parser) { parser->current.end = breakpoint + 1; - if (!was_escaped_newline) { + if (!was_line_continuation) { pm_token_buffer_flush(parser, &token_buffer); LEX(PM_TOKEN_STRING_CONTENT); } @@ -10943,7 +10946,26 @@ parser_lex(pm_parser_t *parser) { } /* fallthrough */ case '\n': - was_escaped_newline = true; + // If we are in a tilde here, we should + // break out of the loop and return the + // string content. + if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) { + const uint8_t *end = parser->current.end; + pm_newline_list_append(&parser->newline_list, end); + + // Here we want the buffer to only + // include up to the backslash. + parser->current.end = breakpoint; + pm_token_buffer_flush(parser, &token_buffer); + + // Now we can advance the end of the + // token past the newline. + parser->current.end = end + 1; + lex_mode->as.heredoc.line_continuation = true; + LEX(PM_TOKEN_STRING_CONTENT); + } + + was_line_continuation = true; token_buffer.cursor = parser->current.end + 1; breakpoint = parser->current.end; continue; @@ -10980,7 +11002,7 @@ parser_lex(pm_parser_t *parser) { assert(false && "unreachable"); } - was_escaped_newline = false; + was_line_continuation = false; } if (parser->current.end > parser->current.start) { diff --git a/test/prism/ruby_parser_test.rb b/test/prism/ruby_parser_test.rb index 89150b2faac33e..1d22f0e7b8729f 100644 --- a/test/prism/ruby_parser_test.rb +++ b/test/prism/ruby_parser_test.rb @@ -71,6 +71,7 @@ class RubyParserTest < TestCase # https://github.com/seattlerb/ruby_parser/issues/344 failures = crlf | %w[ alias.txt + heredocs_with_ignored_newlines.txt method_calls.txt methods.txt multi_write.txt @@ -94,6 +95,7 @@ class RubyParserTest < TestCase whitequark/lvar_injecting_match.txt whitequark/not.txt whitequark/op_asgn_cmd.txt + whitequark/parser_bug_640.txt whitequark/parser_slash_slash_n_escaping_in_literals.txt whitequark/pattern_matching_single_line_allowed_omission_of_parentheses.txt whitequark/pattern_matching_single_line.txt diff --git a/test/prism/snapshots/heredocs_with_ignored_newlines.txt b/test/prism/snapshots/heredocs_with_ignored_newlines.txt index 00111b1ca54625..cdc0b4faab9279 100644 --- a/test/prism/snapshots/heredocs_with_ignored_newlines.txt +++ b/test/prism/snapshots/heredocs_with_ignored_newlines.txt @@ -11,7 +11,7 @@ │ └── unescaped: "" └── @ InterpolatedStringNode (location: (4,0)-(4,8)) ├── opening_loc: (4,0)-(4,8) = "<<~THERE" - ├── parts: (length: 8) + ├── parts: (length: 9) │ ├── @ StringNode (location: (5,0)-(6,0)) │ │ ├── flags: ∅ │ │ ├── opening_loc: ∅ @@ -42,12 +42,18 @@ │ │ ├── content_loc: (9,0)-(10,0) = "\n" │ │ ├── closing_loc: ∅ │ │ └── unescaped: "\n" - │ ├── @ StringNode (location: (10,0)-(12,0)) + │ ├── @ StringNode (location: (10,0)-(11,0)) │ │ ├── flags: ∅ │ │ ├── opening_loc: ∅ - │ │ ├── content_loc: (10,0)-(12,0) = " <<~BUT\\\n but\n" + │ │ ├── content_loc: (10,0)-(11,0) = " <<~BUT\\\n" │ │ ├── closing_loc: ∅ - │ │ └── unescaped: "<<~BUT but\n" + │ │ └── unescaped: "<<~BUT" + │ ├── @ StringNode (location: (11,0)-(12,0)) + │ │ ├── flags: ∅ + │ │ ├── opening_loc: ∅ + │ │ ├── content_loc: (11,0)-(12,0) = " but\n" + │ │ ├── closing_loc: ∅ + │ │ └── unescaped: " but\n" │ ├── @ StringNode (location: (12,0)-(13,0)) │ │ ├── flags: ∅ │ │ ├── opening_loc: ∅ diff --git a/test/prism/snapshots/whitequark/parser_bug_640.txt b/test/prism/snapshots/whitequark/parser_bug_640.txt index 0320011e2e295d..a9d3f957e83910 100644 --- a/test/prism/snapshots/whitequark/parser_bug_640.txt +++ b/test/prism/snapshots/whitequark/parser_bug_640.txt @@ -3,9 +3,19 @@ └── statements: @ StatementsNode (location: (1,0)-(1,6)) └── body: (length: 1) - └── @ StringNode (location: (1,0)-(1,6)) - ├── flags: ∅ + └── @ InterpolatedStringNode (location: (1,0)-(1,6)) ├── opening_loc: (1,0)-(1,6) = "<<~FOO" - ├── content_loc: (2,0)-(4,0) = " baz\\\n qux\n" - ├── closing_loc: (4,0)-(5,0) = "FOO\n" - └── unescaped: "baz qux\n" + ├── parts: (length: 2) + │ ├── @ StringNode (location: (2,0)-(3,0)) + │ │ ├── flags: ∅ + │ │ ├── opening_loc: ∅ + │ │ ├── content_loc: (2,0)-(3,0) = " baz\\\n" + │ │ ├── closing_loc: ∅ + │ │ └── unescaped: "baz" + │ └── @ StringNode (location: (3,0)-(4,0)) + │ ├── flags: ∅ + │ ├── opening_loc: ∅ + │ ├── content_loc: (3,0)-(4,0) = " qux\n" + │ ├── closing_loc: ∅ + │ └── unescaped: "qux\n" + └── closing_loc: (4,0)-(5,0) = "FOO\n" diff --git a/test/prism/snapshots/whitequark/slash_newline_in_heredocs.txt b/test/prism/snapshots/whitequark/slash_newline_in_heredocs.txt index 58a134dd62b2ac..8d6fce2ba9676b 100644 --- a/test/prism/snapshots/whitequark/slash_newline_in_heredocs.txt +++ b/test/prism/snapshots/whitequark/slash_newline_in_heredocs.txt @@ -11,13 +11,19 @@ │ └── unescaped: " 1 2\n 3\n" └── @ InterpolatedStringNode (location: (8,0)-(8,4)) ├── opening_loc: (8,0)-(8,4) = "<<~E" - ├── parts: (length: 2) - │ ├── @ StringNode (location: (9,0)-(11,0)) + ├── parts: (length: 3) + │ ├── @ StringNode (location: (9,0)-(10,0)) │ │ ├── flags: ∅ │ │ ├── opening_loc: ∅ - │ │ ├── content_loc: (9,0)-(11,0) = " 1 \\\n 2\n" + │ │ ├── content_loc: (9,0)-(10,0) = " 1 \\\n" │ │ ├── closing_loc: ∅ - │ │ └── unescaped: "1 2\n" + │ │ └── unescaped: "1 " + │ ├── @ StringNode (location: (10,0)-(11,0)) + │ │ ├── flags: ∅ + │ │ ├── opening_loc: ∅ + │ │ ├── content_loc: (10,0)-(11,0) = " 2\n" + │ │ ├── closing_loc: ∅ + │ │ └── unescaped: "2\n" │ └── @ StringNode (location: (11,0)-(12,0)) │ ├── flags: ∅ │ ├── opening_loc: ∅ diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb index 2a352c52347e84..72ad780d8bbd4a 100644 --- a/test/prism/unescape_test.rb +++ b/test/prism/unescape_test.rb @@ -230,6 +230,8 @@ def assert_unescape(context, escape) else assert_equal expected.bytes, actual.bytes, message end + rescue Exception + binding.irb end end end