Fix up tilde heredoc line continuations

ruby · Mar 7, 2024 · 15e74b2 · 15e74b2
1 parent f372c6f
commit 15e74b2
Show file tree

Hide file tree

Showing 7 changed files with 72 additions and 21 deletions.
diff --git a/include/prism/parser.h b/include/prism/parser.h
@@ -234,6 +234,9 @@ typedef struct pm_lex_mode {
              * a tilde heredoc.
              */
             size_t common_whitespace;
+
+            /** True if the previous token ended with a line continuation. */
+            bool line_continuation;
         } heredoc;
     } as;
 

diff --git a/src/prism.c b/src/prism.c
@@ -9450,7 +9450,8 @@ parser_lex(pm_parser_t *parser) {
                                         .next_start = parser->current.end,
                                         .quote = quote,
                                         .indent = indent,
-                                        .common_whitespace = (size_t) -1
+                                        .common_whitespace = (size_t) -1,
+                                        .line_continuation = false
                                     }
                                 });
 
@@ -10719,6 +10720,9 @@ parser_lex(pm_parser_t *parser) {
             // current lex mode.
             pm_lex_mode_t *lex_mode = parser->lex_modes.current;
 
+            bool line_continuation = lex_mode->as.heredoc.line_continuation;
+            lex_mode->as.heredoc.line_continuation = false;
+
             // We'll check if we're at the end of the file. If we are, then we
             // will add an error (because we weren't able to find the
             // terminator) but still continue parsing so that content after the
@@ -10736,7 +10740,7 @@ parser_lex(pm_parser_t *parser) {
 
             // If we are immediately following a newline and we have hit the
             // terminator, then we need to return the ending of the heredoc.
-            if (current_token_starts_line(parser)) {
+            if (!line_continuation && current_token_starts_line(parser)) {
                 const uint8_t *start = parser->current.start;
                 if (start + ident_length <= parser->end) {
                     const uint8_t *newline = next_newline(start, parser->end - start);
@@ -10808,7 +10812,7 @@ parser_lex(pm_parser_t *parser) {
 
             const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
             pm_token_buffer_t token_buffer = { { 0 }, 0 };
-            bool was_escaped_newline = false;
+            bool was_line_continuation = false;
 
             while (breakpoint != NULL) {
                 switch (*breakpoint) {
@@ -10831,7 +10835,7 @@ parser_lex(pm_parser_t *parser) {
                         // some leading whitespace.
                         const uint8_t *start = breakpoint + 1;
 
-                        if (!was_escaped_newline && (start + ident_length <= parser->end)) {
+                        if (!was_line_continuation && (start + ident_length <= parser->end)) {
                             // We want to match the terminator starting from the end of the line in case
                             // there is whitespace in the ident such as <<-'   DOC' or <<~'   DOC'.
                             const uint8_t *newline = next_newline(start, parser->end - start);
@@ -10873,15 +10877,14 @@ parser_lex(pm_parser_t *parser) {
                         // heredoc here as string content. Then, the next time a
                         // token is lexed, it will match again and return the
                         // end of the heredoc.
-
                         if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
                             if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
                                 lex_mode->as.heredoc.common_whitespace = whitespace;
                             }
 
                             parser->current.end = breakpoint + 1;
 
-                            if (!was_escaped_newline) {
+                            if (!was_line_continuation) {
                                 pm_token_buffer_flush(parser, &token_buffer);
                                 LEX(PM_TOKEN_STRING_CONTENT);
                             }
@@ -10943,7 +10946,26 @@ parser_lex(pm_parser_t *parser) {
                                     }
                                 /* fallthrough */
                                 case '\n':
-                                    was_escaped_newline = true;
+                                    // If we are in a tilde here, we should
+                                    // break out of the loop and return the
+                                    // string content.
+                                    if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
+                                        const uint8_t *end = parser->current.end;
+                                        pm_newline_list_append(&parser->newline_list, end);
+
+                                        // Here we want the buffer to only
+                                        // include up to the backslash.
+                                        parser->current.end = breakpoint;
+                                        pm_token_buffer_flush(parser, &token_buffer);
+
+                                        // Now we can advance the end of the
+                                        // token past the newline.
+                                        parser->current.end = end + 1;
+                                        lex_mode->as.heredoc.line_continuation = true;
+                                        LEX(PM_TOKEN_STRING_CONTENT);
+                                    }
+
+                                    was_line_continuation = true;
                                     token_buffer.cursor = parser->current.end + 1;
                                     breakpoint = parser->current.end;
                                     continue;
@@ -10980,7 +11002,7 @@ parser_lex(pm_parser_t *parser) {
                         assert(false && "unreachable");
                 }
 
-                was_escaped_newline = false;
+                was_line_continuation = false;
             }
 
             if (parser->current.end > parser->current.start) {

diff --git a/test/prism/ruby_parser_test.rb b/test/prism/ruby_parser_test.rb
@@ -71,6 +71,7 @@ class RubyParserTest < TestCase
     # https://github.com/seattlerb/ruby_parser/issues/344
     failures = crlf | %w[
       alias.txt
+      heredocs_with_ignored_newlines.txt
       method_calls.txt
       methods.txt
       multi_write.txt
@@ -94,6 +95,7 @@ class RubyParserTest < TestCase
       whitequark/lvar_injecting_match.txt
       whitequark/not.txt
       whitequark/op_asgn_cmd.txt
+      whitequark/parser_bug_640.txt
       whitequark/parser_slash_slash_n_escaping_in_literals.txt
       whitequark/pattern_matching_single_line_allowed_omission_of_parentheses.txt
       whitequark/pattern_matching_single_line.txt

diff --git a/test/prism/snapshots/heredocs_with_ignored_newlines.txt b/test/prism/snapshots/heredocs_with_ignored_newlines.txt
diff --git a/test/prism/snapshots/whitequark/parser_bug_640.txt b/test/prism/snapshots/whitequark/parser_bug_640.txt
diff --git a/test/prism/snapshots/whitequark/slash_newline_in_heredocs.txt b/test/prism/snapshots/whitequark/slash_newline_in_heredocs.txt
diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb
@@ -230,6 +230,8 @@ def assert_unescape(context, escape)
       else
         assert_equal expected.bytes, actual.bytes, message
       end
+    rescue Exception
+      binding.irb
     end
   end
 end