[ruby/prism] Fix up embdoc lexing on EOF

kddnewton · matzbot · commit 0424c1fa7b25 · 2024-04-12T16:50:34.000Z
ruby/prism@8ee43be26d
diff --git a/prism/prism.c b/prism/prism.c
@@ -9605,15 +9605,23 @@ lex_embdoc(pm_parser_t *parser) {
     pm_comment_t *comment = parser_comment(parser, PM_COMMENT_EMBDOC);
     if (comment == NULL) return PM_TOKEN_EOF;
 
-    // Now, loop until we find the end of the embedded documentation or the end of
-    // the file.
+    // Now, loop until we find the end of the embedded documentation or the end
+    // of the file.
     while (parser->current.end + 4 <= parser->end) {
         parser->current.start = parser->current.end;
 
-        // If we've hit the end of the embedded documentation then we'll return that
-        // token here.
-        if (memcmp(parser->current.end, "=end", 4) == 0 &&
-                (parser->current.end + 4 == parser->end || pm_char_is_whitespace(parser->current.end[4]))) {
+        // If we've hit the end of the embedded documentation then we'll return
+        // that token here.
+        if (
+            (memcmp(parser->current.end, "=end", 4) == 0) &&
+            (
+                (parser->current.end + 4 == parser->end) || // end of file
+                pm_char_is_whitespace(parser->current.end[4]) || // whitespace
+                (parser->current.end[4] == '\0') || // NUL or end of script
+                (parser->current.end[4] == '\004') || // ^D
+                (parser->current.end[4] == '\032') // ^Z
+            )
+        ) {
             const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end);
 
             if (newline == NULL) {
@@ -10425,9 +10433,13 @@ parser_lex(pm_parser_t *parser) {
 
                 // = => =~ == === =begin
                 case '=':
-                    if (current_token_starts_line(parser) && (parser->current.end + 5 <= parser->end) && memcmp(parser->current.end, "begin", 5) == 0 && pm_char_is_whitespace(peek_offset(parser, 5))) {
+                    if (
+                        current_token_starts_line(parser) &&
+                        (parser->current.end + 5 <= parser->end) &&
+                        memcmp(parser->current.end, "begin", 5) == 0 &&
+                        (pm_char_is_whitespace(peek_offset(parser, 5)) || (peek_offset(parser, 5) == '\0'))
+                    ) {
                         pm_token_type_t type = lex_embdoc(parser);
-
                         if (type == PM_TOKEN_EOF) {
                             LEX(type);
                         }
diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb
@@ -152,7 +152,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
     [PM_ERR_DEF_RECEIVER_TERM]                  = { "expected a `.` or `::` after the receiver in a method definition", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_DEF_TERM]                           = { "expected an `end` to close the `def` statement", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_DEFINED_EXPRESSION]                 = { "expected an expression after `defined?`", PM_ERROR_LEVEL_SYNTAX },
-    [PM_ERR_EMBDOC_TERM]                        = { "could not find a terminator for the embedded document", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_EMBDOC_TERM]                        = { "embedded document meets end of file", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_EMBEXPR_END]                        = { "expected a `}` to close the embedded expression", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_EMBVAR_INVALID]                     = { "invalid embedded variable", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_END_UPCASE_BRACE]                   = { "expected a `{` after `END`", PM_ERROR_LEVEL_SYNTAX },
diff --git a/test/prism/errors_test.rb b/test/prism/errors_test.rb
@@ -105,9 +105,14 @@ def test_pre_execution_context
     end
 
     def test_unterminated_embdoc
-      assert_errors expression("1"), "1\n=begin\n", [
-        ["could not find a terminator for the embedded document", 2..9]
-      ]
+      message = "embedded document meets end of file"
+      assert_error_messages "=begin", [message]
+      assert_error_messages "=begin\n", [message]
+
+      refute_error_messages "=begin\n=end"
+      refute_error_messages "=begin\n=end\0"
+      refute_error_messages "=begin\n=end\C-d"
+      refute_error_messages "=begin\n=end\C-z"
     end
 
     def test_unterminated_i_list
@@ -2217,7 +2222,7 @@ def assert_error_messages(source, errors)
 
     def refute_error_messages(source)
       assert_valid_syntax(source)
-      assert Prism.parse_success?(source)
+      assert Prism.parse_success?(source), "Expected #{source.inspect} to parse successfully"
     end
 
     def assert_warning_messages(source, warnings)