Skip to content

Commit aa8c702

Browse files
committed
Fix parsing heredoc ends
1 parent 229fc7b commit aa8c702

File tree

5 files changed

+147
-28
lines changed

5 files changed

+147
-28
lines changed

src/prism.c

Lines changed: 67 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9746,24 +9746,43 @@ parser_lex(pm_parser_t *parser) {
97469746
// terminator, then we need to return the ending of the heredoc.
97479747
if (current_token_starts_line(parser)) {
97489748
const uint8_t *start = parser->current.start;
9749-
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
9750-
9751-
if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
9752-
bool matched = true;
9749+
if (start + ident_length <= parser->end) {
97539750
bool at_end = false;
9751+
const uint8_t *newline = next_newline(start, parser->end - start);
9752+
const uint8_t *ident_end = newline;
9753+
const uint8_t *terminator_end = newline;
97549754

9755-
size_t eol_length = match_eol_at(parser, start + ident_length);
9756-
if (eol_length) {
9757-
parser->current.end = start + ident_length + eol_length;
9758-
pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
9759-
} else if (parser->end == (start + ident_length)) {
9760-
parser->current.end = start + ident_length;
9755+
if (newline == NULL) {
9756+
terminator_end = parser->end;
9757+
ident_end = parser->end;
97619758
at_end = true;
97629759
} else {
9763-
matched = false;
9760+
terminator_end++;
9761+
if (newline[-1] == '\r') {
9762+
ident_end--; // Remove \r
9763+
}
9764+
}
9765+
9766+
const uint8_t *terminator_start = ident_end - ident_length;
9767+
const uint8_t *cursor = start;
9768+
9769+
if (
9770+
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
9771+
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE
9772+
) {
9773+
while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
9774+
cursor++;
9775+
}
97649776
}
97659777

9766-
if (matched) {
9778+
if (
9779+
(cursor == terminator_start) &&
9780+
(memcmp(terminator_start, ident_start, ident_length) == 0)
9781+
) {
9782+
if (newline != NULL) {
9783+
pm_newline_list_append(&parser->newline_list, newline);
9784+
}
9785+
parser->current.end = terminator_end;
97679786
if (*lex_mode->as.heredoc.next_start == '\\') {
97689787
parser->next_start = NULL;
97699788
} else {
@@ -9779,7 +9798,7 @@ parser_lex(pm_parser_t *parser) {
97799798
LEX(PM_TOKEN_HEREDOC_END);
97809799
}
97819800
}
9782-
9801+
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
97839802
if (
97849803
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
97859804
(lex_mode->as.heredoc.common_whitespace > whitespace) &&
@@ -9823,30 +9842,50 @@ parser_lex(pm_parser_t *parser) {
98239842
// If we have a - or ~ heredoc, then we can match after
98249843
// some leading whitespace.
98259844
const uint8_t *start = breakpoint + 1;
9826-
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
98279845

9828-
// If we have hit a newline that is followed by a valid
9829-
// terminator, then we need to return the content of the
9830-
// heredoc here as string content. Then, the next time a
9831-
// token is lexed, it will match again and return the
9832-
// end of the heredoc.
9833-
if (
9834-
!was_escaped_newline &&
9835-
(start + ident_length <= parser->end) &&
9836-
(memcmp(start, ident_start, ident_length) == 0)
9837-
) {
9838-
// Heredoc terminators must be followed by a
9839-
// newline, CRLF, or EOF to be valid.
9846+
if (!was_escaped_newline && (start + ident_length <= parser->end)) {
9847+
// We want to match the terminator starting from the end of the line in case
9848+
// there is whitespace in the ident such as <<-' DOC' or <<~' DOC'.
9849+
const uint8_t *newline = next_newline(start, parser->end - start);
9850+
9851+
if (newline == NULL) {
9852+
newline = parser->end;
9853+
} else if (newline[-1] == '\r') {
9854+
newline--; // Remove \r
9855+
}
9856+
9857+
// Start of a possible terminator.
9858+
const uint8_t *terminator_start = newline - ident_length;
9859+
9860+
// Cursor to check for the leading whitespace. We skip the
9861+
// leading whitespace if we have a - or ~ heredoc.
9862+
const uint8_t *cursor = start;
9863+
9864+
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
9865+
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
9866+
while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
9867+
cursor++;
9868+
}
9869+
}
9870+
98409871
if (
9841-
start + ident_length == parser->end ||
9842-
match_eol_at(parser, start + ident_length)
9872+
cursor == terminator_start &&
9873+
(memcmp(terminator_start, ident_start, ident_length) == 0)
98439874
) {
98449875
parser->current.end = breakpoint + 1;
98459876
pm_token_buffer_flush(parser, &token_buffer);
98469877
LEX(PM_TOKEN_STRING_CONTENT);
98479878
}
98489879
}
98499880

9881+
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
9882+
9883+
// If we have hit a newline that is followed by a valid
9884+
// terminator, then we need to return the content of the
9885+
// heredoc here as string content. Then, the next time a
9886+
// token is lexed, it will match again and return the
9887+
// end of the heredoc.
9888+
98509889
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
98519890
if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
98529891
lex_mode->as.heredoc.common_whitespace = whitespace;
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<<-' FOO'
2+
a
3+
b
4+
FOO
5+
6+
<<-' FOO'
7+
a
8+
b
9+
FOO
10+
11+
<<~' FOO'
12+
a
13+
b
14+
FOO
15+
16+
<<~' FOO'
17+
a
18+
b
19+
FOO

test/prism/locals_test.rb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,13 @@ class LocalsTest < TestCase
6868
# HERE
6969
todos << "seattlerb/heredoc_nested.txt"
7070

71+
# Ruby < 3.3.0 fails to parse:
72+
#
73+
# <<-' HERE'
74+
# foo
75+
# HERE
76+
invalid << "heredocs_leading_whitespace.txt" if RUBY_VERSION < "3.3.0"
77+
7178
base = File.join(__dir__, "fixtures")
7279
skips = invalid | todos
7380

test/prism/parse_test.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,11 @@ def test_parse_lex_file
111111
# Additionally, Ripper cannot parse the %w[] fixture in this file, so set ripper_should_parse to false.
112112
ripper_should_parse = false if relative == "spanning_heredoc.txt"
113113

114+
# Ruby < 3.3.0 cannot parse heredocs where there are leading whitespace characters in the heredoc start.
115+
# Example: <<~' EOF' or <<-' EOF'
116+
# https://bugs.ruby-lang.org/issues/19539
117+
ripper_should_parse = false if relative == "heredocs_leading_whitespace.txt" && RUBY_VERSION < "3.3.0"
118+
114119
define_method "test_filepath_#{relative}" do
115120
# First, read the source from the filepath. Use binmode to avoid converting CRLF on Windows,
116121
# and explicitly set the external encoding to UTF-8 to override the binmode default.

test/prism/snapshots/heredocs_leading_whitespace.txt

Lines changed: 49 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)