@@ -9746,24 +9746,43 @@ parser_lex(pm_parser_t *parser) {
9746
9746
// terminator, then we need to return the ending of the heredoc.
9747
9747
if (current_token_starts_line(parser)) {
9748
9748
const uint8_t *start = parser->current.start;
9749
- size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
9750
-
9751
- if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
9752
- bool matched = true;
9749
+ if (start + ident_length <= parser->end) {
9753
9750
bool at_end = false;
9751
+ const uint8_t *newline = next_newline(start, parser->end - start);
9752
+ const uint8_t *ident_end = newline;
9753
+ const uint8_t *terminator_end = newline;
9754
9754
9755
- size_t eol_length = match_eol_at(parser, start + ident_length);
9756
- if (eol_length) {
9757
- parser->current.end = start + ident_length + eol_length;
9758
- pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
9759
- } else if (parser->end == (start + ident_length)) {
9760
- parser->current.end = start + ident_length;
9755
+ if (newline == NULL) {
9756
+ terminator_end = parser->end;
9757
+ ident_end = parser->end;
9761
9758
at_end = true;
9762
9759
} else {
9763
- matched = false;
9760
+ terminator_end++;
9761
+ if (newline[-1] == '\r') {
9762
+ ident_end--; // Remove \r
9763
+ }
9764
+ }
9765
+
9766
+ const uint8_t *terminator_start = ident_end - ident_length;
9767
+ const uint8_t *cursor = start;
9768
+
9769
+ if (
9770
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
9771
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE
9772
+ ) {
9773
+ while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
9774
+ cursor++;
9775
+ }
9764
9776
}
9765
9777
9766
- if (matched) {
9778
+ if (
9779
+ (cursor == terminator_start) &&
9780
+ (memcmp(terminator_start, ident_start, ident_length) == 0)
9781
+ ) {
9782
+ if (newline != NULL) {
9783
+ pm_newline_list_append(&parser->newline_list, newline);
9784
+ }
9785
+ parser->current.end = terminator_end;
9767
9786
if (*lex_mode->as.heredoc.next_start == '\\') {
9768
9787
parser->next_start = NULL;
9769
9788
} else {
@@ -9779,7 +9798,7 @@ parser_lex(pm_parser_t *parser) {
9779
9798
LEX(PM_TOKEN_HEREDOC_END);
9780
9799
}
9781
9800
}
9782
-
9801
+ size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
9783
9802
if (
9784
9803
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
9785
9804
(lex_mode->as.heredoc.common_whitespace > whitespace) &&
@@ -9823,30 +9842,50 @@ parser_lex(pm_parser_t *parser) {
9823
9842
// If we have a - or ~ heredoc, then we can match after
9824
9843
// some leading whitespace.
9825
9844
const uint8_t *start = breakpoint + 1;
9826
- size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
9827
9845
9828
- // If we have hit a newline that is followed by a valid
9829
- // terminator, then we need to return the content of the
9830
- // heredoc here as string content. Then, the next time a
9831
- // token is lexed, it will match again and return the
9832
- // end of the heredoc.
9833
- if (
9834
- !was_escaped_newline &&
9835
- (start + ident_length <= parser->end) &&
9836
- (memcmp(start, ident_start, ident_length) == 0)
9837
- ) {
9838
- // Heredoc terminators must be followed by a
9839
- // newline, CRLF, or EOF to be valid.
9846
+ if (!was_escaped_newline && (start + ident_length <= parser->end)) {
9847
+ // We want to match the terminator starting from the end of the line in case
9848
+ // there is whitespace in the ident such as <<-' DOC' or <<~' DOC'.
9849
+ const uint8_t *newline = next_newline(start, parser->end - start);
9850
+
9851
+ if (newline == NULL) {
9852
+ newline = parser->end;
9853
+ } else if (newline[-1] == '\r') {
9854
+ newline--; // Remove \r
9855
+ }
9856
+
9857
+ // Start of a possible terminator.
9858
+ const uint8_t *terminator_start = newline - ident_length;
9859
+
9860
+ // Cursor to check for the leading whitespace. We skip the
9861
+ // leading whitespace if we have a - or ~ heredoc.
9862
+ const uint8_t *cursor = start;
9863
+
9864
+ if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
9865
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
9866
+ while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
9867
+ cursor++;
9868
+ }
9869
+ }
9870
+
9840
9871
if (
9841
- start + ident_length == parser->end ||
9842
- match_eol_at(parser, start + ident_length)
9872
+ cursor == terminator_start &&
9873
+ (memcmp(terminator_start, ident_start, ident_length) == 0 )
9843
9874
) {
9844
9875
parser->current.end = breakpoint + 1;
9845
9876
pm_token_buffer_flush(parser, &token_buffer);
9846
9877
LEX(PM_TOKEN_STRING_CONTENT);
9847
9878
}
9848
9879
}
9849
9880
9881
+ size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
9882
+
9883
+ // If we have hit a newline that is followed by a valid
9884
+ // terminator, then we need to return the content of the
9885
+ // heredoc here as string content. Then, the next time a
9886
+ // token is lexed, it will match again and return the
9887
+ // end of the heredoc.
9888
+
9850
9889
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
9851
9890
if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
9852
9891
lex_mode->as.heredoc.common_whitespace = whitespace;
0 commit comments