@@ -7264,7 +7264,8 @@ parser_lex(pm_parser_t *parser) {
7264
7264
.ident_length = ident_length,
7265
7265
.next_start = parser->current.end,
7266
7266
.quote = quote,
7267
- .indent = indent
7267
+ .indent = indent,
7268
+ .common_whitespace = (size_t) -1
7268
7269
}
7269
7270
});
7270
7271
@@ -8434,8 +8435,30 @@ parser_lex(pm_parser_t *parser) {
8434
8435
// terminator, then we need to return the ending of the heredoc.
8435
8436
if (current_token_starts_line(parser)) {
8436
8437
const uint8_t *start = parser->current.start;
8437
- if (lex_mode->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
8438
- start += pm_strspn_inline_whitespace(start, parser->end - start);
8438
+ size_t whitespace = 0;
8439
+
8440
+ switch (lex_mode->as.heredoc.indent) {
8441
+ case PM_HEREDOC_INDENT_NONE:
8442
+ // Do nothing, we can't match a terminator with
8443
+ // indentation and there's no need to calculate common
8444
+ // whitespace.
8445
+ break;
8446
+ case PM_HEREDOC_INDENT_DASH:
8447
+ // Skip past inline whitespace.
8448
+ start += pm_strspn_inline_whitespace(start, parser->end - start);
8449
+ break;
8450
+ case PM_HEREDOC_INDENT_TILDE:
8451
+ // Skip past inline whitespace and calculate common
8452
+ // whitespace.
8453
+ while (start < parser->end && pm_char_is_inline_whitespace(*start)) {
8454
+ if (*start == '\t') {
8455
+ whitespace = (whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
8456
+ } else {
8457
+ whitespace++;
8458
+ }
8459
+ start++;
8460
+ }
8461
+ break;
8439
8462
}
8440
8463
8441
8464
if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
@@ -8468,6 +8491,14 @@ parser_lex(pm_parser_t *parser) {
8468
8491
LEX(PM_TOKEN_HEREDOC_END);
8469
8492
}
8470
8493
}
8494
+
8495
+ if (
8496
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
8497
+ (lex_mode->as.heredoc.common_whitespace > whitespace) &&
8498
+ peek_at(parser, start) != '\n'
8499
+ ) {
8500
+ lex_mode->as.heredoc.common_whitespace = whitespace;
8501
+ }
8471
8502
}
8472
8503
8473
8504
// Otherwise we'll be parsing string content. These are the places
@@ -8500,21 +8531,47 @@ parser_lex(pm_parser_t *parser) {
8500
8531
8501
8532
pm_newline_list_append(&parser->newline_list, breakpoint);
8502
8533
8534
+ // If we have a - or ~ heredoc, then we can match after
8535
+ // some leading whitespace.
8503
8536
const uint8_t *start = breakpoint + 1;
8504
- if (lex_mode->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
8505
- start += pm_strspn_inline_whitespace(start, parser->end - start);
8537
+ size_t whitespace = 0;
8538
+
8539
+ switch (lex_mode->as.heredoc.indent) {
8540
+ case PM_HEREDOC_INDENT_NONE:
8541
+ // Do nothing, we can't match a terminator with
8542
+ // indentation and there's no need to calculate
8543
+ // common whitespace.
8544
+ break;
8545
+ case PM_HEREDOC_INDENT_DASH:
8546
+ // Skip past inline whitespace.
8547
+ start += pm_strspn_inline_whitespace(start, parser->end - start);
8548
+ break;
8549
+ case PM_HEREDOC_INDENT_TILDE:
8550
+ // Skip past inline whitespace and calculate common
8551
+ // whitespace.
8552
+ while (start < parser->end && pm_char_is_inline_whitespace(*start)) {
8553
+ if (*start == '\t') {
8554
+ whitespace = (whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
8555
+ } else {
8556
+ whitespace++;
8557
+ }
8558
+ start++;
8559
+ }
8560
+ break;
8506
8561
}
8507
8562
8508
- // If we have hit a newline that is followed by a valid terminator,
8509
- // then we need to return the content of the heredoc here as string
8510
- // content. Then, the next time a token is lexed, it will match
8511
- // again and return the end of the heredoc.
8563
+ // If we have hit a newline that is followed by a valid
8564
+ // terminator, then we need to return the content of the
8565
+ // heredoc here as string content. Then, the next time a
8566
+ // token is lexed, it will match again and return the
8567
+ // end of the heredoc.
8512
8568
if (
8513
8569
!was_escaped_newline &&
8514
8570
(start + ident_length <= parser->end) &&
8515
8571
(memcmp(start, ident_start, ident_length) == 0)
8516
8572
) {
8517
- // Heredoc terminators must be followed by a newline, CRLF, or EOF to be valid.
8573
+ // Heredoc terminators must be followed by a
8574
+ // newline, CRLF, or EOF to be valid.
8518
8575
if (
8519
8576
start + ident_length == parser->end ||
8520
8577
match_eol_at(parser, start + ident_length)
@@ -8525,8 +8582,16 @@ parser_lex(pm_parser_t *parser) {
8525
8582
}
8526
8583
}
8527
8584
8528
- // Otherwise we hit a newline and it wasn't followed by a
8529
- // terminator, so we can continue parsing.
8585
+ if (
8586
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
8587
+ (lex_mode->as.heredoc.common_whitespace > whitespace) &&
8588
+ peek_at(parser, start) != '\n'
8589
+ ) {
8590
+ lex_mode->as.heredoc.common_whitespace = whitespace;
8591
+ }
8592
+
8593
+ // Otherwise we hit a newline and it wasn't followed by
8594
+ // a terminator, so we can continue parsing.
8530
8595
breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8531
8596
break;
8532
8597
}
@@ -11082,75 +11147,8 @@ parse_method_definition_name(pm_parser_t *parser) {
11082
11147
}
11083
11148
}
11084
11149
11085
- static int
11086
- parse_heredoc_common_whitespace_for_single_node(pm_parser_t *parser, pm_node_t *node, int common_whitespace)
11087
- {
11088
- const pm_location_t *content_loc = &((pm_string_node_t *) node)->content_loc;
11089
- int cur_whitespace;
11090
- const uint8_t *cur_char = content_loc->start;
11091
-
11092
- while (cur_char && cur_char < content_loc->end) {
11093
- // Any empty newlines aren't included in the minimum whitespace
11094
- // calculation.
11095
- size_t eol_length;
11096
- while ((eol_length = match_eol_at(parser, cur_char))) {
11097
- cur_char += eol_length;
11098
- }
11099
-
11100
- if (cur_char == content_loc->end) break;
11101
-
11102
- cur_whitespace = 0;
11103
-
11104
- while (pm_char_is_inline_whitespace(*cur_char) && cur_char < content_loc->end) {
11105
- if (cur_char[0] == '\t') {
11106
- cur_whitespace = (cur_whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
11107
- } else {
11108
- cur_whitespace++;
11109
- }
11110
- cur_char++;
11111
- }
11112
-
11113
- // If we hit a newline, then we have encountered a line that
11114
- // contains only whitespace, and it shouldn't be considered in
11115
- // the calculation of common leading whitespace.
11116
- eol_length = match_eol_at(parser, cur_char);
11117
- if (eol_length) {
11118
- cur_char += eol_length;
11119
- continue;
11120
- }
11121
-
11122
- if (cur_whitespace < common_whitespace || common_whitespace == -1) {
11123
- common_whitespace = cur_whitespace;
11124
- }
11125
-
11126
- cur_char = next_newline(cur_char + 1, parser->end - (cur_char + 1));
11127
- if (cur_char) cur_char++;
11128
- }
11129
- return common_whitespace;
11130
- }
11131
-
11132
- // Calculate the common leading whitespace for each line in a heredoc.
11133
- static int
11134
- parse_heredoc_common_whitespace(pm_parser_t *parser, pm_node_list_t *nodes) {
11135
- int common_whitespace = -1;
11136
-
11137
- for (size_t index = 0; index < nodes->size; index++) {
11138
- pm_node_t *node = nodes->nodes[index];
11139
- if (!PM_NODE_TYPE_P(node, PM_STRING_NODE)) continue;
11140
-
11141
- // If the previous node wasn't a string node, we don't want to trim
11142
- // whitespace. This could happen after an interpolated expression or
11143
- // variable.
11144
- if (index == 0 || PM_NODE_TYPE_P(nodes->nodes[index - 1], PM_STRING_NODE)) {
11145
- common_whitespace = parse_heredoc_common_whitespace_for_single_node(parser, node, common_whitespace);
11146
- }
11147
- }
11148
-
11149
- return common_whitespace;
11150
- }
11151
-
11152
11150
static pm_string_t *
11153
- parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool dedent_node, int common_whitespace, pm_heredoc_quote_t quote)
11151
+ parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool dedent_node, size_t common_whitespace, pm_heredoc_quote_t quote)
11154
11152
{
11155
11153
// Get a reference to the string struct that is being held by the string
11156
11154
// node. This is the value we're going to actually manipulate.
@@ -11174,7 +11172,7 @@ parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool
11174
11172
// If we need to dedent the next element within the heredoc or the next
11175
11173
// line within the string node, then we'll do it here.
11176
11174
if (dedent_node) {
11177
- int trimmed_whitespace = 0;
11175
+ size_t trimmed_whitespace = 0;
11178
11176
11179
11177
// While we haven't reached the amount of common whitespace that we need
11180
11178
// to trim and we haven't reached the end of the string, we'll keep
@@ -11224,7 +11222,7 @@ parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool
11224
11222
11225
11223
// Take a heredoc node that is indented by a ~ and trim the leading whitespace.
11226
11224
static void
11227
- parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_quote_t quote)
11225
+ parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_quote_t quote, size_t common_whitespace )
11228
11226
{
11229
11227
pm_node_list_t *nodes;
11230
11228
@@ -11234,11 +11232,6 @@ parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_qu
11234
11232
nodes = &((pm_interpolated_string_node_t *) heredoc_node)->parts;
11235
11233
}
11236
11234
11237
- // First, calculate how much common whitespace we need to trim. If there is
11238
- // none or it's 0, then we can return early.
11239
- int common_whitespace;
11240
- if ((common_whitespace = parse_heredoc_common_whitespace(parser, nodes)) <= 0) return;
11241
-
11242
11235
// The next node should be dedented if it's the first node in the list or if
11243
11236
// it follows a string node.
11244
11237
bool dedent_next = true;
@@ -12525,9 +12518,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12525
12518
case PM_TOKEN_HEREDOC_START: {
12526
12519
// Here we have found a heredoc. We'll parse it and add it to the
12527
12520
// list of strings.
12528
- assert(parser->lex_modes.current->mode == PM_LEX_HEREDOC);
12529
- pm_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote;
12530
- pm_heredoc_indent_t indent = parser->lex_modes.current->as.heredoc.indent;
12521
+ pm_lex_mode_t *lex_mode = parser->lex_modes.current;
12522
+ assert(lex_mode->mode == PM_LEX_HEREDOC);
12523
+ pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
12524
+ pm_heredoc_indent_t indent = lex_mode->as.heredoc.indent;
12531
12525
12532
12526
parser_lex(parser);
12533
12527
pm_token_t opening = parser->previous;
@@ -12580,15 +12574,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12580
12574
cast->base.type = PM_X_STRING_NODE;
12581
12575
}
12582
12576
12583
- lex_state_set(parser, PM_LEX_STATE_END);
12584
- expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
12585
-
12586
12577
node = (pm_node_t *) cast;
12587
12578
12588
- if (indent == PM_HEREDOC_INDENT_TILDE) {
12589
- int common_whitespace = parse_heredoc_common_whitespace_for_single_node(parser, node, -1);
12590
- parse_heredoc_dedent_single_node(parser, &cast->unescaped, true, common_whitespace, quote);
12579
+ if (indent == PM_HEREDOC_INDENT_TILDE && (lex_mode->as.heredoc.common_whitespace != (size_t) -1) && (lex_mode->as.heredoc.common_whitespace != 0)) {
12580
+ parse_heredoc_dedent_single_node(parser, &cast->unescaped, true, lex_mode->as.heredoc.common_whitespace, quote);
12591
12581
}
12582
+
12583
+ lex_state_set(parser, PM_LEX_STATE_END);
12584
+ expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
12592
12585
} else {
12593
12586
// If we get here, then we have multiple parts in the heredoc,
12594
12587
// so we'll need to create an interpolated string node to hold
@@ -12636,8 +12629,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12636
12629
12637
12630
// If this is a heredoc that is indented with a ~, then we need
12638
12631
// to dedent each line by the common leading whitespace.
12639
- if (indent == PM_HEREDOC_INDENT_TILDE) {
12640
- parse_heredoc_dedent(parser, node, quote);
12632
+ if (indent == PM_HEREDOC_INDENT_TILDE && (lex_mode->as.heredoc.common_whitespace != (size_t) -1) && (lex_mode->as.heredoc.common_whitespace != 0) ) {
12633
+ parse_heredoc_dedent(parser, node, quote, lex_mode->as.heredoc.common_whitespace );
12641
12634
}
12642
12635
}
12643
12636
}
0 commit comments