Skip to content

Commit c3f43b6

Browse files
committed
Calculate heredoc common whitespace while lexing
1 parent c7ea494 commit c3f43b6

File tree

2 files changed

+96
-98
lines changed

2 files changed

+96
-98
lines changed

include/prism/parser.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -173,6 +173,11 @@ typedef struct pm_lex_mode {
173173
// This is the pointer to the character where lexing should resume
174174
// once the heredoc has been completely processed.
175175
const uint8_t *next_start;
176+
177+
// This is used to track the amount of common whitespace on each
178+
// line so that we know how much to dedent each line in the case of
179+
// a tilde heredoc.
180+
size_t common_whitespace;
176181
} heredoc;
177182
} as;
178183

src/prism.c

Lines changed: 91 additions & 98 deletions
Original file line number | Diff line number | Diff line change
@@ -7264,7 +7264,8 @@ parser_lex(pm_parser_t *parser) {
72647264
.ident_length = ident_length,
72657265
.next_start = parser->current.end,
72667266
.quote = quote,
7267-
.indent = indent
7267+
.indent = indent,
7268+
.common_whitespace = (size_t) -1
72687269
}
72697270
});
72707271

@@ -8434,8 +8435,30 @@ parser_lex(pm_parser_t *parser) {
84348435
// terminator, then we need to return the ending of the heredoc.
84358436
if (current_token_starts_line(parser)) {
84368437
const uint8_t *start = parser->current.start;
8437-
if (lex_mode->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
8438-
start += pm_strspn_inline_whitespace(start, parser->end - start);
8438+
size_t whitespace = 0;
8439+
8440+
switch (lex_mode->as.heredoc.indent) {
8441+
case PM_HEREDOC_INDENT_NONE:
8442+
// Do nothing, we can't match a terminator with
8443+
// indentation and there's no need to calculate common
8444+
// whitespace.
8445+
break;
8446+
case PM_HEREDOC_INDENT_DASH:
8447+
// Skip past inline whitespace.
8448+
start += pm_strspn_inline_whitespace(start, parser->end - start);
8449+
break;
8450+
case PM_HEREDOC_INDENT_TILDE:
8451+
// Skip past inline whitespace and calculate common
8452+
// whitespace.
8453+
while (start < parser->end && pm_char_is_inline_whitespace(*start)) {
8454+
if (*start == '\t') {
8455+
whitespace = (whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
8456+
} else {
8457+
whitespace++;
8458+
}
8459+
start++;
8460+
}
8461+
break;
84398462
}
84408463

84418464
if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
@@ -8468,6 +8491,14 @@ parser_lex(pm_parser_t *parser) {
84688491
LEX(PM_TOKEN_HEREDOC_END);
84698492
}
84708493
}
8494+
8495+
if (
8496+
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
8497+
(lex_mode->as.heredoc.common_whitespace > whitespace) &&
8498+
peek_at(parser, start) != '\n'
8499+
) {
8500+
lex_mode->as.heredoc.common_whitespace = whitespace;
8501+
}
84718502
}
84728503

84738504
// Otherwise we'll be parsing string content. These are the places
@@ -8500,21 +8531,47 @@ parser_lex(pm_parser_t *parser) {
85008531

85018532
pm_newline_list_append(&parser->newline_list, breakpoint);
85028533

8534+
// If we have a - or ~ heredoc, then we can match after
8535+
// some leading whitespace.
85038536
const uint8_t *start = breakpoint + 1;
8504-
if (lex_mode->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
8505-
start += pm_strspn_inline_whitespace(start, parser->end - start);
8537+
size_t whitespace = 0;
8538+
8539+
switch (lex_mode->as.heredoc.indent) {
8540+
case PM_HEREDOC_INDENT_NONE:
8541+
// Do nothing, we can't match a terminator with
8542+
// indentation and there's no need to calculate
8543+
// common whitespace.
8544+
break;
8545+
case PM_HEREDOC_INDENT_DASH:
8546+
// Skip past inline whitespace.
8547+
start += pm_strspn_inline_whitespace(start, parser->end - start);
8548+
break;
8549+
case PM_HEREDOC_INDENT_TILDE:
8550+
// Skip past inline whitespace and calculate common
8551+
// whitespace.
8552+
while (start < parser->end && pm_char_is_inline_whitespace(*start)) {
8553+
if (*start == '\t') {
8554+
whitespace = (whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
8555+
} else {
8556+
whitespace++;
8557+
}
8558+
start++;
8559+
}
8560+
break;
85068561
}
85078562

8508-
// If we have hit a newline that is followed by a valid terminator,
8509-
// then we need to return the content of the heredoc here as string
8510-
// content. Then, the next time a token is lexed, it will match
8511-
// again and return the end of the heredoc.
8563+
// If we have hit a newline that is followed by a valid
8564+
// terminator, then we need to return the content of the
8565+
// heredoc here as string content. Then, the next time a
8566+
// token is lexed, it will match again and return the
8567+
// end of the heredoc.
85128568
if (
85138569
!was_escaped_newline &&
85148570
(start + ident_length <= parser->end) &&
85158571
(memcmp(start, ident_start, ident_length) == 0)
85168572
) {
8517-
// Heredoc terminators must be followed by a newline, CRLF, or EOF to be valid.
8573+
// Heredoc terminators must be followed by a
8574+
// newline, CRLF, or EOF to be valid.
85188575
if (
85198576
start + ident_length == parser->end ||
85208577
match_eol_at(parser, start + ident_length)
@@ -8525,8 +8582,16 @@ parser_lex(pm_parser_t *parser) {
85258582
}
85268583
}
85278584

8528-
// Otherwise we hit a newline and it wasn't followed by a
8529-
// terminator, so we can continue parsing.
8585+
if (
8586+
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
8587+
(lex_mode->as.heredoc.common_whitespace > whitespace) &&
8588+
peek_at(parser, start) != '\n'
8589+
) {
8590+
lex_mode->as.heredoc.common_whitespace = whitespace;
8591+
}
8592+
8593+
// Otherwise we hit a newline and it wasn't followed by
8594+
// a terminator, so we can continue parsing.
85308595
breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
85318596
break;
85328597
}
@@ -11082,75 +11147,8 @@ parse_method_definition_name(pm_parser_t *parser) {
1108211147
}
1108311148
}
1108411149

11085-
static int
11086-
parse_heredoc_common_whitespace_for_single_node(pm_parser_t *parser, pm_node_t *node, int common_whitespace)
11087-
{
11088-
const pm_location_t *content_loc = &((pm_string_node_t *) node)->content_loc;
11089-
int cur_whitespace;
11090-
const uint8_t *cur_char = content_loc->start;
11091-
11092-
while (cur_char && cur_char < content_loc->end) {
11093-
// Any empty newlines aren't included in the minimum whitespace
11094-
// calculation.
11095-
size_t eol_length;
11096-
while ((eol_length = match_eol_at(parser, cur_char))) {
11097-
cur_char += eol_length;
11098-
}
11099-
11100-
if (cur_char == content_loc->end) break;
11101-
11102-
cur_whitespace = 0;
11103-
11104-
while (pm_char_is_inline_whitespace(*cur_char) && cur_char < content_loc->end) {
11105-
if (cur_char[0] == '\t') {
11106-
cur_whitespace = (cur_whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
11107-
} else {
11108-
cur_whitespace++;
11109-
}
11110-
cur_char++;
11111-
}
11112-
11113-
// If we hit a newline, then we have encountered a line that
11114-
// contains only whitespace, and it shouldn't be considered in
11115-
// the calculation of common leading whitespace.
11116-
eol_length = match_eol_at(parser, cur_char);
11117-
if (eol_length) {
11118-
cur_char += eol_length;
11119-
continue;
11120-
}
11121-
11122-
if (cur_whitespace < common_whitespace || common_whitespace == -1) {
11123-
common_whitespace = cur_whitespace;
11124-
}
11125-
11126-
cur_char = next_newline(cur_char + 1, parser->end - (cur_char + 1));
11127-
if (cur_char) cur_char++;
11128-
}
11129-
return common_whitespace;
11130-
}
11131-
11132-
// Calculate the common leading whitespace for each line in a heredoc.
11133-
static int
11134-
parse_heredoc_common_whitespace(pm_parser_t *parser, pm_node_list_t *nodes) {
11135-
int common_whitespace = -1;
11136-
11137-
for (size_t index = 0; index < nodes->size; index++) {
11138-
pm_node_t *node = nodes->nodes[index];
11139-
if (!PM_NODE_TYPE_P(node, PM_STRING_NODE)) continue;
11140-
11141-
// If the previous node wasn't a string node, we don't want to trim
11142-
// whitespace. This could happen after an interpolated expression or
11143-
// variable.
11144-
if (index == 0 || PM_NODE_TYPE_P(nodes->nodes[index - 1], PM_STRING_NODE)) {
11145-
common_whitespace = parse_heredoc_common_whitespace_for_single_node(parser, node, common_whitespace);
11146-
}
11147-
}
11148-
11149-
return common_whitespace;
11150-
}
11151-
1115211150
static pm_string_t *
11153-
parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool dedent_node, int common_whitespace, pm_heredoc_quote_t quote)
11151+
parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool dedent_node, size_t common_whitespace, pm_heredoc_quote_t quote)
1115411152
{
1115511153
// Get a reference to the string struct that is being held by the string
1115611154
// node. This is the value we're going to actually manipulate.
@@ -11174,7 +11172,7 @@ parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool
1117411172
// If we need to dedent the next element within the heredoc or the next
1117511173
// line within the string node, then we'll do it here.
1117611174
if (dedent_node) {
11177-
int trimmed_whitespace = 0;
11175+
size_t trimmed_whitespace = 0;
1117811176

1117911177
// While we haven't reached the amount of common whitespace that we need
1118011178
// to trim and we haven't reached the end of the string, we'll keep
@@ -11224,7 +11222,7 @@ parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool
1122411222

1122511223
// Take a heredoc node that is indented by a ~ and trim the leading whitespace.
1122611224
static void
11227-
parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_quote_t quote)
11225+
parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_quote_t quote, size_t common_whitespace)
1122811226
{
1122911227
pm_node_list_t *nodes;
1123011228

@@ -11234,11 +11232,6 @@ parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_qu
1123411232
nodes = &((pm_interpolated_string_node_t *) heredoc_node)->parts;
1123511233
}
1123611234

11237-
// First, calculate how much common whitespace we need to trim. If there is
11238-
// none or it's 0, then we can return early.
11239-
int common_whitespace;
11240-
if ((common_whitespace = parse_heredoc_common_whitespace(parser, nodes)) <= 0) return;
11241-
1124211235
// The next node should be dedented if it's the first node in the list or if
1124311236
// it follows a string node.
1124411237
bool dedent_next = true;
@@ -12525,9 +12518,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
1252512518
case PM_TOKEN_HEREDOC_START: {
1252612519
// Here we have found a heredoc. We'll parse it and add it to the
1252712520
// list of strings.
12528-
assert(parser->lex_modes.current->mode == PM_LEX_HEREDOC);
12529-
pm_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote;
12530-
pm_heredoc_indent_t indent = parser->lex_modes.current->as.heredoc.indent;
12521+
pm_lex_mode_t *lex_mode = parser->lex_modes.current;
12522+
assert(lex_mode->mode == PM_LEX_HEREDOC);
12523+
pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
12524+
pm_heredoc_indent_t indent = lex_mode->as.heredoc.indent;
1253112525

1253212526
parser_lex(parser);
1253312527
pm_token_t opening = parser->previous;
@@ -12580,15 +12574,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
1258012574
cast->base.type = PM_X_STRING_NODE;
1258112575
}
1258212576

12583-
lex_state_set(parser, PM_LEX_STATE_END);
12584-
expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
12585-
1258612577
node = (pm_node_t *) cast;
1258712578

12588-
if (indent == PM_HEREDOC_INDENT_TILDE) {
12589-
int common_whitespace = parse_heredoc_common_whitespace_for_single_node(parser, node, -1);
12590-
parse_heredoc_dedent_single_node(parser, &cast->unescaped, true, common_whitespace, quote);
12579+
if (indent == PM_HEREDOC_INDENT_TILDE && (lex_mode->as.heredoc.common_whitespace != (size_t) -1) && (lex_mode->as.heredoc.common_whitespace != 0)) {
12580+
parse_heredoc_dedent_single_node(parser, &cast->unescaped, true, lex_mode->as.heredoc.common_whitespace, quote);
1259112581
}
12582+
12583+
lex_state_set(parser, PM_LEX_STATE_END);
12584+
expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
1259212585
} else {
1259312586
// If we get here, then we have multiple parts in the heredoc,
1259412587
// so we'll need to create an interpolated string node to hold
@@ -12636,8 +12629,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
1263612629

1263712630
// If this is a heredoc that is indented with a ~, then we need
1263812631
// to dedent each line by the common leading whitespace.
12639-
if (indent == PM_HEREDOC_INDENT_TILDE) {
12640-
parse_heredoc_dedent(parser, node, quote);
12632+
if (indent == PM_HEREDOC_INDENT_TILDE && (lex_mode->as.heredoc.common_whitespace != (size_t) -1) && (lex_mode->as.heredoc.common_whitespace != 0)) {
12633+
parse_heredoc_dedent(parser, node, quote, lex_mode->as.heredoc.common_whitespace);
1264112634
}
1264212635
}
1264312636
}

0 commit comments

Comments
 (0)