diff --git a/prism/encoding.h b/prism/encoding.h index d0f947eacdab0b..a1af1298e0c533 100644 --- a/prism/encoding.h +++ b/prism/encoding.h @@ -248,7 +248,7 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM]; /** * This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk * can compare against it because invalid multibyte characters are not a thing - * in this encoding. + * in this encoding. It is also needed for handling Regexp encoding flags. */ #define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT]) diff --git a/prism/parser.h b/prism/parser.h index 02f60192d559c6..cf5f702a871e92 100644 --- a/prism/parser.h +++ b/prism/parser.h @@ -663,6 +663,17 @@ struct pm_parser { */ pm_string_t current_string; + /** + * This string is used to pass information from the lexer to the parser. When + * processing regular expressions we must track the string source for the expression + * as well as its unescaped representation. In that case, `current_string` will hold + * the unescaped value while this field will hold the translated source value. There + * are some escape sequences in regular expressions that will cause the associated + * source string to have a different value than the content of the expression so we + * must track this state separately. + */ + pm_string_t current_regular_expression_source; + /** * The line number at the start of the parse. This will be used to offset * the line numbers of all of the locations. diff --git a/prism/prism.c b/prism/prism.c index 6921feac48fffe..6e4de22ec1eab4 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -5949,6 +5949,34 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) { return 0; } +/** + * Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and + * the unescaped representation of a Regexp source consists only of US-ASCII code points. 
This is true even + * when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding + * may be explicitly set with an escape sequence. + */ +static inline pm_node_flags_t +parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t *contents) { + // Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all regular expressions + // appearing in source are eligible for "downgrading" to US-ASCII. + if (pm_ascii_only_p(contents)) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING; + } + + // A Regexp may optionally have its encoding explicitly set via a character escape sequence in the source string + // or by specifying a modifier. + // + // NB: an explicitly set encoding is ignored by Ruby if the Regexp consists of only US ASCII code points. + if (parser->explicit_encoding != NULL) { + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; + } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING; + } + } + return 0; +} + /** * Allocate and initialize a new SymbolNode node with the given unescaped * string. @@ -8130,34 +8158,34 @@ escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer) { * source so that the regular expression engine will perform its own unescaping. 
*/ static inline void -escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags, uint8_t byte) { +escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags, uint8_t byte) { if (flags & PM_ESCAPE_FLAG_REGEXP) { - pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2); + pm_buffer_append_bytes(regular_expression_buffer, (const uint8_t *) "\\x", 2); uint8_t byte1 = (uint8_t) ((byte >> 4) & 0xF); uint8_t byte2 = (uint8_t) (byte & 0xF); if (byte1 >= 0xA) { - pm_buffer_append_byte(buffer, (uint8_t) ((byte1 - 0xA) + 'A')); + pm_buffer_append_byte(regular_expression_buffer, (uint8_t) ((byte1 - 0xA) + 'A')); } else { - pm_buffer_append_byte(buffer, (uint8_t) (byte1 + '0')); + pm_buffer_append_byte(regular_expression_buffer, (uint8_t) (byte1 + '0')); } if (byte2 >= 0xA) { - pm_buffer_append_byte(buffer, (uint8_t) (byte2 - 0xA + 'A')); + pm_buffer_append_byte(regular_expression_buffer, (uint8_t) (byte2 - 0xA + 'A')); } else { - pm_buffer_append_byte(buffer, (uint8_t) (byte2 + '0')); + pm_buffer_append_byte(regular_expression_buffer, (uint8_t) (byte2 + '0')); } - } else { - escape_write_byte_encoded(parser, buffer, byte); } + + escape_write_byte_encoded(parser, buffer, byte); } /** * Read the value of an escape into the buffer. 
*/ static void -escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { +escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags) { switch (peek(parser)) { case '\\': { parser->current.end++; @@ -8248,10 +8276,10 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } if (flags & PM_ESCAPE_FLAG_REGEXP) { - pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start)); - } else { - escape_write_byte_encoded(parser, buffer, value); + pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); } + + escape_write_byte_encoded(parser, buffer, value); } else { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL); } @@ -8272,10 +8300,9 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { uint32_t value = escape_unicode(parser->current.end, 4); if (flags & PM_ESCAPE_FLAG_REGEXP) { - pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start)); - } else { - escape_write_unicode(parser, buffer, flags, start, parser->current.end + 4, value); + pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end + 4 - start)); } + escape_write_unicode(parser, buffer, flags, start, parser->current.end + 4, value); parser->current.end += 4; } else if (peek(parser) == '{') { @@ -8306,10 +8333,8 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { extra_codepoints_start = unicode_start; } - if (!(flags & PM_ESCAPE_FLAG_REGEXP)) { - uint32_t value = escape_unicode(unicode_start, hexadecimal_length); - escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value); - } + uint32_t value = escape_unicode(unicode_start, hexadecimal_length); + escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value); parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end); } @@ 
-8327,7 +8352,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } if (flags & PM_ESCAPE_FLAG_REGEXP) { - pm_buffer_append_bytes(buffer, unicode_codepoints_start, (size_t) (parser->current.end - unicode_codepoints_start)); + pm_buffer_append_bytes(regular_expression_buffer, unicode_codepoints_start, (size_t) (parser->current.end - unicode_codepoints_start)); } } else { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE); @@ -8346,7 +8371,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { switch (peeked) { case '?': { parser->current.end++; - escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags)); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(0x7f, flags)); return; } case '\\': @@ -8355,7 +8380,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { return; } parser->current.end++; - escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL); + escape_read(parser, buffer, regular_expression_buffer, flags | PM_ESCAPE_FLAG_CONTROL); return; default: { if (!char_is_ascii_printable(peeked)) { @@ -8364,7 +8389,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } parser->current.end++; - escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; } } @@ -8386,7 +8411,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { switch (peeked) { case '?': { parser->current.end++; - escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags)); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(0x7f, flags)); return; } case '\\': @@ -8395,7 +8420,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { return; } parser->current.end++; - escape_read(parser, buffer, flags | 
PM_ESCAPE_FLAG_CONTROL); + escape_read(parser, buffer, regular_expression_buffer, flags | PM_ESCAPE_FLAG_CONTROL); return; default: { if (!char_is_ascii_printable(peeked)) { @@ -8404,7 +8429,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } parser->current.end++; - escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; } } @@ -8429,7 +8454,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { return; } parser->current.end++; - escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_META); + escape_read(parser, buffer, regular_expression_buffer, flags | PM_ESCAPE_FLAG_META); return; } @@ -8439,7 +8464,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } parser->current.end++; - escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); + escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); return; } case '\r': { @@ -8510,7 +8535,7 @@ lex_question_mark(pm_parser_t *parser) { pm_buffer_t buffer; pm_buffer_init_capacity(&buffer, 3); - escape_read(parser, &buffer, PM_ESCAPE_FLAG_SINGLE); + escape_read(parser, &buffer, NULL, PM_ESCAPE_FLAG_SINGLE); pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length); return PM_TOKEN_CHARACTER_LITERAL; @@ -8724,7 +8749,7 @@ parser_end_of_line_p(const pm_parser_t *parser) { * "foo\n" * * then the bytes in the string are "f", "o", "o", "\", "n", but we want to - * provide out consumers with the string content "f", "o", "o", "\n". In these + * provide our consumers with the string content "f", "o", "o", "\n". In these * cases, when we find the first escape sequence, we initialize a pm_buffer_t * to keep track of the string content. 
Then in the parser, it will * automatically attach the string content to the node that it belongs to. @@ -8736,6 +8761,20 @@ typedef struct { */ pm_buffer_t buffer; + /** + * In order to properly set a regular expression's encoding and to validate + * the byte sequence for the underlying encoding we must process any escape + * sequences. The unescaped byte sequence will be stored in `buffer` just like + * for other string-like types. However, we also need to store the regular + * expression's source string. That string may differ from what we see + * during lexing because some escape sequences rewrite the source. + * + * This value will only be initialized for regular expressions and only if we + * receive an escape sequence. It will contain the regular expression's source + * string's byte sequence. + */ + pm_buffer_t regular_expression_buffer; + /** * The cursor into the source string that points to how far we have * currently copied into the buffer. @@ -8751,19 +8790,29 @@ pm_token_buffer_push_byte(pm_token_buffer_t *token_buffer, uint8_t byte) { pm_buffer_append_byte(&token_buffer->buffer, byte); } +static inline void +pm_token_buffer_push_byte_regular_expression(pm_token_buffer_t *token_buffer, uint8_t byte) { + pm_buffer_append_byte(&token_buffer->regular_expression_buffer, byte); +} + + /** * Append the given bytes into the token buffer. */ static inline void -pm_token_buffer_push_bytes(pm_token_buffer_t *token_buffer, const uint8_t *bytes, size_t length) { +pm_token_buffer_push_bytes(pm_token_buffer_t *token_buffer, const uint8_t *bytes, size_t length, uint8_t flags) { pm_buffer_append_bytes(&token_buffer->buffer, bytes, length); + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(&token_buffer->regular_expression_buffer, bytes, length); + } } /** * Push an escaped character into the token buffer. 
*/ static inline void -pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser) { +pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser, uint8_t flags) { // First, determine the width of the character to be escaped. size_t width; if (parser->encoding_changed) { @@ -8777,7 +8826,7 @@ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parse width = (width == 0 ? 1 : width); // Now, push the bytes into the buffer. - pm_token_buffer_push_bytes(token_buffer, parser->current.end, width); + pm_token_buffer_push_bytes(token_buffer, parser->current.end, width, flags); parser->current.end += width; } @@ -8790,6 +8839,7 @@ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parse static inline void pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { pm_string_owned_init(&parser->current_string, (uint8_t *) token_buffer->buffer.value, token_buffer->buffer.length); + pm_string_owned_init(&parser->current_regular_expression_source, (uint8_t *) token_buffer->regular_expression_buffer.value, token_buffer->regular_expression_buffer.length); } /** @@ -8805,8 +8855,10 @@ static void pm_token_buffer_flush(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { if (token_buffer->cursor == NULL) { pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end); + pm_string_shared_init(&parser->current_regular_expression_source, parser->current.start, parser->current.end); } else { pm_buffer_append_bytes(&token_buffer->buffer, token_buffer->cursor, (size_t) (parser->current.end - token_buffer->cursor)); + pm_buffer_append_bytes(&token_buffer->regular_expression_buffer, token_buffer->cursor, (size_t) (parser->current.end - token_buffer->cursor)); pm_token_buffer_copy(parser, token_buffer); } } @@ -8824,6 +8876,7 @@ pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { const uint8_t *start; if 
(token_buffer->cursor == NULL) { pm_buffer_init_capacity(&token_buffer->buffer, 16); + pm_buffer_init_capacity(&token_buffer->regular_expression_buffer, 16); start = parser->current.start; } else { start = token_buffer->cursor; @@ -8831,6 +8884,7 @@ pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { const uint8_t *end = parser->current.end - 1; pm_buffer_append_bytes(&token_buffer->buffer, start, (size_t) (end - start)); + pm_buffer_append_bytes(&token_buffer->regular_expression_buffer, start, (size_t) (end - start)); token_buffer->cursor = end; } @@ -10143,7 +10197,7 @@ parser_lex(pm_parser_t *parser) { // If we haven't found an escape yet, then this buffer will be // unallocated since we can refer directly to the source string. - pm_token_buffer_t token_buffer = { { 0 }, 0 }; + pm_token_buffer_t token_buffer = { { 0 }, { 0 }, 0 }; while (breakpoint != NULL) { // If we hit a null byte, skip directly past it. @@ -10242,10 +10296,10 @@ parser_lex(pm_parser_t *parser) { pm_token_buffer_push_byte(&token_buffer, peeked); parser->current.end++; } else if (lex_mode->as.list.interpolation) { - escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE); + escape_read(parser, &token_buffer.buffer, NULL, PM_ESCAPE_FLAG_NONE); } else { pm_token_buffer_push_byte(&token_buffer, '\\'); - pm_token_buffer_push_escaped(&token_buffer, parser); + pm_token_buffer_push_escaped(&token_buffer, parser, PM_ESCAPE_FLAG_NONE); } break; @@ -10320,7 +10374,7 @@ parser_lex(pm_parser_t *parser) { // characters. const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints; const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); - pm_token_buffer_t token_buffer = { { 0 }, 0 }; + pm_token_buffer_t token_buffer = { { 0 }, { 0 }, 0 }; while (breakpoint != NULL) { // If we hit a null byte, skip directly past it. 
@@ -10403,9 +10457,10 @@ parser_lex(pm_parser_t *parser) { parser->current.end++; if (peek(parser) != '\n') { if (lex_mode->as.regexp.terminator != '\r') { - pm_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_byte_regular_expression(&token_buffer, '\\'); } pm_token_buffer_push_byte(&token_buffer, '\r'); + pm_token_buffer_push_byte_regular_expression(&token_buffer, '\r'); break; } /* fallthrough */ @@ -10429,7 +10484,7 @@ parser_lex(pm_parser_t *parser) { case 'M': case 'u': case 'x': - escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_REGEXP); + escape_read(parser, &token_buffer.buffer, &token_buffer.regular_expression_buffer, PM_ESCAPE_FLAG_REGEXP); break; default: if (lex_mode->as.regexp.terminator == peeked) { @@ -10440,19 +10495,20 @@ parser_lex(pm_parser_t *parser) { case '$': case ')': case '*': case '+': case '.': case '>': case '?': case ']': case '^': case '|': case '}': - pm_token_buffer_push_byte(&token_buffer, '\\'); + pm_token_buffer_push_byte_regular_expression(&token_buffer, '\\'); break; default: break; } pm_token_buffer_push_byte(&token_buffer, peeked); + pm_token_buffer_push_byte_regular_expression(&token_buffer, peeked); parser->current.end++; break; } - if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer, '\\'); - pm_token_buffer_push_escaped(&token_buffer, parser); + if (peeked < 0x80) pm_token_buffer_push_byte_regular_expression(&token_buffer, '\\'); + pm_token_buffer_push_escaped(&token_buffer, parser, PM_ESCAPE_FLAG_REGEXP); break; } @@ -10525,7 +10581,7 @@ parser_lex(pm_parser_t *parser) { // If we haven't found an escape yet, then this buffer will be // unallocated since we can refer directly to the source string. 
- pm_token_buffer_t token_buffer = { { 0 }, 0 }; + pm_token_buffer_t token_buffer = { { 0 }, { 0 }, 0 }; while (breakpoint != NULL) { // If we hit the incrementor, then we'll increment then nesting and @@ -10660,10 +10716,10 @@ parser_lex(pm_parser_t *parser) { pm_token_buffer_push_byte(&token_buffer, peeked); parser->current.end++; } else if (lex_mode->as.string.interpolation) { - escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE); + escape_read(parser, &token_buffer.buffer, NULL, PM_ESCAPE_FLAG_NONE); } else { pm_token_buffer_push_byte(&token_buffer, '\\'); - pm_token_buffer_push_escaped(&token_buffer, parser); + pm_token_buffer_push_escaped(&token_buffer, parser, PM_ESCAPE_FLAG_NONE); } break; @@ -10813,7 +10869,7 @@ parser_lex(pm_parser_t *parser) { } const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); - pm_token_buffer_t token_buffer = { { 0 }, 0 }; + pm_token_buffer_t token_buffer = { { 0 }, { 0 }, 0 }; bool was_line_continuation = false; while (breakpoint != NULL) { @@ -10935,7 +10991,7 @@ parser_lex(pm_parser_t *parser) { continue; default: pm_token_buffer_push_byte(&token_buffer, '\\'); - pm_token_buffer_push_escaped(&token_buffer, parser); + pm_token_buffer_push_escaped(&token_buffer, parser, PM_ESCAPE_FLAG_NONE); break; } } else { @@ -10972,7 +11028,7 @@ parser_lex(pm_parser_t *parser) { breakpoint = parser->current.end; continue; default: - escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE); + escape_read(parser, &token_buffer.buffer, NULL, PM_ESCAPE_FLAG_NONE); break; } } @@ -16948,7 +17004,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b }; parser_lex(parser); - return (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous); + + pm_node_t *regular_expression_node = (pm_node_t *) (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous); + 
pm_node_flag_set(regular_expression_node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING); + + return regular_expression_node; } pm_interpolated_regular_expression_node_t *node; @@ -16959,6 +17019,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // following token is the end (in which case we can return a plain // regular expression) or if it's not then it has interpolation. pm_string_t unescaped = parser->current_string; + pm_string_t source = parser->current_regular_expression_source; pm_token_t content = parser->current; parser_lex(parser); @@ -16966,7 +17027,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // without interpolation, which can be represented more succinctly and // more easily compiled. if (accept1(parser, PM_TOKEN_REGEXP_END)) { - return (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); + pm_node_t *regular_expression_node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source); + pm_node_flag_set(regular_expression_node, parse_regular_expression_encoding(parser, &unescaped)); + return regular_expression_node; } // If we get here, then we have interpolation so we'll need to create @@ -18527,6 +18590,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm .newline_list = { 0 }, .integer_base = 0, .current_string = PM_STRING_EMPTY, + .current_regular_expression_source = PM_STRING_EMPTY, .start_line = 1, .explicit_encoding = NULL, .command_line = 0, diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index 44491bf0d5b110..762dcde7178943 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -149,6 +149,7 @@ class EncodingTest < TestCase escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"] escapes = 
escapes.concat(escapes.product(escapes).map(&:join)) symbols = [:a, :ą, :+] + regexps = [/a/, /ą/, //] encodings.each_key do |encoding| define_method(:"test_encoding_flags_#{encoding.name}") do @@ -168,6 +169,18 @@ class EncodingTest < TestCase end end + encodings.each_key do |encoding| + define_method(:"test_regular_expression_encoding_flags_#{encoding.name}") do + assert_regular_expression_encoding_flags(encoding, regexps.map(&:inspect)) + end + end + + encodings.each_key do |encoding| + define_method(:"test_regular_expression_escape_encoding_flags_#{encoding.name}") do + assert_regular_expression_encoding_flags(encoding, escapes.map { |e| "/#{e}/" }) + end + end + def test_coding result = Prism.parse("# coding: utf-8\n'string'") actual = result.value.statements.body.first.unescaped.encoding @@ -454,5 +467,50 @@ def assert_symbol_character_escape_encoding_flags(encoding, escapes) assert_equal expected, actual end end + + def assert_regular_expression_encoding_flags(encoding, regexps) + regexps.each do |regexp| + source = "# encoding: #{encoding.name}\n#{regexp}" + + expected = + begin + eval(source).encoding + rescue SyntaxError => error + if error.message.include?("UTF-8 character in non UTF-8 regexp") || error.message.include?("escaped non ASCII character in UTF-8 regexp") + error.message[/: (.+?)\n/, 1] + elsif error.message.include?("invalid multibyte char") + # TODO (nirvdrum 26-Jan-2024): Bail out early of the rest of the test due to https://github.com/ruby/prism/issues/2104. + next + else + raise + end + end + + actual = + Prism.parse(source).then do |result| + if result.success? + regexp = result.value.statements.body.first + + if regexp.forced_utf8_encoding? + Encoding::UTF_8 + elsif regexp.forced_binary_encoding? + Encoding::ASCII_8BIT + elsif regexp.forced_us_ascii_encoding? 
+ Encoding::US_ASCII + else + encoding + end + else + error = result.errors.last + + unless error.message.include?("UTF-8 mixed within") + raise error.message + end + end + end + + assert_equal expected, actual + end + end end end diff --git a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt index 5d17559ed44f99..acc6b082fcd757 100644 --- a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt +++ b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt @@ -21,7 +21,7 @@ │ │ ├── flags: ∅ │ │ └── arguments: (length: 2) │ │ ├── @ RegularExpressionNode (location: (1,15)-(1,21)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (1,15)-(1,16) = "/" │ │ │ ├── content_loc: (1,16)-(1,20) = "^\\s{" │ │ │ ├── closing_loc: (1,20)-(1,21) = "/" @@ -52,7 +52,7 @@ │ ├── flags: ∅ │ └── arguments: (length: 2) │ ├── @ RegularExpressionNode (location: (5,15)-(5,21)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (5,15)-(5,16) = "/" │ │ ├── content_loc: (5,16)-(5,20) = "^\\s{" │ │ ├── closing_loc: (5,20)-(5,21) = "/" diff --git a/test/prism/snapshots/newline_terminated.txt b/test/prism/snapshots/newline_terminated.txt index 496e86fbe44f4a..6a3b28dba98d3a 100644 --- a/test/prism/snapshots/newline_terminated.txt +++ b/test/prism/snapshots/newline_terminated.txt @@ -100,7 +100,7 @@ │ ├── closing_loc: (37,3)-(38,0) = "\n" │ └── unescaped: "foo" └── @ RegularExpressionNode (location: (39,0)-(41,0)) - ├── flags: ∅ + ├── flags: forced_us_ascii_encoding ├── opening_loc: (39,0)-(40,0) = "%r\n" ├── content_loc: (40,0)-(40,3) = "foo" ├── closing_loc: (40,3)-(41,0) = "\n" diff --git a/test/prism/snapshots/patterns.txt b/test/prism/snapshots/patterns.txt index 96205349d33063..4efd3159fcb176 100644 --- a/test/prism/snapshots/patterns.txt +++ b/test/prism/snapshots/patterns.txt @@ -165,7 +165,7 @@ │ │ └── block: ∅ │ ├── pattern: │ │ 
@ RegularExpressionNode (location: (9,7)-(9,12)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (9,7)-(9,8) = "/" │ │ ├── content_loc: (9,8)-(9,11) = "foo" │ │ ├── closing_loc: (9,11)-(9,12) = "/" @@ -719,14 +719,14 @@ │ │ ├── flags: ∅ │ │ ├── left: │ │ │ @ RegularExpressionNode (location: (35,7)-(35,12)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (35,7)-(35,8) = "/" │ │ │ ├── content_loc: (35,8)-(35,11) = "foo" │ │ │ ├── closing_loc: (35,11)-(35,12) = "/" │ │ │ └── unescaped: "foo" │ │ ├── right: │ │ │ @ RegularExpressionNode (location: (35,16)-(35,21)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (35,16)-(35,17) = "/" │ │ │ ├── content_loc: (35,17)-(35,20) = "foo" │ │ │ ├── closing_loc: (35,20)-(35,21) = "/" @@ -2543,7 +2543,7 @@ │ │ └── block: ∅ │ ├── pattern: │ │ @ RegularExpressionNode (location: (112,7)-(112,12)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (112,7)-(112,8) = "/" │ │ ├── content_loc: (112,8)-(112,11) = "foo" │ │ ├── closing_loc: (112,11)-(112,12) = "/" @@ -3126,7 +3126,7 @@ │ │ └── @ InNode (location: (143,10)-(143,23)) │ │ ├── pattern: │ │ │ @ RegularExpressionNode (location: (143,13)-(143,18)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (143,13)-(143,14) = "/" │ │ │ ├── content_loc: (143,14)-(143,17) = "foo" │ │ │ ├── closing_loc: (143,17)-(143,18) = "/" @@ -3914,7 +3914,7 @@ │ │ │ │ @ StatementsNode (location: (170,13)-(170,18)) │ │ │ │ └── body: (length: 1) │ │ │ │ └── @ RegularExpressionNode (location: (170,13)-(170,18)) - │ │ │ │ ├── flags: ∅ + │ │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ │ ├── opening_loc: (170,13)-(170,14) = "/" │ │ │ │ ├── content_loc: (170,14)-(170,17) = "foo" │ │ │ │ ├── closing_loc: (170,17)-(170,18) = "/" diff --git a/test/prism/snapshots/regex.txt b/test/prism/snapshots/regex.txt index 
8f3e2595168b86..9e19bbb18d273c 100644 --- a/test/prism/snapshots/regex.txt +++ b/test/prism/snapshots/regex.txt @@ -15,7 +15,7 @@ │ │ ├── flags: ∅ │ │ └── arguments: (length: 1) │ │ └── @ RegularExpressionNode (location: (1,4)-(1,9)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (1,4)-(1,5) = "/" │ │ ├── content_loc: (1,5)-(1,8) = "bar" │ │ ├── closing_loc: (1,8)-(1,9) = "/" @@ -23,13 +23,13 @@ │ ├── closing_loc: ∅ │ └── block: ∅ ├── @ RegularExpressionNode (location: (3,0)-(3,8)) - │ ├── flags: ignore_case + │ ├── flags: ignore_case, forced_us_ascii_encoding │ ├── opening_loc: (3,0)-(3,3) = "%r{" │ ├── content_loc: (3,3)-(3,6) = "abc" │ ├── closing_loc: (3,6)-(3,8) = "}i" │ └── unescaped: "abc" ├── @ RegularExpressionNode (location: (5,0)-(5,5)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (5,0)-(5,1) = "/" │ ├── content_loc: (5,1)-(5,4) = "a\\b" │ ├── closing_loc: (5,4)-(5,5) = "/" @@ -92,7 +92,7 @@ │ │ │ │ ├── flags: ∅ │ │ │ │ ├── receiver: │ │ │ │ │ @ RegularExpressionNode (location: (11,1)-(11,14)) - │ │ │ │ │ ├── flags: ∅ + │ │ │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ │ │ ├── opening_loc: (11,1)-(11,2) = "/" │ │ │ │ │ ├── content_loc: (11,2)-(11,13) = "(?bar)" │ │ │ │ │ ├── closing_loc: (11,13)-(11,14) = "/" @@ -127,31 +127,31 @@ │ ├── opening_loc: (11,0)-(11,1) = "[" │ └── closing_loc: (11,26)-(11,27) = "]" ├── @ RegularExpressionNode (location: (13,0)-(13,6)) - │ ├── flags: ignore_case + │ ├── flags: ignore_case, forced_us_ascii_encoding │ ├── opening_loc: (13,0)-(13,1) = "/" │ ├── content_loc: (13,1)-(13,4) = "abc" │ ├── closing_loc: (13,4)-(13,6) = "/i" │ └── unescaped: "abc" ├── @ RegularExpressionNode (location: (15,0)-(15,26)) - │ ├── flags: ignore_case + │ ├── flags: ignore_case, forced_us_ascii_encoding │ ├── opening_loc: (15,0)-(15,3) = "%r/" │ ├── content_loc: (15,3)-(15,24) = "[a-z$._?][\\w$.?\#@~]*:" │ ├── closing_loc: (15,24)-(15,26) = "/i" │ └── unescaped: 
"[a-z$._?][\\w$.?\#@~]*:" ├── @ RegularExpressionNode (location: (17,0)-(17,37)) - │ ├── flags: ignore_case + │ ├── flags: ignore_case, forced_us_ascii_encoding │ ├── opening_loc: (17,0)-(17,3) = "%r/" │ ├── content_loc: (17,3)-(17,35) = "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)" │ ├── closing_loc: (17,35)-(17,37) = "/i" │ └── unescaped: "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)" ├── @ RegularExpressionNode (location: (19,0)-(19,25)) - │ ├── flags: ignore_case + │ ├── flags: ignore_case, forced_us_ascii_encoding │ ├── opening_loc: (19,0)-(19,3) = "%r/" │ ├── content_loc: (19,3)-(19,23) = "[a-z$._?][\\w$.?\#@~]*" │ ├── closing_loc: (19,23)-(19,25) = "/i" │ └── unescaped: "[a-z$._?][\\w$.?\#@~]*" ├── @ RegularExpressionNode (location: (21,0)-(24,1)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (21,0)-(21,3) = "%r(" │ ├── content_loc: (21,3)-(24,0) = "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n (?:[\\w\#$%_']+)\n" │ ├── closing_loc: (24,0)-(24,1) = ")" @@ -160,7 +160,7 @@ │ ├── flags: ∅ │ ├── receiver: │ │ @ RegularExpressionNode (location: (26,0)-(26,8)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (26,0)-(26,1) = "/" │ │ ├── content_loc: (26,1)-(26,7) = "(?#\\))" │ │ ├── closing_loc: (26,7)-(26,8) = "/" @@ -182,7 +182,7 @@ │ ├── closing_loc: ∅ │ └── block: ∅ ├── @ RegularExpressionNode (location: (28,0)-(28,9)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (28,0)-(28,3) = "%r#" │ ├── content_loc: (28,3)-(28,8) = "pound" │ ├── closing_loc: (28,8)-(28,9) = "#" @@ -220,7 +220,7 @@ │ │ ├── flags: ∅ │ │ ├── receiver: │ │ │ @ RegularExpressionNode (location: (32,0)-(33,4)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (32,0)-(32,1) = "/" │ │ │ ├── content_loc: (32,1)-(33,3) = "(?)" │ │ │ ├── closing_loc: (33,3)-(33,4) = "/" @@ -254,7 +254,7 @@ │ │ ├── flags: ∅ │ │ ├── receiver: │ │ │ @ RegularExpressionNode (location: (35,0)-(35,18)) - │ │ 
│ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (35,0)-(35,1) = "/" │ │ │ ├── content_loc: (35,1)-(35,17) = "(?)(?)" │ │ │ ├── closing_loc: (35,17)-(35,18) = "/" @@ -286,7 +286,7 @@ │ ├── flags: ∅ │ ├── receiver: │ │ @ RegularExpressionNode (location: (37,0)-(37,10)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (37,0)-(37,1) = "/" │ │ ├── content_loc: (37,1)-(37,9) = "(?)" │ │ ├── closing_loc: (37,9)-(37,10) = "/" @@ -338,7 +338,7 @@ │ │ ├── flags: ∅ │ │ ├── receiver: │ │ │ @ RegularExpressionNode (location: (40,6)-(40,14)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (40,6)-(40,7) = "/" │ │ │ ├── content_loc: (40,7)-(40,13) = "(?)" │ │ │ ├── closing_loc: (40,13)-(40,14) = "/" diff --git a/test/prism/snapshots/seattlerb/TestRubyParserShared.txt b/test/prism/snapshots/seattlerb/TestRubyParserShared.txt index 4a2a48b794bfde..fabc92e47795c9 100644 --- a/test/prism/snapshots/seattlerb/TestRubyParserShared.txt +++ b/test/prism/snapshots/seattlerb/TestRubyParserShared.txt @@ -70,7 +70,7 @@ │ ├── opening_loc: (26,0)-(26,3) = "%i[" │ └── closing_loc: (29,0)-(29,1) = "]" ├── @ RegularExpressionNode (location: (31,0)-(34,1)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (31,0)-(31,3) = "%r[" │ ├── content_loc: (31,3)-(34,0) = "\n\n\n" │ ├── closing_loc: (34,0)-(34,1) = "]" diff --git a/test/prism/snapshots/seattlerb/bug190.txt b/test/prism/snapshots/seattlerb/bug190.txt index f7eaefa5c67574..b261a166cf9da3 100644 --- a/test/prism/snapshots/seattlerb/bug190.txt +++ b/test/prism/snapshots/seattlerb/bug190.txt @@ -4,7 +4,7 @@ @ StatementsNode (location: (1,0)-(1,6)) └── body: (length: 1) └── @ RegularExpressionNode (location: (1,0)-(1,6)) - ├── flags: ∅ + ├── flags: forced_us_ascii_encoding ├── opening_loc: (1,0)-(1,3) = "%r'" ├── content_loc: (1,3)-(1,5) = "\\'" ├── closing_loc: (1,5)-(1,6) = "'" diff --git 
a/test/prism/snapshots/seattlerb/bug_case_when_regexp.txt b/test/prism/snapshots/seattlerb/bug_case_when_regexp.txt index f6a6f41c89e004..0cc1ca05e1f247 100644 --- a/test/prism/snapshots/seattlerb/bug_case_when_regexp.txt +++ b/test/prism/snapshots/seattlerb/bug_case_when_regexp.txt @@ -16,7 +16,7 @@ │ ├── keyword_loc: (1,9)-(1,13) = "when" │ ├── conditions: (length: 1) │ │ └── @ RegularExpressionNode (location: (1,14)-(1,17)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (1,14)-(1,15) = "/" │ │ ├── content_loc: (1,15)-(1,16) = "x" │ │ ├── closing_loc: (1,16)-(1,17) = "/" diff --git a/test/prism/snapshots/seattlerb/bug_cond_pct.txt b/test/prism/snapshots/seattlerb/bug_cond_pct.txt index 73cb18f5080dea..cbf3bc3ef02da1 100644 --- a/test/prism/snapshots/seattlerb/bug_cond_pct.txt +++ b/test/prism/snapshots/seattlerb/bug_cond_pct.txt @@ -10,7 +10,7 @@ │ ├── keyword_loc: (1,6)-(1,10) = "when" │ ├── conditions: (length: 1) │ │ └── @ RegularExpressionNode (location: (1,11)-(1,23)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (1,11)-(1,14) = "%r%" │ │ ├── content_loc: (1,14)-(1,22) = "blahblah" │ │ ├── closing_loc: (1,22)-(1,23) = "%" diff --git a/test/prism/snapshots/seattlerb/case_in.txt b/test/prism/snapshots/seattlerb/case_in.txt index 6a1cc56da6ba26..e7e291c63f8d1d 100644 --- a/test/prism/snapshots/seattlerb/case_in.txt +++ b/test/prism/snapshots/seattlerb/case_in.txt @@ -338,7 +338,7 @@ │ │ └── @ InNode (location: (46,0)-(46,11)) │ │ ├── pattern: │ │ │ @ RegularExpressionNode (location: (46,3)-(46,11)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (46,3)-(46,4) = "/" │ │ │ ├── content_loc: (46,4)-(46,10) = "regexp" │ │ │ ├── closing_loc: (46,10)-(46,11) = "/" diff --git a/test/prism/snapshots/seattlerb/regexp.txt b/test/prism/snapshots/seattlerb/regexp.txt index abe17918e56aa3..06cf99264e3c43 100644 --- a/test/prism/snapshots/seattlerb/regexp.txt +++ 
b/test/prism/snapshots/seattlerb/regexp.txt @@ -4,31 +4,31 @@ @ StatementsNode (location: (1,0)-(9,13)) └── body: (length: 5) ├── @ RegularExpressionNode (location: (1,0)-(1,5)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (1,0)-(1,1) = "/" │ ├── content_loc: (1,1)-(1,4) = "wtf" │ ├── closing_loc: (1,4)-(1,5) = "/" │ └── unescaped: "wtf" ├── @ RegularExpressionNode (location: (3,0)-(3,6)) - │ ├── flags: multi_line + │ ├── flags: multi_line, forced_us_ascii_encoding │ ├── opening_loc: (3,0)-(3,1) = "/" │ ├── content_loc: (3,1)-(3,4) = "wtf" │ ├── closing_loc: (3,4)-(3,6) = "/m" │ └── unescaped: "wtf" ├── @ RegularExpressionNode (location: (5,0)-(5,6)) - │ ├── flags: ascii_8bit + │ ├── flags: ascii_8bit, forced_us_ascii_encoding │ ├── opening_loc: (5,0)-(5,1) = "/" │ ├── content_loc: (5,1)-(5,4) = "wtf" │ ├── closing_loc: (5,4)-(5,6) = "/n" │ └── unescaped: "wtf" ├── @ RegularExpressionNode (location: (7,0)-(7,7)) - │ ├── flags: multi_line, ascii_8bit + │ ├── flags: multi_line, ascii_8bit, forced_us_ascii_encoding │ ├── opening_loc: (7,0)-(7,1) = "/" │ ├── content_loc: (7,1)-(7,4) = "wtf" │ ├── closing_loc: (7,4)-(7,7) = "/nm" │ └── unescaped: "wtf" └── @ RegularExpressionNode (location: (9,0)-(9,13)) - ├── flags: multi_line, ascii_8bit + ├── flags: multi_line, ascii_8bit, forced_us_ascii_encoding ├── opening_loc: (9,0)-(9,1) = "/" ├── content_loc: (9,1)-(9,4) = "wtf" ├── closing_loc: (9,4)-(9,13) = "/nmnmnmnm" diff --git a/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt b/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt index b6bf2426129f0c..4dbedc44ca47b8 100644 --- a/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt +++ b/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt @@ -4,7 +4,7 @@ @ StatementsNode (location: (1,0)-(1,7)) └── body: (length: 1) └── @ RegularExpressionNode (location: (1,0)-(1,7)) - ├── flags: ∅ + ├── flags: forced_us_ascii_encoding ├── opening_loc: (1,0)-(1,1) = "/" ├── content_loc: (1,1)-(1,6) 
= "\\cC\\d" ├── closing_loc: (1,6)-(1,7) = "/" diff --git a/test/prism/snapshots/seattlerb/regexp_esc_u.txt b/test/prism/snapshots/seattlerb/regexp_esc_u.txt index 7b1ebdc636b7dc..bca451eb3b518f 100644 --- a/test/prism/snapshots/seattlerb/regexp_esc_u.txt +++ b/test/prism/snapshots/seattlerb/regexp_esc_u.txt @@ -4,7 +4,7 @@ @ StatementsNode (location: (1,0)-(1,17)) └── body: (length: 1) └── @ RegularExpressionNode (location: (1,0)-(1,17)) - ├── flags: ∅ + ├── flags: forced_us_ascii_encoding ├── opening_loc: (1,0)-(1,1) = "/" ├── content_loc: (1,1)-(1,16) = "[\\u0021-\\u0027]" ├── closing_loc: (1,16)-(1,17) = "/" diff --git a/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt b/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt index 8dd265af5f9405..487161b4d0d4e7 100644 --- a/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt +++ b/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt @@ -4,13 +4,13 @@ @ StatementsNode (location: (1,0)-(3,8)) └── body: (length: 2) ├── @ RegularExpressionNode (location: (1,0)-(1,15)) - │ ├── flags: ∅ + │ ├── flags: forced_utf8_encoding │ ├── opening_loc: (1,0)-(1,1) = "/" │ ├── content_loc: (1,1)-(1,14) = "\\u{c0de babe}" │ ├── closing_loc: (1,14)-(1,15) = "/" │ └── unescaped: "\\u{c0de babe}" └── @ RegularExpressionNode (location: (3,0)-(3,8)) - ├── flags: ∅ + ├── flags: forced_utf8_encoding ├── opening_loc: (3,0)-(3,1) = "/" ├── content_loc: (3,1)-(3,7) = "\\u{df}" ├── closing_loc: (3,7)-(3,8) = "/" diff --git a/test/prism/snapshots/spanning_heredoc_newlines.txt b/test/prism/snapshots/spanning_heredoc_newlines.txt index 171b0ff974c938..e3609ddbbab533 100644 --- a/test/prism/snapshots/spanning_heredoc_newlines.txt +++ b/test/prism/snapshots/spanning_heredoc_newlines.txt @@ -46,7 +46,7 @@ │ │ ├── flags: ∅ │ │ └── arguments: (length: 1) │ │ └── @ RegularExpressionNode (location: (5,4)-(8,0)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (5,4)-(6,0) = "%r\n" │ │ ├── 
content_loc: (6,0)-(6,0) = "" │ │ ├── closing_loc: (7,0)-(8,0) = "\n" diff --git a/test/prism/snapshots/unescaping.txt b/test/prism/snapshots/unescaping.txt index 00c5f59cd13e8e..456ef226d07f79 100644 --- a/test/prism/snapshots/unescaping.txt +++ b/test/prism/snapshots/unescaping.txt @@ -15,7 +15,7 @@ │ ├── opening_loc: (1,0)-(1,1) = "[" │ └── closing_loc: (1,9)-(1,10) = "]" ├── @ RegularExpressionNode (location: (3,0)-(3,8)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (3,0)-(3,1) = "/" │ ├── content_loc: (3,1)-(3,7) = "\\c\#{1}" │ ├── closing_loc: (3,7)-(3,8) = "/" diff --git a/test/prism/snapshots/unparser/corpus/literal/if.txt b/test/prism/snapshots/unparser/corpus/literal/if.txt index 6a78779dc9ca2b..00eeba179c6782 100644 --- a/test/prism/snapshots/unparser/corpus/literal/if.txt +++ b/test/prism/snapshots/unparser/corpus/literal/if.txt @@ -7,7 +7,7 @@ │ ├── if_keyword_loc: (1,0)-(1,2) = "if" │ ├── predicate: │ │ @ MatchLastLineNode (location: (1,3)-(1,8)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (1,3)-(1,4) = "/" │ │ ├── content_loc: (1,4)-(1,7) = "foo" │ │ ├── closing_loc: (1,7)-(1,8) = "/" diff --git a/test/prism/snapshots/unparser/corpus/literal/literal.txt b/test/prism/snapshots/unparser/corpus/literal/literal.txt index 8ed1bf5fe9bdd7..ba7dd70b5b717c 100644 --- a/test/prism/snapshots/unparser/corpus/literal/literal.txt +++ b/test/prism/snapshots/unparser/corpus/literal/literal.txt @@ -566,13 +566,13 @@ │ ├── closing_loc: (48,2)-(48,3) = "\"" │ └── unescaped: "" ├── @ RegularExpressionNode (location: (49,0)-(49,5)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (49,0)-(49,1) = "/" │ ├── content_loc: (49,1)-(49,4) = "foo" │ ├── closing_loc: (49,4)-(49,5) = "/" │ └── unescaped: "foo" ├── @ RegularExpressionNode (location: (50,0)-(50,28)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (50,0)-(50,1) = "/" │ ├── content_loc: 
(50,1)-(50,27) = "[^-+',.\\/:@[:alnum:]\\[\\]]+" │ ├── closing_loc: (50,27)-(50,28) = "/" @@ -633,25 +633,25 @@ │ │ └── closing_loc: (53,11)-(53,12) = "}" │ └── closing_loc: (53,12)-(53,13) = "/" ├── @ RegularExpressionNode (location: (54,0)-(54,4)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (54,0)-(54,1) = "/" │ ├── content_loc: (54,1)-(54,3) = "\\n" │ ├── closing_loc: (54,3)-(54,4) = "/" │ └── unescaped: "\\n" ├── @ RegularExpressionNode (location: (55,0)-(55,4)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (55,0)-(55,1) = "/" │ ├── content_loc: (55,1)-(55,3) = "\\n" │ ├── closing_loc: (55,3)-(55,4) = "/" │ └── unescaped: "\\n" ├── @ RegularExpressionNode (location: (56,0)-(56,5)) - │ ├── flags: extended + │ ├── flags: extended, forced_us_ascii_encoding │ ├── opening_loc: (56,0)-(56,1) = "/" │ ├── content_loc: (56,1)-(56,3) = "\\n" │ ├── closing_loc: (56,3)-(56,5) = "/x" │ └── unescaped: "\\n" ├── @ RegularExpressionNode (location: (57,0)-(57,7)) - │ ├── flags: extended + │ ├── flags: extended, forced_us_ascii_encoding │ ├── opening_loc: (57,0)-(57,1) = "/" │ ├── content_loc: (57,1)-(57,5) = "\\/\\/" │ ├── closing_loc: (57,5)-(57,7) = "/x" diff --git a/test/prism/snapshots/unparser/corpus/literal/send.txt b/test/prism/snapshots/unparser/corpus/literal/send.txt index 2fa4fd621b87f8..b7eb064717ae06 100644 --- a/test/prism/snapshots/unparser/corpus/literal/send.txt +++ b/test/prism/snapshots/unparser/corpus/literal/send.txt @@ -425,7 +425,7 @@ │ │ │ ├── flags: ∅ │ │ │ ├── receiver: │ │ │ │ @ RegularExpressionNode (location: (37,1)-(37,6)) - │ │ │ │ ├── flags: ∅ + │ │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ │ ├── opening_loc: (37,1)-(37,2) = "/" │ │ │ │ ├── content_loc: (37,2)-(37,5) = "bar" │ │ │ │ ├── closing_loc: (37,5)-(37,6) = "/" @@ -511,7 +511,7 @@ │ │ │ │ ├── flags: ∅ │ │ │ │ └── arguments: (length: 1) │ │ │ │ └── @ RegularExpressionNode (location: (39,8)-(39,13)) - │ │ │ │ ├── flags: ∅ + │ 
│ │ │ ├── flags: forced_us_ascii_encoding │ │ │ │ ├── opening_loc: (39,8)-(39,9) = "/" │ │ │ │ ├── content_loc: (39,9)-(39,12) = "bar" │ │ │ │ ├── closing_loc: (39,12)-(39,13) = "/" @@ -531,7 +531,7 @@ │ ├── flags: ∅ │ ├── receiver: │ │ @ RegularExpressionNode (location: (40,0)-(40,5)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (40,0)-(40,1) = "/" │ │ ├── content_loc: (40,1)-(40,4) = "bar" │ │ ├── closing_loc: (40,4)-(40,5) = "/" @@ -556,7 +556,7 @@ │ ├── flags: ∅ │ ├── receiver: │ │ @ RegularExpressionNode (location: (41,0)-(41,5)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (41,0)-(41,1) = "/" │ │ ├── content_loc: (41,1)-(41,4) = "bar" │ │ ├── closing_loc: (41,4)-(41,5) = "/" @@ -758,7 +758,7 @@ │ │ ├── flags: ∅ │ │ └── arguments: (length: 1) │ │ └── @ RegularExpressionNode (location: (49,7)-(49,12)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (49,7)-(49,8) = "/" │ │ ├── content_loc: (49,8)-(49,11) = "bar" │ │ ├── closing_loc: (49,11)-(49,12) = "/" @@ -1007,7 +1007,7 @@ │ │ │ ├── flags: ∅ │ │ │ └── arguments: (length: 1) │ │ │ └── @ RegularExpressionNode (location: (57,11)-(57,16)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (57,11)-(57,12) = "/" │ │ │ ├── content_loc: (57,12)-(57,15) = "bar" │ │ │ ├── closing_loc: (57,15)-(57,16) = "/" diff --git a/test/prism/snapshots/unparser/corpus/semantic/literal.txt b/test/prism/snapshots/unparser/corpus/semantic/literal.txt index 7f76e2f561faab..59e02be64fcfd2 100644 --- a/test/prism/snapshots/unparser/corpus/semantic/literal.txt +++ b/test/prism/snapshots/unparser/corpus/semantic/literal.txt @@ -31,13 +31,13 @@ │ ├── closing_loc: ∅ │ └── unescaped: "c" ├── @ RegularExpressionNode (location: (9,0)-(9,5)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (9,0)-(9,3) = "%r(" │ ├── content_loc: (9,3)-(9,4) = "/" │ ├── closing_loc: (9,4)-(9,5) 
= ")" │ └── unescaped: "/" ├── @ RegularExpressionNode (location: (10,0)-(10,6)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (10,0)-(10,3) = "%r(" │ ├── content_loc: (10,3)-(10,5) = "\\)" │ ├── closing_loc: (10,5)-(10,6) = ")" diff --git a/test/prism/snapshots/whitequark/bug_regex_verification.txt b/test/prism/snapshots/whitequark/bug_regex_verification.txt index 5ca85e34c7f101..4464b66e38bf23 100644 --- a/test/prism/snapshots/whitequark/bug_regex_verification.txt +++ b/test/prism/snapshots/whitequark/bug_regex_verification.txt @@ -4,7 +4,7 @@ @ StatementsNode (location: (1,0)-(1,5)) └── body: (length: 1) └── @ RegularExpressionNode (location: (1,0)-(1,5)) - ├── flags: extended + ├── flags: extended, forced_us_ascii_encoding ├── opening_loc: (1,0)-(1,1) = "/" ├── content_loc: (1,1)-(1,3) = "#)" ├── closing_loc: (1,3)-(1,5) = "/x" diff --git a/test/prism/snapshots/whitequark/cond_match_current_line.txt b/test/prism/snapshots/whitequark/cond_match_current_line.txt index fb5ff33ed52094..700d0966f7cdac 100644 --- a/test/prism/snapshots/whitequark/cond_match_current_line.txt +++ b/test/prism/snapshots/whitequark/cond_match_current_line.txt @@ -7,7 +7,7 @@ │ ├── flags: ∅ │ ├── receiver: │ │ @ MatchLastLineNode (location: (1,1)-(1,6)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (1,1)-(1,2) = "/" │ │ ├── content_loc: (1,2)-(1,5) = "wat" │ │ ├── closing_loc: (1,5)-(1,6) = "/" @@ -23,7 +23,7 @@ ├── if_keyword_loc: (3,0)-(3,2) = "if" ├── predicate: │ @ MatchLastLineNode (location: (3,3)-(3,8)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (3,3)-(3,4) = "/" │ ├── content_loc: (3,4)-(3,7) = "wat" │ ├── closing_loc: (3,7)-(3,8) = "/" diff --git a/test/prism/snapshots/whitequark/interp_digit_var.txt b/test/prism/snapshots/whitequark/interp_digit_var.txt index 6c34760bc35320..09d90981058e0b 100644 --- a/test/prism/snapshots/whitequark/interp_digit_var.txt +++ 
b/test/prism/snapshots/whitequark/interp_digit_var.txt @@ -106,13 +106,13 @@ │ ├── closing_loc: (23,8)-(23,9) = "}" │ └── unescaped: "\#@@1" ├── @ RegularExpressionNode (location: (25,1)-(25,8)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (25,1)-(25,4) = "%r{" │ ├── content_loc: (25,4)-(25,7) = "\#@1" │ ├── closing_loc: (25,7)-(25,8) = "}" │ └── unescaped: "\#@1" ├── @ RegularExpressionNode (location: (27,1)-(27,9)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (27,1)-(27,4) = "%r{" │ ├── content_loc: (27,4)-(27,8) = "\#@@1" │ ├── closing_loc: (27,8)-(27,9) = "}" @@ -188,13 +188,13 @@ │ ├── closing_loc: (47,6)-(47,7) = "'" │ └── unescaped: "\#@@1" ├── @ RegularExpressionNode (location: (49,1)-(49,6)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (49,1)-(49,2) = "/" │ ├── content_loc: (49,2)-(49,5) = "\#@1" │ ├── closing_loc: (49,5)-(49,6) = "/" │ └── unescaped: "\#@1" ├── @ RegularExpressionNode (location: (51,1)-(51,7)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (51,1)-(51,2) = "/" │ ├── content_loc: (51,2)-(51,6) = "\#@@1" │ ├── closing_loc: (51,6)-(51,7) = "/" diff --git a/test/prism/snapshots/whitequark/lvar_injecting_match.txt b/test/prism/snapshots/whitequark/lvar_injecting_match.txt index a1d70e9ccfe8ee..0d1df23d0d9ed5 100644 --- a/test/prism/snapshots/whitequark/lvar_injecting_match.txt +++ b/test/prism/snapshots/whitequark/lvar_injecting_match.txt @@ -9,7 +9,7 @@ │ │ ├── flags: ∅ │ │ ├── receiver: │ │ │ @ RegularExpressionNode (location: (1,0)-(1,15)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: (1,0)-(1,1) = "/" │ │ │ ├── content_loc: (1,1)-(1,14) = "(?bar)" │ │ │ ├── closing_loc: (1,14)-(1,15) = "/" diff --git a/test/prism/snapshots/whitequark/parser_bug_830.txt b/test/prism/snapshots/whitequark/parser_bug_830.txt index e920108731e0cc..e52b291d6a1d22 100644 --- 
a/test/prism/snapshots/whitequark/parser_bug_830.txt +++ b/test/prism/snapshots/whitequark/parser_bug_830.txt @@ -4,7 +4,7 @@ @ StatementsNode (location: (1,0)-(1,4)) └── body: (length: 1) └── @ RegularExpressionNode (location: (1,0)-(1,4)) - ├── flags: ∅ + ├── flags: forced_us_ascii_encoding ├── opening_loc: (1,0)-(1,1) = "/" ├── content_loc: (1,1)-(1,3) = "\\(" ├── closing_loc: (1,3)-(1,4) = "/" diff --git a/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt b/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt index 96cc5671a61482..080d4d0e7d4437 100644 --- a/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt +++ b/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt @@ -55,7 +55,7 @@ │ ├── closing_loc: (17,1)-(17,2) = "}" │ └── unescaped: "a\\\nb" ├── @ RegularExpressionNode (location: (19,0)-(20,2)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (19,0)-(19,3) = "%r{" │ ├── content_loc: (19,3)-(20,1) = "a\\\nb" │ ├── closing_loc: (20,1)-(20,2) = "}" @@ -96,7 +96,7 @@ │ ├── closing_loc: (35,1)-(35,2) = "'" │ └── unescaped: "a\\\nb" ├── @ RegularExpressionNode (location: (37,0)-(38,2)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: (37,0)-(37,1) = "/" │ ├── content_loc: (37,1)-(38,1) = "a\\\nb" │ ├── closing_loc: (38,1)-(38,2) = "/" diff --git a/test/prism/snapshots/whitequark/regex_plain.txt b/test/prism/snapshots/whitequark/regex_plain.txt index 34fe61c687daaf..df771f7a21d9ba 100644 --- a/test/prism/snapshots/whitequark/regex_plain.txt +++ b/test/prism/snapshots/whitequark/regex_plain.txt @@ -4,7 +4,7 @@ @ StatementsNode (location: (1,0)-(1,10)) └── body: (length: 1) └── @ RegularExpressionNode (location: (1,0)-(1,10)) - ├── flags: ignore_case, multi_line + ├── flags: ignore_case, multi_line, forced_us_ascii_encoding ├── opening_loc: (1,0)-(1,1) = "/" ├── content_loc: (1,1)-(1,7) = "source" ├── 
closing_loc: (1,7)-(1,10) = "/im" diff --git a/test/prism/snapshots/whitequark/ruby_bug_11873.txt b/test/prism/snapshots/whitequark/ruby_bug_11873.txt index af04f59b5e7e08..2999662cc43bfa 100644 --- a/test/prism/snapshots/whitequark/ruby_bug_11873.txt +++ b/test/prism/snapshots/whitequark/ruby_bug_11873.txt @@ -112,7 +112,7 @@ │ │ │ ├── closing_loc: (3,7)-(3,8) = ")" │ │ │ └── block: ∅ │ │ └── @ RegularExpressionNode (location: (3,10)-(3,13)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (3,10)-(3,11) = "/" │ │ ├── content_loc: (3,11)-(3,12) = "x" │ │ ├── closing_loc: (3,12)-(3,13) = "/" @@ -173,7 +173,7 @@ │ │ │ ├── closing_loc: (5,7)-(5,8) = ")" │ │ │ └── block: ∅ │ │ └── @ RegularExpressionNode (location: (5,10)-(5,14)) - │ │ ├── flags: multi_line + │ │ ├── flags: multi_line, forced_us_ascii_encoding │ │ ├── opening_loc: (5,10)-(5,11) = "/" │ │ ├── content_loc: (5,11)-(5,12) = "x" │ │ ├── closing_loc: (5,12)-(5,14) = "/m" @@ -295,7 +295,7 @@ │ │ │ ├── closing_loc: (9,8)-(9,9) = ")" │ │ │ └── block: ∅ │ │ └── @ RegularExpressionNode (location: (9,11)-(9,14)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (9,11)-(9,12) = "/" │ │ ├── content_loc: (9,12)-(9,13) = "x" │ │ ├── closing_loc: (9,13)-(9,14) = "/" @@ -356,7 +356,7 @@ │ │ │ ├── closing_loc: (11,8)-(11,9) = ")" │ │ │ └── block: ∅ │ │ └── @ RegularExpressionNode (location: (11,11)-(11,15)) - │ │ ├── flags: multi_line + │ │ ├── flags: multi_line, forced_us_ascii_encoding │ │ ├── opening_loc: (11,11)-(11,12) = "/" │ │ ├── content_loc: (11,12)-(11,13) = "x" │ │ ├── closing_loc: (11,13)-(11,15) = "/m" @@ -488,7 +488,7 @@ │ │ │ ├── opening_loc: (15,3)-(15,4) = "{" │ │ │ └── closing_loc: (15,7)-(15,8) = "}" │ │ └── @ RegularExpressionNode (location: (15,10)-(15,13)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (15,10)-(15,11) = "/" │ │ ├── content_loc: (15,11)-(15,12) = "x" │ │ ├── closing_loc: 
(15,12)-(15,13) = "/" @@ -554,7 +554,7 @@ │ │ │ ├── opening_loc: (17,3)-(17,4) = "{" │ │ │ └── closing_loc: (17,7)-(17,8) = "}" │ │ └── @ RegularExpressionNode (location: (17,10)-(17,14)) - │ │ ├── flags: multi_line + │ │ ├── flags: multi_line, forced_us_ascii_encoding │ │ ├── opening_loc: (17,10)-(17,11) = "/" │ │ ├── content_loc: (17,11)-(17,12) = "x" │ │ ├── closing_loc: (17,12)-(17,14) = "/m" @@ -686,7 +686,7 @@ │ │ │ ├── opening_loc: (21,3)-(21,4) = "{" │ │ │ └── closing_loc: (21,8)-(21,9) = "}" │ │ └── @ RegularExpressionNode (location: (21,11)-(21,14)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: (21,11)-(21,12) = "/" │ │ ├── content_loc: (21,12)-(21,13) = "x" │ │ ├── closing_loc: (21,13)-(21,14) = "/" @@ -752,7 +752,7 @@ │ │ ├── opening_loc: (23,3)-(23,4) = "{" │ │ └── closing_loc: (23,8)-(23,9) = "}" │ └── @ RegularExpressionNode (location: (23,11)-(23,15)) - │ ├── flags: multi_line + │ ├── flags: multi_line, forced_us_ascii_encoding │ ├── opening_loc: (23,11)-(23,12) = "/" │ ├── content_loc: (23,12)-(23,13) = "x" │ ├── closing_loc: (23,13)-(23,15) = "/m"