[ruby/prism] Provide flags for changing encodings

ruby/prism@e838eaff6f
ruby · Dec 6, 2023 · 82f18ba · 82f18ba
1 parent 9620ca6
commit 82f18ba
Show file tree

Hide file tree

Showing 31 changed files with 455 additions and 226 deletions.
diff --git a/prism/config.yml b/prism/config.yml
@@ -346,6 +346,13 @@ flags:
       - name: VARIABLE_CALL
         comment: "a call that could have been a local variable"
     comment: Flags for call nodes.
+  - name: EncodingFlags
+    values:
+      - name: FORCED_UTF8_ENCODING
+        comment: "internal bytes forced the encoding to UTF-8"
+      - name: FORCED_BINARY_ENCODING
+        comment: "internal bytes forced the encoding to binary"
+    comment: Flags for nodes that have unescaped content.
   - name: IntegerBaseFlags
     values:
       - name: BINARY
@@ -388,6 +395,10 @@ flags:
     comment: Flags for regular expression and match last line nodes.
   - name: StringFlags
     values:
+      - name: FORCED_UTF8_ENCODING
+        comment: "internal bytes forced the encoding to UTF-8"
+      - name: FORCED_BINARY_ENCODING
+        comment: "internal bytes forced the encoding to binary"
       - name: FROZEN
         comment: "frozen by virtue of a `frozen_string_literal` comment"
     comment: Flags for string nodes.
@@ -2576,6 +2587,9 @@ nodes:
           ^^^^^^^^^^^^^^^^^^^^
   - name: XStringNode
     fields:
+      - name: flags
+        type: flags
+        kind: EncodingFlags
       - name: opening_loc
         type: location
       - name: content_loc

diff --git a/prism/defines.h b/prism/defines.h
@@ -74,4 +74,21 @@
 #   define snprintf _snprintf
 #endif
 
+/**
+ * A simple utility macro to concatenate two tokens together, necessary when one
+ * of the tokens is itself a macro.
+ */
+#define PM_CONCATENATE(left, right) left ## right
+
+/**
+ * Static assertions were only standardized in C11, where _Static_assert is a
+ * keyword (not a macro), so we detect support through __STDC_VERSION__. For
+ * older standards we polyfill with a typedef of a negatively-sized array.
+ */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#   define PM_STATIC_ASSERT(line, condition, message) _Static_assert(condition, message)
+#else
+#   define PM_STATIC_ASSERT(line, condition, message) typedef char PM_CONCATENATE(static_assert_, line)[(condition) ? 1 : -1]
+#endif
+
 #endif
diff --git a/prism/diagnostic.c b/prism/diagnostic.c
@@ -185,6 +185,7 @@ static const char* const diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
     [PM_ERR_LIST_W_UPPER_ELEMENT]               = "expected a string in a `%W` list",
     [PM_ERR_LIST_W_UPPER_TERM]                  = "expected a closing delimiter for the `%W` list",
     [PM_ERR_MALLOC_FAILED]                      = "failed to allocate memory",
+    [PM_ERR_MIXED_ENCODING]                     = "UTF-8 mixed within %s source",
     [PM_ERR_MODULE_IN_METHOD]                   = "unexpected module definition in a method definition",
     [PM_ERR_MODULE_NAME]                        = "expected a constant name after `module`",
     [PM_ERR_MODULE_TERM]                        = "expected an `end` to close the `module` statement",

diff --git a/prism/diagnostic.h b/prism/diagnostic.h
@@ -177,6 +177,7 @@ typedef enum {
     PM_ERR_LIST_W_UPPER_ELEMENT,
     PM_ERR_LIST_W_UPPER_TERM,
     PM_ERR_MALLOC_FAILED,
+    PM_ERR_MIXED_ENCODING,
     PM_ERR_MODULE_IN_METHOD,
     PM_ERR_MODULE_NAME,
     PM_ERR_MODULE_TERM,

diff --git a/prism/encoding.c b/prism/encoding.c
@@ -4212,9 +4212,9 @@ pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
 }
 
 /**
- * This is the definition of all of the encodings that we support.
+ * This is the table of all of the encodings that prism supports.
  */
-static const pm_encoding_t pm_encodings[] = {
+const pm_encoding_t pm_encodings[] = {
     [PM_ENCODING_UTF_8] = {
         .name = "UTF-8",
         .char_width = pm_encoding_utf_8_char_width,
@@ -4223,14 +4223,6 @@ static const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_utf_8_isupper_char,
         .multibyte = true
     },
-    [PM_ENCODING_ASCII] = {
-        .name = "US-ASCII",
-        .char_width = pm_encoding_ascii_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char,
-        .alpha_char = pm_encoding_ascii_alpha_char,
-        .isupper_char = pm_encoding_ascii_isupper_char,
-        .multibyte = false
-    },
     [PM_ENCODING_ASCII_8BIT] = {
         .name = "ASCII-8BIT",
         .char_width = pm_encoding_single_char_width,
@@ -4815,6 +4807,14 @@ static const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_tis_620_isupper_char,
         .multibyte = false
     },
+    [PM_ENCODING_US_ASCII] = {
+        .name = "US-ASCII",
+        .char_width = pm_encoding_ascii_char_width,
+        .alnum_char = pm_encoding_ascii_alnum_char,
+        .alpha_char = pm_encoding_ascii_alpha_char,
+        .isupper_char = pm_encoding_ascii_isupper_char,
+        .multibyte = false
+    },
     [PM_ENCODING_UTF8_MAC] = {
         .name = "UTF8-MAC",
         .char_width = pm_encoding_utf_8_char_width,
@@ -4937,11 +4937,6 @@ static const pm_encoding_t pm_encodings[] = {
     }
 };
 
-/**
- * This is the default UTF-8 encoding. We need it to quickly create parsers.
- */
-const pm_encoding_t *pm_encoding_utf_8 = pm_encodings;
-
 /**
  * Parse the given name of an encoding and return a pointer to the corresponding
  * encoding struct if one can be found, otherwise return NULL.
@@ -4961,7 +4956,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
         }
 
         // Otherwise we'll return the default UTF-8 encoding.
-        return pm_encoding_utf_8;
+        return PM_ENCODING_UTF_8_ENTRY;
     }
 
     // Next, we're going to loop through each of the encodings that we handle
@@ -4972,9 +4967,9 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
     if (width >= 3) {
         switch (*start) {
             case 'A': case 'a':
-                ENCODING1("ASCII", PM_ENCODING_ASCII);
+                ENCODING1("ASCII", PM_ENCODING_US_ASCII);
                 ENCODING1("ASCII-8BIT", PM_ENCODING_ASCII_8BIT);
-                ENCODING1("ANSI_X3.4-1968", PM_ENCODING_ASCII);
+                ENCODING1("ANSI_X3.4-1968", PM_ENCODING_US_ASCII);
                 break;
             case 'B': case 'b':
                 ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
@@ -5109,7 +5104,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("TIS-620", PM_ENCODING_TIS_620);
                 break;
             case 'U': case 'u':
-                ENCODING1("US-ASCII", PM_ENCODING_ASCII);
+                ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
                 ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
                 ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
                 ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
@@ -5129,7 +5124,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
                 break;
             case '6':
-                ENCODING1("646", PM_ENCODING_ASCII);
+                ENCODING1("646", PM_ENCODING_US_ASCII);
                 break;
         }
     }

diff --git a/prism/encoding.h b/prism/encoding.h
@@ -125,7 +125,6 @@ extern const uint8_t pm_encoding_unicode_table[256];
  */
 typedef enum {
     PM_ENCODING_UTF_8 = 0,
-    PM_ENCODING_ASCII,
     PM_ENCODING_ASCII_8BIT,
     PM_ENCODING_BIG5,
     PM_ENCODING_BIG5_HKSCS,
@@ -199,6 +198,7 @@ typedef enum {
     PM_ENCODING_STATELESS_ISO_2022_JP,
     PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
     PM_ENCODING_TIS_620,
+    PM_ENCODING_US_ASCII,
     PM_ENCODING_UTF8_MAC,
     PM_ENCODING_UTF8_DOCOMO,
     PM_ENCODING_UTF8_KDDI,
@@ -213,13 +213,27 @@ typedef enum {
     PM_ENCODING_WINDOWS_1257,
     PM_ENCODING_WINDOWS_1258,
     PM_ENCODING_WINDOWS_31J,
-    PM_ENCODING_WINDOWS_874
+    PM_ENCODING_WINDOWS_874,
+    PM_ENCODING_MAXIMUM
 } pm_encoding_type_t;
 
 /**
- * This is the default UTF-8 encoding. We need it to quickly create parsers.
+ * This is the table of all of the encodings that prism supports.
  */
-extern const pm_encoding_t *pm_encoding_utf_8;
+extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
+
+/**
+ * This is the default UTF-8 encoding (its entry in pm_encodings). We need a
+ * reference to it to quickly create parsers.
+ */
+#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])
+
+/**
+ * This is the US-ASCII entry of pm_encodings. We need a reference to it to be
+ * able to compare against it when a string is being created because it could
+ * possibly need to fall back to ASCII-8BIT.
+ */
+#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
 
 /**
  * Parse the given name of an encoding and return a pointer to the corresponding

diff --git a/prism/extension.c b/prism/extension.c
@@ -469,7 +469,7 @@ parse_lex_token(void *data, pm_parser_t *parser, pm_token_t *token) {
 static void
 parse_lex_encoding_changed_callback(pm_parser_t *parser) {
     parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data;
-    parse_lex_data->encoding = rb_enc_find(parser->encoding.name);
+    parse_lex_data->encoding = rb_enc_find(parser->encoding->name);
 
     // Since the encoding changed, we need to go back and change the encoding of
     // the tokens that were already lexed. This is only going to end up being
@@ -599,7 +599,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
     pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
 
     pm_node_t *node = pm_parse(&parser);
-    rb_encoding *encoding = rb_enc_find(parser.encoding.name);
+    rb_encoding *encoding = rb_enc_find(parser.encoding->name);
 
     VALUE source = pm_source_new(&parser, encoding);
     VALUE result_argv[] = {
@@ -693,7 +693,7 @@ parse_input_comments(pm_string_t *input, const pm_options_t *options) {
     pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
 
     pm_node_t *node = pm_parse(&parser);
-    rb_encoding *encoding = rb_enc_find(parser.encoding.name);
+    rb_encoding *encoding = rb_enc_find(parser.encoding->name);
 
     VALUE source = pm_source_new(&parser, encoding);
     VALUE comments = parser_comments(&parser, source);
@@ -872,7 +872,7 @@ static VALUE
 named_captures(VALUE self, VALUE source) {
     pm_string_list_t string_list = { 0 };
 
-    if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, pm_encoding_utf_8)) {
+    if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, PM_ENCODING_UTF_8_ENTRY)) {
         pm_string_list_free(&string_list);
         return Qnil;
     }
@@ -962,7 +962,7 @@ inspect_node(VALUE self, VALUE source) {
 
     pm_prettyprint(&buffer, &parser, node);
 
-    rb_encoding *encoding = rb_enc_find(parser.encoding.name);
+    rb_encoding *encoding = rb_enc_find(parser.encoding->name);
     VALUE string = rb_enc_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer), encoding);
 
     pm_buffer_free(&buffer);

diff --git a/prism/parser.h b/prism/parser.h
@@ -523,12 +523,6 @@ struct pm_parser {
         size_t index;
     } lex_modes;
 
-    /**
-     * The common_whitespace value from the most-recently-popped heredoc mode of the lexer, so we
-     * can dedent the heredoc after popping the lex mode.
-     */
-    size_t current_string_common_whitespace;
-
     /** The pointer to the start of the source. */
     const uint8_t *start;
 
@@ -581,7 +575,7 @@ struct pm_parser {
      * The encoding functions for the current file is attached to the parser as
      * it's parsing so that it can change with a magic comment.
      */
-    pm_encoding_t encoding;
+    const pm_encoding_t *encoding;
 
     /**
      * When the encoding that is being used to parse the source is changed by
@@ -637,6 +631,37 @@ struct pm_parser {
      */
     int32_t start_line;
 
+    /**
+     * When a string-like expression is being lexed, any byte or escape sequence
+     * that resolves to a value whose top bit is set (i.e., >= 0x80) will
+     * explicitly set the encoding to the same encoding as the source.
+     * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
+     * resolves to a value whose top bit is set, then the encoding will be
+     * explicitly set to UTF-8.
+     *
+     * The _next_ time this happens, if the encoding that is about to become the
+     * explicitly set encoding does not match the previously set explicit
+     * encoding, a mixed encoding error will be emitted.
+     *
+     * When the expression is finished being lexed, the explicit encoding
+     * controls the encoding of the expression. For the most part this means
+     * that the expression will either be encoded in the source encoding or
+     * UTF-8. This holds for all encodings except US-ASCII. If the source is
+     * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
+     * expression will be encoded as ASCII-8BIT.
+     *
+     * Note that if the expression is a list, different elements within the same
+     * list can have different encodings, so this will get reset between each
+     * element. Furthermore all of this only applies to lists that support
+     * interpolation, because otherwise escapes that could change the encoding
+     * are ignored.
+     *
+     * At first glance, it may make more sense for this to live on the lexer
+     * mode, but we need it here to communicate back to the parser for character
+     * literals that do not push a new lexer mode.
+     */
+    const pm_encoding_t *explicit_encoding;
+
     /** Whether or not we're at the beginning of a command. */
     bool command_start;