Add initial implementation of Regexp validation.

nirvdrum · kddnewton · commit 6bf1b8edf083 · 2024-03-07T18:15:35.000-05:00
diff --git a/config.yml b/config.yml
@@ -1,5 +1,4 @@
 errors:
-  - CANNOT_PARSE_EXPRESSION
   - ALIAS_ARGUMENT
   - AMPAMPEQ_MULTI_ASSIGN
   - ARGUMENT_AFTER_BLOCK
@@ -34,6 +33,7 @@ errors:
   - BLOCK_PARAM_PIPE_TERM
   - BLOCK_TERM_BRACE
   - BLOCK_TERM_END
+  - CANNOT_PARSE_EXPRESSION
   - CANNOT_PARSE_STRING_PART
   - CASE_EXPRESSION_AFTER_CASE
   - CASE_EXPRESSION_AFTER_WHEN
@@ -82,13 +82,13 @@ errors:
   - EXPECT_ARGUMENT
   - EXPECT_EOL_AFTER_STATEMENT
   - EXPECT_EXPRESSION_AFTER_AMPAMPEQ
-  - EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ
   - EXPECT_EXPRESSION_AFTER_COMMA
   - EXPECT_EXPRESSION_AFTER_EQUAL
   - EXPECT_EXPRESSION_AFTER_LESS_LESS
   - EXPECT_EXPRESSION_AFTER_LPAREN
-  - EXPECT_EXPRESSION_AFTER_QUESTION
   - EXPECT_EXPRESSION_AFTER_OPERATOR
+  - EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ
+  - EXPECT_EXPRESSION_AFTER_QUESTION
   - EXPECT_EXPRESSION_AFTER_SPLAT
   - EXPECT_EXPRESSION_AFTER_SPLAT_HASH
   - EXPECT_EXPRESSION_AFTER_STAR
@@ -113,23 +113,25 @@ errors:
   - HASH_VALUE
   - HEREDOC_TERM
   - INCOMPLETE_QUESTION_MARK
-  - INCOMPLETE_VARIABLE_CLASS_3_3_0
   - INCOMPLETE_VARIABLE_CLASS
-  - INCOMPLETE_VARIABLE_INSTANCE_3_3_0
+  - INCOMPLETE_VARIABLE_CLASS_3_3_0
   - INCOMPLETE_VARIABLE_INSTANCE
+  - INCOMPLETE_VARIABLE_INSTANCE_3_3_0
+  - INVALID_CHARACTER
   - INVALID_ENCODING_MAGIC_COMMENT
   - INVALID_FLOAT_EXPONENT
+  - INVALID_MULTIBYTE_CHAR
+  - INVALID_MULTIBYTE_CHARACTER
+  - INVALID_MULTIBYTE_ESCAPE
   - INVALID_NUMBER_BINARY
   - INVALID_NUMBER_DECIMAL
   - INVALID_NUMBER_HEXADECIMAL
   - INVALID_NUMBER_OCTAL
   - INVALID_NUMBER_UNDERSCORE
-  - INVALID_CHARACTER
-  - INVALID_MULTIBYTE_CHARACTER
-  - INVALID_PRINTABLE_CHARACTER
   - INVALID_PERCENT
-  - INVALID_VARIABLE_GLOBAL_3_3_0
+  - INVALID_PRINTABLE_CHARACTER
   - INVALID_VARIABLE_GLOBAL
+  - INVALID_VARIABLE_GLOBAL_3_3_0
   - IT_NOT_ALLOWED_NUMBERED
   - IT_NOT_ALLOWED_ORDINARY
   - LAMBDA_OPEN
@@ -150,8 +152,8 @@ errors:
   - MODULE_TERM
   - MULTI_ASSIGN_MULTI_SPLATS
   - MULTI_ASSIGN_UNEXPECTED_REST
-  - NOT_EXPRESSION
   - NO_LOCAL_VARIABLE
+  - NOT_EXPRESSION
   - NUMBER_LITERAL_UNDERSCORE
   - NUMBERED_PARAMETER_IT
   - NUMBERED_PARAMETER_ORDINARY
@@ -173,8 +175,8 @@ errors:
   - PARAMETER_UNEXPECTED_FWD
   - PARAMETER_WILD_LOOSE_COMMA
   - PATTERN_EXPRESSION_AFTER_BRACKET
-  - PATTERN_EXPRESSION_AFTER_HROCKET
   - PATTERN_EXPRESSION_AFTER_COMMA
+  - PATTERN_EXPRESSION_AFTER_HROCKET
   - PATTERN_EXPRESSION_AFTER_IN
   - PATTERN_EXPRESSION_AFTER_KEY
   - PATTERN_EXPRESSION_AFTER_PAREN
@@ -191,7 +193,12 @@ errors:
   - PATTERN_TERM_BRACKET
   - PATTERN_TERM_PAREN
   - PIPEPIPEEQ_MULTI_ASSIGN
+  - REGEXP_ENCODING_OPTION_MISMATCH
+  - REGEXP_INCOMPAT_CHAR_ENCODING
+  - REGEXP_INVALID_UNICODE_RANGE
+  - REGEXP_NON_ESCAPED_MBC
   - REGEXP_TERM
+  - REGEXP_UTF8_CHAR_NON_UTF8_REGEXP
   - RESCUE_EXPRESSION
   - RESCUE_MODIFIER_VALUE
   - RESCUE_TERM
@@ -213,9 +220,9 @@ errors:
   - TERNARY_EXPRESSION_FALSE
   - TERNARY_EXPRESSION_TRUE
   - UNARY_RECEIVER
+  - UNDEF_ARGUMENT
   - UNEXPECTED_TOKEN_CLOSE_CONTEXT
   - UNEXPECTED_TOKEN_IGNORE
-  - UNDEF_ARGUMENT
   - UNTIL_TERM
   - VOID_EXPRESSION
   - WHILE_TERM
diff --git a/include/prism/encoding.h b/include/prism/encoding.h
@@ -252,6 +252,18 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
  */
 #define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
 
+/**
+ * This is the EUC-JP encoding. We need a reference to it to quickly process
+ * regular expression modifiers.
+ */
+#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP])
+
+/**
+ * This is the Windows-31J encoding. We need a reference to it to quickly
+ * process regular expression modifiers.
+ */
+#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J])
+
 /**
  * Parse the given name of an encoding and return a pointer to the corresponding
  * encoding struct if one can be found, otherwise return NULL.
diff --git a/src/prism.c b/src/prism.c
@@ -5951,16 +5951,112 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
     return 0;
 }
 
+/**
+ * Validate a regular expression's encoding modifier (n, u, e, or s) against the source encoding and any encoding made
+ * explicit by character escapes, reporting conflicts as parser errors. Returns the node flags the caller should set.
+ */
+static inline pm_node_flags_t
+parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) {
+    assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) ||
+            (modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) ||
+            (modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) ||
+            (modifier == 's' && modifier_encoding == PM_ENCODING_WINDOWS_31J_ENTRY));
+
+    // There's special validation logic used if a string does not contain any character escape sequences.
+    if (parser->explicit_encoding == NULL) {
+        // ASCII-only contents keep the modifier's encoding, except that /n "downgrades" the Regexp to US-ASCII.
+        if (pm_ascii_only_p(contents)) {
+            return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
+        }
+
+        // Past this point pm_ascii_only_p(contents) is known to be false, so it does not need to be re-checked.
+        if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+            PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+        } else if (parser->encoding != modifier_encoding) {
+            PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);
+            // NOTE(review): "%s" needs a NUL-terminated char *; confirm pm_string_source(source) guarantees that here.
+            if (modifier == 'n') {
+                PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, pm_string_source(source));
+            }
+        }
+
+        return flags;
+    }
+
+    // TODO (nirvdrum 21-Feb-2024): To validate regexp sources with character escape sequences we need to know whether hex or Unicode escape sequences were used and Prism doesn't currently provide that data. We handle a subset of unambiguous cases in the meanwhile.
+    bool mixed_encoding = false;
+
+    if (mixed_encoding) {
+        PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source));
+    } else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
+        // TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily.
+        bool valid_string_in_modifier_encoding = true;
+
+        if (!valid_string_in_modifier_encoding) {
+            PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source));
+        }
+    } else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+        // TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now.
+        if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
+            PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, pm_string_source(source));
+        }
+    }
+
+    // No forced-encoding flag is added here; the flags (including the encoding modifier flag) are returned as given.
+    return flags;
+}
+
 /**
  * Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and
  * the unescaped representation of a Regexp source consists only of US-ASCII code points. This is true even
  * when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding
  * may be explicitly set with an escape sequence.
  */
 static inline pm_node_flags_t
-parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
-    // Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all regular expressions
-    // appearing in source are eligible for "downgrading" to US-ASCII.
+parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags) {
+    // TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report.
+    bool valid_unicode_range = true;
+    if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) {
+        PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, pm_string_source(source));
+
+        return flags;
+    }
+
+    // US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding
+    // to multi-byte characters are allowed.
+    if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !pm_ascii_only_p(contents)) {
+        // CRuby will continue processing even though a SyntaxError has already been detected. It may result in the
+        // following error message appearing twice. We do the same for compatibility.
+        PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+    }
+
+    /**
+     * Start checking modifier flags. We need to process these before considering any explicit encodings that may have
+     * been set by character literals. The order in which the encoding modifiers are checked does not matter. In the
+     * event that both an encoding modifier and an explicit encoding would result in the same encoding we do not set
+     * the corresponding "forced_<encoding>" flag. Instead, the caller should check the encoding modifier flag and
+     * determine the encoding that way.
+     */
+
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
+        return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY);
+    }
+
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
+        return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'u', PM_ENCODING_UTF_8_ENTRY);
+    }
+
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
+        return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'e', PM_ENCODING_EUC_JP_ENTRY);
+    }
+
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
+        return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY);
+    }
+
+    // At this point no encoding modifiers will be present on the regular expression as they would have already
+    // been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all
+    // regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII.
     if (pm_ascii_only_p(contents)) {
         return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
     }
@@ -5976,6 +6072,7 @@ parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t *
             return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
         }
     }
+
     return 0;
 }
 
@@ -17030,7 +17127,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                 // more easily compiled.
                 if (accept1(parser, PM_TOKEN_REGEXP_END)) {
                     pm_node_t *regular_expression_node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source);
-                    pm_node_flag_set(regular_expression_node, parse_regular_expression_encoding(parser, &unescaped));
+                    pm_node_flag_set(regular_expression_node, parse_and_validate_regular_expression_encoding(parser, &source, &unescaped, regular_expression_node->flags));
                     return regular_expression_node;
                 }
 
diff --git a/templates/src/diagnostic.c.erb b/templates/src/diagnostic.c.erb
@@ -204,7 +204,9 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
     [PM_ERR_INVALID_NUMBER_OCTAL]               = { "invalid octal number", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_INVALID_NUMBER_UNDERSCORE]          = { "invalid underscore placement in number", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_INVALID_CHARACTER]                  = { "invalid character 0x%X", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_INVALID_MULTIBYTE_CHAR]             = { "invalid multibyte char (%s)", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_INVALID_MULTIBYTE_CHARACTER]        = { "invalid multibyte character 0x%X", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_INVALID_MULTIBYTE_ESCAPE]           = { "invalid multibyte escape: /%s/", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_INVALID_PRINTABLE_CHARACTER]        = { "invalid character `%c`", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_INVALID_PERCENT]                    = { "invalid `%` token", PM_ERROR_LEVEL_FATAL }, // TODO WHAT?
     [PM_ERR_INVALID_VARIABLE_GLOBAL_3_3_0]      = { "`%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_FATAL },
@@ -270,7 +272,12 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
     [PM_ERR_PATTERN_TERM_BRACKET]               = { "expected a `]` to close the pattern expression", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_PATTERN_TERM_PAREN]                 = { "expected a `)` to close the pattern expression", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN]            = { "unexpected `||=` in a multiple assignment", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH]    = { "regexp encoding option '%c' differs from source encoding '%s'", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING]      = { "incompatible character encoding: /%s/", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_REGEXP_NON_ESCAPED_MBC]             = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%s/", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_REGEXP_INVALID_UNICODE_RANGE]       = { "invalid Unicode range: /%s/", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_REGEXP_TERM]                        = { "expected a closing delimiter for the regular expression", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP]   = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_RESCUE_EXPRESSION]                  = { "expected a rescued expression", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_RESCUE_MODIFIER_VALUE]              = { "expected a value after the `rescue` modifier", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_RESCUE_TERM]                        = { "expected a closing delimiter for the `rescue` clause", PM_ERROR_LEVEL_FATAL },
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb