@@ -5951,16 +5951,112 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
5951
5951
return 0;
5952
5952
}
5953
5953
5954
+ static inline pm_node_flags_t
5955
+ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) {
5956
+ assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) ||
5957
+ (modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) ||
5958
+ (modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) ||
5959
+ (modifier == 's' && modifier_encoding == PM_ENCODING_WINDOWS_31J_ENTRY));
5960
+
5961
+ // There's special validation logic used if a string does not contain any character escape sequences.
5962
+ if (parser->explicit_encoding == NULL) {
5963
+ // If an ASCII-only string without character escapes is used with an encoding modifier, then resulting Regexp
5964
+ // has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to
5965
+ // the US-ASCII encoding.
5966
+ bool ascii_only = pm_ascii_only_p(contents);
5967
+ if (ascii_only) {
5968
+ return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
5969
+ }
5970
+
5971
+ if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
5972
+ if (!ascii_only) {
5973
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
5974
+ }
5975
+ } else if (parser->encoding != modifier_encoding) {
5976
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);
5977
+
5978
+ if (modifier == 'n' && !ascii_only) {
5979
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, pm_string_source(source));
5980
+ }
5981
+ }
5982
+
5983
+ return flags;
5984
+ }
5985
+
5986
+ // TODO (nirvdrum 21-Feb-2024): To validate regexp sources with character escape sequences we need to know whether hex or Unicode escape sequences were used and Prism doesn't currently provide that data. We handle a subset of unambiguous cases in the meanwhile.
5987
+ bool mixed_encoding = false;
5988
+
5989
+ if (mixed_encoding) {
5990
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source));
5991
+ } else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
5992
+ // TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily.
5993
+ bool valid_string_in_modifier_encoding = true;
5994
+
5995
+ if (!valid_string_in_modifier_encoding) {
5996
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source));
5997
+ }
5998
+ } else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
5999
+ // TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now.
6000
+ if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
6001
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, pm_string_source(source));
6002
+ }
6003
+ }
6004
+
6005
+ // We've determined the encoding would naturally be EUC-JP and there is no need to force the encoding to anything else.
6006
+ return flags;
6007
+ }
6008
+
5954
6009
/**
5955
6010
* Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and
5956
6011
* the unescaped representation of a Regexp source consists only of US-ASCII code points. This is true even
5957
6012
* when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding
5958
6013
* may be explicitly set with an escape sequence.
5959
6014
*/
5960
6015
static inline pm_node_flags_t
5961
- parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
5962
- // Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all regular expressions
5963
- // appearing in source are eligible for "downgrading" to US-ASCII.
6016
+ parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags) {
6017
+ // TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report.
6018
+ bool valid_unicode_range = true;
6019
+ if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) {
6020
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, pm_string_source(source));
6021
+
6022
+ return flags;
6023
+ }
6024
+
6025
+ // US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding
6026
+ // to multi-byte characters are allowed.
6027
+ if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !pm_ascii_only_p(contents)) {
6028
+ // CRuby will continue processing even though a SyntaxError has already been detected. It may result in the
6029
+ // following error message appearing twice. We do the same for compatibility.
6030
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
6031
+ }
6032
+
6033
+ /**
6034
+ * Start checking modifier flags. We need to process these before considering any explicit encodings that may have
6035
+ * been set by character literals. The order in which the encoding modifiers are checked does not matter. In the
6036
+ * event that both an encoding modifier and an explicit encoding would result in the same encoding we do not set
6037
+ * the corresponding "forced_<encoding>" flag. Instead, the caller should check the encoding modifier flag and
6038
+ * determine the encoding that way.
6039
+ */
6040
+
6041
+ if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
6042
+ return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY);
6043
+ }
6044
+
6045
+ if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
6046
+ return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'u', PM_ENCODING_UTF_8_ENTRY);
6047
+ }
6048
+
6049
+ if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
6050
+ return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'e', PM_ENCODING_EUC_JP_ENTRY);
6051
+ }
6052
+
6053
+ if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
6054
+ return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY);
6055
+ }
6056
+
6057
+ // At this point no encoding modifiers will be present on the regular expression as they would have already
6058
+ // been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all
6059
+ // regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII.
5964
6060
if (pm_ascii_only_p(contents)) {
5965
6061
return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
5966
6062
}
@@ -5976,6 +6072,7 @@ parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t *
5976
6072
return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
5977
6073
}
5978
6074
}
6075
+
5979
6076
return 0;
5980
6077
}
5981
6078
@@ -17030,7 +17127,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
17030
17127
// more easily compiled.
17031
17128
if (accept1(parser, PM_TOKEN_REGEXP_END)) {
17032
17129
pm_node_t *regular_expression_node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source);
17033
- pm_node_flag_set(regular_expression_node, parse_regular_expression_encoding (parser, &unescaped));
17130
+ pm_node_flag_set(regular_expression_node, parse_and_validate_regular_expression_encoding (parser, &source, & unescaped, regular_expression_node->flags ));
17034
17131
return regular_expression_node;
17035
17132
}
17036
17133
0 commit comments