Skip to content

Commit 6bf1b8e

Browse files
nirvdrum authored and kddnewton committed
Add initial implementation of Regexp validation.
1 parent 54d14a3 commit 6bf1b8e

File tree

5 files changed

+209
-24
lines changed

5 files changed

+209
-24
lines changed

config.yml

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
errors:
2-
- CANNOT_PARSE_EXPRESSION
32
- ALIAS_ARGUMENT
43
- AMPAMPEQ_MULTI_ASSIGN
54
- ARGUMENT_AFTER_BLOCK
@@ -34,6 +33,7 @@ errors:
3433
- BLOCK_PARAM_PIPE_TERM
3534
- BLOCK_TERM_BRACE
3635
- BLOCK_TERM_END
36+
- CANNOT_PARSE_EXPRESSION
3737
- CANNOT_PARSE_STRING_PART
3838
- CASE_EXPRESSION_AFTER_CASE
3939
- CASE_EXPRESSION_AFTER_WHEN
@@ -82,13 +82,13 @@ errors:
8282
- EXPECT_ARGUMENT
8383
- EXPECT_EOL_AFTER_STATEMENT
8484
- EXPECT_EXPRESSION_AFTER_AMPAMPEQ
85-
- EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ
8685
- EXPECT_EXPRESSION_AFTER_COMMA
8786
- EXPECT_EXPRESSION_AFTER_EQUAL
8887
- EXPECT_EXPRESSION_AFTER_LESS_LESS
8988
- EXPECT_EXPRESSION_AFTER_LPAREN
90-
- EXPECT_EXPRESSION_AFTER_QUESTION
9189
- EXPECT_EXPRESSION_AFTER_OPERATOR
90+
- EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ
91+
- EXPECT_EXPRESSION_AFTER_QUESTION
9292
- EXPECT_EXPRESSION_AFTER_SPLAT
9393
- EXPECT_EXPRESSION_AFTER_SPLAT_HASH
9494
- EXPECT_EXPRESSION_AFTER_STAR
@@ -113,23 +113,25 @@ errors:
113113
- HASH_VALUE
114114
- HEREDOC_TERM
115115
- INCOMPLETE_QUESTION_MARK
116-
- INCOMPLETE_VARIABLE_CLASS_3_3_0
117116
- INCOMPLETE_VARIABLE_CLASS
118-
- INCOMPLETE_VARIABLE_INSTANCE_3_3_0
117+
- INCOMPLETE_VARIABLE_CLASS_3_3_0
119118
- INCOMPLETE_VARIABLE_INSTANCE
119+
- INCOMPLETE_VARIABLE_INSTANCE_3_3_0
120+
- INVALID_CHARACTER
120121
- INVALID_ENCODING_MAGIC_COMMENT
121122
- INVALID_FLOAT_EXPONENT
123+
- INVALID_MULTIBYTE_CHAR
124+
- INVALID_MULTIBYTE_CHARACTER
125+
- INVALID_MULTIBYTE_ESCAPE
122126
- INVALID_NUMBER_BINARY
123127
- INVALID_NUMBER_DECIMAL
124128
- INVALID_NUMBER_HEXADECIMAL
125129
- INVALID_NUMBER_OCTAL
126130
- INVALID_NUMBER_UNDERSCORE
127-
- INVALID_CHARACTER
128-
- INVALID_MULTIBYTE_CHARACTER
129-
- INVALID_PRINTABLE_CHARACTER
130131
- INVALID_PERCENT
131-
- INVALID_VARIABLE_GLOBAL_3_3_0
132+
- INVALID_PRINTABLE_CHARACTER
132133
- INVALID_VARIABLE_GLOBAL
134+
- INVALID_VARIABLE_GLOBAL_3_3_0
133135
- IT_NOT_ALLOWED_NUMBERED
134136
- IT_NOT_ALLOWED_ORDINARY
135137
- LAMBDA_OPEN
@@ -150,8 +152,8 @@ errors:
150152
- MODULE_TERM
151153
- MULTI_ASSIGN_MULTI_SPLATS
152154
- MULTI_ASSIGN_UNEXPECTED_REST
153-
- NOT_EXPRESSION
154155
- NO_LOCAL_VARIABLE
156+
- NOT_EXPRESSION
155157
- NUMBER_LITERAL_UNDERSCORE
156158
- NUMBERED_PARAMETER_IT
157159
- NUMBERED_PARAMETER_ORDINARY
@@ -173,8 +175,8 @@ errors:
173175
- PARAMETER_UNEXPECTED_FWD
174176
- PARAMETER_WILD_LOOSE_COMMA
175177
- PATTERN_EXPRESSION_AFTER_BRACKET
176-
- PATTERN_EXPRESSION_AFTER_HROCKET
177178
- PATTERN_EXPRESSION_AFTER_COMMA
179+
- PATTERN_EXPRESSION_AFTER_HROCKET
178180
- PATTERN_EXPRESSION_AFTER_IN
179181
- PATTERN_EXPRESSION_AFTER_KEY
180182
- PATTERN_EXPRESSION_AFTER_PAREN
@@ -191,7 +193,12 @@ errors:
191193
- PATTERN_TERM_BRACKET
192194
- PATTERN_TERM_PAREN
193195
- PIPEPIPEEQ_MULTI_ASSIGN
196+
- REGEXP_ENCODING_OPTION_MISMATCH
197+
- REGEXP_INCOMPAT_CHAR_ENCODING
198+
- REGEXP_INVALID_UNICODE_RANGE
199+
- REGEXP_NON_ESCAPED_MBC
194200
- REGEXP_TERM
201+
- REGEXP_UTF8_CHAR_NON_UTF8_REGEXP
195202
- RESCUE_EXPRESSION
196203
- RESCUE_MODIFIER_VALUE
197204
- RESCUE_TERM
@@ -213,9 +220,9 @@ errors:
213220
- TERNARY_EXPRESSION_FALSE
214221
- TERNARY_EXPRESSION_TRUE
215222
- UNARY_RECEIVER
223+
- UNDEF_ARGUMENT
216224
- UNEXPECTED_TOKEN_CLOSE_CONTEXT
217225
- UNEXPECTED_TOKEN_IGNORE
218-
- UNDEF_ARGUMENT
219226
- UNTIL_TERM
220227
- VOID_EXPRESSION
221228
- WHILE_TERM

include/prism/encoding.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,18 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
252252
*/
253253
#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
254254

255+
/**
256+
* This is the EUC-JP encoding. We need a reference to it to quickly process
257+
* regular expression modifiers.
258+
*/
259+
#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP])
260+
261+
/**
262+
* This is the Windows-31J encoding. We need a reference to it to quickly
263+
* process regular expression modifiers.
264+
*/
265+
#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J])
266+
255267
/**
256268
* Parse the given name of an encoding and return a pointer to the corresponding
257269
* encoding struct if one can be found, otherwise return NULL.

src/prism.c

Lines changed: 101 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5951,16 +5951,112 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
59515951
return 0;
59525952
}
59535953

5954+
static inline pm_node_flags_t
5955+
parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) {
5956+
assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) ||
5957+
(modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) ||
5958+
(modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) ||
5959+
(modifier == 's' && modifier_encoding == PM_ENCODING_WINDOWS_31J_ENTRY));
5960+
5961+
// There's special validation logic used if a string does not contain any character escape sequences.
5962+
if (parser->explicit_encoding == NULL) {
5963+
// If an ASCII-only string without character escapes is used with an encoding modifier, then resulting Regexp
5964+
// has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to
5965+
// the US-ASCII encoding.
5966+
bool ascii_only = pm_ascii_only_p(contents);
5967+
if (ascii_only) {
5968+
return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
5969+
}
5970+
5971+
if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
5972+
if (!ascii_only) {
5973+
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
5974+
}
5975+
} else if (parser->encoding != modifier_encoding) {
5976+
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);
5977+
5978+
if (modifier == 'n' && !ascii_only) {
5979+
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, pm_string_source(source));
5980+
}
5981+
}
5982+
5983+
return flags;
5984+
}
5985+
5986+
// TODO (nirvdrum 21-Feb-2024): To validate regexp sources with character escape sequences we need to know whether hex or Unicode escape sequences were used and Prism doesn't currently provide that data. We handle a subset of unambiguous cases in the meanwhile.
5987+
bool mixed_encoding = false;
5988+
5989+
if (mixed_encoding) {
5990+
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source));
5991+
} else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
5992+
// TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily.
5993+
bool valid_string_in_modifier_encoding = true;
5994+
5995+
if (!valid_string_in_modifier_encoding) {
5996+
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source));
5997+
}
5998+
} else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
5999+
// TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now.
6000+
if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
6001+
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, pm_string_source(source));
6002+
}
6003+
}
6004+
6005+
// We've determined the encoding would naturally be EUC-JP and there is no need to force the encoding to anything else.
6006+
return flags;
6007+
}
6008+
59546009
/**
59556010
* Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and
59566011
* the unescaped representation of a Regexp source consists only of US-ASCII code points. This is true even
59576012
* when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding
59586013
* may be explicitly set with an escape sequence.
59596014
*/
59606015
static inline pm_node_flags_t
5961-
parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
5962-
// Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all regular expressions
5963-
// appearing in source are eligible for "downgrading" to US-ASCII.
6016+
parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags) {
6017+
// TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report.
6018+
bool valid_unicode_range = true;
6019+
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) {
6020+
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, pm_string_source(source));
6021+
6022+
return flags;
6023+
}
6024+
6025+
// US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding
6026+
// to multi-byte characters are allowed.
6027+
if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !pm_ascii_only_p(contents)) {
6028+
// CRuby will continue processing even though a SyntaxError has already been detected. It may result in the
6029+
// following error message appearing twice. We do the same for compatibility.
6030+
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
6031+
}
6032+
6033+
/**
6034+
* Start checking modifier flags. We need to process these before considering any explicit encodings that may have
6035+
* been set by character literals. The order in which the encoding modifiers is checked does not matter. In the
6036+
* event that both an encoding modifier and an explicit encoding would result in the same encoding we do not set
6037+
* the corresponding "forced_<encoding>" flag. Instead, the caller should check the encoding modifier flag and
6038+
* determine the encoding that way.
6039+
*/
6040+
6041+
if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
6042+
return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY);
6043+
}
6044+
6045+
if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
6046+
return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'u', PM_ENCODING_UTF_8_ENTRY);
6047+
}
6048+
6049+
if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
6050+
return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'e', PM_ENCODING_EUC_JP_ENTRY);
6051+
}
6052+
6053+
if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
6054+
return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY);
6055+
}
6056+
6057+
// At this point no encoding modifiers will be present on the regular expression as they would have already
6058+
// been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all
6059+
// regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII.
59646060
if (pm_ascii_only_p(contents)) {
59656061
return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
59666062
}
@@ -5976,6 +6072,7 @@ parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t *
59766072
return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
59776073
}
59786074
}
6075+
59796076
return 0;
59806077
}
59816078

@@ -17030,7 +17127,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
1703017127
// more easily compiled.
1703117128
if (accept1(parser, PM_TOKEN_REGEXP_END)) {
1703217129
pm_node_t *regular_expression_node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source);
17033-
pm_node_flag_set(regular_expression_node, parse_regular_expression_encoding(parser, &unescaped));
17130+
pm_node_flag_set(regular_expression_node, parse_and_validate_regular_expression_encoding(parser, &source, &unescaped, regular_expression_node->flags));
1703417131
return regular_expression_node;
1703517132
}
1703617133

templates/src/diagnostic.c.erb

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,9 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
204204
[PM_ERR_INVALID_NUMBER_OCTAL] = { "invalid octal number", PM_ERROR_LEVEL_FATAL },
205205
[PM_ERR_INVALID_NUMBER_UNDERSCORE] = { "invalid underscore placement in number", PM_ERROR_LEVEL_FATAL },
206206
[PM_ERR_INVALID_CHARACTER] = { "invalid character 0x%X", PM_ERROR_LEVEL_FATAL },
207+
[PM_ERR_INVALID_MULTIBYTE_CHAR] = { "invalid multibyte char (%s)", PM_ERROR_LEVEL_FATAL },
207208
[PM_ERR_INVALID_MULTIBYTE_CHARACTER] = { "invalid multibyte character 0x%X", PM_ERROR_LEVEL_FATAL },
209+
[PM_ERR_INVALID_MULTIBYTE_ESCAPE] = { "invalid multibyte escape: /%s/", PM_ERROR_LEVEL_FATAL },
208210
[PM_ERR_INVALID_PRINTABLE_CHARACTER] = { "invalid character `%c`", PM_ERROR_LEVEL_FATAL },
209211
[PM_ERR_INVALID_PERCENT] = { "invalid `%` token", PM_ERROR_LEVEL_FATAL }, // TODO WHAT?
210212
[PM_ERR_INVALID_VARIABLE_GLOBAL_3_3_0] = { "`%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_FATAL },
@@ -270,7 +272,12 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
270272
[PM_ERR_PATTERN_TERM_BRACKET] = { "expected a `]` to close the pattern expression", PM_ERROR_LEVEL_FATAL },
271273
[PM_ERR_PATTERN_TERM_PAREN] = { "expected a `)` to close the pattern expression", PM_ERROR_LEVEL_FATAL },
272274
[PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN] = { "unexpected `||=` in a multiple assignment", PM_ERROR_LEVEL_FATAL },
275+
[PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH] = { "regexp encoding option '%c' differs from source encoding '%s'", PM_ERROR_LEVEL_FATAL },
276+
[PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%s/", PM_ERROR_LEVEL_FATAL },
277+
[PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%s/", PM_ERROR_LEVEL_FATAL },
278+
[PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%s/", PM_ERROR_LEVEL_FATAL },
273279
[PM_ERR_REGEXP_TERM] = { "expected a closing delimiter for the regular expression", PM_ERROR_LEVEL_FATAL },
280+
[PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_FATAL },
274281
[PM_ERR_RESCUE_EXPRESSION] = { "expected a rescued expression", PM_ERROR_LEVEL_FATAL },
275282
[PM_ERR_RESCUE_MODIFIER_VALUE] = { "expected a value after the `rescue` modifier", PM_ERROR_LEVEL_FATAL },
276283
[PM_ERR_RESCUE_TERM] = { "expected a closing delimiter for the `rescue` clause", PM_ERROR_LEVEL_FATAL },

0 commit comments

Comments
 (0)