Skip to content

Commit 41f8ae1

Browse files
kddnewton authored and matzbot committed
[ruby/prism] Mark errors for invalid symbols
ruby/prism@661884c4a3
1 parent 0fa09c5 commit 41f8ae1

File tree

4 files changed

+61
-12
lines changed

4 files changed

+61
-12
lines changed

prism/config.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ errors:
149149
- INVALID_RETRY_AFTER_ELSE
150150
- INVALID_RETRY_AFTER_ENSURE
151151
- INVALID_RETRY_WITHOUT_RESCUE
152+
- INVALID_SYMBOL
152153
- INVALID_VARIABLE_GLOBAL
153154
- INVALID_VARIABLE_GLOBAL_3_3_0
154155
- INVALID_YIELD

prism/prism.c

Lines changed: 58 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6942,7 +6942,8 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
69426942
}
69436943

69446944
/**
6945-
* Read through the contents of a string and check if it consists solely of US ASCII code points.
6945+
* Read through the contents of a string and check if it consists solely of
6946+
* US-ASCII code points.
69466947
*/
69476948
static bool
69486949
pm_ascii_only_p(const pm_string_t *contents) {
@@ -6956,27 +6957,72 @@ pm_ascii_only_p(const pm_string_t *contents) {
69566957
return true;
69576958
}
69586959

6960+
/**
6961+
* Validate that the contents of the given symbol are all valid UTF-8.
6962+
*/
6963+
static void
6964+
parse_symbol_encoding_validate_utf8(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents) {
6965+
for (const uint8_t *cursor = pm_string_source(contents), *end = cursor + pm_string_length(contents); cursor < end;) {
6966+
size_t width = pm_encoding_utf_8_char_width(cursor, end - cursor);
6967+
6968+
if (width == 0) {
6969+
pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL);
6970+
break;
6971+
}
6972+
6973+
cursor += width;
6974+
}
6975+
}
6976+
6977+
/**
6978+
* Validate that the contents of the given symbol are all valid in the encoding
6979+
* of the parser.
6980+
*/
6981+
static void
6982+
parse_symbol_encoding_validate_other(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents) {
6983+
const pm_encoding_t *encoding = parser->encoding;
6984+
6985+
for (const uint8_t *cursor = pm_string_source(contents), *end = cursor + pm_string_length(contents); cursor < end;) {
6986+
size_t width = encoding->char_width(cursor, end - cursor);
6987+
6988+
if (width == 0) {
6989+
pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL);
6990+
break;
6991+
}
6992+
6993+
cursor += width;
6994+
}
6995+
}
6996+
69596997
/**
69606998
* Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated
69616999
* encoding is ASCII-compatible and the Symbol consists only of US-ASCII code
69627000
* points. Otherwise, the encoding may be explicitly set with an escape
69637001
* sequence.
7002+
*
7003+
* If the validate flag is set, then it will check the contents of the symbol
7004+
* to ensure that all characters are valid in the encoding.
69647005
*/
69657006
static inline pm_node_flags_t
6966-
parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
7007+
parse_symbol_encoding(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents, bool validate) {
69677008
if (parser->explicit_encoding != NULL) {
69687009
// A Symbol may optionally have its encoding explicitly set. This will
69697010
// happen if an escape sequence results in a non-ASCII code point.
69707011
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
7012+
if (validate) parse_symbol_encoding_validate_utf8(parser, location, contents);
69717013
return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING;
69727014
} else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
69737015
return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING;
7016+
} else if (validate) {
7017+
parse_symbol_encoding_validate_other(parser, location, contents);
69747018
}
69757019
} else if (pm_ascii_only_p(contents)) {
69767020
// Ruby stipulates that all source files must use an ASCII-compatible
69777021
// encoding. Thus, all symbols appearing in source are eligible for
69787022
// "downgrading" to US-ASCII.
69797023
return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING;
7024+
} else if (validate) {
7025+
parse_symbol_encoding_validate_other(parser, location, contents);
69807026
}
69817027

69827028
return 0;
@@ -7144,7 +7190,7 @@ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
71447190
*/
71457191
static pm_symbol_node_t *
71467192
pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
7147-
pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, &parser->current_string));
7193+
pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, value, &parser->current_string, false));
71487194
parser->current_string = PM_STRING_EMPTY;
71497195
return node;
71507196
}
@@ -7166,7 +7212,7 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
71667212

71677213
assert((label.end - label.start) >= 0);
71687214
pm_string_shared_init(&node->unescaped, label.start, label.end);
7169-
pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &node->unescaped));
7215+
pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &label, &node->unescaped, false));
71707216

71717217
break;
71727218
}
@@ -7251,7 +7297,8 @@ pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const
72517297
.unescaped = node->unescaped
72527298
};
72537299

7254-
pm_node_flag_set((pm_node_t *)new_node, parse_symbol_encoding(parser, &node->unescaped));
7300+
pm_token_t content = { .type = PM_TOKEN_IDENTIFIER, .start = node->content_loc.start, .end = node->content_loc.end };
7301+
pm_node_flag_set((pm_node_t *) new_node, parse_symbol_encoding(parser, &content, &node->unescaped, true));
72557302

72567303
// We are explicitly _not_ using pm_node_destroy here because we don't want
72577304
// to trash the unescaped string. We could instead copy the string if we
@@ -15259,7 +15306,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
1525915306
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
1526015307

1526115308
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
15262-
pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
15309+
pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false));
1526315310

1526415311
return (pm_node_t *) symbol;
1526515312
}
@@ -15359,7 +15406,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
1535915406
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
1536015407
}
1536115408

15362-
return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
15409+
return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, false));
1536315410
}
1536415411

1536515412
/**
@@ -15384,7 +15431,7 @@ parse_undef_argument(pm_parser_t *parser) {
1538415431
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
1538515432

1538615433
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
15387-
pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
15434+
pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false));
1538815435

1538915436
return (pm_node_t *) symbol;
1539015437
}
@@ -15425,7 +15472,7 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
1542515472
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
1542615473

1542715474
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
15428-
pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
15475+
pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false));
1542915476

1543015477
return (pm_node_t *) symbol;
1543115478
}
@@ -16590,7 +16637,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
1659016637

1659116638
pm_node_list_free(&parts);
1659216639
} else if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
16593-
node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
16640+
node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true));
1659416641
} else if (match1(parser, PM_TOKEN_EOF)) {
1659516642
pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_EOF);
1659616643
node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
@@ -16616,7 +16663,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
1661616663
pm_node_flag_set(node, parse_unescaped_encoding(parser));
1661716664
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
1661816665
} else if (accept1(parser, PM_TOKEN_LABEL_END)) {
16619-
node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
16666+
node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true));
1662016667
} else {
1662116668
// If we get here, then we have interpolation so we'll need
1662216669
// to create a string or symbol node with interpolation.

prism/templates/src/diagnostic.c.erb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
232232
[PM_ERR_INVALID_RETRY_AFTER_ELSE] = { "Invalid retry after else", PM_ERROR_LEVEL_SYNTAX },
233233
[PM_ERR_INVALID_RETRY_AFTER_ENSURE] = { "Invalid retry after ensure", PM_ERROR_LEVEL_SYNTAX },
234234
[PM_ERR_INVALID_RETRY_WITHOUT_RESCUE] = { "Invalid retry without rescue", PM_ERROR_LEVEL_SYNTAX },
235+
[PM_ERR_INVALID_SYMBOL] = { "invalid symbol", PM_ERROR_LEVEL_SYNTAX },
235236
[PM_ERR_INVALID_VARIABLE_GLOBAL_3_3_0] = { "`%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_SYNTAX },
236237
[PM_ERR_INVALID_VARIABLE_GLOBAL] = { "'%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_SYNTAX },
237238
[PM_ERR_INVALID_YIELD] = { "Invalid yield", PM_ERROR_LEVEL_SYNTAX },

test/prism/unescape_test.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def ruby(escape)
3838
end
3939

4040
def prism(escape)
41-
result = Prism.parse(code(escape))
41+
result = Prism.parse(code(escape), encoding: "binary")
4242

4343
if result.success?
4444
yield result.value.statements.body.first

0 commit comments

Comments
 (0)