@@ -6942,7 +6942,8 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
6942
6942
}
6943
6943
6944
6944
/**
6945
- * Read through the contents of a string and check if it consists solely of US ASCII code points.
6945
+ * Read through the contents of a string and check if it consists solely of
6946
+ * US-ASCII code points.
6946
6947
*/
6947
6948
static bool
6948
6949
pm_ascii_only_p(const pm_string_t *contents) {
@@ -6956,27 +6957,72 @@ pm_ascii_only_p(const pm_string_t *contents) {
6956
6957
return true;
6957
6958
}
6958
6959
6960
+ /**
6961
+ * Validate that the contents of the given symbol are all valid UTF-8.
6962
+ */
6963
+ static void
6964
+ parse_symbol_encoding_validate_utf8(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents) {
6965
+ for (const uint8_t *cursor = pm_string_source(contents), *end = cursor + pm_string_length(contents); cursor < end;) {
6966
+ size_t width = pm_encoding_utf_8_char_width(cursor, end - cursor);
6967
+
6968
+ if (width == 0) {
6969
+ pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL);
6970
+ break;
6971
+ }
6972
+
6973
+ cursor += width;
6974
+ }
6975
+ }
6976
+
6977
+ /**
6978
+ * Validate that the contents of the given symbol are all valid in the encoding
6979
+ * of the parser.
6980
+ */
6981
+ static void
6982
+ parse_symbol_encoding_validate_other(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents) {
6983
+ const pm_encoding_t *encoding = parser->encoding;
6984
+
6985
+ for (const uint8_t *cursor = pm_string_source(contents), *end = cursor + pm_string_length(contents); cursor < end;) {
6986
+ size_t width = encoding->char_width(cursor, end - cursor);
6987
+
6988
+ if (width == 0) {
6989
+ pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL);
6990
+ break;
6991
+ }
6992
+
6993
+ cursor += width;
6994
+ }
6995
+ }
6996
+
6959
6997
/**
6960
6998
* Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated
6961
6999
* encoding is ASCII-compatible and the Symbol consists only of US-ASCII code
6962
7000
* points. Otherwise, the encoding may be explicitly set with an escape
6963
7001
* sequence.
7002
+ *
7003
+ * If the validate flag is set, then it will check the contents of the symbol
7004
+ * to ensure that all characters are valid in the encoding.
6964
7005
*/
6965
7006
static inline pm_node_flags_t
6966
- parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
7007
+ parse_symbol_encoding(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents, bool validate ) {
6967
7008
if (parser->explicit_encoding != NULL) {
6968
7009
// A Symbol may optionally have its encoding explicitly set. This will
6969
7010
// happen if an escape sequence results in a non-ASCII code point.
6970
7011
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
7012
+ if (validate) parse_symbol_encoding_validate_utf8(parser, location, contents);
6971
7013
return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING;
6972
7014
} else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
6973
7015
return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING;
7016
+ } else if (validate) {
7017
+ parse_symbol_encoding_validate_other(parser, location, contents);
6974
7018
}
6975
7019
} else if (pm_ascii_only_p(contents)) {
6976
7020
// Ruby stipulates that all source files must use an ASCII-compatible
6977
7021
// encoding. Thus, all symbols appearing in source are eligible for
6978
7022
// "downgrading" to US-ASCII.
6979
7023
return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING;
7024
+ } else if (validate) {
7025
+ parse_symbol_encoding_validate_other(parser, location, contents);
6980
7026
}
6981
7027
6982
7028
return 0;
@@ -7144,7 +7190,7 @@ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
7144
7190
*/
7145
7191
static pm_symbol_node_t *
7146
7192
pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
7147
- pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, &parser->current_string));
7193
+ pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, value, &parser->current_string, false ));
7148
7194
parser->current_string = PM_STRING_EMPTY;
7149
7195
return node;
7150
7196
}
@@ -7166,7 +7212,7 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
7166
7212
7167
7213
assert((label.end - label.start) >= 0);
7168
7214
pm_string_shared_init(&node->unescaped, label.start, label.end);
7169
- pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &node->unescaped));
7215
+ pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &label, & node->unescaped, false ));
7170
7216
7171
7217
break;
7172
7218
}
@@ -7251,7 +7297,8 @@ pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const
7251
7297
.unescaped = node->unescaped
7252
7298
};
7253
7299
7254
- pm_node_flag_set((pm_node_t *)new_node, parse_symbol_encoding(parser, &node->unescaped));
7300
+ pm_token_t content = { .type = PM_TOKEN_IDENTIFIER, .start = node->content_loc.start, .end = node->content_loc.end };
7301
+ pm_node_flag_set((pm_node_t *) new_node, parse_symbol_encoding(parser, &content, &node->unescaped, true));
7255
7302
7256
7303
// We are explicitly _not_ using pm_node_destroy here because we don't want
7257
7304
// to trash the unescaped string. We could instead copy the string if we
@@ -15259,7 +15306,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
15259
15306
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
15260
15307
15261
15308
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
15262
- pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
15309
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, & symbol->unescaped, false ));
15263
15310
15264
15311
return (pm_node_t *) symbol;
15265
15312
}
@@ -15359,7 +15406,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
15359
15406
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
15360
15407
}
15361
15408
15362
- return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
15409
+ return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, & unescaped, false ));
15363
15410
}
15364
15411
15365
15412
/**
@@ -15384,7 +15431,7 @@ parse_undef_argument(pm_parser_t *parser) {
15384
15431
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
15385
15432
15386
15433
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
15387
- pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
15434
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, & symbol->unescaped, false ));
15388
15435
15389
15436
return (pm_node_t *) symbol;
15390
15437
}
@@ -15425,7 +15472,7 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
15425
15472
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
15426
15473
15427
15474
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
15428
- pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
15475
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, & symbol->unescaped, false ));
15429
15476
15430
15477
return (pm_node_t *) symbol;
15431
15478
}
@@ -16590,7 +16637,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
16590
16637
16591
16638
pm_node_list_free(&parts);
16592
16639
} else if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
16593
- node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
16640
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, & unescaped, true ));
16594
16641
} else if (match1(parser, PM_TOKEN_EOF)) {
16595
16642
pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_EOF);
16596
16643
node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
@@ -16616,7 +16663,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
16616
16663
pm_node_flag_set(node, parse_unescaped_encoding(parser));
16617
16664
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
16618
16665
} else if (accept1(parser, PM_TOKEN_LABEL_END)) {
16619
- node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
16666
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, & unescaped, true ));
16620
16667
} else {
16621
16668
// If we get here, then we have interpolation so we'll need
16622
16669
// to create a string or symbol node with interpolation.
0 commit comments