Skip to content

Commit

Permalink
[ruby/prism] Provide flags for changing encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
kddnewton committed Dec 6, 2023
1 parent 9620ca6 commit 82f18ba
Show file tree
Hide file tree
Showing 31 changed files with 455 additions and 226 deletions.
14 changes: 14 additions & 0 deletions prism/config.yml
Expand Up @@ -346,6 +346,13 @@ flags:
- name: VARIABLE_CALL
comment: "a call that could have been a local variable"
comment: Flags for call nodes.
- name: EncodingFlags
values:
- name: FORCED_UTF8_ENCODING
comment: "internal bytes forced the encoding to UTF-8"
- name: FORCED_BINARY_ENCODING
comment: "internal bytes forced the encoding to binary"
comment: Flags for nodes that have unescaped content.
- name: IntegerBaseFlags
values:
- name: BINARY
Expand Down Expand Up @@ -388,6 +395,10 @@ flags:
comment: Flags for regular expression and match last line nodes.
- name: StringFlags
values:
- name: FORCED_UTF8_ENCODING
comment: "internal bytes forced the encoding to UTF-8"
- name: FORCED_BINARY_ENCODING
comment: "internal bytes forced the encoding to binary"
- name: FROZEN
comment: "frozen by virtue of a `frozen_string_literal` comment"
comment: Flags for string nodes.
Expand Down Expand Up @@ -2576,6 +2587,9 @@ nodes:
^^^^^^^^^^^^^^^^^^^^
- name: XStringNode
fields:
- name: flags
type: flags
kind: EncodingFlags
- name: opening_loc
type: location
- name: content_loc
Expand Down
17 changes: 17 additions & 0 deletions prism/defines.h
Expand Up @@ -74,4 +74,21 @@
# define snprintf _snprintf
#endif

/**
 * A simple utility macro to concatenate two tokens together.
 *
 * The operands of `##` are not macro-expanded, so this macro is meant to be
 * invoked from inside another function-like macro: that outer macro's
 * arguments (e.g., `__LINE__`) are fully expanded before they are substituted
 * here, which is what makes pasting an expanded macro value possible.
 */
#define PM_CONCATENATE(left, right) left ## right

/**
 * We want to be able to use static assertions, but they weren't standardized
 * until C11. As such, we polyfill it here by making a hacky typedef that will
 * fail to compile due to a negative array size if the condition is false.
 *
 * `_Static_assert` is a keyword in C11, not a macro, so `defined()` cannot be
 * used on its own to detect it. Instead we check the language version reported
 * by the compiler. The `defined(_Static_assert)` check is kept as an
 * additional path for toolchains whose headers provide it as a macro.
 *
 * `line` is only used by the fallback to build a unique typedef name, and
 * `message` is only used by the native `_Static_assert` form.
 */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || defined(_Static_assert)
# define PM_STATIC_ASSERT(line, condition, message) _Static_assert(condition, message)
#else
# define PM_STATIC_ASSERT(line, condition, message) typedef char PM_CONCATENATE(static_assert_, line)[(condition) ? 1 : -1]
#endif

#endif
1 change: 1 addition & 0 deletions prism/diagnostic.c
Expand Up @@ -185,6 +185,7 @@ static const char* const diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
[PM_ERR_LIST_W_UPPER_ELEMENT] = "expected a string in a `%W` list",
[PM_ERR_LIST_W_UPPER_TERM] = "expected a closing delimiter for the `%W` list",
[PM_ERR_MALLOC_FAILED] = "failed to allocate memory",
[PM_ERR_MIXED_ENCODING] = "UTF-8 mixed within %s source",
[PM_ERR_MODULE_IN_METHOD] = "unexpected module definition in a method definition",
[PM_ERR_MODULE_NAME] = "expected a constant name after `module`",
[PM_ERR_MODULE_TERM] = "expected an `end` to close the `module` statement",
Expand Down
1 change: 1 addition & 0 deletions prism/diagnostic.h
Expand Up @@ -177,6 +177,7 @@ typedef enum {
PM_ERR_LIST_W_UPPER_ELEMENT,
PM_ERR_LIST_W_UPPER_TERM,
PM_ERR_MALLOC_FAILED,
PM_ERR_MIXED_ENCODING,
PM_ERR_MODULE_IN_METHOD,
PM_ERR_MODULE_NAME,
PM_ERR_MODULE_TERM,
Expand Down
35 changes: 15 additions & 20 deletions prism/encoding.c
Expand Up @@ -4212,9 +4212,9 @@ pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
}

/**
* This is the definition of all of the encodings that we support.
* This is the table of all of the encodings that prism supports.
*/
static const pm_encoding_t pm_encodings[] = {
const pm_encoding_t pm_encodings[] = {
[PM_ENCODING_UTF_8] = {
.name = "UTF-8",
.char_width = pm_encoding_utf_8_char_width,
Expand All @@ -4223,14 +4223,6 @@ static const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_utf_8_isupper_char,
.multibyte = true
},
[PM_ENCODING_ASCII] = {
.name = "US-ASCII",
.char_width = pm_encoding_ascii_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
.alpha_char = pm_encoding_ascii_alpha_char,
.isupper_char = pm_encoding_ascii_isupper_char,
.multibyte = false
},
[PM_ENCODING_ASCII_8BIT] = {
.name = "ASCII-8BIT",
.char_width = pm_encoding_single_char_width,
Expand Down Expand Up @@ -4815,6 +4807,14 @@ static const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_tis_620_isupper_char,
.multibyte = false
},
[PM_ENCODING_US_ASCII] = {
.name = "US-ASCII",
.char_width = pm_encoding_ascii_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
.alpha_char = pm_encoding_ascii_alpha_char,
.isupper_char = pm_encoding_ascii_isupper_char,
.multibyte = false
},
[PM_ENCODING_UTF8_MAC] = {
.name = "UTF8-MAC",
.char_width = pm_encoding_utf_8_char_width,
Expand Down Expand Up @@ -4937,11 +4937,6 @@ static const pm_encoding_t pm_encodings[] = {
}
};

/**
* This is the default UTF-8 encoding. We need it to quickly create parsers.
*/
const pm_encoding_t *pm_encoding_utf_8 = pm_encodings;

/**
* Parse the given name of an encoding and return a pointer to the corresponding
* encoding struct if one can be found, otherwise return NULL.
Expand All @@ -4961,7 +4956,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
}

// Otherwise we'll return the default UTF-8 encoding.
return pm_encoding_utf_8;
return PM_ENCODING_UTF_8_ENTRY;
}

// Next, we're going to loop through each of the encodings that we handle
Expand All @@ -4972,9 +4967,9 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
if (width >= 3) {
switch (*start) {
case 'A': case 'a':
ENCODING1("ASCII", PM_ENCODING_ASCII);
ENCODING1("ASCII", PM_ENCODING_US_ASCII);
ENCODING1("ASCII-8BIT", PM_ENCODING_ASCII_8BIT);
ENCODING1("ANSI_X3.4-1968", PM_ENCODING_ASCII);
ENCODING1("ANSI_X3.4-1968", PM_ENCODING_US_ASCII);
break;
case 'B': case 'b':
ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
Expand Down Expand Up @@ -5109,7 +5104,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("TIS-620", PM_ENCODING_TIS_620);
break;
case 'U': case 'u':
ENCODING1("US-ASCII", PM_ENCODING_ASCII);
ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
Expand All @@ -5129,7 +5124,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
break;
case '6':
ENCODING1("646", PM_ENCODING_ASCII);
ENCODING1("646", PM_ENCODING_US_ASCII);
break;
}
}
Expand Down
22 changes: 18 additions & 4 deletions prism/encoding.h
Expand Up @@ -125,7 +125,6 @@ extern const uint8_t pm_encoding_unicode_table[256];
*/
typedef enum {
PM_ENCODING_UTF_8 = 0,
PM_ENCODING_ASCII,
PM_ENCODING_ASCII_8BIT,
PM_ENCODING_BIG5,
PM_ENCODING_BIG5_HKSCS,
Expand Down Expand Up @@ -199,6 +198,7 @@ typedef enum {
PM_ENCODING_STATELESS_ISO_2022_JP,
PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
PM_ENCODING_TIS_620,
PM_ENCODING_US_ASCII,
PM_ENCODING_UTF8_MAC,
PM_ENCODING_UTF8_DOCOMO,
PM_ENCODING_UTF8_KDDI,
Expand All @@ -213,13 +213,27 @@ typedef enum {
PM_ENCODING_WINDOWS_1257,
PM_ENCODING_WINDOWS_1258,
PM_ENCODING_WINDOWS_31J,
PM_ENCODING_WINDOWS_874
PM_ENCODING_WINDOWS_874,
PM_ENCODING_MAXIMUM
} pm_encoding_type_t;

/**
* This is the default UTF-8 encoding. We need it to quickly create parsers.
* This is the table of all of the encodings that prism supports.
*/
extern const pm_encoding_t *pm_encoding_utf_8;
extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];

/**
 * This is the default UTF-8 encoding. We need a reference to it to quickly
 * create parsers.
 *
 * Evaluates to the address of the PM_ENCODING_UTF_8 slot of the pm_encodings
 * table, i.e., a `const pm_encoding_t *`.
 */
#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])

/**
 * This is the US-ASCII encoding. We need a reference to it to be able to
 * compare against it when a string is being created because it could possibly
 * need to fall back to ASCII-8BIT.
 *
 * Evaluates to the address of the PM_ENCODING_US_ASCII slot of the
 * pm_encodings table, i.e., a `const pm_encoding_t *`.
 */
#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])

/**
* Parse the given name of an encoding and return a pointer to the corresponding
Expand Down
10 changes: 5 additions & 5 deletions prism/extension.c
Expand Up @@ -469,7 +469,7 @@ parse_lex_token(void *data, pm_parser_t *parser, pm_token_t *token) {
static void
parse_lex_encoding_changed_callback(pm_parser_t *parser) {
parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data;
parse_lex_data->encoding = rb_enc_find(parser->encoding.name);
parse_lex_data->encoding = rb_enc_find(parser->encoding->name);

// Since the encoding changed, we need to go back and change the encoding of
// the tokens that were already lexed. This is only going to end up being
Expand Down Expand Up @@ -599,7 +599,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);

pm_node_t *node = pm_parse(&parser);
rb_encoding *encoding = rb_enc_find(parser.encoding.name);
rb_encoding *encoding = rb_enc_find(parser.encoding->name);

VALUE source = pm_source_new(&parser, encoding);
VALUE result_argv[] = {
Expand Down Expand Up @@ -693,7 +693,7 @@ parse_input_comments(pm_string_t *input, const pm_options_t *options) {
pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);

pm_node_t *node = pm_parse(&parser);
rb_encoding *encoding = rb_enc_find(parser.encoding.name);
rb_encoding *encoding = rb_enc_find(parser.encoding->name);

VALUE source = pm_source_new(&parser, encoding);
VALUE comments = parser_comments(&parser, source);
Expand Down Expand Up @@ -872,7 +872,7 @@ static VALUE
named_captures(VALUE self, VALUE source) {
pm_string_list_t string_list = { 0 };

if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, pm_encoding_utf_8)) {
if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, PM_ENCODING_UTF_8_ENTRY)) {
pm_string_list_free(&string_list);
return Qnil;
}
Expand Down Expand Up @@ -962,7 +962,7 @@ inspect_node(VALUE self, VALUE source) {

pm_prettyprint(&buffer, &parser, node);

rb_encoding *encoding = rb_enc_find(parser.encoding.name);
rb_encoding *encoding = rb_enc_find(parser.encoding->name);
VALUE string = rb_enc_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer), encoding);

pm_buffer_free(&buffer);
Expand Down
39 changes: 32 additions & 7 deletions prism/parser.h
Expand Up @@ -523,12 +523,6 @@ struct pm_parser {
size_t index;
} lex_modes;

/**
* The common_whitespace value from the most-recently-popped heredoc mode of the lexer, so we
* can dedent the heredoc after popping the lex mode.
*/
size_t current_string_common_whitespace;

/** The pointer to the start of the source. */
const uint8_t *start;

Expand Down Expand Up @@ -581,7 +575,7 @@ struct pm_parser {
* The encoding functions for the current file are attached to the parser as
* it's parsing so that the encoding can change with a magic comment.
*/
pm_encoding_t encoding;
const pm_encoding_t *encoding;

/**
* When the encoding that is being used to parse the source is changed by
Expand Down Expand Up @@ -637,6 +631,37 @@ struct pm_parser {
*/
int32_t start_line;

/**
* When a string-like expression is being lexed, any byte or escape sequence
* that resolves to a value whose top bit is set (i.e., >= 0x80) will
* explicitly set the encoding to the same encoding as the source.
* Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
* resolves to a value whose top bit is set, then the encoding will be
* explicitly set to UTF-8.
*
* The _next_ time this happens, if the encoding that is about to become the
* explicitly set encoding does not match the previously set explicit
* encoding, a mixed encoding error will be emitted.
*
* When the expression is finished being lexed, the explicit encoding
* controls the encoding of the expression. For the most part this means
* that the expression will either be encoded in the source encoding or
* UTF-8. This holds for all encodings except US-ASCII. If the source is
* US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
* expression will be encoded as ASCII-8BIT.
*
* Note that if the expression is a list, different elements within the same
* list can have different encodings, so this will get reset between each
* element. Furthermore all of this only applies to lists that support
* interpolation, because otherwise escapes that could change the encoding
* are ignored.
*
* At first glance, it may make more sense for this to live on the lexer
* mode, but we need it here to communicate back to the parser for character
* literals that do not push a new lexer mode.
*/
const pm_encoding_t *explicit_encoding;

/** Whether or not we're at the beginning of a command. */
bool command_start;

Expand Down

0 comments on commit 82f18ba

Please sign in to comment.