Skip to content

Commit

Permalink
[ruby/prism] Always return the character width for char_is_identifier_start() and char_is_identifier_utf8()
Browse files Browse the repository at this point in the history

* This is also faster than calling pm_encoding_utf_8_alpha_char() or
  pm_encoding_utf_8_alnum_char(), because those functions compute the character
  width and then perform extra checks on top of that.

ruby/prism@4cb276ac4c
  • Loading branch information
eregon authored and matzbot committed Jan 31, 2024
1 parent b5a2c60 commit 9fdfdf4
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 3 deletions.
5 changes: 4 additions & 1 deletion prism/encoding.c
Expand Up @@ -2277,7 +2277,10 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
return 0;
}

static size_t
/**
* Return the size of the next character in the UTF-8 encoding.
*/
size_t
pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
size_t width;
pm_utf_8_codepoint(b, n, &width);
Expand Down
10 changes: 10 additions & 0 deletions prism/encoding.h
Expand Up @@ -79,6 +79,16 @@ typedef struct {
*/
#define PRISM_ENCODING_UPPERCASE_BIT 1 << 2

/**
* Return the size of the next character in the UTF-8 encoding.
*
* @param b The bytes to read.
* @param n The number of bytes that can be read.
* @returns The number of bytes that the next character takes if it is valid in
* the encoding, or 0 if it is not.
*/
size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n);

/**
* Return the size of the next character in the UTF-8 encoding if it is an
* alphabetical character.
Expand Down
4 changes: 2 additions & 2 deletions prism/prism.c
Expand Up @@ -6249,7 +6249,7 @@ char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
} else if (*b < 0x80) {
return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
} else {
return (size_t) (pm_encoding_utf_8_alpha_char(b, parser->end - b) || 1u);
return pm_encoding_utf_8_char_width(b, parser->end - b);
}
}

Expand All @@ -6262,7 +6262,7 @@ char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
if (*b < 0x80) {
return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
} else {
return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
return pm_encoding_utf_8_char_width(b, end - b);
}
}

Expand Down

0 comments on commit 9fdfdf4

Please sign in to comment.