Skip to content

Commit 4cb276a

Browse files
committed
Always return the character width from char_is_identifier_start() and char_is_identifier_utf8()
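* Previously the non-ASCII branch evaluated `(... || 1u)`, which collapsed the result to 1 regardless of how many bytes the character occupied. This is also faster than calling pm_encoding_utf_8_alpha_char()/pm_encoding_utf_8_alnum_char(), since those compute the character width and then perform extra checks.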
1 parent a1d0180 commit 4cb276a

File tree

3 files changed

+16
-3
lines changed

3 files changed

+16
-3
lines changed

include/prism/encoding.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,16 @@ typedef struct {
7979
*/
8080
/* Parenthesized so the shift stays a single unit when the macro is expanded inside a larger expression. */
#define PRISM_ENCODING_UPPERCASE_BIT (1 << 2)
8181

82+
/**
83+
* Return the size of the next character in the UTF-8 encoding.
84+
*
85+
* @param b The bytes to read.
86+
* @param n The number of bytes that can be read.
87+
* @returns The number of bytes that the next character takes if it is valid in
88+
* the encoding, or 0 if it is not.
89+
*/
90+
size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n);
91+
8292
/**
8393
* Return the size of the next character in the UTF-8 encoding if it is an
8494
* alphabetical character.

src/encoding.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2277,7 +2277,10 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
22772277
return 0;
22782278
}
22792279

2280-
static size_t
2280+
/**
2281+
* Return the size of the next character in the UTF-8 encoding.
2282+
*/
2283+
size_t
22812284
pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
22822285
size_t width;
22832286
pm_utf_8_codepoint(b, n, &width);

src/prism.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6249,7 +6249,7 @@ char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
62496249
} else if (*b < 0x80) {
62506250
return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
62516251
} else {
6252-
return (size_t) (pm_encoding_utf_8_alpha_char(b, parser->end - b) || 1u);
6252+
return pm_encoding_utf_8_char_width(b, parser->end - b);
62536253
}
62546254
}
62556255

@@ -6262,7 +6262,7 @@ char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
62626262
if (*b < 0x80) {
62636263
return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
62646264
} else {
6265-
return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
6265+
return pm_encoding_utf_8_char_width(b, end - b);
62666266
}
62676267
}
62686268

0 commit comments

Comments
 (0)