3131#include "SPParserUtils.h"
3232#include <stdint.h>
3333
/* Size in bytes of one size_t word; the block loop advances by this much. */
34+ #define SIZET (sizeof(size_t))
/* Low-bit mask used to test whether a pointer is aligned to a size_t boundary. */
35+ #define SIZET1 (SIZET - 1)
/* Shift amount that moves the most significant byte of a size_t down to the lowest byte. */
36+ #define SBYTE (SIZET1 * 8)
37+
/* A word with 0x01 in every byte (0x0101...01). */
3438#define ONEMASK ((size_t)(-1) / 0xFF)
/* A word with 0x80 in every byte: selects the high bit of each byte. */
39+ #define ONEMASK8 (ONEMASK * 0x80)
/* A word with 0xF0 in every byte: the unsigned product wraps to -(0x0F0F...0F),
   and subtracting 1 turns that into the bitwise complement of 0x0F0F...0F. */
40+ #define FMASK ((size_t)(-1)*(ONEMASK*0xf)-1)
3541
3642// adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
3743size_t utf8strlen (const char * _s )
3844{
45+
46+ /* [NSString length] counts UTF-16 code units, so a char > 0xFFFF has {length = 2}.
47+ To match that behaviour, "correct" the variable 'count' by subtracting the number
48+ of occurrences of a start byte whose high nibble is 0xF (4-byte UTF-8 char).
49+ Here we assume that only up to 4-byte UTF-8 chars
50+ are allowed [latest UTF-8 specification].
51+
52+ Marked in the source code by "CORRECT".
53+ */
54+
3955 const char * s ;
40- size_t count = 0 ;
41- size_t u ;
56+ long count = 0 ;
57+ size_t u = 0 ;
58+ size_t u1 = 0 ;
4259 unsigned char b ;
4360
61+
4462 /* Handle any initial misaligned bytes. */
45- for (s = _s ; (uintptr_t )(s ) & ( sizeof ( size_t ) - 1 ) ; s ++ ) {
63+ for (s = _s ; (uintptr_t )(s ) & SIZET1 ; s ++ ) {
4664 b = * s ;
4765
4866 /* Exit if we hit a zero byte. */
@@ -51,23 +69,33 @@ size_t utf8strlen(const char * _s)
5169
5270 /* Is this byte NOT the first byte of a character? */
5371 count += (b >> 7 ) & ((~b ) >> 6 );
72+
73+ /* CORRECT: subtract one for a byte whose high nibble is 0xF (4-byte start byte). */
74+ count -= (b & 0xf0 ) == 0xf0 ;
5475 }
5576
5677 /* Handle complete blocks. */
57- for (; ; s += sizeof ( size_t ) ) {
78+ for (; ; s += SIZET ) {
5879 /* Prefetch 256 bytes ahead. */
5980 __builtin_prefetch (& s [256 ], 0 , 0 );
6081
6182 /* Grab 4 or 8 bytes of UTF-8 data. */
6283 u = * (size_t * )(s );
6384
6485 /* Exit the loop if there are any zero bytes. */
65- if ((u - ONEMASK ) & (~u ) & ( ONEMASK * 0x80 ) )
86+ if ((u - ONEMASK ) & (~u ) & ONEMASK8 )
6687 break ;
6788
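89+ /* CORRECT: AND bits 7..4 of each byte into its bit 0, sum those bits into the top byte, and subtract the total. */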
90+ u1 = u & FMASK ;
91+ u1 = (u1 >> 7 ) & (u1 >> 6 ) & (u1 >> 5 ) & (u1 >> 4 );
92+ if (u1 ) count -= (u1 * ONEMASK ) >> SBYTE ;
93+
6894 /* Count bytes which are NOT the first byte of a character. */
69- u = ((u & (ONEMASK * 0x80 )) >> 7 ) & ((~u ) >> 6 );
70- count += (u * ONEMASK ) >> ((sizeof (size_t ) - 1 ) * 8 );
95+ u = ((u & ONEMASK8 ) >> 7 ) & ((~u ) >> 6 );
96+
97+ count += (u * ONEMASK ) >> SBYTE ;
98+
7199 }
72100
73101 /* Take care of any left-over bytes. */
@@ -80,6 +108,9 @@ size_t utf8strlen(const char * _s)
80108
81109 /* Is this byte NOT the first byte of a character? */
82110 count += (b >> 7 ) & ((~b ) >> 6 );
111+
112+ /* CORRECT: subtract one for a byte whose high nibble is 0xF (4-byte start byte). */
113+ count -= (b & 0xf0 ) == 0xf0 ;
83114 }
84115
85116done :
0 commit comments