31
31
#include "SPParserUtils.h"
32
32
#include <stdint.h>
33
33
34
+ #define SIZET (sizeof(size_t)) /* number of bytes in one size_t word */
35
+ #define SIZET1 (SIZET - 1) /* low-bit mask: nonzero when ANDed with an address that is not size_t-aligned */
36
+ #define SBYTE (SIZET1 * 8) /* shift count that moves the most significant byte of a size_t down to the lowest byte */
37
+
34
38
#define ONEMASK ((size_t)(-1) / 0xFF) /* 0x01 in every byte of a size_t (0x0101...01) */
39
+ #define ONEMASK8 (ONEMASK * 0x80) /* 0x80 in every byte: the high bit of each byte */
40
+ #define FMASK ((size_t)(-1)*(ONEMASK*0xf)-1) /* works out to 0xF0 in every byte: (size_t)(-1) * x wraps to -x, so -(0x0F0F...0F) - 1 == 0xF0F0...F0 */
35
41
36
42
// adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
37
43
size_t utf8strlen (const char * _s )
38
44
{
45
+
46
+ /* Due to [NSString length] behaviour for chars > 0xFFFF {length = 2}
47
+ "correct" the variable 'count' by subtracting the number
48
+ of occurrences of the start byte 0xF0 (4-byte UTF-8 char).
49
+ Here we assume that only up to 4-byte UTF-8 chars
50
+ are allowed [latest UTF-8 specification].
51
+
52
+ Marked in the source code by "CORRECT".
53
+ */
54
+
39
55
const char * s ;
40
- size_t count = 0 ;
41
- size_t u ;
56
+ long count = 0 ;
57
+ size_t u = 0 ;
58
+ size_t u1 = 0 ;
42
59
unsigned char b ;
43
60
61
+
44
62
/* Handle any initial misaligned bytes. */
45
- for (s = _s ; (uintptr_t )(s ) & ( sizeof ( size_t ) - 1 ) ; s ++ ) {
63
+ for (s = _s ; (uintptr_t )(s ) & SIZET1 ; s ++ ) {
46
64
b = * s ;
47
65
48
66
/* Exit if we hit a zero byte. */
@@ -51,23 +69,33 @@ size_t utf8strlen(const char * _s)
51
69
52
70
/* Is this byte NOT the first byte of a character? */
53
71
count += (b >> 7 ) & ((~b ) >> 6 );
72
+
73
+ /* CORRECT */
74
+ count -= (b & 0xf0 ) == 0xf0 ;
54
75
}
55
76
56
77
/* Handle complete blocks. */
57
- for (; ; s += sizeof ( size_t ) ) {
78
+ for (; ; s += SIZET ) {
58
79
/* Prefetch 256 bytes ahead. */
59
80
__builtin_prefetch (& s [256 ], 0 , 0 );
60
81
61
82
/* Grab 4 or 8 bytes of UTF-8 data. */
62
83
u = * (size_t * )(s );
63
84
64
85
/* Exit the loop if there are any zero bytes. */
65
- if ((u - ONEMASK ) & (~u ) & ( ONEMASK * 0x80 ) )
86
+ if ((u - ONEMASK ) & (~u ) & ONEMASK8 )
66
87
break ;
67
88
89
+ /* CORRECT */
90
+ u1 = u & FMASK ;
91
+ u1 = (u1 >> 7 ) & (u1 >> 6 ) & (u1 >> 5 ) & (u1 >> 4 );
92
+ if (u1 ) count -= (u1 * ONEMASK ) >> SBYTE ;
93
+
68
94
/* Count bytes which are NOT the first byte of a character. */
69
- u = ((u & (ONEMASK * 0x80 )) >> 7 ) & ((~u ) >> 6 );
70
- count += (u * ONEMASK ) >> ((sizeof (size_t ) - 1 ) * 8 );
95
+ u = ((u & ONEMASK8 ) >> 7 ) & ((~u ) >> 6 );
96
+
97
+ count += (u * ONEMASK ) >> SBYTE ;
98
+
71
99
}
72
100
73
101
/* Take care of any left-over bytes. */
@@ -80,6 +108,9 @@ size_t utf8strlen(const char * _s)
80
108
81
109
/* Is this byte NOT the first byte of a character? */
82
110
count += (b >> 7 ) & ((~b ) >> 6 );
111
+
112
+ /* CORRECT */
113
+ count -= (b & 0xf0 ) == 0xf0 ;
83
114
}
84
115
85
116
done :
0 commit comments