|
@@ -31,18 +31,36 @@ |
|
|
#include "SPParserUtils.h" |
|
|
#include <stdint.h> |
|
|
|
|
|
/* Number of bytes in one size_t word; the word-at-a-time loop advances by this much. */
#define SIZET (sizeof(size_t)) |
|

/* SIZET - 1: ANDed with the pointer value to detect addresses that are not word-aligned. */
#define SIZET1 (SIZET - 1) |
|

/* Bit width of every byte except the top one; shifting a word right by this
   amount leaves only its most significant byte. */
#define SBYTE (SIZET1 * 8) |
|

|
|

/* Word with 0x01 in every byte (all-ones divided by 0xFF). */
#define ONEMASK ((size_t)(-1) / 0xFF) |
|

/* Word with 0x80 in every byte, i.e. the high bit of each byte. */
#define ONEMASK8 (ONEMASK * 0x80) |
|

/* Word with 0xF0 in every byte: multiplying by (size_t)(-1) negates
   0x0F0F...0F modulo the word size, giving 0xF0F0...F1, and subtracting 1
   yields 0xF0F0...F0. Used to isolate the high nibble of each byte. */
#define FMASK ((size_t)(-1)*(ONEMASK*0xf)-1) |
|
|
|
|
|
// adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html |
|
|
size_t utf8strlen(const char * _s) |
|
|
{ |
|
|
|
|
|
/* Due to [NSString length] behaviour for chars > 0xFFFF {length = 2} |
|

"correct" the variable 'count' by subtracting the number |
|

of occurrences of a start byte whose high nibble is 0xF (4-byte UTF-8 char). |
|
|
Here we assume that only up to 4-byte UTF-8 chars |
|
|
are allowed [latest UTF-8 specification]. |
|
|
|
|
|
Marked in the source code by "CORRECT". |
|
|
*/ |
|
|
|
|
|
const char * s; |
|
|
size_t count = 0; |
|
|
size_t u; |
|
|
long count = 0; |
|
|
size_t u = 0; |
|
|
size_t u1 = 0; |
|
|
unsigned char b; |
|
|
|
|
|
|
|
|
/* Handle any initial misaligned bytes. */ |
|
|
for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) { |
|
|
for (s = _s; (uintptr_t)(s) & SIZET1; s++) { |
|
|
b = *s; |
|
|
|
|
|
/* Exit if we hit a zero byte. */ |
|
@@ -51,23 +69,33 @@ size_t utf8strlen(const char * _s) |
|
|
|
|
|
/* Is this byte NOT the first byte of a character? */ |
|
|
count += (b >> 7) & ((~b) >> 6); |
|
|
|
|
|
/* CORRECT */ |
|
|
count -= (b & 0xf0) == 0xf0; |
|
|
} |
|
|
|
|
|
/* Handle complete blocks. */ |
|
|
for (; ; s += sizeof(size_t)) { |
|
|
for (; ; s += SIZET) { |
|
|
/* Prefetch 256 bytes ahead. */ |
|
|
__builtin_prefetch(&s[256], 0, 0); |
|
|
|
|
|
/* Grab 4 or 8 bytes of UTF-8 data. */ |
|
|
u = *(size_t *)(s); |
|
|
|
|
|
/* Exit the loop if there are any zero bytes. */ |
|
|
if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80)) |
|
|
if ((u - ONEMASK) & (~u) & ONEMASK8) |
|
|
break; |
|
|
|
|
|
/* CORRECT */ |
|
|
u1 = u & FMASK; |
|
|
u1 = (u1 >> 7) & (u1 >> 6) & (u1 >> 5) & (u1 >> 4); |
|
|
if (u1) count -= (u1 * ONEMASK) >> SBYTE; |
|
|
|
|
|
/* Count bytes which are NOT the first byte of a character. */ |
|
|
u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6); |
|
|
count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8); |
|
|
u = ((u & ONEMASK8) >> 7) & ((~u) >> 6); |
|
|
|
|
|
count += (u * ONEMASK) >> SBYTE; |
|
|
|
|
|
} |
|
|
|
|
|
/* Take care of any left-over bytes. */ |
|
@@ -80,6 +108,9 @@ size_t utf8strlen(const char * _s) |
|
|
|
|
|
/* Is this byte NOT the first byte of a character? */ |
|
|
count += (b >> 7) & ((~b) >> 6); |
|
|
|
|
|
/* CORRECT */ |
|
|
count -= (b & 0xf0) == 0xf0; |
|
|
} |
|
|
|
|
|
done: |
|
|