Skip to content

Commit 2ab2ddf

Browse files
committed
Fix syntax highlighting for emoji et al. #2044
Voodoo magic provided by @Bibiko ;)
1 parent aa15d31 commit 2ab2ddf

File tree

3 files changed

+51
-10
lines changed

3 files changed

+51
-10
lines changed

Source/SPParserUtils.c

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,18 +31,36 @@
3131
#include "SPParserUtils.h"
3232
#include <stdint.h>
3333

34+
#define SIZET (sizeof(size_t))
35+
#define SIZET1 (SIZET - 1)
36+
#define SBYTE (SIZET1 * 8)
37+
3438
#define ONEMASK ((size_t)(-1) / 0xFF)
39+
#define ONEMASK8 (ONEMASK * 0x80)
40+
#define FMASK ((size_t)(-1)*(ONEMASK*0xf)-1)
3541

3642
// adapted from http://www.daemonology.net/blog/2008-06-05-faster-utf8-strlen.html
3743
size_t utf8strlen(const char * _s)
3844
{
45+
46+
/* Due to [NSString length] behaviour for chars > 0xFFFF {length = 2}
47+
"correct" the variable 'count' by subtracting the number
48+
of occurrences of the start byte 0xF0 (4-byte UTF-8 char).
49+
Here we assume that only up to 4-byte UTF-8 chars
50+
are allowed [latest UTF-8 specification].
51+
52+
Marked in the source code by "CORRECT".
53+
*/
54+
3955
const char * s;
40-
size_t count = 0;
41-
size_t u;
56+
long count = 0;
57+
size_t u = 0;
58+
size_t u1 = 0;
4259
unsigned char b;
4360

61+
4462
/* Handle any initial misaligned bytes. */
45-
for (s = _s; (uintptr_t)(s) & (sizeof(size_t) - 1); s++) {
63+
for (s = _s; (uintptr_t)(s) & SIZET1; s++) {
4664
b = *s;
4765

4866
/* Exit if we hit a zero byte. */
@@ -51,23 +69,33 @@ size_t utf8strlen(const char * _s)
5169

5270
/* Is this byte NOT the first byte of a character? */
5371
count += (b >> 7) & ((~b) >> 6);
72+
73+
/* CORRECT */
74+
count -= (b & 0xf0) == 0xf0;
5475
}
5576

5677
/* Handle complete blocks. */
57-
for (; ; s += sizeof(size_t)) {
78+
for (; ; s += SIZET) {
5879
/* Prefetch 256 bytes ahead. */
5980
__builtin_prefetch(&s[256], 0, 0);
6081

6182
/* Grab 4 or 8 bytes of UTF-8 data. */
6283
u = *(size_t *)(s);
6384

6485
/* Exit the loop if there are any zero bytes. */
65-
if ((u - ONEMASK) & (~u) & (ONEMASK * 0x80))
86+
if ((u - ONEMASK) & (~u) & ONEMASK8)
6687
break;
6788

89+
/* CORRECT */
90+
u1 = u & FMASK;
91+
u1 = (u1 >> 7) & (u1 >> 6) & (u1 >> 5) & (u1 >> 4);
92+
if (u1) count -= (u1 * ONEMASK) >> SBYTE;
93+
6894
/* Count bytes which are NOT the first byte of a character. */
69-
u = ((u & (ONEMASK * 0x80)) >> 7) & ((~u) >> 6);
70-
count += (u * ONEMASK) >> ((sizeof(size_t) - 1) * 8);
95+
u = ((u & ONEMASK8) >> 7) & ((~u) >> 6);
96+
97+
count += (u * ONEMASK) >> SBYTE;
98+
7199
}
72100

73101
/* Take care of any left-over bytes. */
@@ -80,6 +108,9 @@ size_t utf8strlen(const char * _s)
80108

81109
/* Is this byte NOT the first byte of a character? */
82110
count += (b >> 7) & ((~b) >> 6);
111+
112+
/* CORRECT */
113+
count -= (b & 0xf0) == 0xf0;
83114
}
84115

85116
done:

UnitTests/SPParserUtilsTest.m

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ - (void)testUtf8strlen;
4444
@implementation SPParserUtilsTest
4545

4646
- (void)testUtf8strlen {
47+
// NOTE!!: These tests do not verify that the utf8strlen() function works according to spec,
48+
// but whether it produces the same results as NSString for the same input.
49+
4750
const char *empty = "";
4851
NSString *emptyString = [NSString stringWithCString:empty encoding:NSUTF8StringEncoding];
4952
STAssertEquals(utf8strlen(empty),[emptyString length], @"empty string");
@@ -52,8 +55,8 @@ - (void)testUtf8strlen {
5255
// If any of those conditions fail, all of the following assumptions are moot.
5356
const char *charSeq = "\xF0\x9F\x8D\x8F"; //🍏
5457
NSString *charString = [NSString stringWithCString:charSeq encoding:NSUTF8StringEncoding];
55-
STAssertEquals(strlen(charSeq), 4, @"assumption about storage for binary C string");
56-
STAssertEquals([charString length], 2, @"assumption about NSString internal storage of string");
58+
STAssertEquals(strlen(charSeq), (size_t)4, @"assumption about storage for binary C string");
59+
STAssertEquals([charString length], (NSUInteger)2, @"assumption about NSString internal storage of string");
5760

5861
const char *singleByteSeq = "Hello World!";
5962
NSString *singleByteString = [NSString stringWithCString:singleByteSeq encoding:NSUTF8StringEncoding];

sequel-pro.xcodeproj/xcshareddata/xcschemes/Sequel Pro.xcscheme

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
<TestAction
2626
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
2727
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
28-
shouldUseLaunchSchemeArgsEnv = "YES"
28+
shouldUseLaunchSchemeArgsEnv = "NO"
2929
buildConfiguration = "Debug">
3030
<Testables>
3131
<TestableReference
@@ -48,6 +48,13 @@
4848
ReferencedContainer = "container:sequel-pro.xcodeproj">
4949
</BuildableReference>
5050
</MacroExpansion>
51+
<EnvironmentVariables>
52+
<EnvironmentVariable
53+
key = "OBJC_DISABLE_GC"
54+
value = "YES"
55+
isEnabled = "YES">
56+
</EnvironmentVariable>
57+
</EnvironmentVariables>
5158
</TestAction>
5259
<LaunchAction
5360
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"

0 commit comments

Comments
 (0)