Permalink
Browse files

Do UTF-8 encoding natively.

For strings consisting exclusively of ASCII characters, this is slightly
faster on the desktop JVM:

    length     benchmark  ns linear runtime
        64 NativeEncoder 103 ===
        64      GetBytes 113 ===
      1024 NativeEncoder 894 ============================
      1024      GetBytes 933 ==============================
    vm: java

It's substantially faster on dalvikvm:

    length     benchmark    us linear runtime
        64      GetBytes  6.46 ===========
        64 NativeEncoder  1.47 ==
      1024      GetBytes 16.20 ==============================
      1024 NativeEncoder 13.58 =========================
  • Loading branch information...
1 parent 35bbfb7 commit b2d19a9faee2e006339cf5a3e0a31378711e7f5f @swankjesse swankjesse committed Nov 30, 2014
Showing with 166 additions and 2 deletions.
  1. +65 −2 okio/src/main/java/okio/Buffer.java
  2. +101 −0 okio/src/test/java/okio/Utf8Test.java
@@ -557,13 +557,76 @@ public void clear() {
/**
 * Encodes {@code string} as UTF-8 and appends the resulting bytes to this buffer.
 *
 * <p>Runs of ASCII characters are stored directly into the tail segment's array; all other
 * characters are emitted one byte at a time through {@code writeByte()}. A surrogate that is not
 * part of a valid high/low pair is written as a single {@code '?'}.
 *
 * @param string the UTF-16 text to encode
 * @return this buffer
 * @throws IllegalArgumentException if {@code string} is null
 */
@Override public Buffer writeUtf8(String string) {
if (string == null) throw new IllegalArgumentException("string == null");
- // TODO: inline UTF-8 encoding to save allocating a byte[]?
- return writeString(string, Util.UTF_8);
+
+ // Transcode a UTF-16 Java String to UTF-8 bytes.
+ for (int i = 0, length = string.length(); i < length;) {
+ int c = string.charAt(i);
+
+ if (c < 0x80) {
+ Segment tail = writableSegment(1);
+ byte[] data = tail.data;
+ int segmentOffset = tail.limit - i; // Add a string index to get the matching index in data.
+ int runLimit = Math.min(length, Segment.SIZE - segmentOffset); // Run stops at end of string or at data index Segment.SIZE.
+
+ // Emit a 7-bit character with 1 byte.
+ data[segmentOffset + i++] = (byte) c; // 0xxxxxxx
+
+ // Fast-path contiguous runs of ASCII characters. This is ugly, but yields a ~4x performance
+ // improvement over independent calls to writeByte().
+ while (i < runLimit) {
+ c = string.charAt(i);
+ if (c >= 0x80) break;
+ data[segmentOffset + i++] = (byte) c; // 0xxxxxxx
+ }
+
+ int runSize = i + segmentOffset - tail.limit; // Equivalent to i - (previous i).
+ tail.limit += runSize; // The bytes above bypassed writeByte(), so account for them here.
+ size += runSize;
+
+ } else if (c < 0x800) {
+ // Emit an 11-bit character with 2 bytes.
+ writeByte(c >> 6 | 0xc0); // 110xxxxx
+ writeByte(c & 0x3f | 0x80); // 10xxxxxx
+ i++;
+
+ } else if (c < 0xd800 || c > 0xdfff) {
+ // Emit a 16-bit character with 3 bytes.
+ writeByte(c >> 12 | 0xe0); // 1110xxxx
+ writeByte(c >> 6 & 0x3f | 0x80); // 10xxxxxx
+ writeByte(c & 0x3f | 0x80); // 10xxxxxx
+ i++;
+
+ } else {
+ // c is a surrogate. Make sure it is a high surrogate & that its successor is a low
+ // surrogate. If not, the UTF-16 is invalid, in which case we emit a replacement character.
+ int low = i + 1 < length ? string.charAt(i + 1) : 0; // 0 stands in for a missing successor; it fails the check below.
+ if (c > 0xdbff || low < 0xdc00 || low > 0xdfff) {
+ writeByte('?');
+ i++; // Consume only c; its successor is examined on the next iteration.
+ continue;
+ }
+
+ // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
+ // UTF-16 low surrogate: 110111yyyyyyyyyy (10 bits)
+ // Unicode code point: 00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
+ int codePoint = 0x010000 + ((c & ~0xd800) << 10 | low & ~0xdc00);
+
+ // Emit a 21-bit character with 4 bytes.
+ writeByte(codePoint >> 18 | 0xf0); // 11110xxx
+ writeByte(codePoint >> 12 & 0x3f | 0x80); // 10xxxxxx
+ writeByte(codePoint >> 6 & 0x3f | 0x80); // 10xxyyyy
+ writeByte(codePoint & 0x3f | 0x80); // 10yyyyyy
+ i += 2; // Both halves of the surrogate pair were consumed.
+ }
+ }
+
+ return this;
}
/**
 * Encodes {@code string} using {@code charset} and writes the resulting bytes.
 *
 * <p>UTF-8 is handed to {@link #writeUtf8}; every other charset is encoded with
 * {@code String.getBytes(Charset)} and passed to {@code write()}.
 *
 * @param string the text to encode
 * @param charset the charset to encode with
 * @return the value returned by {@code writeUtf8()} or {@code write()} (presumably this buffer)
 * @throws IllegalArgumentException if {@code string} or {@code charset} is null
 */
@Override public Buffer writeString(String string, Charset charset) {
if (string == null) throw new IllegalArgumentException("string == null");
if (charset == null) throw new IllegalArgumentException("charset == null");
+ if (charset.equals(Util.UTF_8)) return writeUtf8(string); // Skip the intermediate byte[] that getBytes() would allocate.
byte[] data = string.getBytes(charset);
return write(data, 0, data.length);
}
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2014 Square, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package okio;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public final class Utf8Test { // Tests for the UTF-8 encoding done by Buffer.writeUtf8().
+ @Test public void oneByteCharacters() throws Exception { // U+0000..U+007F encode to one byte.
+ assertEncoded("00", 0x00); // Smallest 1-byte character.
+ assertEncoded("20", ' ');
+ assertEncoded("7e", '~');
+ assertEncoded("7f", 0x7f); // Largest 1-byte character.
+ }
+ // U+0080..U+07FF encode to two bytes.
+ @Test public void twoByteCharacters() throws Exception {
+ assertEncoded("c280", 0x0080); // Smallest 2-byte character.
+ assertEncoded("c3bf", 0x00ff);
+ assertEncoded("c480", 0x0100);
+ assertEncoded("dfbf", 0x07ff); // Largest 2-byte character.
+ }
+ // U+0800..U+FFFF, excluding the surrogate range, encode to three bytes.
+ @Test public void threeByteCharacters() throws Exception {
+ assertEncoded("e0a080", 0x0800); // Smallest 3-byte character.
+ assertEncoded("e0bfbf", 0x0fff);
+ assertEncoded("e18080", 0x1000);
+ assertEncoded("e1bfbf", 0x1fff);
+ assertEncoded("ed8080", 0xd000);
+ assertEncoded("ed9fbf", 0xd7ff); // Largest character lower than the min surrogate.
+ assertEncoded("ee8080", 0xe000); // Smallest character greater than the max surrogate.
+ assertEncoded("eebfbf", 0xefff);
+ assertEncoded("ef8080", 0xf000);
+ assertEncoded("efbfbf", 0xffff); // Largest 3-byte character.
+ }
+ // Code points above U+FFFF (surrogate pairs in the String) encode to four bytes.
+ @Test public void fourByteCharacters() throws Exception {
+ assertEncoded("f0908080", 0x010000); // Smallest surrogate pair.
+ assertEncoded("f48fbfbf", 0x10ffff); // Largest code point expressible by UTF-16.
+ }
+ // A high surrogate with nothing after it is replaced by '?'.
+ @Test public void danglingHighSurrogate() throws Exception {
+ assertEncoded("3f", "\ud800"); // "?"
+ }
+ // A low surrogate that does not follow a high surrogate is replaced by '?'.
+ @Test public void lowSurrogateWithoutHighSurrogate() throws Exception {
+ assertEncoded("3f", "\udc00"); // "?"
+ }
+ // Only the unpaired high surrogate is replaced; the character after it is still encoded normally.
+ @Test public void highSurrogateFollowedByNonSurrogate() throws Exception {
+ assertEncoded("3f61", "\ud800\u0061"); // "?a": Following character is too low.
+ assertEncoded("3fee8080", "\ud800\ue000"); // "?\ue000": Following character is too high.
+ }
+ // An ASCII string of 2 * Segment.SIZE + 1 characters must match the platform's UTF-8 bytes.
+ @Test public void multipleSegmentString() throws Exception {
+ String a = TestUtil.repeat('a', Segment.SIZE + Segment.SIZE + 1);
+ Buffer encoded = new Buffer().writeUtf8(a);
+ Buffer expected = new Buffer().write(a.getBytes(Util.UTF_8));
+ assertEquals(expected, encoded);
+ }
+ // Consecutive writes sized so that "bb" starts at offset Segment.SIZE - 1 must read back intact.
+ @Test public void stringSpansSegments() throws Exception {
+ Buffer buffer = new Buffer();
+ String a = TestUtil.repeat('a', Segment.SIZE - 1);
+ String b = "bb";
+ String c = TestUtil.repeat('c', Segment.SIZE - 1);
+ buffer.writeUtf8(a);
+ buffer.writeUtf8(b);
+ buffer.writeUtf8(c);
+ assertEquals(a + b + c, buffer.readUtf8());
+ }
+ // Builds a String from the given code points and checks its UTF-8 encoding against hex.
+ private void assertEncoded(String hex, int... codePoints) throws Exception {
+ assertEncoded(hex, new String(codePoints, 0, codePoints.length));
+ }
+ // Checks hex against both the platform's UTF-8 encoder and Buffer.writeUtf8().
+ private void assertEncoded(String hex, String string) throws Exception {
+ ByteString expectedUtf8 = ByteString.decodeHex(hex);
+
+ // Confirm our expectations are consistent with the platform.
+ ByteString platformUtf8 = ByteString.of(string.getBytes("UTF-8"));
+ assertEquals(expectedUtf8, platformUtf8);
+
+ // Confirm our implementation matches those expectations.
+ ByteString actualUtf8 = new Buffer().writeUtf8(string).readByteString();
+ assertEquals(expectedUtf8, actualUtf8);
+ }
+}

0 comments on commit b2d19a9

Please sign in to comment.