Allow Uid#decodeId to decode from a byte array slice

Today we only allow decoding byte arrays where the data starts at offset 0 and has the same length as the array. Allowing IDs to be decoded from a slice of a byte array makes decoding cheaper when the ID comes from, for instance, a term dictionary or a BytesRef, since the bytes no longer have to be copied into a new array first. Relates to elastic#26931
s1monw · Oct 12, 2017 · cb88fc1 · cb88fc1
1 parent cee9640
commit cb88fc1
Show file tree

Hide file tree

Showing 2 changed files with 72 additions and 47 deletions.
diff --git a/core/src/main/java/org/elasticsearch/index/mapper/Uid.java b/core/src/main/java/org/elasticsearch/index/mapper/Uid.java
@@ -135,36 +135,36 @@ static boolean isURLBase64WithoutPadding(String id) {
         // 'xxx=' and 'xxx' could be considered the same id
         final int length = id.length();
         switch (length & 0x03) {
-        case 0:
-            break;
-        case 1:
-            return false;
-        case 2:
-            // the last 2 symbols (12 bits) are encoding 1 byte (8 bits)
-            // so the last symbol only actually uses 8-6=2 bits and can only take 4 values
-            char last = id.charAt(length - 1);
-            if (last != 'A' && last != 'Q' && last != 'g' && last != 'w') {
+            case 0:
+                break;
+            case 1:
                 return false;
-            }
-            break;
-        case 3:
-            // The last 3 symbols (18 bits) are encoding 2 bytes (16 bits)
-            // so the last symbol only actually uses 16-12=4 bits and can only take 16 values
-            last = id.charAt(length - 1);
-            if (last != 'A' && last != 'E' && last != 'I' && last != 'M' && last != 'Q'&& last != 'U'&& last != 'Y'
+            case 2:
+                // the last 2 symbols (12 bits) are encoding 1 byte (8 bits)
+                // so the last symbol only actually uses 8-6=2 bits and can only take 4 values
+                char last = id.charAt(length - 1);
+                if (last != 'A' && last != 'Q' && last != 'g' && last != 'w') {
+                    return false;
+                }
+                break;
+            case 3:
+                // The last 3 symbols (18 bits) are encoding 2 bytes (16 bits)
+                // so the last symbol only actually uses 16-12=4 bits and can only take 16 values
+                last = id.charAt(length - 1);
+                if (last != 'A' && last != 'E' && last != 'I' && last != 'M' && last != 'Q'&& last != 'U'&& last != 'Y'
                     && last != 'c'&& last != 'g'&& last != 'k' && last != 'o' && last != 's' && last != 'w'
                     && last != '0' && last != '4' && last != '8') {
-                return false;
-            }
-            break;
-        default:
-            // number & 0x03 is always in [0,3]
-            throw new AssertionError("Impossible case");
+                    return false;
+                }
+                break;
+            default:
+                // number & 0x03 is always in [0,3]
+                throw new AssertionError("Impossible case");
         }
         for (int i = 0; i < length; ++i) {
             final char c = id.charAt(i);
             final boolean allowed =
-                    (c >= '0' && c <= '9') ||
+                (c >= '0' && c <= '9') ||
                     (c >= 'A' && c <= 'Z') ||
                     (c >= 'a' && c <= 'z') ||
                     c == '-' || c == '_';
@@ -244,16 +244,16 @@ public static BytesRef encodeId(String id) {
         }
     }
 
-    private static String decodeNumericId(byte[] idBytes) {
-        assert Byte.toUnsignedInt(idBytes[0]) == NUMERIC;
-        int length = (idBytes.length - 1) * 2;
+    private static String decodeNumericId(byte[] idBytes, int offset, int len) {
+        assert Byte.toUnsignedInt(idBytes[offset]) == NUMERIC;
+        int length = (len - 1) * 2;
         char[] chars = new char[length];
-        for (int i = 1; i < idBytes.length; ++i) {
-            final int b = Byte.toUnsignedInt(idBytes[i]);
+        for (int i = 1; i < len; ++i) {
+            final int b = Byte.toUnsignedInt(idBytes[offset + i]);
             final int b1 = (b >>> 4);
             final int b2 = b & 0x0f;
             chars[(i - 1) * 2] = (char) (b1 + '0');
-            if (i == idBytes.length - 1 && b2 == 0x0f) {
+            if (i == len - 1 && b2 == 0x0f) {
                 length--;
                 break;
             }
@@ -262,33 +262,41 @@ private static String decodeNumericId(byte[] idBytes) {
         return new String(chars, 0, length);
     }
 
-    private static String decodeUtf8Id(byte[] idBytes) {
-        assert Byte.toUnsignedInt(idBytes[0]) == UTF8;
-        return new BytesRef(idBytes, 1, idBytes.length - 1).utf8ToString();
+    private static String decodeUtf8Id(byte[] idBytes, int offset, int length) {
+        assert Byte.toUnsignedInt(idBytes[offset]) == UTF8;
+        return new BytesRef(idBytes, offset + 1, length - 1).utf8ToString();
     }
 
-    private static String decodeBase64Id(byte[] idBytes) {
-        assert Byte.toUnsignedInt(idBytes[0]) <= BASE64_ESCAPE;
-        if (Byte.toUnsignedInt(idBytes[0]) == BASE64_ESCAPE) {
-            idBytes = Arrays.copyOfRange(idBytes, 1, idBytes.length);
+    private static String decodeBase64Id(byte[] idBytes, int offset, int length) {
+        assert Byte.toUnsignedInt(idBytes[offset]) <= BASE64_ESCAPE;
+        if (Byte.toUnsignedInt(idBytes[offset]) == BASE64_ESCAPE) {
+            idBytes = Arrays.copyOfRange(idBytes, offset + 1, offset + length);
+        } else if ((idBytes.length == length && offset == 0) == false) { // no need to copy if it's not a slice
+            idBytes = Arrays.copyOfRange(idBytes, offset, offset + length);
         }
         return Base64.getUrlEncoder().withoutPadding().encodeToString(idBytes);
     }
 
     /** Decode an indexed id back to its original form.
      *  @see #encodeId */
     public static String decodeId(byte[] idBytes) {
-        if (idBytes.length == 0) {
+        return decodeId(idBytes, 0, idBytes.length);
+    }
+
+    /** Decode an indexed id back to its original form.
+     *  @see #encodeId */
+    public static String decodeId(byte[] idBytes, int offset, int length) {
+        if (length == 0) {
             throw new IllegalArgumentException("Ids can't be empty");
         }
-        final int magicChar = Byte.toUnsignedInt(idBytes[0]);
+        final int magicChar = Byte.toUnsignedInt(idBytes[offset]);
         switch (magicChar) {
-        case NUMERIC:
-            return decodeNumericId(idBytes);
-        case UTF8:
-            return decodeUtf8Id(idBytes);
-        default:
-            return decodeBase64Id(idBytes);
+            case NUMERIC:
+                return decodeNumericId(idBytes, offset, length);
+            case UTF8:
+                return decodeUtf8Id(idBytes, offset, length);
+            default:
+                return decodeBase64Id(idBytes, offset, length);
         }
     }
 }
diff --git a/core/src/test/java/org/elasticsearch/index/mapper/UidTests.java b/core/src/test/java/org/elasticsearch/index/mapper/UidTests.java
@@ -79,7 +79,7 @@ public void testEncodeUTF8Ids() {
         for (int iter = 0; iter < iters; ++iter) {
             final String id = TestUtil.randomRealisticUnicodeString(random(), 1, 10);
             BytesRef encoded = Uid.encodeId(id);
-            assertEquals(id, Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length)));
+            assertEquals(id, doDecodeId(encoded));
             assertTrue(encoded.length <= 1 + new BytesRef(id).length);
         }
     }
@@ -93,7 +93,7 @@ public void testEncodeNumericIds() {
                 id = "0" + id;
             }
             BytesRef encoded = Uid.encodeId(id);
-            assertEquals(id, Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length)));
+            assertEquals(id, doDecodeId(encoded));
             assertEquals(1 + (id.length() + 1) / 2, encoded.length);
         }
     }
@@ -105,9 +105,26 @@ public void testEncodeBase64Ids() {
             random().nextBytes(binaryId);
             final String id = Base64.getUrlEncoder().withoutPadding().encodeToString(binaryId);
             BytesRef encoded = Uid.encodeId(id);
-            assertEquals(id, Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length)));
+            assertEquals(id, doDecodeId(encoded));
             assertTrue(encoded.length <= 1 + binaryId.length);
         }
     }
 
+    private static String doDecodeId(BytesRef encoded) {
+
+        if (randomBoolean()) {
+            return Uid.decodeId(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length));
+        } else {
+            if (randomBoolean()) {
+                BytesRef slicedCopy = new BytesRef(randomIntBetween(encoded.length + 1, encoded.length + 100));
+                slicedCopy.offset = randomIntBetween(1, slicedCopy.bytes.length - encoded.length);
+                slicedCopy.length = encoded.length;
+                System.arraycopy(encoded.bytes, encoded.offset, slicedCopy.bytes, slicedCopy.offset, encoded.length);
+                assertArrayEquals(Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length),
+                    Arrays.copyOfRange(slicedCopy.bytes, slicedCopy.offset, slicedCopy.offset + slicedCopy.length));
+                encoded = slicedCopy;
+            }
+            return Uid.decodeId(encoded.bytes, encoded.offset, encoded.length);
+        }
+    }
 }