Skip to content

Commit

Permalink
Update ZipString to deal with reads that do not return all data
Browse files Browse the repository at this point in the history
Refine the logic in `ZipString.hash` and `ZipString.compare` to deal
with the fact that a read operation may not return all available bytes.

Fixes gh-38751
  • Loading branch information
philwebb committed Dec 13, 2023
1 parent afad358 commit b4a4e91
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,19 @@ class ByteArrayDataBlock implements CloseableDataBlock {

private final byte[] bytes;

private final int maxReadSize;

/**
* Create a new {@link ByteArrayDataBlock} backed by the given bytes. Delegates to
* the two-argument constructor with a {@code maxReadSize} of {@code -1}, so the
* number of bytes returned by a single read is not capped.
* @param bytes the bytes to use
*/
ByteArrayDataBlock(byte... bytes) {
this(bytes, -1);
}

/**
* Create a new {@link ByteArrayDataBlock} backed by the given bytes that returns
* at most {@code maxReadSize} bytes from each read.
* @param bytes the bytes to use
* @param maxReadSize the maximum number of bytes a single read may return; a
* value that is not positive leaves reads uncapped
*/
ByteArrayDataBlock(byte[] bytes, int maxReadSize) {
this.bytes = bytes;
this.maxReadSize = maxReadSize;
}

@Override
Expand All @@ -49,6 +56,9 @@ public int read(ByteBuffer dst, long pos) throws IOException {
/**
* Copy bytes from the backing array into the destination buffer, starting at the
* given position. The amount copied is bounded by the bytes left in the array,
* the space left in the buffer and, when positive, the configured maximum read
* size.
* @param dst the buffer that receives the bytes
* @param pos the index in the backing array to start copying from
* @return the number of bytes copied into {@code dst}
*/
private int read(ByteBuffer dst, int pos) {
int available = this.bytes.length - pos;
int length = Math.min(available, dst.remaining());
if (this.maxReadSize > 0) {
// Simulate a source that hands back fewer bytes than were requested
length = Math.min(length, this.maxReadSize);
}
dst.put(this.bytes, pos, length);
return length;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,19 +108,15 @@ static int hash(ByteBuffer buffer, DataBlock dataBlock, long pos, int len, boole
byte[] bytes = buffer.array();
int hash = 0;
char lastChar = 0;
int codePointSize = 1;
while (len > 0) {
int count = readInBuffer(dataBlock, pos, buffer, len);
len -= count;
pos += count;
int count = readInBuffer(dataBlock, pos, buffer, len, codePointSize);
for (int byteIndex = 0; byteIndex < count;) {
int codePointSize = getCodePointSize(bytes, byteIndex);
codePointSize = getCodePointSize(bytes, byteIndex);
if (!hasEnoughBytes(byteIndex, codePointSize, count)) {
pos--;
len++;
break;
}
int codePoint = getCodePoint(bytes, byteIndex, codePointSize);
byteIndex += codePointSize;
if (codePoint <= 0xFFFF) {
lastChar = (char) (codePoint & 0xFFFF);
hash = 31 * hash + lastChar;
Expand All @@ -130,6 +126,10 @@ static int hash(ByteBuffer buffer, DataBlock dataBlock, long pos, int len, boole
hash = 31 * hash + Character.highSurrogate(codePoint);
hash = 31 * hash + Character.lowSurrogate(codePoint);
}
byteIndex += codePointSize;
pos += codePointSize;
len -= codePointSize;
codePointSize = 1;
}
}
hash = (addEndSlash && lastChar != '/') ? 31 * hash + '/' : hash;
Expand Down Expand Up @@ -198,19 +198,15 @@ private static int compare(ByteBuffer buffer, DataBlock dataBlock, long pos, int
int maxCharSequenceLength = (!addSlash) ? charSequence.length() : charSequence.length() + 1;
int result = 0;
byte[] bytes = buffer.array();
int codePointSize = 1;
while (len > 0) {
int count = readInBuffer(dataBlock, pos, buffer, len);
len -= count;
pos += count;
int count = readInBuffer(dataBlock, pos, buffer, len, codePointSize);
for (int byteIndex = 0; byteIndex < count;) {
int codePointSize = getCodePointSize(bytes, byteIndex);
codePointSize = getCodePointSize(bytes, byteIndex);
if (!hasEnoughBytes(byteIndex, codePointSize, count)) {
pos--;
len++;
break;
}
int codePoint = getCodePoint(bytes, byteIndex, codePointSize);
result += codePointSize;
if (codePoint <= 0xFFFF) {
char ch = (char) (codePoint & 0xFFFF);
if (charSequenceIndex >= maxCharSequenceLength
Expand All @@ -230,10 +226,14 @@ private static int compare(ByteBuffer buffer, DataBlock dataBlock, long pos, int
return -1;
}
}
byteIndex += codePointSize;
pos += codePointSize;
len -= codePointSize;
result += codePointSize;
codePointSize = 1;
if (compareType == CompareType.STARTS_WITH && charSequenceIndex >= charSequence.length()) {
return result;
}
byteIndex += codePointSize;
}
}
return (charSequenceIndex >= charSequence.length()) ? result : -1;
Expand Down Expand Up @@ -273,16 +273,22 @@ static String readString(DataBlock data, long pos, long len) {
}
}

private static int readInBuffer(DataBlock dataBlock, long pos, ByteBuffer buffer, int maxLen) throws IOException {
private static int readInBuffer(DataBlock dataBlock, long pos, ByteBuffer buffer, int maxLen, int minLen)
throws IOException {
buffer.clear();
if (buffer.remaining() > maxLen) {
buffer.limit(maxLen);
}
int count = dataBlock.read(buffer, pos);
if (count <= 0) {
throw new EOFException();
int result = 0;
while (result < minLen) {
int count = dataBlock.read(buffer, pos);
if (count <= 0) {
throw new EOFException();
}
result += count;
pos += count;
}
return count;
return result;
}

private static int getCodePointSize(byte[] bytes, int i) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,10 @@ void testHash(HashSourceType sourceType, String source, boolean addEndSlash, int
case DATA_BLOCK -> {
ByteArrayDataBlock dataBlock = new ByteArrayDataBlock(source.getBytes(StandardCharsets.UTF_8));
assertThat(ZipString.hash(null, dataBlock, 0, (int) dataBlock.size(), addEndSlash)).isEqualTo(expected);

}
case SINGLE_BYTE_READ_DATA_BLOCK -> {
ByteArrayDataBlock dataBlock = new ByteArrayDataBlock(source.getBytes(StandardCharsets.UTF_8), 1);
assertThat(ZipString.hash(null, dataBlock, 0, (int) dataBlock.size(), addEndSlash)).isEqualTo(expected);
}
}
}
Expand Down Expand Up @@ -187,7 +190,7 @@ private AbstractIntegerAssert<?> assertStartsWith(String source, CharSequence ch

enum HashSourceType {

STRING, CHAR_SEQUENCE, DATA_BLOCK
STRING, CHAR_SEQUENCE, DATA_BLOCK, SINGLE_BYTE_READ_DATA_BLOCK

}

Expand Down

0 comments on commit b4a4e91

Please sign in to comment.