Skip to content

Commit

Permalink
Add getApproximateRegionLogicalSizeInBytes() to Block
Browse files Browse the repository at this point in the history
getApproximateRegionLogicalSizeInBytes provides a faster implementation
to get approximate logical sizes of the blocks.
  • Loading branch information
Ying Su authored and mbasmanova committed Oct 14, 2020
1 parent 465644c commit 7bffeb5
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 11 deletions.
16 changes: 16 additions & 0 deletions presto-common/src/main/java/com/facebook/presto/common/Page.java
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,22 @@ public long getLogicalSizeInBytes()
return logicalSizeInBytes;
}

/**
 * Returns the logical size of this page, or an approximation of it when the exact value is not available.
 * <p>
 * A negative {@code logicalSizeInBytes} is treated as "not calculated yet"; in that case the result is the sum of
 * {@code getApproximateRegionLogicalSizeInBytes(0, positionCount)} over every block, and the field is left untouched.
 *
 * @return the stored logical size if it is non-negative, otherwise the summed per-block approximation
 */
public long getApproximateLogicalSizeInBytes()
{
    if (logicalSizeInBytes >= 0) {
        return logicalSizeInBytes;
    }

    long estimate = 0;
    for (Block block : blocks) {
        int blockPositions = block.getPositionCount();
        estimate += block.getApproximateRegionLogicalSizeInBytes(0, blockPositions);
    }
    return estimate;
}

public long getRetainedSizeInBytes()
{
long retainedSizeInBytes = this.retainedSizeInBytes;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,18 @@ public long getRegionLogicalSizeInBytes(int position, int length)
return getRawElementBlock().getRegionLogicalSizeInBytes(valueStart, valueEnd - valueStart) + ((Integer.BYTES + Byte.BYTES) * (long) length);
}

/**
 * Returns the approximate logical size of the region {@code [position, position + length)}.
 * <p>
 * The result is the approximate logical size of the element range covered by the region plus
 * {@code Integer.BYTES + Byte.BYTES} bytes of per-position overhead.
 *
 * @param position first position of the region
 * @param length number of positions in the region
 * @return approximate logical size of the region in bytes
 */
@Override
public long getApproximateRegionLogicalSizeInBytes(int position, int length)
{
    int positionCount = getPositionCount();
    checkValidRegion(positionCount, position, length);

    int valueStart = getOffset(position);
    int valueEnd = getOffset(position + length);

    // Widen to long before multiplying so the per-position overhead cannot overflow int,
    // matching getRegionLogicalSizeInBytes().
    return getRawElementBlock().getApproximateRegionLogicalSizeInBytes(valueStart, valueEnd - valueStart) + ((Integer.BYTES + Byte.BYTES) * (long) length);
}

@Override
public long getPositionsSizeInBytes(boolean[] positions)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,22 @@ public long getRegionLogicalSizeInBytes(int position, int length)
Integer.BYTES * HASH_MULTIPLIER * entryCount;
}

/**
 * Returns the approximate logical size of the region {@code [position, position + length)}.
 * <p>
 * The result adds up the approximate logical sizes of the key and value entries covered by the region,
 * the per-position overhead, and the hash table overhead per entry.
 *
 * @param position first position of the region
 * @param length number of positions in the region
 * @return approximate logical size of the region in bytes
 */
@Override
public long getApproximateRegionLogicalSizeInBytes(int position, int length)
{
    int positionCount = getPositionCount();
    checkValidRegion(positionCount, position, length);

    int entriesStart = getOffset(position);
    int entriesEnd = getOffset(position + length);
    int entryCount = entriesEnd - entriesStart;

    // The overhead terms are widened to long before the final multiplication so that
    // large regions cannot overflow int arithmetic.
    return getRawKeyBlock().getApproximateRegionLogicalSizeInBytes(entriesStart, entryCount) +
            getRawValueBlock().getApproximateRegionLogicalSizeInBytes(entriesStart, entryCount) +
            (Integer.BYTES + Byte.BYTES) * (long) length + // offsets and mapIsNull
            Integer.BYTES * HASH_MULTIPLIER * (long) entryCount; // hashtables
}

@Override
public long getPositionsSizeInBytes(boolean[] positions)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,23 @@ public long getRegionLogicalSizeInBytes(int position, int length)
return regionLogicalSizeInBytes;
}

/**
 * Returns the approximate logical size of the region {@code [position, position + length)}.
 * <p>
 * The result is the per-position overhead plus the approximate logical size of the matching
 * range in each of the {@code numFields} raw field blocks.
 *
 * @param position first position of the region
 * @param length number of positions in the region
 * @return approximate logical size of the region in bytes
 */
@Override
public long getApproximateRegionLogicalSizeInBytes(int position, int length)
{
    int positionCount = getPositionCount();
    checkValidRegion(positionCount, position, length);

    int startFieldBlockOffset = getFieldBlockOffset(position);
    int fieldBlockLength = getFieldBlockOffset(position + length) - startFieldBlockOffset;

    // Widen to long before multiplying so the overhead term cannot overflow int.
    long approximateLogicalSizeInBytes = (Integer.BYTES + Byte.BYTES) * (long) length; // offsets and rowIsNull
    for (int i = 0; i < numFields; i++) {
        approximateLogicalSizeInBytes += getRawFieldBlocks()[i].getApproximateRegionLogicalSizeInBytes(startFieldBlockOffset, fieldBlockLength);
    }

    return approximateLogicalSizeInBytes;
}

@Override
public long getPositionsSizeInBytes(boolean[] positions)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,20 @@ default long getRegionLogicalSizeInBytes(int position, int length)
return getRegionSizeInBytes(position, length);
}

/**
* Returns the approximate logical size of {@code block.getRegion(position, length)}.
* This method is faster than {@code getRegionLogicalSizeInBytes()}.
* <p>
* For dictionary blocks, this counts the amortized flattened size of the included positions.
* For example, for a DictionaryBlock with 5 ids [1, 1, 1, 1, 1] and a dictionary of
* VariableWidthBlock with 3 elements of sizes [9, 5, 7], the result of
* getApproximateRegionLogicalSizeInBytes(0, 5) would be (9 + 5 + 7) / 3 * 5 = 35,
* while getRegionLogicalSizeInBytes(0, 5) would be 5 * 5 = 25.
* <p>
* The default implementation simply returns {@code getRegionSizeInBytes(position, length)}.
*
* @param position first position of the region
* @param length number of positions in the region
* @return the approximate logical size of the region in bytes
*/
default long getApproximateRegionLogicalSizeInBytes(int position, int length)
{
return getRegionSizeInBytes(position, length);
}

/**
* Returns the size of all positions marked true in the positions array.
* This is equivalent to multiple calls of {@code block.getRegionSizeInBytes(position, length)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,13 @@ public long getRegionLogicalSizeInBytes(int positionOffset, int length)
return sizeInBytes;
}

/**
 * Returns the approximate logical size of {@code length} positions of this block.
 * <p>
 * The approximate logical size of the whole dictionary is amortized evenly over its positions and
 * scaled by {@code length}. The {@code position} argument does not influence the result.
 *
 * @param position first position of the region (not used by this estimate)
 * @param length number of positions in the region
 * @return 0 when the dictionary has no positions, otherwise the amortized estimate in bytes
 */
@Override
public long getApproximateRegionLogicalSizeInBytes(int position, int length)
{
    int dictionarySize = dictionary.getPositionCount();
    if (dictionarySize == 0) {
        return 0;
    }

    long wholeDictionaryBytes = dictionary.getApproximateRegionLogicalSizeInBytes(0, dictionarySize);
    // Multiply first, then divide, to keep the same rounding as a single expression.
    return wholeDictionaryBytes * length / dictionarySize;
}

@Override
public long getPositionsSizeInBytes(boolean[] positions)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,12 @@ public long getRegionLogicalSizeInBytes(int position, int length)
return length * value.getLogicalSizeInBytes();
}

/**
 * Returns the approximate logical size of {@code length} positions of this block.
 * <p>
 * The estimate is the approximate logical size of the first position of {@code value},
 * multiplied by the number of positions in the requested region.
 *
 * @param position first position of the region (does not affect the result)
 * @param length number of positions in the region
 * @return approximate logical size of the region in bytes
 */
@Override
public long getApproximateRegionLogicalSizeInBytes(int position, int length)
{
    // Scale by the requested region length rather than the whole block's positionCount,
    // consistent with getRegionLogicalSizeInBytes().
    return length * value.getApproximateRegionLogicalSizeInBytes(0, 1);
}

@Override
public long getPositionsSizeInBytes(boolean[] positions)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import com.facebook.presto.common.block.BlockBuilder;
import com.facebook.presto.common.block.BlockBuilderStatus;
import com.facebook.presto.common.block.BlockEncodingSerde;
import com.facebook.presto.common.block.DictionaryBlock;
import com.facebook.presto.common.block.DictionaryId;
import com.google.common.collect.ImmutableList;
import io.airlift.slice.DynamicSliceOutput;
Expand All @@ -37,6 +38,7 @@
import java.util.UUID;
import java.util.function.Supplier;

import static com.facebook.airlift.testing.Assertions.assertBetweenInclusive;
import static com.facebook.presto.common.type.BigintType.BIGINT;
import static com.facebook.presto.common.type.VarbinaryType.VARBINARY;
import static com.facebook.presto.common.type.VarcharType.VARCHAR;
Expand All @@ -45,6 +47,8 @@
import static io.airlift.slice.SizeOf.SIZE_OF_LONG;
import static io.airlift.slice.SizeOf.SIZE_OF_SHORT;
import static io.airlift.slice.SizeOf.sizeOf;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.Math.toIntExact;
import static java.lang.String.format;
import static java.util.Arrays.fill;
Expand Down Expand Up @@ -215,41 +219,107 @@ private void assertBlockSize(Block block)
{
// Checks the size accessors of `block` (sizeInBytes, logicalSizeInBytes, approximate logical size and
// getPositionsSizeInBytes) against a copy produced by copyBlockViaBlockSerde, for the whole block and for
// each of the two pieces returned by splitBlock(block, 2).
// Asserting on `block` is not very effective because most blocks passed to this method are compact.
// Therefore, we split the `block` into two and assert again.
//------------------Test Whole Block Sizes---------------------------------------------------
// Assert sizeInBytes for the whole block.
long expectedBlockSize = copyBlockViaBlockSerde(block).getSizeInBytes();
assertEquals(block.getSizeInBytes(), expectedBlockSize);
assertEquals(block.getRegionSizeInBytes(0, block.getPositionCount()), expectedBlockSize);

// Assert logicalSize for the whole block. Note that copyBlockViaBlockSerde would flatten DictionaryBlock or RleBlock.
long logicalSizeInBytes = block.getLogicalSizeInBytes();

long expectedLogicalBlockSize = copyBlockViaBlockSerde(block).getLogicalSizeInBytes();
assertEquals(block.getLogicalSizeInBytes(), expectedLogicalBlockSize);
assertEquals(logicalSizeInBytes, expectedLogicalBlockSize);
assertEquals(block.getRegionLogicalSizeInBytes(0, block.getPositionCount()), expectedLogicalBlockSize);

// Assert approximateLogicalSize for the whole block
long approximateLogicalSizeInBytes = block.getApproximateRegionLogicalSizeInBytes(0, block.getPositionCount());

// By default the approximate size is expected to equal the exact logical size. For a DictionaryBlock the
// expectation is the dictionary's approximate size amortized over the dictionary positions and scaled by
// the block's position count.
// NOTE(review): this divides by dictionaryPositionCount without a zero check — presumably the dictionaries
// under test are never empty; confirm.
long expectedApproximateLogicalBlockSize = expectedLogicalBlockSize;
if (block instanceof DictionaryBlock) {
int dictionaryPositionCount = ((DictionaryBlock) block).getDictionary().getPositionCount();
expectedApproximateLogicalBlockSize = ((DictionaryBlock) block).getDictionary().getApproximateRegionLogicalSizeInBytes(0, dictionaryPositionCount) * block.getPositionCount() / dictionaryPositionCount;
}
assertEquals(approximateLogicalSizeInBytes, expectedApproximateLogicalBlockSize);

//------------------Test First Half Sizes---------------------------------------------------
List<Block> splitBlock = splitBlock(block, 2);
Block firstHalf = splitBlock.get(0);
int firstHalfPositionCount = firstHalf.getPositionCount();

// Assert sizeInBytes for the firstHalf block.
long expectedFirstHalfSize = copyBlockViaBlockSerde(firstHalf).getSizeInBytes();
assertEquals(firstHalf.getSizeInBytes(), expectedFirstHalfSize);
assertEquals(block.getRegionSizeInBytes(0, firstHalf.getPositionCount()), expectedFirstHalfSize);
assertEquals(block.getRegionSizeInBytes(0, firstHalfPositionCount), expectedFirstHalfSize);

// Assert logicalSize for the firstHalf block
long firstHalfLogicalSizeInBytes = firstHalf.getLogicalSizeInBytes();
long expectedFirstHalfLogicalSize = copyBlockViaBlockSerde(firstHalf).getLogicalSizeInBytes();
assertEquals(firstHalf.getLogicalSizeInBytes(), expectedFirstHalfLogicalSize);
assertEquals(firstHalf.getRegionLogicalSizeInBytes(0, firstHalf.getPositionCount()), expectedFirstHalfLogicalSize);

assertEquals(firstHalfLogicalSizeInBytes, expectedFirstHalfLogicalSize);
assertEquals(firstHalf.getRegionLogicalSizeInBytes(0, firstHalfPositionCount), expectedFirstHalfLogicalSize);

// Assert approximateLogicalSize for the firstHalf block using logicalSize
long approximateFirstHalfLogicalSize = firstHalf.getApproximateRegionLogicalSizeInBytes(0, firstHalfPositionCount);

// Same expectation rule as for the whole block: exact logical size, or the amortized dictionary size
// when the half is a DictionaryBlock.
long expectedApproximateFirstHalfLogicalSize = expectedFirstHalfLogicalSize;
if (firstHalf instanceof DictionaryBlock) {
int dictionaryPositionCount = ((DictionaryBlock) firstHalf).getDictionary().getPositionCount();
expectedApproximateFirstHalfLogicalSize = ((DictionaryBlock) firstHalf).getDictionary().getApproximateRegionLogicalSizeInBytes(0, dictionaryPositionCount) * firstHalfPositionCount / dictionaryPositionCount;
}
assertEquals(approximateFirstHalfLogicalSize, expectedApproximateFirstHalfLogicalSize);

// Assert approximateLogicalSize for the firstHalf block using the ratio of firstHalf logicalSize vs whole block logicalSize
long expectedApproximateFirstHalfLogicalSizeFromUnsplittedBlock = logicalSizeInBytes == 0 ?
approximateLogicalSizeInBytes :
approximateLogicalSizeInBytes * firstHalfLogicalSizeInBytes / logicalSizeInBytes;
assertBetweenInclusive(
approximateFirstHalfLogicalSize,
// Allow for some error margins due to skew in blocks
min(expectedApproximateFirstHalfLogicalSizeFromUnsplittedBlock - 3, (long) (expectedApproximateFirstHalfLogicalSizeFromUnsplittedBlock * 0.7)),
max(expectedApproximateFirstHalfLogicalSizeFromUnsplittedBlock + 3, (long) (expectedApproximateFirstHalfLogicalSizeFromUnsplittedBlock * 1.3)));

//------------------Test Second Half Sizes---------------------------------------------------
Block secondHalf = splitBlock.get(1);
int secondHalfPositionCount = secondHalf.getPositionCount();

// Assert sizeInBytes for the secondHalf block.
long expectedSecondHalfSize = copyBlockViaBlockSerde(secondHalf).getSizeInBytes();
assertEquals(secondHalf.getSizeInBytes(), expectedSecondHalfSize);
assertEquals(block.getRegionSizeInBytes(firstHalf.getPositionCount(), secondHalf.getPositionCount()), expectedSecondHalfSize);
assertEquals(block.getRegionSizeInBytes(firstHalfPositionCount, secondHalfPositionCount), expectedSecondHalfSize);

// Assert logicalSize for the secondHalf block.
long secondHalfLogicalSizeInBytes = secondHalf.getLogicalSizeInBytes();
long expectedSecondHalfLogicalSize = copyBlockViaBlockSerde(secondHalf).getLogicalSizeInBytes();
assertEquals(secondHalf.getLogicalSizeInBytes(), expectedSecondHalfLogicalSize);
assertEquals(secondHalf.getRegionLogicalSizeInBytes(0, secondHalf.getPositionCount()), expectedSecondHalfLogicalSize);

assertEquals(secondHalfLogicalSizeInBytes, expectedSecondHalfLogicalSize);
assertEquals(secondHalf.getRegionLogicalSizeInBytes(0, secondHalfPositionCount), expectedSecondHalfLogicalSize);

// Assert approximateLogicalSize for the secondHalf block using logicalSize
long approximateSecondHalfLogicalSize = secondHalf.getApproximateRegionLogicalSizeInBytes(0, secondHalfPositionCount);

// Unlike the first half, the baseline expectation here is the approximate size of the serde copy
// rather than its exact logical size; a DictionaryBlock still uses the amortized dictionary size.
long expectedApproximateSecondHalfLogicalSize = copyBlockViaBlockSerde(secondHalf).getApproximateRegionLogicalSizeInBytes(0, secondHalfPositionCount);
if (secondHalf instanceof DictionaryBlock) {
int dictionaryPositionCount = ((DictionaryBlock) secondHalf).getDictionary().getPositionCount();
expectedApproximateSecondHalfLogicalSize = ((DictionaryBlock) secondHalf).getDictionary().getApproximateRegionLogicalSizeInBytes(0, dictionaryPositionCount) * secondHalfPositionCount / dictionaryPositionCount;
}
assertEquals(approximateSecondHalfLogicalSize, expectedApproximateSecondHalfLogicalSize);

// Assert approximateLogicalSize for the secondHalf block using the ratio of secondHalf logicalSize vs whole block logicalSize
long expectedApproximateSecondHalfLogicalSizeFromUnsplittedBlock = logicalSizeInBytes == 0 ?
approximateLogicalSizeInBytes :
approximateLogicalSizeInBytes * secondHalfLogicalSizeInBytes / logicalSizeInBytes;
assertBetweenInclusive(
approximateSecondHalfLogicalSize,
// Allow for some error margins due to skew in blocks
min(expectedApproximateSecondHalfLogicalSizeFromUnsplittedBlock - 3, (long) (expectedApproximateSecondHalfLogicalSizeFromUnsplittedBlock * 0.7)),
max(expectedApproximateSecondHalfLogicalSizeFromUnsplittedBlock + 3, (long) (expectedApproximateSecondHalfLogicalSizeFromUnsplittedBlock * 1.3)));

//----------------Test getPositionsSizeInBytes----------------------------------------
// Select the first half, then every position, then only the second half, and compare with the sizes computed above.
boolean[] positions = new boolean[block.getPositionCount()];
fill(positions, 0, firstHalf.getPositionCount(), true);
fill(positions, 0, firstHalfPositionCount, true);
assertEquals(block.getPositionsSizeInBytes(positions), expectedFirstHalfSize);
fill(positions, true);
assertEquals(block.getPositionsSizeInBytes(positions), expectedBlockSize);
fill(positions, 0, firstHalf.getPositionCount(), false);
fill(positions, 0, firstHalfPositionCount, false);
assertEquals(block.getPositionsSizeInBytes(positions), expectedSecondHalfSize);
}

Expand Down

0 comments on commit 7bffeb5

Please sign in to comment.