Skip to content

Commit

Permalink
PARQUET-372: Do not write stats larger than 4k.
Browse files Browse the repository at this point in the history
This updates the stats conversion to check whether the min and max
values for page stats are larger than 4k. If so, no statistics for a
page are written.

Author: Ryan Blue <blue@apache.org>

Closes apache#275 from rdblue/PARQUET-372-fix-min-max-for-long-values and squashes the following commits:

61e05d9 [Ryan Blue] PARQUET-372: Add comment to explain not truncating values.
fbbc1c4 [Ryan Blue] PARQUET-372: Do not write stats larger than 4k.
  • Loading branch information
rdblue committed Jan 6, 2017
1 parent bfc65e4 commit c706fe5
Show file tree
Hide file tree
Showing 10 changed files with 232 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ public byte[] getMinBytes() {
return min == null ? null : min.getBytes();
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || ((min.length() + max.length()) < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand All @@ -77,11 +82,19 @@ else if (!this.isEmpty())
return "no stats for this column";
}

/**
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
*/
@Deprecated
public void updateStats(Binary min_value, Binary max_value) {
if (min.compareTo(min_value) > 0) { min = min_value.copy(); }
if (max.compareTo(max_value) < 0) { max = max_value.copy(); }
}

/**
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
*/
@Deprecated
public void initializeStats(Binary min_value, Binary max_value) {
min = min_value.copy();
max = max_value.copy();
Expand All @@ -98,14 +111,26 @@ public Binary genericGetMax() {
return max;
}

/**
* @deprecated use {@link #genericGetMax()}, will be removed in 2.0.0
*/
@Deprecated
public Binary getMax() {
return max;
}

/**
* @deprecated use {@link #genericGetMin()}, will be removed in 2.0.0
*/
@Deprecated
public Binary getMin() {
return min;
}

/**
* @deprecated use {@link #updateStats(Binary)}, will be removed in 2.0.0
*/
@Deprecated
public void setMinMax(Binary min, Binary max) {
this.max = max;
this.min = min;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.booleanToBytes(min);
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (2 < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.longToBytes(Double.doubleToLongBits(min));
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (16 < size);
}

@Override
public String toString() {
if(this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.intToBytes(Float.floatToIntBits(min));
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (8 < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.intToBytes(min);
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (8 < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ public byte[] getMinBytes() {
return BytesUtils.longToBytes(min);
}

@Override
public boolean isSmallerThan(long size) {
return !hasNonNullValue() || (16 < size);
}

@Override
public String toString() {
if (this.hasNonNullValue())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,14 @@ public void mergeStatistics(Statistics stats) {
*/
abstract public byte[] getMinBytes();

/**
* Abstract method to return whether the min and max values fit in the given
* size.
* @param size a size in bytes
* @return true iff the min and max values are less than size bytes
*/
abstract public boolean isSmallerThan(long size);

/**
* toString() to display min, max, num_nulls in a string
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ public class ParquetMetadataConverter {

public static final MetadataFilter NO_FILTER = new NoFilter();
public static final MetadataFilter SKIP_ROW_GROUPS = new SkipMetadataFilter();
public static final long MAX_STATS_SIZE = 4096; // limit stats to 4k

private static final Log LOG = Log.getLog(ParquetMetadataConverter.class);

Expand Down Expand Up @@ -284,7 +285,11 @@ dataPageType, getEncoding(encoding),
public static Statistics toParquetStatistics(
org.apache.parquet.column.statistics.Statistics statistics) {
Statistics stats = new Statistics();
if (!statistics.isEmpty()) {
// Don't write stats larger than the max size rather than truncating. The
// rationale is that some engines may use the minimum value in the page as
// the true minimum for aggregations and there is no way to mark that a
// value has been truncated and is a lower bound and not in the page.
if (!statistics.isEmpty() && statistics.isSmallerThan(MAX_STATS_SIZE)) {
stats.setNull_count(statistics.getNumNulls());
if (statistics.hasNonNullValue()) {
stats.setMax(statistics.getMaxBytes());
Expand All @@ -293,6 +298,7 @@ public static Statistics toParquetStatistics(
}
return stats;
}

/**
* @deprecated Replaced by {@link #fromParquetStatistics(
* String createdBy, Statistics statistics, PrimitiveTypeName type)}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,21 @@
import java.util.TreeSet;

import com.google.common.collect.Sets;
import org.apache.parquet.Version;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.statistics.BinaryStatistics;
import org.apache.parquet.column.statistics.BooleanStatistics;
import org.apache.parquet.column.statistics.DoubleStatistics;
import org.apache.parquet.column.statistics.FloatStatistics;
import org.apache.parquet.column.statistics.IntStatistics;
import org.apache.parquet.column.statistics.LongStatistics;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.api.Binary;
import org.junit.Assert;
import org.junit.Test;

Expand Down Expand Up @@ -357,5 +366,153 @@ public void testEncodingsCache() {
assertEquals("java.util.Collections$UnmodifiableSet", res1.getClass().getName());
assertEquals("java.util.Collections$UnmodifiableSet", res2.getClass().getName());
assertEquals("java.util.Collections$UnmodifiableSet", res3.getClass().getName());
}
}

@Test
public void testBinaryStats() {
// make fake stats and verify the size check
BinaryStatistics stats = new BinaryStatistics();
stats.incrementNumNulls(3004);
byte[] min = new byte[904];
byte[] max = new byte[2388];
stats.updateStats(Binary.fromConstantByteArray(min));
stats.updateStats(Binary.fromConstantByteArray(max));
long totalLen = min.length + max.length;
Assert.assertFalse("Should not be smaller than min + max size",
stats.isSmallerThan(totalLen));
Assert.assertTrue("Should be smaller than min + max size + 1",
stats.isSmallerThan(totalLen + 1));

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertArrayEquals("Min should match", min, formatStats.getMin());
Assert.assertArrayEquals("Max should match", max, formatStats.getMax());
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());

// convert to empty stats because the values are too large
stats.setMinMaxFromBytes(max, max);

formatStats = ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertFalse("Min should not be set", formatStats.isSetMin());
Assert.assertFalse("Max should not be set", formatStats.isSetMax());
Assert.assertFalse("Num nulls should not be set",
formatStats.isSetNull_count());

Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatistics(
Version.FULL_VERSION, formatStats, PrimitiveTypeName.BINARY);

Assert.assertTrue(roundTripStats.isEmpty());
}

@Test
public void testIntegerStats() {
// make fake stats and verify the size check
IntStatistics stats = new IntStatistics();
stats.incrementNumNulls(3004);
int min = Integer.MIN_VALUE;
int max = Integer.MAX_VALUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, BytesUtils.bytesToInt(formatStats.getMin()));
Assert.assertEquals("Max should match",
max, BytesUtils.bytesToInt(formatStats.getMax()));
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}

@Test
public void testLongStats() {
// make fake stats and verify the size check
LongStatistics stats = new LongStatistics();
stats.incrementNumNulls(3004);
long min = Long.MIN_VALUE;
long max = Long.MAX_VALUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, BytesUtils.bytesToLong(formatStats.getMin()));
Assert.assertEquals("Max should match",
max, BytesUtils.bytesToLong(formatStats.getMax()));
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}

@Test
public void testFloatStats() {
// make fake stats and verify the size check
FloatStatistics stats = new FloatStatistics();
stats.incrementNumNulls(3004);
float min = Float.MIN_VALUE;
float max = Float.MAX_VALUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMin())),
0.000001);
Assert.assertEquals("Max should match",
max, Float.intBitsToFloat(BytesUtils.bytesToInt(formatStats.getMax())),
0.000001);
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}

@Test
public void testDoubleStats() {
// make fake stats and verify the size check
DoubleStatistics stats = new DoubleStatistics();
stats.incrementNumNulls(3004);
double min = Double.MIN_VALUE;
double max = Double.MAX_VALUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMin())),
0.000001);
Assert.assertEquals("Max should match",
max, Double.longBitsToDouble(BytesUtils.bytesToLong(formatStats.getMax())),
0.000001);
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}

@Test
public void testBooleanStats() {
// make fake stats and verify the size check
BooleanStatistics stats = new BooleanStatistics();
stats.incrementNumNulls(3004);
boolean min = Boolean.FALSE;
boolean max = Boolean.TRUE;
stats.updateStats(min);
stats.updateStats(max);

org.apache.parquet.format.Statistics formatStats =
ParquetMetadataConverter.toParquetStatistics(stats);

Assert.assertEquals("Min should match",
min, BytesUtils.bytesToBool(formatStats.getMin()));
Assert.assertEquals("Max should match",
max, BytesUtils.bytesToBool(formatStats.getMax()));
Assert.assertEquals("Num nulls should match",
3004, formatStats.getNull_count());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,15 @@ private void validateStatsForPage(DataPage page, DictionaryPage dict, ColumnDesc
PrimitiveConverter converter = getValidatingConverter(page, desc.getType());
Statistics stats = getStatisticsFromPageHeader(page);

if (stats.isEmpty()) {
// stats are empty if num nulls = 0 and there are no non-null values
// this happens if stats are not written (e.g., when stats are too big)
System.err.println(String.format(
"No stats written for page=%s col=%s",
page, Arrays.toString(desc.getPath())));
return;
}

long numNulls = 0;
ColumnReaderImpl column = new ColumnReaderImpl(desc, reader, converter, null);
for (int i = 0; i < reader.getTotalValueCount(); i += 1) {
Expand Down

0 comments on commit c706fe5

Please sign in to comment.