/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc;

import com.facebook.presto.orc.memory.AbstractAggregatedMemoryContext;
import com.facebook.presto.orc.memory.AggregatedMemoryContext;
import com.facebook.presto.orc.metadata.ExceptionWrappingMetadataReader;
import com.facebook.presto.orc.metadata.Footer;
import com.facebook.presto.orc.metadata.Metadata;
import com.facebook.presto.orc.metadata.MetadataReader;
import com.facebook.presto.orc.metadata.PostScript;
import com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion;
import com.facebook.presto.orc.stream.OrcInputStream;
import com.facebook.presto.spi.type.Type;
import com.google.common.base.Joiner;
import io.airlift.log.Logger;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;
import io.airlift.units.DataSize;
import org.joda.time.DateTimeZone;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import static io.airlift.slice.SizeOf.SIZE_OF_BYTE;
import static java.lang.Math.min;
import static java.lang.Math.toIntExact;
import static java.util.Objects.requireNonNull;

public class OrcReader
{
    public static final int MAX_BATCH_SIZE = 1024;

    private static final Logger log = Logger.get(OrcReader.class);

    private static final Slice MAGIC = Slices.utf8Slice("ORC");

    private static final int CURRENT_MAJOR_VERSION = 0;
    private static final int CURRENT_MINOR_VERSION = 12;
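    // together these identify ORC file format version 0.12, the newest version this reader targets
    // (checkOrcVersion below warns when a file reports a newer version)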
    private static final int EXPECTED_FOOTER_SIZE = 16 * 1024;

    private final OrcDataSource orcDataSource;
    private final ExceptionWrappingMetadataReader metadataReader;
    private final DataSize maxMergeDistance;
    private final DataSize maxReadSize;
    private final DataSize maxBlockSize;
    private final HiveWriterVersion hiveWriterVersion;
    private final int bufferSize;
    private final Footer footer;
    private final Metadata metadata;

    private Optional<OrcDecompressor> decompressor = Optional.empty();

    // This is based on the Apache Hive ORC code
    public OrcReader(OrcDataSource orcDataSource, MetadataReader delegate, DataSize maxMergeDistance, DataSize maxReadSize, DataSize maxBlockSize)
            throws IOException
    {
        orcDataSource = wrapWithCacheIfTiny(requireNonNull(orcDataSource, "orcDataSource is null"), maxMergeDistance);
        this.orcDataSource = orcDataSource;
        this.metadataReader = new ExceptionWrappingMetadataReader(orcDataSource.getId(), requireNonNull(delegate, "delegate is null"));
        this.maxMergeDistance = requireNonNull(maxMergeDistance, "maxMergeDistance is null");
        this.maxReadSize = requireNonNull(maxReadSize, "maxReadSize is null");
        this.maxBlockSize = requireNonNull(maxBlockSize, "maxBlockSize is null");

        //
        // Read the file tail:
        //
        // variable: Footer
        // variable: Metadata
        // variable: PostScript - contains length of footer and metadata
        // 3 bytes: file magic "ORC"
        // 1 byte: postScriptSize = PostScript + Magic

        // figure out the size of the file using the option or filesystem
        long size = orcDataSource.getSize();
        if (size <= 0) {
            throw new OrcCorruptionException(orcDataSource.getId(), "Invalid file size %s", size);
        }

        // Read the tail of the file
        byte[] buffer = new byte[toIntExact(min(size, EXPECTED_FOOTER_SIZE))];
        orcDataSource.readFully(size - buffer.length, buffer);

        // get length of PostScript - last byte of the file
        int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff;
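        // (& 0xff widens the signed byte to its unsigned value, so postscript lengths up to 255 are representable)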

        // make sure this is an ORC file and not an RCFile or something else
        verifyOrcFooter(orcDataSource, postScriptSize, buffer);

        // decode the post script
        int postScriptOffset = buffer.length - SIZE_OF_BYTE - postScriptSize;
        PostScript postScript = metadataReader.readPostScript(buffer, postScriptOffset, postScriptSize);

        // verify this is a supported version
        checkOrcVersion(orcDataSource, postScript.getVersion());

        this.bufferSize = toIntExact(postScript.getCompressionBlockSize());

        // check compression codec is supported
        switch (postScript.getCompression()) {
            case UNCOMPRESSED:
                break;
            case ZLIB:
                decompressor = Optional.of(new OrcZlibDecompressor(orcDataSource.getId(), bufferSize));
                break;
            case SNAPPY:
                decompressor = Optional.of(new OrcSnappyDecompressor(orcDataSource.getId(), bufferSize));
                break;
            case ZSTD:
                decompressor = Optional.of(new OrcZstdDecompressor(orcDataSource.getId(), bufferSize));
                break;
            default:
                throw new UnsupportedOperationException("Unsupported compression type: " + postScript.getCompression());
        }

        this.hiveWriterVersion = postScript.getHiveWriterVersion();
        int footerSize = toIntExact(postScript.getFooterLength());
        int metadataSize = toIntExact(postScript.getMetadataLength());

        // check if extra bytes need to be read
        Slice completeFooterSlice;
        int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE;
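        // For example (hypothetical sizes): footerSize = 20_000, metadataSize = 3_000, and postScriptSize = 100
        // give completeFooterSize = 23_101 bytes, which exceeds the 16_384-byte initial read, so the branch
        // below fetches the missing 23_101 - 16_384 = 6_717 bytes.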
        if (completeFooterSize > buffer.length) {
            // allocate a new buffer large enough for the complete footer
            byte[] newBuffer = new byte[completeFooterSize];
            completeFooterSlice = Slices.wrappedBuffer(newBuffer);

            // initial read was not large enough, so read missing section
            orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length);

            // copy already read bytes into the new buffer
            completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer);
        }
        else {
            // footer is already in the bytes in buffer, just adjust position, length
            completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize);
        }

        // read metadata
        Slice metadataSlice = completeFooterSlice.slice(0, metadataSize);
        try (InputStream metadataInputStream = new OrcInputStream(orcDataSource.getId(), metadataSlice.getInput(), decompressor, new AggregatedMemoryContext())) {
            this.metadata = metadataReader.readMetadata(hiveWriterVersion, metadataInputStream);
        }

        // read footer
        Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize);
        try (InputStream footerInputStream = new OrcInputStream(orcDataSource.getId(), footerSlice.getInput(), decompressor, new AggregatedMemoryContext())) {
            this.footer = metadataReader.readFooter(hiveWriterVersion, footerInputStream);
        }
    }

    public List<String> getColumnNames()
    {
        return footer.getTypes().get(0).getFieldNames();
    }

    public Footer getFooter()
    {
        return footer;
    }

    public Metadata getMetadata()
    {
        return metadata;
    }

    public int getBufferSize()
    {
        return bufferSize;
    }

    public OrcRecordReader createRecordReader(Map<Integer, Type> includedColumns, OrcPredicate predicate, DateTimeZone hiveStorageTimeZone, AbstractAggregatedMemoryContext systemMemoryUsage)
            throws IOException
    {
        return createRecordReader(includedColumns, predicate, 0, orcDataSource.getSize(), hiveStorageTimeZone, systemMemoryUsage);
    }

    public OrcRecordReader createRecordReader(
            Map<Integer, Type> includedColumns,
            OrcPredicate predicate,
            long offset,
            long length,
            DateTimeZone hiveStorageTimeZone,
            AbstractAggregatedMemoryContext systemMemoryUsage)
            throws IOException
    {
        return new OrcRecordReader(
                requireNonNull(includedColumns, "includedColumns is null"),
                requireNonNull(predicate, "predicate is null"),
                footer.getNumberOfRows(),
                footer.getStripes(),
                footer.getFileStats(),
                metadata.getStripeStatsList(),
                orcDataSource,
                offset,
                length,
                footer.getTypes(),
                decompressor,
                footer.getRowsInRowGroup(),
                requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"),
                hiveWriterVersion,
                metadataReader,
                maxMergeDistance,
                maxReadSize,
                maxBlockSize,
                footer.getUserMetadata(),
                systemMemoryUsage);
    }

    private static OrcDataSource wrapWithCacheIfTiny(OrcDataSource dataSource, DataSize maxCacheSize)
    {
        if (dataSource instanceof CachingOrcDataSource) {
            return dataSource;
        }
        if (dataSource.getSize() > maxCacheSize.toBytes()) {
            return dataSource;
        }
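
        // the whole file fits within maxCacheSize: cache it as one range spanning the entire file;
        // the lookup function ignores the desired offset and always returns that full-file range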
        DiskRange diskRange = new DiskRange(0, toIntExact(dataSource.getSize()));
        return new CachingOrcDataSource(dataSource, desiredOffset -> diskRange);
    }

    /**
     * Verify this is an ORC file to prevent users from trying to read text
     * files or RC files as ORC files.
     */
    // This is based on the Apache Hive ORC code
    private static void verifyOrcFooter(
            OrcDataSource source,
            int postScriptSize,
            byte[] buffer)
            throws IOException
    {
        int magicLength = MAGIC.length();
        if (postScriptSize < magicLength + 1) {
            throw new OrcCorruptionException(source.getId(), "Invalid postscript length %s", postScriptSize);
        }
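        // the magic occupies the last magicLength bytes of the postscript, immediately before the final size byte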
        if (!MAGIC.equals(Slices.wrappedBuffer(buffer, buffer.length - 1 - magicLength, magicLength))) {
            // Old versions of ORC (0.11) wrote the magic to the head of the file
            byte[] headerMagic = new byte[magicLength];
            source.readFully(0, headerMagic);

            // if it isn't there, this isn't an ORC file
            if (!MAGIC.equals(Slices.wrappedBuffer(headerMagic))) {
                throw new OrcCorruptionException(source.getId(), "Invalid postscript");
            }
        }
    }

    /**
     * Check to see if this ORC file is from a future version and if so,
     * warn the user that we may not be able to read all of the column encodings.
     */
    // This is based on the Apache Hive ORC code
    private static void checkOrcVersion(OrcDataSource orcDataSource, List<Integer> version)
    {
        if (version.size() >= 1) {
            int major = version.get(0);
            int minor = 0;
            if (version.size() > 1) {
                minor = version.get(1);
            }
            if (major > CURRENT_MAJOR_VERSION || (major == CURRENT_MAJOR_VERSION && minor > CURRENT_MINOR_VERSION)) {
                log.warn("ORC file %s was written by a newer Hive version %s. This file may not be readable by this version of Hive (%s.%s).",
                        orcDataSource,
                        Joiner.on('.').join(version),
                        CURRENT_MAJOR_VERSION,
                        CURRENT_MINOR_VERSION);
            }
        }
    }
}
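
// A minimal usage sketch (hedged; not part of the original file). The file path and size
// values are hypothetical, and the FileOrcDataSource/OrcMetadataReader constructor
// signatures, as well as nextBatch()/readBlock(), reflect contemporaneous versions of
// this module and may differ in other versions.
//
//   OrcDataSource dataSource = new FileOrcDataSource(
//           new File("/tmp/example.orc"),             // hypothetical path
//           new DataSize(1, Unit.MEGABYTE),           // maxMergeDistance
//           new DataSize(8, Unit.MEGABYTE),           // maxReadSize
//           new DataSize(8, Unit.MEGABYTE));          // streamBufferSize
//   OrcReader reader = new OrcReader(
//           dataSource,
//           new OrcMetadataReader(),
//           new DataSize(1, Unit.MEGABYTE),           // maxMergeDistance
//           new DataSize(8, Unit.MEGABYTE),           // maxReadSize
//           new DataSize(16, Unit.MEGABYTE));         // maxBlockSize
//   OrcRecordReader recordReader = reader.createRecordReader(
//           ImmutableMap.of(0, BigintType.BIGINT),    // read column 0 as BIGINT
//           OrcPredicate.TRUE,                        // no stripe/row-group pruning
//           DateTimeZone.UTC,
//           new AggregatedMemoryContext());
//   for (int batchSize = recordReader.nextBatch(); batchSize > 0; batchSize = recordReader.nextBatch()) {
//       Block block = recordReader.readBlock(BigintType.BIGINT, 0);
//       // ... consume the block ...
//   }
//   recordReader.close();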