forked from twitter/elephant-bird
-
Notifications
You must be signed in to change notification settings - Fork 3
/
LzoProtobufB64LinePigLoader.java
95 lines (82 loc) · 3.6 KB
/
LzoProtobufB64LinePigLoader.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package com.twitter.elephantbird.pig.load;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.ResourceSchema;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.protobuf.Message;
import com.twitter.elephantbird.mapreduce.input.LzoProtobufB64LineInputFormat;
import com.twitter.elephantbird.mapreduce.input.LzoRecordReader;
import com.twitter.elephantbird.mapreduce.io.ProtobufWritable;
import com.twitter.elephantbird.pig.util.PigUtil;
import com.twitter.elephantbird.pig.util.ProjectedProtobufTupleFactory;
import com.twitter.elephantbird.pig.util.ProtobufToPig;
import com.twitter.elephantbird.util.Protobufs;
import com.twitter.elephantbird.util.TypeRef;
/**
 * This is the base class for all base64 encoded, line-oriented protocol buffer based pig loaders.
 * Data is expected to be one base64 encoded serialized protocol buffer per line. The specific
 * protocol buffer is a template parameter.<br>
 * Initialize with a String argument that represents the fully qualified class name of the
 * protocol buffer class to be loaded, or call {@link #setTypeRef(TypeRef)} before use.<br>
 * The no-arg constructor will not work on its own and is only there for internal Pig reasons:
 * until a type has been set, {@link #getSchema(String, Job)}, {@link #getInputFormat()} and
 * {@link #getNext()} fail with an {@link IllegalArgumentException}.
 *
 * @param <M> the protocol buffer message type produced by this loader
 */
public class LzoProtobufB64LinePigLoader<M extends Message> extends LzoBaseLoadFunc {
  private static final Logger LOG = LoggerFactory.getLogger(LzoProtobufB64LinePigLoader.class);

  /** Runtime handle on the message type; stays {@code null} until a type has been supplied. */
  protected TypeRef<M> typeRef = null;
  private final ProtobufToPig protoToPig = new ProtobufToPig();
  /** Lazily built in {@link #getNext()}; cleared whenever the type reference changes. */
  private ProjectedProtobufTupleFactory<M> tupleTemplate = null;

  /**
   * Creates a loader without a protocol buffer type. A type must be supplied through
   * {@link #setTypeRef(TypeRef)} before the loader can be used.
   */
  public LzoProtobufB64LinePigLoader() {
  }

  /**
   * Creates a loader for the given protocol buffer class.
   *
   * @param protoClassName fully qualified class name of the generated Protocol Buffer to be loaded.
   */
  public LzoProtobufB64LinePigLoader(String protoClassName) {
    // Named differently from the field so the local does not shadow it.
    TypeRef<M> protoTypeRef = PigUtil.getProtobufTypeRef(protoClassName);
    setTypeRef(protoTypeRef);
  }

  /**
   * Set the type parameter so it doesn't get erased by Java. Must be called before getNext!
   * <p>
   * If the reference differs from the current one, the cached tuple factory is discarded so
   * that the next {@link #getNext()} call rebuilds it for the new type.
   *
   * @param typeRef reference to the protocol buffer message class to load
   */
  public void setTypeRef(TypeRef<M> typeRef) {
    if (this.typeRef != typeRef) {
      // A factory built for the previous type must not be reused for the new one.
      tupleTemplate = null;
    }
    this.typeRef = typeRef;
  }

  /**
   * Delegates projection push-down to {@code pushProjectionHelper}, which is defined outside
   * this class (presumably it records the {@code requiredFieldList} read by {@link #getNext()}
   * -- confirm in the base class).
   *
   * @param requiredFieldList the fields Pig requires from this loader
   * @return whatever the helper returns for the requested fields
   * @throws FrontendException if the helper rejects the projection
   */
  @Override
  public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList)
      throws FrontendException {
    return pushProjectionHelper(requiredFieldList);
  }

  /**
   * Reads the next protocol buffer message and converts it to a Pig tuple using the
   * projected tuple factory.
   * <p>
   * A small fraction of bad records in input are tolerated.
   * See {@link LzoRecordReader} for more information on error handling.
   *
   * @return the tuple for the next message, or {@code null} when no message was returned
   * @throws IOException if reading the next value fails
   * @throws IllegalArgumentException if no protocol buffer type has been set
   */
  @Override
  public Tuple getNext() throws IOException {
    if (tupleTemplate == null) {
      // Fail with a descriptive message instead of handing a null type to the factory.
      requireTypeRef("records can be read");
      tupleTemplate = new ProjectedProtobufTupleFactory<M>(typeRef, requiredFieldList);
    }

    M value = getNextBinaryValue(typeRef);
    return value != null ? tupleTemplate.newTuple(value) : null;
  }

  /**
   * Builds the Pig schema from the descriptor of the configured protocol buffer class.
   *
   * @param filename not used by this implementation
   * @param job not used by this implementation
   * @return the Pig schema for the protocol buffer message type
   * @throws IOException if the schema conversion fails
   * @throws IllegalArgumentException if no protocol buffer type has been set
   */
  @Override
  public ResourceSchema getSchema(String filename, Job job) throws IOException {
    // Same precondition as getInputFormat(); without it a missing type surfaces as a bare NPE.
    requireTypeRef("a schema can be generated");
    return new ResourceSchema(
        protoToPig.toSchema(Protobufs.getMessageDescriptor(typeRef.getRawClass())));
  }

  /**
   * Creates the input format for the configured protocol buffer type.
   *
   * @return a new {@link LzoProtobufB64LineInputFormat} bound to the configured type
   * @throws IOException declared by the overridden method; not thrown by this implementation
   * @throws IllegalArgumentException if no protocol buffer type has been set
   */
  @Override
  public InputFormat<LongWritable, ProtobufWritable<M>> getInputFormat() throws IOException {
    requireTypeRef("an InputFormat can be created");
    return new LzoProtobufB64LineInputFormat<M>(typeRef);
  }

  /**
   * Verifies that a protocol buffer type has been configured; logs and throws otherwise.
   *
   * @param action what the caller is about to do, inserted into the error message
   * @throws IllegalArgumentException if {@link #typeRef} is {@code null}
   */
  private void requireTypeRef(String action) {
    if (typeRef == null) {
      String message = "Protobuf class must be specified before " + action
          + ". Do not use the no-argument constructor.";
      LOG.error(message);
      throw new IllegalArgumentException(message);
    }
  }
}