
Merge branch 'eb-dev' into rc_file_exp

Conflicts:
	src/java/com/twitter/elephantbird/pig/load/LzoBaseLoadFunc.java
	src/test/com/twitter/elephantbird/pig/piggybank/TestThriftToPig.java
commit 3d97a417058e0fe0735d5afe5ea23d42df09170a (2 parents: b784885 + 342f336)
Author: @rangadi
22 changed files with 674 additions and 66 deletions.
  1. +2 −2 build.xml
  2. +16 −7 src/java/com/twitter/elephantbird/mapreduce/input/LzoJsonRecordReader.java
  3. +170 −0 src/java/com/twitter/elephantbird/mapreduce/input/MultiInputFormat.java
  4. +10 −0 src/java/com/twitter/elephantbird/mapreduce/output/LzoBinaryB64LineRecordWriter.java
  5. +144 −0 src/java/com/twitter/elephantbird/pig/load/FilterLoadFunc.java
  6. +13 −3 src/java/com/twitter/elephantbird/pig/load/JsonLoader.java
  7. +7 −8 src/java/com/twitter/elephantbird/pig/load/LzoBaseLoadFunc.java
  8. +10 −12 src/java/com/twitter/elephantbird/pig/load/LzoProtobufB64LinePigLoader.java
  9. +1 −1  src/java/com/twitter/elephantbird/pig/load/LzoProtobufBlockPigLoader.java
  10. +3 −3 src/java/com/twitter/elephantbird/pig/load/LzoThriftB64LinePigLoader.java
  11. +61 −0 src/java/com/twitter/elephantbird/pig/load/MultiFormatLoader.java
  12. +2 −5 ...java/com/twitter/elephantbird/pig/util/{ProjectedProtoTuple.java → ProjectedProtobufTupleFactory.java}
  13. +2 −5 src/java/com/twitter/elephantbird/pig/util/{ProjectedThriftTuple.java → ProjectedThriftTupleFactory.java}
  14. +5 −0 src/java/com/twitter/elephantbird/pig/util/ThriftToPig.java
  15. +23 −0 src/java/com/twitter/elephantbird/util/HadoopUtils.java
  16. +3 −1 src/java/com/twitter/elephantbird/util/Protobufs.java
  17. +2 −14 src/java/com/twitter/elephantbird/util/ThriftUtils.java
  18. +31 −0 src/test/com/twitter/elephantbird/mapreduce/input/TestLzoJsonRecordReader.java
  19. +32 −0 src/test/com/twitter/elephantbird/pig/load/TestJsonLoader.java
  20. +132 −0 src/test/com/twitter/elephantbird/pig/load/TestMultiFormatLoader.java
  21. +2 −2 src/test/com/twitter/elephantbird/pig/piggybank/TestProtoToPig.java
  22. +3 −3 src/test/com/twitter/elephantbird/pig/piggybank/TestThriftToPig.java
4 build.xml
@@ -40,6 +40,7 @@
<property name="test.junit.maxmemory" value="512m" />
<property name="javac.debug" value="on"/>
<property name="javac.optimize" value="on"/>
+ <property name="test.library.path" value="none"/>
<path id="test.classpath">
<pathelement location="${classes.dir}"/>
@@ -263,8 +264,7 @@
<sysproperty key="test.log.dir" value="${test.log.dir}"/>
<sysproperty key="test.source.dir" value="${test.src.dir}"/>
<sysproperty key="test.build.extraconf" value="${test.build.extraconf}" />
- <sysproperty key="java.library.path"
- value="${lib.dir}/*.jar:${classes.dir}:${test.build.classes}:${basedir}/bin"/>
+ <sysproperty key="java.library.path" value="${test.library.path}"/>
<classpath refid="${test.classpath.id}"/>
<formatter type="${test.junit.output.format}" />
<batchtest todir="${test.log.dir}" unless="testcase">
23 src/java/com/twitter/elephantbird/mapreduce/input/LzoJsonRecordReader.java
@@ -89,14 +89,23 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
public static boolean decodeLineToJson(JSONParser parser, Text line, MapWritable value) {
try {
JSONObject jsonObj = (JSONObject)parser.parse(line.toString());
- for (Object key: jsonObj.keySet()) {
- Text mapKey = new Text(key.toString());
- Text mapValue = new Text();
- if (jsonObj.get(key) != null) {
- mapValue.set(jsonObj.get(key).toString());
+ if (jsonObj != null) {
+ for (Object key: jsonObj.keySet()) {
+ Text mapKey = new Text(key.toString());
+ Text mapValue = new Text();
+ if (jsonObj.get(key) != null) {
+ mapValue.set(jsonObj.get(key).toString());
+ }
+
+ value.put(mapKey, mapValue);
}
-
- value.put(mapKey, mapValue);
+ }
+ else {
+ // JSONParser#parse(String) may return a null reference, e.g. when
+ // the input parameter is the string "null". A single line with
+ // "null" is not valid JSON though.
+ LOG.warn("Could not json-decode string: " + line);
+ return false;
}
return true;
} catch (ParseException e) {
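
Note: the new null check exists because json-simple's parser can return a null reference instead of throwing. A minimal sketch of the behavior, assuming json-simple 1.x on the classpath (the class name is illustrative):

import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;

public class NullParseSketch {
  public static void main(String[] args) throws Exception {
    JSONParser parser = new JSONParser();
    // For the input string "null", parse() returns a null reference rather
    // than throwing a ParseException.
    Object result = parser.parse("null");
    System.out.println(result == null);   // prints: true
    // The cast itself is legal on a null reference; without the guard above,
    // the subsequent keySet() call would throw a NullPointerException.
    JSONObject jsonObj = (JSONObject) result;
    System.out.println(jsonObj);          // prints: null
  }
}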
170 src/java/com/twitter/elephantbird/mapreduce/input/MultiInputFormat.java
@@ -0,0 +1,170 @@
+package com.twitter.elephantbird.mapreduce.input;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.thrift.TBase;
+
+import com.google.protobuf.Message;
+import com.twitter.data.proto.BlockStorage.SerializedBlock;
+import com.twitter.elephantbird.mapreduce.io.BinaryWritable;
+import com.twitter.elephantbird.util.HadoopUtils;
+import com.twitter.elephantbird.util.Protobufs;
+import com.twitter.elephantbird.util.TypeRef;
+
+/**
+ * The input can consist of a heterogeneous mix of formats storing
+ * compatible objects. Currently supported formats:
+ * <ol>
+ * <li> Lzo Block storage of Thrift and Protobuf objects
+ * <li> Lzo B64Line storage of Thrift and Protobuf objects
+ * </ol>
+ *
+ * <p>
+ * A small fraction of bad records are tolerated. See {@link LzoRecordReader}
+ * for more information on error handling.
+ */
+public class MultiInputFormat<M>
+ extends LzoInputFormat<LongWritable, BinaryWritable<M>> {
+
+ // TODO: need to handle multiple input formats in a job better.
+ // might be better to store classname in the input split rather than in config.
+ private static String CLASS_CONF_KEY = "elephantbird.class.for.MultiInputFormat";
+
+ private TypeRef<M> typeRef;
+
+ public MultiInputFormat() {}
+
+ public MultiInputFormat(TypeRef<M> typeRef) {
+ this.typeRef = typeRef;
+ }
+
+ private static enum Format {
+ LZO_BLOCK,
+ LZO_B64LINE;
+ };
+
+ /**
+ * Sets the job's input format to {@link MultiInputFormat} and stores the
+ * supplied clazz's name in the job configuration. The configuration is
+ * read on the remote tasks to initialize the input format correctly.
+ */
+ public static void setInputFormatClass(Class<?> clazz, Job job) {
+ job.setInputFormatClass(MultiInputFormat.class);
+ HadoopUtils.setInputFormatClass(job.getConfiguration(), CLASS_CONF_KEY, clazz);
+ }
+
+ @SuppressWarnings("unchecked") // return type is runtime dependent
+ @Override
+ public RecordReader<LongWritable, BinaryWritable<M>>
+ createRecordReader(InputSplit split, TaskAttemptContext taskAttempt)
+ throws IOException, InterruptedException {
+ Configuration conf = taskAttempt.getConfiguration();
+ if (typeRef == null) {
+ setTypeRef(conf);
+ }
+ Class<?> recordClass = typeRef.getRawClass();
+
+ Format fileFormat = determineFileFormat(split, conf);
+
+ // Thrift
+ if (TBase.class.isAssignableFrom(recordClass)) {
+ switch (fileFormat) {
+ case LZO_BLOCK:
+ return new LzoThriftBlockRecordReader(typeRef);
+ case LZO_B64LINE:
+ return new LzoThriftB64LineRecordReader(typeRef);
+ }
+ }
+
+ // Protobuf
+ if (Message.class.isAssignableFrom(recordClass)) {
+ switch (fileFormat) {
+ case LZO_BLOCK:
+ return new LzoProtobufBlockRecordReader(typeRef);
+ case LZO_B64LINE:
+ return new LzoProtobufB64LineRecordReader(typeRef);
+ }
+ }
+
+ throw new IOException( "could not determine reader for "
+ + ((FileSplit)split).getPath() + " with class " + recordClass.getName());
+ }
+
+ /** set typeRef from conf */
+ private void setTypeRef(Configuration conf) {
+ String className = conf.get(CLASS_CONF_KEY);
+
+ if (className == null) {
+ throw new RuntimeException(CLASS_CONF_KEY + " is not set");
+ }
+
+ Class<?> clazz = null;
+ try {
+ clazz = conf.getClassByName(className);
+ } catch (ClassNotFoundException e) {
+ throw new RuntimeException("failed to instantiate class '" + className + "'", e);
+ }
+
+ typeRef = new TypeRef<M>(clazz){};
+ }
+
+ /**
+ * Checks to see if the input records are stored as {@link SerializedBlock}.
+ * The block format starts with {@link Protobufs#KNOWN_GOOD_POSITION_MARKER}.
+ * Otherwise the input is assumed to be Base64 encoded lines.
+ */
+ private static Format determineFileFormat(InputSplit split,
+ Configuration conf)
+ throws IOException {
+ FileSplit fileSplit = (FileSplit)split;
+
+ Path file = fileSplit.getPath();
+
+ /* We could have an optional configuration that maps a regex on a
+ * file name to a format, e.g. ".*-block.lzo" to LZO_BLOCK.
+ */
+
+ // Most of the cost is opening the file and
+ // reading the first lzo block (about 256k of compressed data).
+
+ CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
+ if (codec == null) {
+ throw new IOException("No codec for file " + file + " found");
+ }
+
+ InputStream in = file.getFileSystem(conf).open(file);
+ InputStream lzoIn = null;
+
+ // check if the file starts with magic bytes for Block storage format.
+ try {
+ lzoIn = codec.createInputStream(in);
+
+ for(byte magic : Protobufs.KNOWN_GOOD_POSITION_MARKER) {
+ int b = lzoIn.read();
+ if (b < 0 || (byte)b != magic) {
+ return Format.LZO_B64LINE;
+ }
+ }
+ } finally {
+ IOUtils.closeStream(lzoIn);
+ IOUtils.closeStream(in);
+ }
+
+ // the check passed
+ return Format.LZO_BLOCK;
+ }
+}
+
+
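
A minimal sketch of wiring MultiInputFormat into a job through the setInputFormatClass helper above; the job name and input path are illustrative, and TestPerson is the Thrift struct used by the tests in this commit:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import com.twitter.elephantbird.mapreduce.input.MultiInputFormat;
import com.twitter.elephantbird.thrift.test.TestPerson;

public class MultiInputJobSketch {
  public static void main(String[] args) throws Exception {
    Job job = new Job(new Configuration(), "read-mixed-lzo");
    // Sets the job's input format to MultiInputFormat and records the class
    // name under elephantbird.class.for.MultiInputFormat so remote tasks
    // can rebuild the TypeRef.
    MultiInputFormat.setInputFormatClass(TestPerson.class, job);
    // The directory may mix lzo-block and b64-line files of the same type;
    // each split's file is sniffed for the block magic bytes at read time.
    FileInputFormat.setInputPaths(job, new Path("/data/people"));
    // mapper, reducer and output format are configured as usual
  }
}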
10 src/java/com/twitter/elephantbird/mapreduce/output/LzoBinaryB64LineRecordWriter.java
@@ -5,12 +5,15 @@
import com.twitter.elephantbird.mapreduce.io.BinaryConverter;
import com.twitter.elephantbird.mapreduce.io.BinaryWritable;
+import com.twitter.elephantbird.mapreduce.io.ThriftConverter;
+import com.twitter.elephantbird.mapreduce.io.ThriftWritable;
import com.twitter.elephantbird.util.Codecs;
import com.twitter.elephantbird.util.Protobufs;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.thrift.TBase;
/**
* A RecordWriter-derived class for use with the LzoProtobufB64LineOutputFormat.
@@ -43,4 +46,11 @@ public void close(TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
out.close();
}
+
+ // for convenience
+ public static <M extends TBase<?, ?>> LzoBinaryB64LineRecordWriter<M, ThriftWritable<M>>
+ newThriftWriter(Class<M> tClass, DataOutputStream out) {
+ return new LzoBinaryB64LineRecordWriter<M, ThriftWritable<M>>
+ (ThriftConverter.newInstance(tClass), out);
+ }
}
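
A sketch of the new newThriftWriter convenience factory, mirroring its use in TestMultiFormatLoader below; TestPerson comes from the tests, and the plain FileOutputStream stands in for an lzop-compressed stream:

import java.io.DataOutputStream;
import java.io.FileOutputStream;
import com.twitter.elephantbird.mapreduce.io.ThriftWritable;
import com.twitter.elephantbird.mapreduce.output.LzoBinaryB64LineRecordWriter;
import com.twitter.elephantbird.thrift.test.TestPerson;

public class B64WriterSketch {
  public static void main(String[] args) throws Exception {
    DataOutputStream out =
        new DataOutputStream(new FileOutputStream("people-b64.lzo"));
    LzoBinaryB64LineRecordWriter<TestPerson, ThriftWritable<TestPerson>> writer =
        LzoBinaryB64LineRecordWriter.newThriftWriter(TestPerson.class, out);

    ThriftWritable<TestPerson> writable = ThriftWritable.newInstance(TestPerson.class);
    writable.set(new TestPerson());  // a populated struct in real use
    writer.write(null, writable);    // the key is ignored by this writer
    writer.close(null);              // the TaskAttemptContext goes unused
  }
}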
144 src/java/com/twitter/elephantbird/pig/load/FilterLoadFunc.java
@@ -0,0 +1,144 @@
+package com.twitter.elephantbird.pig.load;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.pig.Expression;
+import org.apache.pig.LoadCaster;
+import org.apache.pig.LoadFunc;
+import org.apache.pig.LoadMetadata;
+import org.apache.pig.LoadPushDown;
+import org.apache.pig.ResourceSchema;
+import org.apache.pig.ResourceStatistics;
+import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * A wrapper LoadFunc that delegates all the functionality to another loader.
+ * Similar to a FilterInputStream.
+ */
+public class FilterLoadFunc extends LoadFunc implements LoadMetadata, LoadPushDown {
+
+ protected LoadFunc loader;
+
+ /**
+ * @param loader may be null, since it is not always feasible to set the
+ * loader during construction; it can be set later with {@link #setLoader(LoadFunc)}
+ */
+ public FilterLoadFunc(LoadFunc loader) {
+ this.loader = loader;
+ }
+
+ public void setLoader(LoadFunc loader) {
+ this.loader = loader;
+ }
+
+ // just for readability
+ private boolean isSet() {
+ return loader != null;
+ }
+ // LoadFunc implementation:
+
+ @Override @SuppressWarnings("unchecked")
+ public InputFormat getInputFormat() throws IOException {
+ return isSet() ? loader.getInputFormat() : null;
+ }
+
+ @Override
+ public LoadCaster getLoadCaster() throws IOException {
+ return isSet() ? loader.getLoadCaster() : null;
+ }
+
+ @Override
+ public Tuple getNext() throws IOException {
+ return isSet() ? loader.getNext() : null;
+ }
+
+ @Override @SuppressWarnings("unchecked")
+ public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
+ if (isSet()) {
+ loader.prepareToRead(reader, split);
+ }
+ }
+
+ @Override
+ public String relativeToAbsolutePath(String location, Path curDir)
+ throws IOException {
+ return isSet() ?
+ loader.relativeToAbsolutePath(location, curDir):
+ super.relativeToAbsolutePath(location, curDir);
+ }
+
+ @Override
+ public void setLocation(String location, Job job) throws IOException {
+ if (isSet()) {
+ loader.setLocation(location, job);
+ }
+ }
+
+ @Override
+ public void setUDFContextSignature(String signature) {
+ if (isSet()) {
+ loader.setUDFContextSignature(signature);
+ } else {
+ super.setUDFContextSignature(signature);
+ }
+ }
+
+ // LoadMetadata & LoadPushDown interface.
+
+ // helpers for casting:
+ private static LoadMetadata asLoadMetadata(LoadFunc loader) {
+ return loader instanceof LoadMetadata ? (LoadMetadata) loader : null;
+ }
+
+ private static LoadPushDown asLoadPushDown(LoadFunc loader) {
+ return loader instanceof LoadPushDown ? (LoadPushDown) loader : null;
+ }
+
+
+ @Override
+ public String[] getPartitionKeys(String location, Job job) throws IOException {
+ LoadMetadata metadata = asLoadMetadata(loader);
+ return metadata == null ? null : metadata.getPartitionKeys(location, job);
+ }
+
+ @Override
+ public ResourceSchema getSchema(String location, Job job) throws IOException {
+ LoadMetadata metadata = asLoadMetadata(loader);
+ return metadata == null ? null : metadata.getSchema(location, job);
+ }
+
+ @Override
+ public ResourceStatistics getStatistics(String location, Job job) throws IOException {
+ LoadMetadata metadata = asLoadMetadata(loader);
+ return metadata == null ? null : metadata.getStatistics(location, job);
+ }
+
+ @Override
+ public void setPartitionFilter(Expression partitionFilter) throws IOException {
+ LoadMetadata metadata = asLoadMetadata(loader);
+ if ( metadata != null ) {
+ metadata.setPartitionFilter(partitionFilter);
+ }
+ }
+
+ @Override
+ public List<OperatorSet> getFeatures() {
+ LoadPushDown pushDown = asLoadPushDown(loader);
+ return pushDown == null ? null : pushDown.getFeatures();
+ }
+
+ @Override
+ public RequiredFieldResponse pushProjection(
+ RequiredFieldList requiredFieldList) throws FrontendException {
+ LoadPushDown pushDown = asLoadPushDown( loader );
+ return pushDown == null ? null : pushDown.pushProjection( requiredFieldList );
+ }
+
+}
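
FilterLoadFunc is intended to be subclassed, overriding only the behavior that differs from the wrapped loader (MultiFormatLoader below does exactly this for getInputFormat()). A hypothetical subclass for illustration, not part of this commit:

package com.twitter.elephantbird.pig.load;

import java.io.IOException;
import org.apache.pig.data.Tuple;

// Wraps JsonLoader and logs each tuple; every other LoadFunc, LoadMetadata
// and LoadPushDown call passes through to the wrapped loader.
public class LoggingJsonLoader extends FilterLoadFunc {
  public LoggingJsonLoader() {
    super(new JsonLoader());
  }

  @Override
  public Tuple getNext() throws IOException {
    Tuple t = super.getNext();
    if (t != null) {
      System.err.println("loaded: " + t);
    }
    return t;
  }
}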
16 src/java/com/twitter/elephantbird/pig/load/JsonLoader.java
@@ -95,9 +95,19 @@ protected Tuple parseStringToTuple(String line) {
try {
Map<String, String> values = Maps.newHashMap();
JSONObject jsonObj = (JSONObject)jsonParser_.parse(line);
- for (Object key: jsonObj.keySet()) {
- Object value = jsonObj.get(key);
- values.put(key.toString(), value != null ? value.toString() : null);
+ if (jsonObj != null) {
+ for (Object key: jsonObj.keySet()) {
+ Object value = jsonObj.get(key);
+ values.put(key.toString(), value != null ? value.toString() : null);
+ }
+ }
+ else {
+ // JSONParser#parse(String) may return a null reference, e.g. when
+ // the input parameter is the string "null". A single line with
+ // "null" is not valid JSON though.
+ LOG.warn("Could not json-decode string: " + line);
+ incrCounter(JsonLoaderCounters.LinesParseError, 1L);
+ return null;
}
return tupleFactory_.newTuple(values);
} catch (ParseException e) {
15 src/java/com/twitter/elephantbird/pig/load/LzoBaseLoadFunc.java
@@ -35,6 +35,7 @@
public abstract class LzoBaseLoadFunc extends LoadFunc implements LoadMetadata, LoadPushDown {
private static final Logger LOG = LoggerFactory.getLogger(LzoBaseLoadFunc.class);
+ @SuppressWarnings("unchecked")
protected RecordReader reader_;
// Making accessing Hadoop counters from Pig slightly more convenient.
@@ -42,7 +43,7 @@
protected Configuration jobConf;
protected String contextSignature;
- protected static final String projectionSuffix = "_LzoBaseLoadFunc_projectedFields";
+ protected static final String projectionKey = "LzoBaseLoadFunc_projectedFields";
protected RequiredFieldList requiredFieldList = null;
@@ -91,8 +92,7 @@ public void setLocation(String location, Job job) throws IOException {
FileInputFormat.setInputPaths(job, location);
this.jobConf = job.getConfiguration();
- String projectedFields = getUDFProperties().getProperty(
- contextSignature + projectionSuffix);
+ String projectedFields = getUDFProperties().getProperty(projectionKey);
if (projectedFields != null) {
requiredFieldList =
(RequiredFieldList) ObjectSerializer.deserialize(projectedFields);
@@ -138,15 +138,14 @@ public RequiredFieldResponse pushProjection(
* {@link LoadPushDown#pushProjection(RequiredFieldList)}. <p>
*
* Stores requiredFieldList in context. The requiredFields are read from
- * context on the backend inside {@link #setLocation(String, Job)}.
+ * context on the backend (inside {@link #setLocation(String, Job)}).
*/
protected RequiredFieldResponse pushProjectionHelper(
RequiredFieldList requiredFieldList)
throws FrontendException {
try {
- getUDFProperties().setProperty(
- contextSignature + projectionSuffix,
- ObjectSerializer.serialize(requiredFieldList));
+ getUDFProperties().setProperty(projectionKey,
+ ObjectSerializer.serialize(requiredFieldList));
} catch (IOException e) { // not expected
throw new FrontendException(e);
}
@@ -155,7 +154,7 @@ protected RequiredFieldResponse pushProjectionHelper(
}
@Override
- public void prepareToRead(RecordReader reader, PigSplit split) {
+ public void prepareToRead(@SuppressWarnings("unchecked") RecordReader reader, PigSplit split) {
this.reader_ = reader;
}
22 src/java/com/twitter/elephantbird/pig/load/LzoProtobufB64LinePigLoader.java
@@ -16,7 +16,7 @@
import com.twitter.elephantbird.mapreduce.input.LzoRecordReader;
import com.twitter.elephantbird.mapreduce.io.ProtobufWritable;
import com.twitter.elephantbird.pig.util.PigUtil;
-import com.twitter.elephantbird.pig.util.ProjectedProtoTuple;
+import com.twitter.elephantbird.pig.util.ProjectedProtobufTupleFactory;
import com.twitter.elephantbird.pig.util.ProtobufToPig;
import com.twitter.elephantbird.util.Protobufs;
import com.twitter.elephantbird.util.TypeRef;
@@ -31,12 +31,11 @@
public class LzoProtobufB64LinePigLoader<M extends Message> extends LzoBaseLoadFunc {
private static final Logger LOG = LoggerFactory.getLogger(LzoProtobufB64LinePigLoader.class);
- protected TypeRef<M> typeRef_ = null;
- private final ProtobufToPig protoToPig_ = new ProtobufToPig();
- private ProjectedProtoTuple<M> tupleTemplate = null;
+ protected TypeRef<M> typeRef = null;
+ private final ProtobufToPig protoToPig = new ProtobufToPig();
+ private ProjectedProtobufTupleFactory<M> tupleTemplate = null;
public LzoProtobufB64LinePigLoader() {
- LOG.info("LzoProtobufB64LineLoader zero-parameter creation");
}
/**
@@ -54,7 +53,7 @@ public LzoProtobufB64LinePigLoader(String protoClassName) {
* @param typeRef
*/
public void setTypeRef(TypeRef<M> typeRef) {
- typeRef_ = typeRef;
+ this.typeRef = typeRef;
}
@Override
@@ -72,26 +71,25 @@ public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList)
@Override
public Tuple getNext() throws IOException {
if (tupleTemplate == null) {
- tupleTemplate = new ProjectedProtoTuple<M>(typeRef_, requiredFieldList);
+ tupleTemplate = new ProjectedProtobufTupleFactory<M>(typeRef, requiredFieldList);
}
- M value = getNextBinaryValue(typeRef_);
+ M value = getNextBinaryValue(typeRef);
return value != null ?
tupleTemplate.newTuple(value) : null;
}
@Override
public ResourceSchema getSchema(String filename, Job job) throws IOException {
- return new ResourceSchema(protoToPig_.toSchema(Protobufs.getMessageDescriptor(typeRef_.getRawClass())));
-
+ return new ResourceSchema(protoToPig.toSchema(Protobufs.getMessageDescriptor(typeRef.getRawClass())));
}
@Override
public InputFormat<LongWritable, ProtobufWritable<M>> getInputFormat() throws IOException {
- if (typeRef_ == null) {
+ if (typeRef == null) {
LOG.error("Protobuf class must be specified before an InputFormat can be created. Do not use the no-argument constructor.");
throw new IllegalArgumentException("Protobuf class must be specified before an InputFormat can be created. Do not use the no-argument constructor.");
}
- return new LzoProtobufB64LineInputFormat<M>(typeRef_);
+ return new LzoProtobufB64LineInputFormat<M>(typeRef);
}
}
2  src/java/com/twitter/elephantbird/pig/load/LzoProtobufBlockPigLoader.java
@@ -32,6 +32,6 @@ public LzoProtobufBlockPigLoader(String protoClassName) {
@Override
public InputFormat<LongWritable, ProtobufWritable<M>> getInputFormat() throws IOException {
- return new LzoProtobufBlockInputFormat<M>(typeRef_);
+ return new LzoProtobufBlockInputFormat<M>(typeRef);
}
}
6 src/java/com/twitter/elephantbird/pig/load/LzoThriftB64LinePigLoader.java
@@ -15,7 +15,7 @@
import com.twitter.elephantbird.mapreduce.input.LzoThriftB64LineInputFormat;
import com.twitter.elephantbird.mapreduce.io.ThriftWritable;
import com.twitter.elephantbird.pig.util.PigUtil;
-import com.twitter.elephantbird.pig.util.ProjectedThriftTuple;
+import com.twitter.elephantbird.pig.util.ProjectedThriftTupleFactory;
import com.twitter.elephantbird.pig.util.ThriftToPig;
import com.twitter.elephantbird.util.TypeRef;
@@ -23,7 +23,7 @@
public class LzoThriftB64LinePigLoader<M extends TBase<?, ?>> extends LzoBaseLoadFunc {
protected final TypeRef<M> typeRef_;
- private ProjectedThriftTuple<M> tupleTemplate;
+ private ProjectedThriftTupleFactory<M> tupleTemplate;
public LzoThriftB64LinePigLoader(String thriftClassName) {
typeRef_ = PigUtil.getThriftTypeRef(thriftClassName);
@@ -38,7 +38,7 @@ public LzoThriftB64LinePigLoader(String thriftClassName) {
@Override
public Tuple getNext() throws IOException {
if (tupleTemplate == null) {
- tupleTemplate = new ProjectedThriftTuple<M>(typeRef_, requiredFieldList);
+ tupleTemplate = new ProjectedThriftTupleFactory<M>(typeRef_, requiredFieldList);
}
M value = getNextBinaryValue(typeRef_);
61 src/java/com/twitter/elephantbird/pig/load/MultiFormatLoader.java
@@ -0,0 +1,61 @@
+package com.twitter.elephantbird.pig.load;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.pig.LoadFunc;
+import org.apache.thrift.TBase;
+
+import com.google.protobuf.Message;
+import com.twitter.elephantbird.mapreduce.input.MultiInputFormat;
+import com.twitter.elephantbird.mapreduce.io.BinaryWritable;
+import com.twitter.elephantbird.pig.util.PigUtil;
+import com.twitter.elephantbird.util.TypeRef;
+
+/**
+ * A loader based on {@link MultiInputFormat} to read input written in
+ * different file formats.
+ *
+ * @see MultiInputFormat
+ */
+public class MultiFormatLoader<M> extends FilterLoadFunc {
+
+ private TypeRef<M> typeRef = null;
+
+ /**
+ * @param className Thrift or Protobuf class
+ */
+ public MultiFormatLoader(String className) {
+ super(null);
+ Class<?> clazz = PigUtil.getClass(className);
+ typeRef = new TypeRef<M>(clazz){};
+
+ /* Initialize the loader.
+ * It does not matter that we use the 'B64Line' loader even though the
+ * input may be in a different format: these loaders depend only on the
+ * class name and differ only in what getInputFormat() returns. Since
+ * we override getInputFormat(), the difference does not matter.
+ *
+ * The loader handles the rest of the LoadFunc, LoadMetadata, etc.
+ * functionality.
+ */
+ LoadFunc ldr;
+ if (Message.class.isAssignableFrom(clazz)) {
+ ldr = new LzoProtobufB64LinePigLoader<Message>(className);
+
+ } else if (TBase.class.isAssignableFrom(clazz)) {
+ ldr = new LzoThriftB64LinePigLoader<TBase<?, ?>>(className);
+
+ } else {
+ throw new RuntimeException(className + " is not a Protobuf or Thrift class");
+ }
+
+ setLoader(ldr);
+ }
+
+ @Override
+ public InputFormat<LongWritable, BinaryWritable<M>> getInputFormat() throws IOException {
+ return new MultiInputFormat<M>(typeRef);
+ }
+}
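
A sketch of the loader in use, in the spirit of TestMultiFormatLoader further down; the path is illustrative:

import org.apache.pig.ExecType;
import org.apache.pig.PigServer;

public class MultiFormatLoaderSketch {
  public static void main(String[] args) throws Exception {
    PigServer pig = new PigServer(ExecType.LOCAL);
    // The constructor argument names the Thrift or Protobuf class stored in
    // the files; block and b64-line files can live in the same directory.
    pig.registerQuery(
        "people = LOAD '/data/people' USING "
      + "com.twitter.elephantbird.pig.load.MultiFormatLoader("
      + "'com.twitter.elephantbird.thrift.test.TestPerson');");
  }
}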
7 ...antbird/pig/util/ProjectedProtoTuple.java → ...g/util/ProjectedProtobufTupleFactory.java
@@ -17,11 +17,8 @@
/**
* A tuple factory to create protobuf tuples where
* only a subset of fields are required.
- *
- * It is not called a "Factory" to avoid confusion with
- * a traditional TupleFactory that creates a raw tuple.
*/
-public class ProjectedProtoTuple<M extends Message> {
+public class ProjectedProtobufTupleFactory<M extends Message> {
private static TupleFactory tf = TupleFactory.getInstance();
@@ -29,7 +26,7 @@
private final ProtobufToPig protoConv;
- public ProjectedProtoTuple(TypeRef<M> typeRef, RequiredFieldList requiredFieldList) {
+ public ProjectedProtobufTupleFactory(TypeRef<M> typeRef, RequiredFieldList requiredFieldList) {
List<FieldDescriptor> protoFields =
Protobufs.getMessageDescriptor(typeRef.getRawClass()).getFields();
7 ...ntbird/pig/util/ProjectedThriftTuple.java → ...pig/util/ProjectedThriftTupleFactory.java
@@ -17,18 +17,15 @@
/**
* A tuple factory to create thrift tuples where
* only a subset of fields are required.
- *
- * It is not called a "Factory" to avoid confusion with
- * a traditional TupleFactory that creates a raw tuple.
*/
-public class ProjectedThriftTuple<T extends TBase<?, ?>> {
+public class ProjectedThriftTupleFactory<T extends TBase<?, ?>> {
private static TupleFactory tf = TupleFactory.getInstance();
private int[] requiredFields;
private final TStructDescriptor tStructDesc;
- public ProjectedThriftTuple(TypeRef<T> typeRef, RequiredFieldList requiredFieldList) {
+ public ProjectedThriftTupleFactory(TypeRef<T> typeRef, RequiredFieldList requiredFieldList) {
tStructDesc = TStructDescriptor.getInstance(typeRef.getRawClass());
int numFields = tStructDesc.getFields().size();
5 src/java/com/twitter/elephantbird/pig/util/ThriftToPig.java
@@ -232,6 +232,11 @@ protected Object getObjectAt(int index) {
public static Schema toSchema(Class<? extends TBase<?, ?>> tClass) {
return toSchema(TStructDescriptor.getInstance(tClass));
}
+
+ public Schema toSchema() {
+ return toSchema(structDesc);
+ }
+
public static Schema toSchema(TStructDescriptor tDesc ) {
Schema schema = new Schema();
23 src/java/com/twitter/elephantbird/util/HadoopUtils.java
@@ -1,5 +1,6 @@
package com.twitter.elephantbird.util;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
@@ -32,4 +33,26 @@ public static Counter getCounter(JobContext ctx, String group, String counter) {
+ "will return a dummy counter for '" + name + "'");
return new Counter(name, name) {};
}
+
+ /**
+ * A helper that sets a configuration key to a class name.
+ * Throws a RuntimeException if the configuration is already
+ * set to a different class name.
+ */
+ public static void setInputFormatClass(Configuration conf,
+ String configKey,
+ Class<?> clazz) {
+ String existingClass = conf.get(configKey);
+ String className = clazz.getName();
+
+ if (existingClass != null && !existingClass.equals(className)) {
+ throw new RuntimeException(
+ "Already registered a different thriftClass for "
+ + configKey
+ + ". old: " + existingClass
+ + " new: " + className);
+ } else {
+ conf.set(configKey, className);
+ }
+ }
}
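
The guard in setInputFormatClass makes conflicting registrations fail fast rather than silently overwriting the first class. A small sketch (the key and classes are illustrative):

import org.apache.hadoop.conf.Configuration;
import com.twitter.elephantbird.thrift.test.TestName;
import com.twitter.elephantbird.thrift.test.TestPerson;
import com.twitter.elephantbird.util.HadoopUtils;

public class ClassConfSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    String key = "elephantbird.class.for.MultiInputFormat";
    HadoopUtils.setInputFormatClass(conf, key, TestPerson.class); // sets the key
    HadoopUtils.setInputFormatClass(conf, key, TestPerson.class); // same class: no-op
    HadoopUtils.setInputFormatClass(conf, key, TestName.class);   // throws RuntimeException
  }
}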
4 src/java/com/twitter/elephantbird/util/Protobufs.java
@@ -276,7 +276,9 @@ public static Type getTypeByName(Message message, String name) {
public static void setClassConf(Configuration jobConf, Class<?> genericClass,
Class<? extends Message> protoClass) {
- jobConf.set(CLASS_CONF_PREFIX + genericClass.getName(), protoClass.getName());
+ HadoopUtils.setInputFormatClass(jobConf,
+ CLASS_CONF_PREFIX + genericClass.getName(),
+ protoClass);
}
/**
16 src/java/com/twitter/elephantbird/util/ThriftUtils.java
@@ -11,22 +11,10 @@
public static void setClassConf(Configuration jobConf, Class<?> genericClass,
Class<? extends TBase<?, ?>> thriftClass) {
- String name = CLASS_CONF_PREFIX + genericClass.getName();
- String existingThrift = jobConf.get(name);
- if (existingThrift != null) {
- if (!existingThrift.equals(thriftClass.getName())) {
- throw new RuntimeException(
- "Already registered a different thriftClass for "
- + genericClass.getName()
- + ". old: " + existingThrift
- + " new: " + thriftClass);
- }
- } else {
- jobConf.set(name, thriftClass.getName());
- }
+ String configKey = CLASS_CONF_PREFIX + genericClass.getName();
+ HadoopUtils.setInputFormatClass(jobConf, configKey, thriftClass);
}
-
/**
* Verify that clazz is a Thrift class. i.e. is a subclass of TBase
*/
31 src/test/com/twitter/elephantbird/mapreduce/input/TestLzoJsonRecordReader.java
@@ -0,0 +1,31 @@
+package com.twitter.elephantbird.mapreduce.input;
+
+import junit.framework.TestCase;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.json.simple.parser.JSONParser;
+import org.junit.Test;
+
+/**
+ * Tests the LzoJsonRecordReader, making sure it reads the data properly.
+ */
+public class TestLzoJsonRecordReader extends TestCase {
+
+ /**
+ * {@link LzoJsonRecordReader#decodeLineToJson(JSONParser, Text, MapWritable)}
+ * must not choke on lines containing the word "null" (i.e. not the null
+ * value but the string "null").
+ *
+ * This can happen when the original input line to JSONParser contains "null"
+ * as a string. In this case {@link JSONParser#parse(String)} will
+ * return a null reference.
+ *
+ */
+ @Test
+ public void testNullString() {
+ Text line = new Text("null");
+ boolean result = LzoJsonRecordReader.decodeLineToJson(new JSONParser(), line, new MapWritable());
+ assertEquals("Parsing line with contents 'null'", false, result);
+ }
+
+}
32 src/test/com/twitter/elephantbird/pig/load/TestJsonLoader.java
@@ -0,0 +1,32 @@
+package com.twitter.elephantbird.pig.load;
+
+import junit.framework.TestCase;
+import org.apache.pig.data.Tuple;
+import org.json.simple.parser.JSONParser;
+import org.junit.Test;
+
+/**
+ * Tests the JsonLoader, making sure it reads the data properly.
+ */
+
+public class TestJsonLoader extends TestCase {
+
+ /**
+ * {@link JsonLoader#parseStringToTuple(String)} must not choke on lines
+ * containing the word "null" (i.e. not the null value but the string
+ * "null").
+ *
+ * This can happen when the original input line to JSONParser contains "null"
+ * as a string. In this case {@link JSONParser#parse(String)} will
+ * return a null reference.
+ *
+ */
+ @Test
+ public void testNullString() {
+ String nullString = "null";
+ JsonLoader jsonLoader = new JsonLoader();
+ Tuple result = jsonLoader.parseStringToTuple(nullString);
+ assertEquals("Parsing line with contents 'null'", null, result);
+ }
+
+}
132 src/test/com/twitter/elephantbird/pig/load/TestMultiFormatLoader.java
@@ -0,0 +1,132 @@
+package com.twitter.elephantbird.pig.load;
+
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.pig.ExecType;
+import org.apache.pig.PigServer;
+import org.apache.pig.data.Tuple;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.google.common.collect.ImmutableMap;
+import com.hadoop.compression.lzo.GPLNativeCodeLoader;
+import com.hadoop.compression.lzo.LzopCodec;
+import com.twitter.elephantbird.mapreduce.io.ThriftBlockWriter;
+import com.twitter.elephantbird.mapreduce.io.ThriftWritable;
+import com.twitter.elephantbird.mapreduce.output.LzoBinaryB64LineRecordWriter;
+import com.twitter.elephantbird.pig.util.ThriftToPig;
+import com.twitter.elephantbird.thrift.test.TestName;
+import com.twitter.elephantbird.thrift.test.TestPerson;
+import com.twitter.elephantbird.thrift.test.TestPhoneType;
+
+/**
+ * Test {@link MultiFormatLoader} using a Thrift struct.
+ */
+public class TestMultiFormatLoader {
+ // create a directory with two lzo files, one in Base64Line format
+ // and the other in Serialized blocks, and load them using
+ // MultiFormatLoader
+
+ private PigServer pigServer;
+ private final String testDir =
+ System.getProperty("test.build.data") + "/TestMultiFormatLoader";
+ private final File inputDir = new File(testDir, "in");
+ private final TestPerson[] records = new TestPerson[]{ makePerson(0),
+ makePerson(1),
+ makePerson(2) };
+ @Before
+ public void setUp() throws Exception {
+
+ if (!GPLNativeCodeLoader.isNativeCodeLoaded()) {
+ // TODO: Consider using @RunWith / @SuiteClasses
+ return;
+ }
+
+ pigServer = new PigServer(ExecType.LOCAL);
+ // set lzo codec:
+ pigServer.getPigContext().getProperties().setProperty(
+ "io.compression.codecs", "com.hadoop.compression.lzo.LzopCodec");
+ pigServer.getPigContext().getProperties().setProperty(
+ "io.compression.codec.lzo.class", "com.hadoop.compression.lzo.LzoCodec");
+
+ Configuration conf = new Configuration();
+ inputDir.mkdirs();
+
+ // write to block file
+ ThriftBlockWriter<TestPerson> blk_writer =
+ new ThriftBlockWriter<TestPerson>(createLzoOut("1-block.lzo", conf),
+ TestPerson.class);
+ for (TestPerson rec : records) {
+ blk_writer.write(rec);
+ }
+ blk_writer.close();
+
+ // write tb64 lines
+ LzoBinaryB64LineRecordWriter<TestPerson, ThriftWritable<TestPerson>> b64_writer =
+ LzoBinaryB64LineRecordWriter.newThriftWriter(TestPerson.class,
+ createLzoOut("2-b64.lzo", conf));
+ for (TestPerson rec: records) {
+ thriftWritable.set(rec);
+ b64_writer.write(null, thriftWritable);
+ }
+ b64_writer.close(null);
+ }
+
+ @Test
+ public void testMultiFormatLoader() throws Exception {
+ if (pigServer == null) {
+ //setUp didn't run because of missing lzo native libraries
+ return;
+ }
+
+ pigServer.registerQuery(String.format(
+ "A = load '%s' using %s('%s');\n",
+ inputDir.toURI().toString(),
+ MultiFormatLoader.class.getName(),
+ TestPerson.class.getName()));
+
+ Iterator<Tuple> rows = pigServer.openIterator("A");
+ // verify:
+ for (int i=0; i<2; i++) {
+ for(TestPerson person : records) {
+ String expected = personToString(person);
+ Assert.assertEquals(expected, rows.next().toString());
+ }
+ }
+
+ FileUtil.fullyDelete(inputDir);
+ }
+
+ private DataOutputStream createLzoOut(String name, Configuration conf) throws IOException {
+ File file = new File(inputDir, name);
+ LzopCodec codec = new LzopCodec();
+ codec.setConf(conf);
+
+ if (file.exists()) {
+ file.delete();
+ }
+ return new DataOutputStream(codec.createOutputStream(new FileOutputStream(file)));
+ }
+
+ // thrift class related :
+ private ThriftToPig<TestPerson> thriftToPig = ThriftToPig.newInstance(TestPerson.class);
+ private ThriftWritable<TestPerson> thriftWritable = ThriftWritable.newInstance(TestPerson.class);
+
+ // return a Person thrift object
+ private TestPerson makePerson(int index) {
+ return new TestPerson(
+ new TestName("bob " + index, "jenkins"),
+ ImmutableMap.of(TestPhoneType.HOME,
+ "408-555-5555" + "ex" + index));
+ }
+ private String personToString(TestPerson person) {
+ return thriftToPig.getPigTuple(person).toString();
+ }
+}
4 src/test/com/twitter/elephantbird/pig/piggybank/TestProtoToPig.java
@@ -19,7 +19,7 @@
import com.twitter.data.proto.tutorial.AddressBookProtos.Person;
import com.twitter.data.proto.tutorial.pig.piggybank.AddressBookProtobufBytesToTuple;
import com.twitter.elephantbird.pig.util.PigUtil;
-import com.twitter.elephantbird.pig.util.ProjectedProtoTuple;
+import com.twitter.elephantbird.pig.util.ProjectedProtobufTupleFactory;
import com.twitter.elephantbird.pig.util.ProtobufTuple;
import com.twitter.elephantbird.util.TypeRef;
@@ -48,7 +48,7 @@ public void testLazyProtoToPig() throws ExecException {
TypeRef<Person> typeRef = PigUtil.getProtobufTypeRef(Person.class.getName());
Tuple projectedTuple =
- new ProjectedProtoTuple<Person>(typeRef, evenFields(fieldDescs)).newTuple(personProto);
+ new ProjectedProtobufTupleFactory<Person>(typeRef, evenFields(fieldDescs)).newTuple(personProto);
int idx = 0;
for (FieldDescriptor fd : fieldDescs) {
6 src/test/com/twitter/elephantbird/pig/piggybank/TestThriftToPig.java
@@ -29,13 +29,13 @@
import com.twitter.data.proto.tutorial.thrift.PhoneNumber;
import com.twitter.data.proto.tutorial.thrift.PhoneType;
import com.twitter.elephantbird.mapreduce.io.ThriftConverter;
-import com.twitter.elephantbird.pig.util.ProjectedThriftTuple;
+import com.twitter.elephantbird.pig.util.ProjectedThriftTupleFactory;
import com.twitter.elephantbird.pig.util.ThriftToPig;
import com.twitter.elephantbird.pig.util.PigToThrift;
+import com.twitter.elephantbird.thrift.TStructDescriptor.Field;
import com.twitter.elephantbird.thrift.test.TestName;
import com.twitter.elephantbird.thrift.test.TestPerson;
import com.twitter.elephantbird.thrift.test.TestPhoneType;
-import com.twitter.elephantbird.thrift.TStructDescriptor.Field;
import com.twitter.elephantbird.util.TypeRef;
public class TestThriftToPig {
@@ -75,7 +75,7 @@
}
try {
- Tuple pt = new ProjectedThriftTuple<M>(typeRef, reqFieldList).newTuple(obj);
+ Tuple pt = new ProjectedThriftTupleFactory<M>(typeRef, reqFieldList).newTuple(obj);
int pidx=0;
for(int idx : idxList) {
assertEquals(t.get(idx).toString(), pt.get(pidx++).toString());