Merge branch 'eb-dev' into rc_file_exp

Conflicts:
	src/java/com/twitter/elephantbird/pig/load/LzoBaseLoadFunc.java
	src/test/com/twitter/elephantbird/pig/piggybank/TestThriftToPig.java
Raghu Angadi committed Nov 28, 2011
2 parents b784885 + 342f336 commit 3d97a417058e0fe0735d5afe5ea23d42df09170a
Showing with 674 additions and 66 deletions.
  1. +2 −2 build.xml
  2. +16 −7 src/java/com/twitter/elephantbird/mapreduce/input/LzoJsonRecordReader.java
  3. +170 −0 src/java/com/twitter/elephantbird/mapreduce/input/MultiInputFormat.java
  4. +10 −0 src/java/com/twitter/elephantbird/mapreduce/output/LzoBinaryB64LineRecordWriter.java
  5. +144 −0 src/java/com/twitter/elephantbird/pig/load/FilterLoadFunc.java
  6. +13 −3 src/java/com/twitter/elephantbird/pig/load/JsonLoader.java
  7. +7 −8 src/java/com/twitter/elephantbird/pig/load/LzoBaseLoadFunc.java
  8. +10 −12 src/java/com/twitter/elephantbird/pig/load/LzoProtobufB64LinePigLoader.java
  9. +1 −1 src/java/com/twitter/elephantbird/pig/load/LzoProtobufBlockPigLoader.java
  10. +3 −3 src/java/com/twitter/elephantbird/pig/load/LzoThriftB64LinePigLoader.java
  11. +61 −0 src/java/com/twitter/elephantbird/pig/load/MultiFormatLoader.java
  12. +2 −5 ...com/twitter/elephantbird/pig/util/{ProjectedProtoTuple.java → ProjectedProtobufTupleFactory.java}
  13. +2 −5 .../com/twitter/elephantbird/pig/util/{ProjectedThriftTuple.java → ProjectedThriftTupleFactory.java}
  14. +5 −0 src/java/com/twitter/elephantbird/pig/util/ThriftToPig.java
  15. +23 −0 src/java/com/twitter/elephantbird/util/HadoopUtils.java
  16. +3 −1 src/java/com/twitter/elephantbird/util/Protobufs.java
  17. +2 −14 src/java/com/twitter/elephantbird/util/ThriftUtils.java
  18. +31 −0 src/test/com/twitter/elephantbird/mapreduce/input/TestLzoJsonRecordReader.java
  19. +32 −0 src/test/com/twitter/elephantbird/pig/load/TestJsonLoader.java
  20. +132 −0 src/test/com/twitter/elephantbird/pig/load/TestMultiFormatLoader.java
  21. +2 −2 src/test/com/twitter/elephantbird/pig/piggybank/TestProtoToPig.java
  22. +3 −3 src/test/com/twitter/elephantbird/pig/piggybank/TestThriftToPig.java
@@ -40,6 +40,7 @@
<property name="test.junit.maxmemory" value="512m" />
<property name="javac.debug" value="on"/>
<property name="javac.optimize" value="on"/>
+ <property name="test.library.path" value="none"/>
<path id="test.classpath">
<pathelement location="${classes.dir}"/>
@@ -263,8 +264,7 @@
<sysproperty key="test.log.dir" value="${test.log.dir}"/>
<sysproperty key="test.source.dir" value="${test.src.dir}"/>
<sysproperty key="test.build.extraconf" value="${test.build.extraconf}" />
- <sysproperty key="java.library.path"
- value="${lib.dir}/*.jar:${classes.dir}:${test.build.classes}:${basedir}/bin"/>
+ <sysproperty key="java.library.path" value="${test.library.path}"/>
<classpath refid="${test.classpath.id}"/>
<formatter type="${test.junit.output.format}" />
<batchtest todir="${test.log.dir}" unless="testcase">
@@ -89,14 +89,23 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
public static boolean decodeLineToJson(JSONParser parser, Text line, MapWritable value) {
try {
JSONObject jsonObj = (JSONObject)parser.parse(line.toString());
- for (Object key: jsonObj.keySet()) {
- Text mapKey = new Text(key.toString());
- Text mapValue = new Text();
- if (jsonObj.get(key) != null) {
- mapValue.set(jsonObj.get(key).toString());
+ if (jsonObj != null) {
+ for (Object key: jsonObj.keySet()) {
+ Text mapKey = new Text(key.toString());
+ Text mapValue = new Text();
+ if (jsonObj.get(key) != null) {
+ mapValue.set(jsonObj.get(key).toString());
+ }
+
+ value.put(mapKey, mapValue);
}
-
- value.put(mapKey, mapValue);
+ }
+ else {
+ // JSONParser#parse(String) may return a null reference, e.g. when
+ // the input parameter is the string "null". A single line with
+ // "null" is not valid JSON though.
+ LOG.warn("Could not json-decode string: " + line);
+ return false;
}
return true;
} catch (ParseException e) {
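
For context, json-simple's JSONParser#parse(String) returns a null reference for the input string "null", which the old loop would have dereferenced. A minimal sketch (not part of this diff) of how the patched decodeLineToJson behaves; the driver class and sample lines are illustrative only:

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.json.simple.parser.JSONParser;

import com.twitter.elephantbird.mapreduce.input.LzoJsonRecordReader;

public class DecodeLineToJsonExample {
  public static void main(String[] args) {
    JSONParser parser = new JSONParser();
    MapWritable value = new MapWritable();

    // A line containing only "null" now logs a warning and returns false
    // instead of failing with a NullPointerException.
    boolean ok = LzoJsonRecordReader.decodeLineToJson(parser, new Text("null"), value);
    System.out.println(ok);   // false

    // A well-formed object still decodes into the MapWritable as before.
    ok = LzoJsonRecordReader.decodeLineToJson(parser, new Text("{\"count\": 1}"), value);
    System.out.println(ok);   // true
  }
}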
@@ -0,0 +1,170 @@
+package com.twitter.elephantbird.mapreduce.input;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.thrift.TBase;
+
+import com.google.protobuf.Message;
+import com.twitter.data.proto.BlockStorage.SerializedBlock;
+import com.twitter.elephantbird.mapreduce.io.BinaryWritable;
+import com.twitter.elephantbird.util.HadoopUtils;
+import com.twitter.elephantbird.util.Protobufs;
+import com.twitter.elephantbird.util.TypeRef;
+
+/**
+ * The input may consist of a heterogeneous mix of formats storing
+ * compatible objects. Currently supported formats:
+ * <ol>
+ * <li> Lzo Block storage of Thrift and Protobuf objects
+ * <li> Lzo B64Line storage of Thrift and Protobuf objects
+ * </ol>
+ *
+ * <p>
+ * A small fraction of bad records are tolerated. See {@link LzoRecordReader}
+ * for more information on error handling.
+ */
+public class MultiInputFormat<M>
+ extends LzoInputFormat<LongWritable, BinaryWritable<M>> {
+
+ // TODO: need to handle multiple input formats in a job better.
+ // might be better to store the class name in the input split rather than in the config.
+ private static final String CLASS_CONF_KEY = "elephantbird.class.for.MultiInputFormat";
+
+ private TypeRef<M> typeRef;
+
+ public MultiInputFormat() {}
+
+ public MultiInputFormat(TypeRef<M> typeRef) {
+ this.typeRef = typeRef;
+ }
+
+ private static enum Format {
+ LZO_BLOCK,
+ LZO_B64LINE;
+ };
+
+ /**
+ * Sets the job's input format to {@link MultiInputFormat} and stores the
+ * supplied clazz's name in the job configuration. This configuration is
+ * read on the remote tasks to initialize the input format correctly.
+ */
+ public static void setInputFormatClass(Class<?> clazz, Job job) {
+ job.setInputFormatClass(MultiInputFormat.class);
+ HadoopUtils.setInputFormatClass(job.getConfiguration(), CLASS_CONF_KEY, clazz);
+ }
+
+ @SuppressWarnings("unchecked") // return type is runtime dependent
+ @Override
+ public RecordReader<LongWritable, BinaryWritable<M>>
+ createRecordReader(InputSplit split, TaskAttemptContext taskAttempt)
+ throws IOException, InterruptedException {
+ Configuration conf = taskAttempt.getConfiguration();
+ if (typeRef == null) {
+ setTypeRef(conf);
+ }
+ Class<?> recordClass = typeRef.getRawClass();
+
+ Format fileFormat = determineFileFormat(split, conf);
+
+ // Thrift
+ if (TBase.class.isAssignableFrom(recordClass)) {
+ switch (fileFormat) {
+ case LZO_BLOCK:
+ return new LzoThriftBlockRecordReader(typeRef);
+ case LZO_B64LINE:
+ return new LzoThriftB64LineRecordReader(typeRef);
+ }
+ }
+
+ // Protobuf
+ if (Message.class.isAssignableFrom(recordClass)) {
+ switch (fileFormat) {
+ case LZO_BLOCK:
+ return new LzoProtobufBlockRecordReader(typeRef);
+ case LZO_B64LINE:
+ return new LzoProtobufB64LineRecordReader(typeRef);
+ }
+ }
+
+ throw new IOException("Could not determine reader for "
+ + ((FileSplit)split).getPath() + " with class " + recordClass.getName());
+ }
+
+ /** set typeRef from conf */
+ private void setTypeRef(Configuration conf) {
+ String className = conf.get(CLASS_CONF_KEY);
+
+ if (className == null) {
+ throw new RuntimeException(CLASS_CONF_KEY + " is not set");
+ }
+
+ Class<?> clazz = null;
+ try {
+ clazz = conf.getClassByName(className);
+ } catch (ClassNotFoundException e) {
+ throw new RuntimeException("failed to load class '" + className + "'", e);
+ }
+
+ typeRef = new TypeRef<M>(clazz){};
+ }
+
+ /**
+ * Checks to see if the input records are stored as {@link SerializedBlock}.
+ * The block format starts with {@link Protobufs#KNOWN_GOOD_POSITION_MARKER}.
+ * Otherwise the input is assumed to be Base64 encoded lines.
+ */
+ private static Format determineFileFormat(InputSplit split,
+ Configuration conf)
+ throws IOException {
+ FileSplit fileSplit = (FileSplit)split;
+
+ Path file = fileSplit.getPath();
+
+ /* we could have an optional configuration that maps a regex on a
+ * file name to a format, e.g. ".*-block.lzo" to LZO_BLOCK.
+ */
+
+ // most of the cost is opening the file and
+ // reading the first LZO block (about 256KB of compressed data)
+
+ CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
+ if (codec == null) {
+ throw new IOException("No codec for file " + file + " found");
+ }
+
+ InputStream in = file.getFileSystem(conf).open(file);
+ InputStream lzoIn = null;
+
+ // check if the file starts with magic bytes for Block storage format.
+ try {
+ lzoIn = codec.createInputStream(in);
+
+ for(byte magic : Protobufs.KNOWN_GOOD_POSITION_MARKER) {
+ int b = lzoIn.read();
+ if (b < 0 || (byte)b != magic) {
+ return Format.LZO_B64LINE;
+ }
+ }
+ } finally {
+ IOUtils.closeStream(lzoIn);
+ IOUtils.closeStream(in);
+ }
+
+ // the check passed
+ return Format.LZO_BLOCK;
+ }
+}
+
+
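
For reference, a minimal sketch (not part of this diff) of how a job might wire up the new input format. MyThriftStruct is a hypothetical Thrift-generated class; any Thrift or Protobuf class can be used:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.twitter.elephantbird.mapreduce.input.MultiInputFormat;

public class MultiInputFormatJobSetup {
  public static void main(String[] args) throws IOException {
    Job job = new Job(new Configuration());

    // Sets the job's input format to MultiInputFormat and records
    // MyThriftStruct's name under CLASS_CONF_KEY in the job configuration.
    MultiInputFormat.setInputFormatClass(MyThriftStruct.class, job);

    // On the tasks, createRecordReader() reads the class name back from the
    // configuration and sniffs each split for the block-storage magic bytes
    // to pick between the LZO_BLOCK and LZO_B64LINE record readers.
  }
}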
@@ -5,12 +5,15 @@
import com.twitter.elephantbird.mapreduce.io.BinaryConverter;
import com.twitter.elephantbird.mapreduce.io.BinaryWritable;
+import com.twitter.elephantbird.mapreduce.io.ThriftConverter;
+import com.twitter.elephantbird.mapreduce.io.ThriftWritable;
import com.twitter.elephantbird.util.Codecs;
import com.twitter.elephantbird.util.Protobufs;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.thrift.TBase;
/**
* A RecordWriter-derived class for use with the LzoProtobufB64LineOutputFormat.
@@ -43,4 +46,11 @@ public void close(TaskAttemptContext taskAttemptContext)
throws IOException, InterruptedException {
out.close();
}
+
+ // for convenience
+ public static <M extends TBase<?, ?>> LzoBinaryB64LineRecordWriter<M, ThriftWritable<M>>
+ newThriftWriter(Class<M> tClass, DataOutputStream out) {
+ return new LzoBinaryB64LineRecordWriter<M, ThriftWritable<M>>
+ (ThriftConverter.newInstance(tClass), out);
+ }
}
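
A minimal sketch (not part of this diff) of the new convenience factory; MyThriftStruct is a hypothetical Thrift-generated class and the output path is illustrative:

import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import com.twitter.elephantbird.mapreduce.io.ThriftWritable;
import com.twitter.elephantbird.mapreduce.output.LzoBinaryB64LineRecordWriter;

public class NewThriftWriterExample {
  public static void main(String[] args) throws IOException {
    DataOutputStream out =
        new DataOutputStream(new FileOutputStream("records.b64"));

    // Equivalent to calling the constructor directly with
    // ThriftConverter.newInstance(MyThriftStruct.class), but without
    // spelling out the generic types at the call site.
    LzoBinaryB64LineRecordWriter<MyThriftStruct, ThriftWritable<MyThriftStruct>> writer =
        LzoBinaryB64LineRecordWriter.newThriftWriter(MyThriftStruct.class, out);
  }
}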
@@ -0,0 +1,144 @@
+package com.twitter.elephantbird.pig.load;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.pig.Expression;
+import org.apache.pig.LoadCaster;
+import org.apache.pig.LoadFunc;
+import org.apache.pig.LoadMetadata;
+import org.apache.pig.LoadPushDown;
+import org.apache.pig.ResourceSchema;
+import org.apache.pig.ResourceStatistics;
+import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+
+/**
+ * A wrapper LoadFunc that delegates all functionality to another loader,
+ * similar to a FilterInputStream.
+ */
+public class FilterLoadFunc extends LoadFunc implements LoadMetadata, LoadPushDown {
+
+ protected LoadFunc loader;
+
+ /**
+ * @param loader the delegate loader; may be null, since it is not always
+ * feasible to set the loader during construction. It can be set later
+ * with setLoader().
+ */
+ public FilterLoadFunc(LoadFunc loader) {
+ this.loader = loader;
+ }
+
+ public void setLoader(LoadFunc loader) {
+ this.loader = loader;
+ }
+
+ // just for readability
+ private boolean isSet() {
+ return loader != null;
+ }
+ // LoadFunc implementation:
+
+ @Override @SuppressWarnings("unchecked")
+ public InputFormat getInputFormat() throws IOException {
+ return isSet() ? loader.getInputFormat() : null;
+ }
+
+ @Override
+ public LoadCaster getLoadCaster() throws IOException {
+ return isSet() ? loader.getLoadCaster() : null;
+ }
+
+ @Override
+ public Tuple getNext() throws IOException {
+ return isSet() ? loader.getNext() : null;
+ }
+
+ @Override @SuppressWarnings("unchecked")
+ public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
+ if (isSet()) {
+ loader.prepareToRead(reader, split);
+ }
+ }
+
+ @Override
+ public String relativeToAbsolutePath(String location, Path curDir)
+ throws IOException {
+ return isSet() ?
+ loader.relativeToAbsolutePath(location, curDir):
+ super.relativeToAbsolutePath(location, curDir);
+ }
+
+ @Override
+ public void setLocation(String location, Job job) throws IOException {
+ if (isSet()) {
+ loader.setLocation(location, job);
+ }
+ }
+
+ @Override
+ public void setUDFContextSignature(String signature) {
+ if (isSet()) {
+ loader.setUDFContextSignature(signature);
+ } else {
+ super.setUDFContextSignature(signature);
+ }
+ }
+
+ // LoadMetadata & LoadPushDown interface.
+
+ // helpers for casting:
+ private static LoadMetadata asLoadMetadata(LoadFunc loader) {
+ return loader instanceof LoadMetadata ? (LoadMetadata) loader : null;
+ }
+
+ private static LoadPushDown asLoadPushDown(LoadFunc loader) {
+ return loader instanceof LoadPushDown ? (LoadPushDown) loader : null;
+ }
+
+
+ @Override
+ public String[] getPartitionKeys(String location, Job job) throws IOException {
+ LoadMetadata metadata = asLoadMetadata(loader);
+ return metadata == null ? null : metadata.getPartitionKeys(location, job);
+ }
+
+ @Override
+ public ResourceSchema getSchema(String location, Job job) throws IOException {
+ LoadMetadata metadata = asLoadMetadata(loader);
+ return metadata == null ? null : metadata.getSchema(location, job);
+ }
+
+ @Override
+ public ResourceStatistics getStatistics(String location, Job job) throws IOException {
+ LoadMetadata metadata = asLoadMetadata(loader);
+ return metadata == null ? null : metadata.getStatistics(location, job);
+ }
+
+ @Override
+ public void setPartitionFilter(Expression partitionFilter) throws IOException {
+ LoadMetadata metadata = asLoadMetadata(loader);
+ if (metadata != null) {
+ metadata.setPartitionFilter(partitionFilter);
+ }
+ }
+
+ @Override
+ public List<OperatorSet> getFeatures() {
+ LoadPushDown pushDown = asLoadPushDown(loader);
+ return pushDown == null ? null : pushDown.getFeatures();
+ }
+
+ @Override
+ public RequiredFieldResponse pushProjection(
+ RequiredFieldList requiredFieldList) throws FrontendException {
+ LoadPushDown pushDown = asLoadPushDown(loader);
+ return pushDown == null ? null : pushDown.pushProjection(requiredFieldList);
+ }
+
+}
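
A minimal sketch (not part of this diff) of the intended wrapping pattern, using a hypothetical subclass that delegates everything and intercepts a single call:

import java.io.IOException;

import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.LoadFunc;

import com.twitter.elephantbird.pig.load.FilterLoadFunc;

// All LoadFunc, LoadMetadata and LoadPushDown calls fall through to the
// wrapped loader; only setLocation() is intercepted, mirroring how
// FilterInputStream subclasses work.
public class LoggingLoadFunc extends FilterLoadFunc {

  public LoggingLoadFunc(LoadFunc delegate) {
    super(delegate);
  }

  @Override
  public void setLocation(String location, Job job) throws IOException {
    System.err.println("Loading from " + location);
    super.setLocation(location, job);   // no-op until a loader is set
  }
}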