Skip to content
This repository has been archived by the owner on Jan 5, 2021. It is now read-only.

Commit

Permalink
Initial open source commit
Browse files Browse the repository at this point in the history
(Herringbone is a common pattern for parquet flooring, whee!)
  • Loading branch information
DanielleSucher committed Nov 21, 2014
0 parents commit 1f2d9f0
Show file tree
Hide file tree
Showing 47 changed files with 6,116 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .gitignore
@@ -0,0 +1,10 @@
target/
data/
.idea/
*.pyc
*.iml
# ignore ROC plots
*.pdf
.tddium*

.DS_Store
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2014- Stripe, Inc. (https://stripe.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
40 changes: 40 additions & 0 deletions README.md
@@ -0,0 +1,40 @@
Herringbone
===========

Herringbone is a suite of tools for working with Parquet files on HDFS, and with Impala and Hive.

The available commands are:

`flatten`: transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive (neither of which support nested schemas)

$ herringbone flatten -i /path/to/input/directory -o /path/to/output/directory

`load`: load a directory of parquet files (which must have a flat schema) into impala or hive (defaulting to impala)

$ herringbone load [--hive] [-u] -d db_name -t table -p /path/to/parquet/directory

`tsv`: transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`)

$ herringbone tsv -i /path/to/input/directory -o /path/to/output/directory

`compact`: transform a directory of parquet files into a directory of fewer larger parquet files

$ herringbone compact -i /path/to/input/directory -o /path/to/output/directory

See `herringbone COMMAND --help` for more information on a specific command.

Building
--------

You'll need thrift 0.9.1 on your path.

    $ git clone https://github.com/stripe/herringbone.git
$ cd herringbone
$ mvn package

Authors
-------

- [Avi Bryant](http://twitter.com/avibryant)
- [Danielle Sucher](http://twitter.com/daniellesucher)
- [Jeff Balogh](http://twitter.com/jbalogh)
72 changes: 72 additions & 0 deletions bin/herringbone
@@ -0,0 +1,72 @@
#!/usr/bin/env ruby

usage = <<-USAGE
Herringbone is a suite of tools for working with parquet files on hdfs.
The available commands are:
flatten: Transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive
load: Load a directory of parquet files (which must have a flat schema) into impala or hive (defaults to impala)
tsv: Transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`)
compact: Transform a directory of parquet files into a directory of fewer larger parquet files
Example usage:
`herringbone flatten -i /path/to/input/directory -o /path/to/output/directory`
`herringbone load [--hive] [-u] -d db_name -t table -p /path/to/parquet/directory`
`herringbone tsv -i /path/to/input/directory -o /path/to/output/directory`
`herringbone compact -i /path/to/input/directory -o /path/to/output/directory`
See 'herringbone COMMAND --help' for more information on a specific command.
USAGE

# Each CLI subcommand maps to a job class under com.stripe.herringbone.
jobs_by_command = {
  'compact' => 'CompactJob',
  'load'    => 'ParquetLoad',
  'flatten' => 'FlattenJob',
  'tsv'     => 'TsvJob',
}

command = ARGV.shift
job = jobs_by_command[command]

# Print usage for -h/--help; reject anything that isn't a known command.
if %w[-h --help].include?(command)
  puts usage
  exit 0
end

unless job
  STDERR.puts "\nError: #{command} is not an available command\n\n"
  puts "#{'=' * 30}\n\n"
  puts usage
  exit 1
end

# Fat jar produced by `mvn package` (see README), located relative to this script.
jar = File.expand_path(
  File.join(File.dirname(__FILE__), '..', 'herringbone-main', 'target',
            'herringbone-0.0.1-jar-with-dependencies.jar')
)

ENV['HADOOP_CLASSPATH'] = jar
ENV['HADOOP_USER_CLASSPATH_FIRST'] = 'true'

# Replace this process with the hadoop job runner; remaining ARGV is
# passed straight through to the job class.
exec(
  'hadoop',
  'jar',
  jar,
  "com.stripe.herringbone.#{job}",
  *ARGV
)
118 changes: 118 additions & 0 deletions herringbone-impala/pom.xml
@@ -0,0 +1,118 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.stripe</groupId>
<artifactId>herringbone-impala</artifactId>
<version>0.0.2</version>
<packaging>jar</packaging>

<!-- Build config for the herringbone-impala module: Scala sources plus
     thrift-generated Java client code for talking to Impala. -->
<name>Herringbone Impala</name>

<!-- Hosts the maven-thrift-plugin used below.
     NOTE(review): plain-http repository URL; confirm it is still reachable
     or switch to an https mirror. -->
<pluginRepositories>
<pluginRepository>
<id>dtrott</id>
<url>http://maven.davidtrott.com/repository</url>
</pluginRepository>
</pluginRepositories>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>

<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>2.3.1</version>
</plugin>

<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>2.4.3</version>
</plugin>

<!-- Compiles the Scala sources in this module. -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.1.6</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>

<!-- Generates Java from .thrift IDL at generate-sources time.
     Requires a `thrift` binary on PATH (see README: thrift 0.9.1). -->
<plugin>
<groupId>org.apache.thrift.tools</groupId>
<artifactId>maven-thrift-plugin</artifactId>
<version>0.1.11</version>
<configuration>
<checkStaleness>true</checkStaleness>
<thriftExecutable>thrift</thriftExecutable>
</configuration>
<executions>
<execution>
<id>thrift-sources</id>
<phase>generate-sources</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>thrift-test-sources</id>
<phase>generate-test-sources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>

</plugins>
</build>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<scala.version>2.10.3</scala.version>
<maven.compiler.source>1.6</maven.compiler.source>
<maven.compiler.target>1.6</maven.compiler.target>
</properties>

<repositories>
<repository>
<id>cloudera-releases</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>

<dependencies>
<!-- Runtime thrift library; version matches the `thrift` compiler the
     build expects (0.9.1). -->
<dependency>
<groupId>org.apache.thrift</groupId>
<artifactId>libthrift</artifactId>
<version>0.9.1</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.5.2</version>
</dependency>
</dependencies>
</project>


@@ -0,0 +1,65 @@
package com.stripe.herringbone.impala

import org.apache.thrift.transport.TSocket
import org.apache.thrift.protocol.TBinaryProtocol

import com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient}
import com.cloudera.beeswax.api._

import scala.annotation.tailrec
import scala.collection.JavaConversions._

/**
 * A connection to an Impala daemon over the beeswax thrift interface.
 *
 * The socket is opened eagerly when the instance is constructed.
 * NOTE(review): `isOpen` is unsynchronized mutable state — assumes a
 * Connection is not shared across threads; confirm with callers.
 *
 * @param host hostname of the impalad to connect to
 * @param port port of the impalad beeswax service
 */
case class Connection(host: String, port: Int) {
  var isOpen = false
  lazy val socket = new TSocket(host, port)
  lazy val client = new ClouderaImpalaClient(new TBinaryProtocol(socket))

  open()

  /** Open the socket and reset the catalog; no-op if already open. */
  def open(): Unit = {
    if (!isOpen) {
      socket.open()
      client.ResetCatalog()
      isOpen = true
    }
  }

  /** Close the underlying socket; no-op if already closed. */
  def close(): Unit = {
    if (isOpen) {
      socket.close()
      isOpen = false
    }
  }

  /** Refresh the metadata store. */
  def refresh(): Unit = {
    if (!isOpen) throw ConnectionException("Connection closed")
    client.ResetCatalog()
  }

  /**
   * Perform a query, and pass in a function that will be called with each
   * row of the results. The cursor is always closed, even if `fn` throws.
   */
  def query(raw: String)(fn: Seq[ImpalaValue] => Unit): Unit = {
    val cursor = execute(raw)
    try {
      cursor.foreach { row => fn(row) }
    } finally {
      cursor.close
    }
  }

  /**
   * Perform a query and return a cursor for iterating over the results.
   * You probably want to call cursor.close when you're done with it.
   *
   * @throws ConnectionException if the connection is closed
   * @throws InvalidQueryException if the query string is blank
   */
  def execute(raw: String): Cursor = {
    if (!isOpen) throw ConnectionException("Connection closed")
    validateQuery(raw)

    val query = new Query
    query.query = raw

    val handle = client.query(query)
    Cursor(handle, client)
  }

  // Reject blank queries. The previous `raw.split("\\s+").isEmpty` check was
  // dead code: in Java/Scala, splitting "" (or all-whitespace input) yields
  // Array(""), which is never empty, so the exception could never be thrown.
  private def validateQuery(raw: String): Unit = {
    if (raw.trim.isEmpty) throw InvalidQueryException("Empty query")
  }
}

0 comments on commit 1f2d9f0

Please sign in to comment.