Skip to content
This repository has been archived by the owner on Jan 5, 2021. It is now read-only.

Commit

Permalink
Initial open source commit
Browse files Browse the repository at this point in the history
(Herringbone is a common pattern for parquet flooring, whee!)
  • Loading branch information
DanielleSucher committed Nov 21, 2014
0 parents commit 1f2d9f0
Show file tree
Hide file tree
Showing 47 changed files with 6,116 additions and 0 deletions.
10 changes: 10 additions & 0 deletions .gitignore
@@ -0,0 +1,10 @@
target/
data/
.idea/
*.pyc
*.iml
# ignore ROC plots
*.pdf
.tddium*

.DS_Store
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2014- Stripe, Inc. (https://stripe.com)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
40 changes: 40 additions & 0 deletions README.md
@@ -0,0 +1,40 @@
Herringbone
===========

Herringbone is a suite of tools for working with Parquet files on HDFS, and with Impala and Hive.

The available commands are:

`flatten`: transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive (neither of which support nested schemas)

$ herringbone flatten -i /path/to/input/directory -o /path/to/output/directory

`load`: load a directory of parquet files (which must have a flat schema) into impala or hive (defaulting to impala)

$ herringbone load [--hive] [-u] -d db_name -t table -p /path/to/parquet/directory

`tsv`: transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`)

$ herringbone tsv -i /path/to/input/directory -o /path/to/output/directory

`compact`: transform a directory of parquet files into a directory of fewer larger parquet files

$ herringbone compact -i /path/to/input/directory -o /path/to/output/directory

See `herringbone COMMAND --help` for more information on a specific command.

Building
--------

You'll need thrift 0.9.1 on your path.

    $ git clone https://github.com/stripe/herringbone.git
$ cd herringbone
$ mvn package

Authors
-------

- [Avi Bryant](http://twitter.com/avibryant)
- [Danielle Sucher](http://twitter.com/daniellesucher)
- [Jeff Balogh](http://twitter.com/jbalogh)
72 changes: 72 additions & 0 deletions bin/herringbone
@@ -0,0 +1,72 @@
#!/usr/bin/env ruby

usage = <<-USAGE
Herringbone is a suite of tools for working with parquet files on hdfs.
The available commands are:
flatten: Transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive
load: Load a directory of parquet files (which must have a flat schema) into impala or hive (defaults to impala)
tsv: Transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`)
compact: Transform a directory of parquet files into a directory of fewer larger parquet files
Example usage:
`herringbone flatten -i /path/to/input/directory -o /path/to/output/directory`
`herringbone load [--hive] [-u] -d db_name -t table -p /path/to/parquet/directory`
`herringbone tsv -i /path/to/input/directory -o /path/to/output/directory`
`herringbone compact -i /path/to/input/directory -o /path/to/output/directory`
See 'herringbone COMMAND --help' for more information on a specific command.
USAGE

# Each CLI subcommand maps to a job class under com.stripe.herringbone.
jobs_by_command = {
  'compact' => 'CompactJob',
  'load'    => 'ParquetLoad',
  'flatten' => 'FlattenJob',
  'tsv'     => 'TsvJob',
}

command = ARGV.shift
job = jobs_by_command[command]

# Print usage for -h/--help; reject anything that isn't a known command.
if %w[-h --help].include?(command)
  puts usage
  exit 0
end

unless job
  STDERR.puts "\nError: #{command} is not an available command\n\n"
  puts "#{'=' * 30}\n\n"
  puts usage
  exit 1
end

# Fat jar produced by `mvn package` (see README), located relative to this script.
jar = File.expand_path(
  File.join(File.dirname(__FILE__), '..', 'herringbone-main', 'target',
            'herringbone-0.0.1-jar-with-dependencies.jar')
)

ENV['HADOOP_CLASSPATH'] = jar
ENV['HADOOP_USER_CLASSPATH_FIRST'] = 'true'

# Replace this process with the hadoop job runner; remaining ARGV is
# passed straight through to the job class.
exec(
  'hadoop',
  'jar',
  jar,
  "com.stripe.herringbone.#{job}",
  *ARGV
)
118 changes: 118 additions & 0 deletions herringbone-impala/pom.xml
@@ -0,0 +1,118 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.stripe</groupId>
<artifactId>herringbone-impala</artifactId>
<version>0.0.2</version>
<packaging>jar</packaging>

<!-- Build config for the herringbone-impala module: Scala sources plus
     thrift-generated Java client code for talking to Impala. -->
<name>Herringbone Impala</name>

<!-- Hosts the maven-thrift-plugin used below.
     NOTE(review): plain-http repository URL; confirm it is still reachable
     or switch to an https mirror. -->
<pluginRepositories>
<pluginRepository>
<id>dtrott</id>
<url>http://maven.davidtrott.com/repository</url>
</pluginRepository>
</pluginRepositories>

<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>

<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>2.3.1</version>
</plugin>

<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>2.4.3</version>
</plugin>

<!-- Compiles the Scala sources in this module. -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.1.6</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>

<!-- Generates Java from .thrift IDL at generate-sources time.
     Requires a `thrift` binary on PATH (see README: thrift 0.9.1). -->
<plugin>
<groupId>org.apache.thrift.tools</groupId>
<artifactId>maven-thrift-plugin</artifactId>
<version>0.1.11</version>
<configuration>
<checkStaleness>true</checkStaleness>
<thriftExecutable>thrift</thriftExecutable>
</configuration>
<executions>
<execution>
<id>thrift-sources</id>
<phase>generate-sources</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>thrift-test-sources</id>
<phase>generate-test-sources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>

</plugins>
</build>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<scala.version>2.10.3</scala.version>
<maven.compiler.source>1.6</maven.compiler.source>
<maven.compiler.target>1.6</maven.compiler.target>
</properties>

<repositories>
<repository>
<id>cloudera-releases</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>

<dependencies>
<!-- Runtime thrift library; version matches the `thrift` compiler the
     build expects (0.9.1). -->
<dependency>
<groupId>org.apache.thrift</groupId>
<artifactId>libthrift</artifactId>
<version>0.9.1</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.5.2</version>
</dependency>
</dependencies>
</project>


@@ -0,0 +1,65 @@
package com.stripe.herringbone.impala

import org.apache.thrift.transport.TSocket
import org.apache.thrift.protocol.TBinaryProtocol

import com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient}
import com.cloudera.beeswax.api._

import scala.annotation.tailrec
import scala.collection.JavaConversions._

/**
 * A connection to an Impala daemon over the beeswax thrift interface.
 *
 * The socket is opened eagerly when the instance is constructed.
 * NOTE(review): `isOpen` is unsynchronized mutable state — assumes a
 * Connection is not shared across threads; confirm with callers.
 *
 * @param host hostname of the impalad to connect to
 * @param port port of the impalad beeswax service
 */
case class Connection(host: String, port: Int) {
  var isOpen = false
  lazy val socket = new TSocket(host, port)
  lazy val client = new ClouderaImpalaClient(new TBinaryProtocol(socket))

  open()

  /** Open the socket and reset the catalog; no-op if already open. */
  def open(): Unit = {
    if (!isOpen) {
      socket.open()
      client.ResetCatalog()
      isOpen = true
    }
  }

  /** Close the underlying socket; no-op if already closed. */
  def close(): Unit = {
    if (isOpen) {
      socket.close()
      isOpen = false
    }
  }

  /** Refresh the metadata store. */
  def refresh(): Unit = {
    if (!isOpen) throw ConnectionException("Connection closed")
    client.ResetCatalog()
  }

  /**
   * Perform a query, and pass in a function that will be called with each
   * row of the results. The cursor is always closed, even if `fn` throws.
   */
  def query(raw: String)(fn: Seq[ImpalaValue] => Unit): Unit = {
    val cursor = execute(raw)
    try {
      cursor.foreach { row => fn(row) }
    } finally {
      cursor.close
    }
  }

  /**
   * Perform a query and return a cursor for iterating over the results.
   * You probably want to call cursor.close when you're done with it.
   *
   * @throws ConnectionException if the connection is closed
   * @throws InvalidQueryException if the query string is blank
   */
  def execute(raw: String): Cursor = {
    if (!isOpen) throw ConnectionException("Connection closed")
    validateQuery(raw)

    val query = new Query
    query.query = raw

    val handle = client.query(query)
    Cursor(handle, client)
  }

  // Reject blank queries. The previous `raw.split("\\s+").isEmpty` check was
  // dead code: in Java/Scala, splitting "" (or all-whitespace input) yields
  // Array(""), which is never empty, so the exception could never be thrown.
  private def validateQuery(raw: String): Unit = {
    if (raw.trim.isEmpty) throw InvalidQueryException("Empty query")
  }
}

0 comments on commit 1f2d9f0

Please sign in to comment.