From 1f2d9f06d25ba90373d2a50e7852313c8292421e Mon Sep 17 00:00:00 2001 From: Danielle Sucher Date: Fri, 21 Nov 2014 13:18:34 -0500 Subject: [PATCH] Initial open source commit (Herringbone is a common pattern for parquet flooring, whee!) --- .gitignore | 10 + LICENSE | 21 + README.md | 40 + bin/herringbone | 72 ++ herringbone-impala/pom.xml | 118 ++ .../herringbone/impala/Connection.scala | 65 ++ .../stripe/herringbone/impala/Cursor.scala | 98 ++ .../herringbone/impala/Exceptions.scala | 7 + .../herringbone/impala/ImpalaClient.scala | 16 + .../herringbone/impala/ImpalaValue.scala | 43 + .../src/main/thrift/ImpalaService.thrift | 177 +++ .../src/main/thrift/Status.thrift | 32 + .../src/main/thrift/beeswax.thrift | 175 +++ .../src/main/thrift/cli_service.thrift | 1015 +++++++++++++++++ .../src/main/thrift/fb303.thrift | 112 ++ .../src/main/thrift/hive_metastore.thrift | 528 +++++++++ herringbone-main/pom.xml | 168 +++ .../herringbone/CompactInputFormat.scala | 168 +++ .../com/stripe/herringbone/CompactJob.scala | 98 ++ .../com/stripe/herringbone/FlattenJob.scala | 78 ++ .../com/stripe/herringbone/ParquetLoad.scala | 45 + .../scala/com/stripe/herringbone/TsvJob.scala | 98 ++ .../herringbone/flatten/FlatConsumer.scala | 108 ++ .../herringbone/flatten/FlatConverter.scala | 54 + .../herringbone/flatten/ParquetFlatConf.scala | 11 + .../flatten/ParquetFlatMapper.scala | 29 + .../herringbone/flatten/TypeFlattener.scala | 59 + .../stripe/herringbone/load/FieldUtils.scala | 53 + .../stripe/herringbone/load/HadoopFs.scala | 39 + .../stripe/herringbone/load/HiveLoader.scala | 76 ++ .../load/HiveServer2Connection.scala | 35 + .../herringbone/load/ImpalaLoader.scala | 122 ++ .../herringbone/load/ParquetLoadConf.scala | 18 + .../herringbone/load/ParquetLoader.scala | 9 + .../herringbone/util/ParquetUtils.scala | 36 + .../src/main/thrift/ImpalaService.thrift | 177 +++ .../src/main/thrift/Status.thrift | 32 + .../src/main/thrift/beeswax.thrift | 175 +++ .../src/main/thrift/cli_service.thrift | 1015 +++++++++++++++++ herringbone-main/src/main/thrift/fb303.thrift | 112 ++ .../src/main/thrift/hive_metastore.thrift | 528 +++++++++ .../src/test/resources/test.parquet | Bin 0 -> 916 bytes .../stripe/herringbone/FlattenJobTest.scala | 22 + .../flatten/FlatConverterTest.scala | 61 + .../flatten/TypeFlattenerTest.scala | 95 ++ .../herringbone/load/FieldUtilsTest.scala | 49 + pom.xml | 17 + 47 files changed, 6116 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100755 bin/herringbone create mode 100644 herringbone-impala/pom.xml create mode 100644 herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Connection.scala create mode 100644 herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Cursor.scala create mode 100644 herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Exceptions.scala create mode 100644 herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaClient.scala create mode 100644 herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaValue.scala create mode 100644 herringbone-impala/src/main/thrift/ImpalaService.thrift create mode 100644 herringbone-impala/src/main/thrift/Status.thrift create mode 100644 herringbone-impala/src/main/thrift/beeswax.thrift create mode 100644 herringbone-impala/src/main/thrift/cli_service.thrift create mode 100644 herringbone-impala/src/main/thrift/fb303.thrift create mode 100644 herringbone-impala/src/main/thrift/hive_metastore.thrift create mode 100644 
herringbone-main/pom.xml create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/CompactInputFormat.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/CompactJob.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/FlattenJob.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/ParquetLoad.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/TsvJob.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConsumer.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConverter.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatConf.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatMapper.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/flatten/TypeFlattener.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/load/FieldUtils.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/load/HadoopFs.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveLoader.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveServer2Connection.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/load/ImpalaLoader.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoadConf.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoader.scala create mode 100644 herringbone-main/src/main/scala/com/stripe/herringbone/util/ParquetUtils.scala create mode 100644 herringbone-main/src/main/thrift/ImpalaService.thrift create mode 100644 herringbone-main/src/main/thrift/Status.thrift create mode 100644 herringbone-main/src/main/thrift/beeswax.thrift create mode 100644 herringbone-main/src/main/thrift/cli_service.thrift create mode 100644 herringbone-main/src/main/thrift/fb303.thrift create mode 100644 herringbone-main/src/main/thrift/hive_metastore.thrift create mode 100644 herringbone-main/src/test/resources/test.parquet create mode 100644 herringbone-main/src/test/scala/com/stripe/herringbone/FlattenJobTest.scala create mode 100644 herringbone-main/src/test/scala/com/stripe/herringbone/flatten/FlatConverterTest.scala create mode 100644 herringbone-main/src/test/scala/com/stripe/herringbone/flatten/TypeFlattenerTest.scala create mode 100644 herringbone-main/src/test/scala/com/stripe/herringbone/load/FieldUtilsTest.scala create mode 100644 pom.xml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3c6ba44 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +target/ +data/ +.idea/ +*.pyc +*.iml +# ignore ROC plots +*.pdf +.tddium* + +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..2754f88 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014- Stripe, Inc. 
(https://stripe.com)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..fcbf773
--- /dev/null
+++ b/README.md
@@ -0,0 +1,40 @@
+Herringbone
+===========
+
+Herringbone is a suite of tools for working with Parquet files on HDFS, and with Impala and Hive.
+
+The available commands are:
+
+`flatten`: transform a directory of Parquet files with a nested structure into a directory of Parquet files with a flat schema that can be loaded into Impala or Hive (neither of which supports nested schemas)
+
+    $ herringbone flatten -i /path/to/input/directory -o /path/to/output/directory
+
+`load`: load a directory of Parquet files (which must have a flat schema) into Impala or Hive (defaulting to Impala)
+
+    $ herringbone load [--hive] [-u] -d db_name -t table -p /path/to/parquet/directory
+
+`tsv`: transform a directory of Parquet files into a directory of TSV files (which you can concatenate later with `hadoop fs -getmerge /path/to/tsvs`)
+
+    $ herringbone tsv -i /path/to/input/directory -o /path/to/output/directory
+
+`compact`: transform a directory of Parquet files into a directory of fewer, larger Parquet files
+
+    $ herringbone compact -i /path/to/input/directory -o /path/to/output/directory
+
+See `herringbone COMMAND --help` for more information on a specific command.
+
+Building
+--------
+
+You'll need Thrift 0.9.1 on your path.
+
+    $ git clone https://github.com/stripe/herringbone
+    $ cd herringbone
+    $ mvn package
+
+Authors
+-------
+
+ - [Avi Bryant](http://twitter.com/avibryant)
+ - [Danielle Sucher](http://twitter.com/daniellesucher)
+ - [Jeff Balogh](http://twitter.com/jbalogh)
diff --git a/bin/herringbone b/bin/herringbone
new file mode 100755
index 0000000..0a5e6c6
--- /dev/null
+++ b/bin/herringbone
@@ -0,0 +1,72 @@
+#!/usr/bin/env ruby
+
+usage = <<-USAGE
+Herringbone is a suite of tools for working with Parquet files on HDFS.
+ +The available commands are: + +flatten: Transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive + +load: Load a directory of parquet files (which must have a flat schema) into impala or hive (defaults to impala) + +tsv: Transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`) + +compact: Transform a directory of parquet files into a directory of fewer larger parquet files + + +Example usage: + +`herringbone flatten -i /path/to/input/directory -o /path/to/output/directory` + +`herringbone load [--hive] [-u] -d db_name -t table -p /path/to/parquet/directory` + +`herringbone tsv -i /path/to/input/directory -o /path/to/output/directory` + +`herringbone compact -i /path/to/input/directory -o /path/to/output/directory` + + +See 'herringbone COMMAND --help' for more information on a specific command. + + + USAGE + +command_jobs = { + 'compact' => 'CompactJob', + 'load' => 'ParquetLoad', + 'flatten' => 'FlattenJob', + 'tsv' => 'TsvJob', +} + +# Validate the given command and print usage if needed. +command = ARGV.shift +JOB = command_jobs[command] + +if ['-h', '--help'].include?(command) + puts usage + exit 0 +elsif !JOB + STDERR.puts "\nError: #{command} is not an available command\n\n" + puts "#{'=' * 30}\n\n" + puts usage + exit 1 +end + +jar_path = File.join( + File.dirname(__FILE__), + '../', + 'herringbone-main', + 'target', + 'herringbone-0.0.1-jar-with-dependencies.jar' +) +JAR = File.expand_path(jar_path) + +ENV["HADOOP_CLASSPATH"] = JAR +ENV["HADOOP_USER_CLASSPATH_FIRST"] = "true" + +exec( + "hadoop", + "jar", + JAR, + "com.stripe.herringbone.#{JOB}", + *ARGV +) diff --git a/herringbone-impala/pom.xml b/herringbone-impala/pom.xml new file mode 100644 index 0000000..f90c0f0 --- /dev/null +++ b/herringbone-impala/pom.xml @@ -0,0 +1,118 @@ + + 4.0.0 + + com.stripe + herringbone-impala + 0.0.2 + jar + + Herringbone Impala + + + + dtrott + http://maven.davidtrott.com/repository + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 1.6 + 1.6 + + + + + maven-jar-plugin + 2.3.1 + + + + maven-resources-plugin + 2.4.3 + + + + net.alchim31.maven + scala-maven-plugin + 3.1.6 + + + + compile + testCompile + + + + + + + org.apache.thrift.tools + maven-thrift-plugin + 0.1.11 + + true + thrift + + + + thrift-sources + generate-sources + + compile + + + + thrift-test-sources + generate-test-sources + + testCompile + + + + + + + + + + UTF-8 + 2.10.3 + 1.6 + 1.6 + + + + + cloudera-releases + https://repository.cloudera.com/artifactory/cloudera-repos + + true + + + false + + + + + + + org.apache.thrift + libthrift + 0.9.1 + + + org.slf4j + slf4j-log4j12 + 1.5.2 + + + + + diff --git a/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Connection.scala b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Connection.scala new file mode 100644 index 0000000..11f7a9a --- /dev/null +++ b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Connection.scala @@ -0,0 +1,65 @@ +package com.stripe.herringbone.impala + +import org.apache.thrift.transport.TSocket +import org.apache.thrift.protocol.TBinaryProtocol + +import com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient} +import com.cloudera.beeswax.api._ + +import scala.annotation.tailrec +import scala.collection.JavaConversions._ + +case class Connection(host: String, port: Int) { + var isOpen = 
false
+  lazy val socket = new TSocket(host, port)
+  lazy val client = new ClouderaImpalaClient(new TBinaryProtocol(socket))
+
+  open
+
+  def open = {
+    if (!isOpen) {
+      socket.open
+      client.ResetCatalog
+      isOpen = true
+    }
+  }
+
+  def close = {
+    if (isOpen) {
+      socket.close
+      isOpen = false
+    }
+  }
+
+  // Refresh the metadata store.
+  def refresh = {
+    if (!isOpen) throw ConnectionException("Connection closed")
+    client.ResetCatalog
+  }
+
+  // Perform a query, and pass in a function that will be called with each
+  // row of the results.
+  def query(raw: String)(fn: Seq[ImpalaValue] => Unit) {
+    val cursor = execute(raw)
+    cursor.foreach { row => fn(row) }
+    cursor.close
+  }
+
+  // Perform a query and return a cursor for iterating over the results.
+  // You probably want to call cursor.close when you're done with it.
+  def execute(raw: String): Cursor = {
+    if (!isOpen) throw ConnectionException("Connection closed")
+    validateQuery(raw)
+
+    val query = new Query
+    query.query = raw
+
+    val handle = client.query(query)
+    Cursor(handle, client)
+  }
+
+  private def validateQuery(raw: String) = {
+    val trimmed = raw.trim
+    if (trimmed.isEmpty) throw InvalidQueryException("Empty query")
+  }
+}
diff --git a/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Cursor.scala b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Cursor.scala
new file mode 100644
index 0000000..394af69
--- /dev/null
+++ b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Cursor.scala
@@ -0,0 +1,98 @@
+package com.stripe.herringbone.impala
+
+import org.apache.hadoop.hive.metastore.api.FieldSchema
+
+import com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient}
+import com.cloudera.beeswax.api._
+
+import scala.collection.mutable.ArrayBuffer
+import scala.collection.JavaConversions._
+
+case class Cursor(handle: QueryHandle, client: ClouderaImpalaClient) {
+  var done = false
+  var isOpen = true
+  var rowBuffer = ArrayBuffer.empty[Seq[ImpalaValue]]
+  val bufferSize = 1024
+  private lazy val metadata: ResultsMetadata = client.get_results_metadata(handle)
+
+  def foreach(fn: Seq[ImpalaValue] => Unit) = {
+    var row = fetchRow
+    while (row.isDefined) {
+      fn(row.get)
+      row = fetchRow
+    }
+  }
+
+  def fetchRow: Option[Seq[ImpalaValue]] = {
+    if (rowBuffer.isEmpty) {
+      if (done) {
+        None
+      } else {
+        fetchMore
+        fetchRow
+      }
+    } else {
+      val row = rowBuffer.head
+      rowBuffer = rowBuffer.tail
+      Some(row)
+    }
+  }
+
+  // Close the cursor on the remote server. Once a cursor is closed, you
+  // can no longer fetch any rows from it.
+  def close = {
+    if (isOpen) {
+      isOpen = false
+      client.close(handle)
+    }
+  }
+
+  // Returns true if there are any more rows to fetch.
+  def hasMore = !done || !rowBuffer.isEmpty
+
+  def runtime_profile = client.GetRuntimeProfile(handle)
+
+  private def fetchMore = {
+    while (!done && rowBuffer.size < bufferSize) {
+      fetchBatch
+    }
+  }
+
+  private def fetchBatch = {
+    if (!isOpen) throw CursorException("Cursor has expired or been closed")
+
+    try {
+      val response = client.fetch(handle, false, bufferSize)
+      validateQueryState(client.get_state(handle))
+
+      val rows = response.data.map { row => parseRow(row) }
+      rowBuffer ++= rows
+
+      if (!response.has_more) {
+        done = true
+        close
+      }
+    } catch {
+      case e: BeeswaxException => {
+        isOpen = false
+        throw e
+      }
+      case e: Exception => throw e
+    }
+  }
+
+  private def parseRow(row: String) = {
+    val fields = row.split(metadata.delim)
+
+    metadata.schema.getFieldSchemas.zip(fields).map { case(schema, rawValue) =>
+      ImpalaValue(rawValue, schema.getName, schema.getType)
+    }
+  }
+
+  private def validateQueryState(state: QueryState) = {
+    if (state == QueryState.EXCEPTION) {
+      close
+      throw CursorException("The query was aborted")
+    }
+  }
+}
diff --git a/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Exceptions.scala b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Exceptions.scala
new file mode 100644
index 0000000..17725e3
--- /dev/null
+++ b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Exceptions.scala
@@ -0,0 +1,7 @@
+package com.stripe.herringbone.impala
+
+case class ConnectionException(message: String) extends Exception(message)
+case class CursorException(message: String) extends Exception(message)
+case class InvalidQueryException(message: String) extends Exception(message)
+case class ParsingException(message: String) extends Exception(message)
+
diff --git a/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaClient.scala b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaClient.scala
new file mode 100644
index 0000000..a873abf
--- /dev/null
+++ b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaClient.scala
@@ -0,0 +1,16 @@
+package com.stripe.herringbone.impala
+
+case class ImpalaClient(host: String, port: Int) {
+  lazy val connection = Connection(host, port)
+
+  def execute(raw: String) {
+    query(raw){ row =>
+      println(row.map { _.raw }.mkString(" "))
+    }
+  }
+
+  def query(raw: String)(fn: Seq[ImpalaValue] => Unit) {
+    println(raw)
+    connection.query(raw){ row => fn(row) }
+  }
+}
diff --git a/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaValue.scala b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaValue.scala
new file mode 100644
index 0000000..bf0b375
--- /dev/null
+++ b/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaValue.scala
@@ -0,0 +1,43 @@
+package com.stripe.herringbone.impala
+
+import java.text.SimpleDateFormat
+
+case class ImpalaValue(raw: String, fieldName: String, fieldType: String) {
+  lazy val convertedValue = convertRawValue(raw)
+
+  private def convertRawValue(raw: String): Option[Any] = {
+    if (raw == "NULL") {
+      None
+    } else {
+      val converted = fieldType match {
+        case "string" => raw
+        case "boolean" => convertBoolean(raw)
+        case "tinyint" | "smallint" | "int" | "bigint" => raw.toLong
+        case "double" | "float" | "decimal" => raw.toDouble
+        case "timestamp" => convertTimestamp(raw)
+        case _ => throw ParsingException("Unknown type: " + fieldType)
+      }
+      Some(converted)
+    }
+  }
+
+  private def convertBoolean(raw: String) = {
+    try {
+      raw.toBoolean
+    } catch {
+      case e: java.lang.IllegalArgumentException =>
throw ParsingException("Invalid value for boolean: " + raw) + } + } + + private def convertTimestamp(raw: String) = { + val formatStr = if (raw.indexOf(".") == -1) { + "YYYY-MM-DD HH:MM:SS" + } else { + "YYYY-MM-DD HH:MM:SS.sssssssss" + } + + val dateFormat = new SimpleDateFormat(formatStr) + dateFormat.parse(raw) + } +} diff --git a/herringbone-impala/src/main/thrift/ImpalaService.thrift b/herringbone-impala/src/main/thrift/ImpalaService.thrift new file mode 100644 index 0000000..1246ca4 --- /dev/null +++ b/herringbone-impala/src/main/thrift/ImpalaService.thrift @@ -0,0 +1,177 @@ +// Copyright 2012 Cloudera Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace cpp impala +namespace java com.cloudera.impala.thrift +namespace rb impala.protocol + +include "Status.thrift" +include "beeswax.thrift" +include "cli_service.thrift" + +// ImpalaService accepts query execution options through beeswax.Query.configuration in +// key:value form. For example, the list of strings could be: +// "num_nodes:1", "abort_on_error:false" +// The valid keys are listed in this enum. They map to TQueryOptions. +// Note: If you add an option or change the default, you also need to update: +// - ImpalaInternalService.thrift: TQueryOptions +// - ImpaladClientExecutor.getBeeswaxQueryConfigurations() +// - ImpalaServer::SetQueryOptions() +// - ImpalaServer::TQueryOptionsToMap() +enum TImpalaQueryOptions { + // if true, abort execution on the first error + ABORT_ON_ERROR, + + // maximum # of errors to be reported; Unspecified or 0 indicates backend default + MAX_ERRORS, + + // if true, disable llvm codegen + DISABLE_CODEGEN, + + // batch size to be used by backend; Unspecified or a size of 0 indicates backend + // default + BATCH_SIZE, + + // a per-machine approximate limit on the memory consumption of this query; + // unspecified or a limit of 0 means no limit; + // otherwise specified either as: + // a) an int (= number of bytes); + // b) a float followed by "M" (MB) or "G" (GB) + MEM_LIMIT, + + // specifies the degree of parallelism with which to execute the query; + // 1: single-node execution + // NUM_NODES_ALL: executes on all nodes that contain relevant data + // NUM_NODES_ALL_RACKS: executes on one node per rack that holds relevant data + // > 1: executes on at most that many nodes at any point in time (ie, there can be + // more nodes than numNodes with plan fragments for this query, but at most + // numNodes would be active at any point in time) + // Constants (NUM_NODES_ALL, NUM_NODES_ALL_RACKS) are defined in JavaConstants.thrift. + NUM_NODES, + + // maximum length of the scan range; only applicable to HDFS scan range; Unspecified or + // a length of 0 indicates backend default; + MAX_SCAN_RANGE_LENGTH, + + // Maximum number of io buffers (per disk) + MAX_IO_BUFFERS, + + // Number of scanner threads. 
+ NUM_SCANNER_THREADS, + + // If true, Impala will try to execute on file formats that are not fully supported yet + ALLOW_UNSUPPORTED_FORMATS, + + // if set and > -1, specifies the default limit applied to a top-level SELECT statement + // with an ORDER BY but without a LIMIT clause (ie, if the SELECT statement also has + // a LIMIT clause, this default is ignored) + DEFAULT_ORDER_BY_LIMIT, + + // DEBUG ONLY: + // If set to + // "[:]::", + // the exec node with the given id will perform the specified action in the given + // phase. If the optional backend number (starting from 0) is specified, only that + // backend instance will perform the debug action, otherwise all backends will behave + // in that way. + // If the string doesn't have the required format or if any of its components is + // invalid, the option is ignored. + DEBUG_ACTION, + + // If true, raise an error when the DEFAULT_ORDER_BY_LIMIT has been reached. + ABORT_ON_DEFAULT_LIMIT_EXCEEDED, + + // Compression codec for parquet when inserting into parquet tables. + // Valid values are "snappy", "gzip" and "none" + // Leave blank to use default. + PARQUET_COMPRESSION_CODEC, + + // HBase scan query option. If set and > 0, HBASE_CACHING is the value for + // "hbase.client.Scan.setCaching()" when querying HBase table. Otherwise, use backend + // default. + // If the value is too high, then the hbase region server will have a hard time (GC + // pressure and long response times). If the value is too small, then there will be + // extra trips to the hbase region server. + HBASE_CACHING, + + // HBase scan query option. If set, HBase scan will always set + // "hbase.client.setCacheBlocks" to CACHE_BLOCKS. Default is false. + // If the table is large and the query is doing big scan, set it to false to + // avoid polluting the cache in the hbase region server. + // If the table is small and the table is used several time, set it to true to improve + // performance. + HBASE_CACHE_BLOCKS, +} + +// The summary of an insert. +struct TInsertResult { + // Number of appended rows per modified partition. Only applies to HDFS tables. + // The keys represent partitions to create, coded as k1=v1/k2=v2/k3=v3..., with the + // root in an unpartitioned table being the empty string. + 1: required map rows_appended +} + +// Response from a call to PingImpalaService +struct TPingImpalaServiceResp { + // The Impala service's version string. + 1: string version +} + +// Parameters for a ResetTable request which will invalidate a table's metadata. +// DEPRECATED. +struct TResetTableReq { + // Name of the table's parent database. + 1: required string db_name + + // Name of the table. + 2: required string table_name +} + +// For all rpc that return a TStatus as part of their result type, +// if the status_code field is set to anything other than OK, the contents +// of the remainder of the result type is undefined (typically not set) +service ImpalaService extends beeswax.BeeswaxService { + // Cancel execution of query. Returns RUNTIME_ERROR if query_id + // unknown. + // This terminates all threads running on behalf of this query at + // all nodes that were involved in the execution. + // Throws BeeswaxException if the query handle is invalid (this doesn't + // necessarily indicate an error: the query might have finished). 
+ Status.TStatus Cancel(1:beeswax.QueryHandle query_id) + throws(1:beeswax.BeeswaxException error); + + // Invalidates all catalog metadata, forcing a reload + // DEPRECATED; execute query "invalidate metadata" to refresh metadata + Status.TStatus ResetCatalog(); + + // Invalidates a specific table's catalog metadata, forcing a reload on the next access + // DEPRECATED; execute query "refresh " to refresh metadata + Status.TStatus ResetTable(1:TResetTableReq request) + + // Returns the runtime profile string for the given query handle. + string GetRuntimeProfile(1:beeswax.QueryHandle query_id) + throws(1:beeswax.BeeswaxException error); + + // Closes the query handle and return the result summary of the insert. + TInsertResult CloseInsert(1:beeswax.QueryHandle handle) + throws(1:beeswax.QueryNotFoundException error, 2:beeswax.BeeswaxException error2); + + // Client calls this RPC to verify that the server is an ImpalaService. Returns the + // server version. + TPingImpalaServiceResp PingImpalaService(); +} + +// Impala HiveServer2 service +service ImpalaHiveServer2Service extends cli_service.TCLIService { +} diff --git a/herringbone-impala/src/main/thrift/Status.thrift b/herringbone-impala/src/main/thrift/Status.thrift new file mode 100644 index 0000000..8906d1e --- /dev/null +++ b/herringbone-impala/src/main/thrift/Status.thrift @@ -0,0 +1,32 @@ +// Copyright 2012 Cloudera Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace cpp impala +namespace java com.cloudera.impala.thrift +namespace rb impala.protocol + +enum TStatusCode { + OK, + CANCELLED, + ANALYSIS_ERROR, + NOT_IMPLEMENTED_ERROR, + RUNTIME_ERROR, + MEM_LIMIT_EXCEEDED, + INTERNAL_ERROR +} + +struct TStatus { + 1: required TStatusCode status_code + 2: list error_msgs +} diff --git a/herringbone-impala/src/main/thrift/beeswax.thrift b/herringbone-impala/src/main/thrift/beeswax.thrift new file mode 100644 index 0000000..2707457 --- /dev/null +++ b/herringbone-impala/src/main/thrift/beeswax.thrift @@ -0,0 +1,175 @@ +/* + * Licensed to Cloudera, Inc. under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Cloudera, Inc. licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Interface for interacting with Beeswax Server + */ + +namespace java com.cloudera.beeswax.api +namespace py beeswaxd +namespace cpp beeswax +namespace rb impala.protocol.beeswax + +include "hive_metastore.thrift" + +// A Query +struct Query { + 1: string query; + // A list of HQL commands to execute before the query. + // This is typically defining UDFs, setting settings, and loading resources. + 3: list configuration; + + // User and groups to "act as" for purposes of Hadoop. + 4: string hadoop_user; +} + +typedef string LogContextId + +enum QueryState { + CREATED, + INITIALIZED, + COMPILED, + RUNNING, + FINISHED, + EXCEPTION +} + +struct QueryHandle { + 1: string id; + 2: LogContextId log_context; +} + +struct QueryExplanation { + 1: string textual +} + +struct Results { + // If set, data is valid. Otherwise, results aren't ready yet. + 1: bool ready, + // Columns for the results + 2: list columns, + // A set of results + 3: list data, + // The starting row of the results + 4: i64 start_row, + // Whether there are more results to fetch + 5: bool has_more +} + +/** + * Metadata information about the results. + * Applicable only for SELECT. + */ +struct ResultsMetadata { + /** The schema of the results */ + 1: hive_metastore.Schema schema, + /** The directory containing the results. Not applicable for partition table. */ + 2: string table_dir, + /** If the results are straight from an existing table, the table name. */ + 3: string in_tablename, + /** Field delimiter */ + 4: string delim, +} + +exception BeeswaxException { + 1: string message, + // Use get_log(log_context) to retrieve any log related to this exception + 2: LogContextId log_context, + // (Optional) The QueryHandle that caused this exception + 3: QueryHandle handle, + 4: optional i32 errorCode = 0, + 5: optional string SQLState = " " +} + +exception QueryNotFoundException { +} + +/** Represents a Hadoop-style configuration variable. */ +struct ConfigVariable { + 1: string key, + 2: string value, + 3: string description +} + +service BeeswaxService { + /** + * Submit a query and return a handle (QueryHandle). The query runs asynchronously. + */ + QueryHandle query(1:Query query) throws(1:BeeswaxException error), + + /** + * run a query synchronously and return a handle (QueryHandle). + */ + QueryHandle executeAndWait(1:Query query, 2:LogContextId clientCtx) + throws(1:BeeswaxException error), + + /** + * Get the query plan for a query. + */ + QueryExplanation explain(1:Query query) + throws(1:BeeswaxException error), + + /** + * Get the results of a query. This is non-blocking. Caller should check + * Results.ready to determine if the results are in yet. The call requests + * the batch size of fetch. + */ + Results fetch(1:QueryHandle query_id, 2:bool start_over, 3:i32 fetch_size=-1) + throws(1:QueryNotFoundException error, 2:BeeswaxException error2), + + /** + * Get the state of the query + */ + QueryState get_state(1:QueryHandle handle) throws(1:QueryNotFoundException error), + + /** + * Get the result metadata + */ + ResultsMetadata get_results_metadata(1:QueryHandle handle) + throws(1:QueryNotFoundException error), + + /** + * Used to test connection to server. A "noop" command. + */ + string echo(1:string s) + + /** + * Returns a string representation of the configuration object being used. + * Handy for debugging. + */ + string dump_config() + + /** + * Get the log messages related to the given context. 
+ */ + string get_log(1:LogContextId context) throws(1:QueryNotFoundException error) + + /* + * Returns "default" configuration. + */ + list get_default_configuration(1:bool include_hadoop) + + /* + * closes the query with given handle + */ + void close(1:QueryHandle handle) throws(1:QueryNotFoundException error, + 2:BeeswaxException error2) + + /* + * clean the log context for given id + */ + void clean(1:LogContextId log_context) +} diff --git a/herringbone-impala/src/main/thrift/cli_service.thrift b/herringbone-impala/src/main/thrift/cli_service.thrift new file mode 100644 index 0000000..24a3558 --- /dev/null +++ b/herringbone-impala/src/main/thrift/cli_service.thrift @@ -0,0 +1,1015 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Coding Conventions for this file: +// +// Structs/Enums/Unions +// * Struct, Enum, and Union names begin with a "T", +// and use a capital letter for each new word, with no underscores. +// * All fields should be declared as either optional or required. +// +// Functions +// * Function names start with a capital letter and have a capital letter for +// each new word, with no underscores. +// * Each function should take exactly one parameter, named TFunctionNameReq, +// and should return either void or TFunctionNameResp. This convention allows +// incremental updates. +// +// Services +// * Service names begin with the letter "T", use a capital letter for each +// new word (with no underscores), and end with the word "Service". + +namespace java org.apache.hive.service.cli.thrift +namespace cpp apache.hive.service.cli.thrift +namespace rb impala.protocol.hive + +// List of protocol versions. A new token should be +// added to the end of this list every time a change is made. 
+enum TProtocolVersion { + HIVE_CLI_SERVICE_PROTOCOL_V1 +} + +enum TTypeId { + BOOLEAN_TYPE, + TINYINT_TYPE, + SMALLINT_TYPE, + INT_TYPE, + BIGINT_TYPE, + FLOAT_TYPE, + DOUBLE_TYPE, + STRING_TYPE, + TIMESTAMP_TYPE, + BINARY_TYPE, + ARRAY_TYPE, + MAP_TYPE, + STRUCT_TYPE, + UNION_TYPE, + USER_DEFINED_TYPE, + DECIMAL_TYPE +} + +const set PRIMITIVE_TYPES = [ + TTypeId.BOOLEAN_TYPE + TTypeId.TINYINT_TYPE + TTypeId.SMALLINT_TYPE + TTypeId.INT_TYPE + TTypeId.BIGINT_TYPE + TTypeId.FLOAT_TYPE + TTypeId.DOUBLE_TYPE + TTypeId.STRING_TYPE + TTypeId.TIMESTAMP_TYPE + TTypeId.BINARY_TYPE, + TTypeId.DECIMAL_TYPE +] + +const set COMPLEX_TYPES = [ + TTypeId.ARRAY_TYPE + TTypeId.MAP_TYPE + TTypeId.STRUCT_TYPE + TTypeId.UNION_TYPE + TTypeId.USER_DEFINED_TYPE +] + +const set COLLECTION_TYPES = [ + TTypeId.ARRAY_TYPE + TTypeId.MAP_TYPE +] + +const map TYPE_NAMES = { + TTypeId.BOOLEAN_TYPE: "BOOLEAN", + TTypeId.TINYINT_TYPE: "TINYINT", + TTypeId.SMALLINT_TYPE: "SMALLINT", + TTypeId.INT_TYPE: "INT", + TTypeId.BIGINT_TYPE: "BIGINT", + TTypeId.FLOAT_TYPE: "FLOAT", + TTypeId.DOUBLE_TYPE: "DOUBLE", + TTypeId.STRING_TYPE: "STRING", + TTypeId.TIMESTAMP_TYPE: "TIMESTAMP", + TTypeId.BINARY_TYPE: "BINARY", + TTypeId.ARRAY_TYPE: "ARRAY", + TTypeId.MAP_TYPE: "MAP", + TTypeId.STRUCT_TYPE: "STRUCT", + TTypeId.UNION_TYPE: "UNIONTYPE" + TTypeId.DECIMAL_TYPE: "DECIMAL" +} + +// Thrift does not support recursively defined types or forward declarations, +// which makes it difficult to represent Hive's nested types. +// To get around these limitations TTypeDesc employs a type list that maps +// integer "pointers" to TTypeEntry objects. The following examples show +// how different types are represented using this scheme: +// +// "INT": +// TTypeDesc { +// types = [ +// TTypeEntry.primitive_entry { +// type = INT_TYPE +// } +// ] +// } +// +// "ARRAY": +// TTypeDesc { +// types = [ +// TTypeEntry.array_entry { +// object_type_ptr = 1 +// }, +// TTypeEntry.primitive_entry { +// type = INT_TYPE +// } +// ] +// } +// +// "MAP": +// TTypeDesc { +// types = [ +// TTypeEntry.map_entry { +// key_type_ptr = 1 +// value_type_ptr = 2 +// }, +// TTypeEntry.primitive_entry { +// type = INT_TYPE +// }, +// TTypeEntry.primitive_entry { +// type = STRING_TYPE +// } +// ] +// } + +typedef i32 TTypeEntryPtr + +// Type entry for a primitive type. +struct TPrimitiveTypeEntry { + // The primitive type token. This must satisfy the condition + // that type is in the PRIMITIVE_TYPES set. + 1: required TTypeId type +} + +// Type entry for an ARRAY type. +struct TArrayTypeEntry { + 1: required TTypeEntryPtr objectTypePtr +} + +// Type entry for a MAP type. +struct TMapTypeEntry { + 1: required TTypeEntryPtr keyTypePtr + 2: required TTypeEntryPtr valueTypePtr +} + +// Type entry for a STRUCT type. +struct TStructTypeEntry { + 1: required map nameToTypePtr +} + +// Type entry for a UNIONTYPE type. +struct TUnionTypeEntry { + 1: required map nameToTypePtr +} + +struct TUserDefinedTypeEntry { + // The fully qualified name of the class implementing this type. + 1: required string typeClassName +} + +// We use a union here since Thrift does not support inheritance. +union TTypeEntry { + 1: TPrimitiveTypeEntry primitiveEntry + 2: TArrayTypeEntry arrayEntry + 3: TMapTypeEntry mapEntry + 4: TStructTypeEntry structEntry + 5: TUnionTypeEntry unionEntry + 6: TUserDefinedTypeEntry userDefinedTypeEntry +} + +// Type descriptor for columns. +struct TTypeDesc { + // The "top" type is always the first element of the list. 
+ // If the top type is an ARRAY, MAP, STRUCT, or UNIONTYPE + // type, then subsequent elements represent nested types. + 1: required list types +} + +// A result set column descriptor. +struct TColumnDesc { + // The name of the column + 1: required string columnName + + // The type descriptor for this column + 2: required TTypeDesc typeDesc + + // The ordinal position of this column in the schema + 3: required i32 position + + 4: optional string comment +} + +// Metadata used to describe the schema (column names, types, comments) +// of result sets. +struct TTableSchema { + 1: required list columns +} + +// A Boolean column value. +struct TBoolValue { + // NULL if value is unset. + 1: optional bool value +} + +// A Byte column value. +struct TByteValue { + // NULL if value is unset. + 1: optional byte value +} + +// A signed, 16 bit column value. +struct TI16Value { + // NULL if value is unset + 1: optional i16 value +} + +// A signed, 32 bit column value +struct TI32Value { + // NULL if value is unset + 1: optional i32 value +} + +// A signed 64 bit column value +struct TI64Value { + // NULL if value is unset + 1: optional i64 value +} + +// A floating point 64 bit column value +struct TDoubleValue { + // NULL if value is unset + 1: optional double value +} + +struct TStringValue { + // NULL if value is unset + 1: optional string value +} + +union TColumn { + 1: list boolColumn + 2: list byteColumn + 3: list i16Column + 4: list i32Column + 5: list i64Column + 6: list doubleColumn + 7: list stringColumn +} + +// A single column value in a result set. +// Note that Hive's type system is richer than Thrift's, +// so in some cases we have to map multiple Hive types +// to the same Thrift type. On the client-side this is +// disambiguated by looking at the Schema of the +// result set. +union TColumnValue { + 1: TBoolValue boolVal // BOOLEAN + 2: TByteValue byteVal // TINYINT + 3: TI16Value i16Val // SMALLINT + 4: TI32Value i32Val // INT + 5: TI64Value i64Val // BIGINT, TIMESTAMP + 6: TDoubleValue doubleVal // FLOAT, DOUBLE + 7: TStringValue stringVal // STRING, LIST, MAP, STRUCT, UNIONTYPE, BINARY, DECIMAL +} + +// Represents a row in a rowset. +struct TRow { + 1: required list colVals +} + +// Represents a rowset +struct TRowSet { + // The starting row offset of this rowset. + 1: required i64 startRowOffset + 2: required list rows + 3: optional list columns +} + +// The return status code contained in each response. +enum TStatusCode { + SUCCESS_STATUS, + SUCCESS_WITH_INFO_STATUS, + STILL_EXECUTING_STATUS, + ERROR_STATUS, + INVALID_HANDLE_STATUS +} + +// The return status of a remote request +struct TStatus { + 1: required TStatusCode statusCode + + // If status is SUCCESS_WITH_INFO, info_msgs may be populated with + // additional diagnostic information. + 2: optional list infoMessages + + // If status is ERROR, then the following fields may be set + 3: optional string sqlState // as defined in the ISO/IEF CLI specification + 4: optional i32 errorCode // internal error code + 5: optional string errorMessage +} + +// The state of an operation (i.e. a query or other +// asynchronous operation that generates a result set) +// on the server. +enum TOperationState { + // The operation has been initialized + INITIALIZED_STATE, + + // The operation is running. In this state the result + // set is not available. + RUNNING_STATE, + + // The operation has completed. When an operation is in + // this state its result set may be fetched. 
+ FINISHED_STATE, + + // The operation was canceled by a client + CANCELED_STATE, + + // The operation was closed by a client + CLOSED_STATE, + + // The operation failed due to an error + ERROR_STATE, + + // The operation is in an unrecognized state + UKNOWN_STATE, +} + + +// A string identifier. This is interpreted literally. +typedef string TIdentifier + +// A search pattern. +// +// Valid search pattern characters: +// '_': Any single character. +// '%': Any sequence of zero or more characters. +// '\': Escape character used to include special characters, +// e.g. '_', '%', '\'. If a '\' precedes a non-special +// character it has no special meaning and is interpreted +// literally. +typedef string TPattern + + +// A search pattern or identifier. Used as input +// parameter for many of the catalog functions. +typedef string TPatternOrIdentifier + +struct THandleIdentifier { + // 16 byte globally unique identifier + // This is the public ID of the handle and + // can be used for reporting. + 1: required binary guid, + + // 16 byte secret generated by the server + // and used to verify that the handle is not + // being hijacked by another user. + 2: required binary secret, +} + +// Client-side handle to persistent +// session information on the server-side. +struct TSessionHandle { + 1: required THandleIdentifier sessionId +} + +// The subtype of an OperationHandle. +enum TOperationType { + EXECUTE_STATEMENT, + GET_TYPE_INFO, + GET_CATALOGS, + GET_SCHEMAS, + GET_TABLES, + GET_TABLE_TYPES, + GET_COLUMNS, + GET_FUNCTIONS, + UNKNOWN, +} + +// Client-side reference to a task running +// asynchronously on the server. +struct TOperationHandle { + 1: required THandleIdentifier operationId + 2: required TOperationType operationType + + // If hasResultSet = TRUE, then this operation + // generates a result set that can be fetched. + // Note that the result set may be empty. + // + // If hasResultSet = FALSE, then this operation + // does not generate a result set, and calling + // GetResultSetMetadata or FetchResults against + // this OperationHandle will generate an error. + 3: required bool hasResultSet + + // For operations that don't generate result sets, + // modifiedRowCount is either: + // + // 1) The number of rows that were modified by + // the DML operation (e.g. number of rows inserted, + // number of rows deleted, etc). + // + // 2) 0 for operations that don't modify or add rows. + // + // 3) < 0 if the operation is capable of modifiying rows, + // but Hive is unable to determine how many rows were + // modified. For example, Hive's LOAD DATA command + // doesn't generate row count information because + // Hive doesn't inspect the data as it is loaded. + // + // modifiedRowCount is unset if the operation generates + // a result set. + 4: optional double modifiedRowCount +} + + +// OpenSession() +// +// Open a session (connection) on the server against +// which operations may be executed. +struct TOpenSessionReq { + // The version of the HiveServer2 protocol that the client is using. + 1: required TProtocolVersion client_protocol = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1 + + // Username and password for authentication. + // Depending on the authentication scheme being used, + // this information may instead be provided by a lower + // protocol layer, in which case these fields may be + // left unset. + 2: optional string username + 3: optional string password + + // Configuration overlay which is applied when the session is + // first created. 
+ 4: optional map configuration +} + +struct TOpenSessionResp { + 1: required TStatus status + + // The protocol version that the server is using. + 2: required TProtocolVersion serverProtocolVersion = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1 + + // Session Handle + 3: optional TSessionHandle sessionHandle + + // The configuration settings for this session. + 4: optional map configuration +} + + +// CloseSession() +// +// Closes the specified session and frees any resources +// currently allocated to that session. Any open +// operations in that session will be canceled. +struct TCloseSessionReq { + 1: required TSessionHandle sessionHandle +} + +struct TCloseSessionResp { + 1: required TStatus status +} + + + +enum TGetInfoType { + CLI_MAX_DRIVER_CONNECTIONS = 0, + CLI_MAX_CONCURRENT_ACTIVITIES = 1, + CLI_DATA_SOURCE_NAME = 2, + CLI_FETCH_DIRECTION = 8, + CLI_SERVER_NAME = 13, + CLI_SEARCH_PATTERN_ESCAPE = 14, + CLI_DBMS_NAME = 17, + CLI_DBMS_VER = 18, + CLI_ACCESSIBLE_TABLES = 19, + CLI_ACCESSIBLE_PROCEDURES = 20, + CLI_CURSOR_COMMIT_BEHAVIOR = 23, + CLI_DATA_SOURCE_READ_ONLY = 25, + CLI_DEFAULT_TXN_ISOLATION = 26, + CLI_IDENTIFIER_CASE = 28, + CLI_IDENTIFIER_QUOTE_CHAR = 29, + CLI_MAX_COLUMN_NAME_LEN = 30, + CLI_MAX_CURSOR_NAME_LEN = 31, + CLI_MAX_SCHEMA_NAME_LEN = 32, + CLI_MAX_CATALOG_NAME_LEN = 34, + CLI_MAX_TABLE_NAME_LEN = 35, + CLI_SCROLL_CONCURRENCY = 43, + CLI_TXN_CAPABLE = 46, + CLI_USER_NAME = 47, + CLI_TXN_ISOLATION_OPTION = 72, + CLI_INTEGRITY = 73, + CLI_GETDATA_EXTENSIONS = 81, + CLI_NULL_COLLATION = 85, + CLI_ALTER_TABLE = 86, + CLI_ORDER_BY_COLUMNS_IN_SELECT = 90, + CLI_SPECIAL_CHARACTERS = 94, + CLI_MAX_COLUMNS_IN_GROUP_BY = 97, + CLI_MAX_COLUMNS_IN_INDEX = 98, + CLI_MAX_COLUMNS_IN_ORDER_BY = 99, + CLI_MAX_COLUMNS_IN_SELECT = 100, + CLI_MAX_COLUMNS_IN_TABLE = 101, + CLI_MAX_INDEX_SIZE = 102, + CLI_MAX_ROW_SIZE = 104, + CLI_MAX_STATEMENT_LEN = 105, + CLI_MAX_TABLES_IN_SELECT = 106, + CLI_MAX_USER_NAME_LEN = 107, + CLI_OJ_CAPABILITIES = 115, + + CLI_XOPEN_CLI_YEAR = 10000, + CLI_CURSOR_SENSITIVITY = 10001, + CLI_DESCRIBE_PARAMETER = 10002, + CLI_CATALOG_NAME = 10003, + CLI_COLLATION_SEQ = 10004, + CLI_MAX_IDENTIFIER_LEN = 10005, +} + +union TGetInfoValue { + 1: string stringValue + 2: i16 smallIntValue + 3: i32 integerBitmask + 4: i32 integerFlag + 5: i32 binaryValue + 6: i64 lenValue +} + +// GetInfo() +// +// This function is based on ODBC's CLIGetInfo() function. +// The function returns general information about the data source +// using the same keys as ODBC. +struct TGetInfoReq { + // The sesssion to run this request against + 1: required TSessionHandle sessionHandle + + 2: required TGetInfoType infoType +} + +struct TGetInfoResp { + 1: required TStatus status + + 2: required TGetInfoValue infoValue +} + + +// ExecuteStatement() +// +// Execute a statement. +// The returned OperationHandle can be used to check on the +// status of the statement, and to fetch results once the +// statement has finished executing. +struct TExecuteStatementReq { + // The session to exexcute the statement against + 1: required TSessionHandle sessionHandle + + // The statement to be executed (DML, DDL, SET, etc) + 2: required string statement + + // Configuration properties that are overlayed on top of the + // the existing session configuration before this statement + // is executed. These properties apply to this statement + // only and will not affect the subsequent state of the Session. 
+ 3: optional map confOverlay +} + +struct TExecuteStatementResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetTypeInfo() +// +// Get information about types supported by the HiveServer instance. +// The information is returned as a result set which can be fetched +// using the OperationHandle provided in the response. +// +// Refer to the documentation for ODBC's CLIGetTypeInfo function for +// the format of the result set. +struct TGetTypeInfoReq { + // The session to run this request against. + 1: required TSessionHandle sessionHandle +} + +struct TGetTypeInfoResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetCatalogs() +// +// Returns the list of catalogs (databases) +// Results are ordered by TABLE_CATALOG +// +// Resultset columns : +// col1 +// name: TABLE_CAT +// type: STRING +// desc: Catalog name. NULL if not applicable. +// +struct TGetCatalogsReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle +} + +struct TGetCatalogsResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetSchemas() +// +// Retrieves the schema names available in this database. +// The results are ordered by TABLE_CATALOG and TABLE_SCHEM. +// col1 +// name: TABLE_SCHEM +// type: STRING +// desc: schema name +// col2 +// name: TABLE_CATALOG +// type: STRING +// desc: catalog name +struct TGetSchemasReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle + + // Name of the catalog. Must not contain a search pattern. + 2: optional TIdentifier catalogName + + // schema name or pattern + 3: optional TPatternOrIdentifier schemaName +} + +struct TGetSchemasResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetTables() +// +// Returns a list of tables with catalog, schema, and table +// type information. The information is returned as a result +// set which can be fetched using the OperationHandle +// provided in the response. +// Results are ordered by TABLE_TYPE, TABLE_CAT, TABLE_SCHEM, and TABLE_NAME +// +// Result Set Columns: +// +// col1 +// name: TABLE_CAT +// type: STRING +// desc: Catalog name. NULL if not applicable. +// +// col2 +// name: TABLE_SCHEM +// type: STRING +// desc: Schema name. +// +// col3 +// name: TABLE_NAME +// type: STRING +// desc: Table name. +// +// col4 +// name: TABLE_TYPE +// type: STRING +// desc: The table type, e.g. "TABLE", "VIEW", etc. +// +// col5 +// name: REMARKS +// type: STRING +// desc: Comments about the table +// +struct TGetTablesReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle + + // Name of the catalog or a search pattern. + 2: optional TPatternOrIdentifier catalogName + + // Name of the schema or a search pattern. + 3: optional TPatternOrIdentifier schemaName + + // Name of the table or a search pattern. + 4: optional TPatternOrIdentifier tableName + + // List of table types to match + // e.g. "TABLE", "VIEW", "SYSTEM TABLE", "GLOBAL TEMPORARY", + // "LOCAL TEMPORARY", "ALIAS", "SYNONYM", etc. + 5: optional list tableTypes +} + +struct TGetTablesResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetTableTypes() +// +// Returns the table types available in this database. +// The results are ordered by table type. +// +// col1 +// name: TABLE_TYPE +// type: STRING +// desc: Table type name. 
+struct TGetTableTypesReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle +} + +struct TGetTableTypesResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetColumns() +// +// Returns a list of columns in the specified tables. +// The information is returned as a result set which can be fetched +// using the OperationHandle provided in the response. +// Results are ordered by TABLE_CAT, TABLE_SCHEM, TABLE_NAME, +// and ORDINAL_POSITION. +// +// Result Set Columns are the same as those for the ODBC CLIColumns +// function. +// +struct TGetColumnsReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle + + // Name of the catalog. Must not contain a search pattern. + 2: optional TIdentifier catalogName + + // Schema name or search pattern + 3: optional TPatternOrIdentifier schemaName + + // Table name or search pattern + 4: optional TPatternOrIdentifier tableName + + // Column name or search pattern + 5: optional TPatternOrIdentifier columnName +} + +struct TGetColumnsResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetFunctions() +// +// Returns a list of functions supported by the data source. The +// behavior of this function matches +// java.sql.DatabaseMetaData.getFunctions() both in terms of +// inputs and outputs. +// +// Result Set Columns: +// +// col1 +// name: FUNCTION_CAT +// type: STRING +// desc: Function catalog (may be null) +// +// col2 +// name: FUNCTION_SCHEM +// type: STRING +// desc: Function schema (may be null) +// +// col3 +// name: FUNCTION_NAME +// type: STRING +// desc: Function name. This is the name used to invoke the function. +// +// col4 +// name: REMARKS +// type: STRING +// desc: Explanatory comment on the function. +// +// col5 +// name: FUNCTION_TYPE +// type: SMALLINT +// desc: Kind of function. One of: +// * functionResultUnknown - Cannot determine if a return value or a table +// will be returned. +// * functionNoTable - Does not a return a table. +// * functionReturnsTable - Returns a table. +// +// col6 +// name: SPECIFIC_NAME +// type: STRING +// desc: The name which uniquely identifies this function within its schema. +// In this case this is the fully qualified class name of the class +// that implements this function. +// +struct TGetFunctionsReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle + + // A catalog name; must match the catalog name as it is stored in the + // database; "" retrieves those without a catalog; null means + // that the catalog name should not be used to narrow the search. + 2: optional TIdentifier catalogName + + // A schema name pattern; must match the schema name as it is stored + // in the database; "" retrieves those without a schema; null means + // that the schema name should not be used to narrow the search. + 3: optional TPatternOrIdentifier schemaName + + // A function name pattern; must match the function name as it is stored + // in the database. + 4: required TPatternOrIdentifier functionName +} + +struct TGetFunctionsResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetOperationStatus() +// +// Get the status of an operation running on the server. 
+struct TGetOperationStatusReq { + // Session to run this request against + 1: required TOperationHandle operationHandle +} + +struct TGetOperationStatusResp { + 1: required TStatus status + 2: optional TOperationState operationState +} + + +// CancelOperation() +// +// Cancels processing on the specified operation handle and +// frees any resources which were allocated. +struct TCancelOperationReq { + // Operation to cancel + 1: required TOperationHandle operationHandle +} + +struct TCancelOperationResp { + 1: required TStatus status +} + + +// CloseOperation() +// +// Given an operation in the FINISHED, CANCELED, +// or ERROR states, CloseOperation() will free +// all of the resources which were allocated on +// the server to service the operation. +struct TCloseOperationReq { + 1: required TOperationHandle operationHandle +} + +struct TCloseOperationResp { + 1: required TStatus status +} + + +// GetResultSetMetadata() +// +// Retrieves schema information for the specified operation +struct TGetResultSetMetadataReq { + // Operation for which to fetch result set schema information + 1: required TOperationHandle operationHandle +} + +struct TGetResultSetMetadataResp { + 1: required TStatus status + 2: optional TTableSchema schema +} + + +enum TFetchOrientation { + // Get the next rowset. The fetch offset is ignored. + FETCH_NEXT, + + // Get the previous rowset. The fetch offset is ignored. + // NOT SUPPORTED + FETCH_PRIOR, + + // Return the rowset at the given fetch offset relative + // to the curren rowset. + // NOT SUPPORTED + FETCH_RELATIVE, + + // Return the rowset at the specified fetch offset. + // NOT SUPPORTED + FETCH_ABSOLUTE, + + // Get the first rowset in the result set. + FETCH_FIRST, + + // Get the last rowset in the result set. + // NOT SUPPORTED + FETCH_LAST +} + +// FetchResults() +// +// Fetch rows from the server corresponding to +// a particular OperationHandle. +struct TFetchResultsReq { + // Operation from which to fetch results. + 1: required TOperationHandle operationHandle + + // The fetch orientation. For V1 this must be either + // FETCH_NEXT or FETCH_FIRST. Defaults to FETCH_NEXT. + 2: required TFetchOrientation orientation = TFetchOrientation.FETCH_NEXT + + // Max number of rows that should be returned in + // the rowset. + 3: required i64 maxRows +} + +struct TFetchResultsResp { + 1: required TStatus status + + // TRUE if there are more rows left to fetch from the server. + 2: optional bool hasMoreRows + + // The rowset. This is optional so that we have the + // option in the future of adding alternate formats for + // representing result set data, e.g. delimited strings, + // binary encoded, etc. + 3: optional TRowSet results +} + +// GetLog() +// +// Fetch operation log from the server corresponding to +// a particular OperationHandle. 
+struct TGetLogReq { + // Operation whose log is requested + 1: required TOperationHandle operationHandle +} + +struct TGetLogResp { + 1: required TStatus status + + 2: required string log +} + +service TCLIService { + + TOpenSessionResp OpenSession(1:TOpenSessionReq req); + + TCloseSessionResp CloseSession(1:TCloseSessionReq req); + + TGetInfoResp GetInfo(1:TGetInfoReq req); + + TExecuteStatementResp ExecuteStatement(1:TExecuteStatementReq req); + + TGetTypeInfoResp GetTypeInfo(1:TGetTypeInfoReq req); + + TGetCatalogsResp GetCatalogs(1:TGetCatalogsReq req); + + TGetSchemasResp GetSchemas(1:TGetSchemasReq req); + + TGetTablesResp GetTables(1:TGetTablesReq req); + + TGetTableTypesResp GetTableTypes(1:TGetTableTypesReq req); + + TGetColumnsResp GetColumns(1:TGetColumnsReq req); + + TGetFunctionsResp GetFunctions(1:TGetFunctionsReq req); + + TGetOperationStatusResp GetOperationStatus(1:TGetOperationStatusReq req); + + TCancelOperationResp CancelOperation(1:TCancelOperationReq req); + + TCloseOperationResp CloseOperation(1:TCloseOperationReq req); + + TGetResultSetMetadataResp GetResultSetMetadata(1:TGetResultSetMetadataReq req); + + TFetchResultsResp FetchResults(1:TFetchResultsReq req); + + TGetLogResp GetLog(1:TGetLogReq req); +} diff --git a/herringbone-impala/src/main/thrift/fb303.thrift b/herringbone-impala/src/main/thrift/fb303.thrift new file mode 100644 index 0000000..6438092 --- /dev/null +++ b/herringbone-impala/src/main/thrift/fb303.thrift @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * fb303.thrift + */ + +namespace java com.facebook.fb303 +namespace cpp facebook.fb303 +namespace rb Impala.Protocol.fb303 + +/** + * Common status reporting mechanism across all services + */ +enum fb_status { + DEAD = 0, + STARTING = 1, + ALIVE = 2, + STOPPING = 3, + STOPPED = 4, + WARNING = 5, +} + +/** + * Standard base service + */ +service FacebookService { + + /** + * Returns a descriptive name of the service + */ + string getName(), + + /** + * Returns the version of the service + */ + string getVersion(), + + /** + * Gets the status of this service + */ + fb_status getStatus(), + + /** + * User friendly description of status, such as why the service is in + * the dead or warning state, or what is being started or stopped. 
+ */ + string getStatusDetails(), + + /** + * Gets the counters for this service + */ + map getCounters(), + + /** + * Gets the value of a single counter + */ + i64 getCounter(1: string key), + + /** + * Sets an option + */ + void setOption(1: string key, 2: string value), + + /** + * Gets an option + */ + string getOption(1: string key), + + /** + * Gets all options + */ + map getOptions(), + + /** + * Returns a CPU profile over the given time interval (client and server + * must agree on the profile format). + */ + string getCpuProfile(1: i32 profileDurationInSec), + + /** + * Returns the unix time that the server has been running since + */ + i64 aliveSince(), + + /** + * Tell the server to reload its configuration, reopen log files, etc + */ + oneway void reinitialize(), + + /** + * Suggest a shutdown to the server + */ + oneway void shutdown(), + +} diff --git a/herringbone-impala/src/main/thrift/hive_metastore.thrift b/herringbone-impala/src/main/thrift/hive_metastore.thrift new file mode 100644 index 0000000..5e05367 --- /dev/null +++ b/herringbone-impala/src/main/thrift/hive_metastore.thrift @@ -0,0 +1,528 @@ +#!/usr/local/bin/thrift -java + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# +# Thrift Service that the MetaStore is built on +# + +include "fb303.thrift" + +namespace java org.apache.hadoop.hive.metastore.api +namespace php metastore +namespace cpp Apache.Hadoop.Hive +namespace rb Impala.Protocol.HiveMetastore + +const string DDL_TIME = "transient_lastDdlTime" + +struct Version { + 1: string version, + 2: string comments +} + +struct FieldSchema { + 1: string name, // name of the field + 2: string type, // type of the field. 
primitive types defined above, specify list, map for lists & maps + 3: string comment +} + +struct Type { + 1: string name, // one of the types in PrimitiveTypes or CollectionTypes or User defined types + 2: optional string type1, // object type if the name is 'list' (LIST_TYPE), key type if the name is 'map' (MAP_TYPE) + 3: optional string type2, // val type if the name is 'map' (MAP_TYPE) + //4: optional list fields // if the name is one of the user defined types +} + +enum HiveObjectType { + GLOBAL = 1, + DATABASE = 2, + TABLE = 3, + PARTITION = 4, + COLUMN = 5, +} + +enum PrincipalType { + USER = 1, + ROLE = 2, + GROUP = 3, +} + +const string HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__" +const string HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__" +const string HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__" + +enum PartitionEventType { + LOAD_DONE = 1, +} + +struct HiveObjectRef{ + 1: HiveObjectType objectType, + 2: string dbName, + 3: string objectName, + 4: list partValues, + 5: string columnName, +} + +struct PrivilegeGrantInfo { + 1: string privilege, + 2: i32 createTime, + 3: string grantor, + 4: PrincipalType grantorType, + 5: bool grantOption, +} + +struct HiveObjectPrivilege { + 1: HiveObjectRef hiveObject, + 2: string principalName, + 3: PrincipalType principalType, + 4: PrivilegeGrantInfo grantInfo, +} + +struct PrivilegeBag { + 1: list privileges, +} + +struct PrincipalPrivilegeSet { + 1: map> userPrivileges, // user name -> privilege grant info + 2: map> groupPrivileges, // group name -> privilege grant info + 3: map> rolePrivileges, //role name -> privilege grant info +} + +struct Role { + 1: string roleName, + 2: i32 createTime, + 3: string ownerName, +} + +// namespace for tables +struct Database { + 1: string name, + 2: string description, + 3: string locationUri, + 4: map parameters, // properties associated with the database + 5: optional PrincipalPrivilegeSet privileges +} + +// This object holds the information needed by SerDes +struct SerDeInfo { + 1: string name, // name of the serde, table name by default + 2: string serializationLib, // usually the class that implements the extractor & loader + 3: map parameters // initialization parameters +} + +// sort order of a column (column name along with asc(1)/desc(0)) +struct Order { + 1: string col, // sort column name + 2: i32 order // asc(1) or desc(0) +} + +// this object holds all the information about physical storage of the data belonging to a table +struct StorageDescriptor { + 1: list cols, // required (refer to types defined above) + 2: string location, // defaults to //tablename + 3: string inputFormat, // SequenceFileInputFormat (binary) or TextInputFormat` or custom format + 4: string outputFormat, // SequenceFileOutputFormat (binary) or IgnoreKeyTextOutputFormat or custom format + 5: bool compressed, // compressed or not + 6: i32 numBuckets, // this must be specified if there are any dimension columns + 7: SerDeInfo serdeInfo, // serialization and deserialization information + 8: list bucketCols, // reducer grouping columns and clustering columns and bucketing columns` + 9: list sortCols, // sort order of the data in each bucket + 10: map parameters // any user supplied key value hash +} + +// table information +struct Table { + 1: string tableName, // name of the table + 2: string dbName, // database name ('default') + 3: string owner, // owner of this table + 4: i32 createTime, // creation time of the table + 5: i32 lastAccessTime, // last access time (usually this 
will be filled from HDFS and shouldn't be relied on) + 6: i32 retention, // retention time + 7: StorageDescriptor sd, // storage descriptor of the table + 8: list partitionKeys, // partition keys of the table. only primitive types are supported + 9: map parameters, // to store comments or any other user level parameters + 10: string viewOriginalText, // original view text, null for non-view + 11: string viewExpandedText, // expanded view text, null for non-view + 12: string tableType, // table type enum, e.g. EXTERNAL_TABLE + 13: optional PrincipalPrivilegeSet privileges, +} + +struct Partition { + 1: list values // string value is converted to appropriate partition key type + 2: string dbName, + 3: string tableName, + 4: i32 createTime, + 5: i32 lastAccessTime, + 6: StorageDescriptor sd, + 7: map parameters, + 8: optional PrincipalPrivilegeSet privileges +} + +struct Index { + 1: string indexName, // unique with in the whole database namespace + 2: string indexHandlerClass, // reserved + 3: string dbName, + 4: string origTableName, + 5: i32 createTime, + 6: i32 lastAccessTime, + 7: string indexTableName, + 8: StorageDescriptor sd, + 9: map parameters, + 10: bool deferredRebuild +} + +// schema of the table/query results etc. +struct Schema { + // column names, types, comments + 1: list fieldSchemas, // delimiters etc + 2: map properties +} + +// Key-value store to be used with selected +// Metastore APIs (create, alter methods). +// The client can pass environment properties / configs that can be +// accessed in hooks. +struct EnvironmentContext { + 1: map properties +} + +exception MetaException { + 1: string message +} + +exception UnknownTableException { + 1: string message +} + +exception UnknownDBException { + 1: string message +} + +exception AlreadyExistsException { + 1: string message +} + +exception InvalidPartitionException { + 1: string message +} + +exception UnknownPartitionException { + 1: string message +} + +exception InvalidObjectException { + 1: string message +} + +exception NoSuchObjectException { + 1: string message +} + +exception IndexAlreadyExistsException { + 1: string message +} + +exception InvalidOperationException { + 1: string message +} + +exception ConfigValSecurityException { + 1: string message +} + +/** +* This interface is live. 
+*/ +service ThriftHiveMetastore extends fb303.FacebookService +{ + void create_database(1:Database database) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3) + Database get_database(1:string name) throws(1:NoSuchObjectException o1, 2:MetaException o2) + void drop_database(1:string name, 2:bool deleteData, 3:bool cascade) throws(1:NoSuchObjectException o1, 2:InvalidOperationException o2, 3:MetaException o3) + list get_databases(1:string pattern) throws(1:MetaException o1) + list get_all_databases() throws(1:MetaException o1) + void alter_database(1:string dbname, 2:Database db) throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // returns the type with given name (make seperate calls for the dependent types if needed) + Type get_type(1:string name) throws(1:MetaException o1, 2:NoSuchObjectException o2) + bool create_type(1:Type type) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3) + bool drop_type(1:string type) throws(1:MetaException o1, 2:NoSuchObjectException o2) + map get_type_all(1:string name) + throws(1:MetaException o2) + + // Gets a list of FieldSchemas describing the columns of a particular table + list get_fields(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3), + + // Gets a list of FieldSchemas describing both the columns and the partition keys of a particular table + list get_schema(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3) + + // create a Hive table. Following fields must be set + // tableName + // database (only 'default' for now until Hive QL supports databases) + // owner (not needed, but good to have for tracking purposes) + // sd.cols (list of field schemas) + // sd.inputFormat (SequenceFileInputFormat (binary like falcon tables or u_full) or TextInputFormat) + // sd.outputFormat (SequenceFileInputFormat (binary) or TextInputFormat) + // sd.serdeInfo.serializationLib (SerDe class name eg org.apache.hadoop.hive.serde.simple_meta.MetadataTypedColumnsetSerDe + // * See notes on DDL_TIME + void create_table(1:Table tbl) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3, 4:NoSuchObjectException o4) + void create_table_with_environment_context(1:Table tbl, + 2:EnvironmentContext environment_context) + throws (1:AlreadyExistsException o1, + 2:InvalidObjectException o2, 3:MetaException o3, + 4:NoSuchObjectException o4) + // drops the table and all the partitions associated with it if the table has partitions + // delete data (including partitions) if deleteData is set to true + void drop_table(1:string dbname, 2:string name, 3:bool deleteData) + throws(1:NoSuchObjectException o1, 2:MetaException o3) + list get_tables(1: string db_name, 2: string pattern) throws (1: MetaException o1) + list get_all_tables(1: string db_name) throws (1: MetaException o1) + + Table get_table(1:string dbname, 2:string tbl_name) + throws (1:MetaException o1, 2:NoSuchObjectException o2) + list
get_table_objects_by_name(1:string dbname, 2:list tbl_names) + throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3) + + // Get a list of table names that match a filter. + // The filter operators are LIKE, <, <=, >, >=, =, <> + // + // In the filter statement, values interpreted as strings must be enclosed in quotes, + // while values interpreted as integers should not be. Strings and integers are the only + // supported value types. + // + // The currently supported key names in the filter are: + // Constants.HIVE_FILTER_FIELD_OWNER, which filters on the tables' owner's name + // and supports all filter operators + // Constants.HIVE_FILTER_FIELD_LAST_ACCESS, which filters on the last access times + // and supports all filter operators except LIKE + // Constants.HIVE_FILTER_FIELD_PARAMS, which filters on the tables' parameter keys and values + // and only supports the filter operators = and <>. + // Append the parameter key name to HIVE_FILTER_FIELD_PARAMS in the filter statement. + // For example, to filter on parameter keys called "retention", the key name in the filter + // statement should be Constants.HIVE_FILTER_FIELD_PARAMS + "retention" + // Also, = and <> only work for keys that exist + // in the tables. E.g., if you are looking for tables where key1 <> value, it will only + // look at tables that have a value for the parameter key1. + // Some example filter statements include: + // filter = Constants.HIVE_FILTER_FIELD_OWNER + " like \".*test.*\" and " + + // Constants.HIVE_FILTER_FIELD_LAST_ACCESS + " = 0"; + // filter = Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"30\" or " + + // Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"90\"" + // @param dbName + // The name of the database from which you will retrieve the table names + // @param filterType + // The type of filter + // @param filter + // The filter string + // @param max_tables + // The maximum number of tables returned + // @return A list of table names that match the desired filter + list get_table_names_by_filter(1:string dbname, 2:string filter, 3:i16 max_tables=-1) + throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3) + + // alter table applies to only future partitions not for existing partitions + // * See notes on DDL_TIME + void alter_table(1:string dbname, 2:string tbl_name, 3:Table new_tbl) + throws (1:InvalidOperationException o1, 2:MetaException o2) + void alter_table_with_environment_context(1:string dbname, 2:string tbl_name, + 3:Table new_tbl, 4:EnvironmentContext environment_context) + throws (1:InvalidOperationException o1, 2:MetaException o2) + // the following applies to only tables that have partitions + // * See notes on DDL_TIME + Partition add_partition(1:Partition new_part) + throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) + Partition add_partition_with_environment_context(1:Partition new_part, + 2:EnvironmentContext environment_context) + throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, + 3:MetaException o3) + i32 add_partitions(1:list new_parts) + throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) + Partition append_partition(1:string db_name, 2:string tbl_name, 3:list part_vals) + throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) + Partition append_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name) + throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 
3:MetaException o3) + bool drop_partition(1:string db_name, 2:string tbl_name, 3:list part_vals, 4:bool deleteData) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + bool drop_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name, 4:bool deleteData) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + Partition get_partition(1:string db_name, 2:string tbl_name, 3:list part_vals) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + Partition get_partition_with_auth(1:string db_name, 2:string tbl_name, 3:list part_vals, + 4: string user_name, 5: list group_names) throws(1:MetaException o1, 2:NoSuchObjectException o2) + + Partition get_partition_by_name(1:string db_name 2:string tbl_name, 3:string part_name) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // returns all the partitions for this table in reverse chronological order. + // If max parts is given then it will return only that many. + list get_partitions(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + list get_partitions_with_auth(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1, + 4: string user_name, 5: list group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2) + + list get_partition_names(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1) + throws(1:MetaException o2) + + // get_partition*_ps methods allow filtering by a partial partition specification, + // as needed for dynamic partitions. The values that are not restricted should + // be empty strings. Nulls were considered (instead of "") but caused errors in + // generated Python code. The size of part_vals may be smaller than the + // number of partition columns - the unspecified values are considered the same + // as "". + list get_partitions_ps(1:string db_name 2:string tbl_name + 3:list part_vals, 4:i16 max_parts=-1) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + list get_partitions_ps_with_auth(1:string db_name, 2:string tbl_name, 3:list part_vals, 4:i16 max_parts=-1, + 5: string user_name, 6: list group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2) + + list get_partition_names_ps(1:string db_name, + 2:string tbl_name, 3:list part_vals, 4:i16 max_parts=-1) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // get the partitions matching the given partition filter + list get_partitions_by_filter(1:string db_name 2:string tbl_name + 3:string filter, 4:i16 max_parts=-1) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // get partitions give a list of partition names + list get_partitions_by_names(1:string db_name 2:string tbl_name 3:list names) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // changes the partition to the new partition object. partition is identified from the part values + // in the new_part + // * See notes on DDL_TIME + void alter_partition(1:string db_name, 2:string tbl_name, 3:Partition new_part) + throws (1:InvalidOperationException o1, 2:MetaException o2) + + void alter_partition_with_environment_context(1:string db_name, + 2:string tbl_name, 3:Partition new_part, + 4:EnvironmentContext environment_context) + throws (1:InvalidOperationException o1, 2:MetaException o2) + + // rename the old partition to the new partition object by changing old part values to the part values + // in the new_part. old partition is identified from part_vals. + // partition keys in new_part should be the same as those in old partition. 
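The partial-specification convention above is easiest to see with an example: for a hypothetical table partitioned by (ds, hr), restricting only ds means passing the day followed by "". A sketch against an assumed Thrift-generated client for this service (the java namespace is declared at the top of this file; the database, table, and partition columns are made up); the mutation calls such as rename_partition() continue below.

    // Sketch only: assumes the Thrift-generated ThriftHiveMetastore.Client; table layout is hypothetical.
    import scala.collection.JavaConverters._

    def partitionsForDay(client: ThriftHiveMetastore.Client, day: String) = {
      val allPartitions: Short = -1                 // i16 max_parts: -1 = no limit
      val partialSpec = List(day, "").asJava        // "" = unrestricted trailing partition column
      client.get_partitions_ps("default", "events", partialSpec, allPartitions)
    }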
+ void rename_partition(1:string db_name, 2:string tbl_name, 3:list part_vals, 4:Partition new_part) + throws (1:InvalidOperationException o1, 2:MetaException o2) + + // gets the value of the configuration key in the metastore server. returns + // defaultValue if the key does not exist. if the configuration key does not + // begin with "hive", "mapred", or "hdfs", a ConfigValSecurityException is + // thrown. + string get_config_value(1:string name, 2:string defaultValue) + throws(1:ConfigValSecurityException o1) + + // converts a partition name into a partition values array + list partition_name_to_vals(1: string part_name) + throws(1: MetaException o1) + // converts a partition name into a partition specification (a mapping from + // the partition cols to the values) + map partition_name_to_spec(1: string part_name) + throws(1: MetaException o1) + + void markPartitionForEvent(1:string db_name, 2:string tbl_name, 3:map part_vals, + 4:PartitionEventType eventType) throws (1: MetaException o1, 2: NoSuchObjectException o2, + 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5, + 6: InvalidPartitionException o6) + bool isPartitionMarkedForEvent(1:string db_name, 2:string tbl_name, 3:map part_vals, + 4: PartitionEventType eventType) throws (1: MetaException o1, 2:NoSuchObjectException o2, + 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5, + 6: InvalidPartitionException o6) + + //index + Index add_index(1:Index new_index, 2: Table index_table) + throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) + void alter_index(1:string dbname, 2:string base_tbl_name, 3:string idx_name, 4:Index new_idx) + throws (1:InvalidOperationException o1, 2:MetaException o2) + bool drop_index_by_name(1:string db_name, 2:string tbl_name, 3:string index_name, 4:bool deleteData) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + Index get_index_by_name(1:string db_name 2:string tbl_name, 3:string index_name) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + list get_indexes(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + list get_index_names(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1) + throws(1:MetaException o2) + + //authorization privileges + + bool create_role(1:Role role) throws(1:MetaException o1) + bool drop_role(1:string role_name) throws(1:MetaException o1) + list get_role_names() throws(1:MetaException o1) + bool grant_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type, + 4:string grantor, 5:PrincipalType grantorType, 6:bool grant_option) throws(1:MetaException o1) + bool revoke_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type) + throws(1:MetaException o1) + list list_roles(1:string principal_name, 2:PrincipalType principal_type) throws(1:MetaException o1) + + PrincipalPrivilegeSet get_privilege_set(1:HiveObjectRef hiveObject, 2:string user_name, + 3: list group_names) throws(1:MetaException o1) + list list_privileges(1:string principal_name, 2:PrincipalType principal_type, + 3: HiveObjectRef hiveObject) throws(1:MetaException o1) + + bool grant_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1) + bool revoke_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1) + + // this is used by metastore client to send UGI information to metastore server immediately + // after setting up a connection. 
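partition_name_to_vals() and partition_name_to_spec() above both operate on Hive's key=value/key=value partition names, the same convention the loader code later in this patch parses out of HDFS paths (HadoopFs.findPartitions). A standalone illustration of the expected mapping, ignoring the escaping Hive applies to special characters; the connection-level calls (set_ugi() and the delegation-token methods) continue below.

    // Client-side illustration of the documented mapping only; the real conversion
    // is done server-side by partition_name_to_vals / partition_name_to_spec.
    def specOf(partName: String): Map[String, String] =
      partName.split("/").map { segment =>
        val Array(k, v) = segment.split("=", 2)
        k -> v
      }.toMap

    specOf("ds=2014-11-21/hr=00")
    // Map("ds" -> "2014-11-21", "hr" -> "00"); the _vals variant returns just
    // the values, List("2014-11-21", "00"), in partition-column order.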
+ list set_ugi(1:string user_name, 2:list group_names) throws (1:MetaException o1) + + //Authentication (delegation token) interfaces + + // get metastore server delegation token for use from the map/reduce tasks to authenticate + // to metastore server + string get_delegation_token(1:string token_owner, 2:string renewer_kerberos_principal_name) + throws (1:MetaException o1) + + // method to renew delegation token obtained from metastore server + i64 renew_delegation_token(1:string token_str_form) throws (1:MetaException o1) + + // method to cancel delegation token obtained from metastore server + void cancel_delegation_token(1:string token_str_form) throws (1:MetaException o1) +} + +// * Note about the DDL_TIME: When creating or altering a table or a partition, +// if the DDL_TIME is not set, the current time will be used. + +// For storing info about archived partitions in parameters + +// Whether the partition is archived +const string IS_ARCHIVED = "is_archived", +// The original location of the partition, before archiving. After archiving, +// this directory will contain the archive. When the partition +// is dropped, this directory will be deleted +const string ORIGINAL_LOCATION = "original_location", + +// these should be needed only for backward compatibility with filestore +const string META_TABLE_COLUMNS = "columns", +const string META_TABLE_COLUMN_TYPES = "columns.types", +const string BUCKET_FIELD_NAME = "bucket_field_name", +const string BUCKET_COUNT = "bucket_count", +const string FIELD_TO_DIMENSION = "field_to_dimension", +const string META_TABLE_NAME = "name", +const string META_TABLE_DB = "db", +const string META_TABLE_LOCATION = "location", +const string META_TABLE_SERDE = "serde", +const string META_TABLE_PARTITION_COLUMNS = "partition_columns", +const string FILE_INPUT_FORMAT = "file.inputformat", +const string FILE_OUTPUT_FORMAT = "file.outputformat", +const string META_TABLE_STORAGE = "storage_handler", + + + diff --git a/herringbone-main/pom.xml b/herringbone-main/pom.xml new file mode 100644 index 0000000..08a54ab --- /dev/null +++ b/herringbone-main/pom.xml @@ -0,0 +1,168 @@ + + 4.0.0 + + com.stripe + herringbone-main + 0.0.1 + jar + + Herringbone Main + + + + dtrott + http://maven.davidtrott.com/repository + + + + + + + org.scalatest + scalatest-maven-plugin + 1.0-M2 + + ${project.build.directory}/surefire-reports + . 
+ WDF TestSuite.txt + ${project.build.directory}/html/scalatest + false + + + + test + + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.1 + + 1.6 + 1.6 + + + + maven-jar-plugin + 2.3.1 + + + + maven-resources-plugin + 2.4.3 + + + + net.alchim31.maven + scala-maven-plugin + 3.1.6 + + incremental + true + + + + + compile + testCompile + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 2.3 + + false + target/herringbone-${project.version}-jar-with-dependencies.jar + + + + package + + shade + + + + + + + + + 1.6.0rc4 + UTF-8 + 2.10.4 + 1.7 + 1.7 + + + + + com.twitter + parquet-common + ${parquet.version} + + + com.twitter + parquet-encoding + ${parquet.version} + + + com.twitter + parquet-column + ${parquet.version} + + + com.twitter + parquet-hadoop + ${parquet.version} + + + org.apache.hadoop + hadoop-client + 2.5.2 + provided + + + org.apache.hive + hive-jdbc + 0.14.0 + + + com.twitter + parquet-hadoop-bundle + + + + + org.rogach + scallop_2.10 + 0.9.5 + + + org.scala-lang + jline + 2.9.0-1 + + + org.scalatest + scalatest_2.10 + 2.0 + test + + + org.scalamock + scalamock-scalatest-support_2.10 + 3.1.RC1 + test + + + diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/CompactInputFormat.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/CompactInputFormat.scala new file mode 100644 index 0000000..c9f1628 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/CompactInputFormat.scala @@ -0,0 +1,168 @@ +package com.stripe.herringbone + +import java.util.{List => JavaList} +import java.io.DataOutput +import java.io.DataInput + +import scala.collection.mutable.MutableList +import scala.collection.JavaConverters._ +import scala.collection.JavaConversions._ + +import org.apache.hadoop.io.Writable +import org.apache.hadoop.mapreduce.{InputSplit,Job,JobContext,Mapper,TaskAttemptContext} +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat +import parquet.hadoop.api.ReadSupport +import parquet.hadoop.{ParquetInputFormat,ParquetInputSplit,ParquetOutputFormat,ParquetRecordReader} +import parquet.hadoop.example.{ExampleOutputFormat,GroupReadSupport} +import parquet.hadoop.util.ContextUtil +import parquet.example.data.{Group,GroupWriter} +import parquet.example.data.simple.SimpleGroup + + +class CompactInputFormat[T](readSupportClass: Class[_ <: ReadSupport[T]]) extends ParquetInputFormat[T](readSupportClass) { + + // We can't accurately predict the size of the resulting merged file, so aim + // for 900MB. Our HDFS block size is 1024MB so we'll get pretty close. + val TARGET = 1024 * 1024 * 900 // 900MB. + + override def getSplits(context: JobContext): JavaList[InputSplit] = { + // Limit the splits to 20MB so it's easy to assemble them into 900MB chunks. + // This is not actually reliable. Chunks can come back bigger than 20MB, but + // it does limit the size of most chunks. + val conf = ContextUtil.getConfiguration(context) + conf.set("mapred.max.split.size", (20 * 1024 * 1024).toString) + + val splits = super.getSplits(conf, getFooters(context)).asScala.toList + val m = if (splits.isEmpty) splits else mergeSplits(splits) + m.asInstanceOf[List[InputSplit]].asJava + } + + def mergeSplits(splits: List[ParquetInputSplit]): List[MergedInputSplit] = { + val sizes = splits.map { _.getLength } + println(s"""${splits.length} initial splits were generated. 
+ | Max: ${mb(sizes.max)} + | Min: ${mb(sizes.min)} + | Avg: ${mb(sizes.sum.toDouble / sizes.length)}""".stripMargin) + + // TODO: get a CS undergrad to give us better bin packing. + var buckets = MutableList[MutableList[ParquetInputSplit]](MutableList(splits.head)) + splits.tail.foreach { split => + val bucket = buckets.minBy { b => b.map { _.getLength }.sum } + if ((split.getLength + bucket.map { _.getLength }.sum) < TARGET) { + bucket += split + } else { + buckets += MutableList(split) + } + } + + val newSizes = buckets.map { _.map { _.getLength }.sum }.toList + println(s"""${buckets.length} merged splits were generated. + | Max: ${mb(newSizes.max)} + | Min: ${mb(newSizes.min)} + | Avg: ${mb(newSizes.sum.toDouble / newSizes.length)}""".stripMargin) + + buckets.map { b => new MergedInputSplit(b.toList) }.toList + } + + override def createRecordReader(split: InputSplit, context: TaskAttemptContext): MergedRecordReader[T] = { + val readSupport = getReadSupport(ContextUtil.getConfiguration(context)) + split match { + case s: MergedInputSplit => new MergedRecordReader[T](s, context, readSupport) + case _ => throw new Exception(s"Expected a MergedInputSplit. Found a $split.") + } + } + + // Helper for pretty-printing byte values. + def mb(n: Double): String = { + val K = 1024 + val M = K * K + val G = K * M + if (n < K) f"$n%.2fB" + else if (n < M) f"${n / K}%.2fK" + else if (n < G) f"${n / M}%.2fM" + else f"${n / G}%.2fG" + } +} + +class MergedInputSplit(var splits: List[ParquetInputSplit]) extends InputSplit with Writable { + def this() = this(List()) + + var splitNumber = 0 + + def currentSplit: ParquetInputSplit = splits(splitNumber) + def nextSplit: Option[ParquetInputSplit] = { + if (splitNumber < splits.length - 1) { + splitNumber += 1 + Some(currentSplit) + } else { + None + } + } + + // write and readFields are paired for serialization/deserialization. + override def write(out: DataOutput) = { + out.writeInt(splits.length) + splits.foreach { s => s.write(out) } + } + + override def readFields(in: DataInput) = { + val count = in.readInt + splits = for (i <- List.range(0, count)) yield { + val s = new ParquetInputSplit + s.readFields(in) + s + } + } + + override def getLength: Long = splits.map { _.getLength }.sum + override def getLocations: Array[String] = splits.flatMap { _.getLocations }.toArray + override def toString = "" +} + +class MergedRecordReader[T](split: MergedInputSplit, + taskContext: TaskAttemptContext, + readSupport: ReadSupport[T]) extends ParquetRecordReader[T](readSupport) { + val totalLength = split.getLength + var progress = 0L + + override def initialize(split: InputSplit, context: TaskAttemptContext) { + super.initialize(split.asInstanceOf[MergedInputSplit].currentSplit, context) + } + + def startNextSplit(split: MergedInputSplit, context: TaskAttemptContext): Boolean = { + split.nextSplit match { + case Some(s) => { + super.initialize(s, context) + true + } + case None => false + } + } + + // nextKeyValue is used to ask for the next tuple and returns false when the + // recordReader has no more tuples. Since we're wrapping multiple splits, and + // therefore multiple record readers, we detect when the current inernal + // reader is done and move to the next reader. 
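The same idea in miniature, independent of the Hadoop and Parquet types: keep a cursor over several sources, drain the current one, and when it is exhausted move on to the next. A standalone sketch (not code from this commit) of the pattern nextKeyValue implements below:

    // Standalone sketch of the reader-chaining pattern used by MergedRecordReader.
    class ChainedCursor[T](sources: List[Iterator[T]]) {
      private var remaining = sources
      def next(): Option[T] = remaining match {
        case Nil => None
        case current :: rest =>
          if (current.hasNext) Some(current.next())
          else { remaining = rest; next() }   // current source exhausted: advance
      }
    }

    val cursor = new ChainedCursor(List(Iterator(1, 2), Iterator[Int](), Iterator(3)))
    // cursor.next() yields Some(1), Some(2), Some(3), then None.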
+ override def nextKeyValue: Boolean = { + val next = super.nextKeyValue + if (next) { + next + } else { + super.close + progress += split.currentSplit.getLength + + if (startNextSplit(split, taskContext)) { + nextKeyValue + } else { + false + } + } + } + + override def toString = "" + override def getProgress: Float = progress / totalLength +} + + +class CompactGroupInputFormat extends CompactInputFormat[Group](classOf[GroupReadSupport]) { } diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/CompactJob.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/CompactJob.scala new file mode 100644 index 0000000..ba690f7 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/CompactJob.scala @@ -0,0 +1,98 @@ +package com.stripe.herringbone + +import com.stripe.herringbone.util.ParquetUtils + +import java.util.{List => JavaList} +import java.io.DataOutput +import java.io.DataInput + +import scala.collection.mutable.MutableList +import scala.collection.JavaConverters._ + +import org.apache.hadoop.conf.{Configuration,Configured} +import org.apache.hadoop.fs.{FileSystem,Path} +import org.apache.hadoop.mapreduce.{Job,Mapper} +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat +import org.apache.hadoop.util.{Tool,ToolRunner} + +import org.codehaus.jackson.map.ObjectMapper +import org.codehaus.jackson.`type`.TypeReference + +import org.rogach.scallop.ScallopConf + +import parquet.example.data.{Group,GroupWriter} +import parquet.hadoop.{BadConfigurationException,ParquetOutputFormat} +import parquet.hadoop.api.{DelegatingWriteSupport,WriteSupport} +import parquet.hadoop.api.WriteSupport.FinalizedWriteContext +import parquet.hadoop.example.GroupWriteSupport + +class ParquetCompactConf(arguments: Seq[String]) extends ScallopConf(arguments) { + val inputPath = opt[String](required = true) + val outputPath = opt[String](required = true) +} + +class ParquetCompactWriteSupport extends DelegatingWriteSupport[Group](new GroupWriteSupport) { + var extraMetadata: java.util.Map[String, String] = _ + + override def init(configuration: Configuration): WriteSupport.WriteContext = { + extractMetadata(configuration) + super.init(configuration) + } + + override def finalizeWrite(): FinalizedWriteContext = { + new FinalizedWriteContext(extraMetadata) + } + + def extractMetadata(configuration: Configuration) = { + val metadataJson = configuration.get(ParquetCompactWriteSupport.ExtraMetadataKey) + try { + extraMetadata = new ObjectMapper().readValue(metadataJson, new TypeReference[java.util.Map[String,String]](){}) + } catch { case e: java.io.IOException => + throw new BadConfigurationException("Unable to deserialize extra extra metadata: " + metadataJson, e) + } + } +} + +object ParquetCompactWriteSupport { + val ExtraMetadataKey = "herringbone.compact.extrametadata" +} + +class CompactJob extends Configured with Tool { + override def run(arguments: Array[String]) = { + val args = new ParquetCompactConf(arguments) + val fs = FileSystem.get(getConf) + val inputPath = new Path(args.inputPath()) + val outputPath = new Path(args.outputPath()) + + // Pass along metadata (which includes the thrift schema) to the results. 
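ParquetCompactWriteSupport above and the job setup below hand the existing Parquet key/value metadata through the Configuration as a JSON string. A minimal round trip with the same (org.codehaus) Jackson calls, using a made-up map entry:

    // Sketch of the JSON hand-off only; the key/value pair is hypothetical.
    import org.codehaus.jackson.map.ObjectMapper
    import org.codehaus.jackson.`type`.TypeReference

    val mapper   = new ObjectMapper()
    val metadata = new java.util.HashMap[String, String]()
    metadata.put("example.key", "example value")

    val json = mapper.writeValueAsString(metadata)   // what the job stores under ExtraMetadataKey
    val back = mapper.readValue(json, new TypeReference[java.util.Map[String, String]]() {})
    // `back` is the java.util.Map the write support attaches via FinalizedWriteContext.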
+ val metadata = ParquetUtils.readKeyValueMetaData(inputPath, fs) + val metadataJson = new ObjectMapper().writeValueAsString(metadata) + getConf.set(ParquetCompactWriteSupport.ExtraMetadataKey, metadataJson) + + val job = new Job(getConf) + + FileInputFormat.setInputPaths(job, inputPath) + FileOutputFormat.setOutputPath(job, outputPath) + ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetCompactWriteSupport]) + GroupWriteSupport.setSchema(ParquetUtils.readSchema(inputPath, fs), job.getConfiguration) + + job.setJobName("compact " + args.inputPath() + " → " + args.outputPath()) + job.setInputFormatClass(classOf[CompactGroupInputFormat]); + job.setOutputFormatClass(classOf[ParquetOutputFormat[Group]]) + job.setMapperClass(classOf[Mapper[Void,Group,Void,Group]]) + job.setJarByClass(classOf[CompactJob]) + job.getConfiguration.set("mapreduce.job.user.classpath.first", "true") + job.setNumReduceTasks(0) + + if(job.waitForCompletion(true)) 0 else 1 + } +} + +object CompactJob { + + def main(args: Array[String]) = { + val result = ToolRunner.run(new Configuration, new CompactJob, args) + System.exit(result) + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/FlattenJob.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/FlattenJob.scala new file mode 100644 index 0000000..5d78de1 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/FlattenJob.scala @@ -0,0 +1,78 @@ +package com.stripe.herringbone + +import com.stripe.herringbone.flatten.{ParquetFlatConf,ParquetFlatMapper,TypeFlattener} +import com.stripe.herringbone.flatten.FlatConverter +import com.stripe.herringbone.util.ParquetUtils + +import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.input._ +import org.apache.hadoop.mapreduce.lib.output._ +import org.apache.hadoop.util._ +import org.apache.hadoop.fs._ +import org.apache.hadoop.conf._ + +import parquet.example.data._ +import parquet.example.data.simple._ +import parquet.hadoop._ +import parquet.hadoop.example._ +import parquet.io.api._ +import parquet.schema._ + +import org.rogach.scallop._ + +class FlattenMapper extends ParquetFlatMapper[Group] { + def valueOut(value: Group) = { + FlatConverter.flattenGroup(value, flattenedSchema, separator, renameId) + } +} + +class FlattenJob extends Configured with Tool { + override def run(args: Array[String]) = { + val conf = new ParquetFlatConf(args) + val fs = FileSystem.get(getConf) + val inputPath = new Path(conf.inputPath()) + val outputPath = new Path(conf.outputPath()) + val previousPath = conf.previousPath.get.map{new Path(_)} + + val separator = conf.separator() + getConf.set(ParquetFlatMapper.SeparatorKey, separator) + + val renameId = conf.renameId() + getConf.set(ParquetFlatMapper.RenameIdKey, renameId.toString) + + if (fs.exists(outputPath)) { + println(s"Deleting existing $outputPath") + fs.delete(outputPath, true) + } + + val flattenedSchema = TypeFlattener.flatten( + ParquetUtils.readSchema(inputPath, fs), + previousPath.map { ParquetUtils.readSchema(_, fs) }, + separator, + renameId + ) + + val jobName = "flatten " + conf.inputPath() + " -> " + conf.outputPath() + val job = new Job(getConf, jobName) + + FileInputFormat.setInputPaths(job, inputPath) + FileOutputFormat.setOutputPath(job, outputPath) + ExampleOutputFormat.setSchema(job, flattenedSchema) + + job.setInputFormatClass(classOf[CompactGroupInputFormat]); + job.setOutputFormatClass(classOf[ExampleOutputFormat]) + job.setMapperClass(classOf[FlattenMapper]) + 
job.setJarByClass(classOf[FlattenJob]) + job.getConfiguration.set("mapreduce.job.user.classpath.first", "true") + job.setNumReduceTasks(0) + + if (job.waitForCompletion(true)) 0 else 1 + } +} + +object FlattenJob { + def main(args: Array[String]) = { + val result = ToolRunner.run(new Configuration, new FlattenJob, args) + System.exit(result) + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/ParquetLoad.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/ParquetLoad.scala new file mode 100644 index 0000000..013aa4a --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/ParquetLoad.scala @@ -0,0 +1,45 @@ +package com.stripe.herringbone + +import com.stripe.herringbone.load._ + +import org.apache.hadoop.conf._ +import org.apache.hadoop.util._ + +class ParquetLoad extends Configured with Tool { + override def run(args: Array[String]): Int = { + val conf = new ParquetLoadConf(args) + val hadoopFs = new HadoopFs() + val fieldUtils = FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper) + + val loader: ParquetLoader = if (conf.hive()) { + HiveLoader(conf, hadoopFs, fieldUtils) + } else { + ImpalaLoader(conf, hadoopFs, fieldUtils) + } + + if (conf.updatePartitions()) { + val tableExists = loader.checkTableExists(conf.table(), conf.database()) + + (conf.path.get, tableExists) match { + case (_, true) => loader.updateTable(conf.table(), conf.database()) + case (Some(path), false) => loader.createTable(path, conf.table(), conf.database()) + case (None, false) => { + println("ERROR - path not specified and table not yet created. Specify path from which to create the table") + return 1 + } + } + } else { + loader.createTable(conf.path(), conf.table(), conf.database()) + } + loader.closeConnection + + 0 + } +} + +object ParquetLoad { + def main(args: Array[String]) = { + val result = ToolRunner.run(new Configuration, new ParquetLoad, args) + System.exit(result) + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/TsvJob.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/TsvJob.scala new file mode 100644 index 0000000..ab61dca --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/TsvJob.scala @@ -0,0 +1,98 @@ +package com.stripe.herringbone + +import com.stripe.herringbone.flatten.{ParquetFlatConf,ParquetFlatMapper,TypeFlattener} +import com.stripe.herringbone.flatten.FlatConverter +import com.stripe.herringbone.util.ParquetUtils + +import java.io.{BufferedWriter, OutputStreamWriter} + +import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.input._ +import org.apache.hadoop.mapreduce.lib.output._ +import org.apache.hadoop.util._ +import org.apache.hadoop.fs._ +import org.apache.hadoop.conf._ +import org.apache.hadoop.io.Text + +import org.rogach.scallop._ + +import parquet.example.data._ +import parquet.example.data.simple._ +import parquet.hadoop._ +import parquet.hadoop.example._ +import parquet.io.api._ +import parquet.schema._ + +import scala.collection.JavaConversions._ + +class TsvMapper extends ParquetFlatMapper[Text] { + def valueOut(value: Group) = { + val tsvLine = FlatConverter.groupToTSV(value, flattenedSchema, separator, renameId) + "\n" + new Text(tsvLine) + } +} + +class TsvJob extends Configured with Tool { + override def run(args: Array[String]) = { + val conf = new ParquetFlatConf(args) + val fs = FileSystem.get(getConf) + val inputPath = new Path(conf.inputPath()) + val outputPath = new Path(conf.outputPath()) + val previousPath = conf.previousPath.get.map{new 
Path(_)} + + val separator = conf.separator() + getConf.set(ParquetFlatMapper.SeparatorKey, separator) + + val renameId = conf.renameId() + getConf.set(ParquetFlatMapper.RenameIdKey, renameId.toString) + + if (fs.exists(outputPath)) { + println(s"Deleting existing $outputPath") + fs.delete(outputPath, true) + } + + val flattenedSchema = TypeFlattener.flatten( + ParquetUtils.readSchema(inputPath, fs), + previousPath.map { ParquetUtils.readSchema(_, fs) }, + separator, + renameId + ) + + val jobName = "tsv " + conf.inputPath() + " -> " + conf.outputPath() + val job = new Job(getConf, jobName) + + FileInputFormat.setInputPaths(job, inputPath) + FileOutputFormat.setOutputPath(job, outputPath) + ExampleOutputFormat.setSchema(job, flattenedSchema) + + job.setInputFormatClass(classOf[CompactGroupInputFormat]) + job.setOutputFormatClass(classOf[TextOutputFormat[Text, Text]].asInstanceOf[Class[Nothing]]) + job.setMapperClass(classOf[TsvMapper]) + job.setJarByClass(classOf[TsvJob]) + job.getConfiguration.set("mapreduce.job.user.classpath.first", "true") + job.setNumReduceTasks(0) + + if (job.waitForCompletion(true)) { + val headerPath = new Path(conf.outputPath() + "/_header.tsv") + writeHeader(fs, headerPath, flattenedSchema) + 0 + } else { + 1 + } + } + + def writeHeader(fs: FileSystem, outputPath: Path, schema: MessageType) { + val header = FlatConverter.constructHeader(schema) + val writer = new BufferedWriter(new OutputStreamWriter(fs.create(outputPath, true))) + writer.write(header) + writer.write("\n") + writer.close() + } +} + +object TsvJob { + def main(args: Array[String]) = { + val result = ToolRunner.run(new Configuration, new TsvJob, args) + System.exit(result) + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConsumer.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConsumer.scala new file mode 100644 index 0000000..e0f837a --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConsumer.scala @@ -0,0 +1,108 @@ +package com.stripe.herringbone.flatten + +import org.apache.hadoop.mapreduce._ +import org.apache.hadoop.mapreduce.lib.input._ +import org.apache.hadoop.mapreduce.lib.output._ +import org.apache.hadoop.util._ +import org.apache.hadoop.fs._ +import org.apache.hadoop.conf._ + +import parquet.example.data._ +import parquet.example.data.simple._ +import parquet.hadoop._ +import parquet.hadoop.example._ +import parquet.io.api._ +import parquet.schema._ + +class FlatConsumer(output: Group, separator: String, renameId: Boolean) extends RecordConsumer { + + case class StackFrame(field: String, var values: List[String]) + var stack = List[StackFrame]() + // Impala stops working after a field becomes too long. The docs + // indicate that we should have 32k. However, a binary search on a + // too-long field yielded 6776 as the maximum working value. 
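The cap is a byte budget, not a character count, so a long value is simply cut at the MaxStringBytes limit defined just below (which can split a multi-byte UTF-8 character, as the Binary-based truncate method further down also can). A standalone sketch of that truncation with plain byte arrays:

    // Standalone illustration; mirrors FlatConsumer.truncate but without Parquet types.
    def truncateUtf8(s: String, maxBytes: Int): Array[Byte] = {
      val bytes = s.getBytes("UTF-8")
      if (bytes.length <= maxBytes) bytes
      else java.util.Arrays.copyOfRange(bytes, 0, maxBytes)   // may split a multi-byte char
    }

    truncateUtf8("a" * 10000, 6776).length   // 6776, the limit found by that binary search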
+ val MaxStringBytes = 6776 + + def startMessage {} + def endMessage {} + def startGroup {} + def endGroup {} + + def startField(field: String, index: Int) { + stack ::= StackFrame(field, Nil) + } + + def endField(field: String, index: Int) { + if(stack.head.values.size > 0) { + withField {name => + val joined = Binary.fromString( + stack + .head + .values + .reverse + .map{_.replace("\t", " ")} + .mkString(",")) + val truncated = truncate(joined, MaxStringBytes) + output.add(name, truncated) + } + } + stack = stack.tail + } + + def addInteger(value: Int) { + writeField(value.toString){name => output.add(name, value)} + } + + def addLong(value: Long) { + writeField(value.toString){name => output.add(name, value)} + } + + def addBoolean(value: Boolean) { + writeField(value.toString){name => output.add(name, value)} + } + + def truncate(value: Binary, length: Integer): Binary = { + if (value.length <= length) { + value + } else { + val bytesTruncated = new Array[Byte](length) + value.toByteBuffer.get(bytesTruncated, 0, length) + Binary.fromByteArray(bytesTruncated) + } + } + + def addBinary(value: Binary) { + // Truncate strings so Impala doesn't break + val truncated = truncate(value, MaxStringBytes) + writeField(truncated.toStringUsingUTF8){name => output.add(name, truncated)} + } + + def addFloat(value: Float) { + writeField(value.toString){name => output.add(name, value)} + } + + def addDouble(value: Double) { + writeField(value.toString){name => output.add(name, value)} + } + + def withField(fn: String=>Unit) { + val path = if (TypeFlattener.omitIdField(stack.head.field, stack.size, renameId)) + stack.tail + else + stack + + val name = path.reverse.map{_.field}.mkString(separator) + if(output.getType.containsField(name)) + fn(name) + } + + def writeField(stringRep: =>String)(fn: String => Unit) { + withField{name => + val fieldType = output.getType.getType(name) + if(fieldType.asInstanceOf[PrimitiveType].getPrimitiveTypeName == PrimitiveType.PrimitiveTypeName.BINARY) + stack.head.values ::= stringRep + else + fn(name) + } + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConverter.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConverter.scala new file mode 100644 index 0000000..51741e7 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConverter.scala @@ -0,0 +1,54 @@ +package com.stripe.herringbone.flatten + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.conf.Configuration + +import parquet.example.data.Group +import parquet.example.data.GroupWriter +import parquet.example.data.simple.SimpleGroup +import parquet.schema.MessageType + +import scala.collection.JavaConversions._ + +object FlatConverter { + def groupToTSV(group: Group, flatSchema: MessageType, separator: String, renameId: Boolean): String = { + val flatGroup = flattenGroup(group, flatSchema, separator, renameId) + val fieldValues = (0 until flatSchema.getFieldCount).map{ field => + val valueCount = flatGroup.getFieldRepetitionCount(field) + if (valueCount == 0) { + "" + } else if (valueCount == 1) { + escapeString(flatGroup.getValueToString(field, 0)) + } else { + escapeString(flatGroup.getValueToString(field, 0)) + System.err.println("Warning: Field contains multiple values, extracting only the first") + System.err.println(flatGroup.toString) + } + } + fieldValues.mkString("\t") + } + + def constructHeader(schema: MessageType) = { + schema + .getPaths() + .toList + .map{_(0)} + .mkString("\t") + } + + def 
flattenGroup(group: Group, flatSchema: MessageType, separator: String, renameId: Boolean) = { + var flatGroup = new SimpleGroup(flatSchema) + val writer = new GroupWriter(new FlatConsumer(flatGroup, separator, renameId), group.getType) + writer.write(group) + flatGroup + } + + private def escapeString(s: String) = { + val quote = "\"" + if (s.contains("\t")) + // This is how pandas escapes tabs and quotes + quote + s.replace(quote, "\"\"") + quote + else + s + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatConf.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatConf.scala new file mode 100644 index 0000000..89ea49f --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatConf.scala @@ -0,0 +1,11 @@ +package com.stripe.herringbone.flatten + +import org.rogach.scallop._ + +class ParquetFlatConf(arguments: Seq[String]) extends ScallopConf(arguments) { + val inputPath = opt[String](required = true) + val outputPath = opt[String](required = true) + val previousPath = opt[String](descr = "Path of previously generated flat output, so field ordering can be maintained (optional)") + val separator = opt[String](default = Some("__")) + val renameId = opt[Boolean](descr = "Flatten a.b.id as a__b instead of a__b__id") +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatMapper.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatMapper.scala new file mode 100644 index 0000000..f6c1a03 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatMapper.scala @@ -0,0 +1,29 @@ +package com.stripe.herringbone.flatten + +import org.apache.hadoop.mapreduce.Mapper +import parquet.example.data.Group +import parquet.schema.{MessageType,MessageTypeParser} + +abstract class ParquetFlatMapper[ValueOut] extends Mapper[Void,Group,Void,ValueOut] { + var flattenedSchema: MessageType = _ + var separator: String = _ + var renameId: Boolean = _ + + override def setup(context: Mapper[Void,Group,Void,ValueOut]#Context) { + // the schema is stored in the job context when we call ExampleOutputFormat.setSchema + flattenedSchema = MessageTypeParser.parseMessageType(context.getConfiguration.get("parquet.example.schema")) + separator = context.getConfiguration.get(ParquetFlatMapper.SeparatorKey) + renameId = context.getConfiguration.get(ParquetFlatMapper.RenameIdKey) == "true" + } + + override def map(key: Void, value: Group, context: Mapper[Void,Group,Void,ValueOut]#Context) { + context.write(key, valueOut(value)) + } + + def valueOut(value: Group): ValueOut +} + +object ParquetFlatMapper { + val SeparatorKey = "herringbone.flatten.separator" + val RenameIdKey = "herringbone.flatten.rename.id" +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/TypeFlattener.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/TypeFlattener.scala new file mode 100644 index 0000000..246972e --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/TypeFlattener.scala @@ -0,0 +1,59 @@ +package com.stripe.herringbone.flatten + +import parquet.schema._ +import java.util.{List=>JList} +import scala.collection.JavaConverters._ + +class TypeFlattener(separator: String, renameId: Boolean) extends TypeConverter[List[Type]] { + def convertPrimitiveType(path: JList[GroupType], primitiveType: PrimitiveType) = { + val typeName = + if(TypeFlattener.isRepeated(primitiveType)) + 
PrimitiveType.PrimitiveTypeName.BINARY + else + primitiveType.getPrimitiveTypeName + + val types = if (TypeFlattener.omitIdField(primitiveType.getName, path.size, renameId)) + path.asScala.tail + else + (path.asScala.tail :+ primitiveType) + + val name = types.map{_.getName}.mkString(separator) + List(new PrimitiveType(Type.Repetition.OPTIONAL, typeName, primitiveType.getTypeLength, name)) + } + + def convertGroupType(path: JList[GroupType], groupType: GroupType, children: JList[List[Type]]) = { + if(TypeFlattener.isRepeated(groupType)) + Nil + else + flatten(children) + } + + def convertMessageType(messageType: MessageType, children: JList[List[Type]]) = flatten(children) + + def flatten(children: JList[List[Type]]) = children.asScala.flatten.toList +} + +object TypeFlattener { + def flatten(messageType: MessageType, + previousMessageType: Option[MessageType], + separator: String, + renameId: Boolean) = { + val flattened = messageType.convertWith(new TypeFlattener(separator, renameId)) + val fieldsToUse = previousMessageType match { + case Some(prevMessageType) => { + // if passed a previous flattened schema, preserve that field ordering, + // and append any new fields + val prevFields = prevMessageType.getFields.asScala.toList + prevFields ::: flattened.filterNot{prevFields.contains(_)} + } + case None => flattened + } + new MessageType(messageType.getName, fieldsToUse.asJava) + } + + def isRepeated(t: Type) = t.isRepetition(Type.Repetition.REPEATED) + + def omitIdField(fieldName: String, numberOfFields: Integer, renameId: Boolean) = { + renameId && Seq("id", "_id").contains(fieldName) && numberOfFields > 1 + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/load/FieldUtils.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/load/FieldUtils.scala new file mode 100644 index 0000000..7599e21 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/load/FieldUtils.scala @@ -0,0 +1,53 @@ +package com.stripe.herringbone.load + +import com.stripe.herringbone.util.ParquetUtils + +import org.apache.hadoop.fs._ + +import parquet.schema.{ PrimitiveType, Type } +import parquet.schema.PrimitiveType.PrimitiveTypeName +import parquet.schema.PrimitiveType.PrimitiveTypeName._ + +import scala.collection.JavaConversions._ + +case class FieldUtils(hadoopFs: HadoopFs, schemaTypeMapper: SchemaTypeMapper) { + def findPartitionFields(path: Path) = { + hadoopFs.findPartitions(path).map { + case (name, example) if (example.forall{_.isDigit}) => + "`%s` int".format(name) + case (name, _) => + "`%s` string".format(name) + } + } + + def findTableFields(path: Path) = { + val schema = ParquetUtils.readSchema(path, hadoopFs.fileSystem) + tableFieldsFromSchemaFields(schema.getFields) + } + + def tableFieldsFromSchemaFields(fields: Seq[Type]) = { + fields + .filter { f => f.isPrimitive } + .map { f => + "`%s` %s".format(f.getName, schemaTypeMapper.getSchemaType(f.asInstanceOf[PrimitiveType].getPrimitiveTypeName)) + }.toList + } +} + +trait SchemaTypeMapper { + def getSchemaType(pt: PrimitiveTypeName): String +} + +object ImpalaHiveSchemaTypeMapper extends SchemaTypeMapper { + def getSchemaType(pt: PrimitiveTypeName) = { + pt match { + case BINARY => "STRING" + case INT32 => "INT" + case INT64 | INT96 => "BIGINT" + case DOUBLE => "DOUBLE" + case BOOLEAN => "BOOLEAN" + case FLOAT => "FLOAT" + case FIXED_LEN_BYTE_ARRAY => "BINARY" + } + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/load/HadoopFs.scala 
b/herringbone-main/src/main/scala/com/stripe/herringbone/load/HadoopFs.scala new file mode 100644 index 0000000..abda424 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/load/HadoopFs.scala @@ -0,0 +1,39 @@ +package com.stripe.herringbone.load + +import com.stripe.herringbone.util.ParquetUtils + +import org.apache.hadoop.conf._ +import org.apache.hadoop.fs._ +import org.apache.hadoop.util._ + +class HadoopFs { + lazy val fileSystem = FileSystem.get(new Configuration) + + def findAbsolutePath(path: Path) = { + fileSystem.getFileStatus(path).getPath.toUri.getPath + } + + def findSortedLeafPaths(path: Path): List[Path] = + findLeafPaths(path).sortBy{case (path,time) => time}.map{_._1} + + def findLeafPaths(path: Path): List[(Path,Long)] = { + val parquetFileStatuses = fileSystem.listStatus(path, ParquetUtils.parquetFilter) + if (parquetFileStatuses.size > 0) + List((path, parquetFileStatuses.head.getModificationTime)) + else { + fileSystem.listStatus(path, ParquetUtils.partitionFilter) + .toList + .map{_.getPath} + .flatMap{findLeafPaths(_)} + } + } + + def findPartitions(path: Path) = { + path.toUri.getPath.split("/") + .filter{_.contains("=")} + .map{segment => + val parts = segment.split("=") + (parts(0), parts(1)) + }.toList + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveLoader.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveLoader.scala new file mode 100644 index 0000000..1557677 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveLoader.scala @@ -0,0 +1,76 @@ +package com.stripe.herringbone + +import com.stripe.herringbone.load._ + +import java.sql.ResultSet + +import org.apache.hadoop.conf._ +import org.apache.hadoop.fs._ +import org.apache.hadoop.util._ + +case class HiveLoader(conf: ParquetLoadConf, + hadoopFs: HadoopFs, + fieldUtils: FieldUtils) extends ParquetLoader { + + val connection = HiveServer2Connection(conf.connectionUrl() + ":" + conf.connectionPort()) + + def checkTableExists(table: String, database: String): Boolean = { + connection.execute("USE %s".format(database)) + var exists: Boolean = false + connection.executeQuery("SHOW TABLES") { resultSet => + val existingTable = resultSet.getString(1).trim + if (existingTable == table) + exists = true + } + exists + } + + def createTable(pathString: String, table: String, database: String = "default") { + val path = new Path(pathString) + val location = hadoopFs.findAbsolutePath(path) + val leafPaths = hadoopFs.findSortedLeafPaths(path) + + if (leafPaths.isEmpty) + error("Could not find parquet files under " + path) + + val tableFields = fieldUtils.findTableFields(leafPaths.last) + val partitionFields = fieldUtils.findPartitionFields(leafPaths.last) + val tableWhileImporting = table + "__import" + + connection.execute("CREATE DATABASE IF NOT EXISTS %s".format(database)) + connection.execute("USE %s".format(database)) + + createTableWithPartitionFields(location, tableWhileImporting, tableFields, partitionFields) + + connection.execute("DROP TABLE IF EXISTS %s".format(table)) + connection.execute("ALTER TABLE %s RENAME TO %s".format(tableWhileImporting, table)) + + if (!partitionFields.isEmpty) + updateTable(table, database) + } + + def createTableWithPartitionFields(location: String, table: String, tableFields: List[String], + partitionFields: List[String]) { + + connection.execute("DROP TABLE IF EXISTS `%s`".format (table)) + + val tableClause = "CREATE EXTERNAL TABLE IF NOT EXISTS `%s` (%s)".format( + table, 
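+      // Illustrative result of the three clauses assembled below, with hypothetical
+      // column and partition names, for a source table named "events":
+      //   CREATE EXTERNAL TABLE IF NOT EXISTS `events__import` (`user__email` STRING, `amount` BIGINT)
+      //     PARTITIONED BY (`day` int) STORED AS PARQUET LOCATION "/path/to/events"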
tableFields.mkString(", ")) + + val partitionClause = + if (partitionFields.isEmpty) + "" + else + " PARTITIONED BY (%s)".format(partitionFields.mkString(" ,")) + + val storedClause = " STORED AS PARQUET LOCATION \"%s\"".format(location) + + connection.execute(tableClause + partitionClause + storedClause) + } + + def updateTable(table: String, database: String) = { + connection.execute("MSCK REPAIR TABLE %s".format(table)) + } + + def closeConnection() = connection.close +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveServer2Connection.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveServer2Connection.scala new file mode 100644 index 0000000..cb34423 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveServer2Connection.scala @@ -0,0 +1,35 @@ +package com.stripe.herringbone.load + +import java.sql.{ Connection, DriverManager, ResultSet } + +case class HiveServer2Connection(connectionUrl: String) { + lazy val connection: Connection = { + Class.forName("org.apache.hive.jdbc.HiveDriver") + DriverManager.getConnection(connectionUrl) + } + + def execute(query: String) { + try { + println(query) + val statement = connection.createStatement + statement.execute(query) + } catch { + case e: Throwable => e.printStackTrace + } + } + + def executeQuery(query: String)(fn: ResultSet => Unit) { + try { + println(query) + val statement = connection.createStatement + val resultSet = statement.executeQuery(query) + while (resultSet.next) { + fn(resultSet) + } + } catch { + case e: Throwable => e.printStackTrace + } + } + + def close = connection.close +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/load/ImpalaLoader.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/load/ImpalaLoader.scala new file mode 100644 index 0000000..ca7d57b --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/load/ImpalaLoader.scala @@ -0,0 +1,122 @@ +package com.stripe.herringbone.load + +import com.stripe.herringbone.impala.{ImpalaClient,ImpalaValue} + +import org.apache.hadoop.conf._ +import org.apache.hadoop.util._ +import org.apache.hadoop.fs._ + +case class ImpalaLoader(conf: ParquetLoadConf, + hadoopFs: HadoopFs, + fieldUtils: FieldUtils) extends ParquetLoader { + + lazy val impalaClient = ImpalaClient(conf.connectionUrl(), + conf.connectionPort().toInt) + + def checkTableExists(table: String, database: String): Boolean = { + execute("USE %s".format(database)) + var exists: Boolean = false + query("SHOW TABLES"){row => + row.foreach { value => + if (value.raw == table) exists = true + } + } + exists + } + + def createTable(pathString: String, table: String, database: String = "default") { + val path = new Path(pathString) + val location = hadoopFs.findAbsolutePath(path) + val leafPaths = hadoopFs.findSortedLeafPaths(path) + + if(leafPaths.isEmpty) + error("Could not find parquet files under " + path) + + val tableFields = fieldUtils.findTableFields(leafPaths.last) + val partitionFields = fieldUtils.findPartitionFields(leafPaths.last) + + execute("CREATE DATABASE IF NOT EXISTS importing") + execute("USE importing") + + createTableWithPartitionFields(location, table, tableFields, partitionFields) + + if(partitionFields.size > 0) + addPartitions(table, leafPaths.map{hadoopFs.findPartitions(_)}) + + execute("CREATE DATABASE IF NOT EXISTS %s".format(database)) + execute("DROP TABLE IF EXISTS %s.%s".format(database, table)) + execute("ALTER TABLE importing.%s RENAME TO %s.%s".format(table, 
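+    // Illustrative statement sequence for createTable("/data/events", "events", "analytics"):
+    // the table is first built under the scratch "importing" database and then renamed
+    // into the target database.
+    //   CREATE DATABASE IF NOT EXISTS importing
+    //   USE importing
+    //   CREATE EXTERNAL TABLE IF NOT EXISTS `events` (...) STORED AS PARQUETFILE LOCATION "/data/events"
+    //   ALTER TABLE events ADD IF NOT EXISTS PARTITION (...)   (once per leaf path, when partitioned)
+    //   CREATE DATABASE IF NOT EXISTS analytics
+    //   DROP TABLE IF EXISTS analytics.events
+    //   ALTER TABLE importing.events RENAME TO analytics.events
+    //   COMPUTE STATS analytics.events   (only when the table has no partition columns)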
database, table)) + if (partitionFields.isEmpty) execute("COMPUTE STATS %s.%s".format(database, table)) + } + + def updateTable(table: String, database: String) { + execute("USE %s".format(database)) + + val basePath = findBasePath(table) + val tablePartitions = findTablePartitions(table) + val leafPaths = hadoopFs.findSortedLeafPaths(new Path(basePath)) + leafPaths.reverse.foreach{path => + val partitions = hadoopFs.findPartitions(path) + if(!tablePartitions.contains(partitions.map{_._2})) + addPartition(table, partitions) + } + } + + def findBasePath(table: String) = { + var location: String = null + query("DESCRIBE FORMATTED %s".format(table)){row => + if(row(0).raw.startsWith("Location:")) + location = row(1).raw + } + location + } + + def findTablePartitions(table: String) = { + var partitions: List[List[String]] = Nil + query("SHOW TABLE STATS %s".format(table)){row => + if(row.size > 4) + partitions ::= List(row(0).raw) + } + partitions + } + + def createTableWithPartitionFields(location: String, table: String, tableFields: List[String], partitionFields: List[String]) { + execute("DROP TABLE IF EXISTS `%s`".format (table)) + + val tableClause = "CREATE EXTERNAL TABLE IF NOT EXISTS `%s` (%s)".format(table, tableFields.mkString(", ")) + val partitionClause = + if(partitionFields.isEmpty) + "" + else + " PARTITIONED BY (%s)".format(partitionFields.mkString(" ,")) + val storedClause = " STORED AS PARQUETFILE LOCATION \"%s\"".format(location) + + execute(tableClause + partitionClause + storedClause) + } + + def addPartitions(table: String, partitions: List[List[(String, String)]]) { + partitions.foreach{addPartition(table, _)} + } + + def addPartition(table: String, partitions: List[(String,String)]) { + val partitionClause = + partitions.map { + case (name, value) if(value.forall{_.isDigit}) => + "`%s`=%s".format(name, value) + case (name, value) => + "`%s`='%s'".format(name, value) + }.mkString(", ") + + execute("ALTER TABLE %s ADD IF NOT EXISTS PARTITION (%s)".format(table, partitionClause)) + } + + private def execute(stmt: String) { + impalaClient.execute(stmt) + } + + private def query(stmt: String)(fn: Seq[ImpalaValue] => Unit) { + impalaClient.query(stmt){ r => fn(r) } + } + + def closeConnection() = {} +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoadConf.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoadConf.scala new file mode 100644 index 0000000..3615695 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoadConf.scala @@ -0,0 +1,18 @@ +package com.stripe.herringbone.load + +import org.rogach.scallop._ + +class ParquetLoadConf(arguments: Seq[String]) extends ScallopConf(arguments) { + val database = opt[String](default = Some("default")) + val table = opt[String](required = true) + val path = opt[String]() + val hive = opt[Boolean]("hive") + val connectionUrl = opt[String](required = true) + val connectionPort = opt[String](required = true) + + val updatePartitions = toggle(descrYes = "Create table if not present, otherwise update with new partitions", default = Some(false)) + validateOpt (path, updatePartitions) { + case (None, None) => Left("You must specify at least one of path or update-partitions") + case _ => Right(Unit) + } +} diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoader.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoader.scala new file mode 100644 index 0000000..54a5d68 --- /dev/null +++ 
b/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoader.scala @@ -0,0 +1,9 @@ +package com.stripe.herringbone.load + +trait ParquetLoader { + def checkTableExists(table: String, db: String): Boolean + def updateTable(table: String, db: String): Unit + def createTable(path: String, table: String, db: String): Unit + def closeConnection(): Unit +} + diff --git a/herringbone-main/src/main/scala/com/stripe/herringbone/util/ParquetUtils.scala b/herringbone-main/src/main/scala/com/stripe/herringbone/util/ParquetUtils.scala new file mode 100644 index 0000000..ca675d4 --- /dev/null +++ b/herringbone-main/src/main/scala/com/stripe/herringbone/util/ParquetUtils.scala @@ -0,0 +1,36 @@ +package com.stripe.herringbone.util + +import org.apache.hadoop.conf._ +import org.apache.hadoop.util._ +import org.apache.hadoop.fs._ + +import parquet.hadoop.ParquetFileReader + +object ParquetUtils { + def getParquetMetadata(path: Path, fs: FileSystem) = { + // Just use the first parquet file to figure out the impala fields + // This also dodges the problem of any non-parquet files stashed + // in the path. + val parquetFileStatuses = fs.listStatus(path, parquetFilter) + val representativeParquetPath = parquetFileStatuses.head.getPath + + val footers = ParquetFileReader.readFooters(new Configuration, representativeParquetPath) + footers.get(0).getParquetMetadata + } + + def readSchema(path: Path, fs: FileSystem) = { + getParquetMetadata(path, fs).getFileMetaData.getSchema + } + + def readKeyValueMetaData(path: Path, fs: FileSystem) = { + getParquetMetadata(path, fs).getFileMetaData.getKeyValueMetaData + } + + val parquetFilter = new PathFilter { + def accept(path: Path) = path.getName.endsWith(".parquet") + } + + val partitionFilter = new PathFilter { + def accept(path: Path) = path.getName.contains("=") + } +} diff --git a/herringbone-main/src/main/thrift/ImpalaService.thrift b/herringbone-main/src/main/thrift/ImpalaService.thrift new file mode 100644 index 0000000..1246ca4 --- /dev/null +++ b/herringbone-main/src/main/thrift/ImpalaService.thrift @@ -0,0 +1,177 @@ +// Copyright 2012 Cloudera Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace cpp impala +namespace java com.cloudera.impala.thrift +namespace rb impala.protocol + +include "Status.thrift" +include "beeswax.thrift" +include "cli_service.thrift" + +// ImpalaService accepts query execution options through beeswax.Query.configuration in +// key:value form. For example, the list of strings could be: +// "num_nodes:1", "abort_on_error:false" +// The valid keys are listed in this enum. They map to TQueryOptions. 
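The loaders earlier in this patch drive this Impala service through com.stripe.herringbone.impala.ImpalaClient rather than raw Thrift calls. A minimal sketch of that client, using only the calls already visible in ImpalaLoader above; the host and port are hypothetical placeholders (ImpalaLoader takes them from its connectionUrl and connectionPort options):

    import com.stripe.herringbone.impala.ImpalaClient

    object ImpalaSmokeTest {
      def main(args: Array[String]): Unit = {
        // Hypothetical endpoint; ImpalaLoader constructs its client the same way.
        val client = ImpalaClient("impala.example.com", 21000)
        client.execute("USE default")
        // Each result row is handed to the block as a Seq[ImpalaValue]; raw is its string form.
        client.query("SHOW TABLES") { row =>
          row.foreach { value => println(value.raw) }
        }
      }
    }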
+// Note: If you add an option or change the default, you also need to update: +// - ImpalaInternalService.thrift: TQueryOptions +// - ImpaladClientExecutor.getBeeswaxQueryConfigurations() +// - ImpalaServer::SetQueryOptions() +// - ImpalaServer::TQueryOptionsToMap() +enum TImpalaQueryOptions { + // if true, abort execution on the first error + ABORT_ON_ERROR, + + // maximum # of errors to be reported; Unspecified or 0 indicates backend default + MAX_ERRORS, + + // if true, disable llvm codegen + DISABLE_CODEGEN, + + // batch size to be used by backend; Unspecified or a size of 0 indicates backend + // default + BATCH_SIZE, + + // a per-machine approximate limit on the memory consumption of this query; + // unspecified or a limit of 0 means no limit; + // otherwise specified either as: + // a) an int (= number of bytes); + // b) a float followed by "M" (MB) or "G" (GB) + MEM_LIMIT, + + // specifies the degree of parallelism with which to execute the query; + // 1: single-node execution + // NUM_NODES_ALL: executes on all nodes that contain relevant data + // NUM_NODES_ALL_RACKS: executes on one node per rack that holds relevant data + // > 1: executes on at most that many nodes at any point in time (ie, there can be + // more nodes than numNodes with plan fragments for this query, but at most + // numNodes would be active at any point in time) + // Constants (NUM_NODES_ALL, NUM_NODES_ALL_RACKS) are defined in JavaConstants.thrift. + NUM_NODES, + + // maximum length of the scan range; only applicable to HDFS scan range; Unspecified or + // a length of 0 indicates backend default; + MAX_SCAN_RANGE_LENGTH, + + // Maximum number of io buffers (per disk) + MAX_IO_BUFFERS, + + // Number of scanner threads. + NUM_SCANNER_THREADS, + + // If true, Impala will try to execute on file formats that are not fully supported yet + ALLOW_UNSUPPORTED_FORMATS, + + // if set and > -1, specifies the default limit applied to a top-level SELECT statement + // with an ORDER BY but without a LIMIT clause (ie, if the SELECT statement also has + // a LIMIT clause, this default is ignored) + DEFAULT_ORDER_BY_LIMIT, + + // DEBUG ONLY: + // If set to + // "[:]::", + // the exec node with the given id will perform the specified action in the given + // phase. If the optional backend number (starting from 0) is specified, only that + // backend instance will perform the debug action, otherwise all backends will behave + // in that way. + // If the string doesn't have the required format or if any of its components is + // invalid, the option is ignored. + DEBUG_ACTION, + + // If true, raise an error when the DEFAULT_ORDER_BY_LIMIT has been reached. + ABORT_ON_DEFAULT_LIMIT_EXCEEDED, + + // Compression codec for parquet when inserting into parquet tables. + // Valid values are "snappy", "gzip" and "none" + // Leave blank to use default. + PARQUET_COMPRESSION_CODEC, + + // HBase scan query option. If set and > 0, HBASE_CACHING is the value for + // "hbase.client.Scan.setCaching()" when querying HBase table. Otherwise, use backend + // default. + // If the value is too high, then the hbase region server will have a hard time (GC + // pressure and long response times). If the value is too small, then there will be + // extra trips to the hbase region server. + HBASE_CACHING, + + // HBase scan query option. If set, HBase scan will always set + // "hbase.client.setCacheBlocks" to CACHE_BLOCKS. Default is false. 
+ // If the table is large and the query is doing big scan, set it to false to + // avoid polluting the cache in the hbase region server. + // If the table is small and the table is used several time, set it to true to improve + // performance. + HBASE_CACHE_BLOCKS, +} + +// The summary of an insert. +struct TInsertResult { + // Number of appended rows per modified partition. Only applies to HDFS tables. + // The keys represent partitions to create, coded as k1=v1/k2=v2/k3=v3..., with the + // root in an unpartitioned table being the empty string. + 1: required map rows_appended +} + +// Response from a call to PingImpalaService +struct TPingImpalaServiceResp { + // The Impala service's version string. + 1: string version +} + +// Parameters for a ResetTable request which will invalidate a table's metadata. +// DEPRECATED. +struct TResetTableReq { + // Name of the table's parent database. + 1: required string db_name + + // Name of the table. + 2: required string table_name +} + +// For all rpc that return a TStatus as part of their result type, +// if the status_code field is set to anything other than OK, the contents +// of the remainder of the result type is undefined (typically not set) +service ImpalaService extends beeswax.BeeswaxService { + // Cancel execution of query. Returns RUNTIME_ERROR if query_id + // unknown. + // This terminates all threads running on behalf of this query at + // all nodes that were involved in the execution. + // Throws BeeswaxException if the query handle is invalid (this doesn't + // necessarily indicate an error: the query might have finished). + Status.TStatus Cancel(1:beeswax.QueryHandle query_id) + throws(1:beeswax.BeeswaxException error); + + // Invalidates all catalog metadata, forcing a reload + // DEPRECATED; execute query "invalidate metadata" to refresh metadata + Status.TStatus ResetCatalog(); + + // Invalidates a specific table's catalog metadata, forcing a reload on the next access + // DEPRECATED; execute query "refresh
" to refresh metadata + Status.TStatus ResetTable(1:TResetTableReq request) + + // Returns the runtime profile string for the given query handle. + string GetRuntimeProfile(1:beeswax.QueryHandle query_id) + throws(1:beeswax.BeeswaxException error); + + // Closes the query handle and return the result summary of the insert. + TInsertResult CloseInsert(1:beeswax.QueryHandle handle) + throws(1:beeswax.QueryNotFoundException error, 2:beeswax.BeeswaxException error2); + + // Client calls this RPC to verify that the server is an ImpalaService. Returns the + // server version. + TPingImpalaServiceResp PingImpalaService(); +} + +// Impala HiveServer2 service +service ImpalaHiveServer2Service extends cli_service.TCLIService { +} diff --git a/herringbone-main/src/main/thrift/Status.thrift b/herringbone-main/src/main/thrift/Status.thrift new file mode 100644 index 0000000..8906d1e --- /dev/null +++ b/herringbone-main/src/main/thrift/Status.thrift @@ -0,0 +1,32 @@ +// Copyright 2012 Cloudera Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +namespace cpp impala +namespace java com.cloudera.impala.thrift +namespace rb impala.protocol + +enum TStatusCode { + OK, + CANCELLED, + ANALYSIS_ERROR, + NOT_IMPLEMENTED_ERROR, + RUNTIME_ERROR, + MEM_LIMIT_EXCEEDED, + INTERNAL_ERROR +} + +struct TStatus { + 1: required TStatusCode status_code + 2: list error_msgs +} diff --git a/herringbone-main/src/main/thrift/beeswax.thrift b/herringbone-main/src/main/thrift/beeswax.thrift new file mode 100644 index 0000000..2707457 --- /dev/null +++ b/herringbone-main/src/main/thrift/beeswax.thrift @@ -0,0 +1,175 @@ +/* + * Licensed to Cloudera, Inc. under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Cloudera, Inc. licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Interface for interacting with Beeswax Server + */ + +namespace java com.cloudera.beeswax.api +namespace py beeswaxd +namespace cpp beeswax +namespace rb impala.protocol.beeswax + +include "hive_metastore.thrift" + +// A Query +struct Query { + 1: string query; + // A list of HQL commands to execute before the query. + // This is typically defining UDFs, setting settings, and loading resources. + 3: list configuration; + + // User and groups to "act as" for purposes of Hadoop. 
+ 4: string hadoop_user; +} + +typedef string LogContextId + +enum QueryState { + CREATED, + INITIALIZED, + COMPILED, + RUNNING, + FINISHED, + EXCEPTION +} + +struct QueryHandle { + 1: string id; + 2: LogContextId log_context; +} + +struct QueryExplanation { + 1: string textual +} + +struct Results { + // If set, data is valid. Otherwise, results aren't ready yet. + 1: bool ready, + // Columns for the results + 2: list columns, + // A set of results + 3: list data, + // The starting row of the results + 4: i64 start_row, + // Whether there are more results to fetch + 5: bool has_more +} + +/** + * Metadata information about the results. + * Applicable only for SELECT. + */ +struct ResultsMetadata { + /** The schema of the results */ + 1: hive_metastore.Schema schema, + /** The directory containing the results. Not applicable for partition table. */ + 2: string table_dir, + /** If the results are straight from an existing table, the table name. */ + 3: string in_tablename, + /** Field delimiter */ + 4: string delim, +} + +exception BeeswaxException { + 1: string message, + // Use get_log(log_context) to retrieve any log related to this exception + 2: LogContextId log_context, + // (Optional) The QueryHandle that caused this exception + 3: QueryHandle handle, + 4: optional i32 errorCode = 0, + 5: optional string SQLState = " " +} + +exception QueryNotFoundException { +} + +/** Represents a Hadoop-style configuration variable. */ +struct ConfigVariable { + 1: string key, + 2: string value, + 3: string description +} + +service BeeswaxService { + /** + * Submit a query and return a handle (QueryHandle). The query runs asynchronously. + */ + QueryHandle query(1:Query query) throws(1:BeeswaxException error), + + /** + * run a query synchronously and return a handle (QueryHandle). + */ + QueryHandle executeAndWait(1:Query query, 2:LogContextId clientCtx) + throws(1:BeeswaxException error), + + /** + * Get the query plan for a query. + */ + QueryExplanation explain(1:Query query) + throws(1:BeeswaxException error), + + /** + * Get the results of a query. This is non-blocking. Caller should check + * Results.ready to determine if the results are in yet. The call requests + * the batch size of fetch. + */ + Results fetch(1:QueryHandle query_id, 2:bool start_over, 3:i32 fetch_size=-1) + throws(1:QueryNotFoundException error, 2:BeeswaxException error2), + + /** + * Get the state of the query + */ + QueryState get_state(1:QueryHandle handle) throws(1:QueryNotFoundException error), + + /** + * Get the result metadata + */ + ResultsMetadata get_results_metadata(1:QueryHandle handle) + throws(1:QueryNotFoundException error), + + /** + * Used to test connection to server. A "noop" command. + */ + string echo(1:string s) + + /** + * Returns a string representation of the configuration object being used. + * Handy for debugging. + */ + string dump_config() + + /** + * Get the log messages related to the given context. + */ + string get_log(1:LogContextId context) throws(1:QueryNotFoundException error) + + /* + * Returns "default" configuration. 
+ */ + list get_default_configuration(1:bool include_hadoop) + + /* + * closes the query with given handle + */ + void close(1:QueryHandle handle) throws(1:QueryNotFoundException error, + 2:BeeswaxException error2) + + /* + * clean the log context for given id + */ + void clean(1:LogContextId log_context) +} diff --git a/herringbone-main/src/main/thrift/cli_service.thrift b/herringbone-main/src/main/thrift/cli_service.thrift new file mode 100644 index 0000000..24a3558 --- /dev/null +++ b/herringbone-main/src/main/thrift/cli_service.thrift @@ -0,0 +1,1015 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Coding Conventions for this file: +// +// Structs/Enums/Unions +// * Struct, Enum, and Union names begin with a "T", +// and use a capital letter for each new word, with no underscores. +// * All fields should be declared as either optional or required. +// +// Functions +// * Function names start with a capital letter and have a capital letter for +// each new word, with no underscores. +// * Each function should take exactly one parameter, named TFunctionNameReq, +// and should return either void or TFunctionNameResp. This convention allows +// incremental updates. +// +// Services +// * Service names begin with the letter "T", use a capital letter for each +// new word (with no underscores), and end with the word "Service". + +namespace java org.apache.hive.service.cli.thrift +namespace cpp apache.hive.service.cli.thrift +namespace rb impala.protocol.hive + +// List of protocol versions. A new token should be +// added to the end of this list every time a change is made. 
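The service defined in the rest of this file is the HiveServer2 interface that ultimately sits behind the JDBC-based HiveServer2Connection wrapper added earlier in this patch. A minimal usage sketch of that wrapper, with a hypothetical jdbc:hive2 URL (HiveLoader assembles its URL from the connectionUrl and connectionPort options, and the Hive JDBC driver must be on the classpath):

    import com.stripe.herringbone.load.HiveServer2Connection

    object HiveSmokeTest {
      def main(args: Array[String]): Unit = {
        // Hypothetical HiveServer2 endpoint.
        val hive = HiveServer2Connection("jdbc:hive2://hive.example.com:10000/default")
        hive.execute("USE default")
        // executeQuery invokes the block once per result row.
        hive.executeQuery("SHOW TABLES") { resultSet =>
          println(resultSet.getString(1).trim)
        }
        hive.close
      }
    }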
+enum TProtocolVersion { + HIVE_CLI_SERVICE_PROTOCOL_V1 +} + +enum TTypeId { + BOOLEAN_TYPE, + TINYINT_TYPE, + SMALLINT_TYPE, + INT_TYPE, + BIGINT_TYPE, + FLOAT_TYPE, + DOUBLE_TYPE, + STRING_TYPE, + TIMESTAMP_TYPE, + BINARY_TYPE, + ARRAY_TYPE, + MAP_TYPE, + STRUCT_TYPE, + UNION_TYPE, + USER_DEFINED_TYPE, + DECIMAL_TYPE +} + +const set PRIMITIVE_TYPES = [ + TTypeId.BOOLEAN_TYPE + TTypeId.TINYINT_TYPE + TTypeId.SMALLINT_TYPE + TTypeId.INT_TYPE + TTypeId.BIGINT_TYPE + TTypeId.FLOAT_TYPE + TTypeId.DOUBLE_TYPE + TTypeId.STRING_TYPE + TTypeId.TIMESTAMP_TYPE + TTypeId.BINARY_TYPE, + TTypeId.DECIMAL_TYPE +] + +const set COMPLEX_TYPES = [ + TTypeId.ARRAY_TYPE + TTypeId.MAP_TYPE + TTypeId.STRUCT_TYPE + TTypeId.UNION_TYPE + TTypeId.USER_DEFINED_TYPE +] + +const set COLLECTION_TYPES = [ + TTypeId.ARRAY_TYPE + TTypeId.MAP_TYPE +] + +const map TYPE_NAMES = { + TTypeId.BOOLEAN_TYPE: "BOOLEAN", + TTypeId.TINYINT_TYPE: "TINYINT", + TTypeId.SMALLINT_TYPE: "SMALLINT", + TTypeId.INT_TYPE: "INT", + TTypeId.BIGINT_TYPE: "BIGINT", + TTypeId.FLOAT_TYPE: "FLOAT", + TTypeId.DOUBLE_TYPE: "DOUBLE", + TTypeId.STRING_TYPE: "STRING", + TTypeId.TIMESTAMP_TYPE: "TIMESTAMP", + TTypeId.BINARY_TYPE: "BINARY", + TTypeId.ARRAY_TYPE: "ARRAY", + TTypeId.MAP_TYPE: "MAP", + TTypeId.STRUCT_TYPE: "STRUCT", + TTypeId.UNION_TYPE: "UNIONTYPE" + TTypeId.DECIMAL_TYPE: "DECIMAL" +} + +// Thrift does not support recursively defined types or forward declarations, +// which makes it difficult to represent Hive's nested types. +// To get around these limitations TTypeDesc employs a type list that maps +// integer "pointers" to TTypeEntry objects. The following examples show +// how different types are represented using this scheme: +// +// "INT": +// TTypeDesc { +// types = [ +// TTypeEntry.primitive_entry { +// type = INT_TYPE +// } +// ] +// } +// +// "ARRAY": +// TTypeDesc { +// types = [ +// TTypeEntry.array_entry { +// object_type_ptr = 1 +// }, +// TTypeEntry.primitive_entry { +// type = INT_TYPE +// } +// ] +// } +// +// "MAP": +// TTypeDesc { +// types = [ +// TTypeEntry.map_entry { +// key_type_ptr = 1 +// value_type_ptr = 2 +// }, +// TTypeEntry.primitive_entry { +// type = INT_TYPE +// }, +// TTypeEntry.primitive_entry { +// type = STRING_TYPE +// } +// ] +// } + +typedef i32 TTypeEntryPtr + +// Type entry for a primitive type. +struct TPrimitiveTypeEntry { + // The primitive type token. This must satisfy the condition + // that type is in the PRIMITIVE_TYPES set. + 1: required TTypeId type +} + +// Type entry for an ARRAY type. +struct TArrayTypeEntry { + 1: required TTypeEntryPtr objectTypePtr +} + +// Type entry for a MAP type. +struct TMapTypeEntry { + 1: required TTypeEntryPtr keyTypePtr + 2: required TTypeEntryPtr valueTypePtr +} + +// Type entry for a STRUCT type. +struct TStructTypeEntry { + 1: required map nameToTypePtr +} + +// Type entry for a UNIONTYPE type. +struct TUnionTypeEntry { + 1: required map nameToTypePtr +} + +struct TUserDefinedTypeEntry { + // The fully qualified name of the class implementing this type. + 1: required string typeClassName +} + +// We use a union here since Thrift does not support inheritance. +union TTypeEntry { + 1: TPrimitiveTypeEntry primitiveEntry + 2: TArrayTypeEntry arrayEntry + 3: TMapTypeEntry mapEntry + 4: TStructTypeEntry structEntry + 5: TUnionTypeEntry unionEntry + 6: TUserDefinedTypeEntry userDefinedTypeEntry +} + +// Type descriptor for columns. +struct TTypeDesc { + // The "top" type is always the first element of the list. 
+ // If the top type is an ARRAY, MAP, STRUCT, or UNIONTYPE + // type, then subsequent elements represent nested types. + 1: required list types +} + +// A result set column descriptor. +struct TColumnDesc { + // The name of the column + 1: required string columnName + + // The type descriptor for this column + 2: required TTypeDesc typeDesc + + // The ordinal position of this column in the schema + 3: required i32 position + + 4: optional string comment +} + +// Metadata used to describe the schema (column names, types, comments) +// of result sets. +struct TTableSchema { + 1: required list columns +} + +// A Boolean column value. +struct TBoolValue { + // NULL if value is unset. + 1: optional bool value +} + +// A Byte column value. +struct TByteValue { + // NULL if value is unset. + 1: optional byte value +} + +// A signed, 16 bit column value. +struct TI16Value { + // NULL if value is unset + 1: optional i16 value +} + +// A signed, 32 bit column value +struct TI32Value { + // NULL if value is unset + 1: optional i32 value +} + +// A signed 64 bit column value +struct TI64Value { + // NULL if value is unset + 1: optional i64 value +} + +// A floating point 64 bit column value +struct TDoubleValue { + // NULL if value is unset + 1: optional double value +} + +struct TStringValue { + // NULL if value is unset + 1: optional string value +} + +union TColumn { + 1: list boolColumn + 2: list byteColumn + 3: list i16Column + 4: list i32Column + 5: list i64Column + 6: list doubleColumn + 7: list stringColumn +} + +// A single column value in a result set. +// Note that Hive's type system is richer than Thrift's, +// so in some cases we have to map multiple Hive types +// to the same Thrift type. On the client-side this is +// disambiguated by looking at the Schema of the +// result set. +union TColumnValue { + 1: TBoolValue boolVal // BOOLEAN + 2: TByteValue byteVal // TINYINT + 3: TI16Value i16Val // SMALLINT + 4: TI32Value i32Val // INT + 5: TI64Value i64Val // BIGINT, TIMESTAMP + 6: TDoubleValue doubleVal // FLOAT, DOUBLE + 7: TStringValue stringVal // STRING, LIST, MAP, STRUCT, UNIONTYPE, BINARY, DECIMAL +} + +// Represents a row in a rowset. +struct TRow { + 1: required list colVals +} + +// Represents a rowset +struct TRowSet { + // The starting row offset of this rowset. + 1: required i64 startRowOffset + 2: required list rows + 3: optional list columns +} + +// The return status code contained in each response. +enum TStatusCode { + SUCCESS_STATUS, + SUCCESS_WITH_INFO_STATUS, + STILL_EXECUTING_STATUS, + ERROR_STATUS, + INVALID_HANDLE_STATUS +} + +// The return status of a remote request +struct TStatus { + 1: required TStatusCode statusCode + + // If status is SUCCESS_WITH_INFO, info_msgs may be populated with + // additional diagnostic information. + 2: optional list infoMessages + + // If status is ERROR, then the following fields may be set + 3: optional string sqlState // as defined in the ISO/IEF CLI specification + 4: optional i32 errorCode // internal error code + 5: optional string errorMessage +} + +// The state of an operation (i.e. a query or other +// asynchronous operation that generates a result set) +// on the server. +enum TOperationState { + // The operation has been initialized + INITIALIZED_STATE, + + // The operation is running. In this state the result + // set is not available. + RUNNING_STATE, + + // The operation has completed. When an operation is in + // this state its result set may be fetched. 
+ FINISHED_STATE, + + // The operation was canceled by a client + CANCELED_STATE, + + // The operation was closed by a client + CLOSED_STATE, + + // The operation failed due to an error + ERROR_STATE, + + // The operation is in an unrecognized state + UKNOWN_STATE, +} + + +// A string identifier. This is interpreted literally. +typedef string TIdentifier + +// A search pattern. +// +// Valid search pattern characters: +// '_': Any single character. +// '%': Any sequence of zero or more characters. +// '\': Escape character used to include special characters, +// e.g. '_', '%', '\'. If a '\' precedes a non-special +// character it has no special meaning and is interpreted +// literally. +typedef string TPattern + + +// A search pattern or identifier. Used as input +// parameter for many of the catalog functions. +typedef string TPatternOrIdentifier + +struct THandleIdentifier { + // 16 byte globally unique identifier + // This is the public ID of the handle and + // can be used for reporting. + 1: required binary guid, + + // 16 byte secret generated by the server + // and used to verify that the handle is not + // being hijacked by another user. + 2: required binary secret, +} + +// Client-side handle to persistent +// session information on the server-side. +struct TSessionHandle { + 1: required THandleIdentifier sessionId +} + +// The subtype of an OperationHandle. +enum TOperationType { + EXECUTE_STATEMENT, + GET_TYPE_INFO, + GET_CATALOGS, + GET_SCHEMAS, + GET_TABLES, + GET_TABLE_TYPES, + GET_COLUMNS, + GET_FUNCTIONS, + UNKNOWN, +} + +// Client-side reference to a task running +// asynchronously on the server. +struct TOperationHandle { + 1: required THandleIdentifier operationId + 2: required TOperationType operationType + + // If hasResultSet = TRUE, then this operation + // generates a result set that can be fetched. + // Note that the result set may be empty. + // + // If hasResultSet = FALSE, then this operation + // does not generate a result set, and calling + // GetResultSetMetadata or FetchResults against + // this OperationHandle will generate an error. + 3: required bool hasResultSet + + // For operations that don't generate result sets, + // modifiedRowCount is either: + // + // 1) The number of rows that were modified by + // the DML operation (e.g. number of rows inserted, + // number of rows deleted, etc). + // + // 2) 0 for operations that don't modify or add rows. + // + // 3) < 0 if the operation is capable of modifiying rows, + // but Hive is unable to determine how many rows were + // modified. For example, Hive's LOAD DATA command + // doesn't generate row count information because + // Hive doesn't inspect the data as it is loaded. + // + // modifiedRowCount is unset if the operation generates + // a result set. + 4: optional double modifiedRowCount +} + + +// OpenSession() +// +// Open a session (connection) on the server against +// which operations may be executed. +struct TOpenSessionReq { + // The version of the HiveServer2 protocol that the client is using. + 1: required TProtocolVersion client_protocol = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1 + + // Username and password for authentication. + // Depending on the authentication scheme being used, + // this information may instead be provided by a lower + // protocol layer, in which case these fields may be + // left unset. + 2: optional string username + 3: optional string password + + // Configuration overlay which is applied when the session is + // first created. 
+ 4: optional map configuration +} + +struct TOpenSessionResp { + 1: required TStatus status + + // The protocol version that the server is using. + 2: required TProtocolVersion serverProtocolVersion = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1 + + // Session Handle + 3: optional TSessionHandle sessionHandle + + // The configuration settings for this session. + 4: optional map configuration +} + + +// CloseSession() +// +// Closes the specified session and frees any resources +// currently allocated to that session. Any open +// operations in that session will be canceled. +struct TCloseSessionReq { + 1: required TSessionHandle sessionHandle +} + +struct TCloseSessionResp { + 1: required TStatus status +} + + + +enum TGetInfoType { + CLI_MAX_DRIVER_CONNECTIONS = 0, + CLI_MAX_CONCURRENT_ACTIVITIES = 1, + CLI_DATA_SOURCE_NAME = 2, + CLI_FETCH_DIRECTION = 8, + CLI_SERVER_NAME = 13, + CLI_SEARCH_PATTERN_ESCAPE = 14, + CLI_DBMS_NAME = 17, + CLI_DBMS_VER = 18, + CLI_ACCESSIBLE_TABLES = 19, + CLI_ACCESSIBLE_PROCEDURES = 20, + CLI_CURSOR_COMMIT_BEHAVIOR = 23, + CLI_DATA_SOURCE_READ_ONLY = 25, + CLI_DEFAULT_TXN_ISOLATION = 26, + CLI_IDENTIFIER_CASE = 28, + CLI_IDENTIFIER_QUOTE_CHAR = 29, + CLI_MAX_COLUMN_NAME_LEN = 30, + CLI_MAX_CURSOR_NAME_LEN = 31, + CLI_MAX_SCHEMA_NAME_LEN = 32, + CLI_MAX_CATALOG_NAME_LEN = 34, + CLI_MAX_TABLE_NAME_LEN = 35, + CLI_SCROLL_CONCURRENCY = 43, + CLI_TXN_CAPABLE = 46, + CLI_USER_NAME = 47, + CLI_TXN_ISOLATION_OPTION = 72, + CLI_INTEGRITY = 73, + CLI_GETDATA_EXTENSIONS = 81, + CLI_NULL_COLLATION = 85, + CLI_ALTER_TABLE = 86, + CLI_ORDER_BY_COLUMNS_IN_SELECT = 90, + CLI_SPECIAL_CHARACTERS = 94, + CLI_MAX_COLUMNS_IN_GROUP_BY = 97, + CLI_MAX_COLUMNS_IN_INDEX = 98, + CLI_MAX_COLUMNS_IN_ORDER_BY = 99, + CLI_MAX_COLUMNS_IN_SELECT = 100, + CLI_MAX_COLUMNS_IN_TABLE = 101, + CLI_MAX_INDEX_SIZE = 102, + CLI_MAX_ROW_SIZE = 104, + CLI_MAX_STATEMENT_LEN = 105, + CLI_MAX_TABLES_IN_SELECT = 106, + CLI_MAX_USER_NAME_LEN = 107, + CLI_OJ_CAPABILITIES = 115, + + CLI_XOPEN_CLI_YEAR = 10000, + CLI_CURSOR_SENSITIVITY = 10001, + CLI_DESCRIBE_PARAMETER = 10002, + CLI_CATALOG_NAME = 10003, + CLI_COLLATION_SEQ = 10004, + CLI_MAX_IDENTIFIER_LEN = 10005, +} + +union TGetInfoValue { + 1: string stringValue + 2: i16 smallIntValue + 3: i32 integerBitmask + 4: i32 integerFlag + 5: i32 binaryValue + 6: i64 lenValue +} + +// GetInfo() +// +// This function is based on ODBC's CLIGetInfo() function. +// The function returns general information about the data source +// using the same keys as ODBC. +struct TGetInfoReq { + // The sesssion to run this request against + 1: required TSessionHandle sessionHandle + + 2: required TGetInfoType infoType +} + +struct TGetInfoResp { + 1: required TStatus status + + 2: required TGetInfoValue infoValue +} + + +// ExecuteStatement() +// +// Execute a statement. +// The returned OperationHandle can be used to check on the +// status of the statement, and to fetch results once the +// statement has finished executing. +struct TExecuteStatementReq { + // The session to exexcute the statement against + 1: required TSessionHandle sessionHandle + + // The statement to be executed (DML, DDL, SET, etc) + 2: required string statement + + // Configuration properties that are overlayed on top of the + // the existing session configuration before this statement + // is executed. These properties apply to this statement + // only and will not affect the subsequent state of the Session. 
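+  // For example (illustrative, with an assumed property name):
+  //   confOverlay = {"hive.exec.compress.output": "true"}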
+ 3: optional map confOverlay +} + +struct TExecuteStatementResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetTypeInfo() +// +// Get information about types supported by the HiveServer instance. +// The information is returned as a result set which can be fetched +// using the OperationHandle provided in the response. +// +// Refer to the documentation for ODBC's CLIGetTypeInfo function for +// the format of the result set. +struct TGetTypeInfoReq { + // The session to run this request against. + 1: required TSessionHandle sessionHandle +} + +struct TGetTypeInfoResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetCatalogs() +// +// Returns the list of catalogs (databases) +// Results are ordered by TABLE_CATALOG +// +// Resultset columns : +// col1 +// name: TABLE_CAT +// type: STRING +// desc: Catalog name. NULL if not applicable. +// +struct TGetCatalogsReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle +} + +struct TGetCatalogsResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetSchemas() +// +// Retrieves the schema names available in this database. +// The results are ordered by TABLE_CATALOG and TABLE_SCHEM. +// col1 +// name: TABLE_SCHEM +// type: STRING +// desc: schema name +// col2 +// name: TABLE_CATALOG +// type: STRING +// desc: catalog name +struct TGetSchemasReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle + + // Name of the catalog. Must not contain a search pattern. + 2: optional TIdentifier catalogName + + // schema name or pattern + 3: optional TPatternOrIdentifier schemaName +} + +struct TGetSchemasResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetTables() +// +// Returns a list of tables with catalog, schema, and table +// type information. The information is returned as a result +// set which can be fetched using the OperationHandle +// provided in the response. +// Results are ordered by TABLE_TYPE, TABLE_CAT, TABLE_SCHEM, and TABLE_NAME +// +// Result Set Columns: +// +// col1 +// name: TABLE_CAT +// type: STRING +// desc: Catalog name. NULL if not applicable. +// +// col2 +// name: TABLE_SCHEM +// type: STRING +// desc: Schema name. +// +// col3 +// name: TABLE_NAME +// type: STRING +// desc: Table name. +// +// col4 +// name: TABLE_TYPE +// type: STRING +// desc: The table type, e.g. "TABLE", "VIEW", etc. +// +// col5 +// name: REMARKS +// type: STRING +// desc: Comments about the table +// +struct TGetTablesReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle + + // Name of the catalog or a search pattern. + 2: optional TPatternOrIdentifier catalogName + + // Name of the schema or a search pattern. + 3: optional TPatternOrIdentifier schemaName + + // Name of the table or a search pattern. + 4: optional TPatternOrIdentifier tableName + + // List of table types to match + // e.g. "TABLE", "VIEW", "SYSTEM TABLE", "GLOBAL TEMPORARY", + // "LOCAL TEMPORARY", "ALIAS", "SYNONYM", etc. + 5: optional list tableTypes +} + +struct TGetTablesResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetTableTypes() +// +// Returns the table types available in this database. +// The results are ordered by table type. +// +// col1 +// name: TABLE_TYPE +// type: STRING +// desc: Table type name. 
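+// As with the other catalog calls in this service, the rows themselves are not in the
+// response: the response carries an OperationHandle, and the client retrieves the rows
+// with FetchResults (and the schema with GetResultSetMetadata) using that handle.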
+struct TGetTableTypesReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle +} + +struct TGetTableTypesResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetColumns() +// +// Returns a list of columns in the specified tables. +// The information is returned as a result set which can be fetched +// using the OperationHandle provided in the response. +// Results are ordered by TABLE_CAT, TABLE_SCHEM, TABLE_NAME, +// and ORDINAL_POSITION. +// +// Result Set Columns are the same as those for the ODBC CLIColumns +// function. +// +struct TGetColumnsReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle + + // Name of the catalog. Must not contain a search pattern. + 2: optional TIdentifier catalogName + + // Schema name or search pattern + 3: optional TPatternOrIdentifier schemaName + + // Table name or search pattern + 4: optional TPatternOrIdentifier tableName + + // Column name or search pattern + 5: optional TPatternOrIdentifier columnName +} + +struct TGetColumnsResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetFunctions() +// +// Returns a list of functions supported by the data source. The +// behavior of this function matches +// java.sql.DatabaseMetaData.getFunctions() both in terms of +// inputs and outputs. +// +// Result Set Columns: +// +// col1 +// name: FUNCTION_CAT +// type: STRING +// desc: Function catalog (may be null) +// +// col2 +// name: FUNCTION_SCHEM +// type: STRING +// desc: Function schema (may be null) +// +// col3 +// name: FUNCTION_NAME +// type: STRING +// desc: Function name. This is the name used to invoke the function. +// +// col4 +// name: REMARKS +// type: STRING +// desc: Explanatory comment on the function. +// +// col5 +// name: FUNCTION_TYPE +// type: SMALLINT +// desc: Kind of function. One of: +// * functionResultUnknown - Cannot determine if a return value or a table +// will be returned. +// * functionNoTable - Does not a return a table. +// * functionReturnsTable - Returns a table. +// +// col6 +// name: SPECIFIC_NAME +// type: STRING +// desc: The name which uniquely identifies this function within its schema. +// In this case this is the fully qualified class name of the class +// that implements this function. +// +struct TGetFunctionsReq { + // Session to run this request against + 1: required TSessionHandle sessionHandle + + // A catalog name; must match the catalog name as it is stored in the + // database; "" retrieves those without a catalog; null means + // that the catalog name should not be used to narrow the search. + 2: optional TIdentifier catalogName + + // A schema name pattern; must match the schema name as it is stored + // in the database; "" retrieves those without a schema; null means + // that the schema name should not be used to narrow the search. + 3: optional TPatternOrIdentifier schemaName + + // A function name pattern; must match the function name as it is stored + // in the database. + 4: required TPatternOrIdentifier functionName +} + +struct TGetFunctionsResp { + 1: required TStatus status + 2: optional TOperationHandle operationHandle +} + + +// GetOperationStatus() +// +// Get the status of an operation running on the server. 
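+// A typical client sequence (illustrative): poll GetOperationStatus until the returned
+// TOperationState is FINISHED_STATE (or CANCELED_STATE / ERROR_STATE), then retrieve
+// rows with FetchResults and release the handle with CloseOperation.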
+struct TGetOperationStatusReq { + // Session to run this request against + 1: required TOperationHandle operationHandle +} + +struct TGetOperationStatusResp { + 1: required TStatus status + 2: optional TOperationState operationState +} + + +// CancelOperation() +// +// Cancels processing on the specified operation handle and +// frees any resources which were allocated. +struct TCancelOperationReq { + // Operation to cancel + 1: required TOperationHandle operationHandle +} + +struct TCancelOperationResp { + 1: required TStatus status +} + + +// CloseOperation() +// +// Given an operation in the FINISHED, CANCELED, +// or ERROR states, CloseOperation() will free +// all of the resources which were allocated on +// the server to service the operation. +struct TCloseOperationReq { + 1: required TOperationHandle operationHandle +} + +struct TCloseOperationResp { + 1: required TStatus status +} + + +// GetResultSetMetadata() +// +// Retrieves schema information for the specified operation +struct TGetResultSetMetadataReq { + // Operation for which to fetch result set schema information + 1: required TOperationHandle operationHandle +} + +struct TGetResultSetMetadataResp { + 1: required TStatus status + 2: optional TTableSchema schema +} + + +enum TFetchOrientation { + // Get the next rowset. The fetch offset is ignored. + FETCH_NEXT, + + // Get the previous rowset. The fetch offset is ignored. + // NOT SUPPORTED + FETCH_PRIOR, + + // Return the rowset at the given fetch offset relative + // to the curren rowset. + // NOT SUPPORTED + FETCH_RELATIVE, + + // Return the rowset at the specified fetch offset. + // NOT SUPPORTED + FETCH_ABSOLUTE, + + // Get the first rowset in the result set. + FETCH_FIRST, + + // Get the last rowset in the result set. + // NOT SUPPORTED + FETCH_LAST +} + +// FetchResults() +// +// Fetch rows from the server corresponding to +// a particular OperationHandle. +struct TFetchResultsReq { + // Operation from which to fetch results. + 1: required TOperationHandle operationHandle + + // The fetch orientation. For V1 this must be either + // FETCH_NEXT or FETCH_FIRST. Defaults to FETCH_NEXT. + 2: required TFetchOrientation orientation = TFetchOrientation.FETCH_NEXT + + // Max number of rows that should be returned in + // the rowset. + 3: required i64 maxRows +} + +struct TFetchResultsResp { + 1: required TStatus status + + // TRUE if there are more rows left to fetch from the server. + 2: optional bool hasMoreRows + + // The rowset. This is optional so that we have the + // option in the future of adding alternate formats for + // representing result set data, e.g. delimited strings, + // binary encoded, etc. + 3: optional TRowSet results +} + +// GetLog() +// +// Fetch operation log from the server corresponding to +// a particular OperationHandle. 
+struct TGetLogReq { + // Operation whose log is requested + 1: required TOperationHandle operationHandle +} + +struct TGetLogResp { + 1: required TStatus status + + 2: required string log +} + +service TCLIService { + + TOpenSessionResp OpenSession(1:TOpenSessionReq req); + + TCloseSessionResp CloseSession(1:TCloseSessionReq req); + + TGetInfoResp GetInfo(1:TGetInfoReq req); + + TExecuteStatementResp ExecuteStatement(1:TExecuteStatementReq req); + + TGetTypeInfoResp GetTypeInfo(1:TGetTypeInfoReq req); + + TGetCatalogsResp GetCatalogs(1:TGetCatalogsReq req); + + TGetSchemasResp GetSchemas(1:TGetSchemasReq req); + + TGetTablesResp GetTables(1:TGetTablesReq req); + + TGetTableTypesResp GetTableTypes(1:TGetTableTypesReq req); + + TGetColumnsResp GetColumns(1:TGetColumnsReq req); + + TGetFunctionsResp GetFunctions(1:TGetFunctionsReq req); + + TGetOperationStatusResp GetOperationStatus(1:TGetOperationStatusReq req); + + TCancelOperationResp CancelOperation(1:TCancelOperationReq req); + + TCloseOperationResp CloseOperation(1:TCloseOperationReq req); + + TGetResultSetMetadataResp GetResultSetMetadata(1:TGetResultSetMetadataReq req); + + TFetchResultsResp FetchResults(1:TFetchResultsReq req); + + TGetLogResp GetLog(1:TGetLogReq req); +} diff --git a/herringbone-main/src/main/thrift/fb303.thrift b/herringbone-main/src/main/thrift/fb303.thrift new file mode 100644 index 0000000..6438092 --- /dev/null +++ b/herringbone-main/src/main/thrift/fb303.thrift @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * fb303.thrift + */ + +namespace java com.facebook.fb303 +namespace cpp facebook.fb303 +namespace rb Impala.Protocol.fb303 + +/** + * Common status reporting mechanism across all services + */ +enum fb_status { + DEAD = 0, + STARTING = 1, + ALIVE = 2, + STOPPING = 3, + STOPPED = 4, + WARNING = 5, +} + +/** + * Standard base service + */ +service FacebookService { + + /** + * Returns a descriptive name of the service + */ + string getName(), + + /** + * Returns the version of the service + */ + string getVersion(), + + /** + * Gets the status of this service + */ + fb_status getStatus(), + + /** + * User friendly description of status, such as why the service is in + * the dead or warning state, or what is being started or stopped. 
+ */ + string getStatusDetails(), + + /** + * Gets the counters for this service + */ + map getCounters(), + + /** + * Gets the value of a single counter + */ + i64 getCounter(1: string key), + + /** + * Sets an option + */ + void setOption(1: string key, 2: string value), + + /** + * Gets an option + */ + string getOption(1: string key), + + /** + * Gets all options + */ + map getOptions(), + + /** + * Returns a CPU profile over the given time interval (client and server + * must agree on the profile format). + */ + string getCpuProfile(1: i32 profileDurationInSec), + + /** + * Returns the unix time that the server has been running since + */ + i64 aliveSince(), + + /** + * Tell the server to reload its configuration, reopen log files, etc + */ + oneway void reinitialize(), + + /** + * Suggest a shutdown to the server + */ + oneway void shutdown(), + +} diff --git a/herringbone-main/src/main/thrift/hive_metastore.thrift b/herringbone-main/src/main/thrift/hive_metastore.thrift new file mode 100644 index 0000000..5e05367 --- /dev/null +++ b/herringbone-main/src/main/thrift/hive_metastore.thrift @@ -0,0 +1,528 @@ +#!/usr/local/bin/thrift -java + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +# +# Thrift Service that the MetaStore is built on +# + +include "fb303.thrift" + +namespace java org.apache.hadoop.hive.metastore.api +namespace php metastore +namespace cpp Apache.Hadoop.Hive +namespace rb Impala.Protocol.HiveMetastore + +const string DDL_TIME = "transient_lastDdlTime" + +struct Version { + 1: string version, + 2: string comments +} + +struct FieldSchema { + 1: string name, // name of the field + 2: string type, // type of the field. 
primitive types defined above, specify list, map for lists & maps + 3: string comment +} + +struct Type { + 1: string name, // one of the types in PrimitiveTypes or CollectionTypes or User defined types + 2: optional string type1, // object type if the name is 'list' (LIST_TYPE), key type if the name is 'map' (MAP_TYPE) + 3: optional string type2, // val type if the name is 'map' (MAP_TYPE) + //4: optional list fields // if the name is one of the user defined types +} + +enum HiveObjectType { + GLOBAL = 1, + DATABASE = 2, + TABLE = 3, + PARTITION = 4, + COLUMN = 5, +} + +enum PrincipalType { + USER = 1, + ROLE = 2, + GROUP = 3, +} + +const string HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__" +const string HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__" +const string HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__" + +enum PartitionEventType { + LOAD_DONE = 1, +} + +struct HiveObjectRef{ + 1: HiveObjectType objectType, + 2: string dbName, + 3: string objectName, + 4: list partValues, + 5: string columnName, +} + +struct PrivilegeGrantInfo { + 1: string privilege, + 2: i32 createTime, + 3: string grantor, + 4: PrincipalType grantorType, + 5: bool grantOption, +} + +struct HiveObjectPrivilege { + 1: HiveObjectRef hiveObject, + 2: string principalName, + 3: PrincipalType principalType, + 4: PrivilegeGrantInfo grantInfo, +} + +struct PrivilegeBag { + 1: list privileges, +} + +struct PrincipalPrivilegeSet { + 1: map> userPrivileges, // user name -> privilege grant info + 2: map> groupPrivileges, // group name -> privilege grant info + 3: map> rolePrivileges, //role name -> privilege grant info +} + +struct Role { + 1: string roleName, + 2: i32 createTime, + 3: string ownerName, +} + +// namespace for tables +struct Database { + 1: string name, + 2: string description, + 3: string locationUri, + 4: map parameters, // properties associated with the database + 5: optional PrincipalPrivilegeSet privileges +} + +// This object holds the information needed by SerDes +struct SerDeInfo { + 1: string name, // name of the serde, table name by default + 2: string serializationLib, // usually the class that implements the extractor & loader + 3: map parameters // initialization parameters +} + +// sort order of a column (column name along with asc(1)/desc(0)) +struct Order { + 1: string col, // sort column name + 2: i32 order // asc(1) or desc(0) +} + +// this object holds all the information about physical storage of the data belonging to a table +struct StorageDescriptor { + 1: list cols, // required (refer to types defined above) + 2: string location, // defaults to //tablename + 3: string inputFormat, // SequenceFileInputFormat (binary) or TextInputFormat` or custom format + 4: string outputFormat, // SequenceFileOutputFormat (binary) or IgnoreKeyTextOutputFormat or custom format + 5: bool compressed, // compressed or not + 6: i32 numBuckets, // this must be specified if there are any dimension columns + 7: SerDeInfo serdeInfo, // serialization and deserialization information + 8: list bucketCols, // reducer grouping columns and clustering columns and bucketing columns` + 9: list sortCols, // sort order of the data in each bucket + 10: map parameters // any user supplied key value hash +} + +// table information +struct Table { + 1: string tableName, // name of the table + 2: string dbName, // database name ('default') + 3: string owner, // owner of this table + 4: i32 createTime, // creation time of the table + 5: i32 lastAccessTime, // last access time (usually this 
will be filled from HDFS and shouldn't be relied on) + 6: i32 retention, // retention time + 7: StorageDescriptor sd, // storage descriptor of the table + 8: list partitionKeys, // partition keys of the table. only primitive types are supported + 9: map parameters, // to store comments or any other user level parameters + 10: string viewOriginalText, // original view text, null for non-view + 11: string viewExpandedText, // expanded view text, null for non-view + 12: string tableType, // table type enum, e.g. EXTERNAL_TABLE + 13: optional PrincipalPrivilegeSet privileges, +} + +struct Partition { + 1: list values // string value is converted to appropriate partition key type + 2: string dbName, + 3: string tableName, + 4: i32 createTime, + 5: i32 lastAccessTime, + 6: StorageDescriptor sd, + 7: map parameters, + 8: optional PrincipalPrivilegeSet privileges +} + +struct Index { + 1: string indexName, // unique with in the whole database namespace + 2: string indexHandlerClass, // reserved + 3: string dbName, + 4: string origTableName, + 5: i32 createTime, + 6: i32 lastAccessTime, + 7: string indexTableName, + 8: StorageDescriptor sd, + 9: map parameters, + 10: bool deferredRebuild +} + +// schema of the table/query results etc. +struct Schema { + // column names, types, comments + 1: list fieldSchemas, // delimiters etc + 2: map properties +} + +// Key-value store to be used with selected +// Metastore APIs (create, alter methods). +// The client can pass environment properties / configs that can be +// accessed in hooks. +struct EnvironmentContext { + 1: map properties +} + +exception MetaException { + 1: string message +} + +exception UnknownTableException { + 1: string message +} + +exception UnknownDBException { + 1: string message +} + +exception AlreadyExistsException { + 1: string message +} + +exception InvalidPartitionException { + 1: string message +} + +exception UnknownPartitionException { + 1: string message +} + +exception InvalidObjectException { + 1: string message +} + +exception NoSuchObjectException { + 1: string message +} + +exception IndexAlreadyExistsException { + 1: string message +} + +exception InvalidOperationException { + 1: string message +} + +exception ConfigValSecurityException { + 1: string message +} + +/** +* This interface is live. 
+*/ +service ThriftHiveMetastore extends fb303.FacebookService +{ + void create_database(1:Database database) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3) + Database get_database(1:string name) throws(1:NoSuchObjectException o1, 2:MetaException o2) + void drop_database(1:string name, 2:bool deleteData, 3:bool cascade) throws(1:NoSuchObjectException o1, 2:InvalidOperationException o2, 3:MetaException o3) + list get_databases(1:string pattern) throws(1:MetaException o1) + list get_all_databases() throws(1:MetaException o1) + void alter_database(1:string dbname, 2:Database db) throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // returns the type with given name (make seperate calls for the dependent types if needed) + Type get_type(1:string name) throws(1:MetaException o1, 2:NoSuchObjectException o2) + bool create_type(1:Type type) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3) + bool drop_type(1:string type) throws(1:MetaException o1, 2:NoSuchObjectException o2) + map get_type_all(1:string name) + throws(1:MetaException o2) + + // Gets a list of FieldSchemas describing the columns of a particular table + list get_fields(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3), + + // Gets a list of FieldSchemas describing both the columns and the partition keys of a particular table + list get_schema(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3) + + // create a Hive table. Following fields must be set + // tableName + // database (only 'default' for now until Hive QL supports databases) + // owner (not needed, but good to have for tracking purposes) + // sd.cols (list of field schemas) + // sd.inputFormat (SequenceFileInputFormat (binary like falcon tables or u_full) or TextInputFormat) + // sd.outputFormat (SequenceFileInputFormat (binary) or TextInputFormat) + // sd.serdeInfo.serializationLib (SerDe class name eg org.apache.hadoop.hive.serde.simple_meta.MetadataTypedColumnsetSerDe + // * See notes on DDL_TIME + void create_table(1:Table tbl) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3, 4:NoSuchObjectException o4) + void create_table_with_environment_context(1:Table tbl, + 2:EnvironmentContext environment_context) + throws (1:AlreadyExistsException o1, + 2:InvalidObjectException o2, 3:MetaException o3, + 4:NoSuchObjectException o4) + // drops the table and all the partitions associated with it if the table has partitions + // delete data (including partitions) if deleteData is set to true + void drop_table(1:string dbname, 2:string name, 3:bool deleteData) + throws(1:NoSuchObjectException o1, 2:MetaException o3) + list get_tables(1: string db_name, 2: string pattern) throws (1: MetaException o1) + list get_all_tables(1: string db_name) throws (1: MetaException o1) + + Table get_table(1:string dbname, 2:string tbl_name) + throws (1:MetaException o1, 2:NoSuchObjectException o2) + list
get_table_objects_by_name(1:string dbname, 2:list tbl_names) + throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3) + + // Get a list of table names that match a filter. + // The filter operators are LIKE, <, <=, >, >=, =, <> + // + // In the filter statement, values interpreted as strings must be enclosed in quotes, + // while values interpreted as integers should not be. Strings and integers are the only + // supported value types. + // + // The currently supported key names in the filter are: + // Constants.HIVE_FILTER_FIELD_OWNER, which filters on the tables' owner's name + // and supports all filter operators + // Constants.HIVE_FILTER_FIELD_LAST_ACCESS, which filters on the last access times + // and supports all filter operators except LIKE + // Constants.HIVE_FILTER_FIELD_PARAMS, which filters on the tables' parameter keys and values + // and only supports the filter operators = and <>. + // Append the parameter key name to HIVE_FILTER_FIELD_PARAMS in the filter statement. + // For example, to filter on parameter keys called "retention", the key name in the filter + // statement should be Constants.HIVE_FILTER_FIELD_PARAMS + "retention" + // Also, = and <> only work for keys that exist + // in the tables. E.g., if you are looking for tables where key1 <> value, it will only + // look at tables that have a value for the parameter key1. + // Some example filter statements include: + // filter = Constants.HIVE_FILTER_FIELD_OWNER + " like \".*test.*\" and " + + // Constants.HIVE_FILTER_FIELD_LAST_ACCESS + " = 0"; + // filter = Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"30\" or " + + // Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"90\"" + // @param dbName + // The name of the database from which you will retrieve the table names + // @param filterType + // The type of filter + // @param filter + // The filter string + // @param max_tables + // The maximum number of tables returned + // @return A list of table names that match the desired filter + list get_table_names_by_filter(1:string dbname, 2:string filter, 3:i16 max_tables=-1) + throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3) + + // alter table applies to only future partitions not for existing partitions + // * See notes on DDL_TIME + void alter_table(1:string dbname, 2:string tbl_name, 3:Table new_tbl) + throws (1:InvalidOperationException o1, 2:MetaException o2) + void alter_table_with_environment_context(1:string dbname, 2:string tbl_name, + 3:Table new_tbl, 4:EnvironmentContext environment_context) + throws (1:InvalidOperationException o1, 2:MetaException o2) + // the following applies to only tables that have partitions + // * See notes on DDL_TIME + Partition add_partition(1:Partition new_part) + throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) + Partition add_partition_with_environment_context(1:Partition new_part, + 2:EnvironmentContext environment_context) + throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, + 3:MetaException o3) + i32 add_partitions(1:list new_parts) + throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) + Partition append_partition(1:string db_name, 2:string tbl_name, 3:list part_vals) + throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) + Partition append_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name) + throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 
3:MetaException o3) + bool drop_partition(1:string db_name, 2:string tbl_name, 3:list part_vals, 4:bool deleteData) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + bool drop_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name, 4:bool deleteData) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + Partition get_partition(1:string db_name, 2:string tbl_name, 3:list part_vals) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + Partition get_partition_with_auth(1:string db_name, 2:string tbl_name, 3:list part_vals, + 4: string user_name, 5: list group_names) throws(1:MetaException o1, 2:NoSuchObjectException o2) + + Partition get_partition_by_name(1:string db_name 2:string tbl_name, 3:string part_name) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // returns all the partitions for this table in reverse chronological order. + // If max parts is given then it will return only that many. + list get_partitions(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + list get_partitions_with_auth(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1, + 4: string user_name, 5: list group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2) + + list get_partition_names(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1) + throws(1:MetaException o2) + + // get_partition*_ps methods allow filtering by a partial partition specification, + // as needed for dynamic partitions. The values that are not restricted should + // be empty strings. Nulls were considered (instead of "") but caused errors in + // generated Python code. The size of part_vals may be smaller than the + // number of partition columns - the unspecified values are considered the same + // as "". + list get_partitions_ps(1:string db_name 2:string tbl_name + 3:list part_vals, 4:i16 max_parts=-1) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + list get_partitions_ps_with_auth(1:string db_name, 2:string tbl_name, 3:list part_vals, 4:i16 max_parts=-1, + 5: string user_name, 6: list group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2) + + list get_partition_names_ps(1:string db_name, + 2:string tbl_name, 3:list part_vals, 4:i16 max_parts=-1) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // get the partitions matching the given partition filter + list get_partitions_by_filter(1:string db_name 2:string tbl_name + 3:string filter, 4:i16 max_parts=-1) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // get partitions give a list of partition names + list get_partitions_by_names(1:string db_name 2:string tbl_name 3:list names) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + // changes the partition to the new partition object. partition is identified from the part values + // in the new_part + // * See notes on DDL_TIME + void alter_partition(1:string db_name, 2:string tbl_name, 3:Partition new_part) + throws (1:InvalidOperationException o1, 2:MetaException o2) + + void alter_partition_with_environment_context(1:string db_name, + 2:string tbl_name, 3:Partition new_part, + 4:EnvironmentContext environment_context) + throws (1:InvalidOperationException o1, 2:MetaException o2) + + // rename the old partition to the new partition object by changing old part values to the part values + // in the new_part. old partition is identified from part_vals. + // partition keys in new_part should be the same as those in old partition. 
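+  // Illustrative example: for a table partitioned by (ds string), calling this with
+  // part_vals = ["2014-11-20"] and new_part.values = ["2014-11-21"] renames the
+  // partition ds=2014-11-20 to ds=2014-11-21.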
+ void rename_partition(1:string db_name, 2:string tbl_name, 3:list part_vals, 4:Partition new_part) + throws (1:InvalidOperationException o1, 2:MetaException o2) + + // gets the value of the configuration key in the metastore server. returns + // defaultValue if the key does not exist. if the configuration key does not + // begin with "hive", "mapred", or "hdfs", a ConfigValSecurityException is + // thrown. + string get_config_value(1:string name, 2:string defaultValue) + throws(1:ConfigValSecurityException o1) + + // converts a partition name into a partition values array + list partition_name_to_vals(1: string part_name) + throws(1: MetaException o1) + // converts a partition name into a partition specification (a mapping from + // the partition cols to the values) + map partition_name_to_spec(1: string part_name) + throws(1: MetaException o1) + + void markPartitionForEvent(1:string db_name, 2:string tbl_name, 3:map part_vals, + 4:PartitionEventType eventType) throws (1: MetaException o1, 2: NoSuchObjectException o2, + 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5, + 6: InvalidPartitionException o6) + bool isPartitionMarkedForEvent(1:string db_name, 2:string tbl_name, 3:map part_vals, + 4: PartitionEventType eventType) throws (1: MetaException o1, 2:NoSuchObjectException o2, + 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5, + 6: InvalidPartitionException o6) + + //index + Index add_index(1:Index new_index, 2: Table index_table) + throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) + void alter_index(1:string dbname, 2:string base_tbl_name, 3:string idx_name, 4:Index new_idx) + throws (1:InvalidOperationException o1, 2:MetaException o2) + bool drop_index_by_name(1:string db_name, 2:string tbl_name, 3:string index_name, 4:bool deleteData) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + Index get_index_by_name(1:string db_name 2:string tbl_name, 3:string index_name) + throws(1:MetaException o1, 2:NoSuchObjectException o2) + + list get_indexes(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1) + throws(1:NoSuchObjectException o1, 2:MetaException o2) + list get_index_names(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1) + throws(1:MetaException o2) + + //authorization privileges + + bool create_role(1:Role role) throws(1:MetaException o1) + bool drop_role(1:string role_name) throws(1:MetaException o1) + list get_role_names() throws(1:MetaException o1) + bool grant_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type, + 4:string grantor, 5:PrincipalType grantorType, 6:bool grant_option) throws(1:MetaException o1) + bool revoke_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type) + throws(1:MetaException o1) + list list_roles(1:string principal_name, 2:PrincipalType principal_type) throws(1:MetaException o1) + + PrincipalPrivilegeSet get_privilege_set(1:HiveObjectRef hiveObject, 2:string user_name, + 3: list group_names) throws(1:MetaException o1) + list list_privileges(1:string principal_name, 2:PrincipalType principal_type, + 3: HiveObjectRef hiveObject) throws(1:MetaException o1) + + bool grant_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1) + bool revoke_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1) + + // this is used by metastore client to send UGI information to metastore server immediately + // after setting up a connection. 
+ list set_ugi(1:string user_name, 2:list group_names) throws (1:MetaException o1) + + //Authentication (delegation token) interfaces + + // get metastore server delegation token for use from the map/reduce tasks to authenticate + // to metastore server + string get_delegation_token(1:string token_owner, 2:string renewer_kerberos_principal_name) + throws (1:MetaException o1) + + // method to renew delegation token obtained from metastore server + i64 renew_delegation_token(1:string token_str_form) throws (1:MetaException o1) + + // method to cancel delegation token obtained from metastore server + void cancel_delegation_token(1:string token_str_form) throws (1:MetaException o1) +} + +// * Note about the DDL_TIME: When creating or altering a table or a partition, +// if the DDL_TIME is not set, the current time will be used. + +// For storing info about archived partitions in parameters + +// Whether the partition is archived +const string IS_ARCHIVED = "is_archived", +// The original location of the partition, before archiving. After archiving, +// this directory will contain the archive. When the partition +// is dropped, this directory will be deleted +const string ORIGINAL_LOCATION = "original_location", + +// these should be needed only for backward compatibility with filestore +const string META_TABLE_COLUMNS = "columns", +const string META_TABLE_COLUMN_TYPES = "columns.types", +const string BUCKET_FIELD_NAME = "bucket_field_name", +const string BUCKET_COUNT = "bucket_count", +const string FIELD_TO_DIMENSION = "field_to_dimension", +const string META_TABLE_NAME = "name", +const string META_TABLE_DB = "db", +const string META_TABLE_LOCATION = "location", +const string META_TABLE_SERDE = "serde", +const string META_TABLE_PARTITION_COLUMNS = "partition_columns", +const string FILE_INPUT_FORMAT = "file.inputformat", +const string FILE_OUTPUT_FORMAT = "file.outputformat", +const string META_TABLE_STORAGE = "storage_handler", + + + diff --git a/herringbone-main/src/test/resources/test.parquet b/herringbone-main/src/test/resources/test.parquet new file mode 100644 index 0000000000000000000000000000000000000000..17c1541ef121c7570dcab3414ad951c9f4918d8d GIT binary patch literal 916 zcmbtT-D=w~6xLF=n2ekSHlir;J-leLntyQ5FdUq4D-=X*Wk%K!WUAt>?Y{dWod%sQVSIaVx-6K!( zl}zHi&qc;$vdyn0`FZf8O{tr3m%gIzOu|g;RC&Aqv8LJRri%g!*=vQTiEs4=MN2M`n|nFq*G0c!8|iU*mc^^au}QF zR{)&TPs>|GUT@0-`#hEOX;i@2sPKdF_*4W#&iRXo37`2VgW&X~KV-4b#3*2bk572; M4E=%YGKc&47i@*Wu>b%7 literal 0 HcmV?d00001 diff --git a/herringbone-main/src/test/scala/com/stripe/herringbone/FlattenJobTest.scala b/herringbone-main/src/test/scala/com/stripe/herringbone/FlattenJobTest.scala new file mode 100644 index 0000000..c896c47 --- /dev/null +++ b/herringbone-main/src/test/scala/com/stripe/herringbone/FlattenJobTest.scala @@ -0,0 +1,22 @@ +package com.stripe.herringbone.test + +import com.stripe.herringbone.flatten._ +import org.scalatest._ +import parquet.example.Paper +import parquet.io.api.Binary + +class FlattenJobTest extends FlatSpec with Matchers { + def toBinary(x: Array[Byte]) = Binary.fromByteArray(x) + + "truncate" should "truncate to correct length" in { + val consumer = new FlatConsumer(Paper.r1, "__", false) + val bytes = toBinary(Array[Byte](1,2,3,4)) + assert(consumer.truncate(bytes, 3).getBytes().sameElements(Array[Byte](1,2,3))) + } + + "truncate" should "not truncate if unnecessary" in { + val consumer = new FlatConsumer(Paper.r1, "__", false) + val bytes = toBinary(Array[Byte](1,2,3,4)) + assert(consumer.truncate(bytes, 8) == bytes) + } 
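+
+  // Taken together, the two cases above pin down FlatConsumer.truncate's contract:
+  // a Binary longer than the requested length is cut down to that many bytes, and a
+  // Binary that already fits is returned as-is.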
+} diff --git a/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/FlatConverterTest.scala b/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/FlatConverterTest.scala new file mode 100644 index 0000000..241cd23 --- /dev/null +++ b/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/FlatConverterTest.scala @@ -0,0 +1,61 @@ +package com.stripe.herringbone.test + +import com.stripe.herringbone.flatten.{FlatConverter,TypeFlattener} + +import org.scalatest._ +import org.apache.hadoop.fs.Path + +import parquet.example.Paper +import parquet.example.data.simple.SimpleGroup +import parquet.example.data.GroupWriter +import parquet.schema.MessageType +import parquet.schema.PrimitiveType +import parquet.schema.Type.Repetition.OPTIONAL +import parquet.schema.Type.Repetition.REQUIRED +import parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY + +import scala.collection.mutable.StringBuilder +import java.io.StringWriter + +class FlatConverterTest extends FlatSpec with Matchers { + + def nestedGroupFixture = + new { + val group = Paper.r1 + val schema = Paper.schema + val flatSchema = TypeFlattener.flatten(schema, None, "__", true) + val flatGroup = FlatConverter.flattenGroup(group, flatSchema, "__", true) + } + + def flatGroupFixture = + new { + val flatSchema = + new MessageType("Charge", + new PrimitiveType(REQUIRED, BINARY, "_id"), + new PrimitiveType(OPTIONAL, BINARY, "email"), + new PrimitiveType(REQUIRED, BINARY, "merchant") + ) + val flatGroupMissingFields = new SimpleGroup(flatSchema) + flatGroupMissingFields.add("_id", "ch_1") + flatGroupMissingFields.add("merchant", "acct_1") + val flatGroupAllFields = new SimpleGroup(flatSchema) + flatGroupAllFields.add("email", "bob@stripe.com") + flatGroupAllFields.add("merchant", "acct_1") + flatGroupAllFields.add("_id", "ch_1") + } + + "groupToTSV" should "convert a flattened group" in { + val f = nestedGroupFixture + val groupTSV = FlatConverter.groupToTSV(f.flatGroup, f.flatSchema, "__", true) + assert(groupTSV == "10\t\t20,40,60") + } + + "groupToTSV" should "respect schema ordering, handle optional fields" in { + val f = flatGroupFixture + val missingTSV = FlatConverter.groupToTSV(f.flatGroupMissingFields, f.flatSchema, "__", true) + assert(missingTSV == "ch_1\t\tacct_1") + val allTSV = FlatConverter.groupToTSV(f.flatGroupAllFields, f.flatSchema, "__", true) + assert(allTSV == "ch_1\tbob@stripe.com\tacct_1") + } +} + diff --git a/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/TypeFlattenerTest.scala b/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/TypeFlattenerTest.scala new file mode 100644 index 0000000..d2eb6b2 --- /dev/null +++ b/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/TypeFlattenerTest.scala @@ -0,0 +1,95 @@ +package com.stripe.herringbone.test + +import com.stripe.herringbone.flatten.TypeFlattener + +import org.scalatest._ + +import parquet.schema.GroupType +import parquet.schema.MessageType +import parquet.schema.PrimitiveType +import parquet.schema.Type.Repetition.OPTIONAL +import parquet.schema.Type.Repetition.REPEATED +import parquet.schema.Type.Repetition.REQUIRED +import parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY +import parquet.schema.PrimitiveType.PrimitiveTypeName.INT64 + +class TypeFlattenerTest extends FlatSpec with Matchers { + + "flatten" should "omit the idField in nested fieldname if specified" in { + val input = new MessageType("Document", + new PrimitiveType(OPTIONAL, BINARY, "_id"), + new GroupType(OPTIONAL, "Page", + new 
PrimitiveType(OPTIONAL, BINARY, "_id"))) + + val expected = new MessageType("Document", + new PrimitiveType(OPTIONAL, BINARY, "_id"), + new PrimitiveType(OPTIONAL, BINARY, "Page")) + + val result = TypeFlattener.flatten(input, None, "__", true) + assert(expected == result) + } + + "flatten" should "not omit the idField in nested fieldname if none is specified" in { + val input = new MessageType("Document", + new PrimitiveType(OPTIONAL, BINARY, "_id"), + new GroupType(OPTIONAL, "Page", + new PrimitiveType(OPTIONAL, BINARY, "_id"))) + + val expected = new MessageType("Document", + new PrimitiveType(OPTIONAL, BINARY, "_id"), + new PrimitiveType(OPTIONAL, BINARY, "Page___id")) + + val result = TypeFlattener.flatten(input, None, "__", false) + assert(expected == result) + } + + "flatten" should "not include repeated groups" in { + val input = new MessageType("Document", + new PrimitiveType(OPTIONAL, BINARY, "_id"), + new GroupType(REPEATED, "Nope", + new PrimitiveType(REPEATED, INT64, "Never"))) + + val expected = new MessageType("Document", + new PrimitiveType(OPTIONAL, BINARY, "_id")) + + val result = TypeFlattener.flatten(input, None, "__", true) + assert(expected == result) + } + + "flatten" should "set all fields as optional" in { + val input = new MessageType("Document", + new GroupType(OPTIONAL, "Yep", + new GroupType(REQUIRED, "Grouped", + new PrimitiveType(REQUIRED, BINARY, "Yes"), + new PrimitiveType(REPEATED, BINARY, "Maybe")), + new PrimitiveType(OPTIONAL, BINARY, "Sometimes"))) + + val expected = new MessageType("Document", + new PrimitiveType(OPTIONAL, BINARY, "Yep__Grouped__Yes"), + new PrimitiveType(OPTIONAL, BINARY, "Yep__Grouped__Maybe"), + new PrimitiveType(OPTIONAL, BINARY, "Yep__Sometimes")) + + val result = TypeFlattener.flatten(input, None, "__", true) + assert(expected == result) + } + + "flatten" should "preserve the order of previously flattened fields" in { + val input = new MessageType("Document", + new PrimitiveType(REQUIRED, BINARY, "Old__Two"), + new GroupType(OPTIONAL, "New", + new PrimitiveType(REQUIRED, BINARY, "One")), + new PrimitiveType(REQUIRED, BINARY, "Old__One")) + + val old = new MessageType("Document", + new PrimitiveType(OPTIONAL, BINARY, "Old__One"), + new PrimitiveType(OPTIONAL, BINARY, "Old__Two")) + + val expected = new MessageType("Document", + new PrimitiveType(OPTIONAL, BINARY, "Old__One"), + new PrimitiveType(OPTIONAL, BINARY, "Old__Two"), + new PrimitiveType(OPTIONAL, BINARY, "New__One")) + + val result = TypeFlattener.flatten(input, Some(old), "__", true) + assert(expected == result) + } +} diff --git a/herringbone-main/src/test/scala/com/stripe/herringbone/load/FieldUtilsTest.scala b/herringbone-main/src/test/scala/com/stripe/herringbone/load/FieldUtilsTest.scala new file mode 100644 index 0000000..2094b43 --- /dev/null +++ b/herringbone-main/src/test/scala/com/stripe/herringbone/load/FieldUtilsTest.scala @@ -0,0 +1,49 @@ +package com.stripe.herringbone.test.load + +import com.stripe.herringbone.load.{FieldUtils, HadoopFs, ImpalaHiveSchemaTypeMapper} +import org.apache.hadoop.fs._ +import org.scalamock.scalatest.MockFactory +import org.scalatest._ +import parquet.schema.{PrimitiveType, Type} + +class FieldUtilsTest extends FlatSpec with Matchers with MockFactory { + + "findPartitionFields" should "find the partition field names and types" in { + val hadoopFs = mock[HadoopFs] + val path = new Path("path") + + val partitions = List(("day", "123"), ("type", "foo")) + (hadoopFs.findPartitions _).expects(path).returning(partitions) + + val 
expected = List("`day` int", "`type` string")
+    FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper).findPartitionFields(path) should equal (expected)
+  }
+
+  "tableFieldsFromSchemaFields" should "find the table fields from the parquet schema" in {
+    val hadoopFs = mock[HadoopFs]
+    val optional = Type.Repetition.valueOf("OPTIONAL")
+    val input = List(
+      new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("BINARY"), "a"),
+      new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT32"), "b"),
+      new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT64"), "c"),
+      new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT96"), "d"),
+      new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("DOUBLE"), "e"),
+      new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("BOOLEAN"), "f"),
+      new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("FLOAT"), "g"),
+      new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("FIXED_LEN_BYTE_ARRAY"), "h")
+    )
+
+    val expected = List(
+      "`a` STRING",
+      "`b` INT",
+      "`c` BIGINT",
+      "`d` BIGINT",
+      "`e` DOUBLE",
+      "`f` BOOLEAN",
+      "`g` FLOAT",
+      "`h` BINARY"
+    )
+
+    FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper).tableFieldsFromSchemaFields(input) should equal (expected)
+  }
+}
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..9239264
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>com.stripe</groupId>
+  <artifactId>herringbone</artifactId>
+  <version>0.0.1</version>
+  <packaging>pom</packaging>
+
+  <name>Herringbone</name>
+
+  <modules>
+    <module>herringbone-impala</module>
+    <module>herringbone-main</module>
+  </modules>
+
+</project>
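For reference, the flattening API exercised by FlatConverterTest and TypeFlattenerTest composes end to end roughly as sketched below. This is an illustrative sketch rather than code shipped in this patch: the FlattenSketch object name is made up, and it simply reuses parquet's bundled Paper example record together with the same "__" separator and boolean flag that the tests pass.

import com.stripe.herringbone.flatten.{FlatConverter, TypeFlattener}
import parquet.example.Paper

object FlattenSketch {
  def main(args: Array[String]): Unit = {
    // Flatten the nested example schema bundled with parquet (Paper.schema);
    // None means there is no previously flattened schema whose field order must be preserved.
    val flatSchema = TypeFlattener.flatten(Paper.schema, None, "__", true)

    // Flatten one nested record against that schema and render it as a single TSV line.
    val flatGroup = FlatConverter.flattenGroup(Paper.r1, flatSchema, "__", true)
    val tsv = FlatConverter.groupToTSV(flatGroup, flatSchema, "__", true)

    println(flatSchema) // the flattened MessageType (nested groups collapsed with "__")
    println(tsv)        // "10\t\t20,40,60", as asserted in FlatConverterTest
  }
}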