Fix sbt and build script
citrusraj committed Jul 23, 2019
1 parent 0211a84 commit 3309749
Showing 23 changed files with 668 additions and 765 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -23,3 +23,4 @@ project/target/
target/
.scala_dependencies
.worksheet
shaded_dependencies
37 changes: 23 additions & 14 deletions README.md
@@ -4,6 +4,7 @@ This datasource provides the capability to work with Hive ACID V2 tables, both F

Please refer to [Quick Start](#quick-start) for details on getting started with this datasource.


# Latest Binaries

ACID datasource is published to the Maven Central Repository and can be used by adding a dependency to your POM file.
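For sbt users, the equivalent is a single `libraryDependencies` entry. The coordinates below are placeholders for illustration; use the group ID, artifact ID, and version that are actually published to Maven Central.

```scala
// Hypothetical coordinates, for illustration only.
libraryDependencies += "com.qubole" %% "spark-hiveacid-datasource" % "<version>"
```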
@@ -27,9 +28,11 @@ ACID datasource is published to Maven Central Repository and can be used by addi

ACID datasource has been tested to work with Apache Spark 2.4.3, but it should work with older versions as well. However, because of a Hive dependency, this datasource needs Hadoop version 2.8.2 or higher due to [HADOOP-14683](https://jira.apache.org/jira/browse/HADOOP-14683)

_NB: Hive ACID is supported from Hive 3.1.1 onwards; for that, the Hive Metastore DB needs to be [upgraded](https://cwiki.apache.org/confluence/display/Hive/Hive+Schema+Tool) to 3.1.1._

### Data Storage Compatibility

1. ACID datasource does not control data storage format and layout, which is managed by Hive. It should be able to work with data written by Hive version 3.0.0 and above. Please see [Hive ACID storage layout](https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions#HiveTransactions-BasicDesign).
1. ACID datasource does not control data storage format and layout, which is managed by Hive. It works with data written by Hive version 3.0.0 and above. Please see [Hive ACID storage layout](https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions#HiveTransactions-BasicDesign).

2. ACID datasource works with data stored on local filesystems, on HDFS, and on cloud blob stores (AWS S3, Azure Blob Storage, etc.).

@@ -43,28 +46,26 @@ This project has the following sbt projects:

To compile:

1. Compile shaded dependencies
1. Compile the shaded dependencies and build the assembly jar

```bash
cd shaded-dependencies
sbt clean assembly
sbt "project shaded_dependencies" clean assembly
```

2. publish the dependency locally
2. Publish it locally for the acid datasource

```bash
sbt publishLocal
sbt "project shaded_dependencies" publishLocal
```

This creates and publishes `spark-hiveacid-shaded-dependencies_2.11-assembly.jar`, which bundles all the runtime and test dependencies (see the build.sbt sketch after these steps).

Create final assembled jar
3. Compile acid-datasource

```bash
cd acid-datasource
sbt assembly
sbt "project acid_datasource" clean package
```

This would create `hiveacid-datasource-assembly-0.1.jar`
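For step 2's `publishLocal` to install the assembly jar rather than a plain `package` jar, the shaded-dependencies build has to attach the assembly as a published artifact. The following is a minimal sketch of that wiring using the standard sbt-assembly pattern; it is an assumption about how the build is configured, not a copy of the project's actual `shaded-dependencies/build.sbt`, and it presumes the sbt-assembly plugin is enabled.

```scala
// Sketch only: attach the sbt-assembly output as a published artifact so that
// `publishLocal` installs spark-hiveacid-shaded-dependencies_2.11-assembly.jar.
artifact in (Compile, assembly) := {
  val art = (artifact in (Compile, assembly)).value
  art.withClassifier(Some("assembly"))
}
addArtifact(artifact in (Compile, assembly), assembly)
```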

# Testing

Tests run against a standalone Docker setup. Please refer to [Docker setup](docker/README.md) to build and start a container.
@@ -73,14 +74,22 @@ _NB: Container runs HMS server, HS2 server and HDFS, and listens on ports 10000,100

To run the unit tests:

```bash
sbt "project acid_datasource" test
```

# Publishing

to run
To publish the fully assembled jar:

```bash
cd acid-datasource
sbt test
sbt "project acid_datasource" assembly publish
```

This would create `spark-hiveacid-datasource_2.11-assembly.jar`
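Publishing to Maven Central also requires resolver and POM metadata in the build. This commit deletes `acid-datasource/publish.sbt` and the project's actual publish settings are not shown here, so the snippet below is only an illustrative sketch of the usual Sonatype configuration.

```scala
// Illustrative Sonatype settings (assumed, not the project's actual publish config).
publishMavenStyle := true
publishTo := {
  val nexus = "https://oss.sonatype.org/"
  if (isSnapshot.value)
    Some("snapshots" at nexus + "content/repositories/snapshots")
  else
    Some("releases" at nexus + "service/local/staging/deploy/maven2")
}
```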



Refer to [SBT docs](https://www.scala-sbt.org/1.x/docs/Command-Line-Reference.html) for more commands.

# Quick Start
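The Quick Start content is collapsed in this diff. Purely as an illustration of how a read through this kind of datasource typically looks, here is a small sketch; the short format name `"HiveAcid"` and the table `default.acidtbl` are assumptions rather than values taken from this commit.

```scala
// Minimal sketch, assuming the datasource registers the short name "HiveAcid"
// and that a Hive ACID table `default.acidtbl` already exists in the metastore.
import org.apache.spark.sql.SparkSession

object QuickStartSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("hive-acid-quickstart")
      .enableHiveSupport()
      .getOrCreate()

    val df = spark.read
      .format("HiveAcid")
      .option("table", "default.acidtbl")
      .load()

    df.show()
  }
}
```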
68 changes: 0 additions & 68 deletions acid-datasource/build.sbt

This file was deleted.

8 changes: 0 additions & 8 deletions acid-datasource/publish.sbt

This file was deleted.

@@ -157,8 +157,7 @@ class HiveAcidRelation(var sqlContext: SQLContext,
setPushDownFiltersInHadoopConf(hadoopConf, dataFilters)
setRequiredColumnsInHadoopConf(hadoopConf, requiredNonPartitionedColumns)
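// The two setters above hand Spark's scan requirements to the Hive readers through the
// Hadoop conf: pushed-down filters travel as a serialized SearchArgument under
// "sarg.pushdown", while column pruning is driven by hive.io.file.readcolumn.names and
// hive.io.file.readcolumn.ids (all three are logged below).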

// TODO: change this to logDebug
log.warn(s"sarg.pushdown: ${hadoopConf.get("sarg.pushdown")}," +
logDebug(s"sarg.pushdown: ${hadoopConf.get("sarg.pushdown")}," +
s"hive.io.file.readcolumn.names: ${hadoopConf.get("hive.io.file.readcolumn.names")}, " +
s"hive.io.file.readcolumn.ids: ${hadoopConf.get("hive.io.file.readcolumn.ids")}")

@@ -201,8 +200,7 @@ class HiveAcidRelation(var sqlContext: SQLContext,
return Base64.encodeBase64String(out.toBytes)
}

// TODO: change this to logDebug
log.warn(s"searchArgument: ${f}")
logDebug(s"searchArgument: ${f}")
conf.set("sarg.pushdown", toKryo(f))
conf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true)
}
@@ -29,6 +29,10 @@ import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer
import scala.reflect.ClassTag

// Process-wide, soft-valued cache used to memoize JobConf objects across Hive3RDD
// instances; it replaces the earlier reliance on SparkEnv.get.hadoopJobMetadata
// (see getCachedMetadata/putCachedMetadata below). Soft values let the GC reclaim
// cached confs under memory pressure.
object Cache {
import com.google.common.collect.MapMaker
val jobConf = new MapMaker().softValues().makeMap[String, Any]()
}

class Hive3Partition(rddId: Int, override val index: Int, s: InputSplit)
extends Partition {
@@ -134,12 +138,12 @@ class Hive3RDD[K, V](
logDebug("Re-using user-broadcasted JobConf")
conf.asInstanceOf[JobConf]
} else {
// Option(Hive3RDD.getCachedMetadata(jobConfCacheKey))
// .map { conf =>
// logDebug("Re-using cached JobConf")
// conf.asInstanceOf[JobConf]
// }
// .getOrElse {
Option(Hive3RDD.getCachedMetadata(jobConfCacheKey))
.map { conf =>
logDebug("Re-using cached JobConf")
conf.asInstanceOf[JobConf]
}
.getOrElse {
// Create a JobConf that will be cached and used across this RDD's getJobConf() calls in
// the local process. The local cache is accessed through Hive3RDD.putCachedMetadata().
// The caching helps minimize GC, since a JobConf can contain ~10KB of temporary
Expand All @@ -152,7 +156,7 @@ class Hive3RDD[K, V](
Hive3RDD.putCachedMetadata(jobConfCacheKey, newJobConf)
newJobConf
}
// }
}
}
}
}
@@ -397,13 +401,11 @@ object Hive3RDD extends Logging {
val RECORDS_BETWEEN_BYTES_READ_METRIC_UPDATES = 256

def getCachedMetadata(key: String): Any = {
// SparkEnv.get.hadoopJobMetadata.get(key)
()
Cache.jobConf.get(key)
}

private def putCachedMetadata(key: String, value: Any): Unit = {
//SparkEnv.get.hadoopJobMetadata.put(key, value)
()
Cache.jobConf.put(key, value)
}

/** Add Hadoop configuration specific to a single partition and attempt. */
@@ -21,17 +21,15 @@ public class TestHiveClient {
private static String driverName = "com.qubole.shaded.hive.jdbc.HiveDriver";
private static Connection con = null;
private static Statement stmt = null;
private Boolean verbose = false;

TestHiveClient(Boolean verbose) {
TestHiveClient() {
try {
Class.forName(driverName);
} catch (ClassNotFoundException e) {
e.printStackTrace();
System.exit(1);
}
try {
this.verbose = verbose;
con = DriverManager.getConnection("jdbc:hive2://0.0.0.0:10001?allowMultiQueries=true", "root", "root");
stmt = con.createStatement();
}
@@ -41,7 +39,6 @@ public class TestHiveClient {
}

public String executeQuery(String cmd) throws Exception {
if (verbose) System.out.println("\n\nHive> " + cmd + "\n");
// Start Hive txn
ResultSet rs = null;
String resStr = null;
@@ -62,7 +59,6 @@ public String executeQuery(String cmd) throws Exception {
}

public void execute(String cmd) throws SQLException {
if (verbose) System.out.println("\n\nHive> " + cmd + "\n");
stmt.execute(cmd);
}

