Fix sbt and build script
citrusraj committed Jul 23, 2019
1 parent 0211a84 commit 3309749
Showing 23 changed files with 668 additions and 765 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -23,3 +23,4 @@ project/target/
target/
.scala_dependencies
.worksheet
shaded_dependencies
37 changes: 23 additions & 14 deletions README.md
@@ -4,6 +4,7 @@ This datasource provides the capability to work with Hive ACID V2 tables, both F

Please refer to [Quick Start](#quick-start) for details on getting started with this datasource.


# Latest Binaries

ACID datasource is published to the Maven Central Repository and can be used by adding a dependency to your POM file.
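For sbt users, the equivalent is a single `libraryDependencies` entry. The coordinates below are placeholders for illustration; use the group ID, artifact ID, and version that are actually published to Maven Central.

```scala
// Hypothetical coordinates, for illustration only.
libraryDependencies += "com.qubole" %% "spark-hiveacid-datasource" % "<version>"
```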
@@ -27,9 +28,11 @@ ACID datasource is published to Maven Central Repository and can be used by addi

ACID datasource has been tested to work with Apache Spark 2.4.3, but it should work with older versions as well. However, because of a Hive dependency, this datasource needs Hadoop version 2.8.2 or higher due to [HADOOP-14683](https://jira.apache.org/jira/browse/HADOOP-14683)

_NB: Hive ACID is supported from Hive 3.1.1 onwards; for that, the Hive Metastore DB needs to be [upgraded](https://cwiki.apache.org/confluence/display/Hive/Hive+Schema+Tool) to 3.1.1._

### Data Storage Compatibility

1. ACID datasource does not control data storage format and layout, which is managed by Hive. It should be able to work with data written by Hive version 3.0.0 and above. Please see [Hive ACID storage layout](https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions#HiveTransactions-BasicDesign).
1. ACID datasource does not control data storage format and layout, which is managed by Hive. It works with data written by Hive version 3.0.0 and above. Please see [Hive ACID storage layout](https://cwiki.apache.org/confluence/display/Hive/Hive+Transactions#HiveTransactions-BasicDesign).

2. ACID datasource works with data stored on local filesystems, on HDFS, and on cloud blob stores (AWS S3, Azure Blob Storage, etc.).

@@ -43,28 +46,26 @@ This project has the following sbt projects:

To compile:

1. Compile shaded dependencies
1. Compile the shaded dependencies and build the assembly jar

```bash
cd shaded-dependencies
sbt clean assembly
sbt "project shaded_dependencies" clean assembly
```

2. publish the dependency locally
2. Publish it locally for the acid datasource

```bash
sbt publishLocal
sbt "project shaded_dependencies" publishLocal
```

This creates and publishes `spark-hiveacid-shaded-dependencies_2.11-assembly.jar`, which bundles all the runtime and test dependencies (see the build.sbt sketch after these steps).

Create final assembled jar
3. Compile acid-datasource

```bash
cd acid-datasource
sbt assembly
sbt "project acid_datasource" clean package
```

This would create `hiveacid-datasource-assembly-0.1.jar`
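For step 2's `publishLocal` to install the assembly jar rather than a plain `package` jar, the shaded-dependencies build has to attach the assembly as a published artifact. The following is a minimal sketch of that wiring using the standard sbt-assembly pattern; it is an assumption about how the build is configured, not a copy of the project's actual `shaded-dependencies/build.sbt`, and it presumes the sbt-assembly plugin is enabled.

```scala
// Sketch only: attach the sbt-assembly output as a published artifact so that
// `publishLocal` installs spark-hiveacid-shaded-dependencies_2.11-assembly.jar.
artifact in (Compile, assembly) := {
  val art = (artifact in (Compile, assembly)).value
  art.withClassifier(Some("assembly"))
}
addArtifact(artifact in (Compile, assembly), assembly)
```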

# Testing

Tests run against a standalone Docker setup. Please refer to [Docker setup](docker/README.md) to build and start a container.
@@ -73,14 +74,22 @@ _NB: Container runs HMS server, HS2 server and HDFS, and listens on ports 10000,100

To run the unit tests:

```bash
sbt "project acid_datasource" test
```

# Publishing

to run
To publish the fully assembled jar:

```bash
cd acid-datasource
sbt test
sbt "project acid_datasource" assembly publish
```

This would create `spark-hiveacid-datasource_2.11-assembly.jar`
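Publishing to Maven Central also requires resolver and POM metadata in the build. This commit deletes `acid-datasource/publish.sbt` and the project's actual publish settings are not shown here, so the snippet below is only an illustrative sketch of the usual Sonatype configuration.

```scala
// Illustrative Sonatype settings (assumed, not the project's actual publish config).
publishMavenStyle := true
publishTo := {
  val nexus = "https://oss.sonatype.org/"
  if (isSnapshot.value)
    Some("snapshots" at nexus + "content/repositories/snapshots")
  else
    Some("releases" at nexus + "service/local/staging/deploy/maven2")
}
```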



Refer to [SBT docs](https://www.scala-sbt.org/1.x/docs/Command-Line-Reference.html) for more commands.

# Quick Start
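The Quick Start content is collapsed in this diff. Purely as an illustration of how a read through this kind of datasource typically looks, here is a small sketch; the short format name `"HiveAcid"` and the table `default.acidtbl` are assumptions rather than values taken from this commit.

```scala
// Minimal sketch, assuming the datasource registers the short name "HiveAcid"
// and that a Hive ACID table `default.acidtbl` already exists in the metastore.
import org.apache.spark.sql.SparkSession

object QuickStartSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("hive-acid-quickstart")
      .enableHiveSupport()
      .getOrCreate()

    val df = spark.read
      .format("HiveAcid")
      .option("table", "default.acidtbl")
      .load()

    df.show()
  }
}
```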
68 changes: 0 additions & 68 deletions acid-datasource/build.sbt

This file was deleted.

8 changes: 0 additions & 8 deletions acid-datasource/publish.sbt

This file was deleted.

@@ -157,8 +157,7 @@ class HiveAcidRelation(var sqlContext: SQLContext,
setPushDownFiltersInHadoopConf(hadoopConf, dataFilters)
setRequiredColumnsInHadoopConf(hadoopConf, requiredNonPartitionedColumns)
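// The two setters above hand Spark's scan requirements to the Hive readers through the
// Hadoop conf: pushed-down filters travel as a serialized SearchArgument under
// "sarg.pushdown", while column pruning is driven by hive.io.file.readcolumn.names and
// hive.io.file.readcolumn.ids (all three are logged below).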

// TODO: change this to logDebug
log.warn(s"sarg.pushdown: ${hadoopConf.get("sarg.pushdown")}," +
logDebug(s"sarg.pushdown: ${hadoopConf.get("sarg.pushdown")}," +
s"hive.io.file.readcolumn.names: ${hadoopConf.get("hive.io.file.readcolumn.names")}, " +
s"hive.io.file.readcolumn.ids: ${hadoopConf.get("hive.io.file.readcolumn.ids")}")

@@ -201,8 +200,7 @@ class HiveAcidRelation(var sqlContext: SQLContext,
return Base64.encodeBase64String(out.toBytes)
}

// TODO: change this to logDebug
log.warn(s"searchArgument: ${f}")
logDebug(s"searchArgument: ${f}")
conf.set("sarg.pushdown", toKryo(f))
conf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true)
}
@@ -29,6 +29,10 @@ import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer
import scala.reflect.ClassTag

// Process-wide, soft-valued cache used to memoize JobConf objects across Hive3RDD
// instances; it replaces the earlier reliance on SparkEnv.get.hadoopJobMetadata
// (see getCachedMetadata/putCachedMetadata below). Soft values let the GC reclaim
// cached confs under memory pressure.
object Cache {
import com.google.common.collect.MapMaker
val jobConf = new MapMaker().softValues().makeMap[String, Any]()
}

class Hive3Partition(rddId: Int, override val index: Int, s: InputSplit)
extends Partition {
@@ -134,12 +138,12 @@ class Hive3RDD[K, V](
logDebug("Re-using user-broadcasted JobConf")
conf.asInstanceOf[JobConf]
} else {
// Option(Hive3RDD.getCachedMetadata(jobConfCacheKey))
// .map { conf =>
// logDebug("Re-using cached JobConf")
// conf.asInstanceOf[JobConf]
// }
// .getOrElse {
Option(Hive3RDD.getCachedMetadata(jobConfCacheKey))
.map { conf =>
logDebug("Re-using cached JobConf")
conf.asInstanceOf[JobConf]
}
.getOrElse {
// Create a JobConf that will be cached and used across this RDD's getJobConf() calls in
// the local process. The local cache is accessed through Hive3RDD.putCachedMetadata().
// The caching helps minimize GC, since a JobConf can contain ~10KB of temporary
Expand All @@ -152,7 +156,7 @@ class Hive3RDD[K, V](
Hive3RDD.putCachedMetadata(jobConfCacheKey, newJobConf)
newJobConf
}
// }
}
}
}
}
@@ -397,13 +401,11 @@ object Hive3RDD extends Logging {
val RECORDS_BETWEEN_BYTES_READ_METRIC_UPDATES = 256

def getCachedMetadata(key: String): Any = {
// SparkEnv.get.hadoopJobMetadata.get(key)
()
Cache.jobConf.get(key)
}

private def putCachedMetadata(key: String, value: Any): Unit = {
//SparkEnv.get.hadoopJobMetadata.put(key, value)
()
Cache.jobConf.put(key, value)
}

/** Add Hadoop configuration specific to a single partition and attempt. */
@@ -21,17 +21,15 @@ public class TestHiveClient {
private static String driverName = "com.qubole.shaded.hive.jdbc.HiveDriver";
private static Connection con = null;
private static Statement stmt = null;
private Boolean verbose = false;

TestHiveClient(Boolean verbose) {
TestHiveClient() {
try {
Class.forName(driverName);
} catch (ClassNotFoundException e) {
e.printStackTrace();
System.exit(1);
}
try {
this.verbose = verbose;
con = DriverManager.getConnection("jdbc:hive2://0.0.0.0:10001?allowMultiQueries=true", "root", "root");
stmt = con.createStatement();
}
@@ -41,7 +39,6 @@ public class TestHiveClient {
}

public String executeQuery(String cmd) throws Exception {
if (verbose) System.out.println("\n\nHive> " + cmd + "\n");
// Start Hive txn
ResultSet rs = null;
String resStr = null;
@@ -62,7 +59,6 @@ public String executeQuery(String cmd) throws Exception {
}

public void execute(String cmd) throws SQLException {
if (verbose) System.out.println("\n\nHive> " + cmd + "\n");
stmt.execute(cmd);
}

