From 89db3c0b6edcffed7e1e12c202e6827271ddba26 Mon Sep 17 00:00:00 2001 From: MechCoder Date: Tue, 21 Jul 2015 10:31:31 -0700 Subject: [PATCH 1/4] [SPARK-5989] [MLLIB] Model save/load for LDA Add support for saving and loading both the local and distributed versions of LDA. Author: MechCoder Closes #6948 from MechCoder/lda_save_load and squashes the following commits: 49bcdce [MechCoder] minor style fixes cc14054 [MechCoder] minor 4587d1d [MechCoder] Minor changes c753122 [MechCoder] Load and save the model in private methods 2782326 [MechCoder] [SPARK-5989] Model save/load for LDA --- docs/mllib-clustering.md | 10 +- .../spark/mllib/clustering/LDAModel.scala | 228 +++++++++++++++++- .../spark/mllib/clustering/LDASuite.scala | 41 ++++ 3 files changed, 274 insertions(+), 5 deletions(-) diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index 0fc7036bffeaf..bb875ae2ae6cb 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -472,7 +472,7 @@ to the algorithm. We then output the topics, represented as probability distribu&#xD;
{% highlight scala %} -import org.apache.spark.mllib.clustering.LDA +import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel} import org.apache.spark.mllib.linalg.Vectors // Load and parse the data @@ -492,6 +492,11 @@ for (topic <- Range(0, 3)) { for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)); } println() } + +// Save and load model. +ldaModel.save(sc, "myLDAModel") +val sameModel = DistributedLDAModel.load(sc, "myLDAModel") + {% endhighlight %}
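The docs hunk above exercises the DistributedLDAModel path. As a complement, here is a minimal sketch (editor-added, not part of the patch) of the LocalLDAModel save/load round trip this change also introduces; it assumes a SparkContext `sc` and a term-count corpus `corpus: RDD[(Long, Vector)]` are already in scope, and the output path is illustrative.

```scala
import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA, LocalLDAModel}

// Train as in the docs example above; the cast mirrors the new LDASuite test,
// since run() is typed as LDAModel in this codebase.
val distributed = new LDA().setK(3).run(corpus).asInstanceOf[DistributedLDAModel]
val local: LocalLDAModel = distributed.toLocal  // keep only the topics matrix

// Round-trip through the save/load methods added by this patch.
local.save(sc, "myLocalLDAModel")                        // path is illustrative
val reloaded = LocalLDAModel.load(sc, "myLocalLDAModel")

// The reloaded model should agree on the shape of the inferred topics.
assert(reloaded.k == local.k && reloaded.vocabSize == local.vocabSize)
```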
@@ -551,6 +556,9 @@ public class JavaLDAExample { } System.out.println(); } + + ldaModel.save(sc.sc(), "myLDAModel"); + DistributedLDAModel sameModel = DistributedLDAModel.load(sc.sc(), "myLDAModel"); } } {% endhighlight %} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 974b26924dfb8..920b57756b625 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -17,15 +17,25 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseMatrix => BDM, normalize, sum => brzSum} +import breeze.linalg.{DenseMatrix => BDM, normalize, sum => brzSum, DenseVector => BDV} +import org.apache.hadoop.fs.Path + +import org.json4s.DefaultFormats +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaPairRDD -import org.apache.spark.graphx.{VertexId, EdgeContext, Graph} -import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix} +import org.apache.spark.graphx.{VertexId, Edge, EdgeContext, Graph} +import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix, DenseVector} +import org.apache.spark.mllib.util.{Saveable, Loader} import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{SQLContext, Row} import org.apache.spark.util.BoundedPriorityQueue + /** * :: Experimental :: * @@ -35,7 +45,7 @@ import org.apache.spark.util.BoundedPriorityQueue * including local and distributed data structures. */ @Experimental -abstract class LDAModel private[clustering] { +abstract class LDAModel private[clustering] extends Saveable { /** Number of topics */ def k: Int @@ -176,6 +186,11 @@ class LocalLDAModel private[clustering] ( }.toArray } + override protected def formatVersion = "1.0" + + override def save(sc: SparkContext, path: String): Unit = { + LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix) + } // TODO // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ??? @@ -184,6 +199,80 @@ class LocalLDAModel private[clustering] ( } +@Experimental +object LocalLDAModel extends Loader[LocalLDAModel] { + + private object SaveLoadV1_0 { + + val thisFormatVersion = "1.0" + + val thisClassName = "org.apache.spark.mllib.clustering.LocalLDAModel" + + // Store the distribution of terms of each topic and the column index in topicsMatrix + // as a Row in data. 
+ case class Data(topic: Vector, index: Int) + + def save(sc: SparkContext, path: String, topicsMatrix: Matrix): Unit = { + val sqlContext = SQLContext.getOrCreate(sc) + import sqlContext.implicits._ + + val k = topicsMatrix.numCols + val metadata = compact(render + (("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ + ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows))) + sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) + + val topicsDenseMatrix = topicsMatrix.toBreeze.toDenseMatrix + val topics = Range(0, k).map { topicInd => + Data(Vectors.dense((topicsDenseMatrix(::, topicInd).toArray)), topicInd) + }.toSeq + sc.parallelize(topics, 1).toDF().write.parquet(Loader.dataPath(path)) + } + + def load(sc: SparkContext, path: String): LocalLDAModel = { + val dataPath = Loader.dataPath(path) + val sqlContext = SQLContext.getOrCreate(sc) + val dataFrame = sqlContext.read.parquet(dataPath) + + Loader.checkSchema[Data](dataFrame.schema) + val topics = dataFrame.collect() + val vocabSize = topics(0).getAs[Vector](0).size + val k = topics.size + + val brzTopics = BDM.zeros[Double](vocabSize, k) + topics.foreach { case Row(vec: Vector, ind: Int) => + brzTopics(::, ind) := vec.toBreeze + } + new LocalLDAModel(Matrices.fromBreeze(brzTopics)) + } + } + + override def load(sc: SparkContext, path: String): LocalLDAModel = { + val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) + implicit val formats = DefaultFormats + val expectedK = (metadata \ "k").extract[Int] + val expectedVocabSize = (metadata \ "vocabSize").extract[Int] + val classNameV1_0 = SaveLoadV1_0.thisClassName + + val model = (loadedClassName, loadedVersion) match { + case (className, "1.0") if className == classNameV1_0 => + SaveLoadV1_0.load(sc, path) + case _ => throw new Exception( + s"LocalLDAModel.load did not recognize model with (className, format version):" + + s"($loadedClassName, $loadedVersion). Supported:\n" + + s" ($classNameV1_0, 1.0)") + } + + val topicsMatrix = model.topicsMatrix + require(expectedK == topicsMatrix.numCols, + s"LocalLDAModel requires $expectedK topics, got ${topicsMatrix.numCols} topics") + require(expectedVocabSize == topicsMatrix.numRows, + s"LocalLDAModel requires $expectedVocabSize terms for each topic, " + + s"but got ${topicsMatrix.numRows}") + model + } +} + /** * :: Experimental :: * @@ -354,4 +443,135 @@ class DistributedLDAModel private ( // TODO: // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ??? + override protected def formatVersion = "1.0" + + override def save(sc: SparkContext, path: String): Unit = { + DistributedLDAModel.SaveLoadV1_0.save( + sc, path, graph, globalTopicTotals, k, vocabSize, docConcentration, topicConcentration, + iterationTimes) + } +} + + +@Experimental +object DistributedLDAModel extends Loader[DistributedLDAModel] { + + private object SaveLoadV1_0 { + + val thisFormatVersion = "1.0" + + val classNameV1_0 = "org.apache.spark.mllib.clustering.DistributedLDAModel" + + // Store globalTopicTotals as a Vector. + case class Data(globalTopicTotals: Vector) + + // Store each term and document vertex with an id and the topicWeights. + case class VertexData(id: Long, topicWeights: Vector) + + // Store each edge with the source id, destination id and tokenCounts. 
+ case class EdgeData(srcId: Long, dstId: Long, tokenCounts: Double) + + def save( + sc: SparkContext, + path: String, + graph: Graph[LDA.TopicCounts, LDA.TokenCount], + globalTopicTotals: LDA.TopicCounts, + k: Int, + vocabSize: Int, + docConcentration: Double, + topicConcentration: Double, + iterationTimes: Array[Double]): Unit = { + val sqlContext = SQLContext.getOrCreate(sc) + import sqlContext.implicits._ + + val metadata = compact(render + (("class" -> classNameV1_0) ~ ("version" -> thisFormatVersion) ~ + ("k" -> k) ~ ("vocabSize" -> vocabSize) ~ ("docConcentration" -> docConcentration) ~ + ("topicConcentration" -> topicConcentration) ~ + ("iterationTimes" -> iterationTimes.toSeq))) + sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) + + val newPath = new Path(Loader.dataPath(path), "globalTopicTotals").toUri.toString + sc.parallelize(Seq(Data(Vectors.fromBreeze(globalTopicTotals)))).toDF() + .write.parquet(newPath) + + val verticesPath = new Path(Loader.dataPath(path), "topicCounts").toUri.toString + graph.vertices.map { case (ind, vertex) => + VertexData(ind, Vectors.fromBreeze(vertex)) + }.toDF().write.parquet(verticesPath) + + val edgesPath = new Path(Loader.dataPath(path), "tokenCounts").toUri.toString + graph.edges.map { case Edge(srcId, dstId, prop) => + EdgeData(srcId, dstId, prop) + }.toDF().write.parquet(edgesPath) + } + + def load( + sc: SparkContext, + path: String, + vocabSize: Int, + docConcentration: Double, + topicConcentration: Double, + iterationTimes: Array[Double]): DistributedLDAModel = { + val dataPath = new Path(Loader.dataPath(path), "globalTopicTotals").toUri.toString + val vertexDataPath = new Path(Loader.dataPath(path), "topicCounts").toUri.toString + val edgeDataPath = new Path(Loader.dataPath(path), "tokenCounts").toUri.toString + val sqlContext = SQLContext.getOrCreate(sc) + val dataFrame = sqlContext.read.parquet(dataPath) + val vertexDataFrame = sqlContext.read.parquet(vertexDataPath) + val edgeDataFrame = sqlContext.read.parquet(edgeDataPath) + + Loader.checkSchema[Data](dataFrame.schema) + Loader.checkSchema[VertexData](vertexDataFrame.schema) + Loader.checkSchema[EdgeData](edgeDataFrame.schema) + val globalTopicTotals: LDA.TopicCounts = + dataFrame.first().getAs[Vector](0).toBreeze.toDenseVector + val vertices: RDD[(VertexId, LDA.TopicCounts)] = vertexDataFrame.map { + case Row(ind: Long, vec: Vector) => (ind, vec.toBreeze.toDenseVector) + } + + val edges: RDD[Edge[LDA.TokenCount]] = edgeDataFrame.map { + case Row(srcId: Long, dstId: Long, prop: Double) => Edge(srcId, dstId, prop) + } + val graph: Graph[LDA.TopicCounts, LDA.TokenCount] = Graph(vertices, edges) + + new DistributedLDAModel(graph, globalTopicTotals, globalTopicTotals.length, vocabSize, + docConcentration, topicConcentration, iterationTimes) + } + + } + + override def load(sc: SparkContext, path: String): DistributedLDAModel = { + val (loadedClassName, loadedVersion, metadata) = Loader.loadMetadata(sc, path) + implicit val formats = DefaultFormats + val expectedK = (metadata \ "k").extract[Int] + val vocabSize = (metadata \ "vocabSize").extract[Int] + val docConcentration = (metadata \ "docConcentration").extract[Double] + val topicConcentration = (metadata \ "topicConcentration").extract[Double] + val iterationTimes = (metadata \ "iterationTimes").extract[Seq[Double]] + val classNameV1_0 = SaveLoadV1_0.classNameV1_0 + + val model = (loadedClassName, loadedVersion) match { + case (className, "1.0") if className == classNameV1_0 => { + 
DistributedLDAModel.SaveLoadV1_0.load( + sc, path, vocabSize, docConcentration, topicConcentration, iterationTimes.toArray) + } + case _ => throw new Exception( + s"DistributedLDAModel.load did not recognize model with (className, format version):" + + s"($loadedClassName, $loadedVersion). Supported: ($classNameV1_0, 1.0)") + } + + require(model.vocabSize == vocabSize, + s"DistributedLDAModel requires $vocabSize vocabSize, got ${model.vocabSize} vocabSize") + require(model.docConcentration == docConcentration, + s"DistributedLDAModel requires $docConcentration docConcentration, " + + s"got ${model.docConcentration} docConcentration") + require(model.topicConcentration == topicConcentration, + s"DistributedLDAModel requires $topicConcentration docConcentration, " + + s"got ${model.topicConcentration} docConcentration") + require(expectedK == model.k, + s"DistributedLDAModel requires $expectedK topics, got ${model.k} topics") + model + } + } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 03a8a2538b464..721a065658951 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -23,6 +23,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, DenseMatrix, Matrix, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ +import org.apache.spark.util.Utils class LDASuite extends SparkFunSuite with MLlibTestSparkContext { @@ -217,6 +218,46 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("model save/load") { + // Test for LocalLDAModel. + val localModel = new LocalLDAModel(tinyTopics) + val tempDir1 = Utils.createTempDir() + val path1 = tempDir1.toURI.toString + + // Test for DistributedLDAModel. + val k = 3 + val docConcentration = 1.2 + val topicConcentration = 1.5 + val lda = new LDA() + lda.setK(k) + .setDocConcentration(docConcentration) + .setTopicConcentration(topicConcentration) + .setMaxIterations(5) + .setSeed(12345) + val corpus = sc.parallelize(tinyCorpus, 2) + val distributedModel: DistributedLDAModel = lda.run(corpus).asInstanceOf[DistributedLDAModel] + val tempDir2 = Utils.createTempDir() + val path2 = tempDir2.toURI.toString + + try { + localModel.save(sc, path1) + distributedModel.save(sc, path2) + val samelocalModel = LocalLDAModel.load(sc, path1) + assert(samelocalModel.topicsMatrix === localModel.topicsMatrix) + assert(samelocalModel.k === localModel.k) + assert(samelocalModel.vocabSize === localModel.vocabSize) + + val sameDistributedModel = DistributedLDAModel.load(sc, path2) + assert(distributedModel.topicsMatrix === sameDistributedModel.topicsMatrix) + assert(distributedModel.k === sameDistributedModel.k) + assert(distributedModel.vocabSize === sameDistributedModel.vocabSize) + assert(distributedModel.iterationTimes === sameDistributedModel.iterationTimes) + } finally { + Utils.deleteRecursively(tempDir1) + Utils.deleteRecursively(tempDir2) + } + } + } private[clustering] object LDASuite { From 87d890cc105a7f41478433b28f53c9aa431db211 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 21 Jul 2015 11:18:39 -0700 Subject: [PATCH 2/4] Revert "[SPARK-9154] [SQL] codegen StringFormat" This reverts commit 7f072c3d5ec50c65d76bd9f28fac124fce96a89e. 
Revert #7546 Author: Michael Armbrust Closes #7570 from marmbrus/revert9154 and squashes the following commits: ed2c32a [Michael Armbrust] Revert "[SPARK-9154] [SQL] codegen StringFormat" --- .../expressions/stringOperations.scala | 42 +------------------ .../expressions/StringExpressionsSuite.scala | 18 ++++---- .../spark/sql/StringFunctionsSuite.scala | 10 ----- 3 files changed, 11 insertions(+), 59 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 280ae0e546358..fe57d17f1ec14 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -526,7 +526,7 @@ case class StringRPad(str: Expression, len: Expression, pad: Expression) /** * Returns the input formatted according do printf-style format strings */ -case class StringFormat(children: Expression*) extends Expression with ImplicitCastInputTypes { +case class StringFormat(children: Expression*) extends Expression with CodegenFallback { require(children.nonEmpty, "printf() should take at least 1 argument") @@ -536,10 +536,6 @@ case class StringFormat(children: Expression*) extends Expression with ImplicitC private def format: Expression = children(0) private def args: Seq[Expression] = children.tail - override def inputTypes: Seq[AbstractDataType] = - children.zipWithIndex.map(x => if (x._2 == 0) StringType else AnyDataType) - - override def eval(input: InternalRow): Any = { val pattern = format.eval(input) if (pattern == null) { @@ -555,42 +551,6 @@ case class StringFormat(children: Expression*) extends Expression with ImplicitC } } - override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - val pattern = children.head.gen(ctx) - - val argListGen = children.tail.map(x => (x.dataType, x.gen(ctx))) - val argListCode = argListGen.map(_._2.code + "\n") - - val argListString = argListGen.foldLeft("")((s, v) => { - val nullSafeString = - if (ctx.boxedType(v._1) != ctx.javaType(v._1)) { - // Java primitives get boxed in order to allow null values. - s"(${v._2.isNull}) ? (${ctx.boxedType(v._1)}) null : " + - s"new ${ctx.boxedType(v._1)}(${v._2.primitive})" - } else { - s"(${v._2.isNull}) ? 
null : ${v._2.primitive}" - } - s + "," + nullSafeString - }) - - val form = ctx.freshName("formatter") - val formatter = classOf[java.util.Formatter].getName - val sb = ctx.freshName("sb") - val stringBuffer = classOf[StringBuffer].getName - s""" - ${pattern.code} - boolean ${ev.isNull} = ${pattern.isNull}; - ${ctx.javaType(dataType)} ${ev.primitive} = ${ctx.defaultValue(dataType)}; - if (!${ev.isNull}) { - ${argListCode.mkString} - $stringBuffer $sb = new $stringBuffer(); - $formatter $form = new $formatter($sb, ${classOf[Locale].getName}.US); - $form.format(${pattern.primitive}.toString() $argListString); - ${ev.primitive} = UTF8String.fromString($sb.toString()); - } - """ - } - override def prettyName: String = "printf" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 3c2d88731beb4..96c540ab36f08 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -351,16 +351,18 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { } test("FORMAT") { - checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a") + val f = 'f.string.at(0) + val d1 = 'd.int.at(1) + val s1 = 's.int.at(2) + + val row1 = create_row("aa%d%s", 12, "cc") + val row2 = create_row(null, 12, "cc") + checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1) checkEvaluation(StringFormat(Literal("aa")), "aa", create_row(null)) - checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a") - checkEvaluation(StringFormat(Literal("aa%d%s"), 12, "cc"), "aa12cc") + checkEvaluation(StringFormat(Literal("aa%d%s"), Literal(123), Literal("a")), "aa123a", row1) - checkEvaluation(StringFormat(Literal.create(null, StringType), 12, "cc"), null) - checkEvaluation( - StringFormat(Literal("aa%d%s"), Literal.create(null, IntegerType), "cc"), "aanullcc") - checkEvaluation( - StringFormat(Literal("aa%d%s"), 12, Literal.create(null, StringType)), "aa12null") + checkEvaluation(StringFormat(f, d1, s1), "aa12cc", row1) + checkEvaluation(StringFormat(f, d1, s1), null, row2) } test("INSTR") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala index 3702e73b4e74f..d1f855903ca4b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala @@ -132,16 +132,6 @@ class StringFunctionsSuite extends QueryTest { checkAnswer( df.selectExpr("printf(a, b, c)"), Row("aa123cc")) - - val df2 = Seq(("aa%d%s".getBytes, 123, "cc")).toDF("a", "b", "c") - - checkAnswer( - df2.select(formatString($"a", $"b", $"c"), formatString("aa%d%s", "b", "c")), - Row("aa123cc", "aa123cc")) - - checkAnswer( - df2.selectExpr("printf(a, b, c)"), - Row("aa123cc")) } test("string instr function") { From 9ba7c64decfc92853bd281e9e7bfb95211080dd4 Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Tue, 21 Jul 2015 11:52:52 -0700 Subject: [PATCH 3/4] [SPARK-8357] Fix unsafe memory leak on empty inputs in GeneratedAggregate This patch fixes a managed memory leak in GeneratedAggregate. 
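Before the detailed explanation that follows, a minimal sketch (editor-added, not part of the patch) of the scenario being fixed: a grouped aggregation over an empty input with the code-generated, unsafe path enabled, mirroring the regression tests added in this patch. The temp table and column names are illustrative.

```scala
import org.apache.spark.sql.SQLConf
import org.apache.spark.sql.test.TestSQLContext
import org.apache.spark.sql.test.TestSQLContext.implicits._

// Enable the code-generated, unsafe aggregation path, as the new AggregateSuite does.
TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, true)
TestSQLContext.setConf(SQLConf.UNSAFE_ENABLED, true)

// Grouped aggregation over an empty input: before this fix, GeneratedAggregate
// allocated an UnsafeFixedWidthAggregationMap for this query and never freed it,
// because next() was never called on the empty result iterator.
Seq.empty[(Int, Int)].toDF("a", "b").registerTempTable("t")
TestSQLContext.sql("select b, count(a) from t group by b").collect()
```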
The leak occurs when the unsafe aggregation path is used to perform grouped aggregation on an empty input; in this case, GeneratedAggregate allocates an UnsafeFixedWidthAggregationMap that is never cleaned up because `next()` is never called on the aggregate result iterator. This patch fixes this by short-circuiting on empty inputs. This patch is an updated version of #6810. Closes #6810. Author: navis.ryu Author: Josh Rosen Closes #7560 from JoshRosen/SPARK-8357 and squashes the following commits: 3486ce4 [Josh Rosen] Some minor cleanup c649310 [Josh Rosen] Revert SparkPlan change: 3c7db0f [Josh Rosen] Merge remote-tracking branch 'origin/master' into SPARK-8357 adc8239 [Josh Rosen] Back out Projection changes. c5419b3 [navis.ryu] addressed comments 143e1ef [navis.ryu] fixed format & added test for CCE case 735972f [navis.ryu] used new conf apis 1a02a55 [navis.ryu] Rolled-back test-conf cleanup & fixed possible CCE & added more tests 51178e8 [navis.ryu] addressed comments 4d326b9 [navis.ryu] fixed test fails 15c5afc [navis.ryu] added a test as suggested by JoshRosen d396589 [navis.ryu] added comments 1b07556 [navis.ryu] [SPARK-8357] [SQL] Memory leakage on unsafe aggregation path with empty input --- .../sql/execution/GeneratedAggregate.scala | 14 +++++- .../org/apache/spark/sql/SQLQuerySuite.scala | 9 ++++ .../spark/sql/execution/AggregateSuite.scala | 48 +++++++++++++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala index c069da016f9f0..ecde9c57139a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/GeneratedAggregate.scala @@ -266,7 +266,18 @@ case class GeneratedAggregate( val joinedRow = new JoinedRow3 - if (groupingExpressions.isEmpty) { + if (!iter.hasNext) { + // This is an empty input, so return early so that we do not allocate data structures + // that won't be cleaned up (see SPARK-8357). + if (groupingExpressions.isEmpty) { + // This is a global aggregate, so return an empty aggregation buffer. + val resultProjection = resultProjectionBuilder() + Iterator(resultProjection(newAggregationBuffer(EmptyRow))) + } else { + // This is a grouped aggregate, so return an empty iterator. + Iterator[InternalRow]() + } + } else if (groupingExpressions.isEmpty) { // TODO: Codegening anything other than the updateProjection is probably over kill. 
val buffer = newAggregationBuffer(EmptyRow).asInstanceOf[MutableRow] var currentRow: InternalRow = null @@ -280,6 +291,7 @@ case class GeneratedAggregate( val resultProjection = resultProjectionBuilder() Iterator(resultProjection(buffer)) } else if (unsafeEnabled) { + assert(iter.hasNext, "There should be at least one row for this path") log.info("Using Unsafe-based aggregator") val aggregationMap = new UnsafeFixedWidthAggregationMap( newAggregationBuffer, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 61d5f2061ae18..beee10173fbc4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -648,6 +648,15 @@ class SQLQuerySuite extends QueryTest with BeforeAndAfterAll with SQLTestUtils { Row(2, 1, 2, 2, 1)) } + test("count of empty table") { + withTempTable("t") { + Seq.empty[(Int, Int)].toDF("a", "b").registerTempTable("t") + checkAnswer( + sql("select count(a) from t"), + Row(0)) + } + } + test("inner join where, one match per row") { checkAnswer( sql("SELECT * FROM upperCaseData JOIN lowerCaseData WHERE n = N"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala new file mode 100644 index 0000000000000..20def6bef0c17 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/AggregateSuite.scala @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution + +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.test.TestSQLContext + +class AggregateSuite extends SparkPlanTest { + + test("SPARK-8357 unsafe aggregation path should not leak memory with empty input") { + val codegenDefault = TestSQLContext.getConf(SQLConf.CODEGEN_ENABLED) + val unsafeDefault = TestSQLContext.getConf(SQLConf.UNSAFE_ENABLED) + try { + TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, true) + TestSQLContext.setConf(SQLConf.UNSAFE_ENABLED, true) + val df = Seq.empty[(Int, Int)].toDF("a", "b") + checkAnswer( + df, + GeneratedAggregate( + partial = true, + Seq(df.col("b").expr), + Seq(Alias(Count(df.col("a").expr), "cnt")()), + unsafeEnabled = true, + _: SparkPlan), + Seq.empty + ) + } finally { + TestSQLContext.setConf(SQLConf.CODEGEN_ENABLED, codegenDefault) + TestSQLContext.setConf(SQLConf.UNSAFE_ENABLED, unsafeDefault) + } + } +} From 60c0ce134d90ef18852ed2c637d2f240b7f99ab9 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 21 Jul 2015 11:56:38 -0700 Subject: [PATCH 4/4] [SPARK-8906][SQL] Move all internal data source classes into execution.datasources. This way, the sources package contains only public facing interfaces. Author: Reynold Xin Closes #7565 from rxin/move-ds and squashes the following commits: 7661aff [Reynold Xin] Mima 9d5196a [Reynold Xin] Rearranged imports. 3dd7174 [Reynold Xin] [SPARK-8906][SQL] Move all internal data source classes into execution.datasources. --- project/MimaExcludes.scala | 47 +++++++++++++++++++ .../org/apache/spark/sql/DataFrame.scala | 2 +- .../apache/spark/sql/DataFrameReader.scala | 4 +- .../apache/spark/sql/DataFrameWriter.scala | 2 +- .../org/apache/spark/sql/SQLContext.scala | 9 ++-- .../spark/sql/execution/SparkStrategies.scala | 4 +- .../SqlNewHadoopRDD.scala | 9 ++-- .../datasources}/DataSourceStrategy.scala | 11 ++--- .../datasources}/LogicalRelation.scala | 7 +-- .../datasources}/PartitioningUtils.scala | 5 +- .../datasources}/commands.scala | 5 +- .../datasources}/ddl.scala | 9 ++-- .../datasources}/rules.scala | 10 ++-- .../apache/spark/sql/parquet/newParquet.scala | 5 +- .../apache/spark/sql/sources/filters.scala | 4 ++ .../apache/spark/sql/sources/interfaces.scala | 4 +- .../org/apache/spark/sql/json/JsonSuite.scala | 2 +- .../sql/parquet/ParquetFilterSuite.scala | 2 +- .../ParquetPartitionDiscoverySuite.scala | 4 +- .../sources/CreateTableAsSelectSuite.scala | 1 + .../sql/sources/ResolvedDataSourceSuite.scala | 1 + .../apache/spark/sql/hive/HiveContext.scala | 6 +-- .../spark/sql/hive/HiveMetastoreCatalog.scala | 11 +++-- .../org/apache/spark/sql/hive/HiveQl.scala | 2 +- .../spark/sql/hive/HiveStrategies.scala | 2 +- .../spark/sql/hive/execution/commands.scala | 1 + .../spark/sql/hive/orc/OrcRelation.scala | 1 + .../sql/hive/MetastoreDataSourcesSuite.scala | 2 +- .../hive/execution/HiveComparisonTest.scala | 4 +- .../sql/hive/execution/SQLQuerySuite.scala | 2 +- .../apache/spark/sql/hive/parquetSuites.scala | 2 +- .../sql/sources/hadoopFsRelationSuites.scala | 6 ++- 32 files changed, 124 insertions(+), 62 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution}/SqlNewHadoopRDD.scala (99%) rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/DataSourceStrategy.scala (98%) rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/LogicalRelation.scala (88%) rename 
sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/PartitioningUtils.scala (99%) rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/commands.scala (99%) rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/ddl.scala (99%) rename sql/core/src/main/scala/org/apache/spark/sql/{sources => execution/datasources}/rules.scala (94%) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index a2595ff6c22f4..fa36629c37a35 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -104,6 +104,53 @@ object MimaExcludes { // SPARK-7422 add argmax for sparse vectors ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.mllib.linalg.Vector.argmax") + ) ++ Seq( + // SPARK-8906 Move all internal data source classes into execution.datasources + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.ResolvedDataSource"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PreInsertCastAndRename$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTableUsingAsSelect$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.InsertIntoDataSource$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopPartition"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitioningUtils$PartitionValues$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DefaultWriterContainer"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitioningUtils$PartitionValues"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.RefreshTable$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTempTableUsing$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitionSpec"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DynamicPartitionWriterContainer"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTableUsingAsSelect"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopRDD$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DescribeCommand$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitioningUtils$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopRDD"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PreInsertCastAndRename"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.Partition$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.LogicalRelation$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitioningUtils"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.LogicalRelation"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.Partition"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.BaseWriterContainer"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PreWriteCheck"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTableUsing"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.RefreshTable"), + 
ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopRDD$NewHadoopMapPartitionsWithSplitRDD"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DataSourceStrategy$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTempTableUsing"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTempTableUsingAsSelect$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTempTableUsingAsSelect"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CreateTableUsing$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.ResolvedDataSource$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PreWriteCheck$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.InsertIntoDataSource"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.InsertIntoHadoopFsRelation"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DDLParser"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.CaseInsensitiveMap"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.InsertIntoHadoopFsRelation$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DataSourceStrategy"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.SqlNewHadoopRDD$NewHadoopMapPartitionsWithSplitRDD$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.PartitionSpec$"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DescribeCommand"), + ProblemFilters.exclude[MissingClassProblem]("org.apache.spark.sql.sources.DDLException") ) case v if v.startsWith("1.4") => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 830fba35bb7bc..323ff17357fda 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -38,8 +38,8 @@ import org.apache.spark.sql.catalyst.plans.logical.{Filter, _} import org.apache.spark.sql.catalyst.plans.{Inner, JoinType} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection, SqlParser} import org.apache.spark.sql.execution.{EvaluatePython, ExplainCommand, LogicalRDD} +import org.apache.spark.sql.execution.datasources.CreateTableUsingAsSelect import org.apache.spark.sql.json.JacksonGenerator -import org.apache.spark.sql.sources.CreateTableUsingAsSelect import org.apache.spark.sql.types._ import org.apache.spark.storage.StorageLevel import org.apache.spark.util.Utils diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index f1c1ddf898986..e9d782cdcd667 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -20,16 +20,16 @@ package org.apache.spark.sql import java.util.Properties import org.apache.hadoop.fs.Path -import org.apache.spark.{Logging, Partition} +import org.apache.spark.{Logging, Partition} import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.RDD +import 
org.apache.spark.sql.execution.datasources.{ResolvedDataSource, LogicalRelation} import org.apache.spark.sql.jdbc.{JDBCPartition, JDBCPartitioningInfo, JDBCRelation} import org.apache.spark.sql.json.JSONRelation import org.apache.spark.sql.parquet.ParquetRelation2 -import org.apache.spark.sql.sources.{LogicalRelation, ResolvedDataSource} import org.apache.spark.sql.types.StructType /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 3e7b9cd7976c3..ee0201a9d4cb2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -22,8 +22,8 @@ import java.util.Properties import org.apache.spark.annotation.Experimental import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable +import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, ResolvedDataSource} import org.apache.spark.sql.jdbc.{JDBCWriteDetails, JdbcUtils} -import org.apache.spark.sql.sources.{ResolvedDataSource, CreateTableUsingAsSelect} /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 2dda3ad1211fa..8b4528b5d52fe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -39,8 +39,9 @@ import org.apache.spark.sql.catalyst.optimizer.{DefaultOptimizer, Optimizer} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.{InternalRow, ParserDialect, _} -import org.apache.spark.sql.execution.{Filter, _} -import org.apache.spark.sql.sources._ +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils @@ -146,11 +147,11 @@ class SQLContext(@transient val sparkContext: SparkContext) new Analyzer(catalog, functionRegistry, conf) { override val extendedResolutionRules = ExtractPythonUDFs :: - sources.PreInsertCastAndRename :: + PreInsertCastAndRename :: Nil override val extendedCheckRules = Seq( - sources.PreWriteCheck(catalog) + datasources.PreWriteCheck(catalog) ) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index 240332a80af0f..8cef7f200d2dc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution +import org.apache.spark.sql.{SQLContext, Strategy, execution} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning._ @@ -25,10 +26,9 @@ import org.apache.spark.sql.catalyst.plans.logical.{BroadcastHint, LogicalPlan} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.columnar.{InMemoryColumnarTableScan, InMemoryRelation} import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand} +import 
org.apache.spark.sql.execution.datasources.{CreateTableUsing, CreateTempTableUsing, DescribeCommand => LogicalDescribeCommand, _} import org.apache.spark.sql.parquet._ -import org.apache.spark.sql.sources.{CreateTableUsing, CreateTempTableUsing, DescribeCommand => LogicalDescribeCommand, _} import org.apache.spark.sql.types._ -import org.apache.spark.sql.{SQLContext, Strategy, execution} private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { self: SQLContext#SparkPlanner => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala similarity index 99% rename from sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala index 2bdc341021256..e1c1a6c06268f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/SqlNewHadoopRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SqlNewHadoopRDD.scala @@ -15,24 +15,23 @@ * limitations under the License. */ -package org.apache.spark.sql.sources +package org.apache.spark.sql.execution import java.text.SimpleDateFormat import java.util.Date +import org.apache.spark.{Partition => SparkPartition, _} import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, FileSplit} -import org.apache.spark.broadcast.Broadcast - -import org.apache.spark.{Partition => SparkPartition, _} import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.executor.DataReadMethod import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil -import org.apache.spark.rdd.{RDD, HadoopRDD} import org.apache.spark.rdd.NewHadoopRDD.NewHadoopMapPartitionsWithSplitRDD +import org.apache.spark.rdd.{HadoopRDD, RDD} import org.apache.spark.storage.StorageLevel import org.apache.spark.util.{SerializableConfiguration, Utils} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala similarity index 98% rename from sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 70c9e06927582..2b400926177fe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -15,22 +15,21 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources +package org.apache.spark.sql.execution.datasources import org.apache.spark.{Logging, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} -import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions +import org.apache.spark.sql.catalyst.{InternalRow, expressions} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructType} -import org.apache.spark.sql.{SaveMode, Strategy, execution, sources} -import org.apache.spark.util.{SerializableConfiguration, Utils} +import org.apache.spark.sql.{SaveMode, Strategy, execution, sources, _} import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.util.{SerializableConfiguration, Utils} /** * A Strategy for planning scans over data sources defined using the sources API. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala similarity index 88% rename from sql/core/src/main/scala/org/apache/spark/sql/sources/LogicalRelation.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala index f374abffdd505..a7123dc845fa2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/LogicalRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala @@ -14,11 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.sources +package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeMap} -import org.apache.spark.sql.catalyst.plans.logical.{Statistics, LeafNode, LogicalPlan} +import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference} +import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} +import org.apache.spark.sql.sources.BaseRelation /** * Used to link a [[BaseRelation]] in to a logical query plan. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala similarity index 99% rename from sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala index 8b2a45d8e970a..6b4a359db22d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/PartitioningUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala @@ -15,9 +15,9 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources +package org.apache.spark.sql.execution.datasources -import java.lang.{Double => JDouble, Float => JFloat, Integer => JInteger, Long => JLong} +import java.lang.{Double => JDouble, Long => JLong} import java.math.{BigDecimal => JBigDecimal} import scala.collection.mutable.ArrayBuffer @@ -29,6 +29,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Cast, Literal} import org.apache.spark.sql.types._ + private[sql] case class Partition(values: InternalRow, path: String) private[sql] case class PartitionSpec(partitionColumns: StructType, partitions: Seq[Partition]) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala similarity index 99% rename from sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala index 5c6ef2dc90c73..84a0441e145c5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/commands.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql.sources +package org.apache.spark.sql.execution.datasources import java.util.{Date, UUID} @@ -24,7 +24,6 @@ import scala.collection.JavaConversions.asScalaIterator import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter => MapReduceFileOutputCommitter, FileOutputFormat} - import org.apache.spark._ import org.apache.spark.mapred.SparkHadoopMapRedUtil import org.apache.spark.mapreduce.SparkHadoopMapReduceUtil @@ -35,9 +34,11 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateProjection import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.execution.RunnableCommand +import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StringType import org.apache.spark.util.SerializableConfiguration + private[sql] case class InsertIntoDataSource( logicalRelation: LogicalRelation, query: LogicalPlan, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala similarity index 99% rename from sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala index 5a8c97c773ee6..c8033d3c0470a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala @@ -15,23 +15,22 @@ * limitations under the License. 
*/ -package org.apache.spark.sql.sources +package org.apache.spark.sql.execution.datasources import scala.language.{existentials, implicitConversions} import scala.util.matching.Regex import org.apache.hadoop.fs.Path - import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.sql.catalyst.AbstractSparkSQLParser +import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext, SaveMode} import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.catalyst.{AbstractSparkSQLParser, InternalRow} import org.apache.spark.sql.execution.RunnableCommand +import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.{AnalysisException, DataFrame, SQLContext, SaveMode} import org.apache.spark.util.Utils /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala similarity index 94% rename from sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala rename to sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 40ee048e2653e..11bb49b8d83de 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -15,15 +15,15 @@ * limitations under the License. */ -package org.apache.spark.sql.sources +package org.apache.spark.sql.execution.datasources -import org.apache.spark.sql.{SaveMode, AnalysisException} -import org.apache.spark.sql.catalyst.analysis.{EliminateSubQueries, Catalog} -import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Alias} +import org.apache.spark.sql.{AnalysisException, SaveMode} +import org.apache.spark.sql.catalyst.analysis.{Catalog, EliminateSubQueries} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast} import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.sources.{BaseRelation, HadoopFsRelation, InsertableRelation} /** * A rule to do pre-insert data type casting and field renaming. 
Before we insert into diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index e683eb0126004..2f9f880c70690 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -35,15 +35,18 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName import org.apache.parquet.hadoop.util.ContextUtil import org.apache.parquet.schema.MessageType +import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD._ import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.{SqlNewHadoopPartition, SqlNewHadoopRDD} +import org.apache.spark.sql.execution.datasources.PartitionSpec import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.{SerializableConfiguration, Utils} -import org.apache.spark.{Logging, Partition => SparkPartition, SparkException} + private[sql] class DefaultSource extends HadoopFsRelationProvider { override def createRelation( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala index 24e86ca415c51..4d942e4f9287a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/filters.scala @@ -17,6 +17,10 @@ package org.apache.spark.sql.sources +//////////////////////////////////////////////////////////////////////////////////////////////////// +// This file defines all the filters that we can push down to the data sources. +//////////////////////////////////////////////////////////////////////////////////////////////////// + /** * A filter predicate for data sources. 
* diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index 2cd8b358d81c6..7cd005b959488 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.sources import scala.collection.mutable -import scala.collection.mutable.ArrayBuffer import scala.util.Try import org.apache.hadoop.conf.Configuration @@ -33,6 +32,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection import org.apache.spark.sql.execution.RDDConversions +import org.apache.spark.sql.execution.datasources.{PartitioningUtils, PartitionSpec, Partition} import org.apache.spark.sql.types.StructType import org.apache.spark.sql._ import org.apache.spark.util.SerializableConfiguration @@ -523,7 +523,7 @@ abstract class HadoopFsRelation private[sql](maybePartitionSpec: Option[Partitio }) } - private[sources] final def buildScan( + private[sql] final def buildScan( requiredColumns: Array[String], filters: Array[Filter], inputPaths: Array[String], diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 3475f9dd6787e..1d04513a44672 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -26,8 +26,8 @@ import org.scalactic.Tolerance._ import org.apache.spark.sql.{QueryTest, Row, SQLConf} import org.apache.spark.sql.TestData._ import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.json.InferSchema.compatibleType -import org.apache.spark.sql.sources.LogicalRelation import org.apache.spark.sql.types._ import org.apache.spark.util.Utils diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala index a2763c78b6450..23df102cd951d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetFilterSuite.scala @@ -24,7 +24,7 @@ import org.apache.parquet.filter2.predicate.{FilterPredicate, Operators} import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation -import org.apache.spark.sql.sources.LogicalRelation +import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, DataFrame, QueryTest, Row, SQLConf} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala index 37b0a9fbf7a4e..4f98776b91160 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetPartitionDiscoverySuite.scala @@ -28,11 +28,11 @@ import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Literal -import 
org.apache.spark.sql.sources.PartitioningUtils._
-import org.apache.spark.sql.sources.{LogicalRelation, Partition, PartitionSpec}
+import org.apache.spark.sql.execution.datasources.{LogicalRelation, PartitionSpec, Partition, PartitioningUtils}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql._
 import org.apache.spark.unsafe.types.UTF8String
+import PartitioningUtils._
 // The data where the partitioning key exists only in the directory structure.
 case class ParquetData(intField: Int, stringField: String)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
index a71088430bfd5..1907e643c85dd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/CreateTableAsSelectSuite.scala
@@ -22,6 +22,7 @@ import java.io.{File, IOException}
 import org.scalatest.BeforeAndAfterAll
 import org.apache.spark.sql.AnalysisException
+import org.apache.spark.sql.execution.datasources.DDLException
 import org.apache.spark.util.Utils
 class CreateTableAsSelectSuite extends DataSourceTest with BeforeAndAfterAll {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala
index 296b0d6f74a0c..3cbf5467b253a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/ResolvedDataSourceSuite.scala
@@ -18,6 +18,7 @@ package org.apache.spark.sql.sources
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.sql.execution.datasources.ResolvedDataSource
 class ResolvedDataSourceSuite extends SparkFunSuite {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
index 4684d48aff889..cec7685bb6859 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala
@@ -44,9 +44,9 @@ import org.apache.spark.sql.catalyst.ParserDialect
 import org.apache.spark.sql.catalyst.analysis._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUDFs, SetCommand}
+import org.apache.spark.sql.execution.datasources.{PreWriteCheck, PreInsertCastAndRename, DataSourceStrategy}
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand}
-import org.apache.spark.sql.sources.DataSourceStrategy
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
@@ -384,11 +384,11 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging {
         catalog.PreInsertionCasts ::
         ExtractPythonUDFs ::
         ResolveHiveWindowFunction ::
-        sources.PreInsertCastAndRename ::
+        PreInsertCastAndRename ::
         Nil
       override val extendedCheckRules = Seq(
-        sources.PreWriteCheck(catalog)
+        PreWriteCheck(catalog)
       )
     }
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index b15261b7914dd..0a2121c955871 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -17,6 +17,8 @@ package org.apache.spark.sql.hive
+import scala.collection.JavaConversions._
+
 import com.google.common.base.Objects
 import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}
@@ -28,6 +30,7 @@ import org.apache.hadoop.hive.ql.metadata._
 import org.apache.hadoop.hive.ql.plan.TableDesc
 import org.apache.spark.Logging
+import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.{Catalog, MultiInstanceRelation, OverrideCatalog}
 import org.apache.spark.sql.catalyst.expressions._
@@ -35,14 +38,12 @@ import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.execution.datasources
+import org.apache.spark.sql.execution.datasources.{Partition => ParquetPartition, PartitionSpec, CreateTableUsingAsSelect, ResolvedDataSource, LogicalRelation}
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.parquet.ParquetRelation2
-import org.apache.spark.sql.sources.{CreateTableUsingAsSelect, LogicalRelation, Partition => ParquetPartition, PartitionSpec, ResolvedDataSource}
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode, sources}
-/* Implicit conversions */
-import scala.collection.JavaConversions._
 private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: HiveContext)
   extends Catalog with Logging {
@@ -278,7 +279,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive
           parquetRelation.paths.toSet == pathsInMetastore.toSet &&
           logical.schema.sameType(metastoreSchema) &&
           parquetRelation.partitionSpec == partitionSpecInMetastore.getOrElse {
-            PartitionSpec(StructType(Nil), Array.empty[sources.Partition])
+            PartitionSpec(StructType(Nil), Array.empty[datasources.Partition])
           }
         if (useCached) {
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index 7fc517b646b20..f5574509b0b38 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.plans.logical
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.trees.CurrentOrigin
 import org.apache.spark.sql.execution.ExplainCommand
-import org.apache.spark.sql.sources.DescribeCommand
+import org.apache.spark.sql.execution.datasources.DescribeCommand
 import org.apache.spark.sql.hive.HiveShim._
 import org.apache.spark.sql.hive.client._
 import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable, HiveScriptIOSchema}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index 9638a8201e190..a22c3292eff94 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -30,9 +30,9 @@ import org.apache.spark.sql.catalyst.planning._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.{DescribeCommand => RunnableDescribeCommand, _}
+import org.apache.spark.sql.execution.datasources.{CreateTableUsing, CreateTableUsingAsSelect, DescribeCommand}
 import org.apache.spark.sql.hive.execution._
 import org.apache.spark.sql.parquet.ParquetRelation
-import org.apache.spark.sql.sources.{CreateTableUsing, CreateTableUsingAsSelect, DescribeCommand}
 import org.apache.spark.sql.types.StringType
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
index 71fa3e9c33ad9..a47f9a4feb21b 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.util._
 import org.apache.spark.sql.execution.RunnableCommand
+import org.apache.spark.sql.execution.datasources.{ResolvedDataSource, LogicalRelation}
 import org.apache.spark.sql.hive.HiveContext
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
index 48d35a60a759b..de63ee56dd8e6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcRelation.scala
@@ -37,6 +37,7 @@ import org.apache.spark.mapred.SparkHadoopMapRedUtil
 import org.apache.spark.rdd.{HadoopRDD, RDD}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.execution.datasources.PartitionSpec
 import org.apache.spark.sql.hive.{HiveContext, HiveInspectors, HiveMetastoreTypes, HiveShim}
 import org.apache.spark.sql.sources.{Filter, _}
 import org.apache.spark.sql.types.StructType
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
index d910af22c3dd1..e403f32efaf91 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala
@@ -28,12 +28,12 @@ import org.apache.hadoop.mapred.InvalidInputException
 import org.apache.spark.Logging
 import org.apache.spark.sql._
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.hive.client.{HiveTable, ManagedTable}
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.parquet.ParquetRelation2
-import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
index c9dd4c0935a72..efb04bf3d5097 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala
@@ -22,11 +22,11 @@ import java.io._
 import org.scalatest.{BeforeAndAfterAll, GivenWhenThen}
 import org.apache.spark.{Logging, SparkFunSuite}
-import org.apache.spark.sql.sources.DescribeCommand
-import org.apache.spark.sql.execution.{SetCommand, ExplainCommand}
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util._
+import org.apache.spark.sql.execution.{SetCommand, ExplainCommand}
+import org.apache.spark.sql.execution.datasources.DescribeCommand
 import org.apache.spark.sql.hive.test.TestHive
 /**
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
index 05a1f0094e5e1..03428265422e6 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala
@@ -23,12 +23,12 @@ import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.DefaultParserDialect
 import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries
 import org.apache.spark.sql.catalyst.errors.DialectException
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.hive.{HiveContext, HiveQLDialect, MetastoreRelation}
 import org.apache.spark.sql.parquet.ParquetRelation2
-import org.apache.spark.sql.sources.LogicalRelation
 import org.apache.spark.sql.types._
 case class Nested1(f1: Nested2)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
index 9d79a4b007d66..82a8daf8b4b09 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/parquetSuites.scala
@@ -23,12 +23,12 @@ import org.scalatest.BeforeAndAfterAll
 import org.apache.spark.sql._
 import org.apache.spark.sql.execution.{ExecutedCommand, PhysicalRDD}
+import org.apache.spark.sql.execution.datasources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.hive.execution.HiveTableScan
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.hive.test.TestHive._
 import org.apache.spark.sql.hive.test.TestHive.implicits._
 import org.apache.spark.sql.parquet.{ParquetRelation2, ParquetTableScan}
-import org.apache.spark.sql.sources.{InsertIntoDataSource, InsertIntoHadoopFsRelation, LogicalRelation}
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
index afecf9675e11f..1cef83fd5e990 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/hadoopFsRelationSuites.scala
@@ -17,10 +17,10 @@ package org.apache.spark.sql.sources
-import scala.collection.JavaConversions._
-
 import java.io.File
+import scala.collection.JavaConversions._
+
 import com.google.common.io.Files
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.Path
@@ -31,10 +31,12 @@ import org.apache.parquet.hadoop.ParquetOutputCommitter
 import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.sql._
+import org.apache.spark.sql.execution.datasources.LogicalRelation
 import org.apache.spark.sql.hive.test.TestHive
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
+
 abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils {
   override lazy val sqlContext: SQLContext = TestHive