Commit e338be4

Merge branch 'hive0.9' of https://github.com/amplab/shark into alpha-0.2.0

2 parents 3a644b4 + 314a90f
sameeragarwal committed Feb 2, 2014

Showing 139 changed files with 9,587 additions and 2,205 deletions.
40 changes: 40 additions & 0 deletions bin/dev/release_cleanup.sh
@@ -0,0 +1,40 @@
#!/bin/sh

# Copyright (C) 2012 The Regents of The University of California.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

DEVDIR="`dirname $0`"
BINDIR="`dirname $DEVDIR`"
FWDIR="`dirname $BINDIR`"

rm -rf $FWDIR/run-tests-from-scratch-workspace
rm -rf $FWDIR/test_warehouses

rm -rf $FWDIR/conf/shark-env.sh

rm -rf $FWDIR/metastore_db
rm -rf $FWDIR/derby.log

rm -rf $FWDIR/project/target $FWDIR/project/project/target

rm -rf $FWDIR/target/resolution-cache
rm -rf $FWDIR/target/streams
rm -rf $FWDIR/target/scala-*/cache
rm -rf $FWDIR/target/scala-*/classes
rm -rf $FWDIR/target/scala-*/test-classes

find $FWDIR -name ".DS_Store" -exec rm {} \;
find $FWDIR -name ".history" -exec rm {} \;

22 changes: 12 additions & 10 deletions bin/dev/run-tests-from-scratch
@@ -12,10 +12,11 @@
# Set up config vars using env vars or defaults; parse cmd line flags.
#####################################################################
SHARK_PROJ_DIR_DEFAULT="$(cd `dirname $0`/../../; pwd)"
+SBT_OPTS_DEFAULT="-Xms512M -Xmx2048M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=256m -XX:+UseCodeCacheFlushing"
SPARK_MEM_DEFAULT=4g
SHARK_MASTER_MEM_DEFAULT=4g
SPARK_KV_JAVA_OPTS_DEFAULT=("-Dspark.local.dir=/tmp " "-Dspark.kryoserializer.buffer.mb=10 ")
SPARK_GIT_URL_DEFAULT="https://github.com/mesos/spark.git"
SPARK_GIT_URL_DEFAULT="https://github.com/apache/incubator-spark.git spark"
HIVE_GIT_URL_DEFAULT="https://github.com/amplab/hive.git -b shark-0.9"
SPARK_HADOOP_VERSION_DEFAULT="1.0.4"
SPARK_WITH_YARN_DEFAULT=false
@@ -49,6 +50,10 @@ else
fi
fi

if [ "x$SBT_OPTS" == "x" ] ; then
SBT_OPTS=$SBT_OPTS_DEFAULT
fi

if [ "x$SPARK_MEM" == "x" ] ; then
export SPARK_MEM=$SPARK_MEM_DEFAULT
fi
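
Since SBT_OPTS now falls back to SBT_OPTS_DEFAULT only when unset, the sbt JVM settings can still be tuned per invocation; for example (heap values illustrative):

SBT_OPTS="-Xms512M -Xmx4096M -XX:MaxPermSize=1g" ./bin/dev/run-tests-from-scratch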
@@ -117,6 +122,7 @@ Required Options:
Optional configuration environment variables:
SHARK_PROJ_DIR (default: "$SHARK_PROJ_DIR_DEFAULT")
SCALA_HOME (default: Scala version ${SCALA_VERSION} will be downloaded and used)
+SBT_OPTS (default: "$SBT_OPTS_DEFAULT")
SPARK_MEM (default: $SPARK_MEM_DEFAULT)
SHARK_MASTER_MEM (default: $SHARK_MASTER_MEM_DEFAULT)
SPARK_JAVA_OPTS (default: "${SPARK_KV_JAVA_OPTS_DEFAULT[@]}")
@@ -226,6 +232,7 @@ fi
# Download Scala if SCALA_HOME is not specified.
####################################################################
if [ "x$SCALA_HOME" == "x" ] ; then
+  rm -rf ./scala*tgz
wget $SCALA_DOWNLOAD_PATH
tar xvfz scala*tgz
export SCALA_HOME="$WORKSPACE/scala-$SCALA_VERSION"
@@ -251,7 +258,8 @@ else
export SPARK_HADOOP_VERSION=$SPARK_HADOOP_VERSION
export SPARK_WITH_YARN=$SPARK_WITH_YARN
# Build spark and push the jars to local Ivy/Maven caches.
-  sbt/sbt clean publish-local
+  wget -nc http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/0.13.0/sbt-launch.jar
+  java $SBT_OPTS -jar sbt-launch.jar clean publish-local
popd
fi
export SPARK_HOME="$WORKSPACE/spark"
@@ -274,17 +282,11 @@ export HADOOP_HOME="$WORKSPACE/hadoop-${SPARK_HADOOP_VERSION}"
# Download and build Hive.
#####################################################################
if $SKIP_HIVE ; then
if [ ! -e "hive" -o ! -e "hive-warehouse" ] ; then
echo "hive and hive-warehouse dirs must exist when skipping Hive download and build stage."
if [ ! -e "hive" ] ; then
echo "hive dir must exist when skipping Hive download and build stage."
exit -1
fi
else
-  # Setup the Hive warehouse directory.
-  HIVE_WAREHOUSE=./hive-warehouse
-  rm -rf $HIVE_WAREHOUSE
-  mkdir -p $HIVE_WAREHOUSE
-  chmod 0777 $HIVE_WAREHOUSE

rm -rf hive
git clone $HIVE_GIT_URL
pushd hive
4 changes: 0 additions & 4 deletions bin/ext/sharkserver.sh
@@ -18,10 +18,6 @@
THISSERVICE=sharkserver
export SERVICE_LIST="${SERVICE_LIST}${THISSERVICE} "

-# Use Java to launch Shark otherwise the unit tests cannot properly kill
-# the server process.
-export SHARK_LAUNCH_WITH_JAVA=1

sharkserver() {
echo "Starting the Shark Server"
exec $FWDIR/run shark.SharkServer "$@"
5 changes: 5 additions & 0 deletions conf/blinkdb-env.sh.template
@@ -39,6 +39,11 @@ export HIVE_HOME=""
# Only required if using Mesos:
#export MESOS_NATIVE_LIBRARY=/usr/local/lib/libmesos.so

+# Only required if running Shark with Spark on YARN
+#export SHARK_EXEC_MODE=yarn
+#export SPARK_ASSEMBLY_JAR=
+#export SHARK_ASSEMBLY_JAR=

# (Optional) Extra classpath
#export SPARK_LIBRARY_PATH=""
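
Filled in, the new YARN block above might look like the following (paths and version strings are illustrative, not part of the template):

export SHARK_EXEC_MODE=yarn
export SPARK_ASSEMBLY_JAR=/opt/spark/assembly/target/scala-2.10/spark-assembly-0.9.0-incubating-hadoop1.0.4.jar
export SHARK_ASSEMBLY_JAR=/opt/shark/target/scala-2.10/shark-assembly-0.9.0-hive0.9-SNAPSHOT-hadoop1.0.4.jar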

43 changes: 29 additions & 14 deletions project/SharkBuild.scala
@@ -21,23 +21,32 @@ import Keys._
import sbtassembly.Plugin._
import AssemblyKeys._

+import scala.util.Properties.{ envOrNone => env }

object SharkBuild extends Build {

val BLINKDB_VERSION = "0.1.0-SNAPSHOT"

// Shark version
val SHARK_VERSION = "0.8.0-SNAPSHOT"
val SHARK_VERSION = "0.9.0-hive0.9-SNAPSHOT"

val SPARK_VERSION = "0.8.0-SNAPSHOT"
val SPARK_VERSION = "0.9.0-incubating"

val SCALA_VERSION = "2.9.3"
val SCALA_VERSION = "2.10.3"

// Hadoop version to build against. For example, "0.20.2", "0.20.205.0", or
// "1.0.1" for Apache releases, or "0.20.2-cdh3u3" for Cloudera Hadoop.
-  val HADOOP_VERSION = "1.0.4"
+  val DEFAULT_HADOOP_VERSION = "1.0.4"

+  lazy val hadoopVersion = env("SHARK_HADOOP_VERSION") orElse
+    env("SPARK_HADOOP_VERSION") getOrElse
+    DEFAULT_HADOOP_VERSION
+
+  // Whether to build Shark with Yarn support
+  val YARN_ENABLED = env("SHARK_YARN").getOrElse("false").toBoolean

// Whether to build Shark with Tachyon jar.
-  val TACHYON_ENABLED = false
+  val TACHYON_ENABLED = true

lazy val root = Project(
id = "root",
@@ -47,29 +56,34 @@ object SharkBuild extends Build {
val excludeKyro = ExclusionRule(organization = "de.javakaffee")
val excludeHadoop = ExclusionRule(organization = "org.apache.hadoop")
val excludeNetty = ExclusionRule(organization = "org.jboss.netty")
+  val excludeCurator = ExclusionRule(organization = "org.apache.curator")
+  val excludeJackson = ExclusionRule(organization = "org.codehaus.jackson")
+  val excludeAsm = ExclusionRule(organization = "asm")
+  val excludeSnappy = ExclusionRule(organization = "org.xerial.snappy")

def coreSettings = Defaults.defaultSettings ++ Seq(

name := "shark",
organization := "edu.berkeley.cs.amplab",
version := SHARK_VERSION,
scalaVersion := SCALA_VERSION,
scalacOptions := Seq("-deprecation", "-unchecked", "-optimize"),
scalacOptions := Seq("-deprecation", "-unchecked", "-optimize", "-feature", "-Yinline-warnings"),
parallelExecution in Test := false,

// Download managed jars into lib_managed.
retrieveManaged := true,
resolvers ++= Seq(
"Typesafe Repository" at "http://repo.typesafe.com/typesafe/releases/",
"JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/",
"Spray Repository" at "http://repo.spray.cc/",
"Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/",
"Local Maven" at Path.userHome.asFile.toURI.toURL + ".m2/repository"
),

fork := true,
javaOptions += "-XX:MaxPermSize=512m",
javaOptions += "-Xmx2g",
javaOptions += "-Dsun.io.serialization.extendedDebugInfo=true",

testOptions in Test += Tests.Argument("-oF"), // Full stack trace on test failures

testListeners <<= target.map(
t => Seq(new eu.henkelmann.sbt.JUnitXmlTestsListener(t.getAbsolutePath))),
@@ -102,7 +116,7 @@ object SharkBuild extends Build {
"org.apache.spark" %% "spark-core" % SPARK_VERSION,
"org.apache.spark" %% "spark-repl" % SPARK_VERSION,
"com.google.guava" % "guava" % "14.0.1",
"org.apache.hadoop" % "hadoop-client" % HADOOP_VERSION excludeAll(excludeNetty),
"org.apache.hadoop" % "hadoop-client" % hadoopVersion excludeAll(excludeJackson, excludeNetty, excludeAsm) force(),
// See https://code.google.com/p/guava-libraries/issues/detail?id=1095
"com.google.code.findbugs" % "jsr305" % "1.3.+",

@@ -114,21 +128,22 @@ object SharkBuild extends Build {
// Test infrastructure
"org.scalatest" %% "scalatest" % "1.9.1" % "test",
"junit" % "junit" % "4.10" % "test",
"net.java.dev.jets3t" % "jets3t" % "0.9.0",
"net.java.dev.jets3t" % "jets3t" % "0.7.1",
"com.novocode" % "junit-interface" % "0.8" % "test") ++
-    (if (TACHYON_ENABLED) Some("org.tachyonproject" % "tachyon" % "0.3.0-SNAPSHOT" excludeAll(excludeKyro, excludeHadoop) ) else None).toSeq
-  )
+    (if (YARN_ENABLED) Some("org.apache.spark" %% "spark-yarn" % SPARK_VERSION) else None).toSeq ++
+    (if (TACHYON_ENABLED) Some("org.tachyonproject" % "tachyon" % "0.3.0" excludeAll(excludeKyro, excludeHadoop, excludeCurator, excludeJackson, excludeNetty, excludeAsm)) else None).toSeq
+  ) ++ org.scalastyle.sbt.ScalastylePlugin.Settings

def assemblyProjSettings = Seq(
name := "shark-assembly",
-    jarName in assembly <<= version map { v => "shark-assembly-" + v + "-hadoop" + HADOOP_VERSION + ".jar" }
+    jarName in assembly <<= version map { v => "shark-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" }
) ++ assemblySettings ++ extraAssemblySettings

def extraAssemblySettings() = Seq(
test in assembly := {},
mergeStrategy in assembly := {
case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard
case "META-INF/services/org.apache.hadoop.fs.FileSystem" => MergeStrategy.concat
case "reference.conf" => MergeStrategy.concat
case _ => MergeStrategy.first
}
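
Because hadoopVersion and YARN_ENABLED now read from the environment, the Hadoop profile becomes a build-time switch; a sketch of the intended usage (version strings illustrative):

# Build against a specific Hadoop release
SHARK_HADOOP_VERSION=2.0.0-mr1-cdh4.2.0 sbt/sbt clean package
# Build with YARN support, which adds the spark-yarn dependency
SHARK_YARN=true SHARK_HADOOP_VERSION=2.2.0 sbt/sbt assembly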
4 changes: 4 additions & 0 deletions project/plugins.sbt
@@ -15,6 +15,8 @@

addSbtPlugin("org.ensime" % "ensime-sbt-cmd" % "0.1.1")

addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.3.2")

addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "2.2.0")

addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.4.0")
@@ -24,3 +26,5 @@ addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.9.2")
resolvers += Resolver.url(
"sbt-plugin-releases",
new URL("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases/"))(Resolver.ivyStylePatterns)

resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"
53 changes: 34 additions & 19 deletions run
@@ -1,9 +1,9 @@
#!/bin/bash

# This file is used to launch Shark on the master.
-export SCALA_VERSION=2.9.3
-SHARK_VERSION=0.8.0-SNAPSHOT
-BLINKDB_VERSION=0.1.0-SNAPSHOT
+export SCALA_VERSION=2.10
+SHARK_VERSION=0.9.0-SNAPSHOT
+BLINKDB_VERSION=0.2.0-SNAPSHOT

# Figure out where the framework is installed
FWDIR="$(cd `dirname $0`; pwd)"
@@ -48,6 +48,26 @@ if [ -n "$MASTER" ] ; then
fi
fi

+# Check for Shark-with-Spark-on-YARN params
+if [ "x$SHARK_EXEC_MODE" == "xyarn" ] ; then
+  if [ "x$SPARK_ASSEMBLY_JAR" == "x" ] ; then
+    echo "No SPARK_ASSEMBLY_JAR specified. Please set SPARK_ASSEMBLY_JAR for Spark on YARN mode."
+    exit 1
+  else
+    export SPARK_JAR=$SPARK_ASSEMBLY_JAR
+  fi
+
+  if [ "x$SHARK_ASSEMBLY_JAR" == "x" ] ; then
+    echo "No SHARK_ASSEMBLY_JAR specified. Please set SHARK_ASSEMBLY_JAR for Spark on YARN mode."
+    exit 1
+  else
+    export SPARK_YARN_APP_JAR=$SHARK_ASSEMBLY_JAR
+  fi
+
+  # Use yarn-client mode for the interactive shell.
+  export MASTER=yarn-client
+fi

# Check for optionally specified configuration file path
if [ "x$HIVE_CONF_DIR" == "x" ] ; then
HIVE_CONF_DIR="$HIVE_HOME/conf"
@@ -110,9 +130,10 @@ SPARK_CLASSPATH+=":$SHARK_HOME/target/scala-$SCALA_VERSION/test-classes"


if [ "x$HADOOP_HOME" == "x" ] ; then
echo "No HADOOP_HOME specified. Shark will run in local-mode"
echo "No HADOOP_HOME specified. Shark will run in local-mode"
else
SPARK_CLASSPATH+=:$HADOOP_HOME/conf
SPARK_CLASSPATH+=:$HADOOP_HOME/etc/hadoop
SPARK_CLASSPATH+=:$HADOOP_HOME/conf
fi


@@ -141,22 +162,16 @@ export JAVA_OPTS
export ANT_OPTS=$JAVA_OPTS

if [ "x$RUNNER" == "x" ] ; then
if [ "$SHARK_LAUNCH_WITH_JAVA" == "1" ]; then
CLASSPATH+=":$SCALA_HOME/lib/scala-library.jar"
CLASSPATH+=":$SCALA_HOME/lib/scala-compiler.jar"
CLASSPATH+=":$SCALA_HOME/lib/jline.jar"
if [ -n "$JAVA_HOME" ]; then
RUNNER="${JAVA_HOME}/bin/java"
else
RUNNER=java
fi
# The JVM doesn't read JAVA_OPTS by default so we need to pass it in
EXTRA_ARGS="$JAVA_OPTS"
CLASSPATH+=":$SCALA_HOME/lib/scala-library.jar"
CLASSPATH+=":$SCALA_HOME/lib/scala-compiler.jar"
CLASSPATH+=":$SCALA_HOME/lib/jline.jar"
if [ -n "$JAVA_HOME" ]; then
RUNNER="${JAVA_HOME}/bin/java"
else
SCALA=${SCALA_HOME}/bin/scala
RUNNER="$SCALA -cp \"$CLASSPATH\""
EXTRA_ARGS=""
RUNNER=java
fi
# The JVM doesn't read JAVA_OPTS by default so we need to pass it in
EXTRA_ARGS="$JAVA_OPTS"
fi

exec $RUNNER $EXTRA_ARGS "$@"
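
Taken together with the YARN check above, a launch in yarn-client mode would look roughly like this (all paths hypothetical); run exports MASTER=yarn-client and hands the named class to the JVM:

export HADOOP_HOME=/opt/hadoop-2.2.0
export SHARK_EXEC_MODE=yarn
export SPARK_ASSEMBLY_JAR=/opt/spark/spark-assembly-0.9.0-incubating-hadoop2.2.0.jar
export SHARK_ASSEMBLY_JAR=/opt/shark/shark-assembly-0.9.0-SNAPSHOT-hadoop2.2.0.jar
./run shark.SharkCliDriver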
6 changes: 3 additions & 3 deletions sbt/sbt
@@ -5,9 +5,9 @@ if [ -e $BLINKDB_CONF_DIR/blinkdb-env.sh ] ; then
. $BLINKDB_CONF_DIR/blinkdb-env.sh
fi

if [[ "$@" == *"test"* ]]; then
if [ "x$HIVE_DEV_HOME" == "x" ]; then
echo "No HIVE_DEV_HOME specified. Required for tests. Please set HIVE_DEV_HOME."
if [[ "$@" == *"test"* ]] || [[ "$@" == "eclipse" ]]; then
if [[ "x$HIVE_DEV_HOME" == "x" ]]; then
echo "No HIVE_DEV_HOME specified. Required for tests and eclipse. Please set HIVE_DEV_HOME."
exit 1
fi
fi
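
Both guarded invocations then require the variable up front (path illustrative):

HIVE_DEV_HOME=/path/to/amplab-hive sbt/sbt test
HIVE_DEV_HOME=/path/to/amplab-hive sbt/sbt eclipse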
