From 6e7a50ce9e098ffe719f6bbb2b39b50dbb8e072e Mon Sep 17 00:00:00 2001
From: Evan Chan
Date: Wed, 3 Jun 2015 23:31:37 -0700
Subject: [PATCH 1/7] Use spark-submit instead of compute-classpath.sh

---
 bin/server_start.sh | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/bin/server_start.sh b/bin/server_start.sh
index e29498a36..49cd774b7 100755
--- a/bin/server_start.sh
+++ b/bin/server_start.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 # Script to start the job server
+# Extra arguments will be spark-submit options, for example
+# ./server_start.sh --jars cassandra-spark-connector.jar
 set -e
 
 get_abs_script_path() {
@@ -16,7 +18,7 @@ GC_OPTS="-XX:+UseConcMarkSweepGC
          -XX:MaxPermSize=512m
          -XX:+CMSClassUnloadingEnabled "
 
-JAVA_OPTS="-Xmx5g -XX:MaxDirectMemorySize=512M
+JAVA_OPTS="-XX:MaxDirectMemorySize=512M
            -XX:+HeapDumpOnOutOfMemoryError -Djava.net.preferIPv4Stack=true
            -Dcom.sun.management.jmxremote.port=9999
            -Dcom.sun.management.jmxremote.authenticate=false
@@ -42,13 +44,6 @@ if [ -z "$SPARK_HOME" ]; then
   exit 1
 fi
 
-if [ -z "$SPARK_CONF_DIR" ]; then
-  SPARK_CONF_DIR=$SPARK_HOME/conf
-fi
-
-# Pull in other env vars in spark config, such as MESOS_NATIVE_LIBRARY
-. $SPARK_CONF_DIR/spark-env.sh
-
 pidFilePath=$appdir/$PIDFILE
 
 if [ -f "$pidFilePath" ] && kill -0 $(cat "$pidFilePath"); then
@@ -62,7 +57,7 @@ if [ -z "$LOG_DIR" ]; then
 fi
 mkdir -p $LOG_DIR
 
-LOGGING_OPTS="-Dlog4j.configuration=log4j-server.properties
+LOGGING_OPTS="-Dlog4j.configuration=$appdir/log4j-server.properties
               -DLOG_DIR=$LOG_DIR"
 
 # For Mesos
@@ -78,9 +73,7 @@ fi
 # This needs to be exported for standalone mode so drivers can connect to the Spark cluster
 export SPARK_HOME
 
-# job server jar needs to appear first so its deps take higher priority
-# need to explicitly include app dir in classpath so logging configs can be found
-CLASSPATH="$appdir:$appdir/spark-job-server.jar:$($SPARK_HOME/bin/compute-classpath.sh)"
-
-exec java -cp $CLASSPATH $GC_OPTS $JAVA_OPTS $LOGGING_OPTS $CONFIG_OVERRIDES $MAIN $conffile 2>&1 &
+$SPARK_HOME/bin/spark-submit --class $MAIN --driver-memory 5G \
+  --driver-java-options "$GC_OPTS $JAVA_OPTS $LOGGING_OPTS $CONFIG_OVERRIDES" \
+  $@ $appdir/spark-job-server.jar $conffile 2>&1 &
 echo $! > $pidFilePath
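With this patch, anything extra passed to `server_start.sh` goes straight through to `spark-submit`. A minimal usage sketch — the connector jar path and the Cassandra host setting below are illustrative placeholders, not files or defaults that the patch itself ships:

```bash
# Extra arguments to server_start.sh are forwarded to spark-submit.
# The jar path and --conf value are example placeholders only.
./bin/server_start.sh \
  --jars /path/to/spark-cassandra-connector-assembly.jar \
  --conf spark.cassandra.connection.host=127.0.0.1
```

Because `$@` is expanded ahead of `$appdir/spark-job-server.jar`, these flags are parsed by `spark-submit` as submit options rather than as arguments to the job server itself.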
From a731661f45ea3f2f08da454b76f5c41a0091e75f Mon Sep 17 00:00:00 2001
From: Evan Chan
Date: Wed, 3 Jun 2015 23:53:47 -0700
Subject: [PATCH 2/7] Add common ActorRefFactory issue

---
 doc/troubleshooting.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/troubleshooting.md b/doc/troubleshooting.md
index 238bf4e61..39fb0ccd3 100644
--- a/doc/troubleshooting.md
+++ b/doc/troubleshooting.md
@@ -33,6 +33,10 @@ after this fixed, I can run jobs submitted from a remote job server
 successfully
 
 (Thanks to @pcliu)
 
+## Exception in thread "main" java.lang.NoSuchMethodError: akka.actor.ActorRefFactory.dispatcher()Lscala/concurrent/ExecutionContextExecutor;
+
+If you are running CDH 5.3 or older, you may have an incompatible version of Akka bundled together. :( Try modifying the version of Akka included with spark-jobserver to match the one in CDH (2.2.4, I think), or upgrade to CDH 5.4. If you are on CDH 5.4, check that `sparkVersion` in `Dependencies.scala` matches CDH. Or see [issue #154](https://github.com/spark-jobserver/spark-jobserver/issues/154).
+
 ## I want to run job-server on Windows
 1. Create directory `C:\Hadoop\bin`

From a1c3b33d7bb9280c1ef2c379604aed8087dac87f Mon Sep 17 00:00:00 2001
From: Addison Higham
Date: Wed, 10 Jun 2015 15:29:23 -0600
Subject: [PATCH 3/7] make logs in spark-submit write to specified log4j config

---
 bin/server_start.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bin/server_start.sh b/bin/server_start.sh
index 49cd774b7..62382dcfb 100755
--- a/bin/server_start.sh
+++ b/bin/server_start.sh
@@ -57,7 +57,7 @@ if [ -z "$LOG_DIR" ]; then
 fi
 mkdir -p $LOG_DIR
 
-LOGGING_OPTS="-Dlog4j.configuration=$appdir/log4j-server.properties
+LOGGING_OPTS="-Dlog4j.configuration=file:$appdir/log4j-server.properties
               -DLOG_DIR=$LOG_DIR"
 
 # For Mesos
@@ -74,6 +74,7 @@ fi
 export SPARK_HOME
 
 $SPARK_HOME/bin/spark-submit --class $MAIN --driver-memory 5G \
+  --conf "spark.executor.extraJavaOptions=$LOGGING_OPTS" \
   --driver-java-options "$GC_OPTS $JAVA_OPTS $LOGGING_OPTS $CONFIG_OVERRIDES" \
   $@ $appdir/spark-job-server.jar $conffile 2>&1 &
 echo $! > $pidFilePath
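With the extra `--conf`, executors launched on the cluster are pointed at the same log4j configuration as the driver instead of Spark's default logging. A sketch of roughly what the script now runs, with example values substituted for `$SPARK_HOME`, `$appdir`, `$LOG_DIR`, and the conf file (all assumed, not taken from the patch):

```bash
# Illustrative expansion of the call server_start.sh now makes; every path is an
# example value and $MAIN is assumed to resolve to spark.jobserver.JobServer.
# GC and other driver JVM options are omitted here for brevity.
/opt/spark/bin/spark-submit --class spark.jobserver.JobServer --driver-memory 5G \
  --conf "spark.executor.extraJavaOptions=-Dlog4j.configuration=file:/opt/job-server/log4j-server.properties -DLOG_DIR=/var/log/job-server" \
  --driver-java-options "-Dlog4j.configuration=file:/opt/job-server/log4j-server.properties -DLOG_DIR=/var/log/job-server" \
  /opt/job-server/spark-job-server.jar /opt/job-server/local.conf
```

The `file:` prefix added to `-Dlog4j.configuration` makes the value an explicit URL, so log4j loads it from disk rather than trying to resolve it as a classpath resource.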
From 06b374cfe0aa80a87ab9253ce81efde29002c2ba Mon Sep 17 00:00:00 2001
From: Evan Chan
Date: Wed, 10 Jun 2015 15:24:22 -0700
Subject: [PATCH 4/7] Get rid of some dead code

---
 job-server/src/spark.jobserver/WebApi.scala | 1 -
 1 file changed, 1 deletion(-)

diff --git a/job-server/src/spark.jobserver/WebApi.scala b/job-server/src/spark.jobserver/WebApi.scala
index 02084580b..5eabcd487 100644
--- a/job-server/src/spark.jobserver/WebApi.scala
+++ b/job-server/src/spark.jobserver/WebApi.scala
@@ -42,7 +42,6 @@ class WebApi(system: ActorSystem,
   val ResultKey = "result"
 
   val contextTimeout = SparkJobUtils.getContextTimeout(config)
-  val sparkAliveWorkerThreshold = Try(config.getInt("spark.jobserver.sparkAliveWorkerThreshold")).getOrElse(1)
   val bindAddress = config.getString("spark.jobserver.bind-address")
 
   val logger = LoggerFactory.getLogger(getClass)

From 08c3ad80ca4022aec5f8c8cc4b75648df2bd8abe Mon Sep 17 00:00:00 2001
From: Evan Chan
Date: Wed, 10 Jun 2015 15:59:11 -0700
Subject: [PATCH 5/7] Log Spray config at server start, and document increasing request timeout

---
 doc/troubleshooting.md                         | 13 +++++++++++++
 job-server/src/spark.jobserver/JobServer.scala |  1 +
 2 files changed, 14 insertions(+)

diff --git a/doc/troubleshooting.md b/doc/troubleshooting.md
index 39fb0ccd3..6512db2c2 100644
--- a/doc/troubleshooting.md
+++ b/doc/troubleshooting.md
@@ -16,6 +16,19 @@ send timeout param along with your request (in secs). eg below.
 ```
 http://devsparkcluster.cloudapp.net/jobs?appName=job-server-tests&classPath=spark.jobserver.WordCountExample&sync=true&timeout=20
 ```
 
+You may need to adjust Spray's default request timeout and idle timeout, which are by default 40 secs and 60 secs. To do this, modify the configuration file in your deployed job server, adding a section like the following:
+
+```
+spray.can.server {
+  idle-timeout = 210 s
+  request-timeout = 200 s
+}
+```
+
+Then simply restart the job server.
+
+Note that the idle-timeout must be higher than request-timeout, or Spray and the job server won't start.
+
 ## Job server won't start / cannot bind to 0.0.0.0:8090
 Check that another process isn't already using that port.  If it is, you may want to start it on another port:

diff --git a/job-server/src/spark.jobserver/JobServer.scala b/job-server/src/spark.jobserver/JobServer.scala
index 8ffdebc91..bb16737cd 100644
--- a/job-server/src/spark.jobserver/JobServer.scala
+++ b/job-server/src/spark.jobserver/JobServer.scala
@@ -41,6 +41,7 @@ object JobServer {
       defaultConfig
     }
     logger.info("Starting JobServer with config {}", config.getConfig("spark").root.render())
+    logger.info("Spray config: {}", config.getConfig("spray.can.server").root.render())
     val port = config.getInt("spark.jobserver.port")
 
     // TODO: Hardcode for now to get going. Make it configurable later.

From 29fecf86490c1216db5ed5bf50210fcabbe15d8e Mon Sep 17 00:00:00 2001
From: Evan Chan
Date: Wed, 10 Jun 2015 16:20:06 -0700
Subject: [PATCH 6/7] Return a friendlier JSON response when requests time out

---
 job-server/src/spark.jobserver/WebApi.scala | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/job-server/src/spark.jobserver/WebApi.scala b/job-server/src/spark.jobserver/WebApi.scala
index 5eabcd487..08069c48c 100644
--- a/job-server/src/spark.jobserver/WebApi.scala
+++ b/job-server/src/spark.jobserver/WebApi.scala
@@ -337,6 +337,9 @@ class WebApi(system: ActorSystem,
     }
   }
 
+  override def timeoutRoute: Route =
+    complete(500, errMap("Request timed out. Try using the /jobs/<jobID>, /jobs APIs to get status/results"))
+
   private def badRequest(ctx: RequestContext, msg: String) =
     ctx.complete(StatusCodes.BadRequest, errMap(msg))
 

From 0067f561bb8e59a56616ccd89e8d3951b432f873 Mon Sep 17 00:00:00 2001
From: Evan Chan
Date: Wed, 10 Jun 2015 16:24:51 -0700
Subject: [PATCH 7/7] Add notes to deploy section for spark-submit

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 5a04fc58e..f6d676376 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,8 @@ For release notes, look in the `notes/` directory. They should also be up on [l
 
 ## Quick start / development mode
 
+NOTE: This quick start guide uses SBT to run the job server and the included test jar, but the normal development process is to create a separate project for Job Server jobs and to deploy the job server to a Spark cluster.
+
 You need to have [SBT](http://www.scala-sbt.org/release/docs/Getting-Started/Setup.html) installed.
 
 To set the current version, do something like this:
@@ -231,6 +233,8 @@ def validate(sc:SparkContext, config: Config): SparkJobValidation = {
    it to the remotes you have configured in `<environment>.sh`
 3. On the remote server, start it in the deployed directory with `server_start.sh` and stop it with `server_stop.sh`
 
+The `server_start.sh` script uses `spark-submit` under the hood and may be passed any of the standard extra arguments from `spark-submit`.
+
 NOTE: by default the assembly jar from `job-server-extras`, which includes support for SQLContext and HiveContext, is used. If you face issues with all the extra dependencies, consider modifying the install scripts to invoke `sbt job-server/assembly` instead, which doesn't include the extra dependencies.
 
 Note: to test out the deploy to a local staging dir, or package the job server for Mesos,