This repository has been archived by the owner on May 22, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 44
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Separate spark functionality from engine
Signed-off-by: egor <egor@sourced.tech>
- Loading branch information
egor
committed
Mar 2, 2018
1 parent
eb1f8b9
commit 99a3ec3
Showing
6 changed files
with
143 additions
and
101 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
from sourced.ml.utils.bigartm import install_bigartm | ||
from sourced.ml.utils.engine import create_engine, create_spark, add_engine_args, add_spark_args, \ | ||
EngineConstants | ||
from sourced.ml.utils.spark import add_spark_args, create_spark, assemble_spark_config, \ | ||
SparkDefault | ||
from sourced.ml.utils.engine import add_engine_args, create_engine, EngineConstants | ||
from sourced.ml.utils.pickleable_logger import PickleableLogger |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import logging | ||
import os | ||
import sys | ||
|
||
os.environ["PYSPARK_PYTHON"] = sys.executable | ||
|
||
import pyspark # nopep8 | ||
from pyspark.sql import SparkSession # nopep8 | ||
|
||
|
||
class SparkDefault:
    """
    Default arguments for create_spark function and __main__
    """
    # Run Spark locally, using all available cores.
    MASTER_ADDRESS = "local[*]"
    # Scratch directory for Spark shuffle/spill files.
    LOCAL_DIR = "/tmp/spark"
    # Default Spark log verbosity (see add_spark_args() choices).
    LOG_LEVEL = "WARN"
    # Extra "key=value" configuration entries; empty by default.
    CONFIG = []
    # Extra Spark packages to load; empty by default.
    PACKAGES = []
|
||
|
||
def add_spark_args(my_parser, default_packages=None):
    """
    Register the common Spark-related command line arguments on a parser.

    :param my_parser: argparse.ArgumentParser (or compatible) to extend in place.
    :param default_packages: default value for --package; when None, falls back
        to SparkDefault.PACKAGES.
    :return: None — the parser is modified in place.
    """
    my_parser.add_argument(
        "-s", "--spark", default=SparkDefault.MASTER_ADDRESS,
        help="Spark's master address.")
    my_parser.add_argument(
        "--config", nargs="+", default=SparkDefault.CONFIG,
        help="Spark configuration (key=value).")
    my_parser.add_argument(
        "-m", "--memory",
        help="Handy memory config for spark. -m 4G,10G,2G is equivalent to "
             "--config spark.executor.memory=4G "
             "--config spark.driver.memory=10G "
             # BUGFIX: a space was missing here, rendering "...=2G.Numbers" in --help.
             "--config spark.driver.maxResultSize=2G. "
             "Numbers are floats separated by commas.")
    if default_packages is None:
        default_packages = SparkDefault.PACKAGES
    my_parser.add_argument(
        "--package", nargs="+", default=default_packages, dest="packages",
        help="Additional Spark packages.")
    my_parser.add_argument(
        "--spark-local-dir", default=SparkDefault.LOCAL_DIR,
        help="Spark local directory.")
    my_parser.add_argument("--spark-log-level", default=SparkDefault.LOG_LEVEL, choices=(
        "ALL", "DEBUG", "ERROR", "FATAL", "INFO", "OFF", "TRACE", "WARN"),
        help="Spark log level")
    # Valid --persist choices are the StorageLevel attribute names (dunders skipped).
    persistences = [att for att in pyspark.StorageLevel.__dict__.keys() if "__" not in att]
    my_parser.add_argument(
        "--persist", default=None, choices=persistences,
        help="Spark persistence type (StorageLevel.*).")
|
||
|
||
def create_spark(session_name,
                 spark=SparkDefault.MASTER_ADDRESS,
                 spark_local_dir=SparkDefault.LOCAL_DIR,
                 config=SparkDefault.CONFIG,
                 packages=SparkDefault.PACKAGES,
                 spark_log_level=SparkDefault.LOG_LEVEL,
                 **_):  # surplus keyword arguments are accepted and ignored
    """
    Create (or fetch the existing) SparkSession configured with the given
    master address, local directory, packages and extra "key=value" options.

    :param session_name: Spark application name.
    :param spark: Spark master address.
    :param spark_local_dir: Spark local directory.
    :param config: iterable of "key=value" configuration entries.
    :param packages: additional Spark packages to load.
    :param spark_log_level: log level to set on the Spark context.
    :return: the SparkSession instance.
    """
    logging.getLogger("spark").info("Starting %s on %s", session_name, spark)
    builder = (SparkSession.builder
               .master(spark)
               .appName(session_name)
               .config("spark.jars.packages", ",".join(packages))
               .config("spark.local.dir", spark_local_dir))
    for entry in config:
        key, value = entry.split("=", 1)
        builder = builder.config(key, value)
    session = builder.getOrCreate()
    session.sparkContext.setLogLevel(spark_log_level)
    # Hide py4j verbose logging (It appears in travis mostly)
    logging.getLogger("py4j").setLevel(logging.WARNING)
    return session
|
||
|
||
def assemble_spark_config(config=None, packages=None, memory: str = ""):
    """
    Assemble configuration for a Spark session.

    :param config: list of "key=value" entries to send to the Spark session;
        None means an empty list.
    :param packages: list of Spark packages to send to the Spark session;
        None means an empty list.
    :param memory: comma-separated memory spec "<executor>,<driver>,<maxResultSize>"
        (e.g. "4G,10G,2G"); an empty string adds no memory configuration.
    :return: tuple (config, packages) where config includes the memory entries.
    :raises ValueError: if *memory* is non-empty and does not hold exactly
        3 comma-separated values.
    """
    if config is None:
        config = []
    if packages is None:
        packages = []
    if memory:
        values = memory.split(",")
        # Raise instead of assert: asserts are stripped under `python -O`,
        # which would let a malformed -m value slip through silently.
        if len(values) != 3:
            raise ValueError(
                "Expected 3 memory parameters but got {}. "
                "Please check --help to see how -m/--memory should be used."
                .format(len(values)))
        # Non-mutating: callers' input list stays untouched.
        config = config + [
            "spark.executor.memory=" + values[0],
            "spark.driver.memory=" + values[1],
            "spark.driver.maxResultSize=" + values[2],
        ]
    return config, packages