### Extracting the archive to ./spark-home

In [1]:
%%local

import os

os.getcwd()

'/app/examples'

In [2]:
import socket

socket.getfqdn()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,,pyspark,idle,,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'livy'

In [3]:
%load_ext livy_uploads.magics

In [5]:
%%remote_command

ls /app/tmp

spark-3.5.4-bin-spark3.5.4-scala2.12-hadoop3.1.1-v1.tgz
spark-3.5.4-bin-spark3.5.4-scala2.12-hadoop3.1.1-v2.tgz
$ process finished with return code 0


In [6]:
sc.addFile('file:/app/tmp/spark-3.5.4-bin-spark3.5.4-scala2.12-hadoop3.1.1-v1.tgz')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
from pathlib import Path
import shutil
from tempfile import TemporaryDirectory

from pyspark import SparkFiles

archive_path = SparkFiles.get('spark-3.5.4-bin-spark3.5.4-scala2.12-hadoop3.1.1-v1.tgz')

with TemporaryDirectory() as tmpdir:
    output_path = tmpdir + '/out'
    shutil.unpack_archive(archive_path, output_path)
    items = list(Path(output_path).glob('*'))
    if not items or len(items) > 1 or not items[0].is_dir():
        raise Exception(f'expected only one directory in the archive: {items}')

    try:
        shutil.rmtree('./spark-home')
    except FileNotFoundError:
        pass
    shutil.move(str(items[0]), './spark-home')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'./spark-home'

In [29]:
# monkeypatching the bin/spark-class script, so we can inject some VM args to the launched JVM itself

import re

exec_line = r'exec "${CMD[@]}"'
repl = r'''
eval set -- "${SPARK_JAVA_ADHOC_ARGS:-}"
CMD=("${CMD[0]}" "$@" "${CMD[@]:1}")
exec "${CMD[@]}"
'''

pattern = re.compile('^' + re.escape(exec_line) + '$', re.MULTILINE)

with open('./spark-home/bin/spark-class') as fp:
    content = fp.read()

if 'SPARK_JAVA_ADHOC_ARGS' not in content:
    with open('./spark-home/bin/spark-class', 'w') as fp:
        fp.write(pattern.sub(repl, content))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

3641

In [8]:
%%remote_command

find ./spark-home -mindepth 1 -maxdepth 1

./spark-home/LICENSE
./spark-home/examples
./spark-home/RELEASE
./spark-home/sbin
./spark-home/NOTICE
./spark-home/conf
./spark-home/bin
./spark-home/python
./spark-home/data
./spark-home/licenses
./spark-home/jars
./spark-home/yarn
./spark-home/README.md
$ process finished with return code 0


### Sending config files

In [9]:
%send_path_to_spark -p spark/ -d ./spark-home/conf/

In [10]:
import os
import os.path
from pathlib import Path


removable_prefixes = ('SPARK_', 'PYSPARK_', 'PYTHON', 'CLASSPATH')
for env_name in set(os.environ):
    for prefix in removable_prefixes:
        if env_name.startswith(prefix):
            value = os.environ[env_name]
            if len(value) > 100:
                value = value[:100] + '...'
            print(f'removing {env_name}={value}')
            del os.environ[env_name]
            break

os.environ['SPARK_HOME'] = os.path.abspath('./spark-home')
os.environ['SPARK_CONF_DIR'] = os.path.abspath('./spark-home/conf')

python_lib = Path(os.environ['SPARK_HOME']) / 'python' / 'lib'
py_files = list(sorted(map(str, {
    *python_lib.glob('pyspark.zip'),
    *python_lib.glob('py4j-*-src.zip'),
})))
os.environ['SPARK_PYTHON_PATH'] = ':'.join(py_files)
os.environ['SPARK_PYFILES'] = ','.join(py_files)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

removing SPARK_MASTER_IP=172.21.0.2
removing SPARK_HOME=/etc/spark
removing SPARK_WORKER_DIR=/tmp/spark
removing SPARK_LOG_DIR=/var/log/spark
removing SPARK_DAEMON_JAVA_OPTS=-XX:+UseContainerSupport
removing PYTHONPATH=:/etc/spark/python/lib/pyspark.zip:/etc/spark/python/lib/py4j-0.10.9.3-src.zip:/etc/spark/python/lib...
removing SPARK_PID_DIR=/run/spark
removing SPARK_BUFFER_SIZE=65536
removing SPARK_CONF_DIR=/etc/spark/conf
removing PYTHONHASHSEED=0
removing PYSPARK_GATEWAY_PORT=40829
removing SPARK_PUBLIC_DNS=localhost
removing PYSPARK_GATEWAY_SECRET=JM1bPwLfRhO2bBhE0TewKtP0b7dzfmva1P2JZ6VM3qs=
removing SPARK_ENV_LOADED=1
removing PYSPARK_PYTHON=/usr/bin/python3
removing SPARK_SCALA_VERSION=2.12
removing PYSPARK_DRIVER_PYTHON=/usr/bin/python3
removing PYTHONUNBUFFERED=YES
removing SPARK_AUTH_SOCKET_TIMEOUT=15

In [11]:
import os
from pathlib import Path

for p in Path(os.environ['SPARK_CONF_DIR']).glob('*'):
    if '.tmpl.' not in p.name:
        continue

    new_name = p.name.replace('.tmpl', '')
    dest = p.with_name(new_name)
    print(f'templating {p} to {dest}')
    content = p.read_text()
    dest.write_text(content % os.environ)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

templating /tmp/spark-home/conf/spark-env.tmpl.sh to /tmp/spark-home/conf/spark-env.sh
4694
templating /tmp/spark-home/conf/spark-defaults.tmpl.conf to /tmp/spark-home/conf/spark-defaults.conf
1292

In [12]:
%%remote_command

cat spark-home/conf/spark-env.sh | tr ':' '\n'

#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http
//www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read when launching programs locally with
# ./bin

In [None]:
%%remote_command

hostname
kdestroy -A
klist

### Finally, try to submit

In [13]:
%%remote_command

./spark-home/bin/spark-submit --name test-spark-pi --master 'local[*]' --deploy-mode client \
--class org.apache.spark.examples.SparkPi ./spark-home/examples/jars/spark-examples_2.12-3.5.4.jar 10

25/01/11 02:23:21 INFO SparkContext: Running Spark version 3.5.4
25/01/11 02:23:21 INFO SparkContext: OS info Linux, 5.15.0-130-generic, amd64
25/01/11 02:23:21 INFO SparkContext: Java version 1.8.0_422
25/01/11 02:23:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/11 02:23:21 INFO ResourceUtils: No custom resources configured for spark.driver.
25/01/11 02:23:21 INFO SparkContext: Submitted application: Spark Pi
25/01/11 02:23:21 INFO ResourceProfile: Default ResourceProfile created, executor resources: Map(cores -> name: cores, amount: 1, script: , vendor: , memory -> name: memory, amount: 1024, script: , vendor: , offHeap -> name: offHeap, amount: 0, script: , vendor: ), task resources: Map(cpus -> name: cpus, amount: 1.0)
25/01/11 02:23:21 INFO ResourceProfile: Limiting resource is cpu
25/01/11 02:23:21 INFO ResourceProfileManager: Added ResourceProfile id: 0
25/01/11 02:23:21 INFO SecurityManager: