### Spark notebook ###

This notebook will only work in a Jupyter notebook or Jupyter lab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

global username

username = re.sub('@.*', '', getpass.getuser())


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")

        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://localhost:{sc.uiWebUrl.split(":")[-1]}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

### WordCount example notebook ###

The code below provides an example for how you would use a notebook to start spark, run a job using spark, and then stop spark.

**Steps**

- Run `start_spark()` to start a spark session in the notebook (only change the resources when advised to do so for an exercise or assignment)
- Write and run code interactively, creating additional cells as needed.
  - Define input and output paths where the output path is based on your username and is inside your home directory in HDFS.
  - Load the input from HDFS using `sc.textFile`.
  - Compute distinct word counts using `flatMap`, `map`, and `reduceByKey`.
  - Save the output back to HDFS using `saveAsTextFile`.
  - View the output in the Python console by using `collect` instead.
- Run `stop_spark()` before closing the notebook or kill your spark application by hand using the link in the [Spark UI](http://mathmadslinux2p.canterbury.ac.nz:8080/).

**Key points**

- The spark session `spark` and spark context `sc` global variables are defined by `start_spark()`. These are how we first interact with Spark.
- The `name`, `data_path`, and `word_counts_path` variables are native Python variabels that are defined for convenience only.
- The `data` and `word_counts` variables are `pyspark` objects in Python that contain references to objects managed by Spark and have methods that allow us to interact with Spark.
- The `saveAsTextFile` and `collect` methods are actions that cause a job to be started.
- The `textFile`, `flatMap`, `map`, and `reduceByKey` transformations are only carried out after the job has actually been started.

In [2]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)

25/08/13 13:47:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.uco-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:00:18Z&se=2025-09-19T16:00:18Z&spr=https&sv=2022-11-02&sr=c&sig=qtg6fCdoFz6k3EJLw7dA8D3D8wN0neAYw8yG4z4Lw2o%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.app.startTime,1755049637661
spark.app.name,rsh224 (notebook)
spark.app.submitTime,1755049637476
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"""
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.kubernetes.namespace,rsh224
spark.driver.memory,1g


In [3]:
# Define your input and output paths

# These paths have been updated to use your global username variable automatically
data_path = f"hdfs:///data/helloworld"
word_counts_path = f"hdfs:///user/{username}/word-count-spark-notebook"

print(data_path)
print(word_counts_path)

hdfs:///data/helloworld
hdfs:///user/rsh224/word-count-spark-notebook


In [8]:
# Load the data from HDFS and use flatMap, map, and reduceBykey to compute distinct word counts

data = sc.textFile(data_path)

word_counts = (
    data
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda x, y: x + y)
)

# word_counts.saveAsTextFile(word_counts_path)

In [4]:
# We can run bash commands from the notebook using ! and we can still interpolate the username automatically

!hdfs dfs -ls /user/{username}/word-count-spark-notebook/

Found 17 items
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 13:46 /user/rsh224/word-count-spark-notebook/_SUCCESS
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 13:46 /user/rsh224/word-count-spark-notebook/part-00000
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 13:46 /user/rsh224/word-count-spark-notebook/part-00001
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 13:46 /user/rsh224/word-count-spark-notebook/part-00002
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 13:46 /user/rsh224/word-count-spark-notebook/part-00003
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 13:46 /user/rsh224/word-count-spark-notebook/part-00004
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 13:46 /user/rsh224/word-count-spark-notebook/part-00005
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 13:46 /user/rsh224/word-count-spark-notebook/part-00006
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 13:46 /user/rsh224/word-count-spark-notebook/part-00007
-rw-r--r--   3 rsh224 rs

In [5]:
# We can run bash commands from the notebook using ! and we can still interpolate the username automatically

!hdfs dfs -cat /user/{username}/word-count-spark-notebook/*

('Hello', 160)
('world', 160)


In [9]:
# Collect the results to the master node and display them in this notebook

results = word_counts.collect()

print(results)



[('Hello', 160), ('world', 160)]


                                                                                

### Stop Spark ###

In [10]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()

25/08/13 13:49:00 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
