### Spark notebook ###

This notebook will only work in a Jupyter notebook or Jupyter lab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

global username

username = re.sub('@.*', '', getpass.getuser())


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")

        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://localhost:{sc.uiWebUrl.split(":")[-1]}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

### WordCount using the DataFrame API ###

The code below shows you how to run a simple WordCount job using Spark DataFrame API.

**Sections**

- [DataFrame API](#DataFrame-API)

**Key points**

- We can run **bash** commands directly from the **Python** notebook using `!` to prefix the command, e.g. `!hdfs dfs -ls /data/helloworld`.
- We can use the pipe operator `|` to pass the output of a bash command to another bash command. 
- The higher level SQL API and DataFrame API still make use of the underlying RDD API.
- We can use multiline strings to make it easier to write readable SQL statements that are correctly indented.
- There is always more than one way to do anything, pick a way that works for you and don't spend **too** long thinking about which is the best way.

In [2]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)

25/08/13 15:07:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.uco-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:00:18Z&se=2025-09-19T16:00:18Z&spr=https&sv=2022-11-02&sr=c&sig=qtg6fCdoFz6k3EJLw7dA8D3D8wN0neAYw8yG4z4Lw2o%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.app.name,rsh224 (notebook)
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"""
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.kubernetes.namespace,rsh224
spark.driver.memory,1g
spark.app.startTime,1755054456282
spark.kubernetes.executor.podNamePrefix,rsh224-notebook-f963e398a165c180


In [3]:
# We only need to import pypsark.sql functions and types, everything else we get from the global variables sc or spark

from pyspark.sql import functions as F  # this imports e.g. F.min, F.max, ... to avoid naming conflicts with Python min, max, ...
from pyspark.sql.types import *  # imports everything, e.g. StringType, FloatType, StructType, ...

### DataFrame API ###

The DataFrame API provides a more flexible interface to what is essentially still the SQL API.

**Key Points**

- Data is organized into named columns, DataFrame = Dataset[Row]
- Data is untyped
- Only syntax errors are caught at compile time (not analysis errors)
- Similar interface to pandas in Python or data frames in R

**Simple**

- Use `spark.read.csv` directly.
- There is one `data` variable and one `word_counts` variable, all transformations are defined in one multiline command.

In [4]:
# First we define our data input and output paths and a DataFrame representing the data in HDFS

# These paths have been updated to use your global username variable automatically
data_path = f"hdfs:///data/helloworld"
word_counts_path = f"hdfs:///user/{username}/word-count-spark-dataframe-api"

# This command uses the Python variable spark = SparkSession to define a DataFrame representing the data in HDFS
# Under the hood, Spark creates a DataFrame (but doesn't actually load the data yet)
# Here in our notebook, we store a Python reference to the DataFrame in a Python variable data = pyspark.sql.DataFrame 
data = spark.read.csv(data_path)

print(type(data))
print(data)
data.printSchema()

[Stage 0:>                                                          (0 + 1) / 1]

<class 'pyspark.sql.dataframe.DataFrame'>
DataFrame[_c0: string]
root
 |-- _c0: string (nullable = true)



                                                                                

In [5]:
# Next we transform the data using .select, .groupBy, and .count methods

word_counts = (
    data  # This is a pyspark.sql.DataFrame so it has methods like .select, .groupBy, .count, etc. built into the object directly
    .select(  # This will select a new column equivalent to the output of the .flatMap from the RDD API
        F.explode(F.split(F.col("_c0"), " ")).alias("word")
    )
    .groupBy("word")  # This will return a GroupedDataFrame which we can then aggregate, e.g. using .count()
    .count()  # This will count the number of records in each group e.g. for each unique word
)

print(type(word_counts))
print(word_counts)
word_counts.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
DataFrame[word: string, count: bigint]
root
 |-- word: string (nullable = false)
 |-- count: long (nullable = false)



In [6]:
# Finally we can collect the results to this Python session on the master node and print them in the notebook

results = word_counts.collect()

print(type(results))
print(results)



<class 'list'>
[Row(word='Hello', count=160), Row(word='world', count=160)]


                                                                                

In [7]:
# We can also show the results as a human readable table 

word_counts.show()

+-----+-----+
| word|count|
+-----+-----+
|Hello|  160|
|world|  160|
+-----+-----+



In [8]:
# Or use a helper function to display the results in an html table (this uses DataFrame.toPandas and Ipython.display internally)

show_as_html(word_counts)

Unnamed: 0,word,count
0,Hello,160
1,world,160


In [9]:
# We can also write the results back to HDFS, and inspect them using the hdfs bash command in the following cells

word_counts.write.csv(word_counts_path)

In [10]:
!hdfs dfs -ls /user/{username}/word-count-spark-dataframe-api/

Found 2 items
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 15:19 /user/rsh224/word-count-spark-dataframe-api/_SUCCESS
-rw-r--r--   3 rsh224 rsh224         20 2025-08-13 15:19 /user/rsh224/word-count-spark-dataframe-api/part-00000-ad80676b-7066-4ebe-b684-0adb413cebf4-c000.csv


In [11]:
!hdfs dfs -cat /user/{username}/word-count-spark-dataframe-api/*

Hello,160
world,160


**More complicated**

- Use `spark.read.load` with `.format`, `.option`, and `.schema` to provide more control over how data is loaded.
- The `words` variable could now be used to cache the intermediate result and use it to compute something other than `word_counts`.

In [12]:
# First we define our data input and output paths


# These paths have been updated to use your global username variable automatically
data_path = f"hdfs:///data/helloworld"
word_counts_path = f"hdfs:///user/{username}/word-count-spark-dataframe-api-more-complicated"

# Next we define a schema that we can use when we load the data from HDFS 
schema = StructType([
    StructField("text", StringType(), False)
])
print(schema)

# Then we load the data into a DataFrame representing the data in HDFS
data = (
    spark.read
    .format("com.databricks.spark.csv")
    .option("header", "false")
    .option("inferSchema", "false")
    .schema(schema)
    .load(data_path)
)

print(type(data))
print(data)
data.printSchema()

StructType([StructField('text', StringType(), False)])
<class 'pyspark.sql.dataframe.DataFrame'>
DataFrame[text: string]
root
 |-- text: string (nullable = true)



In [13]:
# Next we transform the data using select, groupBy, and count methods

# We can separate the .select and the .groupBy in the earlier statement to provide references to the intermediate outputs
# which we could potentially reuse in other statements
words = data.select(F.explode(F.split(F.col("text"), " ")).alias("word"))
word_counts = words.groupBy("word").count()

print(type(words))
print(words)
words.printSchema()
show_as_html(words, 5)

print(type(word_counts))
print(word_counts)
word_counts.printSchema()
show_as_html(word_counts)

<class 'pyspark.sql.dataframe.DataFrame'>
DataFrame[word: string]
root
 |-- word: string (nullable = false)



Unnamed: 0,word
0,Hello
1,world
2,Hello
3,world
4,Hello


<class 'pyspark.sql.dataframe.DataFrame'>
DataFrame[word: string, count: bigint]
root
 |-- word: string (nullable = false)
 |-- count: long (nullable = false)



Unnamed: 0,word,count
0,Hello,160
1,world,160


In [14]:
word_counts.write.csv(word_counts_path)

In [15]:
!hdfs dfs -ls /user/{username}/word-count-spark-dataframe-api-more-complicated/

Found 2 items
-rw-r--r--   3 rsh224 rsh224          0 2025-08-13 15:28 /user/rsh224/word-count-spark-dataframe-api-more-complicated/_SUCCESS
-rw-r--r--   3 rsh224 rsh224         20 2025-08-13 15:28 /user/rsh224/word-count-spark-dataframe-api-more-complicated/part-00000-9a95ea86-11b8-44e0-80fe-32062d71025f-c000.csv


In [16]:
!hdfs dfs -cat /user/{username}/word-count-spark-dataframe-api-more-complicated/*

Hello,160
world,160


### Stop Spark ###

In [17]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()

25/08/13 15:29:59 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
