### Spark notebook ###

This notebook will only work in a Jupyter notebook or Jupyter lab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Login name of the user running the notebook with anything from the first "@" onwards removed,
# used below to name the Spark application and the per-user derby directory

username = re.compile('@.*').sub('', getpass.getuser())


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column html table for display.

    Keys and values are converted with ``str`` and html-escaped so that characters such as
    ``&``, ``<`` and ``>`` (which occur in config values like URL query strings or JVM options)
    are shown literally instead of being interpreted as markup.

    Args:
        d (dict): mapping to render, one table row per item in iteration order

    Returns:
        str: html for a full width monospace table with the keys in the left column
    """

    # Imported locally because the notebook rebinds the module-level name `html` further down
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k), quote=False)}</td>'
        f'<td>{escape(str(v), quote=False)}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Show the first rows of a spark dataframe as an html table.

    The rows are converted to a pandas dataframe so that the existing pandas jupyter
    integration takes care of the rendering.

    Args:
        df: spark dataframe to show
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Show an html summary of whether a Spark session is currently running.

    A session counts as active when both the `spark` and `sc` globals are defined. In that case
    the summary names the application, links to its application UI on localhost, and lists the
    Spark config as a table. Otherwise the summary reports the session as stopped and links to
    the Spark UI so that the completed application can be confirmed there.
    """

    if not ('spark' in globals() and 'sc' in globals()):

        stopped = (
            f'<p><b>Spark</b></p>'
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>'
            f'<ul>'
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'
            f'</ul>'
        )
        display(HTML(stopped))
        return

    name = sc.getConf().get("spark.app.name")
    ui_port = sc.uiWebUrl.split(":")[-1]  # the link targets localhost, so only the port is kept
    config_table = dict_to_html(dict(sc.getConf().getAll()))

    active = (
        f'<p><b>Spark</b></p>'
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>'
        f'<ul>'
        f'<li><a href="http://localhost:{ui_port}" target="_blank">Spark Application UI</a></li>'
        f'</ul>'
        f'<p><b>Config</b></p>'
        f'{config_table}'
        f'<p><b>Notes</b></p>'
        f'<ul>'
        f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>'
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>'
        f'</ul>'
    )
    display(HTML(active))


# Functions to start and stop spark

def _memory_string(gigabytes):
    """Format a memory size given in gigabytes as a Spark memory string, e.g. 2 -> '2g', 1.5 -> '1536m'.
    """

    # Spark rejects fractional memory strings such as '1.5g', so non-integral sizes are given in megabytes
    size = float(gigabytes)
    if size.is_integer():
        return f'{int(size)}g'
    return f'{round(size * 1024)}m'


def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The session is configured with a fixed number of executors (dynamic allocation is disabled),
    the total number of cores is capped at executor_instances * executor_cores, and the number of
    shuffle partitions is set to four per core. The session status is displayed once it has started.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): memory per executor in gigabytes, fractions are allowed (default: 1)
        master_memory (float): driver memory in gigabytes, fractions are allowed (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", _memory_string(master_memory))
        .config("spark.executor.memory", _memory_string(worker_memory))
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the active Spark session, if there is one, and display the resulting status.

    When both the SparkSession (spark) and SparkContext (sc) globals are defined the session
    is stopped and both globals are deleted. The status is displayed in either case.
    """

    global spark, sc

    session_defined = 'spark' in globals() and 'sc' in globals()

    if session_defined:
        spark.stop()
        del spark, sc

    display_spark()


# Inject a stylesheet into the notebook to improve spark output readability:
# keep preformatted text and dataframe cells on one line, and hide the dataframe index column

html = ['<style>']
html.extend([
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
])
html.append('</style>')
display(HTML(''.join(html)))

### DataFrame API ###

The code below demonstrates some common **transformations**, **actions**, and **functions** in the DataFrame API.

**Sections**

- [Data](#Data)
- [User defined functions](#User-defined-functions)

**Key points**

- The datasets used in these examples are designed to have as much complexity as possible while still being small.
- The examples use `printSchema` and `show_as_html` frequently to show the contents of dataframes as they are transformed.

In [2]:
# Run this cell to start a spark session in this notebook (memory sizes are in gigabytes)

start_spark(
    executor_instances=2,
    executor_cores=1,
    worker_memory=1,
    master_memory=1,
)

25/08/16 22:18:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.uco-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:00:18Z&se=2025-09-19T16:00:18Z&spr=https&sv=2022-11-02&sr=c&sig=qtg6fCdoFz6k3EJLw7dA8D3D8wN0neAYw8yG4z4Lw2o%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.app.name,rsh224 (notebook)
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"""
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.kubernetes.namespace,rsh224
spark.driver.memory,1g
spark.kubernetes.executor.podNamePrefix,rsh224-notebook-ee376b98b2630e1e
spark.executor.memory,1g


In [3]:
# We only need to import Row, DataFrame, Window, the functions module (as F), and the pyspark sql types, everything else we get from the global variables sc or spark

from pyspark.sql import Row, DataFrame, Window, functions as F
from pyspark.sql.types import *

### Data ###

This code creates two datasets, `data` and `department_data`.

**Key points**

- These datasets are designed to have as much complexity as possible while still being small.
- The datasets are constructed in Python using pyspark Row objects, distributed to give an RDD, and then wrapped with a DataFrame = Dataset[Row].
- This code does not load any data from HDFS.

In [4]:
# Create, distribute, and wrap data by hand

# Explicit schema: column name, type, and whether nulls are allowed
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Department", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Gender", StringType(), True)
    .add("Salary", DoubleType(), True)
)

# Read the nested calls from the inside out
#   1. a list of pyspark row objects (this is just a Python list in memory on the master node)
#   2. sc.parallelize distributes that list as Spark rows in an RDD[Row]
#   3. spark.createDataFrame wraps the RDD with metadata to give a DataFrame = Dataset[Row]
data = spark.createDataFrame(
    sc.parallelize(
        [
            Row("Alpha One", "X", 28, "M", 80000.0),
            Row("Bravo Two", "X", 25, "M", 70000.0),
            Row("Charlie", "X", 23, "M", 80000.0),  # Charlie has no last name, duplicate salary in department X
            Row("Delta Four", "Y", 30, None, 100000.0),  # Gender is none
            Row("Echo Five", "Y", 27, "F", 120000.0),
            Row("Foxtrot Six", "Z", 20, "F", 90000.0),
            Row("Golf Seven", "Z", 20, "F", 50000.0),  # Duplicate age in department Z
            Row("Hotel Eight", "Z", 38, "F", 100000.0),
            Row("Indigo Nine", "Z", 50, "M", 70000.0),
            Row("Juliet Ten", "Z", 18, "F", None),  # Salary is none
        ]
    ),
    schema=schema,
)

# Python type, schema tree, one line summary, and finally the rows themselves
print(type(data))
data.printSchema()
print(data)
show_as_html(data)

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Salary: double (nullable = true)

DataFrame[Name: string, Department: string, Age: int, Gender: string, Salary: double]


                                                                                

Unnamed: 0,Name,Department,Age,Gender,Salary
0,Alpha One,X,28,M,80000.0
1,Bravo Two,X,25,M,70000.0
2,Charlie,X,23,M,80000.0
3,Delta Four,Y,30,,100000.0
4,Echo Five,Y,27,F,120000.0
5,Foxtrot Six,Z,20,F,90000.0
6,Golf Seven,Z,20,F,50000.0
7,Hotel Eight,Z,38,F,100000.0
8,Indigo Nine,Z,50,M,70000.0
9,Juliet Ten,Z,18,F,


In [5]:
# Create, distribute, and wrap additional department data by hand

# Explicit schema: column name, type, and whether nulls are allowed
department_schema = (
    StructType()
    .add("Department", StringType(), True)
    .add("Name", StringType(), True)
    .add("Campus", StringType(), True)
)

# Python list of rows -> RDD[Row] -> DataFrame, exactly as for the data above
department_data = spark.createDataFrame(
    sc.parallelize(
        [
            Row("X", "Xray", "U"),
            Row("Y", "Yankee", "V"),
            Row("Z", "Zulu", "W"),
        ]
    ),
    schema=department_schema,
)

# Python type, schema tree, one line summary, and finally the rows themselves
print(type(department_data))
department_data.printSchema()
print(department_data)
show_as_html(department_data)

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- Department: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Campus: string (nullable = true)

DataFrame[Department: string, Name: string, Campus: string]


Unnamed: 0,Department,Name,Campus
0,X,Xray,U
1,Y,Yankee,V
2,Z,Zulu,W


### User defined functions ###

The following user defined functions allow you to use arbitrary Python logic to operate on each row in a dataframe in a distributed way.

**Key points**

- A user defined function can be any Python function, provided that the function is self-contained or that any modules it uses are also installed on the worker nodes.
- The function is wrapped in `F.udf` to convert it to a pyspark column transformation that can be understood by Spark.
- The pyspark return type of the function needs to be specified so that Spark knows how to convert or structure what is returned.
- You lose all optimizations that Spark can apply such as filtering on filescan instead of in memory.

In [6]:
# Python lambda functions, nothing to do with Spark

# Note that the f-string formats a missing last name as the text "None"
my_full_name = lambda x, y: f"{x} {y}"

print(my_full_name("Alpha", "One"), my_full_name("Charlie", None), sep="\n")

Alpha One
Charlie None


In [7]:
# Python def function instead, nothing to do with Spark

def my_full_name_formatted(first_name, last_name, style="traditional"):
    """Join a first and last name into a full name using the requested style.

    When either name is missing the other one is returned unchanged, whatever the style.

    Args:
        first_name (str): first name, or None if missing
        last_name (str): last name, or None if missing
        style (str): "formal" gives "Last, First", "traditional" or any other value gives "First Last"

    Returns:
        full_name (str): first and last name combined
    """

    # Nothing to combine if one of the names is missing
    if first_name is None or last_name is None:
        return first_name if last_name is None else last_name

    if style == "formal":
        return f"{last_name}, {first_name}"

    return first_name + " " + last_name


# Every style with a complete name
print(
    my_full_name_formatted("Alpha", "One"),
    my_full_name_formatted("Alpha", "One", style="traditional"),
    my_full_name_formatted("Alpha", "One", style="formal"),
    sep="\n",
)

# Every style with a missing last name
print(
    my_full_name_formatted("Charlie", None),
    my_full_name_formatted("Charlie", None, style="traditional"),
    my_full_name_formatted("Charlie", None, style="formal"),
    sep="\n",
)

Alpha One
Alpha One
One, Alpha
Charlie
Charlie
Charlie


In [8]:
# Wrap functions to override default arguments, nothing to do with Spark

# Each wrapper pins the style keyword and leaves two positional arguments for the names
my_full_name_traditional = lambda x, y: my_full_name_formatted(x, y, style="traditional")
my_full_name_formal = lambda x, y: my_full_name_formatted(x, y, style="formal")

# Both wrappers with a complete name, then both with a missing last name
print(my_full_name_traditional("Alpha", "One"), my_full_name_formal("Alpha", "One"), sep="\n")
print(my_full_name_traditional("Charlie", None), my_full_name_formal("Charlie", None), sep="\n")

Alpha One
One, Alpha
Charlie
Charlie


In [10]:
# pyspark user defined functions

# Wrap the Python functions defined above, declaring the pyspark type that each one returns
my_full_name_udf = F.udf(my_full_name, returnType=StringType())
my_full_name_traditional_udf = F.udf(my_full_name_traditional, returnType=StringType())
my_full_name_formal_udf = F.udf(my_full_name_formal, returnType=StringType())

# User defined functions can return nested types as well, here a struct built from a dict and an array built from a list
my_struct_udf = F.udf(
    lambda: {"B": 2, "A": 1},
    returnType=StructType().add("A", IntegerType()).add("B", IntegerType()),
)
my_array_udf = F.udf(lambda: [1, 2, 3, 4, 5], returnType=ArrayType(IntegerType()))

temp = (
    data
    # Split the name on spaces, a name without a second part gives a null last name
    .withColumn("Names", F.split(F.col("Name"), " "))
    .select(
        F.col("Names").getItem(0).alias("FirstName"),
        F.col("Names").getItem(1).alias("LastName"),
    )
    # Apply every user defined function to the same pair of columns
    .select(
        my_full_name_udf(F.col("FirstName"), F.col("LastName")).alias("FullName"),
        my_full_name_traditional_udf(F.col("FirstName"), F.col("LastName")).alias("FullNameTraditional"),
        my_full_name_formal_udf(F.col("FirstName"), F.col("LastName")).alias("FullNameFormal"),
        my_struct_udf().alias("StructOutput"),
        my_array_udf().alias("ArrayOutput"),
    )
    # Pull individual fields and elements back out of the nested outputs
    .withColumn("StructOutput.A", F.col("StructOutput").getField("A"))
    .withColumn("StructOutput.B", F.col("StructOutput").getField("B"))
    .withColumn("ArrayOutput[0]", F.col("ArrayOutput").getItem(0))
    .withColumn("ArrayOutput[1]", F.col("ArrayOutput").getItem(1))
)
temp.printSchema()
show_as_html(temp)

root
 |-- FullName: string (nullable = true)
 |-- FullNameTraditional: string (nullable = true)
 |-- FullNameFormal: string (nullable = true)
 |-- StructOutput: struct (nullable = true)
 |    |-- A: integer (nullable = true)
 |    |-- B: integer (nullable = true)
 |-- ArrayOutput: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- StructOutput.A: integer (nullable = true)
 |-- StructOutput.B: integer (nullable = true)
 |-- ArrayOutput[0]: integer (nullable = true)
 |-- ArrayOutput[1]: integer (nullable = true)



                                                                                

Unnamed: 0,FullName,FullNameTraditional,FullNameFormal,StructOutput,ArrayOutput,StructOutput.A,StructOutput.B,ArrayOutput[0],ArrayOutput[1]
0,Alpha One,Alpha One,"One, Alpha","(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2
1,Bravo Two,Bravo Two,"Two, Bravo","(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2
2,Charlie None,Charlie,Charlie,"(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2
3,Delta Four,Delta Four,"Four, Delta","(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2
4,Echo Five,Echo Five,"Five, Echo","(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2
5,Foxtrot Six,Foxtrot Six,"Six, Foxtrot","(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2
6,Golf Seven,Golf Seven,"Seven, Golf","(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2
7,Hotel Eight,Hotel Eight,"Eight, Hotel","(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2
8,Indigo Nine,Indigo Nine,"Nine, Indigo","(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2
9,Juliet Ten,Juliet Ten,"Ten, Juliet","(1, 2)","[1, 2, 3, 4, 5]",1,2,1,2


### Stop Spark ###

In [11]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI
# stop_spark() stops the session, deletes the spark and sc globals, and then displays the session status

stop_spark()

25/08/16 22:48:18 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
