### Spark notebook ###

This notebook will only work in a Jupyter notebook or Jupyter lab session running on the cluster master node in the cloud.

Follow the instructions on the computing resources page to start a cluster and open this notebook.

**Steps**

1. Connect to the Windows server using Windows App.
2. Connect to Kubernetes.
3. Start Jupyter and open this notebook from Jupyter in order to connect to Spark.

In [8]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

global username

username = re.sub('@.*', '', getpass.getuser())


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")

        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://localhost:{sc.uiWebUrl.split(":")[-1]}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

### OpenFlights example using the SQL and DataFrame API ###

The code below shows you how to carry out more complex data analysis tasks using the SQL and DataFrame API.

**Key points**

- Make your code as readable as possible using multiline commands and indenting. You should look up Python style guides to improve your code readability.
- You should always define schemas if you are loading a structured dataset with columns and data types.
- The `show_as_html` command leverages existing pandas jupyter integration to show a limited number of rows of a spark dataframe as html.
- You should always comment your code so that it can be read and understood by someone else (or by yourself several months from now).

In [9]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1)

0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.uco-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:00:18Z&se=2025-09-19T16:00:18Z&spr=https&sv=2022-11-02&sr=c&sig=qtg6fCdoFz6k3EJLw7dA8D3D8wN0neAYw8yG4z4Lw2o%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.app.submitTime,1755045591716
spark.app.name,rsh224 (notebook)
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"""
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.kubernetes.namespace,rsh224
spark.app.startTime,1755056075998
spark.driver.memory,1g


In [10]:
# We only need to import pypsark.sql functions and types, everything else we get from the global variables sc or spark

from pyspark.sql import functions as F  # this imports e.g. F.min, F.max, ... to avoid naming conflicts with Python min, max, ...
from pyspark.sql.types import *

### Data ###

The OpenFlights data is an open source database of airports, train stations, and ferry terminals from around the world, including information about routes, airlines, airports, countries, and planes. You can find out more about the OpenFlights data on their website https://openflights.org/data.php.

**Key points**

- We we use the routes and airports datasets in this notebook. 
- We want to define schemas and load the data all at once so that it can be used consistently in the cells below.

In [11]:
# Routes

schema_routes = StructType([
    StructField("airline", StringType(), True),
    StructField("airline_id", StringType(), True),
    StructField("src_airport", StringType(), True),
    StructField("src_airport_id", StringType(), True),
    StructField("dst_airport", StringType(), True),
    StructField("dst_airport_id", StringType(), True),
    StructField("codeshare", StringType(), True),
    StructField("stops", IntegerType(), True),
    StructField("equipment", StringType(), True),
])
routes = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "false")
    .option("inferSchema", "false")
    .schema(schema_routes)
    .load("hdfs:///data/openflights/routes.dat")
)
print(routes)
routes.printSchema()
show_as_html(routes)

DataFrame[airline: string, airline_id: string, src_airport: string, src_airport_id: string, dst_airport: string, dst_airport_id: string, codeshare: string, stops: int, equipment: string]
root
 |-- airline: string (nullable = true)
 |-- airline_id: string (nullable = true)
 |-- src_airport: string (nullable = true)
 |-- src_airport_id: string (nullable = true)
 |-- dst_airport: string (nullable = true)
 |-- dst_airport_id: string (nullable = true)
 |-- codeshare: string (nullable = true)
 |-- stops: integer (nullable = true)
 |-- equipment: string (nullable = true)



                                                                                

Unnamed: 0,airline,airline_id,src_airport,src_airport_id,dst_airport,dst_airport_id,codeshare,stops,equipment
0,2B,410,AER,2965,KZN,2990,,0,CR2
1,2B,410,ASF,2966,KZN,2990,,0,CR2
2,2B,410,ASF,2966,MRV,2962,,0,CR2
3,2B,410,CEK,2968,KZN,2990,,0,CR2
4,2B,410,CEK,2968,OVB,4078,,0,CR2
5,2B,410,DME,4029,KZN,2990,,0,CR2
6,2B,410,DME,4029,NBC,6969,,0,CR2
7,2B,410,DME,4029,TGK,\N,,0,CR2
8,2B,410,DME,4029,UUA,6160,,0,CR2
9,2B,410,EGO,6156,KGD,2952,,0,CR2


In [12]:
# Airports

schema_airports = StructType([
  StructField("airport_id", IntegerType(), True),
  StructField("airport", StringType(), True),
  StructField("city", StringType(), True),
  StructField("country", StringType(), True),
  StructField("iata", StringType(), True),
  StructField("icao", StringType(), True),
  StructField("latitude", DoubleType(), True),
  StructField("longitude", DoubleType(), True),
  StructField("altitude", DoubleType(), True),
  StructField("timezone", DoubleType(), True),
  StructField("dst", StringType(), True),
  StructField("tz", StringType(), True),
  StructField("type", StringType(), True),
  StructField("source", StringType(), True),
])
airports = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "false")
    .option("inferSchema", "false")
    .schema(schema_airports)
    .load("hdfs:////data/openflights/airports.dat")
)
print(airports)
airports.printSchema()
show_as_html(airports)

DataFrame[airport_id: int, airport: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: double, timezone: double, dst: string, tz: string, type: string, source: string]
root
 |-- airport_id: integer (nullable = true)
 |-- airport: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- iata: string (nullable = true)
 |-- icao: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- altitude: double (nullable = true)
 |-- timezone: double (nullable = true)
 |-- dst: string (nullable = true)
 |-- tz: string (nullable = true)
 |-- type: string (nullable = true)
 |-- source: string (nullable = true)



                                                                                

Unnamed: 0,airport_id,airport,city,country,iata,icao,latitude,longitude,altitude,timezone,dst,tz,type,source
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282.0,10.0,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20.0,10.0,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388.0,10.0,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239.0,10.0,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146.0,10.0,U,Pacific/Port_Moresby,airport,OurAirports
5,6,Wewak International Airport,Wewak,Papua New Guinea,WWK,AYWK,-3.58383,143.669006,19.0,10.0,U,Pacific/Port_Moresby,airport,OurAirports
6,7,Narsarsuaq Airport,Narssarssuaq,Greenland,UAK,BGBW,61.1605,-45.425999,112.0,-3.0,E,America/Godthab,airport,OurAirports
7,8,Godthaab / Nuuk Airport,Godthaab,Greenland,GOH,BGGH,64.190903,-51.678101,283.0,-3.0,E,America/Godthab,airport,OurAirports
8,9,Kangerlussuaq Airport,Sondrestrom,Greenland,SFJ,BGSF,67.012222,-50.711603,165.0,-3.0,E,America/Godthab,airport,OurAirports
9,10,Thule Air Base,Thule,Greenland,THU,BGTL,76.531197,-68.703201,251.0,-4.0,E,America/Thule,airport,OurAirports


### SQL API ###

In the cells below we use the SQL API to compute airport size based on the number of destination airports you can fly to from each airport. 

In [13]:
# Compute airport size using SQL API

routes.registerTempTable('routes')
airports.registerTempTable('airports')

counts_sql = """
    SELECT
        airports.airport AS airport_name,
        airports.city AS airport_city,
        airports.country AS airport_country,
        counts.airport_size AS airport_size
    FROM
    (
        SELECT
            routes.src_airport_id AS airport_id,
            count(routes.dst_airport_id) AS airport_size
        FROM
            routes
        GROUP BY
            airport_id
    ) counts
    LEFT JOIN
        airports ON counts.airport_id = airports.airport_id
    ORDER BY
        airport_size DESC
"""
counts = spark.sql(counts_sql)



In [14]:
print(counts)
counts.printSchema()
show_as_html(counts)

DataFrame[airport_name: string, airport_city: string, airport_country: string, airport_size: bigint]
root
 |-- airport_name: string (nullable = true)
 |-- airport_city: string (nullable = true)
 |-- airport_country: string (nullable = true)
 |-- airport_size: long (nullable = false)



Unnamed: 0,airport_name,airport_city,airport_country,airport_size
0,Hartsfield Jackson Atlanta International Airport,Atlanta,United States,915
1,Chicago O'Hare International Airport,Chicago,United States,558
2,Beijing Capital International Airport,Beijing,China,535
3,London Heathrow Airport,London,United Kingdom,527
4,Charles de Gaulle International Airport,Paris,France,524
5,Frankfurt am Main International Airport,Frankfurt,Germany,497
6,Los Angeles International Airport,Los Angeles,United States,492
7,Dallas Fort Worth International Airport,Dallas-Fort Worth,United States,469
8,John F Kennedy International Airport,New York,United States,456
9,Amsterdam Airport Schiphol,Amsterdam,Netherlands,453


### DataFrame API ###

**Simple**

- Define one variable representing the output of all transformations using a single multiline command.

In [15]:
# Compute airport size using DataFrame API

counts = (
    routes
    # Select only the columns that are needed
    .select([
        'src_airport_id',
        'dst_airport_id',
    ])
    # Group by source and count destinations
    .groupBy('src_airport_id')
    .agg({
        'dst_airport_id': 'count',
    })
    .select(
        F.col('src_airport_id').alias('airport_id'),
        F.col('count(dst_airport_id)').alias('airport_size')
    )
    # Merge with airports to get airport name
    .join(
        airports
        .select(
            F.col('airport_id'),
            F.col('airport').alias('airport_name')
        ),
        on='airport_id',
        how='left',
    )
    # Select columns to be retained
    .select(
        F.col('airport_name'),
        F.col('airport_size')
    )
    # Order by airport size descending
    .orderBy('airport_size', ascending=False)
)

In [16]:
print(counts)
counts.printSchema()
show_as_html(counts)

DataFrame[airport_name: string, airport_size: bigint]
root
 |-- airport_name: string (nullable = true)
 |-- airport_size: long (nullable = false)



Unnamed: 0,airport_name,airport_size
0,Hartsfield Jackson Atlanta International Airport,915
1,Chicago O'Hare International Airport,558
2,Beijing Capital International Airport,535
3,London Heathrow Airport,527
4,Charles de Gaulle International Airport,524
5,Frankfurt am Main International Airport,497
6,Los Angeles International Airport,492
7,Dallas Fort Worth International Airport,469
8,John F Kennedy International Airport,456
9,Amsterdam Airport Schiphol,453


**More complicated**

- Defines multiple variables representing useful intermediate steps that are then combined.

In [17]:
# Compute airport size using DataFrame API

route_counts = (
    routes
    # Select only the columns that are needed
    .select([
        'src_airport_id',
        'dst_airport_id',
    ])
    # Group by source and count destinations
    .groupBy('src_airport_id')
    .agg({
        'dst_airport_id': 'count',
    })
    .select(
        F.col('src_airport_id').alias('airport_id'),
        F.col('count(dst_airport_id)').alias('airport_size')
    )
)

airport_names = (
    airports
    .select(
        F.col('airport_id'),
        F.col('airport').alias('airport_name')
    )
)

route_counts_with_airport_names = (
    route_counts
    # Merge with airports to get airport name
    .join(
        airport_names,
        on='airport_id',
        how='left',
    )
    # Select columns to be retained
    .select(
        F.col('airport_name'),
        F.col('airport_size')
    )
    # Order by airport size descending
    .orderBy('airport_size', ascending=False)
)

In [18]:
print(route_counts)
route_counts.printSchema()
show_as_html(route_counts)

DataFrame[airport_id: string, airport_size: bigint]
root
 |-- airport_id: string (nullable = true)
 |-- airport_size: long (nullable = false)



Unnamed: 0,airport_id,airport_size
0,2923,38
1,2812,11
2,245,41
3,729,4
4,7168,1
5,7173,1
6,2433,9
7,2613,19
8,6039,6
9,897,51


In [19]:
print(airport_names)
airport_names.printSchema()
show_as_html(airport_names)

DataFrame[airport_id: int, airport_name: string]
root
 |-- airport_id: integer (nullable = true)
 |-- airport_name: string (nullable = true)



Unnamed: 0,airport_id,airport_name
0,1,Goroka Airport
1,2,Madang Airport
2,3,Mount Hagen Kagamuga Airport
3,4,Nadzab Airport
4,5,Port Moresby Jacksons International Airport
5,6,Wewak International Airport
6,7,Narsarsuaq Airport
7,8,Godthaab / Nuuk Airport
8,9,Kangerlussuaq Airport
9,10,Thule Air Base


In [20]:
print(route_counts_with_airport_names)
route_counts_with_airport_names.printSchema()
show_as_html(route_counts_with_airport_names)

DataFrame[airport_name: string, airport_size: bigint]
root
 |-- airport_name: string (nullable = true)
 |-- airport_size: long (nullable = false)



Unnamed: 0,airport_name,airport_size
0,Hartsfield Jackson Atlanta International Airport,915
1,Chicago O'Hare International Airport,558
2,Beijing Capital International Airport,535
3,London Heathrow Airport,527
4,Charles de Gaulle International Airport,524
5,Frankfurt am Main International Airport,497
6,Los Angeles International Airport,492
7,Dallas Fort Worth International Airport,469
8,John F Kennedy International Airport,456
9,Amsterdam Airport Schiphol,453


### Stop Spark ###

In [21]:
# Run this cell before closing the notebook or kill your spark application by hand using the link in the Spark UI

stop_spark()

25/08/13 16:03:01 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
