In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

# Login name of the OS user running the kernel, with any '@domain' suffix removed.
# It is used below to build per-user paths and the Spark application name.
username = re.sub('@.*', '', getpass.getuser())

# Storage account and containers addressed through wasbs:// URLs.
azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"

# NOTE(security): this SAS token is a credential stored in the notebook source, and
# anyone who can read the notebook (or its saved outputs) can use it until it expires.
# Consider loading it from an environment variable or a secrets store instead, and
# clear cell outputs that echo the Spark config before sharing.
azure_user_token = r"sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted with ``str()`` and HTML-escaped, so entries
    that contain ``&``, ``<``, ``>`` or quotes (for example SAS tokens in the
    Spark config) are shown literally instead of being parsed as markup.

    Args:
        d (dict): mapping to render; one table row is emitted per item, in
            iteration order

    Returns:
        str: the ``<table>`` element as a single string (no rows for an empty dict)
    """

    # Imported locally and by name: this notebook uses `html` as a variable name
    # for lists of markup, so the module itself is deliberately not bound here.
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Preview a Spark dataframe using the pandas rendering built into Jupyter.

    Only the first ``n`` rows are taken (``limit(n)``) before the data is
    converted with ``toPandas()`` and handed to ``display``.

    Args:
        df: Spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Render an HTML status panel describing the Spark session for this notebook.

    The session counts as active when both the ``spark`` and ``sc`` globals
    exist. In that case the panel shows the application name, a link to the
    application UI on localhost, the full Spark config as a table, and usage
    notes. Otherwise it reports the session as stopped and links to the
    cluster Spark UI.
    """

    session_is_active = 'spark' in globals() and 'sc' in globals()

    if not session_is_active:
        stopped_parts = (
            '<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            '<ul>',
            '<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            '</ul>',
        )
        display(HTML(''.join(stopped_parts)))
        return

    conf = sc.getConf()
    name = conf.get("spark.app.name")

    # Only the port of the UI url is reused; the link always points at localhost.
    ui_port = sc.uiWebUrl.split(":")[-1]
    config_table = dict_to_html(dict(conf.getAll()))

    active_parts = (
        '<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        '<ul>',
        f'<li><a href="http://localhost:{ui_port}" target="_blank">Spark Application UI</a></li>',
        '</ul>',
        '<p><b>Config</b></p>',
        config_table,
        '<p><b>Notes</b></p>',
        '<ul>',
        '<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        '</ul>',
    )
    display(HTML(''.join(active_parts)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The session is obtained with ``getOrCreate()``; NOTE(review): if a session is
    already active in this kernel, PySpark returns that session rather than
    building a new one with these settings, so call ``stop_spark()`` first when
    changing resources.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): memory per executor, passed as ``'<value>g'`` (default: 1)
        master_memory (float): driver memory, passed as ``'<value>g'`` (default: 1)
    """

    global spark
    global sc

    # Total cores requested across all executors; shuffles use 4 partitions per core.
    cores = executor_instances * executor_cores
    partitions = cores * 4

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        # SAS token that authorises access to the user container on the storage account
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net", azure_user_token)
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Shut down the active Spark session, if any, and report the resulting status.

    When both the ``spark`` and ``sc`` globals exist, the session is stopped and
    both globals are deleted. The status panel is displayed in either case.
    """

    global spark, sc

    session_is_active = 'spark' in globals() and 'sc' in globals()
    if session_is_active:
        spark.stop()
        del spark, sc

    display_spark()


# Make css changes to improve spark output readability

# Inject a <style> element into the notebook output so the rules below apply to
# the text and dataframe tables rendered by later cells.
html = [
    '<style>',
    # keep preformatted text (e.g. DataFrame.show() output) on one line per row
    'pre { white-space: pre !important; }',
    # do not wrap the contents of dataframe table cells
    'table.dataframe td { white-space: nowrap !important; }',
    # hide the first header cell and the row header cells of dataframe tables
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

In [2]:
# Run this cell to start a spark session in this notebook.
# Requests 4 executors with 4 cores each (16 cores in total), with 8g of memory
# for each executor and 8g for the driver.

start_spark(executor_instances=4, executor_cores=4, worker_memory=8, master_memory=8)

25/08/28 09:42:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.app.submitTime,1756330964678
spark.fs.azure.sas.uco-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:00:18Z&se=2025-09-19T16:00:18Z&spr=https&sv=2022-11-02&sr=c&sig=qtg6fCdoFz6k3EJLw7dA8D3D8wN0neAYw8yG4z4Lw2o%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.executor.instances,4
spark.app.name,rsh224 (notebook)
spark.cores.max,16
spark.kubernetes.executor.podNamePrefix,rsh224-notebook-68ba5198ed7bba64
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2024-09-19T08:03:31Z&se=2025-09-19T16:03:31Z&spr=https&sv=2022-11-02&sr=c&sig=kMP%2BsBsRzdVVR8rrg%2BNbDhkRBNs6Q98kYY695XMRFDU%3D"""
spark.kubernetes.container.image.pullPolicy,IfNotPresent


In [25]:
# Write your imports here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *
from math import radians, sin, cos, asin, sqrt

In [36]:
# Path of the stations-enriched dataset inside the current user's folder of the user container.
stations_enriched_path = f'wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net/{username}/stations-enriched'

wasbs://campus-data@madsstorage002.blob.core.windows.net/ghcnd/ghcnd-stations.txt


In [41]:
# Load the enriched stations CSV; the first line is the header and no schema
# inference is done, so every column is read as a string.
stations_enriched = (
    spark.read
    .option('header', True)
    .option('inferSchema', False)
    .csv(stations_enriched_path)
)

In [42]:
# Preview the first 20 rows without truncating long values.
stations_enriched.show(n=20, truncate=False)

+-----------+----------+------------+--------+---------+---------+--------------------+----+------------+------+------------+----------+------------------------+-----------------+------------------+
|ID         |STATE_CODE|COUNTRY_CODE|LATITUDE|LONGITUDE|ELEVATION|STATION_NAME        |GSN |HCN_CRN_FLAG|WMO_ID|COUNTRY_NAME|STATE_NAME|ELEMENTS                |NUM_CORE_ELEMENTS|NUM_OTHER_ELEMENTS|
+-----------+----------+------------+--------+---------+---------+--------------------+----+------------+------+------------+----------+------------------------+-----------------+------------------+
|AFM00040990|NULL      |AF          |31.5    |65.85    |1010.0   |KANDAHAR AIRPORT    |NULL|NULL        |40990 |Afghanistan |NULL      |TMAX;TMIN;PRCP;SNWD;TAVG|4                |1                 |
|AGE00147712|NULL      |AG          |36.17   |1.34     |112.0    |ORLEANSVILLE (CHLEF)|NULL|NULL        |NULL  |Algeria     |NULL      |TMAX;TMIN;PRCP          |3                |0                 |
|AGE0

In [43]:
# Inspect the column types: all are strings because the CSV was read with inferSchema=False.
stations_enriched.printSchema()

root
 |-- ID: string (nullable = true)
 |-- STATE_CODE: string (nullable = true)
 |-- COUNTRY_CODE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- STATION_NAME: string (nullable = true)
 |-- GSN: string (nullable = true)
 |-- HCN_CRN_FLAG: string (nullable = true)
 |-- WMO_ID: string (nullable = true)
 |-- COUNTRY_NAME: string (nullable = true)
 |-- STATE_NAME: string (nullable = true)
 |-- ELEMENTS: string (nullable = true)
 |-- NUM_CORE_ELEMENTS: string (nullable = true)
 |-- NUM_OTHER_ELEMENTS: string (nullable = true)



In [44]:
# Keep only the identifying and positional columns, converting the coordinates
# from strings to doubles so they can be used in numeric calculations.
stations = stations_enriched.select(
    F.col('ID'),
    F.col('LATITUDE').cast('double'),
    F.col('LONGITUDE').cast('double'),
    F.col('STATION_NAME'),
)

stations.show()

+-----------+--------+---------+--------------------+
|         ID|LATITUDE|LONGITUDE|        STATION_NAME|
+-----------+--------+---------+--------------------+
|AFM00040990|    31.5|    65.85|    KANDAHAR AIRPORT|
|AGE00147712|   36.17|     1.34|ORLEANSVILLE (CHLEF)|
|AGE00147713|   36.18|      5.4|               SETIF|
|AGE00147718|   34.85|     5.72|              BISKRA|
|AGM00060402|  36.712|     5.07|             SOUMMAM|
|AGM00060417|  36.383|    3.883|              BOUIRA|
|AGM00060421|  35.867|    7.117|      OUM EL BOUAGHI|
|AGM00060445|  36.178|    5.324|     SETIF AIN ARNAT|
|AGM00060461|    35.7|    -0.65|           ORAN-PORT|
|AGM00060531|  35.017|    -1.45|              ZENATA|
|AGM00060566|  32.384|    3.794|            NOUMERAT|
|AGM00060581|  31.673|     6.14|          OUED IRARA|
|AJ000037679|    41.1|     49.2|             SIASAN'|
|AJ000037734|    40.8|     46.0|             SHAMHOR|
|AJ000037740|  40.983|   47.867|              QABALA|
|AJ000037825|    40.4|     4

In [46]:
# New Zealand stations, with columns suffixed _A so the table can be paired with
# a second copy of itself.
#
# The filter is applied to `stations_enriched`, which actually carries COUNTRY_CODE.
# The narrower `stations` frame does not project that column, so filtering it on
# COUNTRY_CODE depends on Spark resolving the reference through the projection.
stations_nz = (
    stations_enriched
    .filter(F.col('COUNTRY_CODE') == 'NZ')
    .select(
        F.col('ID').alias('ID_A'),
        F.col('LATITUDE').cast(DoubleType()).alias('LAT_A'),
        F.col('LONGITUDE').cast(DoubleType()).alias('LON_A'),
        F.col('STATION_NAME').alias('NAME_A'),
    )
)

In [47]:
# Show up to 20 New Zealand stations without truncating the station names.
stations_nz.show(n=20, truncate=False)

+-----------+-------+--------+-------------------+
|ID_A       |LAT_A  |LON_A   |NAME_A             |
+-----------+-------+--------+-------------------+
|NZ000093994|-29.25 |-177.917|RAOUL ISL/KERMADEC |
|NZ000093417|-40.9  |174.983 |PARAPARAUMU AWS    |
|NZ000093844|-46.417|168.333 |INVERCARGILL AIRPOR|
|NZM00093678|-42.417|173.7   |KAIKOURA           |
|NZ000939450|-52.55 |169.167 |CAMPBELL ISLAND AWS|
|NZM00093439|-41.333|174.8   |WELLINGTON AERO AWS|
|NZM00093929|-50.483|166.3   |ENDERBY ISLAND AWS |
|NZ000936150|-42.717|170.983 |HOKITIKA AERODROME |
|NZ000093012|-35.1  |173.267 |KAITAIA            |
|NZ000093292|-38.65 |177.983 |GISBORNE AERODROME |
|NZ000937470|-44.517|169.9   |TARA HILLS         |
|NZ000939870|-43.95 |-176.567|CHATHAM ISLANDS AWS|
|NZM00093110|-37.0  |174.8   |AUCKLAND AERO AWS  |
|NZM00093781|-43.489|172.532 |CHRISTCHURCH INTL  |
|NZ000933090|-39.017|174.183 |NEW PLYMOUTH AWS   |
+-----------+-------+--------+-------------------+



In [48]:
# Second copy of the New Zealand stations with every _A column renamed to _B,
# so the two sides of the pairing have distinct column names.
stations_nz_b = stations_nz.select(*[
    F.col(name_a).alias(name_a[:-2] + '_B')
    for name_a in ('ID_A', 'LAT_A', 'LON_A', 'NAME_A')
])

In [49]:
# Pair every station with every other station. Keeping only ID_A < ID_B drops
# self-pairs and keeps each unordered pair exactly once.
stations_nz_cross = (
    stations_nz
    .crossJoin(stations_nz_b)
    .filter(F.col('ID_A') < F.col('ID_B'))
)
stations_nz_cross.show()

+-----------+------+--------+------------------+-----------+-------+--------+-------------------+
|       ID_A| LAT_A|   LON_A|            NAME_A|       ID_B|  LAT_B|   LON_B|             NAME_B|
+-----------+------+--------+------------------+-----------+-------+--------+-------------------+
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZM00093678|-42.417|   173.7|           KAIKOURA|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZ000939450| -52.55| 169.167|CAMPBELL ISLAND AWS|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZM00093439|-41.333|   174.8|WELLINGTON AERO AWS|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZM00093929|-50.483|   166.3| ENDERBY ISLAND AWS|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZ000936150|-42.717| 170.983| HOKITIKA AERODROME|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZ000937470|-44.517|   169.9|         TARA HILLS|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZ000939870| -43.95|-176.567|CHATHAM ISLANDS AWS|
|NZ000093994|-29.25|

In [50]:
# Check that the coordinate columns on both sides of each pair are doubles.
stations_nz_cross.printSchema()

root
 |-- ID_A: string (nullable = true)
 |-- LAT_A: double (nullable = true)
 |-- LON_A: double (nullable = true)
 |-- NAME_A: string (nullable = true)
 |-- ID_B: string (nullable = true)
 |-- LAT_B: double (nullable = true)
 |-- LON_B: double (nullable = true)
 |-- NAME_B: string (nullable = true)



In [51]:
# Number of station pairs: 15 stations give 15 * 14 / 2 = 105 unordered pairs.
stations_nz_cross.count()

105

In [52]:
def get_haversine_distance(lat1_d, lon1_d, lat2_d, lon2_d, radius_km=6371.0088):
    """Great-circle distance between two points using the haversine formula.

    Args:
        lat1_d (float): latitude of the first point in degrees
        lon1_d (float): longitude of the first point in degrees
        lat2_d (float): latitude of the second point in degrees
        lon2_d (float): longitude of the second point in degrees
        radius_km (float): sphere radius in kilometres (default: 6371.0088,
            the radius of the earth)

    Returns:
        float or None: distance in the units of ``radius_km``, or ``None``
        when any coordinate is ``None``
    """

    # A missing coordinate reaches a Python UDF as None. Return None so the result
    # is null for that row instead of raising TypeError and failing the whole job.
    if lat1_d is None or lon1_d is None or lat2_d is None or lon2_d is None:
        return None

    # convert degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1_d, lon1_d, lat2_d, lon2_d])

    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*(sin(dlon/2)**2)

    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal points,
    # which would make sqrt/asin raise a domain error.
    a = min(1.0, max(0.0, a))

    return 2*radius_km*asin(sqrt(a))
    

In [53]:
# Quick sanity check of the function on a single pair of points.
# NOTE(review): these look like New York and London coordinates with their west
# longitudes entered as positive values. The distance is unchanged because both
# signs are flipped together; confirm and use negative longitudes if so.
get_haversine_distance(40.7128, 74.0060, 51.5074, 0.1278)

5570.229873656523

In [54]:
# Wrap the Python function as a Spark UDF returning a double so it can be applied to columns.
get_distance_udf = F.udf(get_haversine_distance, DoubleType())

In [55]:
# Add the distance between the two stations of each pair as a DISTANCE column.
stations_nz_distance = stations_nz_cross.withColumn(
    'DISTANCE',
    get_distance_udf('LAT_A', 'LON_A', 'LAT_B', 'LON_B'),
)

stations_nz_distance.show()

+-----------+------+--------+------------------+-----------+-------+--------+-------------------+------------------+
|       ID_A| LAT_A|   LON_A|            NAME_A|       ID_B|  LAT_B|   LON_B|             NAME_B|          DISTANCE|
+-----------+------+--------+------------------+-----------+-------+--------+-------------------+------------------+
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZM00093678|-42.417|   173.7|           KAIKOURA|1645.5672600919054|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZ000939450| -52.55| 169.167|CAMPBELL ISLAND AWS| 2799.180360445167|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZM00093439|-41.333|   174.8|WELLINGTON AERO AWS|1495.9438524148266|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZM00093929|-50.483|   166.3| ENDERBY ISLAND AWS| 2705.424733563147|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZ000936150|-42.717| 170.983| HOKITIKA AERODROME|1796.3643450230413|
|NZ000093994|-29.25|-177.917|RAOUL ISL/KERMADEC|NZ000937470|-44.

In [57]:
# List the pairs from closest to furthest apart; the first row is the closest pair.
stations_nz_distance.orderBy('DISTANCE').show()

+-----------+-------+-------+-------------------+-----------+-------+-------+-------------------+------------------+
|       ID_A|  LAT_A|  LON_A|             NAME_A|       ID_B|  LAT_B|  LON_B|             NAME_B|          DISTANCE|
+-----------+-------+-------+-------------------+-----------+-------+-------+-------------------+------------------+
|NZ000093417|  -40.9|174.983|    PARAPARAUMU AWS|NZM00093439|-41.333|  174.8|WELLINGTON AERO AWS|50.529096275802594|
|NZM00093439|-41.333|  174.8|WELLINGTON AERO AWS|NZM00093678|-42.417|  173.7|           KAIKOURA|151.07164367845573|
|NZ000936150|-42.717|170.983| HOKITIKA AERODROME|NZM00093781|-43.489|172.532|  CHRISTCHURCH INTL|152.25856699117296|
|NZM00093678|-42.417|  173.7|           KAIKOURA|NZM00093781|-43.489|172.532|  CHRISTCHURCH INTL|152.45918124327264|
|NZ000093417|  -40.9|174.983|    PARAPARAUMU AWS|NZM00093678|-42.417|  173.7|           KAIKOURA| 199.5298991599465|
|NZ000936150|-42.717|170.983| HOKITIKA AERODROME|NZ000937470|-44

In [59]:
# Write the pairwise distances as CSV (with a header row) into the current user's
# folder of the user container, replacing any previous output at that location.
# The folder is built from `username`, like the input path, rather than a literal user id.
output_path = f'wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net'
output_distance_path = f'{output_path}/{username}/stations_nz_distance'
stations_nz_distance.write.mode('overwrite').option('header', True).csv(output_distance_path)

25/08/28 11:50:23 WARN AzureFileSystemThreadPoolExecutor: Disabling threads for Delete operation as thread count 0 is <= 1


In [61]:
!hdfs dfs -ls {output_path}/rsh224/stations_nz_distance

Found 8 items
-rw-r--r--   1 rsh224 supergroup          0 2025-08-28 11:50 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations_nz_distance/_SUCCESS
-rw-r--r--   1 rsh224 supergroup       1153 2025-08-28 11:50 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations_nz_distance/part-00000-730257f7-e380-4cf1-a01a-25f97d14ca9c-c000.csv
-rw-r--r--   1 rsh224 supergroup       2726 2025-08-28 11:50 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations_nz_distance/part-00002-730257f7-e380-4cf1-a01a-25f97d14ca9c-c000.csv
-rw-r--r--   1 rsh224 supergroup       1035 2025-08-28 11:50 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations_nz_distance/part-00003-730257f7-e380-4cf1-a01a-25f97d14ca9c-c000.csv
-rw-r--r--   1 rsh224 supergroup        929 2025-08-28 11:50 wasbs://campus-user@madsstorage002.blob.core.windows.net/rsh224/stations_nz_distance/part-00004-730257f7-e380-4cf1-a01a-25f97d14ca9c-c000.csv
-rw-r--r--   1 rsh2

In [None]:
# Stop the Spark session before closing the notebook or restarting the kernel.
stop_spark()

25/08/28 11:50:54 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
