In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

# Module-level ``global`` declarations are no-ops, so the names are simply
# assigned here; the functions below read them as ordinary module globals.

import os

# Login name of the current user with any "@domain" suffix removed
username = re.sub('@.*', '', getpass.getuser())

azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"

# SECURITY (review): a SAS token is a credential and should not live in a shared
# notebook. Set the AZURE_USER_TOKEN environment variable to supply one without
# editing this cell; the literal below is kept only as a backward-compatible
# fallback for when the variable is unset or empty.
azure_user_token = os.environ.get("AZURE_USER_TOKEN") or (
    r"sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"
)


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column HTML table for display.

    Keys and values are converted with str() and HTML-escaped, so characters
    such as ``&``, ``<`` and ``>`` are shown literally instead of being
    interpreted as markup.

    Args:
        d (dict): mapping to render, one table row per item in iteration order

    Returns:
        str: the HTML markup of the table
    """

    # Imported locally because this notebook also uses ``html`` as a variable
    # name (including at module level), which would shadow a top-level import.
    from html import escape

    rows = [
        f'<tr><td style="text-align:left;">{escape(str(k))}</td><td>{escape(str(v))}</td></tr>'
        for k, v in d.items()
    ]

    return ''.join([
        '<table width="100%" style="width:100%; font-family: monospace;">',
        *rows,
        '</table>',
    ])


def show_as_html(df, n=20):
    """Preview a spark dataframe as html by going through pandas' jupyter display.

    Only the first n rows are converted to pandas before being displayed.

    Args:
        df: spark dataframe to preview
        n (int): number of rows to show (default: 20)
    """

    preview = df.limit(n)
    display(preview.toPandas())

    
def display_spark():
    """Show an HTML status panel describing the Spark session.

    If both the ``spark`` and ``sc`` globals are defined, the panel reports an
    active session with a link to the application UI, the Spark configuration
    as a table, and shutdown notes. Otherwise it reports the session as stopped
    and links to the cluster Spark UI.
    """

    namespace = globals()

    # Guard clause: no session globals means there is nothing running to describe
    if 'spark' not in namespace or 'sc' not in namespace:
        stopped_panel = (
            f'<p><b>Spark</b></p>'
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>'
            f'<ul>'
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>'
            f'</ul>'
        )
        display(HTML(stopped_panel))
        return

    conf = sc.getConf()
    name = conf.get("spark.app.name")

    # Only the port of the reported UI url is kept; the link targets localhost
    ui_port = sc.uiWebUrl.split(":")[-1]
    config_table = dict_to_html(dict(conf.getAll()))

    active_panel = ''.join((
        f'<p><b>Spark</b></p>',
        f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
        f'<ul>',
        f'<li><a href="http://localhost:{ui_port}" target="_blank">Spark Application UI</a></li>',
        f'</ul>',
        f'<p><b>Config</b></p>',
        config_table,
        f'<p><b>Notes</b></p>',
        f'<ul>',
        f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
        f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
        f'</ul>',
    ))
    display(HTML(active_panel))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).

    The total core budget is executor_instances * executor_cores and the number
    of shuffle partitions is set to four per core. The status panel is shown
    once the session has been created.

    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (int): executor memory, formatted as "<n>g" (default: 1);
            use whole numbers, Spark size strings take the form "512m" / "2g"
        master_memory (int): driver memory, formatted as "<n>g" (default: 1);
            use whole numbers, as for worker_memory
    """

    global spark
    global sc

    total_cores = executor_instances * executor_cores
    shuffle_partitions = total_cores * 4

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(total_cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(shuffle_partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        # SAS token that grants access to the user container
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net",  azure_user_token)
        # display_spark() refers to the application by this name
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()

    display_spark()

    
def stop_spark():
    """Stop the active Spark session, if any, and display the resulting status.

    When both the ``spark`` and ``sc`` globals are defined, the session is
    stopped and both globals are deleted. The status panel is displayed in
    every case.
    """

    namespace = globals()

    if {'spark', 'sc'} <= namespace.keys():
        namespace['spark'].stop()
        del namespace['spark'], namespace['sc']

    display_spark()


# Make css changes to improve spark output readability

# Keep preformatted output and dataframe cells on one line, and hide the
# pandas index column of rendered dataframes
html = ['<style>']
html += [
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
]
html.append('</style>')

display(HTML(''.join(html)))

In [2]:
# Run this cell to start a spark session in this notebook
# (4 executors with 4 cores each, 8g of memory per executor and 8g for the driver)

start_spark(executor_instances=4, executor_cores=4, worker_memory=8, master_memory=8)

25/10/05 14:01:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.executor.instances,4
spark.app.id,spark-e83e7004e26c40538f650b1ed2f5d6bc
spark.app.name,rsh224 (notebook)
spark.cores.max,16
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.kubernetes.namespace,rsh224
spark.executor.cores,4


In [3]:
# Write your imports here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *

In [4]:
# Root of the msd directory in the data container; the audio attribute and
# feature paths used below are built from it
directory_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/msd'

In [77]:
# List the attribute files stored under audio/attributes
!hdfs dfs -ls {directory_path}/audio/attributes

Found 13 items
-rwxrwxrwx   1       1051 2024-09-13 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/attributes/msd-jmir-area-of-moments-all-v1.0.attributes.csv
-rwxrwxrwx   1        671 2024-09-13 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/attributes/msd-jmir-lpc-all-v1.0.attributes.csv
-rwxrwxrwx   1        484 2024-09-13 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/attributes/msd-jmir-methods-of-moments-all-v1.0.attributes.csv
-rwxrwxrwx   1        898 2024-09-13 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/attributes/msd-jmir-mfcc-all-v1.0.attributes.csv
-rwxrwxrwx   1        777 2024-09-13 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/attributes/msd-jmir-spectral-all-all-v1.0.attributes.csv
-rwxrwxrwx   1        777 2024-09-13 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/attributes/msd-jmir-spectral-derivatives-all-all

In [78]:
# List the feature datasets stored under audio/features (each entry is a directory)
!hdfs dfs -ls {directory_path}/audio/features

Found 13 items
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/features/msd-jmir-area-of-moments-all-v1.0.csv
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/features/msd-jmir-lpc-all-v1.0.csv
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/features/msd-jmir-methods-of-moments-all-v1.0.csv
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/features/msd-jmir-mfcc-all-v1.0.csv
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/features/msd-jmir-spectral-all-all-v1.0.csv
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus-data@madsstorage002.blob.core.windows.net/msd/audio/features/msd-jmir-spectral-derivatives-all-all-v1.0.csv
drwxrwxrwx   -          0 1970-01-01 12:00 wasbs://campus

In [38]:
# Spark data type to use for each data type name found in the attribute files.
# Keys are lower case because load_feature lower-cases the name before lookup.
TYPE_MAPPER = dict(
    real=FloatType(),
    numeric=FloatType(),
    integer=IntegerType(),
    string=StringType(),
)

def rename_columns(df, prefix):
    """Return df with every column renamed to '<prefix>_<snake_case_name>'.

    Each run of characters other than ASCII letters and digits (spaces,
    hyphens, underscores, ...) is replaced by a single underscore and the
    result is lower-cased before the prefix is prepended.

    Args:
        df: dataframe whose columns are renamed
        prefix (str): text placed in front of every new column name

    Returns:
        the dataframe returned by df.toDF() with the new names, in the
        original column order
    """

    non_alphanumeric = re.compile('[^0-9a-zA-Z]+')
    snake_names = (non_alphanumeric.sub('_', column).lower() for column in df.columns)

    return df.toDF(*[f'{prefix}_{snake}' for snake in snake_names])

def load_feature(feature_name, prefix):
    """Load one audio feature dataset using the schema described by its attributes file.

    Steps:
        1. read '<feature_name>.attributes.csv' from audio/attributes, where the
           first column holds a column name and the second its data type name
        2. build a schema of nullable fields, mapping each type name through
           TYPE_MAPPER (case-insensitive; unknown names fall back to string)
        3. read '<feature_name>.csv' from audio/features without a header,
           applying that schema
        4. rename the columns with rename_columns(features, prefix)

    Args:
        feature_name (str): dataset name shared by the attributes and features files
        prefix (str): prefix handed to rename_columns

    Returns:
        the features dataframe with renamed columns
    """

    attributes_path = f'{directory_path}/audio/attributes/{feature_name}.attributes.csv'
    features_path = f'{directory_path}/audio/features/{feature_name}.csv'

    fields = []
    for row in spark.read.csv(attributes_path).collect():
        # values are stripped of surrounding whitespace before use
        column_name = row._c0.strip()
        type_name = row._c1.strip()
        spark_type = TYPE_MAPPER.get(type_name.lower(), StringType())
        fields.append(StructField(column_name, spark_type, True))

    features = spark.read.csv(features_path, schema=StructType(fields), header=False)

    return rename_columns(features, prefix)


In [39]:
# Load the area-of-moments audio features; every column name gets the "jmom_" prefix
jmom = load_feature('msd-jmir-area-of-moments-all-v1.0', prefix='jmom')

In [34]:
# Check the renamed columns and the data types taken from the attributes file
jmom.printSchema()

root
 |-- jmom_area_method_of_moments_overall_standard_deviation_1: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_standard_deviation_2: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_standard_deviation_3: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_standard_deviation_4: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_standard_deviation_5: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_standard_deviation_6: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_standard_deviation_7: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_standard_deviation_8: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_standard_deviation_9: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_standard_deviation_10: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_average_1: float (nullable = true)
 |-- jmom_area_method_of_moments_overall_average

In [35]:
# Preview the first 20 rows without truncating values (the output is very wide)
jmom.show(20, False)

+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+--------------------------------------------------------+---------------------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+---------------------------------------------+--------------

In [40]:
# Stop the spark session before closing the notebook or restarting the kernel
stop_spark()

25/10/05 15:51:16 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
