In [1]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

global username

username = re.sub('@.*', '', getpass.getuser())

global azure_account_name
global azure_data_container_name
global azure_user_container_name
global azure_user_token

azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"
azure_user_token = r"sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")

        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://localhost:{sc.uiWebUrl.split(":")[-1]}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net",  azure_user_token)
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

In [2]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=4, worker_memory=8, master_memory=8)

25/10/07 16:11:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.executor.instances,4
spark.app.name,rsh224 (notebook)
spark.cores.max,16
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.kubernetes.namespace,rsh224
spark.executor.cores,4
spark.driver.memory,8g


In [3]:
# Write your imports here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.sql.window import Window
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import sys
import os


sys.path.append(os.path.abspath(".."))

from helpers import load_feature, stratified_split, apply_class_weights, get_multiclass_metrics


In [4]:
directory_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/msd'

In [5]:
features_path = f'{username}/msd/output/feature_genre'
input_path = f'wasbs://{azure_user_container_name}@{azure_account_name}.blob.core.windows.net/{features_path}'

In [6]:
features = spark.read.csv(
    input_path,
    inferSchema=True,
    header=True
)

                                                                                

# Genre Encoding

In [7]:
indexer = StringIndexer(inputCol='genre', outputCol='label')

features_indexed = indexer.fit(features).transform(features)

                                                                                

In [8]:
class_balance_df = features_indexed.groupBy('genre', 'label').count().orderBy('count', ascending=False)
total = features_indexed.count()
class_balance_df = class_balance_df.withColumn('proportion', F.round(F.col('count') / total, 3))

                                                                                

In [9]:
class_balance_df.show()



+--------------+-----+------+----------+
|         genre|label| count|proportion|
+--------------+-----+------+----------+
|      Pop_Rock|  0.0|237641|     0.565|
|    Electronic|  1.0| 40662|     0.097|
|           Rap|  2.0| 20899|      0.05|
|          Jazz|  3.0| 17774|     0.042|
|         Latin|  4.0| 17503|     0.042|
|           RnB|  5.0| 14314|     0.034|
| International|  6.0| 14193|     0.034|
|       Country|  7.0| 11691|     0.028|
|     Religious|  8.0|  8779|     0.021|
|        Reggae|  9.0|  6928|     0.016|
|         Blues| 10.0|  6800|     0.016|
|         Vocal| 11.0|  6182|     0.015|
|          Folk| 12.0|  5789|     0.014|
|       New Age| 13.0|  4000|      0.01|
| Comedy_Spoken| 14.0|  2067|     0.005|
|         Stage| 15.0|  1613|     0.004|
|Easy_Listening| 16.0|  1535|     0.004|
|   Avant_Garde| 17.0|  1012|     0.002|
|     Classical| 18.0|   555|     0.001|
|      Children| 19.0|   463|     0.001|
+--------------+-----+------+----------+
only showing top

                                                                                

# Scaling and Stratified Sampling

In [17]:
feature_cols = [c for c in features_indexed.columns if c not in ('track_id', 'genre', 'label')]

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features_raw')
assembled_features = assembler.transform(features_indexed)



In [18]:
scaler = StandardScaler(
    inputCol='features_raw',
    outputCol='features',
    withMean=True,
    withStd=True
)

scaled_df = scaler.fit(assembled_features).transform(assembled_features)

                                                                                

In [19]:
scaled_df = scaled_df.select('track_id', 'features', 'label')

In [20]:
train, test = stratified_split(scaled_df, label_col='label', train_fraction=0.8)

                                                                                

# Training without weights

In [21]:
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    family='multinomial',
    maxIter=100
)

model = lr.fit(train)
predictions = model.transform(train)



In [27]:
evaluations = get_multiclass_metrics(predictions)
eval_df = spark.createDataFrame([Row(**evaluations)])
eval_df.show()

                                                                                

+-------------------+------------------+------------------+------------------+
|           accuracy|         precision|            recall|                f1|
+-------------------+------------------+------------------+------------------+
|0.41458304232434695|0.6768299947470359|0.4145830423243471|0.4768539237104466|
+-------------------+------------------+------------------+------------------+



# Training with Class Weights

In [24]:
train_weighted = apply_class_weights(train)

                                                                                

In [25]:
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    weightCol='weight',
    family='multinomial',
    maxIter=100
)

model = lr.fit(train_weighted)
predictions = model.transform(train_weighted)



In [26]:
evaluations = get_multiclass_metrics(predictions)
eval_df = spark.createDataFrame([Row(**evaluations)])
eval_df.show()

                                                                                

+-------------------+------------------+------------------+------------------+
|           accuracy|         precision|            recall|                f1|
+-------------------+------------------+------------------+------------------+
|0.41458304232434695|0.6768299947470359|0.4145830423243471|0.4768539237104466|
+-------------------+------------------+------------------+------------------+



In [None]:
stop_spark()