In [2]:
# Run this cell to import pyspark and to define start_spark() and stop_spark()

import findspark

findspark.init()

import getpass
import pandas
import pyspark
import random
import re

from IPython.display import display, HTML
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


# Constants used to interact with Azure Blob Storage using the hdfs command or Spark

global username

username = re.sub('@.*', '', getpass.getuser())

global azure_account_name
global azure_data_container_name
global azure_user_container_name
global azure_user_token

azure_account_name = "madsstorage002"
azure_data_container_name = "campus-data"
azure_user_container_name = "campus-user"
azure_user_token = r"sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"


# Functions used below

def dict_to_html(d):
    """Convert a Python dictionary into a two column table for display.
    """

    html = []

    html.append(f'<table width="100%" style="width:100%; font-family: monospace;">')
    for k, v in d.items():
        html.append(f'<tr><td style="text-align:left;">{k}</td><td>{v}</td></tr>')
    html.append(f'</table>')

    return ''.join(html)


def show_as_html(df, n=20):
    """Leverage existing pandas jupyter integration to show a spark dataframe as html.
    
    Args:
        n (int): number of rows to show (default: 20)
    """

    display(df.limit(n).toPandas())

    
def display_spark():
    """Display the status of the active Spark session if one is currently running.
    """
    
    if 'spark' in globals() and 'sc' in globals():

        name = sc.getConf().get("spark.app.name")

        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:green">active</span></b>, look for <code>{name}</code> under the running applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://localhost:{sc.uiWebUrl.split(":")[-1]}" target="_blank">Spark Application UI</a></li>',
            f'</ul>',
            f'<p><b>Config</b></p>',
            dict_to_html(dict(sc.getConf().getAll())),
            f'<p><b>Notes</b></p>',
            f'<ul>',
            f'<li>The spark session <code>spark</code> and spark context <code>sc</code> global variables have been defined by <code>start_spark()</code>.</li>',
            f'<li>Please run <code>stop_spark()</code> before closing the notebook or restarting the kernel or kill <code>{name}</code> by hand using the link in the Spark UI.</li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))
        
    else:
        
        html = [
            f'<p><b>Spark</b></p>',
            f'<p>The spark session is <b><span style="color:red">stopped</span></b>, confirm that <code>{username} (notebook)</code> is under the completed applications section in the Spark UI.</p>',
            f'<ul>',
            f'<li><a href="http://mathmadslinux2p.canterbury.ac.nz:8080/" target="_blank">Spark UI</a></li>',
            f'</ul>',
        ]
        display(HTML(''.join(html)))


# Functions to start and stop spark

def start_spark(executor_instances=2, executor_cores=1, worker_memory=1, master_memory=1):
    """Start a new Spark session and define globals for SparkSession (spark) and SparkContext (sc).
    
    Args:
        executor_instances (int): number of executors (default: 2)
        executor_cores (int): number of cores per executor (default: 1)
        worker_memory (float): worker memory (default: 1)
        master_memory (float): master memory (default: 1)
    """

    global spark
    global sc

    cores = executor_instances * executor_cores
    partitions = cores * 4
    port = 4000 + random.randint(1, 999)

    spark = (
        SparkSession.builder
        .config("spark.driver.extraJavaOptions", f"-Dderby.system.home=/tmp/{username}/spark/")
        .config("spark.dynamicAllocation.enabled", "false")
        .config("spark.executor.instances", str(executor_instances))
        .config("spark.executor.cores", str(executor_cores))
        .config("spark.cores.max", str(cores))
        .config("spark.driver.memory", f'{master_memory}g')
        .config("spark.executor.memory", f'{worker_memory}g')
        .config("spark.driver.maxResultSize", "0")
        .config("spark.sql.shuffle.partitions", str(partitions))
        .config("spark.kubernetes.container.image", "madsregistry001.azurecr.io/hadoop-spark:v3.3.5-openjdk-8")
        .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
        .config("spark.kubernetes.memoryOverheadFactor", "0.3")
        .config("spark.memory.fraction", "0.1")
        .config(f"fs.azure.sas.{azure_user_container_name}.{azure_account_name}.blob.core.windows.net",  azure_user_token)
        .config("spark.app.name", f"{username} (notebook)")
        .getOrCreate()
    )
    sc = SparkContext.getOrCreate()
    
    display_spark()

    
def stop_spark():
    """Stop the active Spark session and delete globals for SparkSession (spark) and SparkContext (sc).
    """

    global spark
    global sc

    if 'spark' in globals() and 'sc' in globals():

        spark.stop()

        del spark
        del sc

    display_spark()


# Make css changes to improve spark output readability

html = [
    '<style>',
    'pre { white-space: pre !important; }',
    'table.dataframe td { white-space: nowrap !important; }',
    'table.dataframe thead th:first-child, table.dataframe tbody th { display: none; }',
    '</style>',
]
display(HTML(''.join(html)))

In [3]:
# Run this cell to start a spark session in this notebook

start_spark(executor_instances=4, executor_cores=4, worker_memory=8, master_memory=8)

25/10/13 21:49:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


0,1
spark.dynamicAllocation.enabled,false
spark.app.submitTime,1760345373770
spark.fs.azure.sas.campus-user.madsstorage002.blob.core.windows.net,"""sp=racwdl&st=2025-08-01T09:41:33Z&se=2026-12-30T16:56:33Z&spr=https&sv=2024-11-04&sr=c&sig=GzR1hq7EJ0lRHj92oDO1MBNjkc602nrpfB5H8Cl7FFY%3D"""
spark.kubernetes.driver.pod.name,spark-master-driver
spark.executor.instances,4
spark.app.name,rsh224 (notebook)
spark.cores.max,16
spark.kubernetes.container.image.pullPolicy,IfNotPresent
spark.kubernetes.namespace,rsh224
spark.executor.cores,4


In [80]:
# Write your imports here or insert cells below

from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RankingEvaluator
import sys
import os


sys.path.append(os.path.abspath(".."))

from helpers import load_feature, stratified_split, apply_class_weights, get_multiclass_metrics


In [4]:
directory_path = f'wasbs://{azure_data_container_name}@{azure_account_name}.blob.core.windows.net/msd'

In [5]:
schema = StructType([
    StructField('user_id', StringType()),
    StructField('song_id', StringType()),
    StructField('play_count', IntegerType(), True)
])

In [6]:
triplets = spark.read.csv(
    f'{directory_path}/tasteprofile/triplets.tsv',
    schema=schema,
    sep='\t'
)

In [23]:
triplets.select(
    F.countDistinct('user_id').alias('unique_users_count'),
    F.countDistinct('song_id').alias('unique_songs_count')
).show()



+------------------+------------------+
|unique_users_count|unique_songs_count|
+------------------+------------------+
|           1019318|            384546|
+------------------+------------------+



                                                                                

In [7]:
popularSongs = (triplets
    .groupBy('song_id')
    .agg(F.sum('play_count').alias('total_plays'))
    .filter(F.col('total_plays') >= 20)
)

In [8]:
activeUsers = (triplets
    .groupBy('user_id')
    .agg(F.countDistinct('song_id').alias('songs_played'))
    .filter(F.col('songs_played') >= 20)
)

In [9]:
filtered_triplets = (triplets
                    .join(popularSongs, on='song_id', how='inner')
                    .join(activeUsers, on='user_id', how='inner')
).drop('total_plays', 'songs_played')

In [10]:
filtered_triplets.count()

                                                                                

42792690

In [40]:
filtered_triplets.select(
    F.countDistinct('user_id').alias('unique_users_count'),
    F.countDistinct('song_id').alias('unique_songs_count')
).show()



+------------------+------------------+
|unique_users_count|unique_songs_count|
+------------------+------------------+
|            661103|            227695|
+------------------+------------------+



                                                                                

In [41]:
original_count = triplets.count()

                                                                                

In [42]:
final_count = filtered_triplets.count()

                                                                                

In [37]:
filtered_percen = ((original_count - final_count)/original_count) * 100

filtered_percen

11.537073145662594

# Encoding

In [11]:
user_indexer = StringIndexer(inputCol='user_id', outputCol='user_index', handleInvalid='skip')
song_indexer = StringIndexer(inputCol='song_id', outputCol='song_index', handleInvalid='skip')

encoded_triplets = user_indexer.fit(filtered_triplets).transform(filtered_triplets)
encoded_triplets = song_indexer.fit(encoded_triplets).transform(encoded_triplets)

25/10/13 12:51:35 WARN DAGScheduler: Broadcasting large task binary with size 35.6 MiB
25/10/13 12:51:35 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:51:48 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:52:03 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:52:09 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:52:16 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
                                                                                

In [46]:
encoded_triplets.show(20, False)

25/10/12 15:20:31 WARN DAGScheduler: Broadcasting large task binary with size 35.6 MiB
25/10/12 15:20:31 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/12 15:20:45 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/12 15:21:02 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/12 15:21:09 WARN DAGScheduler: Broadcasting large task binary with size 48.9 MiB
[Stage 412:>                                                        (0 + 1) / 1]

+----------------------------------------+------------------+----------+----------+----------+
|user_id                                 |song_id           |play_count|user_index|song_index|
+----------------------------------------+------------------+----------+----------+----------+
|00007f902e31b0693a023e9c234461d4e6991eec|SOITDGY12A8C1334DA|1         |609874.0  |5705.0    |
|00007f902e31b0693a023e9c234461d4e6991eec|SOPUDWU12A8AE4612A|1         |609874.0  |1326.0    |
|00007f902e31b0693a023e9c234461d4e6991eec|SOAUUQZ12A6310E8C4|1         |609874.0  |143142.0  |
|00007f902e31b0693a023e9c234461d4e6991eec|SOODVNE12AB018D4EA|3         |609874.0  |50363.0   |
|00007f902e31b0693a023e9c234461d4e6991eec|SOSRJRK12AB0186FF4|1         |609874.0  |861.0     |
|00007f902e31b0693a023e9c234461d4e6991eec|SOIGXKF12A6701E096|3         |609874.0  |828.0     |
|00007f902e31b0693a023e9c234461d4e6991eec|SOLAIVS12A6D4FA40A|13        |609874.0  |86986.0   |
|00007f902e31b0693a023e9c234461d4e6991eec|SOGPBAW1

                                                                                

# Train test splitting

In [12]:
train, test_unfiltered = encoded_triplets.randomSplit([0.2, 0.8], seed=43)

In [13]:
test_unfiltered.count()

25/10/13 12:52:19 WARN DAGScheduler: Broadcasting large task binary with size 35.6 MiB
25/10/13 12:52:20 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:52:33 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:52:49 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:53:01 WARN DAGScheduler: Broadcasting large task binary with size 48.9 MiB
                                                                                

34233835

In [14]:
train_users = train.select('user_id').distinct()
test_users = test_unfiltered.select('user_id').distinct()

In [15]:
test_only_users = test_users.join(train_users, on='user_id', how='left_anti')
test = test_unfiltered.join(test_only_users, on='user_id', how='left_anti')

In [16]:
# these are the 'cold-start' users that are in test set but not in train set
# they have been removed using the left_anti join
test_only_users.count()

25/10/13 12:53:13 WARN DAGScheduler: Broadcasting large task binary with size 35.6 MiB
25/10/13 12:53:14 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:53:28 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:53:48 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:53:56 WARN DAGScheduler: Broadcasting large task binary with size 48.9 MiB
                                                                                

1222

In [17]:
test.count()

25/10/13 12:54:15 WARN DAGScheduler: Broadcasting large task binary with size 35.6 MiB
25/10/13 12:54:16 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:54:30 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:54:49 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:54:59 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
                                                                                

34206398

In [18]:
train.cache()
test.cache()

DataFrame[user_id: string, song_id: string, play_count: int, user_index: double, song_index: double]

# Model Training

In [None]:
als = ALS(
    userCol='user_index',
    itemCol='song_index',
    ratingCol='play_count',
    implicitPrefs=True,
    seed=42,
    maxIter=10,
    regParam=0.1,
    alpha=40
)

model = als.fit(train)

# Hand-testing model performance

In [20]:
test_user_activity = test.groupBy('user_index').agg(
    F.countDistinct('song_index').alias('songs_listened'),
    F.sum('play_count').alias('total_plays')
)

hand_test_users = (test_user_activity
                   .filter(F.col('total_plays') == 15)
                   .select('user_index')
                   .distinct()
                   .limit(5)
                  )

hand_test_users.cache()
hand_test_users.show()

25/10/13 12:57:41 WARN DAGScheduler: Broadcasting large task binary with size 35.6 MiB
25/10/13 12:57:41 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:57:57 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:58:15 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:58:24 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 12:58:52 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 12:58:56 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 12:58:58 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 12:59:00 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
[Stage 643:>                                                        (0 + 1) / 1]

+----------+
|user_index|
+----------+
|  596955.0|
|  468765.0|
|  628855.0|
|  648877.0|
|  646185.0|
+----------+



                                                                                

In [21]:
hand_test_recs = model.recommendForUserSubset(hand_test_users, numItems=10)
hand_test_recs.show(truncate=False)

25/10/13 12:59:03 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 12:59:04 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 12:59:10 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
[Stage 715:>                                                        (0 + 1) / 1]

+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_index|recommendations                                                                                                                                                                              |
+----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|648877    |[{247, 0.2377797}, {4, 0.2233335}, {9, 0.18882796}, {101, 0.17528135}, {176, 0.17458627}, {486, 0.16999}, {430, 0.16942908}, {597, 0.16840437}, {766, 0.16833577}, {1748, 0.16388376}]       |
|596955    |[{1, 0.26126173}, {71, 0.24834356}, {14, 0.23733969}, {5, 0.23639767}, {92, 0.2339701}, {132, 0.23187818}, {88, 0.22946031}, {110, 0.21819355}, {0, 0.21778768}, {13, 0.20916529

                                                                                

In [22]:
metadata = spark.read.csv(
    f'{directory_path}/main/metadata.csv.gz',
    inferSchema=True,
    header=True
)

                                                                                

In [23]:
song_id_mapper = encoded_triplets.select('song_id', 'song_index').distinct()

In [57]:
recommended_full = (hand_test_recs
               .withColumn('recs', F.explode('recommendations'))
               .select('user_index', F.col('recs.song_index'), F.col('recs.rating'))
               .join(song_id_mapper, on='song_index', how='inner')
               .join(metadata, on='song_id', how='inner')                              
)

recommended_songs = recommended_full.groupBy('user_index').agg(F.collect_list('title').alias('recommended_songs'))        

In [25]:
recommended_songs.show(truncate=False)

25/10/13 12:59:18 WARN DAGScheduler: Broadcasting large task binary with size 44.5 MiB
25/10/13 12:59:18 WARN DAGScheduler: Broadcasting large task binary with size 9.0 MiB
25/10/13 12:59:19 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 12:59:19 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB
25/10/13 12:59:20 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 12:59:33 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 12:59:37 WARN DAGScheduler: Broadcasting large task binary with size 44.6 MiB
25/10/13 12:59:44 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 13:00:01 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 13:00:08 WARN DAGScheduler: Broadcasting large task binary with size 45.7 MiB
25/10/13 13:00:20 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 13:00:22 WARN DAGScheduler: Broadcas

+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_index|recommended_songs                                                                                                                                                                                 |
+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|646185    |[Under Pressure, Have You Ever Seen The Rain, Ride The Lightning, Electric Avenue, Bad Moon Rising, Kun Puut Tekee Seittiä, Come As You Are, Burden In My Hand, Fortunate Son, The Memory Remains]|
|648877    |[Wild World, Video Killed The Radio Star, Sweet And Wild, Where I Stand, Restless, Peace Train, Tive Sim, You're The One, A Beggar On A Beach Of Gold, If Yo

                                                                                

In [44]:
test_full = (test
 .join(hand_test_users, on='user_index', how='inner')
 .join(metadata, on='song_id', how='left')
 .select('user_index', 'song_index', 'artist_name', 'title', 'release')
)

actual = hand_test_users.join(
    test_full,
    how='inner',
    on='user_index'
)

In [27]:
actual_songs.show(truncate=False)

25/10/13 13:00:26 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 13:00:27 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 13:00:28 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 13:00:33 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 13:00:36 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
[Stage 877:>                                                        (0 + 1) / 1]

+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_index|songs_listened                                                                                                                                                                                                                                                                                                                                             |
+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [28]:
actual_vs_recommended = actual_songs.join(
    recommended_songs,
    on='user_index',
    how='inner'
)

In [29]:
# this dataframe contains the recommended vs actual songs for five hand-picked users
actual_vs_recommended.show(truncate=False)                        

25/10/13 13:14:49 WARN DAGScheduler: Broadcasting large task binary with size 44.5 MiB
25/10/13 13:14:50 WARN DAGScheduler: Broadcasting large task binary with size 9.0 MiB
25/10/13 13:14:50 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 13:14:50 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB
25/10/13 13:14:52 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 13:14:53 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 13:14:53 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 13:15:05 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 13:15:06 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 13:15:09 WARN DAGScheduler: Broadcasting large task binary with size 44.6 MiB
25/10/13 13:15:20 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 13:15:26 WARN DAGScheduler: Broadcas

+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_index|songs_listened                                                                                                                                                                                                                                                                                                                                             |recommended_songs                                                                   

                                                                                

In [70]:
def display_songs(user_index):
    print(f'User: {user_index}')
    print('Songs Played:')
    (actual
         .filter(F.col('user_index') == user_index)
         .select('artist_name', 'title')
         .show(truncate=False)
    )

    print('Recommended Songs: ')
    (recommended_full
         .filter(F.col('user_index') == user_index)
         .select('rating', 'artist_name', 'title')
         .orderBy(F.desc('rating'))
         .show(truncate=False))

In [71]:
hand_test_users_list = [596955.0, 468765.0, 628855.0, 648877.0, 646185.0]

for user in hand_test_users_list:
    display_songs(user)

User: 596955.0
Songs Played:


25/10/13 14:09:24 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:09:25 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:09:26 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:09:31 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:09:32 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:09:33 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
                                                                                

+---------------------+------------------------------+
|artist_name          |title                         |
+---------------------+------------------------------+
|Frightened Rabbit    |Skip The Youth                |
|Soltero              |Hands Up                      |
|Interpol             |A Time To Be So Small         |
|Foals                |Electric Bloom                |
|The Police           |Walking On The Moon           |
|Bat For Lashes       |Moon And Moon                 |
|Erin McKeown         |You Mustn't Kick It Around    |
|The Kills            |Goodnight Bad Morning         |
|Emmy The Great       |Mia                           |
|Soltero              |The Prize                     |
|Vampire Weekend      |Cape Cod Kwassa Kwassa (Album)|
|The New Pornographers|Centre For Holy Wars          |
|the bird and the bee |Again & Again                 |
|Camera Obscura       |Country Mile                  |
|Tiny Vipers          |They Might Follow You         |
+---------

25/10/13 14:09:35 WARN DAGScheduler: Broadcasting large task binary with size 44.5 MiB
25/10/13 14:09:35 WARN DAGScheduler: Broadcasting large task binary with size 9.0 MiB
25/10/13 14:09:36 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:09:36 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB
25/10/13 14:09:37 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:09:52 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:09:55 WARN DAGScheduler: Broadcasting large task binary with size 44.6 MiB
25/10/13 14:10:03 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:10:19 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:10:26 WARN DAGScheduler: Broadcasting large task binary with size 45.7 MiB
25/10/13 14:10:38 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:10:42 WARN DAGScheduler: Broadcas

+----------+----------------------+--------------------------------+
|rating    |artist_name           |title                           |
+----------+----------------------+--------------------------------+
|0.26126173|Florence + The Machine|Dog Days Are Over (Radio Edit)  |
|0.24834356|Tiny Vipers           |They Might Follow You           |
|0.23733969|Five Iron Frenzy      |Canada                          |
|0.23639767|Kings Of Leon         |Revelry                         |
|0.2339701 |The Postal Service    |Such Great Heights              |
|0.23187818|Emmy The Great        |Mia                             |
|0.22946031|The Ruts              |West One (Shine On Me)          |
|0.21819355|Panic At The Disco    |Behind The Sea [Live In Chicago]|
|0.21778768|Harmonia              |Sehr kosmisch                   |
|0.20916529|Coldplay              |The Scientist                   |
+----------+----------------------+--------------------------------+

User: 468765.0
Songs Played:


25/10/13 14:10:45 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:10:45 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:10:46 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:10:52 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:10:53 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:10:54 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
                                                                                

+------------------------------+----------------------------------+
|artist_name                   |title                             |
+------------------------------+----------------------------------+
|The Killers                   |Under The Gun                     |
|The Pussycat Dolls            |I Don't Need A Man                |
|Rihanna                       |Breakin' Dishes                   |
|Coldplay                      |Speed Of Sound                    |
|Lady GaGa                     |Beautiful_ Dirty_ Rich            |
|Lily Allen                    |Not Fair (Clean Radio Edit)       |
|Lily Allen                    |Not Fair                          |
|Miley Cyrus                   |Talk Is Cheap                     |
|Plies                         |Watch Dis (Explicit Album Version)|
|Kings Of Leon                 |Happy Alone                       |
|Kings Of Leon                 |Happy Alone                       |
|Mariah Carey / Twista         |One And Only    

25/10/13 14:10:57 WARN DAGScheduler: Broadcasting large task binary with size 44.5 MiB
25/10/13 14:10:57 WARN DAGScheduler: Broadcasting large task binary with size 9.0 MiB
25/10/13 14:10:57 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:10:57 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB
25/10/13 14:10:58 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:11:13 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:11:16 WARN DAGScheduler: Broadcasting large task binary with size 44.6 MiB
25/10/13 14:11:23 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:11:37 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:11:45 WARN DAGScheduler: Broadcasting large task binary with size 45.7 MiB
25/10/13 14:11:55 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:11:59 WARN DAGScheduler: Broadcas

+----------+-------------+-------------------+
|rating    |artist_name  |title              |
+----------+-------------+-------------------+
|0.63378936|Coldplay     |The Scientist      |
|0.60080063|Coldplay     |Clocks             |
|0.59467715|Coldplay     |Shiver             |
|0.5805102 |Coldplay     |In My Place        |
|0.57369894|Coldplay     |Fix You            |
|0.5568106 |Coldplay     |Yellow             |
|0.5568106 |Coldplay     |Yellow             |
|0.5298537 |Coldplay     |Don't Panic        |
|0.5230598 |Kings Of Leon|Use Somebody       |
|0.5230598 |Kings Of Leon|Use Somebody       |
|0.5162204 |Coldplay     |Speed Of Sound     |
|0.48417148|The Killers  |When You Were Young|
+----------+-------------+-------------------+

User: 628855.0
Songs Played:


25/10/13 14:12:02 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:12:03 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:12:03 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:12:09 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:12:11 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:12:12 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
                                                                                

+-----------------------+----------------------------------------+
|artist_name            |title                                   |
+-----------------------+----------------------------------------+
|Deviates               |My Life                                 |
|Cute Is What We Aim For|There's A Class For This (Album Version)|
|Skids                  |Charade                                 |
|G. Love                |Stone Me                                |
|Olle Adolphson         |En Glad Calypso Om Våren (Live '62)     |
|Pixies                 |Break My Body                           |
|Peter Tosh             |Rastafari Is                            |
|Legião Urbana          |Come Share My Life                      |
|Gustavo Cerati         |Uno Entre 1000                          |
|Guru                   |Hall of Fame                            |
|Beastie Boys           |Intergalactic                           |
|Cake                   |Frank Sinatra                        

25/10/13 14:12:14 WARN DAGScheduler: Broadcasting large task binary with size 44.5 MiB
25/10/13 14:12:14 WARN DAGScheduler: Broadcasting large task binary with size 9.0 MiB
25/10/13 14:12:14 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:12:14 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB
25/10/13 14:12:16 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:12:30 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:12:34 WARN DAGScheduler: Broadcasting large task binary with size 44.6 MiB
25/10/13 14:12:40 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:12:55 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:13:02 WARN DAGScheduler: Broadcasting large task binary with size 45.7 MiB
25/10/13 14:13:13 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:13:17 WARN DAGScheduler: Broadcas

+----------+------------------------------+---------------------------------------+
|rating    |artist_name                   |title                                  |
+----------+------------------------------+---------------------------------------+
|0.16661032|Cartola                       |Tive Sim                               |
|0.15510224|Nine Inch Nails               |Wish                                   |
|0.1442335 |White Denim                   |Transparency                           |
|0.14398651|Maná                          |Rayando el sol                         |
|0.13882193|Queen                         |Love Of My Life (1993 Digital Remaster)|
|0.13843735|Base Ball Bear                |Sayonara-Nostalgia                     |
|0.1375887 |Radiohead                     |Creep (Explicit)                       |
|0.13655047|Kid Cudi / Kanye West / Common|Make Her Say                           |
|0.13638781|Radiohead                     |(Nice Dream)                     

25/10/13 14:13:20 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:13:20 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:13:21 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:13:26 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:13:27 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:13:29 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
                                                                                

+------------------------------------------+-------------------------------------------------------+
|artist_name                               |title                                                  |
+------------------------------------------+-------------------------------------------------------+
|Newcleus                                  |Jam On It (Accapella)                                  |
|Combat 84                                 |Barry Prudom (Re-Mix)                                  |
|Chris Rea                                 |Tell Me There's A Heaven                               |
|Don McLean                                |Vincent                                                |
|Little Jackie                             |The Stoop (Explicit)                                   |
|Jarvis Church                             |Love Song (Featuring C4)                               |
|Wyclef Jean featuring Paul Simon          |Fast Car                                       

25/10/13 14:13:30 WARN DAGScheduler: Broadcasting large task binary with size 44.5 MiB
25/10/13 14:13:30 WARN DAGScheduler: Broadcasting large task binary with size 9.0 MiB
25/10/13 14:13:31 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:13:31 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB
25/10/13 14:13:32 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:13:48 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:13:51 WARN DAGScheduler: Broadcasting large task binary with size 44.6 MiB
25/10/13 14:13:58 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:14:13 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:14:20 WARN DAGScheduler: Broadcasting large task binary with size 45.7 MiB
25/10/13 14:14:30 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:14:35 WARN DAGScheduler: Broadcas

+----------+-----------------------------+---------------------------------+
|rating    |artist_name                  |title                            |
+----------+-----------------------------+---------------------------------+
|0.2377797 |Cat Stevens                  |Wild World                       |
|0.2233335 |Dwight Yoakam                |You're The One                   |
|0.18882796|Cartola                      |Tive Sim                         |
|0.17528135|Mike And The Mechanics       |A Beggar On A Beach Of Gold      |
|0.17458627|The Buggles                  |Video Killed The Radio Star      |
|0.16999   |Cat Stevens                  |Peace Train                      |
|0.16942908|Radney Foster                |Sweet And Wild                   |
|0.16840437|Days Of The New              |Where I Stand                    |
|0.16833577|Cat Stevens                  |If You Want To Sing Out_ Sing Out|
|0.16388376|Alison Krauss / Union Station|Restless                         |

25/10/13 14:14:38 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:14:38 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:14:39 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:14:45 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:14:46 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:14:48 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
                                                                                

+-----------------------+-----------------------------------+
|artist_name            |title                              |
+-----------------------+-----------------------------------+
|A Perfect Circle       |The Outsider                       |
|King Tubby             |Dub On My Mind Dub                 |
|The Streets            |The Irony Of It All (Album Version)|
|The Streets            |Don't Mug Yourself (Out Takes)     |
|The Streets            |Weak Become Heroes                 |
|Metallica              |Ride The Lightning                 |
|BEFORE THE DAWN        |Faithless                          |
|Bruce Dickinson        |Road To Hell                       |
|Metallica              |One                                |
|Five Finger Death Punch|Bad Company                        |
|Matthew Good           |Weapon                             |
|A Perfect Circle       |Magdalena                          |
|The Streets            |Not Addicted                       |
|Norman 

25/10/13 14:14:49 WARN DAGScheduler: Broadcasting large task binary with size 44.5 MiB
25/10/13 14:14:49 WARN DAGScheduler: Broadcasting large task binary with size 9.0 MiB
25/10/13 14:14:50 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:14:50 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB
25/10/13 14:14:51 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 14:15:06 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:15:10 WARN DAGScheduler: Broadcasting large task binary with size 44.6 MiB
25/10/13 14:15:17 WARN DAGScheduler: Broadcasting large task binary with size 49.1 MiB
25/10/13 14:15:32 WARN DAGScheduler: Broadcasting large task binary with size 35.7 MiB
25/10/13 14:15:40 WARN DAGScheduler: Broadcasting large task binary with size 45.7 MiB
25/10/13 14:15:51 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 14:15:55 WARN DAGScheduler: Broadcas

+----------+------------------------------+---------------------------+
|rating    |artist_name                   |title                      |
+----------+------------------------------+---------------------------+
|0.1975654 |Creedence Clearwater Revival  |Fortunate Son              |
|0.19175874|Creedence Clearwater Revival  |Have You Ever Seen The Rain|
|0.18644644|Metallica                     |Ride The Lightning         |
|0.17989723|Metallica / Marianne Faithfull|The Memory Remains         |
|0.17886128|Queen                         |Under Pressure             |
|0.1738212 |Nirvana                       |Come As You Are            |
|0.16669244|Creedence Clearwater Revival  |Bad Moon Rising            |
|0.16512457|Soundgarden                   |Burden In My Hand          |
|0.16479841|Eddy Grant                    |Electric Avenue            |
|0.16273695|Scandinavian Music Group      |Kun Puut Tekee Seittiä     |
+----------+------------------------------+---------------------

                                                                                

# Model Evaluation

In [76]:
k=10

als_recommendations = model.recommendForAllUsers(k)

In [77]:
als_recommendations.printSchema()

root
 |-- user_index: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- song_index: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [78]:
show_as_html(als_recommendations)

25/10/13 16:30:25 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 16:34:02 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
                                                                                

Unnamed: 0,user_index,recommendations
0,38,"[(235, 1.1846497058868408), (249, 1.1527633666..."
1,57,"[(206, 0.9366303086280823), (214, 0.8701990246..."
2,64,"[(85, 1.5044667720794678), (5, 1.4657384157180..."
3,77,"[(290, 1.086869716644287), (782, 1.02669608592..."
4,80,"[(494, 1.0465326309204102), (296, 0.9302388429..."
5,85,"[(290, 0.8564937710762024), (213, 0.8261248469..."
6,94,"[(470, 1.0850210189819336), (182, 1.0431475639..."
7,104,"[(363, 1.2611483335494995), (246, 1.2267875671..."
8,111,"[(306, 1.0983549356460571), (533, 1.0365855693..."
9,146,"[(226, 1.2012583017349243), (264, 1.1167669296..."


In [84]:
predicted = (
    als_recommendations
    .withColumn('rec', F.explode('recommendations'))
    .select('user_index', F.col('rec.song_index').alias('song_index'))
)

predicted_grouped = (
    predicted
    .groupBy('user_index')
    .agg(F.collect_list('song_index').cast(ArrayType(DoubleType())).alias('predicted_songs'))
)

actual_grouped = (
    test
    .groupBy('user_index')
    .agg(F.collect_set('song_index').cast(ArrayType(DoubleType())).alias('actual_songs'))
)

joined = predicted_grouped.join(actual_grouped, on='user_index', how='inner')

In [85]:
evaluator = RankingEvaluator(predictionCol='predicted_songs', labelCol='actual_songs')

precisionAtK = evaluator.evaluate(joined, {evaluator.metricName: "precisionAtK", evaluator.k: k})
mapAtK = evaluator.evaluate(joined, {evaluator.metricName: "meanAveragePrecisionAtK", evaluator.k: k})
ndcgAtK = evaluator.evaluate(joined, {evaluator.metricName: "ndcgAtK", evaluator.k: k})

25/10/13 16:57:58 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 16:57:58 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 17:01:26 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 17:01:32 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 17:01:38 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 17:01:42 WARN DAGScheduler: Broadcasting large task binary with size 49.2 MiB
25/10/13 17:01:45 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 17:01:45 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 17:05:18 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 17:05:19 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 17:05:25 WARN DAGScheduler: Broadcasting large task binary with size 49.0 MiB
25/10/13 17:05:29 WARN DAGScheduler: Broadc

In [86]:
print(f'metrics for implicit feedback')
print(f'')
print(f'precision @ K: {precisionAtK:.5f}')
print(f'MAP @ K:       {mapAtK:.5f}')
print(f'NDCG @ K:      {ndcgAtK:.5f}')
print(f'')

metrics for implicit feedback

precision @ K: 0.10040
MAP @ K:       0.05153
NDCG @ K:      0.10578



In [99]:
stop_spark()

25/10/11 16:16:12 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.
