In [1]:
# Path of the raw user/artist text file that the next cell reads with Spark.
raw_user_artist_path = "/content/user_artist_data.txt"

In [2]:
# Importing `pyspark.shell` has a side effect: it starts a local Spark session
# (printing the banner shown in this cell's output) and exposes it as `spark`.
from pyspark.shell import spark

# Read the file as plain text: one row per line, in a single column named `value`.
raw_user_artist_data = spark.read.text(raw_user_artist_path)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.5.1
      /_/

Using Python version 3.10.12 (main, Nov 20 2023 15:14:05)
Spark context Web UI available at http://1dbbd8969999:4040
Spark context available as 'sc' (master = local[*], app id = local-1716299852786).
SparkSession available as 'spark'.


In [3]:
# Preview the first 5 raw lines; each holds three space-separated numbers.
raw_user_artist_data.show(5)

+-------------------+
|              value|
+-------------------+
|       1000002 1 55|
| 1000002 1000006 33|
|  1000002 1000007 8|
|1000002 1000009 144|
|1000002 1000010 314|
+-------------------+
only showing top 5 rows



In [4]:
# Load the artist file as raw text lines (single `value` column), parsed in a later cell.
raw_artist_data = spark.read.text("/content/artist_data.txt")

In [5]:
# Preview the first 5 raw artist lines; the id and the name are separated by a tab.
raw_artist_data.show(5)

+--------------------+
|               value|
+--------------------+
|1134999\t06Crazy ...|
|6821360\tPang Nak...|
|10113088\tTerfel,...|
|10151459\tThe Fla...|
|6826647\tBodensta...|
+--------------------+
only showing top 5 rows



In [6]:
# Load the artist alias file as raw text lines (single `value` column), parsed in a later cell.
raw_artist_alias = spark.read.text("/content/artist_alias.txt")

In [7]:
# Preview the first 5 raw alias lines; each holds two tab-separated ids.
raw_artist_alias.show(5)

+-----------------+
|            value|
+-----------------+
| 1092764\t1000311|
| 1095122\t1000557|
| 6708070\t1007267|
|10088054\t1042317|
| 1195917\t1042317|
+-----------------+
only showing top 5 rows



In [8]:
# Same preview as before, widened to 10 rows, ahead of parsing the three fields.
raw_user_artist_data.show(10)

+-------------------+
|              value|
+-------------------+
|       1000002 1 55|
| 1000002 1000006 33|
|  1000002 1000007 8|
|1000002 1000009 144|
|1000002 1000010 314|
|  1000002 1000013 8|
| 1000002 1000014 42|
| 1000002 1000017 69|
|1000002 1000024 329|
|  1000002 1000025 1|
+-------------------+
only showing top 10 rows



In [9]:
from pyspark.sql.functions import split, min, max
from pyspark.sql.types import IntegerType, StringType

# Break each raw line on single spaces once and reuse the resulting array
# column for all three fields.
user_artist_fields = split(raw_user_artist_data["value"], " ")

# Project the pieces as integer columns; the raw `value` string is not kept.
user_artist_df = raw_user_artist_data.select(
    user_artist_fields.getItem(0).cast(IntegerType()).alias("user"),
    user_artist_fields.getItem(1).cast(IntegerType()).alias("artist"),
    user_artist_fields.getItem(2).cast(IntegerType()).alias("count"),
)

# Range check on both id columns. `min`/`max` here are the Spark SQL
# aggregates imported above, not the Python builtins.
user_artist_df.select(
    min("user"),
    max("user"),
    min("artist"),
    max("artist"),
).show()

+---------+---------+-----------+-----------+
|min(user)|max(user)|min(artist)|max(artist)|
+---------+---------+-----------+-----------+
|       90|  2443548|          1|   10794401|
+---------+---------+-----------+-----------+



In [10]:
from pyspark.sql.functions import col


# Split on the first tab only (limit=2), so everything after it, including
# any further tabs, stays in the name field.
artist_fields = split(col("value"), "\t", 2)

# Keep just the parsed columns: integer `id` and string `name`.
artist_by_id = raw_artist_data.select(
    artist_fields.getItem(0).cast(IntegerType()).alias("id"),
    artist_fields.getItem(1).cast(StringType()).alias("name"),
)

artist_by_id.show(5)

+--------+--------------------+
|      id|                name|
+--------+--------------------+
| 1134999|        06Crazy Life|
| 6821360|        Pang Nakarin|
|10113088|Terfel, Bartoli- ...|
|10151459| The Flaming Sidebur|
| 6826647|   Bodenstandig 3000|
+--------+--------------------+
only showing top 5 rows



In [11]:
# Each alias line is "<artist id>\t<alias id>"; split once and reuse.
alias_fields = split(col("value"), "\t")

# Keep just the two parsed integer columns.
artist_alias = raw_artist_alias.select(
    alias_fields.getItem(0).cast(IntegerType()).alias("artist"),
    alias_fields.getItem(1).cast(IntegerType()).alias("alias"),
)

artist_alias.show(5)

+--------+-------+
|  artist|  alias|
+--------+-------+
| 1092764|1000311|
| 1095122|1000557|
| 6708070|1007267|
|10088054|1042317|
| 1195917|1042317|
+--------+-------+
only showing top 5 rows



In [12]:
# Look up both ids from the first alias row (1092764 -> 1000311) to see the
# names that the alias mapping ties together.
artist_by_id.filter(artist_by_id.id.isin(1092764, 1000311)).show()

+-------+--------------+
|     id|          name|
+-------+--------------+
|1000311| Steve Winwood|
|1092764|Winwood, Steve|
+-------+--------------+



In [13]:
from pyspark.sql.functions import broadcast, when


# Replace every aliased artist id with its alias id.
# - The alias table is broadcast for the join.
# - The join is a left join, so plays whose artist has no alias entry are
#   kept; for those rows `alias` is null and the original id is retained.
train_data = (
    user_artist_df
    .join(broadcast(artist_alias), "artist", how="left")
    .withColumn(
        "artist",
        when(col("alias").isNull(), col("artist")).otherwise(col("alias")),
    )
    .withColumn("artist", col("artist").cast(IntegerType()))
    .drop("alias")
)

# Mark the result for caching; it is reused by the model fit and lookups below.
train_data.cache()

# Row count of the resolved data set (also the first action run on it).
train_data.count()

24296858

In [14]:
from pyspark.ml.recommendation import ALS

# Fit an implicit-feedback ALS model on the play counts; the fixed seed keeps
# the fit repeatable between runs.
model = ALS(
    rank=10,
    seed=0,
    maxIter=5,
    regParam=0.1,
    implicitPrefs=True,
    alpha=1.0,
    userCol="user",
    itemCol="artist",
    ratingCol="count",
).fit(train_data)

In [15]:
# Inspect one learned user-factor row without truncation; the vector has
# 10 entries, matching rank=10 used to fit the model.
model.userFactors.show(1, truncate=False)

+---+------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                      |
+---+------------------------------------------------------------------------------------------------------------------------------+
|90 |[0.16020624, 0.20717518, -0.17194684, 0.06038469, 0.062727705, 0.54658705, -0.40481892, 0.43657345, -0.10396775, -0.042728312]|
+---+------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



In [16]:
# User whose listening history and recommendations are inspected below.
user_id = 2093760

# Artist ids this user has plays for in the training data, as plain Python
# values pulled out of the collected rows.
existing_artist_ids = [
    row[0]
    for row in (
        train_data
        .filter(train_data.user == user_id)
        .select("artist")
        .collect()
    )
]

# Show the names behind those ids.
artist_by_id.filter(col("id").isin(existing_artist_ids)).show()

+-------+---------------+
|     id|           name|
+-------+---------------+
|   1180|     David Gray|
|    378|  Blackalicious|
|    813|     Jurassic 5|
|1255340|The Saw Doctors|
|    942|         Xzibit|
+-------+---------------+



In [17]:
# One-row DataFrame holding only the user of interest.
user_subset = (
    train_data
    .select("user")
    .filter(col("user") == user_id)
    .distinct()
)

# Top 5 recommended artists for every user in the subset.
top_predictions = model.recommendForUserSubset(user_subset, 5)

top_predictions.show()

+-------+--------------------+
|   user|     recommendations|
+-------+--------------------+
|2093760|[{2814, 0.0294106...|
+-------+--------------------+



In [18]:
# Bring the recommendation result to the driver as a pandas DataFrame so the
# nested `recommendations` column can be indexed like a Python sequence.
top_predictions_pandas = top_predictions.toPandas()

# Each recommendation prints as an (artist id, score) pair.
print(top_predictions_pandas)

      user                                    recommendations
0  2093760  [(2814, 0.029410677030682564), (1300642, 0.028...


In [19]:
# The first element of every recommendation entry in row 0 is the artist id.
recommended_artist_ids = [
    recommendation[0]
    for recommendation in top_predictions_pandas["recommendations"][0]
]

# Resolve the recommended ids to artist names.
artist_by_id.filter(col("id").isin(recommended_artist_ids)).show()

+-------+----------+
|     id|      name|
+-------+----------+
|   2814|   50 Cent|
|   4605|Snoop Dogg|
|1007614|     Jay-Z|
|1001819|      2Pac|
|1300642|  The Game|
+-------+----------+



In [None]:
def area_under_curve(
        positive_data,
        b_all_artist_IDs,
        predict_function
):
    """Return the mean per-user AUC of ``predict_function`` on held-out data.

    For every user in ``positive_data`` the held-out (user, artist) pairs are
    the positives. Negatives are artists drawn at random from all known
    artists, skipping that user's positives. A user's AUC is the fraction of
    (positive, negative) pairs in which the positive scores strictly higher;
    the result is the average of that fraction over users.

    Args:
        positive_data: DataFrame with at least ``user`` and ``artist`` columns.
        b_all_artist_IDs: Broadcast variable (for example from
            ``SparkContext.broadcast``) whose ``.value`` is an indexable
            sequence of all artist ids.
        predict_function: Callable that takes a DataFrame with ``user`` and
            ``artist`` columns and returns one with ``user`` and
            ``prediction`` columns, e.g. ``model.transform``.

    Returns:
        The mean AUC as a float, or ``None`` when no user has any
        (positive, negative) pair to compare.
    """
    # Function-scope imports keep this definition independent of other cells.
    from random import Random

    from pyspark.sql import functions as F
    from pyspark.sql.types import IntegerType, StructField, StructType

    def _sample_negatives(user_and_positive_artists):
        """Draw random non-positive artists for one user."""
        user_id, positive_artists = user_and_positive_artists
        positive_ids = set(positive_artists)
        # Read the broadcast value on the executor instead of capturing the
        # full id list in the task closure.
        all_artist_ids = b_all_artist_IDs.value
        rng = Random()
        negatives = []
        attempts = 0
        # Stop once there are as many negatives as positives; the cap on
        # attempts avoids looping forever when almost every artist is positive.
        while attempts < len(all_artist_ids) and len(negatives) < len(positive_ids):
            candidate = all_artist_ids[rng.randrange(len(all_artist_ids))]
            if candidate not in positive_ids:
                negatives.append(candidate)
            attempts += 1
        return [(user_id, artist_id) for artist_id in negatives]

    positive_pairs = positive_data.select("user", "artist")
    positive_predictions = predict_function(positive_pairs) \
        .withColumnRenamed("prediction", "positivePrediction")

    negative_rows = positive_pairs.rdd \
        .map(lambda row: (row[0], row[1])) \
        .groupByKey() \
        .flatMap(_sample_negatives)

    # An explicit schema avoids inference, which fails on an empty RDD and
    # would otherwise produce long columns instead of int.
    pair_schema = StructType([
        StructField("user", IntegerType(), False),
        StructField("artist", IntegerType(), False),
    ])
    negative_data = positive_data.sparkSession.createDataFrame(negative_rows, pair_schema)

    negative_predictions = predict_function(negative_data) \
        .withColumnRenamed("prediction", "negativePrediction")

    # Joining on user alone yields every positive/negative pairing per user.
    joined_predictions = positive_predictions.join(negative_predictions, "user") \
        .select("user", "positivePrediction", "negativePrediction") \
        .cache()

    try:
        # Number of pairs per user.
        all_counts = joined_predictions.groupBy("user") \
            .agg(F.count(F.lit(1)).alias("total"))

        # Number of correctly ordered pairs per user.
        correct_counts = joined_predictions \
            .filter(F.col("positivePrediction") > F.col("negativePrediction")) \
            .groupBy("user") \
            .agg(F.count("user").alias("correct"))

        # Users with no correctly ordered pair are missing from
        # `correct_counts`; the left join plus coalesce counts them as 0.
        mean_auc = all_counts.join(correct_counts, ["user"], "left_outer") \
            .select((F.coalesce(F.col("correct"), F.lit(0)) / F.col("total")).alias("auc")) \
            .agg(F.mean("auc")) \
            .first()[0]
    finally:
        joined_predictions.unpersist()

    return mean_auc

# Resolve artist aliases exactly as for the first model: left-join the alias
# table and keep the original id wherever no alias exists.
all_data = user_artist_df.join(broadcast(artist_alias), "artist", how="left"). \
    withColumn("artist", when(col("alias").isNull(), col("artist")). \
               otherwise(col("alias"))). \
    withColumn("artist", col("artist").cast(IntegerType())). \
    drop("alias")

# 90% of the rows train the model, 10% are held out for evaluation; the seed
# keeps the split repeatable.
train_data, cv_data = all_data.randomSplit([0.9, 0.1], seed=54321)
train_data.cache()
cv_data.cache()

# Collect the distinct artist ids themselves (not their count) and ship them
# to the executors as a SparkContext broadcast variable.
# `pyspark.sql.functions.broadcast` is only a DataFrame join hint and cannot
# wrap a Python value.
all_artist_ids = [row[0] for row in all_data.select("artist").distinct().collect()]
b_all_artist_ids = spark.sparkContext.broadcast(all_artist_ids)


model = ALS(rank=10, seed=0, maxIter=5, regParam=0.1, implicitPrefs=True, alpha=1.0, userCol="user", itemCol="artist", ratingCol="count"). \
    fit(train_data)

area_under_curve(cv_data, b_all_artist_ids, model.transform)

In [None]:
from pyspark.sql.functions import sum as _sum


def predict_most_listened(train, data=None):
    """Score (user, artist) pairs by the artist's total play count in ``train``.

    This is a non-personalised baseline: every user gets the same score for a
    given artist.

    Args:
        train: DataFrame with ``artist`` and ``count`` columns, used to compute
            the per-artist totals.
        data: DataFrame with ``user`` and ``artist`` columns to score. When
            omitted, the notebook-level ``all_data`` is scored, as in the
            original one-argument form.

    Returns:
        DataFrame with ``user``, ``artist`` and ``prediction`` columns.
        ``prediction`` is null for artists that do not occur in ``train``.
    """
    listen_counts = train.groupBy("artist"). \
        agg(_sum("count").alias("prediction")). \
        select("artist", "prediction")

    if data is None:
        data = all_data

    return data.join(listen_counts, "artist", "left_outer"). \
        select("user", "artist", "prediction")


# `area_under_curve` needs a callable (compare `model.transform`), so wrap the
# baseline in a function of the data to score instead of passing its result.
area_under_curve(
    cv_data,
    b_all_artist_ids,
    lambda data: predict_most_listened(train_data, data),
)

In [None]:
from pprint import pprint
from itertools import product


# Two candidate values per hyperparameter give 2 * 2 * 2 = 8 combinations.
ranks = [5, 30]
reg_params = [4.0, 0.0001]
alphas = [1.0, 40.0]
hyperparam_combinations = list(product(ranks, reg_params, alphas))

# Collects (auc, (rank, reg_param, alpha)) for every fitted model.
evaluations = []

for c in hyperparam_combinations:
    rank, reg_param, alpha = c

    model = ALS(
        seed=0,
        implicitPrefs=True,
        rank=rank,
        regParam=reg_param,
        alpha=alpha,
        maxIter=20,
        userCol="user",
        itemCol="artist",
        ratingCol="count",
        predictionCol="prediction",
    ).fit(train_data)

    auc = area_under_curve(cv_data, b_all_artist_ids, model.transform)

    # Unpersist this model's factor DataFrames before fitting the next one.
    model.userFactors.unpersist()
    model.itemFactors.unpersist()

    evaluations.append((auc, (rank, reg_param, alpha)))


# Best-scoring combination first.
evaluations.sort(key=lambda evaluation: evaluation[0], reverse=True)
pprint(evaluations)

In [None]:
# Pick 100 distinct users.
some_users = all_data.select("user").distinct().limit(100)

# Top 5 recommended artists for each of them, computed in a single call.
some_recommendations = model.recommendForUserSubset(some_users, 5)

# Print one "user -> artist, artist, ..." line per returned row.
for row in some_recommendations.collect():
    # The first field of each recommendation entry is the artist id.
    recommended_artists = [recommendation[0] for recommendation in row["recommendations"]]
    print(f"{row['user']} -> {', '.join(str(artist) for artist in recommended_artists)}")