In [None]:
import pyspark

from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the notebook's Spark session with 6g of driver memory.
spark = (
    SparkSession.builder
    .appName('chapter_3')
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

### Setting Up the Data

In [None]:
# Read the user/artist/count file as plain text: one row per line, in a
# single string column named 'value'.
raw_user_artist_path = "data/audioscrobbler_data/user_artist_data.txt"
raw_user_artist_data = spark.read.format("text").load(raw_user_artist_path)

raw_user_artist_data.show(n=5)

In [None]:
# Read the artist ID/name file as plain text (single 'value' column).
raw_artist_data = spark.read.format("text").load(
    "data/audioscrobbler_data/artist_data.txt"
)

raw_artist_data.show(n=5)

In [None]:
# Read the artist alias file as plain text (single 'value' column).
raw_artist_alias = spark.read.format("text").load(
    "data/audioscrobbler_data/artist_alias.txt"
)

raw_artist_alias.show(n=5)

### Preparing the Data

In [None]:
# Preview the first ten raw lines before parsing them.
raw_user_artist_data.show(n=10)

In [None]:
from pyspark.sql.functions import split, min, max
from pyspark.sql.types import IntegerType, StringType

# Split each raw line on single spaces once, then cast the three fields to
# integers; selecting only the parsed columns leaves out the raw 'value'.
line_fields = split(raw_user_artist_data['value'], ' ')
user_artist_df = raw_user_artist_data.select(
    line_fields.getItem(0).cast(IntegerType()).alias('user'),
    line_fields.getItem(1).cast(IntegerType()).alias('artist'),
    line_fields.getItem(2).cast(IntegerType()).alias('count'),
)

# Show the smallest and largest user and artist IDs.
# `min` / `max` here are the pyspark.sql.functions aggregates imported above.
user_artist_df.select(
    min("user"), max("user"), min("artist"), max("artist")
).show()

In [None]:
from pyspark.sql.functions import col

# Split each line into at most two parts on the first run of whitespace:
# the numeric ID and the rest of the line (the name, which may itself
# contain spaces).
# The pattern is a raw string because '\s' inside a normal string literal is
# an invalid escape sequence that newer Python versions warn about.
id_and_name = split(col('value'), r'\s+', 2)

# Lines whose first field is not numeric end up with a null 'id' after the cast.
artist_by_id = raw_artist_data.select(
    id_and_name.getItem(0).cast(IntegerType()).alias('id'),
    id_and_name.getItem(1).cast(StringType()).alias('name'),
)

artist_by_id.show(5)

In [None]:
# Split each line on runs of whitespace: field 0 is kept as an integer
# 'artist' ID, field 1 as a string 'alias' (it is cast to an integer later,
# when the alias replaces the artist ID).
# The pattern is a raw string because '\s' inside a normal string literal is
# an invalid escape sequence that newer Python versions warn about.
alias_fields = split(col('value'), r'\s+')

artist_alias = raw_artist_alias.select(
    alias_fields.getItem(0).cast(IntegerType()).alias('artist'),
    alias_fields.getItem(1).cast(StringType()).alias('alias'),
)

artist_alias.show(5)

In [None]:
# Show the rows for two specific artist IDs.
artist_by_id.where(col('id').isin(1092764, 1000311)).show()

### Building a First Model

In [None]:
from pyspark.sql.functions import broadcast, when

# Use the alias when the left join found one, otherwise keep the artist ID.
resolved_artist = when(col('alias').isNull(), col('artist')).otherwise(col('alias'))

# Left-join the alias table (hinted for broadcast), replace 'artist' with the
# resolved value, cast it back to an integer, and drop the helper column.
train_data = (
    user_artist_df
    .join(broadcast(artist_alias), on='artist', how='left')
    .withColumn('artist', resolved_artist)
    .withColumn('artist', col('artist').cast(IntegerType()))
    .drop('alias')
)

train_data.cache()

# count() is an action, so it triggers computation of the cached DataFrame.
train_data.count()

In [None]:
from pyspark.ml.recommendation import ALS

# Configure an implicit-feedback ALS estimator, then fit it on the training data.
als = ALS(
    userCol='user',
    itemCol='artist',
    ratingCol='count',
    implicitPrefs=True,
    rank=10,
    maxIter=5,
    regParam=0.1,
    alpha=1.0,
    seed=0,
)
model = als.fit(train_data)

In [None]:
# Show one row of the learned user factors without truncating the column.
model.userFactors.show(n=1, truncate=False)

### Spot Checking Recommendations

In [None]:
user_id = 2093760

# Collect the artist IDs that appear with this user in the training data.
user_rows = train_data.where(col('user') == user_id).select("artist").collect()
existing_artist_ids = [row[0] for row in user_rows]

# Show the matching rows of the artist ID/name table.
artist_by_id.filter(col('id').isin(existing_artist_ids)).show()

In [None]:
# recommendForUserSubset takes a DataFrame of users, so build one that
# contains only the distinct 'user' value equal to user_id.
user_subset = train_data.filter(col('user') == user_id).select('user').distinct()

# Ask the model for the top five recommendations for that subset.
top_predictions = model.recommendForUserSubset(user_subset, numItems=5)

top_predictions.show()

In [None]:
# Bring the recommendations to the driver as a pandas DataFrame; a later cell
# reads its 'recommendations' column to extract the recommended artist IDs.
top_predictions_pandas = top_predictions.toPandas()
print(top_predictions_pandas)

In [None]:
# The first row's 'recommendations' entry is a sequence of records whose
# first field is the artist ID.
first_user_recs = top_predictions_pandas.recommendations[0]
recommended_artist_ids = [rec[0] for rec in first_user_recs]

# Show the matching rows of the artist ID/name table.
artist_by_id.where(col('id').isin(recommended_artist_ids)).show()

In [None]:
from pyspark.sql.functions import col, lit, count, mean, coalesce
from pyspark.sql import DataFrame
from typing import List
import random


def area_under_curve(positive_data: DataFrame, b_all_artist_ids: List[int], predict_function, seed=None) -> float:
    """Compute a mean per-user AUC for a prediction function.

    For every user in ``positive_data`` a set of "negative" artists is sampled
    from ``b_all_artist_ids`` (artists not among that user's positives). Both
    the positive and the negative (user, artist) pairs are scored, and for
    each user the fraction of (positive, negative) pairs in which the positive
    prediction is strictly greater is computed. The mean of those per-user
    fractions is returned.

    Args:
        positive_data: DataFrame with at least ``user`` and ``artist`` columns.
        b_all_artist_ids: Plain Python list of every known artist ID, used as
            the sampling pool for negatives.
        predict_function: Callable taking a DataFrame with ``user`` and
            ``artist`` columns and returning one that also has ``prediction``.
        seed: Optional seed for the negative sampling. When ``None`` (the
            default) the module-level ``random`` generator is used, exactly as
            before. Sampling also depends on the order in which the grouped
            rows are collected, which this function does not control.

    Returns:
        The mean over users of the per-user AUC, as collected from Spark.

    Notes:
        Uses the notebook-level ``spark`` session to build the negative
        DataFrame, and collects the per-user positive artist lists to the
        driver.
    """
    rng = random if seed is None else random.Random(seed)
    num_artists = len(b_all_artist_ids)

    positive_predictions = (
        predict_function(positive_data.select("user", "artist"))
        .withColumnRenamed("prediction", "positivePrediction")
    )

    def negative_data_generation(user_artist_tuples):
        """Sample, per user, up to as many non-positive artists as positives."""
        user_negative_artists = []
        for user, pos_artist_ids in user_artist_tuples:
            pos_artist_id_set = set(pos_artist_ids)
            negative_artists = set()
            attempts = 0
            # Bound the number of draws: without it the loop never ends when
            # too few non-positive artists exist to match the positives.
            while attempts < num_artists and len(negative_artists) < len(pos_artist_id_set):
                artist_id = b_all_artist_ids[rng.randrange(num_artists)]
                if artist_id not in pos_artist_id_set:
                    negative_artists.add(artist_id)
                attempts += 1
            user_negative_artists.extend((user, artist_id) for artist_id in negative_artists)
        return user_negative_artists

    # (user, [artist, ...]) pairs collected to the driver.
    user_artist_groups = positive_data.select("user", "artist").rdd.groupByKey().mapValues(list).collect()
    negative_data = spark.createDataFrame(negative_data_generation(user_artist_groups), schema=["user", "artist"])

    negative_predictions = (
        predict_function(negative_data)
        .withColumnRenamed("prediction", "negativePrediction")
    )

    # Joining on user pairs every positive with every negative for that user.
    joined_predictions = (
        positive_predictions.join(negative_predictions, "user")
        .select("user", "positivePrediction", "negativePrediction")
        .cache()
    )

    try:
        all_counts = (
            joined_predictions.groupBy("user")
            .agg(count(lit(1)).alias("total"))
            .select("user", "total")
        )
        correct_counts = (
            joined_predictions
            .filter(col("positivePrediction") > col("negativePrediction"))
            .groupBy("user")
            .agg(count("user").alias("correct"))
            .select("user", "correct")
        )

        # Users with no correctly ordered pair are absent from correct_counts;
        # the left join plus coalesce counts them as 0 correct.
        mean_auc = (
            all_counts.join(correct_counts, ["user"], "left_outer")
            .select(col("user"), (coalesce(col("correct"), lit(0)) / col("total")).alias("auc"))
            .agg(mean("auc"))
            .collect()[0][0]
        )
    finally:
        # Release the cached join even if one of the aggregations fails.
        joined_predictions.unpersist()

    return mean_auc



# Rebuild the alias-resolved dataset: left-join the alias table, take the
# alias when present, and cast 'artist' back to an integer.
alias_or_artist = when(col('alias').isNull(), col('artist')).otherwise(col('alias'))
all_data = (
    user_artist_df
    .join(broadcast(artist_alias), 'artist', how='left')
    .withColumn('artist', alias_or_artist)
    .withColumn('artist', col('artist').cast(IntegerType()))
    .drop('alias')
)

# 90% / 10% split with a fixed seed; both parts are reused below, so cache them.
# Note: this rebinds the notebook-level train_data.
train_data, cv_data = all_data.randomSplit([0.9, 0.1], seed=54321)
train_data.cache()
cv_data.cache()

# Every distinct artist ID, collected to the driver as a plain Python list.
all_artist_ids = [row[0] for row in all_data.select("artist").distinct().collect()]

# Fit on the training split only (this rebinds the notebook-level model).
als_estimator = ALS(
    userCol='user',
    itemCol='artist',
    ratingCol='count',
    implicitPrefs=True,
    rank=10,
    maxIter=5,
    regParam=0.1,
    alpha=1.0,
    seed=0,
)
model = als_estimator.fit(train_data)

# Score the held-out split with the fitted model.
area_under_curve(cv_data, all_artist_ids, model.transform)


In [None]:
from pyspark.sql.functions import sum as _sum

def predict_most_listened(train, score_data=None):
    """Score (user, artist) rows by the artist's total listen count in ``train``.

    Args:
        train: DataFrame with ``artist`` and ``count`` columns; the per-artist
            sum of ``count`` becomes the ``prediction``.
        score_data: DataFrame with ``user`` and ``artist`` columns to attach
            predictions to. Defaults to ``train`` itself, which is the
            original single-argument behavior.

    Returns:
        DataFrame with ``user``, ``artist`` and ``prediction`` columns. Because
        of the left join, artists absent from ``train`` get a null prediction.
    """
    listen_counts = train.groupBy("artist").agg(_sum("count").alias("prediction")).select("artist", "prediction")
    if score_data is None:
        score_data = train
    return score_data.join(listen_counts, "artist", "left_outer").select("user", "artist", "prediction")


# area_under_curve passes the predictor DataFrames holding only 'user' and
# 'artist', so the listen counts must come from the training data, not from
# the rows being scored (which have no 'count' column to sum).
area_under_curve(cv_data, all_artist_ids, lambda data: predict_most_listened(train_data, data))


### Hyperparameter Selection

In [None]:
from pprint import pprint
from itertools import product

# Candidate values; every (rank, regParam, alpha) combination is tried.
ranks = [5, 30]
reg_params = [4.0, 0.0001]
alphas = [1.0, 40.0]
hyperparam_combinations = list(product(ranks, reg_params, alphas))

evaluations = []

for rank, reg_param, alpha in hyperparam_combinations:
    # NOTE: `model` is rebound on every iteration, so after the loop it holds
    # the model for the last combination tried, not the best-scoring one.
    model = ALS(
        seed=0,
        implicitPrefs=True,
        rank=rank,
        regParam=reg_param,
        alpha=alpha,
        maxIter=20,
        userCol="user",
        itemCol="artist",
        ratingCol="count",
        predictionCol="prediction",
    ).fit(train_data)

    auc = area_under_curve(cv_data, all_artist_ids, model.transform)

    # Drop this model's factor DataFrames from the cache before the next fit.
    model.userFactors.unpersist()
    model.itemFactors.unpersist()

    evaluations.append((auc, (rank, reg_param, alpha)))

# Highest AUC first.
evaluations = sorted(evaluations, key=lambda scored: scored[0], reverse=True)
pprint(evaluations)

In [None]:
# Up to 100 distinct users, taken from the full dataset.
some_users = all_data.select("user").distinct().limit(100)

def make_recommendations(model, user_id, num_recs):
    """Return the model's top ``num_recs`` recommendations for one user.

    The user subset is built from the notebook-level ``train_data``, so a user
    with no rows there produces an empty subset.

    Args:
        model: Fitted model exposing ``recommendForUserSubset``.
        user_id: ID of the user to recommend for.
        num_recs: Number of recommendations to request.

    Returns:
        The DataFrame returned by ``recommendForUserSubset``.
    """
    user_subset = train_data.select('user').where(col('user') == user_id).distinct()
    return model.recommendForUserSubset(user_subset, num_recs)

some_recommendations = [
    (user_row[0], make_recommendations(model, user_row[0], 5))
    for user_row in some_users.collect()
]

# A separate loop-variable name keeps the notebook-level `user_id` intact.
for rec_user_id, recs_df in some_recommendations:
    rec_rows = recs_df.select("recommendations").collect()
    if not rec_rows:
        # Users are sampled from all_data, but the subset comes from
        # train_data: a user whose rows all fell into cv_data has no
        # recommendation row to index.
        print(f"{rec_user_id} -> (no recommendations)")
        continue
    recommended_artists = [rec["artist"] for rec in rec_rows[0][0]]
    print(f"{rec_user_id} -> {', '.join(map(str, recommended_artists))}")
