# Create SparkContext, SparkSession

https://spark.apache.org/docs/latest/rdd-programming-guide.html

http://spark.apache.org/docs/latest/sql-getting-started.html

In [None]:
import findspark
findspark.init()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
# Reuse a live context if one exists: constructing SparkContext directly raises
# "Cannot run multiple SparkContexts at once" when this cell is executed again.
sc = SparkContext.getOrCreate(SparkConf().setAppName("RecSys"))
# The builder attaches to the context obtained above instead of starting a new one.
se = SparkSession.builder.getOrCreate()


In [None]:
# Download the dataset from S3 into the local ./yandex_music directory (uses the `aws` CLI).
! aws s3 cp s3://ydatazian/yandex_music yandex_music --recursive

# Yandex.Music dataset

In [None]:
# List the downloaded files with human-readable sizes.
! ls -lh yandex_music

total 52M
-rw-rw-r-- 1 hadoop hadoop 3.7M Jun  4 08:13 artists.jsonl
-rw-rw-r-- 1 hadoop hadoop  48M Jun  4 08:10 events.csv
-rw-rw-r-- 1 hadoop hadoop  254 Jun  4 08:13 README.txt


In [None]:
# Peek at the first 5 artist records (one JSON object per line).
! head -n 5 yandex_music/artists.jsonl

{"artistId":0,"artistName":"Mack Gordon"}
{"artistId":1,"artistName":"Kenny Dorham"}
{"artistId":2,"artistName":"Max Roach"}
{"artistId":3,"artistName":"Francis Rossi"}
{"artistId":4,"artistName":"Status Quo"}


In [None]:
# Peek at the first 5 lines of the events CSV (header row plus four interactions).
! head -n 5 yandex_music/events.csv

userId,artistId,plays,skips
0,335,1,0
0,708,1,0
0,710,2,1
0,815,1,1


# Copy data to HDFS

In [None]:
! hadoop fs -copyFromLocal yandex_music /

copyFromLocal: `/yandex_music/README.txt': File exists
copyFromLocal: `/yandex_music/artists.jsonl': File exists
copyFromLocal: `/yandex_music/events.csv': File exists


In [None]:
# Confirm the three dataset files are present under /yandex_music in HDFS.
! hadoop fs -ls -h /yandex_music

Found 3 items
-rw-r--r--   1 hadoop hadoop        254 2021-06-04 08:14 /yandex_music/README.txt
-rw-r--r--   1 hadoop hadoop      3.7 M 2021-06-04 08:14 /yandex_music/artists.jsonl
-rw-r--r--   1 hadoop hadoop     47.6 M 2021-06-04 08:14 /yandex_music/events.csv


# Load dataset

In [None]:
# Read the JSON-lines artist catalogue from HDFS.
artists = se.read.json("hdfs:///yandex_music/artists.jsonl")
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable; it exposes the DataFrame to se.sql() as "artists".
artists.createOrReplaceTempView("artists")
artists.limit(5).toPandas()

Unnamed: 0,artistId,artistName
0,0,Mack Gordon
1,1,Kenny Dorham
2,2,Max Roach
3,3,Francis Rossi
4,4,Status Quo


In [None]:
# Read the interaction log with an explicit schema so every column is numeric
# rather than the string type the CSV reader would otherwise assign.
events = se.read.csv(
    "hdfs:///yandex_music/events.csv",
    header=True,
    schema='userId bigint, artistId bigint, plays INT, skips INT',
)
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable; it exposes the DataFrame to se.sql() as "events".
events.createOrReplaceTempView("events")
events.limit(5).toPandas()

Unnamed: 0,userId,artistId,plays,skips
0,0,335,1,0
1,0,708,1,0
2,0,710,2,1
3,0,815,1,1
4,0,880,1,1


In [None]:
# statistics
# Dataset size and sparsity: `density` is the number of rows in `events`
# divided by the number of possible (user, artist) pairs.
se.sql("""
select
    count(distinct userId) as users,
    count(distinct artistId) as artists,
    count(*) as interactions,
    count(*) / (count(distinct userId) * count(distinct artistId)) as density
from 
    events
""").toPandas()

Unnamed: 0,users,artists,interactions,density
0,4999,53031,3412504,0.012872


In [None]:
# most popular artists
# Top 30 artists ranked by the total of `plays` over all users.
# NOTE(review): rows are grouped by artistName, so distinct artistIds that share
# a name would be merged into a single row — confirm names are unique.
se.sql("""
select
    artists.artistName,
    sum(plays) as popularity
from 
    events join artists on events.artistId = artists.artistId
group by artistName
order by popularity desc
limit 30
""").toPandas()

Unnamed: 0,artistName,popularity
0,Imagine Dragons,43447
1,Би-2,29415
2,Баста,27264
3,Ленинград,26311
4,Сплин,25062
5,Queen,24905
6,Sia,22803
7,LOBODA,21923
8,Noize MC,21774
9,Linkin Park,21584


# Train iALS

Assume the rating is encoded in the `plays` column.

In [None]:
import numpy as np

In [None]:
%%time
# Random 95% / 5% split of the interaction rows; the fixed seed keeps it repeatable.
train, test = events.rdd.randomSplit([0.95, 0.05], seed=0)

# speed-up, we request it often
train.cache()
test.cache()

# count() is an action, so these two calls force both RDDs to be computed
# (and therefore cached). Only the last expression, test.count(), is displayed.
train.count()
test.count()

CPU times: user 22.5 ms, sys: 577 µs, total: 23.1 ms
Wall time: 13 s


170048

In [None]:
# Inspect a few training rows.
train.take(5)

[Row(userId=0, artistId=335, plays=1, skips=0),
 Row(userId=0, artistId=708, plays=1, skips=0),
 Row(userId=0, artistId=710, plays=2, skips=1),
 Row(userId=0, artistId=815, plays=1, skips=1),
 Row(userId=0, artistId=880, plays=1, skips=1)]

In [None]:
%%time

from pyspark.mllib.recommendation import ALS

# trainImplicit is a classmethod, so it is called on the class itself rather than
# on a throwaway ALS() instance.
# Each training triple is (user, artist, log2(plays + 1)): the log damps the
# influence of very large play counts.
model = ALS.trainImplicit(
    train.map(lambda x: (x.userId, x.artistId, np.log2(x.plays + 1))),
    rank=32, iterations=10, lambda_=0.01, alpha=10.0, seed=0
)

CPU times: user 136 ms, sys: 24 ms, total: 160 ms
Wall time: 46.1 s


In [None]:
# Collect the learned latent vector of every artist known to the model
import numpy as np

# artistId -> artistName lookup built from the full artists table
artist_to_name = {row.artistId: row.artistName for row in artists.collect()}

# (artistId, latent vector) pairs pulled to the driver
artist_features = model.productFeatures().collect()

# Three parallel arrays that share the same row order
artist_ids = np.array([artist_id for artist_id, _ in artist_features])
artist_names = np.array([artist_to_name[artist_id] for artist_id, _ in artist_features])
artist_profiles = np.vstack([vector for _, vector in artist_features])
print(artist_profiles.shape)

(52657, 32)


# Artists similarity

In [None]:
# Row position (in artist_names / artist_profiles) -> name for the artists to inspect
target_artists = {
    position: artist_name
    for position, artist_name in enumerate(artist_names)
    if artist_name in ("Coldplay", "50 Cent", "AC/DC")
}
target_artists

{78: 'Coldplay', 19576: 'AC/DC', 39708: '50 Cent'}

In [None]:
import scipy
import scipy.spatial

for row_index, artist_name in target_artists.items():
    print("#############", artist_name, "#############")

    # cosine similarity = 1 - cosine distance from this artist to every artist
    query_profile = artist_profiles[row_index:row_index + 1]
    distances = scipy.spatial.distance.cdist(query_profile, artist_profiles, metric='cosine')
    similarities = 1 - distances[0]
    # NaN similarities (presumably from all-zero profiles) are pushed to the bottom
    similarities[np.isnan(similarities)] = -1e20

    # ten most similar artists, best first (the artist itself comes out on top)
    ten_nearest = np.argsort(similarities)[::-1][:10]
    for neighbour in ten_nearest:
        print(artist_names[neighbour], "\t", similarities[neighbour])

############# Coldplay #############
Coldplay 	 1.0
Adele 	 0.9514991314960761
OneRepublic 	 0.950638578019446
Lana Del Rey 	 0.9504944981911303
Maroon 5 	 0.9430133462907239
Twenty One Pilots 	 0.9157747691071161
Ed Sheeran 	 0.9157440379942977
Pharrell Williams 	 0.9118622317087689
Sam Smith 	 0.9114849543340605
Rihanna 	 0.9039903881160768
############# AC/DC #############
AC/DC 	 1.0
The Offspring 	 0.8920983700671247
Nirvana 	 0.8696372871542281
Red Hot Chili Peppers 	 0.8636266966741878
Metallica 	 0.8588127380977159
System of A Down 	 0.854862161734499
Bon Jovi 	 0.8430942536703123
Limp Bizkit 	 0.8370104524874088
Nickelback 	 0.8320685893567428
Kiss 	 0.8268261529375072
############# 50 Cent #############
50 Cent 	 1.0
Dr. Dre 	 0.8901604622643832
2Chainz 	 0.831186741989025
Lloyd Banks 	 0.8245230681566197
Ludacris 	 0.819130925498649
Fat Joe 	 0.8093649119722239
Jay-Z 	 0.8060270813917894
Cashis 	 0.8050292090107992
Missy  Elliott 	 0.7990513393453588
Akon 	 0.793983630880583

# NDCG

In [None]:
def dcg(ratings):
    """Discounted cumulative gain of a ranked list of relevance ratings.

    Uses the exponential gain ``2 ** rating - 1`` and the discount
    ``log2(position + 1)``, with positions starting at 1.

    Args:
        ratings: relevance values in ranked order (best-ranked item first).

    Returns:
        The DCG as a Python float; 0.0 for an empty sequence.
    """
    gains = 2 ** np.array(ratings, np.float32) - 1
    discounts = np.log2(np.arange(1, len(ratings) + 1) + 1)
    return float(np.sum(gains / discounts))


def ndcg(ratings, at=None):
    """Normalised DCG of a ranked list, optionally truncated at a cutoff.

    Args:
        ratings: relevance values in ranked order (best-ranked item first).
        at: optional cutoff k. When given, only the first k positions of both
            the actual and the ideal ranking are scored (NDCG@k). ``None``
            scores the whole list.

    Returns:
        DCG divided by the DCG of the ideal (descending) ordering, or 0.0 when
        the ideal DCG is zero (no relevant items within the cutoff).

    Raises:
        ValueError: if ``at`` is given but is smaller than 1.
    """
    if at is not None and at < 1:
        raise ValueError("at must be a positive integer or None")
    # Slicing with [:None] keeps the full list, so at=None needs no special case.
    idcg = dcg(sorted(ratings, reverse=True)[:at])
    return dcg(ratings[:at]) / idcg if idcg > 0 else 0.0


def ndcg_score(y_true, y_pred, at=None):
    """NDCG of the ranking induced by predicted scores.

    Args:
        y_true: numpy array of true relevance ratings.
        y_pred: numpy array of predicted scores, same shape as ``y_true``.
        at: optional cutoff forwarded to ``ndcg`` (NDCG@k); ``None`` scores
            the whole ranking.

    Returns:
        NDCG of ``y_true`` reordered by descending ``y_pred``.
    """
    assert y_true.shape == y_pred.shape
    order = np.argsort(y_pred)[::-1]
    return ndcg(y_true[order], at=at)


# tests
def test1():
    y_true = np.array([  0,   0,   2,   1,   0])
    y_pred = np.array([0.2, 0.1, 0.5, 0.3, 0.4])
    correct_ndcg = (3 / np.log(1 + 1) + 1 / np.log(3 + 1)) / (3 / np.log(1 + 1) + 1 / np.log(2 + 1))
    assert np.allclose(ndcg_score(y_true, y_pred), correct_ndcg)

    
def test2():
    y_true = np.array([  0,   0,   0,   0,   0])
    y_pred = np.array([0.2, 0.1, 0.5, 0.3, 0.4])
    assert np.allclose(ndcg_score(y_true, y_pred), 0.0)

    
def test3():
    y_true = np.array([  1,   0,   0,   0,   0])
    y_pred = np.array([0.2, 0.1, 0.5, 0.3, 0.4])
    correct_ndcg = (1 / np.log(4 + 1)) / (1 / np.log(1 + 1))
    assert np.allclose(ndcg_score(y_true, y_pred), correct_ndcg)


test1()
test2()
test3()

In [None]:
# DCG is order-sensitive: the perfectly sorted list scores highest, and a mistake
# at the top of the ranking (second line) costs far more than the same swap at
# the bottom (third line).
print(dcg([5, 4, 3, 2, 1]))
print(dcg([3, 4, 5, 2, 1]))
print(dcg([5, 4, 1, 2, 3]))

45.64282878502658
33.64282878502658
44.963945628433834


# Calc NDCG for baseline

Rank artists by popularity

In [None]:
# Total plays per artist over the training split, as an artistId -> plays dict
artist_to_popularity = dict(
    train
    .map(lambda event: (event.artistId, event.plays))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

In [None]:
# Baseline: score every test item by its artist's training popularity
# (0 for artists absent from train) and pair it with the true rating, per user.
predictions_and_ratings_per_user = (
    test
    .map(lambda event: (
        event.userId,
        (artist_to_popularity.get(event.artistId, 0), np.log2(event.plays + 1)),
    ))
    .groupByKey()
    .mapValues(list)
)

In [None]:
# Peek at one user's list of (popularity score, true rating) pairs.
predictions_and_ratings_per_user.take(1)

[(0,
  [(2393, 1.0),
   (21848, 3.321928094887362),
   (624, 1.0),
   (7273, 1.0),
   (900, 1.584962500721156),
   (494, 1.584962500721156),
   (4011, 1.0),
   (2271, 1.584962500721156),
   (788, 1.0),
   (1024, 1.584962500721156),
   (4428, 3.584962500721156),
   (230, 1.0),
   (1515, 1.0),
   (2313, 1.584962500721156),
   (1243, 2.321928094887362),
   (5501, 2.321928094887362),
   (7768, 5.977279923499917),
   (783, 1.0),
   (4757, 1.584962500721156),
   (1228, 1.0),
   (47, 0.0),
   (4281, 0.0),
   (3577, 0.0),
   (1263, 0.0),
   (2080, 0.0),
   (181, 0.0),
   (1763, 0.0),
   (975, 0.0),
   (6877, 0.0),
   (9, 0.0),
   (276, 0.0),
   (3294, 0.0),
   (5314, 0.0),
   (444, 0.0),
   (54, 0.0),
   (751, 0.0)])]

In [None]:
def ndcg_for_user(x):
    """NDCG of a single user's test items.

    Args:
        x: sequence of (predicted score, true rating) pairs for one user.

    Returns:
        ndcg_score of the true ratings ranked by the predicted scores.
    """
    predicted_scores = np.array([score for score, _ in x])
    true_ratings = np.array([rating for _, rating in x])
    return ndcg_score(true_ratings, predicted_scores)
    
# Mean per-user NDCG over the current predictions_and_ratings_per_user RDD
(
    predictions_and_ratings_per_user
    .values()
    .map(ndcg_for_user)
    .mean()
)

0.6641733130095444

# NDCG for iALS

In [None]:
# Model scores for the (user, artist) pairs of the test split,
# re-keyed as ((user, artist), score) so they can be joined with the true ratings.
predictions = (
    model
    .predictAll(test.map(lambda event: (event.userId, event.artistId)))
    .map(lambda scored: (tuple(scored[:2]), scored[2]))
)

In [None]:
# Peek at a few ((userId, artistId), predicted score) records.
predictions.take(5)

[((2760, 57436), 0.3116623497144269),
 ((3013, 57436), 0.383071386909422),
 ((4698, 57436), 0.6615872184920817),
 ((679, 57436), 0.2201103342246492),
 ((4031, 57436), 0.27049808298492056)]

In [None]:
# iALS: pair each predicted score with the true rating, grouped per user.
# NOTE(review): join() is an inner join, so test pairs without a prediction are
# dropped here — confirm predictAll scores every test pair, otherwise this NDCG
# is computed on fewer items than the popularity baseline.
predictions_and_ratings_per_user = (
    predictions
    .join(test.map(lambda event: ((event.userId, event.artistId), np.log2(event.plays + 1))))
    .map(lambda joined: (joined[0][0], joined[1]))
    .groupByKey()
    .mapValues(list)
)

In [None]:
# Peek at one user's list of (iALS score, true rating) pairs.
predictions_and_ratings_per_user.take(1)

[(424,
  [(0.49470669790378563, 0.0),
   (0.07256944997573084, 1.0),
   (1.0418825212243026, 1.0),
   (0.873224499136573, 3.0),
   (0.7800387467699602, 0.0),
   (0.5205558719341108, 1.0),
   (0.8574898887049791, 1.0),
   (0.670542248485958, 1.0),
   (0.9413614384766841, 1.0),
   (0.5795242140631821, 1.0),
   (0.8416199330412857, 1.0),
   (0.8542678021391192, 0.0),
   (0.9193556138983022, 1.0),
   (0.7073060767055575, 1.584962500721156),
   (0.524343721291789, 0.0),
   (0.22297555849377154, 0.0),
   (0.7891040536478586, 1.0),
   (0.9171743034407762, 2.0),
   (0.675839796037516, 1.0),
   (0.36477756220112845, 1.0),
   (0.8036689112341607, 1.0),
   (0.7416945356735936, 1.584962500721156),
   (0.8661557675018852, 4.700439718141092),
   (-0.012526385180065408, 0.0),
   (0.9405575210023864, 3.584962500721156),
   (0.37677583541476733, 1.0),
   (0.059876624874282566, 2.807354922057604),
   (0.8369159623023275, 1.0),
   (0.20230316560754527, 1.0),
   (0.6800615414350136, 0.0),
   (0.6108168720

In [None]:
# Mean per-user NDCG of the iALS predictions.
# ndcg_for_user is already defined in the baseline section earlier in this
# notebook; it is reused here instead of being redefined with an identical body.
(
    predictions_and_ratings_per_user
    .map(lambda x: ndcg_for_user(x[1]))
    .mean()
)

0.7190445899654145

In [None]:
# NDCG values copied from the recorded outputs of the baseline and iALS cells.
# The previous literals (0.661 and 0.716) did not match those outputs; update
# these two numbers whenever the cells above are re-run.
baseline_ndcg = 0.6641733130095444
ials_ndcg = 0.7190445899654145
print("Increased by {:0.3}%!".format(100 * (ials_ndcg / baseline_ndcg - 1)))

Increased by 8.32%!
