<a target="_blank" href="../cluster" style="font-size:20px">All Applications (YARN)</a>

# Создаем SparkContext и SparkSession

https://spark.apache.org/docs/latest/rdd-programming-guide.html

https://spark.apache.org/docs/latest/sql-getting-started.html

In [1]:
import findspark
# findspark.init() has to run before pyspark is imported so the import below resolves.
findspark.init()

import pyspark
from pyspark.sql import SparkSession, Row

# getOrCreate() reuses an already running context/session, so this cell can be
# re-run in a live kernel; constructing SparkContext directly raises an error
# when a context is already active.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName('jupyter'))
se = SparkSession.builder.getOrCreate()

# Датасет Яндекс.Музыка

In [2]:
! ls -lh yandex_music

total 3.7M
-rwxrwxrwx 1 nobody nogroup 3.7M Nov 11  2020 artists.jsonl
lrwxrwxrwx 1 nobody nogroup   44 Feb 15 20:07 events.csv -> /home/jovyan/work-ro/yandex_music/events.csv
-rwxrwxrwx 1 nobody nogroup  254 Nov 11  2020 README.txt


In [3]:
! head -n 5 yandex_music/artists.jsonl

{"artistId":0,"artistName":"Mack Gordon"}
{"artistId":1,"artistName":"Kenny Dorham"}
{"artistId":2,"artistName":"Max Roach"}
{"artistId":3,"artistName":"Francis Rossi"}
{"artistId":4,"artistName":"Status Quo"}


In [4]:
! head -n 5 yandex_music/events.csv

userId,artistId,plays,skips
0,335,1,0
0,708,1,0
0,710,2,1
0,815,1,1


# Копируем файлы в HDFS

In [5]:
! hadoop fs -copyFromLocal yandex_music /

In [6]:
! hadoop fs -ls -h /yandex_music

Found 3 items
-rw-r--r--   1 jovyan supergroup        254 2022-02-16 17:08 /yandex_music/README.txt
-rw-r--r--   1 jovyan supergroup      3.7 M 2022-02-16 17:08 /yandex_music/artists.jsonl
-rw-r--r--   1 jovyan supergroup     47.6 M 2022-02-16 17:08 /yandex_music/events.csv


# Загружаем данные

In [7]:
# One JSON object per line: artistId and artistName.
artists = se.read.json("hdfs:///yandex_music/artists.jsonl")
# createOrReplaceTempView replaces the deprecated registerTempTable and is safe to re-run.
artists.createOrReplaceTempView("artists")
artists.limit(5).toPandas()

Unnamed: 0,artistId,artistName
0,0,Mack Gordon
1,1,Kenny Dorham
2,2,Max Roach
3,3,Francis Rossi
4,4,Status Quo


In [8]:
# The explicit schema avoids a type-inference pass over the CSV and pins the column types.
events = se.read.csv("hdfs:///yandex_music/events.csv", header=True, 
                     schema='userId bigint, artistId bigint, plays INT, skips INT')
# createOrReplaceTempView replaces the deprecated registerTempTable and is safe to re-run.
events.createOrReplaceTempView("events")
events.limit(5).toPandas()

Unnamed: 0,userId,artistId,plays,skips
0,0,335,1,0
1,0,708,1,0
2,0,710,2,1
3,0,815,1,1
4,0,880,1,1


In [9]:
# Dataset statistics: distinct users, distinct artists, number of interactions and
# density (the share of all user/artist pairs that actually occur in the events table).
stats_query = """
select
    count(distinct userId) as users,
    count(distinct artistId) as artists,
    count(*) as interactions,
    count(*) / (count(distinct userId) * count(distinct artistId)) as density
from 
    events
"""
se.sql(stats_query).toPandas()

Unnamed: 0,users,artists,interactions,density
0,4999,53031,3412504,0.012872


In [10]:
# Most popular artists by total play count.
# Grouping by artistId as well as artistName keeps distinct artists that happen to
# share a display name from being merged into a single row.
se.sql("""
select
    artists.artistName,
    sum(plays) as popularity
from 
    events join artists on events.artistId = artists.artistId
group by artists.artistId, artists.artistName
order by popularity desc
limit 30
""").toPandas()

Unnamed: 0,artistName,popularity
0,Imagine Dragons,43447
1,Би-2,29415
2,Баста,27264
3,Ленинград,26311
4,Сплин,25062
5,Queen,24905
6,Sia,22803
7,LOBODA,21923
8,Noize MC,21774
9,Linkin Park,21584


# Обучаем iALS

Будем считать, что рейтинг – это log2(plays + 1): логарифм сглаживает влияние очень больших значений plays.

In [11]:
import numpy as np

In [12]:
%%time
# 95% / 5% random split of the interactions, with a fixed seed.
train, test = events.rdd.randomSplit([0.95, 0.05], seed=0)

# Both splits are accessed several times below, so they are cached for speed.
# cache() returns the RDD itself, which lets the count() that materializes it be chained.
train.cache().count()
test.cache().count()

CPU times: user 21.7 ms, sys: 2.13 ms, total: 23.8 ms
Wall time: 28.1 s


In [13]:
train.take(5)

[Row(userId=0, artistId=335, plays=1, skips=0),
 Row(userId=0, artistId=708, plays=1, skips=0),
 Row(userId=0, artistId=710, plays=2, skips=1),
 Row(userId=0, artistId=815, plays=1, skips=1),
 Row(userId=0, artistId=880, plays=1, skips=1)]

In [14]:
%%time
from pyspark.mllib.recommendation import ALS

# (user, artist, rating) triples; the rating fed to the model is log2(plays + 1).
train_ratings = train.map(lambda x: (x.userId, x.artistId, np.log2(x.plays + 1)))

# trainImplicit is called on the class itself; no ALS instance is required.
model = ALS.trainImplicit(
    train_ratings,
    rank=32,
    iterations=10,
    lambda_=0.01,
    alpha=10.0,
    seed=0,
)

CPU times: user 237 ms, sys: 70.1 ms, total: 307 ms
Wall time: 1min 14s


In [15]:
# Pull every artist profile (latent factor vector) out of the model onto the driver.
import numpy as np

# artistId -> artistName lookup built from the artists table
artist_to_name = {row.artistId: row.artistName for row in artists.collect()}

# (artistId, profile) pairs, collected once and then unpacked into three parallel arrays
product_features = model.productFeatures().collect()

artist_ids = np.array([artist_id for artist_id, _ in product_features])
artist_names = np.array([artist_to_name[artist_id] for artist_id, _ in product_features])
artist_profiles = np.vstack([profile for _, profile in product_features])
print(artist_profiles.shape)

(52665, 32)


# Похожести исполнителей

In [16]:
# Row index in artist_names / artist_profiles -> name, for the artists we want neighbours of.
wanted_names = {"Coldplay", "50 Cent", "AC/DC"}
target_artists = {
    position: artist_name
    for position, artist_name in enumerate(artist_names)
    if artist_name in wanted_names
}
target_artists

{11563: '50 Cent', 22207: 'AC/DC', 32914: 'Coldplay'}

In [17]:
import scipy
import scipy.spatial

# For each target artist, print the 10 artists whose profiles have the highest cosine similarity.
for index, name in target_artists.items():
    print("#############", name, "#############")

    # cdist returns cosine distances with shape (1, n_artists); similarity = 1 - distance
    query_profile = artist_profiles[index][np.newaxis, :]
    distances = scipy.spatial.distance.cdist(query_profile, artist_profiles, metric='cosine')
    cosines = 1 - distances[0]
    # NaN similarities are pushed to the very bottom of the ranking
    cosines[np.isnan(cosines)] = -1e20

    best_first = np.argsort(cosines)[::-1]
    for idx in best_first[:10]:
        print(artist_names[idx], "\t", cosines[idx])

############# 50 Cent #############
50 Cent 	 1.0
Dr. Dre 	 0.87176920263
Lloyd Banks 	 0.862571081123
Jay-Z 	 0.835235793381
2Chainz 	 0.822563140385
Cashis 	 0.812456739525
Snoop Dogg 	 0.810484217851
Missy  Elliott 	 0.805635315489
Akon 	 0.798163079104
Busta Rhymes 	 0.787780080328
############# AC/DC #############
AC/DC 	 1.0
The Offspring 	 0.880806285576
Nirvana 	 0.87660720058
Metallica 	 0.874860631399
Red Hot Chili Peppers 	 0.873863202133
System of A Down 	 0.853683802014
Limp Bizkit 	 0.853453742095
Bon Jovi 	 0.850678176659
Nickelback 	 0.846709868065
Scorpions 	 0.842406452006
############# Coldplay #############
Coldplay 	 1.0
Lana Del Rey 	 0.957398977112
Adele 	 0.954771689577
OneRepublic 	 0.951114667239
Maroon 5 	 0.950856957375
Sam Smith 	 0.929694792036
Katy Perry 	 0.921320013598
Ed Sheeran 	 0.919972991344
Pharrell Williams 	 0.919839118559
Twenty One Pilots 	 0.91862092206


# NDCG

In [21]:
def dcg(ratings):
    """Discounted cumulative gain of a ranked list of relevance grades.

    The gain of a grade ``r`` is ``2 ** r - 1`` and the item at 1-based position
    ``p`` is discounted by ``log2(p + 1)``.

    Args:
        ratings: 1-D sequence of relevance grades in ranked order (top item first).

    Returns:
        The DCG as a Python float; 0.0 for an empty sequence.
    """
    # Gains are computed in float32; dividing by the float64 discounts promotes to float64.
    gains = 2 ** np.asarray(ratings, np.float32) - 1
    discounts = np.log2(np.arange(1, len(gains) + 1) + 1)
    return float(np.sum(gains / discounts))


def ndcg(ratings, at=None):
    """Normalized DCG of a ranked list, optionally truncated at a cutoff (NDCG@k).

    Args:
        ratings: 1-D sequence of relevance grades in ranked order (top item first).
        at: optional cutoff; only the first ``at`` positions of both the given and
            the ideal ranking are scored. ``None`` scores the whole list.

    Returns:
        DCG divided by the DCG of the ideal (descending) ordering, or 0.0 when the
        ideal DCG is not positive (for example, when all grades are zero).

    Raises:
        ValueError: if ``at`` is negative.
    """
    if at is not None and at < 0:
        raise ValueError("at must be non-negative, got {}".format(at))

    ideal = sorted(ratings, reverse=True)
    # Slicing with [:None] keeps the full list, so at=None means "no cutoff".
    idcg = dcg(ideal[:at])
    return dcg(ratings[:at]) / idcg if idcg > 0 else 0.0


def ndcg_score(y_true, y_pred, at=None):
    """NDCG of the ranking obtained by sorting items by predicted score, descending.

    Args:
        y_true: 1-D array-like of true relevance grades.
        y_pred: 1-D array-like of predicted scores, same shape as ``y_true``.
        at: optional cutoff forwarded to ``ndcg``.

    Returns:
        The NDCG as a float.
    """
    # Accept plain Python sequences as well as numpy arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    assert y_true.shape == y_pred.shape
    order = np.argsort(y_pred)[::-1]
    return ndcg(y_true[order], at=at)


# tests
def test1():
    """Mixed grades: the predicted order ranks them as [2, 0, 1, 0, 0]."""
    y_true = np.array([  0,   0,   2,   1,   0])
    y_pred = np.array([0.2, 0.1, 0.5, 0.3, 0.4])
    # Expected values use log2, the same discount as dcg(), instead of relying on
    # the logarithm base cancelling out in the ratio.
    correct_ndcg = (3 / np.log2(1 + 1) + 1 / np.log2(3 + 1)) / (3 / np.log2(1 + 1) + 1 / np.log2(2 + 1))
    assert np.allclose(ndcg_score(y_true, y_pred), correct_ndcg)


def test2():
    """All grades are zero: the ideal DCG is zero and the score is defined as 0."""
    y_true = np.array([  0,   0,   0,   0,   0])
    y_pred = np.array([0.2, 0.1, 0.5, 0.3, 0.4])
    assert np.allclose(ndcg_score(y_true, y_pred), 0.0)


def test3():
    """A single relevant item that the predictions rank fourth."""
    y_true = np.array([  1,   0,   0,   0,   0])
    y_pred = np.array([0.2, 0.1, 0.5, 0.3, 0.4])
    correct_ndcg = (1 / np.log2(4 + 1)) / (1 / np.log2(1 + 1))
    assert np.allclose(ndcg_score(y_true, y_pred), correct_ndcg)


test1()
test2()
test3()

In [22]:
# DCG of the ideal ordering, then of two orderings with the first and third / third and fifth items swapped.
for ranking in ([5, 4, 3, 2, 1], [3, 4, 5, 2, 1], [5, 4, 1, 2, 3]):
    print(dcg(ranking))

45.64282878502658
33.64282878502658
44.963945628433834


# Считаем NDCG для базового решения

Всегда ранжируем исполнителей по популярности

In [23]:
# artistId -> total number of plays in the training split, collected to the driver as a dict.
artist_to_popularity = (
    train
    .map(lambda x: (x.artistId, x.plays))
    .reduceByKey(lambda a, b: a + b)
    .collectAsMap()
)

In [24]:
# Per user: a list of (predicted score, true rating) pairs over that user's test interactions.
# The baseline "prediction" is the artist's popularity in train (0 if the artist was not seen there);
# the true rating is log2(plays + 1).
predictions_and_ratings_per_user = (
    test
    .map(lambda x: (x.userId, (artist_to_popularity.get(x.artistId, 0), np.log2(x.plays + 1))))
    .groupByKey()
    .mapValues(list)
)

In [25]:
predictions_and_ratings_per_user.take(1)

[(0,
  [(2371, 1.0),
   (22102, 3.3219280948873622),
   (609, 1.0),
   (7399, 1.0),
   (884, 1.5849625007211561),
   (481, 1.5849625007211561),
   (4008, 1.0),
   (2325, 1.5849625007211561),
   (774, 1.0),
   (1035, 1.5849625007211561),
   (4484, 3.5849625007211561),
   (234, 1.0),
   (1523, 1.0),
   (2273, 1.5849625007211561),
   (1243, 2.3219280948873622),
   (5388, 2.3219280948873622),
   (7856, 5.9772799234999168),
   (781, 1.0),
   (4743, 1.5849625007211561),
   (1234, 1.0),
   (1569, 0.0),
   (650, 0.0),
   (817, 0.0),
   (1579, 0.0),
   (1397, 0.0),
   (9438, 0.0),
   (9, 0.0),
   (773, 0.0),
   (543, 0.0),
   (46, 0.0),
   (574, 0.0),
   (9578, 0.0),
   (241, 0.0)])]

In [26]:
def ndcg_for_user(x):
    """NDCG for one user.

    Args:
        x: list of (predicted score, true rating) pairs for the user.

    Returns:
        The value of ndcg_score for the user's true ratings ranked by predicted score.
    """
    predicted_scores = np.array([pair[0] for pair in x])
    true_ratings = np.array([pair[1] for pair in x])
    return ndcg_score(true_ratings, predicted_scores)
    
# Mean per-user NDCG of the popularity baseline.
(
    predictions_and_ratings_per_user
    .values()
    .map(ndcg_for_user)
    .mean()
)

0.6610150124257518

# NDCG для iALS

In [27]:
# Score every (user, artist) pair of the test split with the model and key the
# result by the pair: ((user, artist), predicted score).
test_pairs = test.map(lambda x: (x.userId, x.artistId))
predictions = (
    model
    .predictAll(test_pairs)
    .map(lambda rating: ((rating[0], rating[1]), rating[2]))
)

In [28]:
predictions.take(5)

[((2464, 17312), 0.7113483998861412),
 ((3949, 17312), 0.8030995046830238),
 ((1858, 17312), 0.3888120783910832),
 ((4147, 17312), -0.1829481711739028),
 ((77, 3456), 0.30611075912876945)]

In [29]:
# Per user: a list of (predicted score, true rating) pairs, where the true rating is log2(plays + 1).
# NOTE(review): this is an inner join, so only test pairs that have a prediction are kept.
# If predictAll omits pairs whose user or artist was not seen in training, this metric is
# computed on a subset of the test split, unlike the popularity baseline - TODO confirm.
true_ratings = test.map(lambda x: ((x.userId, x.artistId), np.log2(x.plays + 1)))
predictions_and_ratings_per_user = (
    predictions
    .join(true_ratings)
    .map(lambda x: (x[0][0], x[1]))
    .groupByKey()
    .mapValues(list)
)

In [30]:
predictions_and_ratings_per_user.take(1)

[(96,
  [(1.0849417236113745, 1.0),
   (0.7144303997635124, 1.0),
   (0.5114239784729908, 1.0),
   (0.18255648185233259, 1.5849625007211561),
   (0.7491707940072484, 1.0),
   (0.6321712326683473, 1.0),
   (0.7683826553700539, 1.0),
   (0.677685685303794, 2.0),
   (0.9798890490844336, 1.5849625007211561),
   (0.4607632137355492, 1.0),
   (0.7553091695158917, 1.0),
   (0.9052658988985727, 2.0),
   (0.7161830270603652, 1.0),
   (0.444178762598336, 1.0),
   (0.798676768696861, 1.0),
   (0.5129975120074302, 1.5849625007211561),
   (0.6173897353011186, 2.3219280948873622),
   (0.3019529742887951, 1.0),
   (0.7967404255723357, 1.5849625007211561),
   (-0.011930253274327831, 0.0),
   (0.5978614297793097, 0.0),
   (0.8262770475356271, 0.0),
   (0.6037709275290899, 1.0),
   (1.107849133421495, 1.0),
   (0.5053338495413762, 1.0),
   (0.9786339527860086, 2.0),
   (0.856785336309896, 1.5849625007211561),
   (0.9950012216893022, 1.0),
   (1.0400502444928788, 1.0),
   (1.0215764343653677, 1.0),
   (0

In [31]:
# Mean per-user NDCG of the iALS model.
# ndcg_for_user is already defined in the popularity-baseline section with exactly
# the same body, so it is reused here instead of being defined a second time.
(
    predictions_and_ratings_per_user
    .map(lambda x: ndcg_for_user(x[1]))
    .mean()
)

0.7165281799560415

In [32]:
# Relative NDCG improvement of iALS over the popularity baseline, in percent.
# The two scores are the rounded results printed by the evaluation cells above.
baseline_ndcg, ials_ndcg = 0.661, 0.716
relative_gain_percent = 100 * (ials_ndcg / baseline_ndcg - 1)
print("Улучшение на {:0.3} процентов!".format(relative_gain_percent))

Улучшение на 8.32 процентов!


In [34]:
sc.stop()