In [1]:
from utils import * 
import numpy as np

# Location of the lastfm-dataset-1K archive; passed to download_dataset below.
URL = "http://mtg.upf.edu/static/datasets/last.fm/lastfm-dataset-1K.tar.gz"
# File name passed to download_dataset together with URL.
f_name = "lastfm-dataset-1K.tar.gz"
# Directory and TSV file name passed to load_dataset.
# NOTE(review): presumably the archive extracts into dir_name -- confirm against utils.
dir_name = "lastfm-dataset-1K"
dataset_f_name = "userid-timestamp-artid-artname-traid-traname.tsv"

In [2]:
download_dataset(URL, f_name)

Dataset Exists...


In [57]:
# NOTE(review): the third argument (1000000) presumably caps the number of rows read -- confirm against utils.load_dataset.
user_data = load_dataset(dir_name, dataset_f_name, 1000000)

Read File...


In [58]:
user_data[:10]

Unnamed: 0,userid,timestamp,artist-id,artist-name,track-id,track-name
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)
5,user_000001,2009-05-04T13:38:31Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,To Stanford (Live_2009_4_15)
6,user_000001,2009-05-04T13:33:28Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Improvisation (Live_2009_4_15)
7,user_000001,2009-05-04T13:23:45Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Glacier (Live_2009_4_15)
8,user_000001,2009-05-04T13:19:22Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Parolibre (Live_2009_4_15)
9,user_000001,2009-05-04T13:13:38Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Bibo No Aozora (Live_2009_4_15)


## Cleaning infrequent data

First, remove infrequent tracks (fewer than 10 plays).

In [59]:
# Discard plays that carry no track-id; they cannot be attributed to a track.
if user_data['track-id'].isnull().any():
    user_data = user_data.dropna(subset=['track-id'])

In [60]:
user_data.shape

(865616, 6)

In [61]:
# Plays per track: size() counts the rows in each track-id group and the
# count column is named 'total' directly.
total_plays = user_data.groupby(["track-id"]).size().reset_index(name='total')
total_plays.describe()

Unnamed: 0,total
count,126986.0
mean,6.816625
std,25.337567
min,1.0
25%,1.0
50%,2.0
75%,6.0
max,2069.0


In [63]:
# Ids of the tracks that were played at least 10 times.
frequent_plays = total_plays.loc[total_plays['total'] >= 10, 'track-id']
frequent_plays.describe()

count                                    20666
unique                                   20666
top       8837c142-45e4-495f-abd9-4f3c016dfa52
freq                                         1
Name: track-id, dtype: object

In [64]:
# Keep only the plays whose track is in the frequent-track list.
data = user_data.loc[user_data['track-id'].isin(frequent_plays)]
data.describe()

Unnamed: 0,userid,timestamp,artist-id,artist-name,track-id,track-name
count,595039,595039,595039,595039,595039,595039
unique,41,592738,3331,3309,20666,19328
top,user_000033,2008-11-24T11:38:21Z,164f0d73-1234-4e2c-8743-d77bf2191051,Kanye West,82558949-cd98-4c58-af35-3f1a9430d52e,Heartless
freq,58772,24,26616,26616,2069,2120


Now, remove infrequent users.

In [66]:
# Discard plays that carry no userid before counting plays per user.
if data['userid'].isnull().any():
    data = data.dropna(subset=['userid'])

In [67]:
data.shape

(595039, 6)

In [68]:
# Plays per user: size() counts the rows in each userid group and the
# count column is named 'total' directly.
user_plays = data.groupby(["userid"]).size().reset_index(name='total')
user_plays.describe()

Unnamed: 0,total
count,41.0
mean,14513.146341
std,15059.907282
min,176.0
25%,3090.0
50%,10437.0
75%,18144.0
max,58772.0


In [69]:
# Ids of the users with at least 10 recorded plays.
frequent_users = user_plays.loc[user_plays['total'] >= 10, 'userid']
frequent_users.describe()

count              41
unique             41
top       user_000035
freq                1
Name: userid, dtype: object

In [70]:
# Keep only the plays made by users in the frequent-user list.
data = data.loc[data['userid'].isin(frequent_users)]
data.describe()

Unnamed: 0,userid,timestamp,artist-id,artist-name,track-id,track-name
count,595039,595039,595039,595039,595039,595039
unique,41,592738,3331,3309,20666,19328
top,user_000033,2008-11-24T11:38:21Z,164f0d73-1234-4e2c-8743-d77bf2191051,Kanye West,82558949-cd98-4c58-af35-3f1a9430d52e,Heartless
freq,58772,24,26616,26616,2069,2120


## Storing id-name mappings

Form id-to-name mappings for artists and for tracks.

In [71]:
# Unique (artist-id, artist-name) pairs found in the filtered plays.
artists = data[["artist-id", "artist-name"]].drop_duplicates()


In [72]:
# Unique (track-id, track-name) pairs found in the filtered plays.
track = data[["track-id", "track-name"]].drop_duplicates()

Forming a mapping from artist to track

In [73]:
# Unique (track-id, artist-id) pairs, linking each track to its artist.
artist_track = data[["track-id", "artist-id"]].drop_duplicates()

## Playcount matrix

In [74]:
# One row per (userid, track-id) pair; size() counts how many plays fall in each pair.
playcount = data.groupby(["userid", "track-id"]).size().reset_index()

In [75]:
# reset_index() leaves the count column labelled 0; give it a descriptive name.
playcount.rename(columns = {0: 'playcount'}, inplace = True)

In [76]:
playcount[:10]

Unnamed: 0,userid,track-id,playcount
0,user_000001,00b07689-ec4c-4773-94ce-06f3d198431e,4
1,user_000001,0198d56b-a92f-4596-a206-9e136f56e39f,21
2,user_000001,019cc5d9-1aaf-4687-b0af-6bc83bbccf0b,16
3,user_000001,01d866fc-7145-46af-ad7a-4133a477d54b,6
4,user_000001,02d30185-9d38-4f1c-9dac-63aa14f8cc69,1
5,user_000001,03156214-005e-456f-a5e3-0df32024fdad,16
6,user_000001,04049f3b-2fda-460f-a3ea-05f0418792c1,13
7,user_000001,04be389b-e44f-46c3-9ff3-2a0f0c3beda7,12
8,user_000001,05182b53-d942-4e39-9d87-3e62658898ff,14
9,user_000001,05ef9a7c-edab-4937-9c46-6bf572017d14,17


In [77]:
from scipy.sparse import csr_matrix

# Wide table with one row per user and one column per track;
# user/track pairs without any play are filled with 0.
playcounts = playcount.pivot(index = 'userid', columns = 'track-id', values = 'playcount').fillna(0)


In [80]:
# Sparse (CSR) copy of the user x track play counts; most entries are 0.
plays_matrix = csr_matrix(playcounts.values)
# Preview the first ten user rows of the dense table.
playcounts[:10]

track-id,0002c0b8-b737-49d8-a007-68d3bc2da175,0003dd36-b4d2-4216-a37e-b110f6882ecb,0005ebe2-6971-4de6-a8eb-5dff41f3b811,00063d3a-14a8-4748-8025-4b7cb828b37f,00087784-820c-4f71-b539-1f948b37befe,0019e275-6d9f-4ff8-bcd8-1bfce804163a,002017be-b745-42e4-aaf2-74a1c3879703,00226a83-ec86-4950-83de-650c3715e344,00271b0b-0459-4984-a3a5-a1f88e1136ca,002a1ae6-34bf-4687-b2ff-f78aaf7c74f5,...,ffddf0b4-d265-40e1-8950-f97207e86838,ffe2102d-4a5e-48b7-977a-0c28e2af119e,ffe3798a-44fd-4795-969c-2e14fc7083c8,ffea36e9-ae42-402d-ba47-2c1ef4cfca05,ffeaac69-dfd9-4159-950f-3981d5775b91,fff02100-41bf-440f-bcb8-f8998b6b1cc7,fff1607f-8763-460f-bf46-44da9c95a93a,fff7464a-d5db-4dd7-9c40-9e8deddf6063,fffada82-2845-44a0-a783-899f383e295e,fffe453c-b68b-4e43-9cef-b6767a587415
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_000001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_000002,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
user_000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
user_000004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_000005,0.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_000006,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_000007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_000008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_000009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_000010,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Conversion of Playcounts to Ratings

Based on the paper *A hybrid online-product recommendation system: Combining implicit
rating-based collaborative filtering and sequential pattern analysis* by Choi et al.

Absolute Preference: $AP(u, i) = \ln\left(\frac{n(u \text{ played } i)}{n(u \text{ played anything})} + 1\right)$

Relative Preference: $RP(u, i) = \frac{AP(u, i)}{\max_{c \in U} AP(c, i)}$

Implicit Rating: $R(u, i) = \lceil 5 \cdot RP(u, i) \rceil$

In [82]:
# Total number of plays per user, computed once so it can be reused below.
# NOTE(review): this rebinds user_plays, which an earlier cell used for the
# per-user play-count DataFrame.
user_plays = plays_matrix.sum(axis=1) # row sums: add up every track column for each user
print(user_plays[:20])
print(user_plays.shape)

#number_of_users

[[  6723.]
 [ 42930.]
 [ 10437.]
 [  8200.]
 [ 13211.]
 [ 10246.]
 [   182.]
 [ 34997.]
 [  1857.]
 [  3090.]
 [  3700.]
 [ 47121.]
 [  4220.]
 [   176.]
 [ 10161.]
 [ 11517.]
 [  3810.]
 [ 10566.]
 [ 22334.]
 [  2793.]]
(41, 1)


In [83]:
# Absolute Preference: AP(u, i) = ln(n(u played i) / n(u played anything) + 1).
# log1p(x) evaluates ln(1 + x) without the rounding loss that forming 1 + x
# incurs for the very small play fractions involved here.
absolute_preferences = np.log1p(plays_matrix / user_plays)

In [84]:
# Matrix dimensions: rows are users, columns are items (tracks).
number_of_users, number_of_items = absolute_preferences.shape

In [85]:
# Highest absolute preference that any user has for each item; this is the
# denominator of the relative preference.
max_ratings = absolute_preferences.max(axis=0) # one maximum per column (item)
# NOTE(review): max_ratings has a single row (see the printed shape), so [:10]
# slices rows and prints the whole row; [:, :10] would show the first ten items.
print(max_ratings[:10])
print(max_ratings.shape)

[[ 0.00077131  0.00054501  0.00105916 ...,  0.00215983  0.00067887
   0.00037263]]
(1, 20666)


In [86]:
# Count items whose maximum preference is 0; any such item would make the
# division by max_ratings in the relative-preference step divide by zero.
print(max_ratings[max_ratings ==0].size)

0


In [87]:
# Count users with zero total plays; their play fractions would be 0 / 0.
print(user_plays[user_plays == 0].size)

0


In [88]:
print(max_ratings.shape)

(1, 20666)


In [89]:
# Relative Preference: RP(u, i) = AP(u,i) / Max_{c \in U} AP(c, i)
# max_ratings has one row, so the division is broadcast across every user row.
relative_preferences = absolute_preferences / (max_ratings)
# The two shapes should match: scaling must not change the matrix dimensions.
print(absolute_preferences.shape)
print(relative_preferences.shape)

(41, 20666)
(41, 20666)


In [90]:
# Implicit Rating: R(u, i) = Round up(5 * RP (u,i))
# ceil leaves 0 for unplayed tracks (RP = 0) and maps every played track to 1..5.
implicit_ratings = np.ceil(5 * relative_preferences)

In [91]:
# NOTE(review): the result of max(axis=1) is neither stored nor displayed,
# because only the last expression of a cell is shown.
implicit_ratings.max(axis=1)
# Top-left 10 x 10 corner of the ratings matrix.
print(implicit_ratings[:10, :10])

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  2.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  5.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  5.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


In [92]:
implicit_ratings.shape

(41, 20666)

In [93]:
def compute_biases(ratings):
    """Compute the global rating mean and the per-user / per-item bias terms.

    Only nonzero entries count as ratings; a zero means "not rated".

    input:
        ratings: dense matrix of ratings, shape (num_users, num_items)
    output:
        mu: mean of all nonzero ratings
        user_biases: deviations of user means wrt mu, shape (1, num_users)
        item_biases: deviations of item means wrt mu, shape (num_items, 1)

    A user or item without any nonzero rating gets a bias of 0 (its mean
    falls back to mu) instead of the NaN that a 0 / 0 division would produce.
    """
    num_users, num_items = ratings.shape

    # boolean mask of the entries that hold an actual rating
    nz = ratings != 0

    # the mean is computed over ALL nonzero ratings
    mu = ratings[nz].mean()
    print(num_users)
    print(num_items)

    # number of ratings per item (column) and per user (row)
    item_counts = nz.sum(axis=0)
    user_counts = nz.sum(axis=1)

    # Divide by max(count, 1) so that empty rows/columns never divide by zero,
    # then overwrite those placeholder means with mu so their bias becomes 0.
    item_means = ratings.sum(axis=0) / np.maximum(item_counts, 1)
    user_means = ratings.sum(axis=1) / np.maximum(user_counts, 1)
    item_means[item_counts == 0] = mu
    user_means[user_counts == 0] = mu

    user_biases = np.reshape(user_means - mu, (1, num_users))
    item_biases = np.reshape(item_means - mu, (num_items, 1))
    return mu, user_biases, item_biases




In [94]:
mu, user_biases, item_biases  = compute_biases(implicit_ratings)

41
20666


In [95]:
def prediction_biased(W, Z, mu, user_biases, item_biases):
    """Build the bias-adjusted prediction matrix, limited to the 1-5 rating scale.

    input :
        W: item feature matrix
        Z: user feature matrix
        mu: average of all ratings
        user_biases: deviations of user means wrt mu, shape (1,N)
        item_biases: deviations of item means wrt mu, shape (D,1)
    output:
        X_hat : has shape (D, N)
    """
    min_rating, max_rating = 1, 5

    # latent-factor term plus the item, user and global offsets
    scores = W.transpose().dot(Z) + item_biases + user_biases + mu

    # a single prediction: clamp it and return right away
    if np.isscalar(scores):
        if scores < min_rating:
            return min_rating
        if scores > max_rating:
            return max_rating
        return scores

    # a matrix of predictions: clamp the out-of-range entries in place
    too_low = scores < min_rating
    too_high = scores > max_rating
    scores[too_low] = 1.
    scores[too_high] = 5.
    return scores