In [2]:
import random
import pandas as pd
import numpy as np

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler

#-------------------------
# LOAD AND PREP THE DATA
#-------------------------
 
raw_data = pd.read_table('/Users/lrnby/Downloads/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv')[:1000000]
raw_data = raw_data.drop(raw_data.columns[1], axis=1)
raw_data.columns = ['user', 'artist', 'plays']
 
# Drop rows with missing values
data = raw_data.dropna()
  
# Convert artists names into numerical IDs
data['user_id'] = data['user'].astype("category").cat.codes
data['artist_id'] = data['artist'].astype("category").cat.codes
 
# Create a lookup frame so we can get the artist names back in 
# readable form later.
item_lookup = data[['artist_id', 'artist']].drop_duplicates()
item_lookup['artist_id'] = item_lookup.artist_id.astype(str)
 
data = data.drop(['user', 'artist'], axis=1)
 
# Drop any rows that have 0 plays
data = data.loc[data.plays != 0]
 
# Create lists of all users, artists and plays
users = list(np.sort(data.user_id.unique()))
artists = list(np.sort(data.artist_id.unique()))
plays = list(data.plays)
 
# Get the rows and columns for our new matrix
rows = data.user_id.astype(int)
cols = data.artist_id.astype(int)
 
# Contruct a sparse matrix for our users and items containing number of plays
data_sparse = sparse.csr_matrix((plays, (rows, cols)), shape=(len(users), len(artists)))

print(data_sparse)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['user_id'] = data['user'].astype("category").cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['artist_id'] = data['artist'].astype("category").cat.codes


  (0, 4040)	229
  (0, 4092)	288
  (0, 8174)	310
  (0, 9995)	135
  (0, 10139)	134
  (0, 11384)	185
  (0, 21306)	1099
  (0, 21631)	154
  (0, 23545)	302
  (0, 24401)	232
  (0, 25212)	222
  (0, 25218)	717
  (0, 30381)	134
  (0, 30757)	361
  (0, 31506)	151
  (0, 33641)	358
  (0, 35681)	227
  (0, 35696)	316
  (0, 37750)	182
  (0, 38582)	198
  (0, 38747)	706
  (0, 41606)	135
  (0, 42472)	281
  (0, 43046)	387
  (0, 43571)	145
  :	:
  (20464, 54484)	224
  (20464, 58371)	124
  (20464, 58560)	88
  (20464, 59211)	333
  (20464, 61343)	145
  (20464, 61415)	93
  (20464, 61987)	175
  (20464, 62131)	185
  (20464, 63091)	161
  (20464, 64741)	704
  (20464, 65908)	239
  (20464, 66342)	100
  (20464, 67282)	167
  (20464, 70463)	175
  (20464, 71431)	119
  (20464, 71801)	114
  (20464, 71916)	166
  (20464, 72260)	204
  (20464, 72631)	190
  (20464, 72709)	213
  (20464, 72882)	110
  (20464, 72951)	174
  (20464, 73786)	136
  (20464, 75149)	339
  (20464, 77696)	439


In [3]:
def nonzeros(m, row):
    for index in range(m.indptr[row], m.indptr[row+1]):
        yield m.indices[index], m.data[index]
def implicit_als_cg(Cui, features=20, iterations=20, lambda_val=0.1):
    user_size, item_size = Cui.shape

    X = np.random.rand(user_size, features) * 0.01
    Y = np.random.rand(item_size, features) * 0.01

    Cui, Ciu = Cui.tocsr(), Cui.T.tocsr()
    

    for iteration in range(iterations):
        print('iteration %d of %d' % (iteration+1, iterations))
        least_squares_cg(Cui, X, Y, lambda_val)
        least_squares_cg(Ciu, Y, X, lambda_val)
    
    return sparse.csr_matrix(X), sparse.csr_matrix(Y)
  
  
def least_squares_cg(Cui, X, Y, lambda_val, cg_steps=3):
    users, features = X.shape
    
    YtY = Y.T.dot(Y) + lambda_val * np.eye(features)
    
    for u in range(users):
        ## Считаем первую чать градиента r = (b)-Ax
        x = X[u]
        r = -YtY.dot(x)
        ## Считаем b и добавляем к вектору
        for i, confidence in nonzeros(Cui, u):
            r += (confidence - (confidence - 1) * Y[i].dot(x)) * Y[i]
        p = r.copy()
        rsold = r.dot(r)

        for it in range(cg_steps):
            Ap = YtY.dot(p)
            for i, confidence in nonzeros(Cui, u):
                Ap += (confidence - 1) * Y[i].dot(p) * Y[i]

            alpha = rsold / p.dot(Ap)
            x += alpha * p
            r -= alpha * Ap

            rsnew = r.dot(r)
            p = r + (rsnew / rsold) * p
            rsold = rsnew

        X[u] = x

alpha_val = 15
conf_data = (data_sparse * alpha_val).astype('double')
print(conf_data)
user_vecs, item_vecs = implicit_als_cg(conf_data, iterations=10, features=20)

  (0, 4040)	3435.0
  (0, 4092)	4320.0
  (0, 8174)	4650.0
  (0, 9995)	2025.0
  (0, 10139)	2010.0
  (0, 11384)	2775.0
  (0, 21306)	16485.0
  (0, 21631)	2310.0
  (0, 23545)	4530.0
  (0, 24401)	3480.0
  (0, 25212)	3330.0
  (0, 25218)	10755.0
  (0, 30381)	2010.0
  (0, 30757)	5415.0
  (0, 31506)	2265.0
  (0, 33641)	5370.0
  (0, 35681)	3405.0
  (0, 35696)	4740.0
  (0, 37750)	2730.0
  (0, 38582)	2970.0
  (0, 38747)	10590.0
  (0, 41606)	2025.0
  (0, 42472)	4215.0
  (0, 43046)	5805.0
  (0, 43571)	2175.0
  :	:
  (20464, 54484)	3360.0
  (20464, 58371)	1860.0
  (20464, 58560)	1320.0
  (20464, 59211)	4995.0
  (20464, 61343)	2175.0
  (20464, 61415)	1395.0
  (20464, 61987)	2625.0
  (20464, 62131)	2775.0
  (20464, 63091)	2415.0
  (20464, 64741)	10560.0
  (20464, 65908)	3585.0
  (20464, 66342)	1500.0
  (20464, 67282)	2505.0
  (20464, 70463)	2625.0
  (20464, 71431)	1785.0
  (20464, 71801)	1710.0
  (20464, 71916)	2490.0
  (20464, 72260)	3060.0
  (20464, 72631)	2850.0
  (20464, 72709)	3195.0
  (20464, 7288

In [33]:
#------------------------------
# FIND SIMILAR ITEMS
#------------------------------

# Let's find similar artists to Jay-Z. 
# Note that this ID might be different for you if you're using
# the full dataset or if you've sliced it somehow. 
item_id = 68429

# Get the item row for Jay-Z
item_vec = item_vecs[item_id].T

# Calculate the similarity score between Mr Carter and other artists
# and select the top 10 most similar.
scores = item_vecs.dot(item_vec).toarray().reshape(1,-1)[0]
top_10 = np.argsort(scores)[::-1][:30]

artists = []
artist_scores = []

# Get and print the actual artists names and scores
for idx in top_10:
    artists.append(item_lookup.artist.loc[item_lookup.artist_id == str(idx)].iloc[0])
    artist_scores.append(scores[idx])

similar = pd.DataFrame({'artist': artists, 'score': artist_scores})

print(similar)

                   artist     score
0             the beatles  0.001036
1               [unknown]  0.001036
2             johnny cash  0.001033
3                the cure  0.001032
4             david bowie  0.001032
5              pink floyd  0.001032
6               radiohead  0.001031
7         nine inch nails  0.001031
8   red hot chili peppers  0.001030
9               sigur rós  0.001030
10           depeche mode  0.001030
11                nirvana  0.001030
12              daft punk  0.001030
13     the rolling stones  0.001029
14              bob dylan  0.001029
15           led zeppelin  0.001029
16              the doors  0.001029
17                 r.e.m.  0.001028
18               coldplay  0.001028
19                  queen  0.001028
20                  oasis  0.001028
21           jack johnson  0.001027
22                  björk  0.001027
23             portishead  0.001027
24          amy winehouse  0.001026
25               gorillaz  0.001026
26  the smashing pumpkins  0

In [29]:

print(item_lookup.loc[item_lookup.artist == "the beatles"])

    artist_id       artist
256     68429  the beatles
