In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/lastfm/lastfm.csv


## last fm and als 

reference : https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe 
https://www.benfrederickson.com/matrix-factorization/
kaggle dataset : https://www.kaggle.com/datasets/ravichaubey1506/lastfm

Used different lastfm dataset (which is much smaller) and it doesn't have 'plays' column so I filled all the rows with plays 1 . 

모든 유저가 관심있는 음악에 대해서 재생을 한 번 했다고 가정했기 때문에 confidence score의 의미가 크지는 않다. least squared algorithm 부분을 이해하기 위한 코드 

In [3]:
import random
import pandas as pd
import numpy as np

import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler

In [6]:
raw_data = pd.read_csv('/kaggle/input/lastfm/lastfm.csv')

In [8]:
data = raw_data.dropna().drop_duplicates()

In [12]:
data['artist_id'] = data['artist'].astype("category").cat.codes
data['user_id'] = data['user'].astype("category").cat.codes

In [56]:
raw_data.head()

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany


In [17]:
# To look up artist name by id 
item_lookup = data[['artist_id', 'artist']].drop_duplicates()
item_lookup['artist_id'] = item_lookup.artist_id.astype(str)
item_lookup.head()

Unnamed: 0,artist_id,artist
0,713,red hot chili peppers
1,848,the black dahlia murder
2,372,goldfrapp
3,280,dropkick murphys
4,511,le tigre


In [19]:
data = data.drop(['user', 'artist', 'sex', 'country'], axis=1)
data.head()

Unnamed: 0,artist_id,user_id
0,713,0
1,848,0
2,372,0
3,280,0
4,511,0


In [23]:
data['plays'] = 1

In [24]:
data = data[['user_id', 'artist_id', 'plays']]
data

Unnamed: 0,user_id,artist_id,plays
0,0,713,1
1,0,848,1
2,0,372,1
3,0,280,1
4,0,511,1
...,...,...,...
289950,14999,133,1
289951,14999,685,1
289952,14999,860,1
289953,14999,13,1


## make sparse matrix with library

In [58]:
# Get all unique value by list 
users = list(np.sort(data.user_id.unique()))
artists = list(np.sort(data.artist_id.unique()))
plays = list(data.plays) 
print(len(users), len(artists), len(plays))

15000 1004 289953


In [26]:
rows = data.user_id.astype(int) 
cols = data.artist_id.astype(int)
print(rows, cols)

0             0
1             0
2             0
3             0
4             0
          ...  
289950    14999
289951    14999
289952    14999
289953    14999
289954    14999
Name: user_id, Length: 289953, dtype: int64 0         713
1         848
2         372
3         280
4         511
         ... 
289950    133
289951    685
289952    860
289953     13
289954    703
Name: artist_id, Length: 289953, dtype: int64


In [27]:
# make R matrix user x item size
data_sparse = sparse.csr_matrix((plays, (rows, cols)), 
                                shape=(len(users), len(artists)))
data_sparse

<15000x1004 sparse matrix of type '<class 'numpy.int64'>'
	with 289953 stored elements in Compressed Sparse Row format>

In [123]:
def my_im_als(sparse_matrix, alpha_val = 40, epochs = 10, lambda_val = 0.1, features=10): 
    confidence = sparse_matrix * alpha_val  
    # print(confidence.shape) # 15000, 1004
    user_size, item_size = sparse_matrix.shape 
    # print(user_size, item_size) #15000 1004
    
    X = sparse.csr_matrix(np.random.normal(size=(user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size=(item_size, features))) 
    
    X_I = sparse.eye(user_size) 
    Y_I = sparse.eye(item_size)
    # print(X.shape, Y.shape) # (15000, 10) (1004, 10)
    # print(X_I.shape, Y_I.shape) # (15000, 15000) (1004, 1004)
    
    I = sparse.eye(features) 
    lI = lambda_val * I 
    # print(I.shape, lI.shape) #(10, 10) (10, 10)
    
    for i in range(epochs):
        print('epochs %d of %d'%(i + 1, epochs)) 
        
        yTy = Y.T.dot(Y) 
        xTx = X.T.dot(X) 
        
        for u in range(user_size): 
            u_row = confidence[u, :].toarray() 
            # print(u_row) [0, 0, ,... 40, 0]
            p_u = u_row.copy() 
            p_u[p_u != 0] = 1.0 
            # print(p_u) [0, 0, .... 1, 0]
            
            CuI = sparse.diags(u_row, [0]) 
            # print(CuI)  (280, 280)	40.0 (245, 245) 40.0 ...
            Cu = CuI + Y_I
            # print(Cu) (0,0) 1.0 (1,1) 1.0 .... (280, 280) 41.0
            # print(CuI.shape, Cu.shape) # (1004, 1004) (1004, 1004)
            
            regularization = Y.T.dot(CuI).dot(Y) # (10, 1004).dot((1004,1004)).dot(1004,10) -> (10,10)
            yT_Cu_pu = Y.T.dot(Cu).dot(p_u.T) # (10,1004).dot(1004,1004).dot(1004,1) -> (10,1)
            # print(regularization.shape, yT_Cu_pu.shape) (10,10), (10,1)
            X[u] = spsolve(yTy + regularization + lI, yT_Cu_pu) 
            # print(X[u].shape) (1,10)
            
        for i in range(item_size):
            i_row = confidence[:, i].T.toarray() 
            p_i = i_row.copy() 
            p_i[p_i != 0] = 1.0 
            CiI = sparse.diags(i_row, [0]) 
            Ci = CiI + X_I 
            regularization = X.T.dot(CiI).dot(X) 
            xT_Ci_pi = X.T.dot(Ci).dot(p_i.T) 
            Y[i] = spsolve(xTx + regularization + lI, xT_Ci_pi) 
    return X, Y
        

In [120]:
user_vecs, item_vecs = my_im_als(data_sparse)

epochs 1 of 10
epochs 2 of 10
epochs 3 of 10
epochs 4 of 10
epochs 5 of 10
epochs 6 of 10
epochs 7 of 10
epochs 8 of 10
epochs 9 of 10
epochs 10 of 10


In [121]:
item_id = 7
print(item_lookup[item_lookup.artist_id == str(item_id)])

item_vec = item_vecs[item_id].T

# find top 10 score
scores = item_vecs.dot(item_vec).toarray().reshape(1,-1)[0]
top_10 = np.argsort(scores)[::-1][:10]

artists = []
artist_scores = []

# Add them so that the artist name and score appear
for idx in top_10:
    artists.append(item_lookup.artist.loc[item_lookup.artist_id == str(idx)].iloc[0])
    artist_scores.append(scores[idx])

similar = pd.DataFrame({'artist': artists, 'score': artist_scores})

print(similar)

     artist_id   artist
1462         7  50 cent
                  artist     score
0        michael jackson  0.010866
1             kanye west  0.010828
2            the beatles  0.010492
3                 eminem  0.010404
4              daft punk  0.010321
5               coldplay  0.009847
6  red hot chili peppers  0.009817
7               gorillaz  0.009489
8                  jay-z  0.009374
9        black eyed peas  0.009324


In [122]:
item_id = 10
print(item_lookup[item_lookup.artist_id == str(item_id)])

item_vec = item_vecs[item_id].T

# find top 10 score
scores = item_vecs.dot(item_vec).toarray().reshape(1,-1)[0]
top_10 = np.argsort(scores)[::-1][:10]

artists = []
artist_scores = []

# Add them so that the artist name and score appear
for idx in top_10:
    artists.append(item_lookup.artist.loc[item_lookup.artist_id == str(idx)].iloc[0])
    artist_scores.append(scores[idx])

similar = pd.DataFrame({'artist': artists, 'score': artist_scores})

print(similar)

     artist_id             artist
4934        10  a day to remember
                   artist     score
0               brand new  0.005019
1         jimmy eat world  0.004982
2               underoath  0.004976
3  dashboard confessional  0.004905
4      taking back sunday  0.004855
5         city and colour  0.004838
6                  thrice  0.004811
7      coheed and cambria  0.004752
8     death cab for cutie  0.004719
9                anberlin  0.004708
