## SVD Matrix Factorization


* Data Source: https://grouplens.org/datasets/movielens/latest/ (`ml-latest-small.zip`)

In [1]:
import pandas as pd
import os 
from scipy.sparse import csr_matrix
from fuzzywuzzy import fuzz



In [2]:
# Show the working directory; the relative './ml-latest-small/...' paths used when
# loading the CSVs resolve against it.
# NOTE(review): the saved output of this cell exposes a local user path - clear it before sharing.
os.getcwd()

'c:\\Users\\manpresingh\\OneDrive - Microsoft\\Personal\\Recommendation Models'

In [3]:
# Load the MovieLens tables, reading only the listed columns and pinning compact dtypes.
# Each schema dict drives both the column selection and the dtype mapping.
movies_schema = {'movieId': 'int32', 'title': 'str', 'genres': 'str'}
ratings_schema = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}

df_movies = pd.read_csv(
    './ml-latest-small/movies.csv',
    usecols=list(movies_schema),
    dtype=movies_schema,
)

df_ratings = pd.read_csv(
    './ml-latest-small/ratings.csv',
    usecols=list(ratings_schema),
    dtype=ratings_schema,
)

In [4]:
# Column dtypes, non-null counts and memory footprint of the movies table.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int32 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int32(1), object(2)
memory usage: 190.4+ KB


In [5]:
# Column dtypes, non-null counts and memory footprint of the ratings table.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int32  
 1   movieId  100836 non-null  int32  
 2   rating   100836 non-null  float32
dtypes: float32(1), int32(2)
memory usage: 1.2 MB


In [6]:
# First five movies: id, title (with release year) and pipe-separated genres.
df_movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# First five ratings: one row per (userId, movieId) pair with its rating.
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [8]:
# Count the distinct users and the distinct movies that appear in the ratings table.
num_users = df_ratings['userId'].nunique()
num_items = df_ratings['movieId'].nunique()
(num_users, num_items)

(610, 9724)

In [9]:
# Distribution of rating values, ordered by the rating itself rather than by frequency.
df_ratings['rating'].value_counts().sort_index()

rating
0.5     1370
1.0     2811
1.5     1791
2.0     7551
2.5     5550
3.0    20047
3.5    13136
4.0    26818
4.5     8551
5.0    13211
Name: count, dtype: int64

In [10]:
# Five most-rated movies. value_counts() already returns the counts in descending
# order, so the extra sort_values(ascending=False) pass is not needed.
df_ratings['movieId'].value_counts().head(5)

movieId
356     329
318     317
296     307
593     279
2571    278
Name: count, dtype: int64

In [11]:
# Keep only "popular" movies, i.e. those with at least `min_ratings` ratings.
# The threshold of 50 was picked arbitrarily for this POC; ideally it would be chosen
# by plotting how many ratings each movie receives.
# The filter is meant to keep rarely seen / rarely rated movies out of the KNN step
# and so reduce its compute cost.
# Trade-off: it can degrade the model, and movies below the threshold can never be
# recommended.
min_ratings = 50
ratings_per_movie = df_ratings['movieId'].value_counts()

# Boolean masking avoids depending on the counts column being called 'count', a name
# that value_counts() only produces on pandas >= 2.0.
ratings_per_movie[ratings_per_movie >= min_ratings].to_frame('count')

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
356,329
318,317
296,307
593,279
2571,278
...,...
333,50
3785,50
8361,50
2105,50


In [12]:
# movieIds that received at least 50 ratings.
# The index of value_counts() is already unique, so the list(set(...)) round-trip is
# unnecessary; masking the counts directly also avoids depending on the 'count'
# column name that only exists on pandas >= 2.0.
# The list is ordered by descending rating count; it is only used for membership tests.
popular_movies = (
    df_ratings['movieId']
    .value_counts()
    .loc[lambda counts: counts >= 50]
    .index
    .tolist()
)

In [13]:
# Number of distinct movies in the movies table (the ratings table references fewer: see num_items).
df_movies.movieId.nunique()

9742

In [14]:
# Restrict the ratings to rows whose movie is in the popular-movie list.
df_ratings_popular_movies = df_ratings.loc[df_ratings['movieId'].isin(popular_movies)]

In [15]:
# (rows, columns) of the full ratings table, for comparison with the filtered subset.
df_ratings.shape

(100836, 3)

In [16]:
# (rows, columns) of the ratings that survive the popularity filter.
df_ratings_popular_movies.shape

(41360, 3)

In [17]:
# Number of distinct movies left after the popularity filter.
df_ratings_popular_movies.movieId.nunique()

450

In [18]:
# Number of distinct users who rated at least one of the popular movies.
df_ratings_popular_movies.userId.nunique()

606

In [89]:
from libreco.algorithms import SVD, ALS
from libreco.data import random_split
import numpy as np
import pandas as pd
from libreco.data import random_split, DatasetPure
from libreco.algorithms import ItemCF  # ItemCF algorithm
from libreco.evaluation import evaluate

In [20]:
# Rename the columns to user / item / label before handing the frame to libreco's
# random_split. Note the source is the full df_ratings, not the popularity-filtered
# df_ratings_popular_movies built earlier.
df_ratings_new = df_ratings.rename(
    columns=dict(userId='user', movieId='item', rating='label')
)

In [21]:


# Random 80 / 10 / 10 split of the ratings into train / eval / test frames.
# NOTE(review): no seed is passed here - confirm that random_split's default makes
# the split reproducible across runs.
train_data, eval_data, test_data = random_split(df_ratings_new, multi_ratios=[0.8, 0.1, 0.1])

In [22]:
# Peek at the training split (still a DataFrame at this point; the display is truncated).
train_data

Unnamed: 0,user,item,label
20596,135,4085,4.0
37993,260,750,4.0
98177,606,6798,2.5
36409,249,434,3.5
88947,573,74458,4.5
...,...,...,...
56650,376,48516,4.5
76231,480,10,4.0
57287,380,3408,3.0
12085,74,3481,3.5


In [23]:
# Distinct items and (rows, columns) of the training split.
train_data.item.nunique(), train_data.shape

(8965, (80668, 3))

In [24]:
# Distinct items and (rows, columns) of the evaluation split.
eval_data.item.nunique(), eval_data.shape

(3279, (9651, 3))

In [25]:
# Distinct items and (rows, columns) of the test split.
test_data.item.nunique(), test_data.shape

(3266, (9670, 3))

In [26]:
# Convert the three split DataFrames into libreco dataset objects; build_trainset also
# returns the data_info object that the models below are constructed with.
# The names train_data / eval_data / test_data are rebound here, so re-running this
# cell on its own would pass the already-built objects back into build_* instead of
# the DataFrames - re-run the split cell first.
train_data, data_info = DatasetPure.build_trainset(train_data)
eval_data = DatasetPure.build_evalset(eval_data)
test_data = DatasetPure.build_testset(test_data)

In [27]:
# Distinct item indices held by each built dataset (train, eval, test).
len(set(train_data.item_indices)), len(set(eval_data.item_indices)), len(set(test_data.item_indices)) 

(8965, 3279, 3266)

In [28]:
# Summary of the training-set metadata returned by build_trainset (users, items, density).
data_info


n_users: 610, n_items: 8965, data density: 1.4751 %

In [29]:
import tensorflow as tf

In [30]:
# SVD model for the explicit-rating task: 16-dimensional embeddings, 20 epochs,
# fixed seed for repeatable initialisation.
# NOTE(review): sampler='popular' presumably only matters when negative sampling is
# enabled, and fit() is later called with neg_sampling=False - confirm it has any effect.
mysvd = SVD(
    task='rating',
    data_info=data_info,
    embed_size=16,
    loss_type='rmse',
    sampler='popular',
    n_epochs=20,
    batch_size=64,
    lr=0.03,
    seed=100,
)

In [31]:
# with tf.compat.v1.variable_scope('scope', reuse=tf.compat.v1.AUTO_REUSE):
#     var = tf.Variable([1], name='bu_var')


In [32]:
# Train on the explicit ratings (no negative sampling) and evaluate on eval_data
# after every epoch.
# NOTE(review): in the recorded log the eval RMSE is lowest after epoch 1 and the
# train loss drifts upward from epoch 3 onward - the learning rate passed to SVD may
# be too high; worth re-checking.
mysvd.fit(
    train_data,
    neg_sampling=False,
    eval_data=eval_data,
    shuffle=True,
    k=10,
    verbose=2,
)


Training start time: [35m2024-04-10 00:38:26[0m


train: 100%|██████████| 1261/1261 [00:01<00:00, 1165.10it/s]


Epoch 1 elapsed: 1.086s
	 [32mtrain_loss: 2.2102[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 501.50it/s]


	 eval rmse: 1.1349


train: 100%|██████████| 1261/1261 [00:01<00:00, 962.49it/s] 


Epoch 2 elapsed: 1.313s
	 [32mtrain_loss: 1.2258[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 211.75it/s]


	 eval rmse: 1.1563


train: 100%|██████████| 1261/1261 [00:01<00:00, 1028.69it/s]


Epoch 3 elapsed: 1.233s
	 [32mtrain_loss: 1.2248[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<?, ?it/s]


	 eval rmse: 1.1426


train: 100%|██████████| 1261/1261 [00:01<00:00, 907.82it/s]


Epoch 4 elapsed: 1.394s
	 [32mtrain_loss: 1.2389[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 156.06it/s]


	 eval rmse: 1.1542


train: 100%|██████████| 1261/1261 [00:01<00:00, 883.93it/s]


Epoch 5 elapsed: 1.442s
	 [32mtrain_loss: 1.2727[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<?, ?it/s]


	 eval rmse: 1.1654


train: 100%|██████████| 1261/1261 [00:01<00:00, 788.42it/s] 


Epoch 6 elapsed: 1.592s
	 [32mtrain_loss: 1.3031[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<?, ?it/s]


	 eval rmse: 1.1650


train: 100%|██████████| 1261/1261 [00:01<00:00, 977.43it/s] 


Epoch 7 elapsed: 1.292s
	 [32mtrain_loss: 1.3143[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<?, ?it/s]


	 eval rmse: 1.1598


train: 100%|██████████| 1261/1261 [00:01<00:00, 800.52it/s]


Epoch 8 elapsed: 1.580s
	 [32mtrain_loss: 1.3092[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 544.61it/s]


	 eval rmse: 1.1837


train: 100%|██████████| 1261/1261 [00:01<00:00, 792.25it/s]


Epoch 9 elapsed: 1.596s
	 [32mtrain_loss: 1.3248[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 122.87it/s]


	 eval rmse: 1.1681


train: 100%|██████████| 1261/1261 [00:01<00:00, 873.49it/s]


Epoch 10 elapsed: 1.455s
	 [32mtrain_loss: 1.3308[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 120.93it/s]


	 eval rmse: 1.1635


train: 100%|██████████| 1261/1261 [00:01<00:00, 984.14it/s]


Epoch 11 elapsed: 1.294s
	 [32mtrain_loss: 1.3318[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 2402.24it/s]


	 eval rmse: 1.1625


train: 100%|██████████| 1261/1261 [00:01<00:00, 937.17it/s] 


Epoch 12 elapsed: 1.342s
	 [32mtrain_loss: 1.3372[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<?, ?it/s]


	 eval rmse: 1.1669


train: 100%|██████████| 1261/1261 [00:01<00:00, 782.45it/s]


Epoch 13 elapsed: 1.605s
	 [32mtrain_loss: 1.3536[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 405.42it/s]


	 eval rmse: 1.1663


train: 100%|██████████| 1261/1261 [00:01<00:00, 712.85it/s]


Epoch 14 elapsed: 1.775s
	 [32mtrain_loss: 1.3593[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 163.02it/s]


	 eval rmse: 1.1629


train: 100%|██████████| 1261/1261 [00:01<00:00, 774.78it/s]


Epoch 15 elapsed: 1.625s
	 [32mtrain_loss: 1.3594[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 820.64it/s]


	 eval rmse: 1.1564


train: 100%|██████████| 1261/1261 [00:01<00:00, 784.85it/s]


Epoch 16 elapsed: 1.610s
	 [32mtrain_loss: 1.3809[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<?, ?it/s]


	 eval rmse: 1.1590


train: 100%|██████████| 1261/1261 [00:01<00:00, 790.00it/s]


Epoch 17 elapsed: 1.590s
	 [32mtrain_loss: 1.3513[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 633.01it/s]


	 eval rmse: 1.1780


train: 100%|██████████| 1261/1261 [00:01<00:00, 790.09it/s]


Epoch 18 elapsed: 1.615s
	 [32mtrain_loss: 1.3428[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 285.47it/s]


	 eval rmse: 1.1554


train: 100%|██████████| 1261/1261 [00:01<00:00, 870.48it/s]


Epoch 19 elapsed: 1.446s
	 [32mtrain_loss: 1.3825[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<00:00, 958.37it/s]


	 eval rmse: 1.1849


train: 100%|██████████| 1261/1261 [00:01<00:00, 797.28it/s]


Epoch 20 elapsed: 1.587s
	 [32mtrain_loss: 1.3605[0m


eval_pointwise: 100%|██████████| 2/2 [00:00<?, ?it/s]


	 eval rmse: 1.1784


In [38]:
# Item indices stored on the built training set.
# NOTE(review): these look like re-mapped internal indices rather than raw movieIds
# (compare with the item column of the split shown earlier) - confirm.
train_data.item_indices

array([2943,  585, 4361, ..., 2469, 2524, 1267], dtype=int64)

In [88]:
# Full ratings table with the original userId / movieId values (display is truncated).
df_ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [33]:
# Learned 16-dimensional embedding vector of user 1.
mysvd.get_user_embedding(1)

array([-0.18887505,  0.30995592, -0.2553538 ,  0.02148238,  0.27394977,
       -0.43262103, -0.08156496, -0.01705863, -0.37877437,  0.09085429,
        0.21179098, -0.50542915, -0.17378356,  0.04107404, -0.16036922,
        0.19457975], dtype=float32)

In [68]:
# Top-10 recommendations for a single user, with already-consumed items filtered out
# and random recommendation switched off. recommend_user returns a mapping keyed by
# user, so the list for this user is pulled out by its id.
user_id = 1
mylist = list(
    mysvd.recommend_user(
        user=user_id,
        n_rec=10,
        random_rec=False,
        filter_consumed=True,
    )[user_id]
)
mylist

[73804, 148424, 26359, 4292, 6506, 140289, 105540, 66090, 120813, 55729]

In [72]:
# Look up the movieId of 'Jurassic Park (1993)' by exact title match.
df_movies[df_movies.title=='Jurassic Park (1993)']

Unnamed: 0,movieId,title,genres
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller


In [73]:
# Ask the SVD model for the 10 nearest-neighbour items of movieId 480 (Jurassic Park).
mysvd.search_knn_items(480,10)

[7843, 96861, 72178, 74851, 148424, 107702, 122092, 102084, 6885, 118270]

In [74]:
# Learned embedding vector of movieId 480 (Jurassic Park).
mysvd.get_item_embedding(480)

array([-0.22210251, -0.10360587, -0.1825069 , -0.6639169 , -0.24370892,
       -0.03359564, -0.07777096, -0.28804234,  0.15468891,  0.16104098,
        0.39022174, -0.5732916 , -0.25358984,  0.25219274, -0.05396141,
        0.2521045 ], dtype=float32)

In [75]:
# Learned embedding vector of movieId 7843, the first neighbour returned for movie 480.
mysvd.get_item_embedding(7843)

array([-0.64590997, -0.2664566 ,  1.0879902 , -3.7340066 , -1.4406211 ,
       -1.5549015 , -1.4788551 , -1.924328  ,  0.76644236,  2.3885536 ,
        1.2764162 , -3.9307182 , -0.23398729, -0.44844377, -0.49260446,
       -0.6566653 ], dtype=float32)

In [76]:
# Cosine *distance* (1 - cosine similarity) between the embeddings of movie 480 and
# its first listed neighbour 7843; a smaller value means the vectors point in more
# similar directions.
# The submodule is imported explicitly: a bare `import scipy` does not guarantee that
# `scipy.spatial.distance` is loaded on every SciPy version.
import scipy.spatial.distance

scipy.spatial.distance.cosine(mysvd.get_item_embedding(480), mysvd.get_item_embedding(7843))

0.2176225781440735

In [71]:
# Titles and genres of the SVD model's 10 neighbours of movie 480. Boolean masking
# returns the rows in df_movies order, not in nearest-neighbour rank order.
df_movies.loc[df_movies['movieId'].isin(mysvd.search_knn_items(480, 10))]

Unnamed: 0,movieId,title,genres
4622,6885,In the Cut (2003),Crime|Drama|Mystery|Romance|Thriller
5038,7843,Lammbock (2001),Comedy
7178,72178,Welcome to Dongmakgol (2005),Comedy|Drama|War
7281,74851,From Paris with Love (2010),Action|Crime
7991,96861,Taken 2 (2012),Action|Crime|Drama|Thriller
8148,102084,Justice League: Doom (2012),Action|Animation|Fantasy
8332,107702,Grudge Match (2013),Comedy
8610,118270,Hellbenders (2012),Comedy|Horror|Thriller
8675,122092,Guy X (2005),Comedy|War
9159,148424,Chi-Raq (2015),Comedy|Drama


In [79]:
# Ask the SVD model for the 10 nearest-neighbour users of user 1.
mysvd.search_knn_users(1,10)

[281, 128, 251, 542, 529, 271, 203, 277, 257, 172]

In [81]:
# Predicted ratings of user 1 for five movies; the last two ids (73804, 148424) come
# from that user's recommendation list above.
mysvd.predict(
    user=[1] * 5,
    item=[12, 89, 2139, 73804, 148424],
    cold_start='popular',
)

array([2.3739839, 2.917882 , 3.9494333, 5.       , 5.       ],
      dtype=float32)

In [84]:
# Switch the SVD model's KNN lookups to approximate search with cosine similarity.
mysvd.init_knn(approximate=True, sim_type='cosine')

using approximate searching mode...


In [87]:
# Repeat the neighbour lookup for movie 480 after init_knn(approximate=True).
mysvd.search_knn_items(480,10)
# Earlier result: [7843, 96861, 72178, 74851, 148424, 107702, 122092, 102084, 6885, 118270]
# The neighbours have changed: the earlier lookup was brute-force, whereas this one
# uses approximate search (and the query item 480 itself is now part of the result).

[480, 7843, 34321, 6885, 3873, 135567, 181139, 6550, 6223, 74851]

In [96]:
# ALS model for the explicit-rating task on the same data_info as the SVD model:
# 16-dimensional embeddings, 20 epochs, conjugate-gradient option enabled,
# regularisation 0.05, 3 threads, fixed seed.
myALS = ALS(
    task='rating',
    data_info=data_info,
    embed_size=16,
    n_epochs=20,
    reg=0.05,
    use_cg=True,
    n_threads=3,
    seed=100,
)

In [1]:

# Train the ALS model on the explicit ratings (no negative sampling), evaluating on
# eval_data during training.
# NOTE(review): the saved output of this cell is a NameError because it was executed
# in a fresh kernel before the cell that defines myALS - run the notebook top to bottom.
myALS.fit(
    train_data,
    neg_sampling=False,
    eval_data=eval_data,
    shuffle=True,
    k=10,
    verbose=2,
)


NameError: name 'myALS' is not defined

In [107]:
# Predicted ratings of user 1 for the same five movies that were scored with the SVD
# model, so the two models can be compared side by side.
myALS.predict(
    user=[1] * 5,
    item=[12, 89, 2139, 73804, 148424],
    cold_start='popular',
)

array([1.3981931, 3.3185241, 5.       , 3.744216 , 4.292846 ],
      dtype=float32)

In [104]:
# Use exact (non-approximate) KNN search with cosine similarity for the ALS model.
myALS.init_knn(approximate=False, sim_type='cosine')

In [105]:
# 10 nearest-neighbour items of movieId 480 (Jurassic Park) according to the ALS model;
# the query item itself is returned first.
myALS.search_knn_items(480,10)

[480, 1580, 1198, 467, 1006, 2020, 1036, 704, 5443, 33794]

In [106]:
# Titles and genres of the ALS model's 10 neighbours of movie 480. Boolean masking
# returns the rows in df_movies order, not in nearest-neighbour rank order.
df_movies.loc[df_movies['movieId'].isin(myALS.search_knn_items(480, 10))]

Unnamed: 0,movieId,title,genres
405,467,Live Nude Girls (1995),Comedy
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
574,704,"Quest, The (1996)",Action|Adventure
764,1006,"Chamber, The (1996)",Drama
793,1036,Die Hard (1988),Action|Crime|Thriller
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
1183,1580,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi
1495,2020,Dangerous Liaisons (1988),Drama|Romance
3871,5443,Juwanna Mann (2002),Comedy
5917,33794,Batman Begins (2005),Action|Crime|IMAX
