# Probabilistic Matrix Factorization build 

Reference : https://www.kaggle.com/code/robottums/probabalistic-matrix-factorization-with-suprise 

The original code used the Surprise dataset format; this version reads a NumPy array or a pandas DataFrame instead. 

In [60]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt 
import random
import argparse
import pickle
import torch

In [61]:
# Example rating matrix R: one row per user, one column per item.
# A value of 0 is treated as "not rated" by the model below.
# Example: user 0 gave item 0 a rating of 1 and item 4 a rating of 3.
R = np.array(
    [
        [1, 0, 0, 1, 3],
        [2, 0, 3, 1, 1],
        [1, 2, 0, 5, 0],
        [1, 0, 0, 4, 4],
        [2, 1, 5, 4, 0],
        [5, 1, 5, 4, 0],
        [0, 0, 0, 1, 0],
    ]
)

# The latent matrices P (users) and Q (items) have shapes (7, d) and (5, d); R is approximated by P @ Q.T

In [62]:
class ProbabilisticMatrixFactorization(torch.nn.Module):
    """Factorize a dense rating matrix with per-rating stochastic gradient steps.

    Every non-zero entry ``R[i, j]`` is treated as an observed rating and is
    approximated by the dot product of user vector ``i`` and item vector ``j``.
    Zero entries are treated as "not rated" and are skipped during training.

    The latent factors are plain NumPy arrays, so although the class derives
    from ``torch.nn.Module`` it registers no torch parameters.

    NOTE(review): ``reg_param``, ``user_bias``, ``item_bias`` and ``all_bias``
    are stored but are not used by ``fit`` or ``test``.
    """

    def __init__(self, R, d=3, lr=0.09, reg_param=0.01, epochs=2):
        """Store the hyper-parameters and randomly initialise the latent factors.

        Args:
            R: 2-D NumPy array of ratings, shape (n_users, n_items); 0 = unrated.
            d: Number of latent dimensions.
            lr: Learning rate of the SGD updates.
            reg_param: Regularisation strength (stored only, currently unused).
            epochs: Number of full passes over the observed ratings.
        """
        super().__init__()
        self.R = R
        self.d = d
        self.lr = lr
        self.reg_param = reg_param
        self.epochs = epochs
        self.nusers = R.shape[0]
        self.mitems = R.shape[1]

        # Latent vectors, drawn from N(0, (1/d)^2).
        self.user_vecs = np.random.normal(scale=1. / self.d, size=(self.nusers, self.d))
        self.item_vecs = np.random.normal(scale=1. / self.d, size=(self.mitems, self.d))

        # Biases (allocated but not used by fit/test).
        self.user_bias = np.zeros(self.nusers)
        self.item_bias = np.zeros(self.mitems)

        # Mean over the non-zero ratings; used as the fallback prediction in test().
        self.all_bias = np.mean(self.R[np.where(self.R != 0)])
        self.all_mean = np.sum(self.R) / len(np.where(self.R > 0)[0])
        print("Average of all ratings:", self.all_mean)

    def rui(self):
        # NOTE(review): no ``rui`` attribute is ever assigned, so this returns
        # the bound method itself. Kept as-is for interface compatibility.
        return self.rui

    def uv(self):
        """Return the (n_users, d) matrix of user latent vectors."""
        return self.user_vecs

    def fit(self):
        """Run ``epochs`` SGD passes over all non-zero ratings, updating in place.

        Every 10th epoch the prediction for entry (user 0, item 3) is printed
        when that entry is rated.
        """
        rows, cols = self.R.nonzero()
        print("Model epochs:", self.epochs)
        for epoch in range(self.epochs):
            for i, j in zip(rows, cols):
                # i = user index, j = item index, R[i, j] = r_ui
                prediction = np.dot(self.user_vecs[i], self.item_vecs[j])
                residual = self.R[i, j] - prediction
                if i == 0 and j == 3 and epoch % 10 == 0:
                    print("[Epoch %2d] Target Score %d, Predicted Score %.4f, Residual %.4f" %(epoch, self.R[i, j], prediction, residual))
                # Basic slicing returns a view, so take a real copy: the item
                # update must use the user vector from BEFORE this step.
                old_user_vec = self.user_vecs[i, :].copy()
                self.user_vecs[i, :] += self.lr * residual * self.item_vecs[j, :]
                self.item_vecs[j, :] += self.lr * residual * old_user_vec

    def test(self, user_ind, item_ind):
        """Predict the rating of ``item_ind`` by ``user_ind``.

        Returns the global mean rating when either index is out of range or
        when the predicted value is NaN; otherwise the dot product of the two
        latent vectors.
        """
        if not (0 <= user_ind < self.nusers and 0 <= item_ind < self.mitems):
            return self.all_mean
        prediction = np.dot(self.user_vecs[user_ind, :], self.item_vecs[item_ind, :])
        if np.isnan(prediction):
            return self.all_mean
        return prediction
      

In [63]:
# Train on the example matrix. fit() prints the residual of one fixed rating
# every 10 epochs; it should shrink as training progresses.
factorizer = ProbabilisticMatrixFactorization(
    R,
    d=3,
    lr=0.01,
    reg_param=0.01,
    epochs=100,
)
factorizer.fit()

Average of all ratings: 2.590909090909091
Model epochs: 100
[Epoch  0] Target Score 1, Predicted Score 0.0178, Residual 0.9822
[Epoch 10] Target Score 1, Predicted Score 0.0874, Residual 0.9126
[Epoch 20] Target Score 1, Predicted Score 0.7642, Residual 0.2358
[Epoch 30] Target Score 1, Predicted Score 1.2819, Residual -0.2819
[Epoch 40] Target Score 1, Predicted Score 1.3501, Residual -0.3501
[Epoch 50] Target Score 1, Predicted Score 1.3160, Residual -0.3160
[Epoch 60] Target Score 1, Predicted Score 1.2535, Residual -0.2535
[Epoch 70] Target Score 1, Predicted Score 1.1818, Residual -0.1818
[Epoch 80] Target Score 1, Predicted Score 1.1109, Residual -0.1109
[Epoch 90] Target Score 1, Predicted Score 1.0490, Residual -0.0490


In [64]:
# Ground truth: the rating user 0 gave to item 4
R[0][4]

3

In [65]:
# Model prediction for the same pair (user 0, item 4); about 2.9, close to the true 3
factorizer.test(user_ind=0, item_ind=4)

2.9497440069465393

# Using a DataFrame

Here, the ratings are stored in a DataFrame instead of a matrix. 
The first column is the user ID, the second column is the movie ID, and the last column is the rating that user gave to that movie. 

In [66]:
# Row (user) and column (item) indices of every non-zero, i.e. observed, rating
xi, yi = np.nonzero(R)

In [86]:
# One record per observed rating: [user index, item index, rating value]
make_np_lst = [[x, y, R[x, y]] for x, y in zip(xi, yi)]
data = pd.DataFrame(
    make_np_lst,
    columns=['User_ID', 'Movie_ID', 'Rating'],
)
data.head()

Unnamed: 0,User_ID,Movie_ID,Rating
0,0,0,1
1,0,3,1
2,0,4,3
3,1,0,2
4,1,2,3


In [87]:
# Summary statistics per column. The mean of the Rating column (2.59) is the
# average of all observed ratings.
# (The former bare `data[data.columns[0]]` line was removed: its result was
# discarded, so it had no effect.)
data.describe()

Unnamed: 0,User_ID,Movie_ID,Rating
count,22.0,22.0,22.0
mean,2.772727,1.909091,2.590909
std,1.875451,1.477098,1.623022
min,0.0,0.0,1.0
25%,1.0,0.25,1.0
50%,3.0,2.0,2.0
75%,4.0,3.0,4.0
max,6.0,4.0,5.0


In [70]:
# This version differs slightly from the class above because it reads the ratings from a pandas DataFrame.

class ProbabilisticMatrixFactorization(torch.nn.Module):
    """Factorize ratings stored in a DataFrame with per-rating SGD steps.

    The first three columns of the frame are read positionally as
    (user id, item id, rating). Each rating is approximated by the dot product
    of the matching user and item latent vectors.

    IDs are used directly as row indices into the factor matrices, so they are
    assumed to be zero-based non-negative integers -- TODO confirm for real data.

    The latent factors are plain NumPy arrays, so although the class derives
    from ``torch.nn.Module`` it registers no torch parameters.

    NOTE(review): ``reg_param``, ``user_bias`` and ``item_bias`` are stored but
    are not used by ``fit`` or ``test``.
    """

    def __init__(self, dataframe, d=3, lr=0.09, reg_param=0.01, epochs=2):
        """Store the hyper-parameters and randomly initialise the latent factors.

        Args:
            dataframe: pandas DataFrame whose first three columns are
                user id, item id and rating.
            d: Number of latent dimensions.
            lr: Learning rate of the SGD updates.
            reg_param: Regularisation strength (stored only, currently unused).
            epochs: Number of full passes over the ratings.
        """
        super().__init__()
        self.data = dataframe
        self.d = d
        self.lr = lr
        self.reg_param = reg_param
        self.epochs = epochs

        # Size the factor matrices by the largest ID, not by the number of
        # rating rows: the row count is unrelated to how many users/items
        # exist and would make the bounds check in test() meaningless.
        user_ids = self.data.iloc[:, 0]
        item_ids = self.data.iloc[:, 1]
        self.nusers = int(user_ids.max()) + 1 if len(user_ids) else 0
        self.mitems = int(item_ids.max()) + 1 if len(item_ids) else 0

        # Latent vectors, drawn from N(0, (1/d)^2).
        self.user_vecs = np.random.normal(scale=1. / self.d, size=(self.nusers, self.d))
        self.item_vecs = np.random.normal(scale=1. / self.d, size=(self.mitems, self.d))

        # Biases (allocated but not used by fit/test).
        self.user_bias = np.zeros(self.nusers)
        self.item_bias = np.zeros(self.mitems)

        # Mean of all ratings; used as the fallback prediction in test().
        self.all_mean = np.sum(self.data.iloc[:, 2]) / self.data.shape[0]
        print("Average of all ratings:", self.all_mean)

    def rui(self):
        # NOTE(review): no ``rui`` attribute is ever assigned, so this returns
        # the bound method itself. Kept as-is for interface compatibility.
        return self.rui

    def uv(self):
        """Return the (n_users, d) matrix of user latent vectors."""
        return self.user_vecs

    def fit(self):
        """Run ``epochs`` SGD passes over every rating row, updating in place.

        Every 10th epoch the prediction for the row whose index label is 3 is
        printed.
        """
        # Pull the columns out once. Iterating rows with iterrows() would
        # upcast a mixed int/float row to float and break array indexing.
        users = self.data.iloc[:, 0].to_numpy().astype(int)
        items = self.data.iloc[:, 1].to_numpy().astype(int)
        ratings = self.data.iloc[:, 2].to_numpy()
        labels = self.data.index

        print("Model epochs:", self.epochs)
        for epoch in range(self.epochs):
            for ind, i, j, r_ui in zip(labels, users, items, ratings):
                prediction = np.dot(self.user_vecs[i], self.item_vecs[j])
                residual = r_ui - prediction
                if ind == 3 and epoch % 10 == 0:
                    print("[Epoch %2d] Target Score %d, Predicted Score %.4f, Residual %.4f" %(epoch, r_ui, prediction, residual))
                # Basic slicing returns a view, so take a real copy: the item
                # update must use the user vector from BEFORE this step.
                old_user_vec = self.user_vecs[i, :].copy()
                self.user_vecs[i, :] += self.lr * residual * self.item_vecs[j, :]
                self.item_vecs[j, :] += self.lr * residual * old_user_vec

    def test(self, user_ind, item_ind):
        """Predict the rating of ``item_ind`` by ``user_ind``.

        Returns the global mean rating when either index is out of range or
        when the predicted value is NaN; otherwise the dot product of the two
        latent vectors.
        """
        if not (0 <= user_ind < self.nusers and 0 <= item_ind < self.mitems):
            return self.all_mean
        prediction = np.dot(self.user_vecs[user_ind, :], self.item_vecs[item_ind, :])
        if np.isnan(prediction):
            return self.all_mean
        return prediction
      

In [71]:
# Train the DataFrame-based model on the long-format ratings table
factorizer = ProbabilisticMatrixFactorization(
    data,
    d=3,
    lr=0.01,
    reg_param=0.01,
    epochs=200,
)
factorizer.fit()

Average of all ratings: 2.590909090909091
Model epochs: 200
[Epoch  0] Target Score 2, Predicted Score -0.0621, Residual 2.0621
[Epoch 10] Target Score 2, Predicted Score -0.0419, Residual 2.0419
[Epoch 20] Target Score 2, Predicted Score 0.0540, Residual 1.9460
[Epoch 30] Target Score 2, Predicted Score 0.6556, Residual 1.3444
[Epoch 40] Target Score 2, Predicted Score 1.3239, Residual 0.6761
[Epoch 50] Target Score 2, Predicted Score 1.6260, Residual 0.3740
[Epoch 60] Target Score 2, Predicted Score 1.8020, Residual 0.1980
[Epoch 70] Target Score 2, Predicted Score 1.9219, Residual 0.0781
[Epoch 80] Target Score 2, Predicted Score 2.0003, Residual -0.0003
[Epoch 90] Target Score 2, Predicted Score 2.0484, Residual -0.0484
[Epoch 100] Target Score 2, Predicted Score 2.0757, Residual -0.0757
[Epoch 110] Target Score 2, Predicted Score 2.0891, Residual -0.0891
[Epoch 120] Target Score 2, Predicted Score 2.0929, Residual -0.0929
[Epoch 130] Target Score 2, Predicted Score 2.0899, Residua

In [72]:
# Ground truth: the row holding user 0's rating of movie 3
data.loc[(data['User_ID'] == 0) & (data['Movie_ID'] == 3)]

Unnamed: 0,User_ID,Movie_ID,Rating
1,0,3,1


In [76]:
# Prediction vs. truth for (user 0, movie 3): rating 1, predicted about 1.8 -- roughly similar
user = 0
item = 3
print(data.loc[(data['User_ID'] == user) & (data['Movie_ID'] == item)])
factorizer.test(user, item)

   User_ID  Movie_ID  Rating
1        0         3       1


1.8762257085202614

In [77]:
# Prediction vs. truth for (user 2, movie 3): here the two values are very close
user = 2
item = 3
print(data.loc[(data['User_ID'] == user) & (data['Movie_ID'] == item)])
factorizer.test(user, item)

   User_ID  Movie_ID  Rating
9        2         3       5


5.003330730985496

In [78]:
# Put the model's prediction next to the true rating for every observed pair
result = pd.DataFrame(
    [
        [user, item, rating, factorizer.test(user, item)]
        for user, item, rating in zip(data['User_ID'], data['Movie_ID'], data['Rating'])
    ],
    columns=['User_ID', 'Movie_ID', 'Rating', 'Predicted'],
)
result

Unnamed: 0,User_ID,Movie_ID,Rating,Predicted
0,0,0,1,0.850884
1,0,3,1,1.876226
2,0,4,3,2.368737
3,1,0,2,2.022846
4,1,2,3,2.917454
5,1,3,1,1.216325
6,1,4,1,0.880686
7,2,0,1,1.012302
8,2,1,2,1.979015
9,2,3,5,5.003331


In [85]:
# Predict every (user, item) cell, including the pairs with no known rating.
# int() truncates each prediction toward zero.
grid = [
    [int(factorizer.test(user_ind, item_ind)) for item_ind in range(R.shape[1])]
    for user_ind in range(R.shape[0])
]
grid

[[0, 0, 0, 1, 2],
 [2, 0, 2, 1, 0],
 [1, 1, 2, 5, 6],
 [1, 1, 2, 3, 4],
 [2, 1, 4, 4, 4],
 [4, 0, 5, 3, 4],
 [-1, 0, 0, 1, 1]]

In [79]:
# Display the original rating matrix R for comparison with the predictions
R

array([[1, 0, 0, 1, 3],
       [2, 0, 3, 1, 1],
       [1, 2, 0, 5, 0],
       [1, 0, 0, 4, 4],
       [2, 1, 5, 4, 0],
       [5, 1, 5, 4, 0],
       [0, 0, 0, 1, 0]])