# Probabilistic Matrix Factorization build 

Reference : https://www.kaggle.com/code/robottums/probabalistic-matrix-factorization-with-suprise 

The original code used a Surprise dataset, but this code reads a pandas DataFrame instead. 

Here, the ratings are stored as a DataFrame of (user, movie, rating) rows rather than as a dense user-by-movie matrix. 
The first column is user_id and the second column is movie_id. The last column is the rating that the user gave to that movie. 

In [14]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt 
import random
import argparse
import pickle
import torch

In [15]:
# Toy rating data: each entry is [user_id, movie_id, rating].
make_np_lst = [
    # user 0
    [0, 0, 1], [0, 3, 1], [0, 4, 3],
    # user 1
    [1, 0, 2], [1, 2, 3], [1, 3, 1], [1, 4, 1],
    # user 2
    [2, 0, 1], [2, 1, 2], [2, 3, 5],
    # user 3
    [3, 0, 1], [3, 3, 4], [3, 4, 4],
    # user 4
    [4, 0, 2], [4, 1, 1], [4, 2, 5], [4, 3, 4],
    # user 5
    [5, 0, 5], [5, 1, 1], [5, 2, 5], [5, 3, 4],
    # user 6
    [6, 3, 1],
]

In [16]:
# Wrap the raw triples in a DataFrame with named columns and preview it.
column_names = ['User_ID', 'Movie_ID', 'Rating']
data = pd.DataFrame(data=make_np_lst, columns=column_names)
data.head()

Unnamed: 0,User_ID,Movie_ID,Rating
0,0,0,1
1,0,3,1
2,0,4,3
3,1,0,2
4,1,2,3


In [17]:
# Summary statistics for every column. The mean of the Rating column
# (about 2.59) is the average over all observed ratings.
data.describe()

Unnamed: 0,User_ID,Movie_ID,Rating
count,22.0,22.0,22.0
mean,2.772727,1.909091,2.590909
std,1.875451,1.477098,1.623022
min,0.0,0.0,1.0
25%,1.0,0.25,1.0
50%,3.0,2.0,2.0
75%,4.0,3.0,4.0
max,6.0,4.0,5.0


In [18]:
class ProbabilisticMatrixFactorization(torch.nn.Module):
    """Biased matrix factorization trained with per-rating SGD.

    The predicted rating of user ``u`` for item ``i`` is::

        global_average + user_bias[u] + item_bias[i] + user_vecs[u] . item_vecs[i]

    The input DataFrame must hold, in its first three columns and in this
    order: user index, item index, rating. The indices are used directly
    as row numbers into the parameter arrays, which are sized by the number
    of *unique* ids, so ids are expected to be the integers ``0 .. n-1``
    without gaps.

    NOTE(review): the class derives from ``torch.nn.Module`` but every
    parameter is a plain NumPy array updated by hand; nothing here is
    registered with or optimized by torch.
    """

    def __init__(self, dataframe, d=10, lr=0.01, reg_param=0.05, epochs=10):
        """Set hyper-parameters and randomly initialize the model.

        Args:
            dataframe: ratings table (user index, item index, rating).
            d: number of latent dimensions per user/item vector.
            lr: SGD learning rate.
            reg_param: L2 regularization strength for biases and vectors.
            epochs: number of full passes over the ratings in ``fit``.
        """
        super().__init__()
        self.data = dataframe
        self.d = d
        self.lr = lr
        self.reg_param = reg_param
        self.epochs = epochs
        self.nusers = len(np.unique(self.data[self.data.columns[0]]))  # R.shape[0]
        self.mitems = len(np.unique(self.data[self.data.columns[1]]))  # R.shape[1]
        print(self.nusers, self.mitems)

        # Latent vectors, drawn from a zero-mean Gaussian with std 1/d.
        # User vectors are drawn before item vectors so that a seeded
        # global NumPy RNG reproduces the same initialization.
        self.user_vecs = np.random.normal(scale=1. / self.d, size=(self.nusers, self.d))
        self.item_vecs = np.random.normal(scale=1. / self.d, size=(self.mitems, self.d))

        # Biases start at zero.
        self.user_bias = np.zeros(self.nusers)
        self.item_bias = np.zeros(self.mitems)

        # Mean of all observed ratings; the baseline of every prediction.
        self.global_average = np.sum(self.data.iloc[:, 2]) / self.data.shape[0]
        print("Average of all ratings:", self.global_average)

    def rui(self):
        # NOTE(review): no attribute named ``rui`` is ever assigned, so this
        # returns the bound method itself. Left unchanged because the
        # intended return value cannot be determined from this file.
        return self.rui

    def uv(self):
        """Return the (nusers, d) array of user latent vectors."""
        return self.user_vecs

    def fit(self):
        """Run ``self.epochs`` passes of SGD over all observed ratings.

        Parameters are updated in place after every single rating. Every
        tenth epoch the root of the mean squared training residual is
        printed.
        """
        print("Model epochs:", self.epochs)
        n_ratings = self.data.shape[0]

        # Extract the columns once instead of calling iterrows() each epoch.
        # iterrows() also upcasts a whole row to float when the rating
        # column is float, which turned the ids into floats and broke array
        # indexing; casting the id columns to int avoids that.
        users = self.data.iloc[:, 0].to_numpy().astype(int)
        items = self.data.iloc[:, 1].to_numpy().astype(int)
        ratings = self.data.iloc[:, 2].to_numpy()

        for epoch in range(self.epochs):
            cost = 0
            for i, j, r_ui in zip(users, items, ratings):
                mul_vec = np.dot(self.user_vecs[i], self.item_vecs[j])
                prediction = self.global_average + self.user_bias[i] + self.item_bias[j] + mul_vec
                residual = r_ui - prediction

                # Update biases.
                self.user_bias[i] += self.lr * (residual - self.reg_param * self.user_bias[i])
                self.item_bias[j] += self.lr * (residual - self.reg_param * self.item_bias[j])

                # Both gradients are computed before either vector is
                # modified, so each one sees the other's pre-update value.
                du = (residual * self.item_vecs[j, :]) - (self.reg_param * self.user_vecs[i, :])
                di = (residual * self.user_vecs[i, :]) - (self.reg_param * self.item_vecs[j, :])
                self.user_vecs[i, :] += self.lr * du
                self.item_vecs[j, :] += self.lr * di

                cost += pow(residual, 2)
            if epoch % 10 == 0:
                # The label says "MSE" but the square root is taken, so the
                # printed number is the RMSE.
                print("[Epoch %2d] MSE score : %4f" % (epoch, np.sqrt(cost / n_ratings)))

    def test(self, user_ind, item_ind):
        """Predict the rating of user ``user_ind`` for item ``item_ind``.

        Indices may be given as integral floats (e.g. values read from a
        row that pandas upcast to float); they are converted with ``int``.

        Returns:
            The model prediction, or ``self.global_average`` when either
            index is outside the trained range or the latent dot product
            is NaN.
        """
        user_ind, item_ind = int(user_ind), int(item_ind)
        if not (0 <= user_ind < self.nusers and 0 <= item_ind < self.mitems):
            return self.global_average
        interaction = np.dot(self.user_vecs[user_ind, :], self.item_vecs[item_ind, :])
        if np.isnan(interaction):
            return self.global_average
        return self.global_average + self.user_bias[user_ind] + self.item_bias[item_ind] + interaction
      

In [19]:
# Build the model with explicit hyper-parameters, then train it on `data`.
hyperparams = {'d': 10, 'lr': 0.01, 'reg_param': 0.01, 'epochs': 200}
factorizer = ProbabilisticMatrixFactorization(data, **hyperparams)
factorizer.fit()

7 5
Average of all ratings: 2.590909090909091
Model epochs: 200
[Epoch  0] MSE score : 1.586691
[Epoch 10] MSE score : 1.332078
[Epoch 20] MSE score : 1.159979
[Epoch 30] MSE score : 1.023852
[Epoch 40] MSE score : 0.894389
[Epoch 50] MSE score : 0.755746
[Epoch 60] MSE score : 0.610825
[Epoch 70] MSE score : 0.479457
[Epoch 80] MSE score : 0.378520
[Epoch 90] MSE score : 0.307603
[Epoch 100] MSE score : 0.256953
[Epoch 110] MSE score : 0.218274
[Epoch 120] MSE score : 0.186972
[Epoch 130] MSE score : 0.160778
[Epoch 140] MSE score : 0.138528
[Epoch 150] MSE score : 0.119540
[Epoch 160] MSE score : 0.103353
[Epoch 170] MSE score : 0.089599
[Epoch 180] MSE score : 0.077962
[Epoch 190] MSE score : 0.068155


In [20]:
# Show the observed rating for one (user, movie) pair, then the model's
# prediction for the same pair; the two should be reasonably close.
user, item = 0, 3
same_user = data['User_ID'] == user
same_item = data['Movie_ID'] == item
print(data[same_user & same_item])
factorizer.test(user, item)

   User_ID  Movie_ID  Rating
1        0         3       1


1.089412610289063

In [21]:
# A second pair: here the observed rating and the prediction are very close.
user, item = 2, 3
pair_mask = (data['User_ID'] == user) & (data['Movie_ID'] == item)
print(data.loc[pair_mask])
factorizer.test(user, item)

   User_ID  Movie_ID  Rating
9        2         3       5


4.972867385967501

In [22]:
# Put every observed rating next to the model's prediction for it.
records = []
for _, entry in data.iterrows():
  uid, mid = entry['User_ID'], entry['Movie_ID']
  records.append([uid, mid, entry['Rating'], factorizer.test(uid, mid)])
result = pd.DataFrame(records, columns=['User_ID', 'Movie_ID', 'Rating', 'Predicted'])
result

Unnamed: 0,User_ID,Movie_ID,Rating,Predicted
0,0,0,1,1.008546
1,0,3,1,1.089413
2,0,4,3,2.884285
3,1,0,2,1.99981
4,1,2,3,2.981994
5,1,3,1,0.970277
6,1,4,1,1.104168
7,2,0,1,1.009283
8,2,1,2,1.967444
9,2,3,5,4.972867


In [23]:
# Before looking at predictions for unrated pairs, rebuild the original
# dense rating matrix R (rows = users, columns = movies); 0 marks a pair
# that has no observed rating.
user_col, item_col, rating_col = data.columns[0], data.columns[1], data.columns[2]
nusers = len(np.unique(data[user_col]))
mitems = len(np.unique(data[item_col]))
org_R = [[0] * mitems for _ in range(nusers)]
for _, entry in data.iterrows():
  org_R[entry[user_col]][entry[item_col]] = entry[rating_col]
org_R

[[1, 0, 0, 1, 3],
 [2, 0, 3, 1, 1],
 [1, 2, 0, 5, 0],
 [1, 0, 0, 4, 4],
 [2, 1, 5, 4, 0],
 [5, 1, 5, 4, 0],
 [0, 0, 0, 1, 0]]

In [25]:
# Now print the model's prediction for every (user, movie) pair, including
# the pairs that have no observed rating, rounded to one decimal place.
grid = [
  [np.round(factorizer.test(u, m), 1) for m in range(mitems)]
  for u in range(nusers)
]
grid

[[1.0, 0.4, 3.5, 1.1, 2.9],
 [2.0, -0.7, 3.0, 1.0, 1.1],
 [1.0, 2.0, 4.8, 5.0, 4.4],
 [1.0, 1.4, 4.4, 4.0, 4.0],
 [2.0, 1.1, 4.9, 4.0, 3.8],
 [5.0, 1.0, 5.1, 4.0, 3.6],
 [1.8, -0.1, 3.2, 1.0, 2.1]]