### Sharon Laurance Muthipeedika
### 312486

### Exercise 2: Implementing basic matrix factorization (MF) technique for recommender systems

In [1]:
#Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
import os
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
os.chdir('gdrive/MyDrive/LAB10') 

In [4]:
movie_lens=pd.read_csv("ml-100k/u.data",header=None,delim_whitespace=True)

In [5]:
movie_lens.columns=["User_Id","Movie_Id","Rating","Timestamp"]

In [13]:
n_u = len(movie_lens["User_Id"].unique())
n_m = len(movie_lens["Movie_Id"].unique())
sparsity = len(movie_lens)/(n_u*n_m)
print("sparsity of ratings is %.2f%%" %(sparsity*100))

sparsity of ratings is 6.30%


In [14]:
# Split the dataframe into a train ,validation and test set
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(movie_lens,test_size=0.1)
train_data, val_data = train_test_split(train_data,test_size=0.11111)
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)
val_data = pd.DataFrame(val_data)

In [15]:
print(np.shape(train_data))
print(np.shape(val_data))
print(np.shape(test_data))

(80000, 4)
(10000, 4)
(10000, 4)


In [16]:
test_data

Unnamed: 0,User_Id,Movie_Id,Rating,Timestamp
86824,643,226,2,891449476
24139,386,515,5,877654961
4559,216,226,3,880244803
13560,121,57,5,891390014
88537,894,935,3,879896815
...,...,...,...,...
38071,299,94,1,889503564
59066,622,402,3,882670252
28026,456,462,3,881373506
66231,889,186,5,880181563


In [18]:
# Create training ,validation and test matrix

R = np.zeros((n_u, n_m))
for line in train_data.itertuples():
  R[line[1]-1, line[2]-1] = line[3]  


V = np.zeros((n_u, n_m))
for line in val_data.itertuples():
  V[line[1]-1, line[2]-1] = line[3] 


T = np.zeros((n_u, n_m))
for line in test_data.itertuples():
  T[line[1]-1, line[2]-1] = line[3]

In [19]:
np.shape(V)

(943, 1682)

Matrix Factorization

$$\mathbf{R} \approx \mathbf{P} \times \mathbf{Q}^T = \hat{\mathbf{R}}$$\
$\hat{r}_{ij} = p_i^T q_j = \sum_{k=1}^K{p_{ik} q_{kj}}$\
$e_{ij}^2 = (r_{ij} - \hat{r}_{ij})^2 = (r_{ij} - \sum_{k=1}^K{p_{ik}q_{kj}})^2$\
$E = \sum_{(u_i, d_j, r_{ij}) \in T}{e_{ij}} = \sum_{(u_i,d_j,r_{ij}) \in T}{(r_{ij} - \sum_{k=1}^K{p_{ik}q_{kj}})^2}$

In [20]:
class MC(BaseEstimator):
    
    def __init__(self,  n_u, n_m, n_factors=10, n_epochs=250, lmbda=10, gamma=9e-5):
        self.n_u = n_u #Number of Users
        self.n_m = n_m #Number of Movies
        self.n_factors = n_factors #Latent Factors
        self.n_epochs = n_epochs
        self.lmbda = lmbda #Regularisation Constany
        self.gamma = gamma #Learning Rate
        

    def fit(self, X, y):
        """Fits all the latent factors for users and items and saves the resulting matrix representations.
        """
        X, y = check_X_y(X, y)
        
        
        # Create training matrix
        R = np.zeros((self.n_u, self.n_m))
        print(np.shape(R))
        for idx, row in enumerate(X):
            R[row[0]-1, row[1]-1] = y[idx]  

        # Initialize latent factors
        P = 3 * np.random.rand(self.n_u, self.n_factors) # Latent factors for users
        Q = 3 * np.random.rand(self.n_m, self.n_factors) # Latent factors for movies
                    

        def rmse_score(R, Q, P):
            I = R != 0  # Indicator function which is zero for missing data
            ME = I * (R - np.dot(P, Q.T))  # Errors between real and predicted ratings
            MSE = ME**2  
            return np.sqrt(np.sum(MSE)/np.sum(I))  # sum of squared errors

        # Fit with stochastic or batch gradient descent
        train_errors = []
        # Stochastic GD
        users,items = R.nonzero()      
        for epoch in range(self.n_epochs):
          for u, i in zip(users,items):
            e = R[u, i] - np.dot(P[u, :], Q[i, :].T)  # Error for this observation
            P[u, :] += self.gamma * ( e * Q[i, :] - self.lmbda * P[u, :]) # Update this user's features
            Q[i, :] += self.gamma * ( e * P[u, :] - self.lmbda * Q[i, :])  # Update this movie's features
          train_errors.append(rmse_score(R,Q,P)) # Training RMSE for this pass
      

        print("Completed %i epochs, final RMSE = %.2f" %(self.n_epochs, train_errors[-1]))
        self.Q = Q
        self.P = P
        self.train_errors = train_errors
        
        # Return the estimator
        return self

    def predict(self, X):
        """ Predicts a vector of ratings from a matrix of user and item ids.
        """
        X = check_array(X)
        
        y = np.zeros(len(X))
        PRED = np.dot(self.P, self.Q.T)
        for idx, row in enumerate(X):
            y[idx] = PRED[row[0]-1, row[1]-1]
        
        
        return y

    def score(self, X, y):
        """ Element-wise root mean squared error.
        """
        yp = self.predict(X)
        err = y - yp
        mse = np.sum(np.multiply(err, err))/len(err)
        return np.sqrt(mse)

In [23]:
X = movie_lens[["User_Id", "Movie_Id"]].to_numpy()
y = movie_lens["Rating"].values
n_u = len(movie_lens["User_Id"].unique())
n_m = len(movie_lens["Movie_Id"].unique())


#### Optimize the hyper-parameters i.e. λ regularization constant, α learning rate, k latent dimensions.

In [24]:
rcmdr = MC(n_u=n_u, n_m=n_m, gamma=6e-5, n_epochs=400)
params = {"lmbda": (45, 50),
         "n_factors": (15, 18)} #n_factors=k latent dimensions ,lambda=regularisation constant
search = GridSearchCV(rcmdr, param_grid=params, cv=3)
search.fit(X, y)

(943, 1682)
Completed 400 epochs, final RMSE = 3.70
(943, 1682)
Completed 400 epochs, final RMSE = 3.71
(943, 1682)
Completed 400 epochs, final RMSE = 3.71
(943, 1682)
Completed 400 epochs, final RMSE = 3.70
(943, 1682)
Completed 400 epochs, final RMSE = 3.71
(943, 1682)
Completed 400 epochs, final RMSE = 3.71
(943, 1682)
Completed 400 epochs, final RMSE = 3.70
(943, 1682)
Completed 400 epochs, final RMSE = 3.71
(943, 1682)
Completed 400 epochs, final RMSE = 3.71
(943, 1682)
Completed 400 epochs, final RMSE = 3.70
(943, 1682)
Completed 400 epochs, final RMSE = 3.71
(943, 1682)
Completed 400 epochs, final RMSE = 3.71
(943, 1682)
Completed 400 epochs, final RMSE = 3.70


GridSearchCV(cv=3, estimator=MC(gamma=6e-05, n_epochs=400, n_m=1682, n_u=943),
             param_grid={'lmbda': (45, 50), 'n_factors': (15, 18)})

#### Compute the validation RMSE.

In [25]:
best_est = search.best_estimator_
results = pd.DataFrame(search.cv_results_)
results[["mean_test_score", "std_test_score", "params"]].sort_values(by=["mean_test_score"], ascending=True).head()

Unnamed: 0,mean_test_score,std_test_score,params
1,3.704782,0.010879,"{'lmbda': 45, 'n_factors': 18}"
0,3.7048,0.010868,"{'lmbda': 45, 'n_factors': 15}"
2,3.704824,0.010854,"{'lmbda': 50, 'n_factors': 15}"
3,3.704825,0.010849,"{'lmbda': 50, 'n_factors': 18}"


#### Compute the test RMSE

In [29]:
X_val=val_data[["User_Id", "Movie_Id"]].to_numpy()
y_val = val_data["Rating"].values
rmse_score=search.score(X_val,y_val)
print("RMSE score is",rmse_score )

RMSE score is 3.7022194925889536


In [30]:
X_test=test_data[["User_Id", "Movie_Id"]].to_numpy()
y_test = test_data["Rating"].values
rmse_score_test=search.score(X_test,y_test)
print("RMSE score is",rmse_score_test )

RMSE score is 3.7052423013498035


https://www.kaggle.com/washingtongold/matrix-factorization-on-movie-ratings-dataset?scriptVersionId=30187124
https://nbviewer.org/github/albertauyeung/matrix-factorization-in-python/blob/master/mf.ipynb
https://albertauyeung.github.io/2017/04/23/python-matrix-factorization.html/