In this notebook, I reuse the code from the previous module and compare the performance of sklearn’s non-negative matrix factorization (NMF) with the simple baseline and similarity-based methods from Module 3.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 

In [3]:
# Load the user table, the movie table and the train/test rating splits,
# in that order, from the local ``movies`` directory.
MV_users, MV_movies, train, test = map(
    pd.read_csv,
    ('movies/users.csv', 'movies/movies.csv', 'movies/train.csv', 'movies/test.csv'),
)

In [4]:
from collections import namedtuple
# Lightweight container bundling the four data frames every recommender needs.
Data = namedtuple('Data', ['users', 'movies', 'train', 'test'])
data = Data(users=MV_users, movies=MV_movies, train=train, test=test)

In [5]:
class RecSys():
    """
    Baseline and similarity-based rating predictors over a dense rating matrix.

    Parameters
    ----------
    data : object with attributes ``users``, ``movies``, ``train`` and ``test``
        ``users`` must have a ``uID`` column; ``movies`` must have ``mID``,
        ``title`` and ``year`` columns (all remaining column names are stored
        in ``self.genres``); ``train`` and ``test`` must have ``uID``, ``mID``
        and ``rating`` columns.

    Attributes
    ----------
    Mr : numpy.ndarray of shape (#allusers, #allmovies)
        Training ratings; entries without a training rating are 0.
    sim : numpy.ndarray of shape (#allmovies, #allmovies)
        Item-item similarity matrix, initialised to zeros here; it must be
        filled in before ``predict`` yields anything other than NaN.
    """

    def __init__(self, data):
        self.data = data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        # Map external ids to row/column positions in the rating matrix.
        self.mid2idx = dict(zip(self.data.movies.mID, range(len(self.data.movies))))
        self.uid2idx = dict(zip(self.data.users.uID, range(len(self.data.users))))
        self.Mr = self.rating_matrix()
        self.Mm = None
        self.sim = np.zeros((len(self.allmovies), len(self.allmovies)))

    def rating_matrix(self):
        """
        Convert the training ratings to a numpy array of shape (#allusers, #allmovies).

        Raises KeyError if the training data contains a user or movie id that
        is missing from the users/movies tables.
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)

        # ``toarray`` already returns a fresh ndarray, so no extra copy is needed.
        return coo_matrix((rating_train, (ind_user, ind_movie)),
                          shape=(len(self.allusers), len(self.allmovies))).toarray()

    def predict_everything_to_3(self):
        """
        Predict a rating of 3 for every row of the test data.

        Returns a numpy array of shape (# of rows in testdata,).
        """
        return np.full(self.data.test.shape[0], 3)

    def predict_to_user_average(self):
        """
        Predict each test row as the average training rating of its user.

        The average only counts training ratings greater than 0. Users with
        no such rating get NaN (which ``rmse`` later imputes to 3).

        Returns a numpy array of shape (# of rows in testdata,).
        """
        train = self.data.train
        rated = train[train['rating'] > 0]
        # One pass over the training data instead of one full scan per user.
        user_means = rated.groupby('uID')['rating'].mean()
        # Look the user up by column name rather than by column position.
        return self.data.test['uID'].map(user_means).to_numpy(dtype=float)

    def predict_from_sim(self, uid, mid):
        """
        Predict a user rating on a movie given userID and movieID.

        The prediction is the similarity-weighted average of the user's
        nonzero training ratings, using the row of ``self.sim`` that belongs
        to ``mid`` as weights.

        Returns a float, or ``np.nan`` when the weights do not sum to a
        positive value (e.g. the user has no training ratings). ``predict``
        has always stored this case as NaN; returning it explicitly keeps the
        return type consistent.

        Raises KeyError if ``uid`` or ``mid`` is unknown.
        """
        user_ratings = self.Mr[self.uid2idx[uid]]
        sim_row = self.sim[self.mid2idx[mid]]
        # ``self.sim`` may have been replaced by a sparse matrix or np.matrix,
        # whose row lookup is 2-D; flatten every case to a 1-D ndarray.
        if hasattr(sim_row, 'toarray'):
            sim_row = sim_row.toarray()
        sim_row = np.asarray(sim_row).ravel()

        rated = user_ratings > 0
        weights = sim_row[rated]
        total_weight = weights.sum()
        if total_weight > 0:
            return np.dot(weights, user_ratings[rated]) / total_weight
        return np.nan

    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        test = self.data.test
        predictions = [self.predict_from_sim(uid, mid)
                       for uid, mid in zip(test['uID'], test['mID'])]
        return np.array(predictions, dtype=float)

    def rmse(self, yp):
        """
        Root-mean-square error of predictions ``yp`` against the test ratings.

        NaN predictions are imputed to 3. The imputation is done on a copy,
        so the caller's array is left untouched.
        """
        yp = np.asarray(yp, dtype=float)
        yp = np.where(np.isnan(yp), 3, yp)
        yt = np.array(self.data.test.rating)
        return np.sqrt(((yt - yp) ** 2).mean())



In [7]:
# Build a smaller data set from the first 30000 rows of each rating split,
# keeping only the users and movies that appear in those rows.
np.random.seed(42)
sample_train, sample_test = train[:30000], test[:30000]

sample_MV_users = MV_users[
    MV_users.uID.isin(sample_train.uID) | MV_users.uID.isin(sample_test.uID)
]
sample_MV_movies = MV_movies[
    MV_movies.mID.isin(sample_train.mID) | MV_movies.mID.isin(sample_test.mID)
]

sample_data = Data(
    users=sample_MV_users,
    movies=sample_MV_movies,
    train=sample_train,
    test=sample_test,
)
sample_rs = RecSys(sample_data)
rs = RecSys(data)


In [9]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from scipy.sparse import coo_matrix

class MovieRecommender(RecSys):
    """
    Rating predictor based on sklearn's non-negative matrix factorization.

    The dense training matrix ``self.Mr`` is factorized as ``W @ H``. Note
    that ``self.Mr`` stores 0 for every (user, movie) pair without a training
    rating, and the factorization fits those zeros like any other entry.

    Parameters
    ----------
    data : same structure as expected by ``RecSys``
    n_components, init, random_state : forwarded unchanged to ``NMF``
    """

    def __init__(self, data, n_components=15, init='random', random_state=0):
        super().__init__(data)
        self.nmf_model = NMF(n_components=n_components, init=init, random_state=random_state)
        # Factor matrices; populated by ``fit``.
        self.W = None
        self.H = None

    def fit(self):
        """Factorize the training rating matrix and store the factors W and H."""
        self.W = self.nmf_model.fit_transform(self.Mr)
        self.H = self.nmf_model.components_

    def predict(self):
        """
        Return the reconstructed rating matrix ``W @ H``.

        Unlike ``RecSys.predict`` this returns the full
        (#allusers, #allmovies) matrix, not one value per test row.
        ``fit`` must have been called first.
        """
        return np.dot(self.W, self.H)

    def calculate_rmse(self, test_data):
        """
        RMSE of the reconstructed ratings against ``test_data['rating']``.

        Rows whose ``uID`` or ``mID`` is unknown to this model are skipped.

        Raises ValueError if no row of ``test_data`` can be scored.
        """
        predictions = self.predict()
        user_pos = test_data['uID'].map(self.uid2idx)
        movie_pos = test_data['mID'].map(self.mid2idx)
        # Filter both index series with ONE joint mask so they stay aligned;
        # dropping NaNs from each independently can leave different lengths.
        known = (user_pos.notna() & movie_pos.notna()).to_numpy()
        if not known.any():
            raise ValueError("test_data contains no (uID, mID) pair known to the model")
        # ``map`` yields floats as soon as one id is missing; indices must be ints.
        rows = user_pos.to_numpy()[known].astype(int)
        cols = movie_pos.to_numpy()[known].astype(int)
        # Positional selection, so repeated index labels cannot duplicate rows.
        actual_ratings = test_data['rating'].to_numpy()[known]
        predicted_ratings = predictions[rows, cols]
        return np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))



In [10]:
# Factorize the full training rating matrix with the default NMF settings
# and report the RMSE of the reconstruction on the held-out test ratings.
recommender = MovieRecommender(data)
recommender.fit()
rmse = recommender.calculate_rmse(test)
print(f"RMSE: {rmse}")



RMSE: 2.8732952663085833


This RMSE is much higher (i.e., worse) than the values obtained in Module 3. It is worse even than predict_everything_to_3(), whose RMSE is 1.2585510334053043, which means the NMF factorization from the sklearn library performs poorly on this task.
Given the results of the RMSE (2.8732952663085833) from the matrix factorization approach using NMF compared to the much lower RMSE values (0.98-1.25) achieved by simple baseline or similarity-based methods, it's essential to analyze why this might be the case and explore potential solutions to improve the matrix factorization model's performance.

Reasons for Poor Performance with NMF
1) Non-Negativity Constraint: NMF imposes a non-negativity constraint on both user and item matrices. This constraint may not be appropriate for all datasets, especially if the interaction data (ratings) include inherent negative values or the data is centered around zero. Movie ratings, for instance, can be more effectively modeled if negative latent factors are allowed, as they can capture dislikes or negative preferences.

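2) Sparsity of Data: NMF, like many other matrix factorization methods, struggles with highly sparse datasets. If many users have rated only a small number of movies, the model may find it difficult to learn meaningful latent features for the majority of users and items. In this notebook the effect is amplified because the dense rating matrix stores every unrated (user, movie) pair as 0, and NMF fits those zeros as if they were real ratings, which pulls the reconstructed ratings toward 0.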

3) Overfitting: With a higher number of components, NMF might overfit the training data, leading to poor generalization on unseen data (test set).

4) Simplicity of Model: With the default settings used here, sklearn's NMF applies no regularization, and the model has no mechanism to handle user bias (e.g., some users might generally give higher ratings) or item bias (e.g., some movies might generally receive higher ratings). Baseline methods typically account for these biases explicitly.

Suggested Fixes:
1. Incorporate Biases: Modifying the NMF model to include user and item biases can significantly improve performance. This can be done by subtracting the global average rating, user bias, and item bias from each rating before fitting the model and adding them back to the predictions.

2. Use Regularization: Adding regularization terms (like L2 regularization) to the loss function used by NMF can help prevent overfitting, especially in the case of sparse data.

3. Hybrid Approaches: Combining NMF with other techniques, such as similarity-based methods or even neural networks, can help capture more complex patterns in the data. Hybrid models can leverage the strengths of each approach.

4. Tuning Hyperparameters: Adjusting the number of latent factors (n_components) and the initialization parameters could yield better results. Cross-validation should be used to find the optimal configuration.

5. Alternate Matrix Factorization Techniques: If NMF's non-negativity constraint is too restrictive, other matrix factorization techniques such as SVD (Singular Value Decomposition) or ALS (Alternating Least Squares) might be more appropriate, as they do not impose this constraint.

6. Enhance Data Quality: Techniques to handle sparsity, like data imputation or gathering more data, can improve the model's learning ability.