In [1]:
# Input data: print the full path of every file found under /kaggle/input
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    print(*full_paths, sep='\n', end='\n' if filenames else '')

/kaggle/input/movies-data/movies.csv
/kaggle/input/movies-data/users.csv
/kaggle/input/movies-data/train.csv
/kaggle/input/movies-data/test.csv


In [2]:
from collections import namedtuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler



In [3]:
# Locations of the four CSV files in the Kaggle input directory
# (order: users, movies, train, test).
file_path1, file_path2, file_path3, file_path4 = (
    '/kaggle/input/movies-data/users.csv',
    '/kaggle/input/movies-data/movies.csv',
    '/kaggle/input/movies-data/train.csv',
    '/kaggle/input/movies-data/test.csv',
)

In [4]:
# Read the four CSV files into DataFrames, in the order users, movies, train, test.
MV_users, MV_movies, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (file_path1, file_path2, file_path3, file_path4)
)

In [5]:
from collections import namedtuple
# Bundle the four DataFrames into one record so they can be passed around together.
Data = namedtuple('Data', 'users movies train test')
data = Data(users=MV_users, movies=MV_movies, train=train, test=test)

In [6]:
class RecSys():
    """Dense user x movie rating matrix plus an NMF-based rating predictor.

    Parameters
    ----------
    data : object with attributes ``users``, ``movies``, ``train`` and ``test``
        ``users`` must have a ``uID`` column; ``movies`` must have ``mID``,
        ``title`` and ``year`` columns (every other column is treated as a
        genre); ``train`` and ``test`` must have ``uID``, ``mID`` and
        ``rating`` columns.

    Attributes
    ----------
    allusers, allmovies : list
        User / movie ids in the row order of ``data.users`` / ``data.movies``.
    genres : list
        Column names of ``data.movies`` other than ``mID``, ``title``, ``year``.
    mid2idx, uid2idx : dict
        Map a movie / user id to its column / row position in ``Mr``.
    Mr : numpy.ndarray
        Training ratings, shape ``(len(allusers), len(allmovies))``; pairs
        without a training rating hold 0.
    Mm : None
        Placeholder, not populated by this class.
    sim : numpy.ndarray
        Zero-initialised ``(len(allmovies), len(allmovies))`` array.
    """

    def __init__(self,data):
        self.data = data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        # Position of each id in its table doubles as its matrix index.
        self.mid2idx = dict(zip(self.data.movies.mID, range(len(self.data.movies))))
        self.uid2idx = dict(zip(self.data.users.uID, range(len(self.data.users))))
        self.Mr = self.rating_matrix()
        self.Mm = None
        self.sim = np.zeros((len(self.allmovies), len(self.allmovies)))

    def rating_matrix(self):
        """
        Convert the training ratings to a numpy array of shape (#allusers,#allmovies).

        Entries without a training rating are 0. Raises ``KeyError`` if the
        training data references a user or movie id that is not present in
        ``data.users`` / ``data.movies``.
        """
        ind_movie = [self.mid2idx[x] for x in self.data.train.mID]
        ind_user = [self.uid2idx[x] for x in self.data.train.uID]
        rating_train = list(self.data.train.rating)

        shape = (len(self.allusers), len(self.allmovies))
        # toarray() already returns a fresh dense ndarray; no extra copy needed.
        return coo_matrix((rating_train, (ind_user, ind_movie)), shape=shape).toarray()

    def rmse(self,yp):
        """
        Root-mean-squared error of predictions ``yp`` against the test ratings.

        ``yp`` is any array-like with one prediction per test row, in test-row
        order. NaN predictions are scored as 3. The caller's object is left
        untouched: imputation happens on a private float copy.
        """
        yp = np.array(yp, dtype=float)  # copy, so the NaN imputation below never leaks out
        yp[np.isnan(yp)] = 3  # In case there are nan values in the prediction, impute 3.
        yt = np.array(self.data.test.rating)
        return np.sqrt(((yt - yp) ** 2).mean())

    def factorization(self, n_components=25, random_state=0):
        """
        Predict the test ratings from an NMF reconstruction of the rating matrix.

        Unrated entries of ``Mr`` (zeros) are imputed with 3 before fitting,
        the product ``W @ H`` is rescaled with ``MinMaxScaler`` (which scales
        each movie column independently) to the range (1, 5.5), and the value
        at each test (user, movie) position is returned.

        Requires ``NMF`` (``sklearn.decomposition``) and ``MinMaxScaler``
        (``sklearn.preprocessing``) to be imported at module level.

        Parameters
        ----------
        n_components : int, default 25
            Rank of the factorization.
        random_state : int, default 0
            Seed for the random NMF initialisation.

        Returns
        -------
        numpy.ndarray of shape ``(len(data.test),)``, one prediction per test
        row in test-row order. Raises ``KeyError`` for a test id that is
        missing from ``data.users`` / ``data.movies``.
        """
        ratings = self.Mr.copy()
        ratings[ratings == 0] = 3

        nmf_model = NMF(n_components=n_components, random_state=random_state, init='random')
        W = nmf_model.fit_transform(ratings)
        H = nmf_model.components_

        predicted_ratings = np.dot(W, H)
        predicted_ratings = MinMaxScaler(feature_range=(1, 5.5)).fit_transform(predicted_ratings)

        # Iterate over the column values (not index labels) so the lookup is
        # correct even when the test frame does not carry a default 0..n-1 index.
        user_idx = np.asarray([self.uid2idx[u] for u in self.data.test["uID"]], dtype=np.intp)
        movie_idx = np.asarray([self.mid2idx[m] for m in self.data.test["mID"]], dtype=np.intp)

        return predicted_ratings[user_idx, movie_idx]

In [7]:
# Perform NMF on ratings matrix
rs = RecSys(data)
ratingsMatrix = rs.Mr
# The Kullback-Leibler loss is paired with the multiplicative-update solver.
model = NMF(
    n_components=18,
    init="nndsvda",
    solver="mu",
    beta_loss="kullback-leibler",
    max_iter=1000,
    random_state=42,
)
model.fit(ratingsMatrix)
# W: per-user factors recomputed against the fitted components; H: per-movie factors.
W = model.transform(ratingsMatrix)
H = model.components_



In [8]:
# Reconstruct user data as predictions from NMF: map the user factors W back to
# the original user x movie space; the entries of X are read as predicted ratings
# by the cells that follow.
X = model.inverse_transform(W)
# Show the shape of the reconstruction as the cell output.
X.shape

(6040, 3883)

In [9]:
# Adapt the predict method of RecSys() to make predictions from the reconstructed user data, rather than using the baseline / imputation methods.
# For every test row, look up the reconstructed rating at that row's (user, movie) position.
n_test = len(rs.data.test)
yhat = [
    X[rs.uid2idx[uid], rs.mid2idx[mid]]
    for uid, mid in zip(rs.data.test.uID, rs.data.test.mID)
]

In [10]:
# Adapt the rmse method of RecSys()
yhat = np.asarray(yhat)
# Any NaN prediction is scored as a rating of 3.
yhat = np.where(np.isnan(yhat), 3, yhat)
labs = np.asarray(rs.data.test.rating)
squared_errors = np.square(labs - yhat)
RMSE = np.sqrt(np.mean(squared_errors))

print("The RMSE of the predictions made using NMF was:", RMSE)



The RMSE of the predictions made using NMF was: 2.8850867946900713


The Week 3 collaborative filtering models all achieved RMSEs below 1; even the simplest baseline, filling missing ratings with each user’s average, performed markedly better than the RMSE of 2.885 obtained here. Such a high error points to serious shortcomings in applying NMF directly to the raw rating matrix:

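Extreme sparsity: With so many missing entries, low-rank approximation methods struggle to recover the true underlying structure without large reconstruction errors. In addition, scikit-learn’s NMF has no notion of a missing value, so every unrated (user, movie) pair enters the fit as an observed rating of 0; this pulls the reconstruction toward 0 for exactly the entries we are trying to predict.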

Low rating values: The small range of possible ratings magnifies random noise, further degrading factorization quality.

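KL-divergence loss limitations: Although the KL-divergence loss copes better with zero-heavy matrices, it is only available with the multiplicative-update (“mu”) solver, which cannot update entries that are initialized to exactly zero. This rules out the “nndsvd” initialization, which is otherwise well suited to sparse data and helps reduce error, so “nndsvda” was used instead.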



To improve performance, consider condensing the rating matrix before factorization, for example by applying Truncated SVD or PCA to reduce dimensionality and noise prior to collaborative filtering.