### **Matrix Factorization for Collaborative Filtering**

Investigating matrix factorization for collaborative filtering using [the MovieLens 100K dataset]( https://grouplens.org/datasets/movielens/100k/), I used the cosine similarity between movie vectors, defined as

$$
\text{sim}(\mathbf{v}_i,\mathbf{v}_j) = \frac{\mathbf{v}_i \cdot \mathbf{v}_j}{\Vert \mathbf{v}_i \Vert \Vert \mathbf{v}_j \Vert}
$$
to measure the similarity between different movies. Both Stochastic Gradient Descent (SGD) and the Alternating algorithm for matrix factorization were applied and their results compared.





In [None]:
## Preparation code to download the data and load them into a sparse matrix
##
import pandas as pd
import numpy as np
import gdown
from scipy.sparse import csr_matrix
import time
from IPython.display import display

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")

# 1. Download and unzip the dataset first
#    URL: https://files.grouplens.org/datasets/movielens/ml-100k.zip
#    Then point to the u.data file inside the unzipped folder.
#!wget -O ml-100k.zip https://files.grouplens.org/datasets/movielens/ml-100k.zip
#!unzip -o -q ml-100k.zip
#path = "ml-100k/u.data"

##download from google drive
#url = 'https://drive.google.com/file/d/12cpRJ2977nyqUSiUMkvdRS9OtYy6Fj_P/view?usp=share_link'
#path = 'u.data'
#gdown.download(url, path, quiet=False,fuzzy=True)

# download from YorkU server
!wget -q -r -np -nH --cut-dirs=1 -R "index.html*" http://www.cs.yorku.ca/~huijiang/ml-100k/
path = "ml-100k/u.data"

# The raw file is tab-separated: user_id, item_id, rating, timestamp

df = pd.read_csv(path, sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# 2. Convert IDs to 0-based indices (good for matrix indexing)
df['user_id'] = df['user_id'].astype('category')
df['item_id'] = df['item_id'].astype('category')

n_users = df['user_id'].nunique()
n_items = df['item_id'].nunique()

print(f"Users: {n_users}, Items: {n_items}, Ratings: {len(df)}")

# 3. Build a sparse user–item rating matrix (CSR format)
# Rows \u2192 users, Columns \u2192 items, Values \u2192 ratings
X = csr_matrix(
    (df['rating'].astype(float),
     (df['user_id'].cat.codes, df['item_id'].cat.codes)),
    shape=(n_users, n_items)
)

print(X.shape)

df_matrix = pd.DataFrame.sparse.from_spmatrix(X)
display(df_matrix)

print(df.columns)

df.head()

### Alternating Algorithm for Matrix Factorization (Algorithm 7.6 on page 145)
###
# X: a sparse matrix (scipy.sparse.csr_matrix)
# k: size for dense vectors
def Alternating_MF(X, k=10, lambda1=0.1, lambda2=0.1, max_epoch=10):
  """Factorize X ~= U @ V.T by alternating regularized least-squares updates.

  Each epoch first re-solves every row of U with V held fixed, then every
  row of V with U held fixed. Only the non-zero entries of X are treated as
  observations. The loss returned by loss_fun_vec is printed before training
  and after every epoch.

  Args:
    X: sparse rating matrix (scipy.sparse.csr_matrix), rows x columns.
    k: number of latent dimensions.
    lambda1: regularization weight for U.
    lambda2: regularization weight for V.
    max_epoch: number of full alternating sweeps.

  Returns:
    (U, V): dense arrays of shape (X.shape[0], k) and (X.shape[1], k).

  Raises:
    numpy.linalg.LinAlgError: if a k x k system is singular (possible when
      the regularization weight is 0 and a row/column has too few entries).
  """
  # initialize U and V
  U = 0.01*np.random.normal(size =(X.shape[0], k))
  V = 0.01*np.random.normal(size =(X.shape[1], k))
  n = X.count_nonzero()       # number of training samples

  # Ridge terms of the normal equations. They depend only on the
  # hyper-parameters and matrix sizes, so build them once.
  ridge_U = lambda1 * np.identity(k) * n/U.size
  ridge_V = lambda2 * np.identity(k) * n/V.size

  loss = loss_fun_vec(U, V, X, lambda1, lambda2)
  print(f'epoch = 0: loss = {loss}')

  for ep in range(max_epoch):
    # update every row of U with V fixed
    for i in range(X.shape[0]):
      X_row = X.getrow(i)     # extract i-th row in X
      ind = X_row.nonzero()   # (row, col) indices of the non-zeros in i-th row
      V_s = V[ind[1],:]
      x_obs = np.asarray(X_row[ind[0],ind[1]]).ravel()
      # Solve the k x k linear system directly rather than forming an
      # explicit inverse: same solution, better numerical behaviour.
      U[i,:] = np.linalg.solve(V_s.T @ V_s + ridge_U, V_s.T @ x_obs)

    # update every row of V with U fixed
    for j in range(X.shape[1]):
      X_col = X.getcol(j)     # extract j-th col in X
      ind = X_col.nonzero()   # (row, col) indices of the non-zeros in j-th col
      U_s = U[ind[0],:]
      x_obs = np.asarray(X_col[ind[0],ind[1]]).ravel()
      V[j,:] = np.linalg.solve(U_s.T @ U_s + ridge_V, U_s.T @ x_obs)

    loss = loss_fun_vec(U, V, X, lambda1, lambda2)
    print(f'epoch = {ep+1}: loss = {loss}')

  return U,V


# Vectorized loss Q(U,V): mean squared error over the non-zero entries of X
# plus size-normalized L2 penalties on U and V.
# X: a sparse matrix (scipy.sparse.csr_matrix)
# U,V: both dense matrices (X = U @ V.T)
def loss_fun_vec(U, V, X, lambda1=0.1, lambda2=0.1):
  """Return the regularized reconstruction loss of U @ V.T against X.

  Only the non-zero entries of X contribute to the error term. Each penalty
  is the sum of squared entries divided by the number of entries of the
  corresponding factor matrix, scaled by lambda1 (for U) or lambda2 (for V).
  """
  rows, cols = X.nonzero()
  predicted = np.sum(U[rows, :] * V[cols, :], axis=1)
  observed = np.array(X[rows, cols]).squeeze()
  residual = predicted - observed

  mse = np.mean(residual * residual)
  penalty_U = lambda1 * np.sum(U * U) / U.size
  penalty_V = lambda2 * np.sum(V * V) / V.size
  return mse + (penalty_U + penalty_V)

# Mini-batch stochastic gradient descent for matrix factorization
def SGD_MF(X, k=10, lr=0.01, lambda1=0.005, lambda2=0.005, max_epoch=10, batch_size=100):
    """Factorize X ~= U @ V.T with mini-batch stochastic gradient descent.

    Only the non-zero entries of X are treated as observations. Every epoch
    visits them once in a fresh random order, in batches of ``batch_size``.
    Within a batch all gradients are evaluated at the factor values from the
    start of the batch, and their SUM (not their average) is applied, scaled
    by ``lr``. The learning rate is multiplied by 0.95 after each epoch. The
    loss returned by loss_fun_vec is printed before training and after every
    epoch.

    Args:
        X: sparse rating matrix (scipy.sparse.csr_matrix), rows x columns.
        k: number of latent dimensions.
        lr: initial learning rate.
        lambda1: weight-decay coefficient applied to a row of U each time
            that row appears in a batch.
        lambda2: weight-decay coefficient applied to a row of V each time
            that row appears in a batch.
        max_epoch: number of passes over the observed entries.
        batch_size: number of observed entries per update.

    Returns:
        (U, V): dense arrays of shape (X.shape[0], k) and (X.shape[1], k).
    """
    U = 0.1*np.random.normal(size=(X.shape[0], k))
    V = 0.1*np.random.normal(size=(X.shape[1], k))

    # The observed entries never change during training, so pull their
    # coordinates and values out of the sparse matrix once instead of doing a
    # sparse lookup for every sample in every epoch.
    rows, cols = X.nonzero()
    vals = np.asarray(X[rows, cols]).ravel()
    n_obs = rows.size

    loss = loss_fun_vec(U, V, X, lambda1, lambda2)
    print(f'epoch = 0: loss = {loss}')

    for epoch in range(max_epoch):
        # fresh random visiting order for this epoch
        order = np.random.permutation(n_obs)

        # iterate over batches
        for start in range(0, n_obs, batch_size):
            batch = order[start:start + batch_size]
            bi, bj = rows[batch], cols[batch]

            # Fancy indexing copies, so U_b / V_b keep the start-of-batch
            # values while U and V are updated below.
            U_b, V_b = U[bi], V[bj]
            err = vals[batch] - np.sum(U_b * V_b, axis=1)

            step_U = lr * (err[:, None] * V_b - lambda1 * U_b)
            step_V = lr * (err[:, None] * U_b - lambda2 * V_b)

            # np.add.at accumulates correctly when the same user or item
            # occurs more than once in the batch (plain U[bi] += ... would
            # keep only one contribution per repeated index).
            np.add.at(U, bi, step_U)
            np.add.at(V, bj, step_V)

        lr *= 0.95
        loss = loss_fun_vec(U, V, X, lambda1, lambda2)
        print(f'epoch = {epoch+1}: loss = {loss}')

    return U, V

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Computed as dot(v1, v2) / (||v1|| * ||v2||). If either vector has zero
    norm the cosine is undefined; 0.0 is returned in that case instead of
    NaN so that results stay comparable and sortable.
    """
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0:
        return 0.0
    return np.dot(v1, v2) / denom

def find_similar_movies(movie_id, V, k=3):
    """Return the ``k`` rows of ``V`` most cosine-similar to row ``movie_id``.

    Args:
        movie_id: 0-based row index of the reference movie in ``V``.
        V: 2-D array with one latent vector per movie (one row each).
        k: maximum number of neighbours to return.

    Returns:
        A list of ``(row_index, similarity)`` tuples sorted by decreasing
        similarity, excluding ``movie_id`` itself. Ties keep ascending
        row-index order. Rows with zero norm (or a zero-norm reference row)
        get similarity 0.0 rather than NaN, so the ordering is always
        well defined.
    """
    V = np.asarray(V, dtype=float)
    target_vector = V[movie_id, :]

    # cosine similarity of every row against the target in one pass
    dots = V @ target_vector
    denom = np.linalg.norm(V, axis=1) * np.linalg.norm(target_vector)
    sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)

    # Descending order; a stable sort keeps lower indices first among ties.
    order = np.argsort(-sims, kind='stable')

    ranked = [(int(i), sims[i]) for i in order if i != movie_id]

    # return top k similarities
    return ranked[:k]

def load_genre_data(path='ml-100k/u.item'):
    """Read a MovieLens ``u.item`` file and map each movie ID to its genres.

    The file is parsed as '|'-separated, latin-1 encoded, with no header.
    Column 0 supplies the movie ID and columns 5..23 are read as 0/1 flags
    for the 19 genre names listed below, in that order.

    Returns:
        dict mapping movie ID -> list of genre names whose flag equals 1
        (an empty list when no flag is set).
    """
    genre_names = [
        'unknown', 'Action', 'Adventure', 'Animation', 'Children',
        'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
        'Sci-Fi', 'Thriller', 'War', 'Western',
    ]

    items = pd.read_csv(path, sep='|', encoding='latin-1', header=None)
    ids = items[0]

    # Label the flag columns so each row can be walked as (genre, flag) pairs.
    flags = items.iloc[:, 5:24]
    flags.columns = genre_names

    return {
        ids[row_label]: [name for name, flag in row.items() if flag == 1]
        for row_label, row in flags.iterrows()
    }

# Load movie titles (column 0 = 1-based movie ID, column 1 = title)
movie_df = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1',
                       header=None, usecols=[0, 1], names=['movie_id', 'title'])
movie_titles = movie_df.set_index('movie_id')['title'].to_dict()

# 0-based row indices into V of the movies to inspect
test_movie_ids = [28, 81, 126, 131, 153, 172, 178, 179, 269, 342]


genre_dict = load_genre_data()
print("\n")


def print_similar_movies(V, k_dim, top_k=3):
    """Print the most similar movies for every entry of ``test_movie_ids``.

    Args:
        V: movie factor matrix, one row per movie (0-based row index).
        k_dim: latent dimension used for training; only shown in the header.
        top_k: number of neighbours requested from find_similar_movies.

    Titles and genres are looked up in ``movie_titles`` / ``genre_dict``
    under the 1-based ID ``row index + 1``; missing entries print "Unknown".
    """
    print(f"Top 3 Similar Movies for Each Test Movie (k={k_dim})\n")

    for movie_id in test_movie_ids:
        mid = movie_id + 1  # convert to 1-based ID
        movie_title = movie_titles.get(mid, "Unknown")
        movie_genres = ", ".join(genre_dict.get(mid, [])) or "Unknown"

        print(f"\nMovie {movie_id} (ID {mid}): {movie_title}  "
              f"(Genres: {movie_genres})")

        similar_movies = find_similar_movies(movie_id, V, k=top_k)
        for rank, (sim_id, similarity) in enumerate(similar_movies, 1):
            smid = sim_id + 1
            sim_title = movie_titles.get(smid, "Unknown")
            sim_genres = ", ".join(genre_dict.get(smid, [])) or "Unknown"

            print(f"  {rank}. Movie {sim_id} (ID {smid}): {sim_title}  "
                  f"(Genres: {sim_genres})  "
                  f"(similarity: {similarity:.4f})")


# alternating matrix factorization
for k_dim in [10, 15, 20, 30, 40, 50, 100]:
    print(f"\nRunning Alternating MF with k={k_dim}\n")
    start_time = time.time()
    U, V = Alternating_MF(X, k=k_dim, lambda1=0.1, lambda2=0.1, max_epoch=10)
    training_time = time.time() - start_time
    print(f"\nTraining time: {training_time:.2f} seconds")
    print_similar_movies(V, k_dim)



print("\n")

# Stochastic Gradient Descent Matrix factorization
for k_dim in [10, 30, 50, 100]:
    print(f"\nRunning SGD MF with k={k_dim}\n")
    start_time = time.time()
    U, V = SGD_MF(X, k=k_dim, lr=0.01, lambda1=0.01, lambda2=0.01, max_epoch=20, batch_size=100)
    training_time = time.time() - start_time
    print(f"\nTraining time: {training_time:.2f} seconds")
    print_similar_movies(V, k_dim)






Users: 943, Items: 1682, Ratings: 100000
(943, 1682)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0,0,0,0,0,0,0,0,0,0
1,4.0,0,0,0,0,0,0,0,0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,3.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0,0,0,0,0,0,0,0,5.0,0,...,0,0,0,0,0,0,0,0,0,0
939,0,0,0,2.0,0,0,4.0,5.0,3.0,0,...,0,0,0,0,0,0,0,0,0,0
940,5.0,0,0,0,0,0,4.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Index(['user_id', 'item_id', 'rating', 'timestamp'], dtype='object')



Running Alternating MF with k=10

epoch = 0: loss = 13.727060477317755
epoch = 1: loss = 3.810308605313195
epoch = 2: loss = 0.7195658246404677
epoch = 3: loss = 0.6538622032143963
epoch = 4: loss = 0.631982172145617
epoch = 5: loss = 0.6203452658049178
epoch = 6: loss = 0.6127806717033566
epoch = 7: loss = 0.6074003546617032
epoch = 8: loss = 0.603328827414795
epoch = 9: loss = 0.6001107332614734
epoch = 10: loss = 0.5975069654172172

Training time: 14.55 seconds
Top 3 Similar Movies for Each Test Movie (k=10)


Movie 28 (ID 29): Batman Forever (1995)  (Genres: Action, Adventure, Comedy, Crime)
  1. Movie 682 (ID 683): Rocket Man (1997)  (Genres: Comedy)  (similarity: 0.9576)
  2. Movie 554 (ID 555): White Man's Burden (1995)  (Genres: Drama)  (similarity: 0.9334)
  3. Movie 382 (ID 383): Flintstones, The (1994)  (Genres: Children, Comedy)  (similarity: 0.9206)

Movie 81 (ID 82): Jurassic Park (1993)  (Genres: Act

**Discussion:**

Here we use collaborative filtering to determine similarity between movies based on the premise that when comparing two films, if multiple users like one film they will most likely like the other. Rather than using the original sample list, I chose a selection of ten films I had seen as I was interested in verifying based on my own preferences whether the cosine similarity resulted in plausible results.

When looking at the output created by the Alternating Algorithm for Matrix Factorization, we observe that including more latent features (a higher value of *k*) seemed to produce less plausible results.

This can be attributed to the fact that a high cosine between two movie vectors just means the model found those two columns pointing in nearly the same direction. But does this accurately depict the viewer's preferences? Not necessarily, as this could be caused by a few shared raters or by noise. Thus overfitting is generally a problem in collaborative filtering models based upon sparsely populated data input.

Also including the genres of the films was useful in this analysis. An example comparison of one film at k=10 and 40 (both using the alternating algorithm):

**k=10**

Movie 178 (ID 179): Clockwork Orange, A (1971)  (Genres: Sci-Fi)
  1. Movie 1352 (ID 1353): 1-900 (1994)  (Genres: Romance)  (similarity: 0.8716)
  2. Movie 1142 (ID 1143): Hard Eight (1996)  (Genres: Crime, Thriller)  (similarity: 0.8613)
  3. Movie 1523 (ID 1524): Kaspar Hauser (1993)  (Genres: Drama)  (similarity: 0.8486)

**k=40**

Movie 178 (ID 179): Clockwork Orange, A (1971)  (Genres: Sci-Fi)
  1. Movie 474 (ID 475): Trainspotting (1996)  (Genres: Drama)  (similarity: 0.5654)
  2. Movie 134 (ID 135): 2001: A Space Odyssey (1968)  (Genres: Drama, Mystery, Sci-Fi, Thriller)  (similarity: 0.5501)
  3. Movie 36 (ID 37): Nadja (1994)  (Genres: Drama)  (similarity: 0.5371)


When we look at the recommendations for the same movie, A Clockwork Orange, at k=40, we see that they include 2001: A Space Odyssey, which is by the same director (Stanley Kubrick). This substantiates the idea that the model is predicting viewer preferences with greater accuracy.


With larger k values, the cosine similarities are smaller, but the top few movies actually correspond better to reality. In the second example (k=40), the similar movies are all drama and/or sci-fi, which makes sense because the reference movie itself is serious science fiction. At k=10, by contrast, it is matched with movies that are less serious, albeit with higher cosine similarities. Thus proximity of the cosine similarity to a perfect fit of 1.0 (vectors pointing in exactly the same direction) should not be the desired metric when designing a collaborative filtering model. Rather, one should aim for an appropriately balanced number of iterations, which allows the highest cosine similarities to depict a more accurate comparison.

**General Observations:**

*   ALS converged faster than SGD when the value of k is smaller
*   SGD tends to overfit more and the results do not reflect a human comparison
*   Both algorithms overfit the training data as k became higher
*   As k was increased (higher latent features), the Alternating Algorithm results became very influenced by noise whereas at lower values only the most vital trends were captured

In terms of training efficiency, the Alternating Algorithm tends to converge in fewer epochs for smaller latent dimensions, making it faster per recommendation in low-dimensional spaces. However, it becomes computationally expensive as the latent dimension k increases. In contrast, SGD scales linearly with the number of non-zero ratings and latent features, allowing updates on subsets of data (batches), making it more suitable for large sparse datasets.
SGD proved not to be very useful at smaller k values and needed k of approximately 50 to provide better recommendations. Ultimately, I would recommend SGD for larger data sets (especially when they are sparse), and ALS for smaller, well-populated data sets, where it gives more accurate results.


**Alternating Least Squares (ALS / Alternating MF):**

Factorizes $X \approx UV^\top$ with latent dimension k.
Iteratively updates user vectors U and movie vectors V to minimize squared error.


**Applying Stochastic Gradient Descent (SGD) to matrix factorization:**

If we consider the total error function (with no regularization):

$$Q(U,V) = \sum_{(i,j) \in \Omega} \left( x_{ij} - u_i^\top v_j \right)^2$$

We can apply this to SGD by only considering one value in the residual matrix:

$$ e_{ij} = \left( x_{ij} - u_i^\top v_j \right)$$

The loss for a single entry is:

$$L_{ij} = \frac{1}{2} \left( x_{ij} - u_i^\top v_j \right)^2$$

The gradients with respect to the user and item vectors are:

$$\frac{\partial L_{ij}}{\partial u_i} = \frac{\partial}{\partial u_i} \frac{1}{2} \left(x_{ij} - u_i^\top v_j \right)^2 $$

$$ = \left (x_{ij} - u_i^\top v_j \right) \frac {\partial}{\partial u_i} \left(x_{ij} - u_i^\top v_j \right ) $$
$$ = - \left (x_{ij} - u_i^\top v_j \right) v_j $$

or, as written with "e" for error:

$$ = - \left (e_{ij}  \right) v_j $$

and:

$$\frac{\partial L_{ij}}{\partial v_j} = - \left (x_{ij} - u_i^\top v_j \right) u_i $$
$$ = - \left (e_{ij}  \right) u_i $$

Then we update the vectors using a learning rate η:

$$u_i \gets u_i + \eta \, e_{ij} \, v_j, \quad
v_j \gets v_j + \eta \, e_{ij} \, u_i $$
