ALS Baseline Recommender System

This notebook trains a collaborative filtering baseline using Alternating Least Squares (ALS).
ALS learns user and movie embeddings from the interaction matrix produced by the pipeline, and predicts which movies each user is likely to enjoy based on rating patterns.

# Setup and Imports

This section loads the processed data files created by the data pipeline and imports the ALS model from the implicit library

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from pathlib import Path
from implicit.als import AlternatingLeastSquares
import pickle

In [2]:
PROJECT_ROOT = Path("..").resolve()
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

In [3]:
print("Project root:", PROJECT_ROOT)
print("Processed dir:", PROCESSED_DIR)

Project root: /Users/sanjaydilip/Desktop/Code/Projects/Movie Recommender
Processed dir: /Users/sanjaydilip/Desktop/Code/Projects/Movie Recommender/data/processed


# Load Pipeline Outputs

We load the train/test splits, the movie lookup table, and the sparse item×user matrix generated by the pipeline.

In [4]:
train = pd.read_csv(PROCESSED_DIR / "train.csv")
test = pd.read_csv(PROCESSED_DIR / "test.csv")
movie_map = pd.read_csv(PROCESSED_DIR / "movie_map.csv")

In [5]:
item_user = sp.load_npz(PROCESSED_DIR / "item_user_train.npz")

In [15]:
user_item = item_user.T.tocsr()

In [6]:
train.head()

Unnamed: 0,user_id,movie_id,u_index,m_index,rating,timestamp,title,genres
0,1,3186,0,31,4.0,2000-12-31 22:00:19,"Girl, Interrupted (1999)",Drama
1,1,1270,0,22,5.0,2000-12-31 22:00:55,Back to the Future (1985),Comedy|Sci-Fi
2,1,1721,0,27,4.0,2000-12-31 22:00:55,Titanic (1997),Drama|Romance
3,1,1022,0,37,5.0,2000-12-31 22:00:55,Cinderella (1950),Animation|Children's|Musical
4,1,2340,0,24,3.0,2000-12-31 22:01:43,Meet Joe Black (1998),Romance


# Sanity Checks

To confirm shapes and verify that the matrix aligns properly with the cleaned dataset

In [7]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (802553, 8)
Test shape: (197656, 8)


In [8]:
print("Unique users (train):", train["u_index"].nunique())
print("Unique users (test):", test["u_index"].nunique())
print("Unique movies:", train["m_index"].nunique())

Unique users (train): 6040
Unique users (test): 6040
Unique movies: 3667


In [16]:
print("Item x User matrix shape:", item_user.shape)
print("User x Item matrix shape:", user_item.shape)

Item x User matrix shape: (3706, 6040)
User x Item matrix shape: (6040, 3706)


# Train the ALS Model

We initialize and train ALS on the item×user matrix.
ALS learns low-dimensional embeddings for users and movies based on their interactions.

In [10]:
factors = 64
regularization = 0.1
iterations = 20

In [11]:
model = AlternatingLeastSquares(
    factors=factors,
    regularization=regularization,
    iterations=iterations,
    random_state=42,
)

In [12]:
model.fit(item_user)

  0%|          | 0/20 [00:00<?, ?it/s]

# Recommendation Helper Function

This helper retrieves top-N recommended movies for a given user.

In [20]:
def recommend_for_user(model, user_index, user_item_matrix, movie_map, top_n=10):
    user_row = user_item_matrix[user_index]
    rec_ids, scores = model.recommend(
        userid=user_index,
        user_items=user_row,
        N=top_n,
    )
    rec_df = movie_map[movie_map["m_index"].isin(rec_ids)].copy()
    order = {m: i for i, m in enumerate(rec_ids)}
    rec_df["rank"] = rec_df["m_index"].map(order)
    rec_df = rec_df.sort_values("rank")
    return rec_df[["m_index", "title", "genres"]].reset_index(drop=True)


In [21]:
sample_user = train["u_index"].iloc[0]
recommend_for_user(model, sample_user, user_item, movie_map, top_n=10)

Unnamed: 0,m_index,title,genres
0,1251,Midnight Run (1988),Action|Adventure|Comedy|Crime
1,1748,Boys on the Side (1995),Comedy|Drama
2,61,Rebel Without a Cause (1955),Drama
3,3482,"Uninvited Guest, An (2000)",Drama
4,172,Miller's Crossing (1990),Drama
5,1136,Outside Providence (1999),Comedy
6,861,"Deer Hunter, The (1978)",Drama|War
7,1679,Friday the 13th: The Final Chapter (1984),Horror


# Build Test Dictionary

To evaluate the model, we map each user to the movies held out for testing.

In [22]:
test_items_by_user = (
    test.groupby("u_index")["m_index"]
    .apply(set)
    .to_dict()
)

In [23]:
len(test_items_by_user)

6040

In [24]:
list(test_items_by_user.items())[:3]

[(0, {4, 16, 25, 29, 30, 32, 33, 34, 35, 40}),
 (1,
  {60,
   66,
   69,
   72,
   73,
   74,
   79,
   83,
   86,
   91,
   94,
   98,
   107,
   118,
   119,
   125,
   126,
   130,
   133,
   137,
   146,
   154,
   157,
   160,
   163}),
 (2, {4, 22, 84, 116, 176, 179, 182, 183, 198, 207})]

# Define Evaluation Metrics

These metrics tell us how well the recommender ranks relevant movies.

In [30]:
def recall_at_k(model, user_item_matrix, test_dict, k=10):
    recalls = []
    max_model_user = model.user_factors.shape[0]
    for user, test_items in test_dict.items():

        if user >= max_model_user:
            continue
        if user_item_matrix[user].nnz == 0 or len(test_items) == 0:
            continue
        user_row = user_item_matrix[user]
        rec_ids, _ = model.recommend(
            userid=user,
            user_items=user_row,
            N=k,
        )
        hits = len(set(rec_ids).intersection(test_items))
        recalls.append(hits / len(test_items))
    if not recalls:
        return 0.0
    return float(np.mean(recalls))

In [31]:
def ndcg_at_k(model, user_item_matrix, test_dict, k=10):
    ndcgs = []
    max_model_user = model.user_factors.shape[0]
    for user, test_items in test_dict.items():
        if user >= max_model_user:
            continue
        if user_item_matrix[user].nnz == 0 or len(test_items) == 0:
            continue
        user_row = user_item_matrix[user]
        rec_ids, _ = model.recommend(
            userid=user,
            user_items=user_row,
            N=k,
        )
        dcg = 0.0
        for rank, item_id in enumerate(rec_ids):
            if item_id in test_items:
                dcg += 1.0 / np.log2(rank + 2)
        ideal_hits = min(k, len(test_items))
        if ideal_hits == 0:
            continue
        idcg = sum(1.0 / np.log2(r + 2) for r in range(ideal_hits))
        if idcg == 0:
            continue
        ndcgs.append(dcg / idcg)
    if not ndcgs:
        return 0.0
    return float(np.mean(ndcgs))

# Evaluate the ALS Model

We compute Recall@10 and NDCG@10 as the baseline performance.

In [32]:
k = 10
recall_k = recall_at_k(model, user_item, test_items_by_user, k=k)
ndcg_k = ndcg_at_k(model, user_item, test_items_by_user, k=k)
print(f"Recall@{k}: {recall_k:.4f}")
print(f"NDCG@{k}:  {ndcg_k:.4f}")

Recall@10: 0.0017
NDCG@10:  0.0057


# Saving the Trained ALS Model

We save the model for later use in the hybrid recommender.

In [33]:
model_path = PROCESSED_DIR / "als_model.pkl"

In [34]:
with open(model_path, "wb") as f:
    pickle.dump(model, f)

In [35]:
print("Saved ALS model to:", model_path)

Saved ALS model to: /Users/sanjaydilip/Desktop/Code/Projects/Movie Recommender/data/processed/als_model.pkl


# Summary

In [36]:
summary = {
    "factors": factors,
    "regularization": regularization,
    "iterations": iterations,
    "recall_at_10": recall_k,
    "ndcg_at_10": ndcg_k,
    "n_users": item_user.shape[1],
    "n_items": item_user.shape[0],
}

In [37]:
pd.DataFrame([summary])

Unnamed: 0,factors,regularization,iterations,recall_at_10,ndcg_at_10,n_users,n_items
0,64,0.1,20,0.001721,0.005723,6040,3706
