Building the Hybrid Recommender with LightFM

In [1]:
# Ensure numpy and pandas are up-to-date

import pandas as pd

import os

# Check if files exist before loading
ratings_path = "/Users/richard/data/product-recommender/data/raw/ratings.csv"
movies_path = "/Users/richard/data/product-recommender/data/raw/movies.csv"

ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)

# Always preview data before working with it
print("Ratings Sample:")
print(ratings.head())

print("\nMovies Sample:")
print(movies.head())


Ratings Sample:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Movies Sample:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


 Clean and Merge
 We want to:

1.Filter out users with too few ratings (to reduce noise)
2.Merge movie info into the ratings for hybrid modeling

In [2]:
# Keep only users with at least 5 ratings
ratings_filtered = ratings.groupby('userId').filter(lambda x: len(x) >= 5)

# Merge genres and titles into ratings
data = pd.merge(ratings_filtered, movies, on='movieId')

# LightFM prefers string IDs
data['userId'] = data['userId'].astype(str)
data['movieId'] = data['movieId'].astype(str)


 Create Content Features with TF-
  We need to turn genre  into numerical vectors.

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace "|" with space so TF-IDF treats each genre as a word
movies['genres_cleaned'] = movies['genres'].str.replace('|', ' ', regex=False)

# Create a TF-IDF matrix for genres
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(movies['genres_cleaned'])

print(movies['genres_cleaned'].head())
print(tfidf_matrix.shape)

feature_names = vectorizer.get_feature_names_out()

# View TF-IDF scores for first movie
movie_0_vector = tfidf_matrix[0]
df = pd.DataFrame(movie_0_vector.toarray(), columns=feature_names)
print(df.T[df.T[0] > 0])


0    Adventure Animation Children Comedy Fantasy
1                     Adventure Children Fantasy
2                                 Comedy Romance
3                           Comedy Drama Romance
4                                         Comedy
Name: genres_cleaned, dtype: object
(9742, 24)
                  0
adventure  0.416846
animation  0.516225
children   0.504845
comedy     0.267586
fantasy    0.482990


### Building the Dataset for LightFM

To train a recommendation model using LightFM, we need to prepare the data in a specific format. This involves creating the following components:

1. **Mapping of Users and Items**  
   - Create unique mappings for user IDs and item (movie) IDs to ensure compatibility with LightFM's requirements.

2. **Interactions Matrix**  
   - Construct a sparse matrix that represents which users have interacted with which items.  
   - Each entry in the matrix indicates whether a user has rated a specific movie (and optionally, the rating value).

3. **Item Feature Matrix**  
   - Use the TF-IDF vectors generated from the movie genres to create a feature matrix for the items.  
   - This matrix provides additional content-based information about the items, which can improve the hybrid recommendation model.

These components will serve as the input for training the LightFM model.

In [8]:
import pandas as pd

# Load ratings data
data = pd.read_csv('../data/raw/ratings.csv')

# Preview the data
print(data.head())

from lightfm import LightFM
from lightfm.data import Dataset

dataset = Dataset()

# Register all users and items
dataset.fit(data['userId'], data['movieId'])

# Build interactions matrix
(interactions, weights) = dataset.build_interactions(
    [(row['userId'], row['movieId']) for _, row in data.iterrows()]
)


   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


### Map TF-IDF to LightFM Items
LightFM needs item features in the same order as its internal item IDs.


In [9]:
from scipy.sparse import csr_matrix
import numpy as np

# Map movieId to LightFM internal item ID
item_mapping = dataset.mapping()[2]  # movieId → LightFM ID
movie_id_map = dict(zip(movies['movieId'], range(len(movies))))  # movieId → row in TF-IDF

# Align TF-IDF rows with LightFM item IDs
item_features = []
for movieId, lightfm_id in item_mapping.items():
    tfidf_row = tfidf_matrix[movie_id_map[movieId]]
    item_features.append(tfidf_row.toarray())

item_features_matrix = csr_matrix(np.vstack(item_features))


 Train the Hybrid LightFM Model

In [10]:
model = LightFM(loss='warp', no_components=32)  # You can also try 'bpr' or 'logistic'

# Train with both interactions and item features
model.fit(interactions, item_features=item_features_matrix, epochs=10, num_threads=4)


<lightfm.lightfm.LightFM at 0x11eb79370>

### Generate Recommendations

In [36]:
def recommend(user_id, N=5):
    user_id_map = dataset.mapping()[0]  # userId → LightFM ID
    item_id_map = dataset.mapping()[2]  # movieId → LightFM ID
    reverse_item_map = {v: k for k, v in item_id_map.items()}

    # Check if user_id exists in the mapping
    if user_id not in user_id_map:
        print(f"User ID {user_id} not found in the mapping.")
        return []

    user_x = user_id_map[user_id]
    scores = model.predict(user_ids=user_x,
                           item_ids=np.arange(len(item_id_map)),
                           item_features=item_features_matrix)

    top_items = np.argsort(-scores)[:N]
    top_movie_ids = [reverse_item_map[i] for i in top_items]

    return movies[movies['movieId'].isin(top_movie_ids)]['title'].values

print("Recommended movies for user 1:")
print(recommend(1, N=5))




Recommended movies for user 1:
['Super Mario Bros. (1993)' 'Young Sherlock Holmes (1985)'
 'Wiz, The (1978)' 'Chitty Chitty Bang Bang (1968)'
 'New Adventures of Pippi Longstocking, The (1988)']


In [42]:
print(list(user_id_map.keys())[:5])


[1, 2, 3, 4, 5]


In [71]:
# Clean genres
movies['genres_cleaned'] = movies['genres'].str.replace('|', ' ', regex=False)

# Fit TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(movies['genres_cleaned'])

# Map movieId to index in tfidf_matrix
movie_id_to_index = dict(zip(movies['movieId'], range(len(movies))))

# Retrieve item mapping from dataset
_, _, item_map, _ = dataset.mapping()
item_id_to_index = {item: index for item, index in item_map.items()}

# Align TF-IDF rows with LightFM item IDs
item_features_list = []
for item_id in item_map:
    tfidf_index = movie_id_to_index.get(item_id)
    if tfidf_index is not None:
        item_features_list.append(tfidf_matrix[tfidf_index].toarray())
    else:
        # If no TF-IDF vector is found, append a zero vector
        item_features_list.append(np.zeros((1, tfidf_matrix.shape[1])))

# Stack into a single csr_matrix
item_features_array = np.vstack(item_features_list)
item_features = csr_matrix(item_features_array)

def recommend(user_id, model, interactions, item_features, user_map, item_map, item_id_reverse_map, N=5):
    """
    Generate top N recommendations for a given user.
    """
    user_x = user_map[user_id]
    scores = model.predict(user_ids=user_x,
                           item_ids=np.arange(len(item_map)),
                           item_features=item_features)

    top_items = np.argsort(-scores)[:N]
    recommended_movie_ids = [item_id_reverse_map[i] for i in top_items]
    return recommended_movie_ids

# Retrieve mappings
user_map, _, item_map, _ = dataset.mapping()
item_id_reverse_map = {v: k for k, v in item_map.items()}

# Select a sample user

sample_user = list(user_map.keys())[0]

# Generate recommendations
recommended_ids = recommend(sample_user, model, interactions, item_features, user_map, item_map, item_id_reverse_map)

# Map back to movie titles
recommended_titles = movies[movies['movieId'].isin(recommended_ids)]['title'].values

# Display recommendations
print(f"Recommended movies for user {sample_user}:")
for title in recommended_titles:
    print(f"- {title}")



Recommended movies for user 1:
- Super Mario Bros. (1993)
- Young Sherlock Holmes (1985)
- Wiz, The (1978)
- Chitty Chitty Bang Bang (1968)
- New Adventures of Pippi Longstocking, The (1988)


In [72]:
num_users = 3
top_n = 5

# Get the list of user IDs
user_ids = list(user_map.keys())[:num_users]

for user_id in user_ids:
    # Get the internal LightFM user ID
    user_x = user_map[user_id]

    # Predict scores for all items for the current user
    scores = model.predict(user_ids=user_x,
                           item_ids=np.arange(len(item_map)),
                           item_features=item_features)

    # Get the indices of the top N scores
    top_items = np.argsort(-scores)[:top_n]

    # Map internal item IDs back to original movie IDs
    recommended_movie_ids = [item_id_reverse_map[i] for i in top_items]

    # Retrieve movie titles
    recommended_titles = movies[movies['movieId'].isin(recommended_movie_ids)]['title'].values

    # Display recommendations
    print(f"\nTop {top_n} recommendations for User {user_id}:")
    for idx, title in enumerate(recommended_titles, start=1):
        print(f"{idx}. {title}")


Top 5 recommendations for User 1:
1. Super Mario Bros. (1993)
2. Young Sherlock Holmes (1985)
3. Wiz, The (1978)
4. Chitty Chitty Bang Bang (1968)
5. New Adventures of Pippi Longstocking, The (1988)

Top 5 recommendations for User 2:
1. Dark Knight, The (2008)
2. Eagle Eye (2008)
3. Fast Five (Fast and the Furious 5, The) (2011)
4. Good Day to Die Hard, A (2013)
5. Fast & Furious 6 (Fast and the Furious 6, The) (2013)

Top 5 recommendations for User 3:
1. Krull (1983)
2. Godzilla vs. Mothra (Mosura tai Gojira) (1964)
3. Batman v Superman: Dawn of Justice (2016)
4. Rogue One: A Star Wars Story (2016)
5. Marvel One-Shot: Agent Carter (2013)
