## 1. Imports and Dependencies

In [1]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
import zipfile, os, urllib.request



## 2. Download and Load MovieLens 100K Dataset

In [2]:
# Download and extract MovieLens 100K
def download_movielens():
    if not os.path.exists('ml-100k/u.data'):
        urllib.request.urlretrieve('http://files.grouplens.org/datasets/movielens/ml-100k.zip', 'ml-100k.zip')
        with zipfile.ZipFile('ml-100k.zip', 'r') as z:
            z.extractall()
download_movielens()

# Load ratings and items
def load_data():
    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    items = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', header=None,
                        names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
                               'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 
                               'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 
                               'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])
    return ratings, items

ratings, items = load_data()

## 3. Data Preprocessing: Extract Genres and Feature Binarization

In [3]:
genre_cols = items.columns[5:]
items['genres'] = items[genre_cols].apply(lambda row: list(genre_cols[row==1]), axis=1)
item_genres = dict(zip(items.movie_id, items.genres))

dataset = Dataset()
dataset.fit((x for x in ratings['user_id']), (x for x in ratings['item_id']))

mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(items['genres'])
item_features = [(item_id, item_genres[item_id]) for item_id in items.movie_id]
dataset.fit_partial(items=(x for x, _ in item_features), item_features=(x for x in mlb.classes_))

interactions, _ = dataset.build_interactions([(uid, iid) for uid, iid in zip(ratings.user_id, ratings.item_id)])
item_features_matrix = dataset.build_item_features(item_features)

## 4. Train-Test Split and Build Interactions

In [4]:
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)
train_interactions, _ = dataset.build_interactions([(uid, iid) for uid, iid in zip(train_ratings.user_id, train_ratings.item_id)])
test_interactions, _ = dataset.build_interactions([(uid, iid) for uid, iid in zip(test_ratings.user_id, test_ratings.item_id)])



## 5. Train LightFM Hybrid Model

In [5]:
## Train LightFM Hybrid Model
model = LightFM(loss='warp', no_components=30)
model.fit(train_interactions, item_features=item_features_matrix, epochs=10, num_threads=4)


<lightfm.lightfm.LightFM at 0x11be50370>

## 6. Define Evaluation Metrics (Precision@k, NDCG, Hit Rate)

In [6]:
def evaluate_model(model, train_interactions, test_interactions, item_features):
    prec_train = precision_at_k(model, train_interactions, item_features=item_features, k=10).mean()
    prec_test = precision_at_k(model, test_interactions, item_features=item_features, k=10).mean()
    auc = auc_score(model, test_interactions, item_features=item_features).mean()
    print(f"📊 Precision@10 (Train): {prec_train:.4f}")
    print(f"📊 Precision@10 (Test): {prec_test:.4f}")
    print(f"📈 AUC Score (Test): {auc:.4f}")

evaluate_model(model, train_interactions, test_interactions, item_features_matrix)

📊 Precision@10 (Train): 0.5981
📊 Precision@10 (Test): 0.1263
📈 AUC Score (Test): 0.9023


In [7]:
def hit_rate_at_k(model, interactions, item_features, k=10):
    hits = 0
    total = 0
    for user_id in range(interactions.shape[0]):
        true_items = interactions.tocsr()[user_id].indices
        if len(true_items) == 0:
            continue
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        top_items = np.argsort(-scores)[:k]
        hits += any(item in top_items for item in true_items)
        total += 1
    return hits / total if total > 0 else 0

hit_rate = hit_rate_at_k(model, test_interactions, item_features_matrix, k=10)
print(f"🎯 Hit Rate@10 (Test): {hit_rate:.4f}")

🎯 Hit Rate@10 (Test): 0.7117


In [8]:
def ndcg_at_k(model, interactions, item_features, k=10):
    ndcgs = []
    for user_id in range(interactions.shape[0]):
        true_items = interactions.tocsr()[user_id].indices
        if len(true_items) == 0:
            continue
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        top_items = np.argsort(-scores)[:k]
        relevance = [1 if item in true_items else 0 for item in top_items]
        ideal_relevance = sorted(relevance, reverse=True)
        dcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(relevance)])
        idcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(ideal_relevance)])
        ndcg = dcg / idcg if idcg > 0 else 0
        ndcgs.append(ndcg)
    return np.mean(ndcgs) if ndcgs else 0

ndcg_score_k = ndcg_at_k(model, test_interactions, item_features_matrix, k=10)
print(f"🌟 NDCG@10 (Test): {ndcg_score_k:.4f}")

🌟 NDCG@10 (Test): 0.3784


## 7. Recommend Top-N Movies for a Known User

In [9]:
uid_map, iid_map, _ = dataset.mapping()[0], dataset.mapping()[2], dataset.mapping()[1]
rev_iid_map = {v: k for k, v in iid_map.items()}

def recommend_known_user(model, user_id, k=10):
    scores = model.predict(uid_map[user_id], np.arange(len(iid_map)), item_features=item_features_matrix)
    top_items = np.argsort(-scores)[:k]
    print(f"Top {k} recommendations for known user {user_id}:")
    for i in top_items:
        title = items[items.movie_id == rev_iid_map[i]]['title'].values[0]
        print('-', title)

recommend_known_user(model, user_id=3)

Top 10 recommendations for known user 3:
- Game, The (1997)
- Scream (1996)
- L.A. Confidential (1997)
- Air Force One (1997)
- Boogie Nights (1997)
- Lost Highway (1997)
- Cop Land (1997)
- Devil's Advocate, The (1997)
- Full Monty, The (1997)
- Contact (1997)


In [10]:
recommend_known_user(model, user_id=10)

Top 10 recommendations for known user 10:
- Fargo (1996)
- Amadeus (1984)
- English Patient, The (1996)
- Silence of the Lambs, The (1991)
- Schindler's List (1993)
- One Flew Over the Cuckoo's Nest (1975)
- Citizen Kane (1941)
- Pulp Fiction (1994)
- Clockwork Orange, A (1971)
- Apocalypse Now (1979)


## 8. Cold-Start Recommendations

### 8.1 Recommend Top Popular Movies

In [11]:
top_popular = ratings.groupby('item_id').size().sort_values(ascending=False).head(10)
print("Top 10 popular movies for unknown user (cold start):")
for iid in top_popular.index:
    title = items[items.movie_id == iid]['title'].values[0]
    print('-', title)

Top 10 popular movies for unknown user (cold start):
- Star Wars (1977)
- Contact (1997)
- Fargo (1996)
- Return of the Jedi (1983)
- Liar Liar (1997)
- English Patient, The (1996)
- Scream (1996)
- Toy Story (1995)
- Air Force One (1997)
- Independence Day (ID4) (1996)


### 8.2 Recommend Based on Genre Preferences

In [12]:
# Cold-start: Recommend based on genre similarity for a new user with genre preference
def recommend_by_genre(preferred_genres, items_df, k=10):
    matches = items_df[items_df[preferred_genres].sum(axis=1) > 0]
    top = matches.groupby('movie_id').size().sort_values(ascending=False).head(k)
    print("Top genre-based recommendations:")
    for idx in top.index:
        title = items_df[items_df.movie_id == idx]['title'].values[0]
        print(f"- {title}")

# Example: Recommend for a new user who likes 'Action' and 'Sci-Fi'
recommend_by_genre(['Action', 'Sci-Fi'], items)


Top genre-based recommendations:
- GoldenEye (1995)
- Bulletproof (1996)
- Power 98 (1995)
- Fled (1996)
- Daylight (1996)
- Arrival, The (1996)
- Shadow, The (1994)
- Rising Sun (1993)
- Program, The (1993)
- Menace II Society (1993)


## 9. Model Evaluation on Test Set

In [13]:
# Evaluate on test set
print("📊 Model Evaluation on Test Set")
evaluate_model(model, train_interactions, test_interactions, item_features_matrix)

# Compute NDCG and Hit Rate on test interactions
ndcg_score_val = ndcg_at_k(model, test_interactions, item_features_matrix, k=10)
hit_rate_val = hit_rate_at_k(model, test_interactions, item_features_matrix, k=10)

print(f"NDCG@10 on test set: {ndcg_score_val:.4f}")
print(f"Hit Rate@10 on test set: {hit_rate_val:.4f}")


📊 Model Evaluation on Test Set
📊 Precision@10 (Train): 0.5981
📊 Precision@10 (Test): 0.1263
📈 AUC Score (Test): 0.9023
NDCG@10 on test set: 0.3784
Hit Rate@10 on test set: 0.7117


## 10. Scalability Considerations for Production Systems



To handle millions of users/items in production:

- **Approximate Nearest Neighbors**: Use `FAISS` or `Annoy` to store dense item embeddings for fast similarity search.
- **Distributed Training**: Train collaborative models with `Spark MLlib` (ALS) or GPU-accelerated versions of `LightFM`.
- **Incremental Updates**: Store user/item vectors in Redis and update embeddings incrementally with new interactions.
- **Microservice Architecture**: Separate collaborative filtering and content-based recommenders into scalable services with API layers.
- **Batch Pipelines**: Use `Feast` or custom pipelines for feature ingestion, nightly retraining, and reindexing.
