# 🎯 Multi-Modal Recommendation System using PyTorch (Hybrid Model)
**Problem 2: Multi-Modal Recommendation System**

- Uses MovieLens 100K dataset from URL
- Hybrid approach: Autoencoder + Content (genre)
- Handles cold-start problem
- Evaluated using Precision@10 and NDCG@10

## 📥 Step 1: Download and Extract Dataset

In [1]:

import os
import urllib.request
import zipfile
import pandas as pd
import numpy as np

# === 1. Dataset Download ===
if not os.path.exists('ml-100k'):
    url = 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
    print("Downloading MovieLens 100K…")
    urllib.request.urlretrieve(url, 'ml-100k.zip')
    print("Extracting…")
    with zipfile.ZipFile('ml-100k.zip', 'r') as z:
        z.extractall()
    print("Done!")


## 📄 Step 2: Load and Preprocess Ratings and Genre Metadata

In [2]:

# === 2. Load & Preprocess Data ===
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id','movie_id','rating','ts'])

item_info = pd.read_csv("ml-100k/u.item", sep='|', encoding='latin-1', header=None,
                        names=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
                               'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
                               'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                               'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

genres = item_info.drop(columns=['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL'])
genre_matrix = genres.values


## 🧹 Step 3: Encode User and Item IDs and Create Interaction Matrix

In [3]:

ratings['user_id'] = ratings['user_id'].astype(int)
ratings['movie_id'] = ratings['movie_id'].astype(int)
user_ids = ratings['user_id'].unique()
item_ids = ratings['movie_id'].unique()
uid_map = {id_: idx for idx, id_ in enumerate(user_ids)}
iid_map = {id_: idx for idx, id_ in enumerate(item_ids)}
rev_iid_map = {v: k for k, v in iid_map.items()}
rev_uid_map = {v: k for k, v in uid_map.items()}
ratings['uid'] = ratings['user_id'].map(uid_map)
ratings['iid'] = ratings['movie_id'].map(iid_map)

from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)
n_users, n_items = len(uid_map), len(iid_map)
train_matrix = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    train_matrix[row.uid, row.iid] = row.rating


## 🧠 Step 4: Define and Train Autoencoder (Collaborative Filtering)

In [4]:

import torch
import torch.nn as nn
import torch.optim as optim

class AutoEncoder(nn.Module):
    def __init__(self, input_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.decoder(self.encoder(x))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoEncoder(input_dim=n_items).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

X_train = torch.FloatTensor(train_matrix).to(device)
for epoch in range(10):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, X_train)
    loss.backward()
    optimizer.step()


## 🔀 Step 5: Hybrid Recommendation (Collaborative + Content)

In [5]:

def hybrid_recommend(user_vector, content_matrix, model, alpha=0.7, k=10):
    model.eval()
    with torch.no_grad():
        cf_scores = model(torch.FloatTensor(user_vector.reshape(1, -1)).to(device)).cpu().numpy().flatten()
    user_profile = user_vector @ content_matrix
    cb_scores = content_matrix @ user_profile.T
    cb_scores = cb_scores / (np.linalg.norm(cb_scores) + 1e-8)
    hybrid_scores = alpha * cf_scores + (1 - alpha) * cb_scores
    return np.argsort(-hybrid_scores)[:k]


## 📏 Step 6: Evaluate Model (Precision@10, NDCG@10)

In [6]:

from sklearn.metrics import ndcg_score

def precision_at_k(recommended, ground_truth, k):
    recommended = recommended[:k]
    return len(set(recommended) & set(ground_truth)) / k

def ndcg_at_k(recommended, ground_truth, k):
    rel = [1 if item in ground_truth else 0 for item in recommended[:k]]
    return ndcg_score([rel], [sorted(rel, reverse=True)])

precisions, ndcgs = [], []
for uid in test_df['uid'].unique():
    actual_items = test_df[test_df['uid'] == uid]['iid'].tolist()
    if not actual_items:
        continue
    user_vec = train_matrix[uid]
    top_k = hybrid_recommend(user_vec, genre_matrix, model, alpha=0.7, k=10)
    precisions.append(precision_at_k(top_k, actual_items, 10))
    ndcgs.append(ndcg_at_k(top_k, actual_items, 10))

print(f"✅ Precision@10: {np.mean(precisions):.4f}")
print(f"✅ NDCG@10: {np.mean(ndcgs):.4f}")


✅ Precision@10: 0.0623
✅ NDCG@10: 0.2293


## 🎁 Step 7: Show Recommendations (Known and Cold-Start)

In [7]:

known_user = 5
known_top = hybrid_recommend(train_matrix[known_user], genre_matrix, model)
known_recs = [item_info.iloc[i]['title'] for i in known_top]

popular_items = ratings.groupby('movie_id').size().sort_values(ascending=False).head(10).index
cold_recs = item_info[item_info['movie_id'].isin(popular_items)]['title'].tolist()

pd.DataFrame({
    'Known User Top 10': known_recs,
    'Cold Start User Top 10': cold_recs + [''] * (10 - len(cold_recs))
})


Unnamed: 0,Known User Top 10,Cold Start User Top 10
0,Midnight in the Garden of Good and Evil (1997),Toy Story (1995)
1,Indiana Jones and the Last Crusade (1989),Star Wars (1977)
2,Terminator 2: Judgment Day (1991),Fargo (1996)
3,Aliens (1986),Independence Day (ID4) (1996)
4,"Client, The (1994)",Return of the Jedi (1983)
5,"Horseman on the Roof, The (Hussard sur le toit...",Contact (1997)
6,North by Northwest (1959),"English Patient, The (1996)"
7,Powder (1995),Scream (1996)
8,Farinelli: il castrato (1994),Liar Liar (1997)
9,Grosse Pointe Blank (1997),Air Force One (1997)


## 🚀 Step 8: Scalability Considerations
- Use FAISS/Annoy for fast nearest neighbor search on embeddings
- Train autoencoders with mini-batches + GPU
- Deploy CF and CB models as separate microservices
- Maintain user/item vectors in a vector DB or cache for speed