## Multimodal Recommendation Study using CLIP Embeddings

**Objective:**

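This tutorial walks through an end-to-end workflow for personalized product recommendations built on multimodal CLIP embeddings. You will learn how to embed product titles and images with a pretrained CLIP model, build user representations from each user's positively rated items, train a pairwise ranking model that recommends products based on a user's preferences, and compare multimodal against text-only embeddings in an ablation study.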

In [75]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, ndcg_score
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt

from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import Dataset, DataLoader

# ======================
# 1. Setup & Load CLIP
# ======================

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The processor and the model weights are loaded from the same checkpoint name;
# only the model needs to be moved onto the selected device.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_model = clip_model.to(device)


#### Step 1: Dataset Loading and Preprocessing

In [76]:
import os
import pandas as pd
from datasets import load_dataset

def extract_image_url(images_dict):
    """
    Return the first usable URL from the 'large' entry of an images record.

    Args:
        images_dict: Mapping-like object whose 'large' key holds an iterable of
            URL candidates. Values that are not mapping-like (e.g. None or NaN)
            are treated as "no image available".

    Returns:
        str or None: The first candidate that is a string starting with "http",
        or None when there is no such candidate.
    """
    try:
        large_imgs = images_dict.get('large', [])
        candidates = iter(large_imgs)
    except (AttributeError, TypeError):
        # AttributeError: the record has no .get (None, NaN, ...).
        # TypeError: the 'large' value is not iterable (None, a number, ...).
        # Only these two are swallowed so that genuine failures such as
        # KeyboardInterrupt still propagate.
        return None

    for url in candidates:
        if isinstance(url, str) and url.startswith("http"):
            return url
    return None

def process_data_sample(dataset_str, review_key, meta_key, sampling_percent=1.0):
    """
    Load and process Amazon reviews and metadata, sample a percentage of reviews, and return interactions and product metadata.

    Args:
        dataset_str (str): Hugging Face dataset path (e.g., "McAuley-Lab/Amazon-Reviews-2023")
        review_key (str): Subset key for reviews (e.g., "raw_review_Books")
        meta_key (str): Subset key for metadata (e.g., "raw_meta_Books")
        sampling_percent (int or float): Whole-number percentage of the review split
            to load, taken from the start of the split (e.g., 1 or 1.0 for 1%).

    Returns:
        interaction_df (pd.DataFrame): Review interactions (user_id, item_id, timestamp,
            rating, verified_purchase, label) left-joined with the product metadata.
        product_df (pd.DataFrame): Product metadata with item_id, title, main_category,
            categories and image_url.

    Raises:
        ValueError: If sampling_percent is not a whole number between 0 and 100.
    """

    # Hugging Face split slicing only understands integer percentages, so a float
    # such as the default 1.0 must be rendered as "1", not "1.0".
    percent = float(sampling_percent)
    if not percent.is_integer() or not 0 <= percent <= 100:
        raise ValueError(
            f"sampling_percent must be a whole number between 0 and 100, got {sampling_percent!r}"
        )
    split_str = f"full[:{int(percent)}%]"

    # Load sampled user reviews
    # NOTE(review): trust_remote_code=True lets the dataset repository run its own
    # loading script locally; keep this only for dataset sources you trust.
    reviews = load_dataset(dataset_str, review_key, split=split_str, trust_remote_code=True)
    reviews_df = pd.DataFrame(reviews)

    # Select and rename relevant review columns. rename() returns a new frame, so the
    # label assignment below does not write into a slice of the original.
    reviews_df = reviews_df[['user_id', 'parent_asin', 'timestamp', 'rating', 'verified_purchase']]
    reviews_df = reviews_df.rename(columns={'parent_asin': 'item_id'})

    # Binary interaction label: 1 for ratings of 4 and above, otherwise 0.
    reviews_df['label'] = (reviews_df['rating'] >= 4).astype(int)

    # Load full metadata
    metadata = load_dataset(dataset_str, meta_key, split="full", trust_remote_code=True)
    metadata_df = pd.DataFrame(metadata)

    # Add 'image_url' field
    metadata_df['image_url'] = metadata_df['images'].apply(extract_image_url)

    # Process metadata and keep relevant fields
    metadata_df = metadata_df[['parent_asin', 'title', 'main_category', 'categories', 'image_url']]
    metadata_df = metadata_df.rename(columns={'parent_asin': 'item_id'})

    # Merge review and product metadata; a left join keeps reviews without metadata.
    interaction_df = reviews_df.merge(metadata_df, on='item_id', how='left')

    return interaction_df, metadata_df


In [77]:
import pandas as pd
import random

def prepare_pairwise_ranking_data_with_user_split(user_df, product_df, test_ratio=0.2, num_negatives=1, seed=42,
                                                  min_interactions=5, max_pairs_per_user=5):
    """
    Perform train/test split within each user, keeping only users with at least
    `min_interactions` interactions, then generate pairwise ranking pairs.

    Args:
        user_df: Interaction DataFrame with ['user_id', 'item_id', 'label']
        product_df: Product DataFrame (currently unused; kept for interface compatibility)
        test_ratio: Proportion of each user's interactions to hold out for test
        num_negatives: Number of negative samples per positive
        seed: Random seed
        min_interactions: Minimum number of interactions a user needs to be kept
        max_pairs_per_user: Upper bound on the number of pairs emitted per user

    Returns:
        train_df, test_df: DataFrames (empty, with user_df's columns, when no user qualifies)
        train_pairs, test_pairs: List of (user_id, pos_item, neg_item)
    """
    # A private generator gives the same draws as seeding the module-level one,
    # without resetting the global random state for the rest of the program.
    rng = random.Random(seed)
    train_rows, test_rows = [], []

    for _, group in user_df.groupby("user_id"):
        if len(group) < min_interactions:
            continue  # Skip users with too few interactions

        group = group.sample(frac=1, random_state=seed)  # Shuffle interactions
        split_idx = int(len(group) * (1 - test_ratio))

        train_rows.append(group.iloc[:split_idx])
        test_rows.append(group.iloc[split_idx:])

    # pd.concat raises on an empty list, so fall back to an empty frame that keeps
    # the input columns when no user met the interaction threshold.
    train_df = pd.concat(train_rows) if train_rows else user_df.iloc[0:0].copy()
    test_df = pd.concat(test_rows) if test_rows else user_df.iloc[0:0].copy()

    def generate_pairs(interactions):
        """Build (user_id, pos_item, neg_item) triples from one split."""
        pairs = []
        for user_id, group in interactions.groupby("user_id"):
            pos_items = group[group['label'] == 1]['item_id'].tolist()
            neg_items = group[group['label'] == 0]['item_id'].tolist()
            if not pos_items or not neg_items:
                continue  # A pair needs at least one positive and one negative

            user_pairs = []
            for pos in pos_items:
                sampled_negs = rng.sample(neg_items, min(len(neg_items), num_negatives))
                user_pairs.extend((user_id, pos, neg) for neg in sampled_negs)

            # Limit number of pairs per user
            pairs.extend(rng.sample(user_pairs, min(len(user_pairs), max_pairs_per_user)))

        return pairs

    train_pairs = generate_pairs(train_df)
    test_pairs = generate_pairs(test_df)

    return train_df, test_df, train_pairs, test_pairs


#### Step 2: Feature Extraction


In [78]:
import torch
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch.nn as nn

import requests
from io import BytesIO

# ======================
# Feature Extraction Module
# ======================
class FeatureExtractor:
    """Turns product titles/images into CLIP embeddings and pools them into user vectors."""

    def __init__(self, clip_model, clip_processor, device='cpu'):
        self.clip_model = clip_model
        self.clip_processor = clip_processor
        self.device = device

    @staticmethod
    def _l2_normalize(emb):
        """Scale every row of `emb` to unit L2 norm."""
        return emb / emb.norm(p=2, dim=-1, keepdim=True)

    def get_text_embeddings(self, texts, batch_size=32, normalize=False):
        """
        Extract CLIP text embeddings.

        Args:
            texts (list[str]): Texts to embed.
            batch_size (int): Number of texts sent through the model per step.
            normalize (bool): When True, L2-normalize each embedding row.

        Returns:
            torch.Tensor: Per-batch embeddings concatenated on the CPU.
        """
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            inputs = self.clip_processor(text=batch, return_tensors="pt",
                                       padding=True, truncation=True).to(self.device)
            with torch.no_grad():
                emb = self.clip_model.get_text_features(**inputs)
            if normalize:
                emb = self._l2_normalize(emb)
            all_embeddings.append(emb.cpu())
        return torch.cat(all_embeddings)

    def get_image_embeddings(self, image_urls, batch_size=16):
        """
        Download images and extract L2-normalized CLIP image embeddings.

        An image that cannot be fetched or decoded is replaced by a black 224x224
        placeholder so the output keeps one row per input URL.

        Args:
            image_urls (list): Image URLs, one per product.
            batch_size (int): Number of images sent through the model per step.

        Returns:
            torch.Tensor: Per-batch embeddings concatenated on the CPU.
        """
        embeddings = []
        for i in range(0, len(image_urls), batch_size):
            batch = image_urls[i:i+batch_size]
            images = []
            for url in batch:
                try:
                    response = requests.get(url, timeout=5)
                    # Fail on HTTP errors up front instead of trying to decode an error page.
                    response.raise_for_status()
                    img = Image.open(BytesIO(response.content)).convert("RGB")
                    images.append(img)
                except Exception as e:
                    print(f"❌ Failed to load image from {url} — {e}")
                    # Add a dummy black image of standard size
                    images.append(Image.new("RGB", (224, 224), color=(0, 0, 0)))

            inputs = self.clip_processor(images=images, return_tensors="pt").to(self.device)
            with torch.no_grad():
                image_emb = self.clip_model.get_image_features(**inputs)
            image_emb = self._l2_normalize(image_emb)
            embeddings.append(image_emb.cpu())
        return torch.cat(embeddings, dim=0)

    def get_multimodal_embeddings(self, product_df, ablation = False):
        """
        Combine text and image embeddings into one vector per product.

        Args:
            product_df (pd.DataFrame): Needs 'item_id', 'title' and 'image_url' columns.
                The frame is not modified.
            ablation (bool): When True, return text-only embeddings and skip the
                image download entirely.

        Returns:
            pd.DataFrame: One embedding row per product, indexed by item_id.
        """
        # Fill missing titles on a local copy so the caller's frame stays untouched.
        titles = product_df['title'].fillna("").tolist()

        # Image embeddings are unit-norm, so text embeddings are normalized too;
        # otherwise the average below is dominated by whichever modality has the larger norm.
        text_emb = self.get_text_embeddings(titles, normalize=True)

        if ablation:
            multimodal_emb = text_emb
        else:
            image_emb = self.get_image_embeddings(product_df['image_url'].tolist())
            multimodal_emb = (text_emb + image_emb) / 2

        # Return as pandas DataFrame indexed by item_id
        return pd.DataFrame(multimodal_emb.numpy(), index=product_df['item_id'])

    def create_user_embeddings(self, user_df, product_embeddings, product_df=None, positive_only=True):
        """
        Create user embeddings by aggregating the embeddings of positively interacted items.

        Args:
            user_df (pd.DataFrame): User interactions with 'user_id', 'item_id', and optionally 'rating' or 'label'.
            product_embeddings (pd.DataFrame): Embeddings indexed by 'item_id'.
            product_df (pd.DataFrame, optional): Currently unused.
            positive_only (bool): Whether to include only positive interactions
                (rating >= 4 when 'rating' exists, otherwise label == 1).

        Returns:
            dict: Mapping from user_id to aggregated embedding tensor. Users left with
            no rows after filtering are absent; users whose items all lack embeddings
            map to a zero vector.
        """
        user_embeddings = {}
        embedding_dim = product_embeddings.shape[1]

        # Filter to only positive interactions if applicable
        if positive_only:
            if "rating" in user_df.columns:
                user_df = user_df[user_df["rating"] >= 4]
            elif "label" in user_df.columns:
                user_df = user_df[user_df["label"] == 1]

        for user_id, group in user_df.groupby('user_id'):
            item_ids = group['item_id'].unique()

            # Retain only those items with valid embeddings
            available_ids = [item_id for item_id in item_ids if item_id in product_embeddings.index]

            if not available_ids:
                user_embeddings[user_id] = torch.zeros(embedding_dim)
                continue

            # Aggregate using mean pooling
            item_embs = product_embeddings.loc[available_ids].values
            user_embeddings[user_id] = torch.tensor(item_embs).float().mean(dim=0)

        return user_embeddings


#### Step 3: Model Training

In [79]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import pandas as pd

# ======================
# Model Definitions
# ======================

class RankingModel(nn.Module):
    """Two-layer perceptron that scores a concatenated (user, item) embedding pair."""

    def __init__(self, dim):
        super().__init__()
        hidden_units = 128
        # Input is the user vector followed by the item vector, hence 2 * dim features.
        layers = [
            nn.Linear(2 * dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, user_emb, item_emb):
        """Return one score per row for the given user/item embedding batches."""
        pair_features = torch.cat((user_emb, item_emb), dim=1)
        scores = self.model(pair_features)
        return scores

class DotProductModel(nn.Module):
    """Parameter-free scorer: the row-wise inner product of user and item embeddings."""

    def __init__(self):
        super().__init__()

    def forward(self, user_emb, item_emb):
        """Return a column of scores, one per row of the two input batches."""
        elementwise = torch.mul(user_emb, item_emb)
        return torch.sum(elementwise, dim=1, keepdim=True)

# ======================
# BPR Loss Function
# ======================

def bpr_loss(pos_score, neg_score):
    """
    Bayesian Personalized Ranking loss.

    Args:
        pos_score (torch.Tensor): Scores of the preferred items.
        neg_score (torch.Tensor): Scores of the non-preferred items.

    Returns:
        torch.Tensor: Scalar mean of -log(sigmoid(pos_score - neg_score)).
    """
    # logsigmoid evaluates log(sigmoid(x)) in a numerically stable way; the naive
    # form underflows to log(0) = -inf when the margin is strongly negative.
    return -F.logsigmoid(pos_score - neg_score).mean()

# ======================
# Training Loop
# ======================

class Trainer:
    """Fits a pairwise ranking model with BPR loss, one (user, pos, neg) triple per step."""

    def __init__(self, model, device='cpu'):
        self.model = model.to(device)
        self.device = device
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)

    def _item_vector(self, product_embeddings, item_id):
        """Look up one item embedding as a float32 row tensor on the training device.

        Raises KeyError when `item_id` is not in the embedding table.
        """
        row = product_embeddings.loc[item_id]
        return torch.tensor(row.to_numpy(), dtype=torch.float32).unsqueeze(0).to(self.device)

    def train(self, pairs, user_embeddings, product_embeddings, epochs=5):
        """
        Train the model and print the mean loss after every epoch.

        Args:
            pairs: Sequence of (user_id, pos_item_id, neg_item_id) triples.
            user_embeddings (dict): Maps user_id to an embedding tensor.
            product_embeddings (pd.DataFrame): Item embeddings indexed by item id.
            epochs (int): Number of passes over `pairs`.
        """
        self.model.train()
        for epoch in range(epochs):
            losses = []
            for user_id, pos_pid, neg_pid in tqdm(pairs):
                # Only the lookups are guarded: a KeyError raised later by the model
                # or optimizer is a real bug and must not be skipped silently.
                try:
                    user_vec = user_embeddings[user_id].unsqueeze(0).to(self.device)
                    pos_vec = self._item_vector(product_embeddings, pos_pid)
                    neg_vec = self._item_vector(product_embeddings, neg_pid)
                except KeyError:
                    continue  # Skip if embeddings not found

                # Forward pass
                pos_score = self.model(user_vec, pos_vec)
                neg_score = self.model(user_vec, neg_vec)

                # Compute BPR loss
                loss = bpr_loss(pos_score, neg_score)

                # Backward
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                losses.append(loss.item())

            # np.mean([]) warns and yields nan; report nan directly when every pair was skipped.
            mean_loss = np.mean(losses) if losses else float('nan')
            print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")


In [80]:
# # ======================
# # Recommendation Module
# # ======================
# class Recommender:
#     def __init__(self, model, user_embeddings, product_embeddings, device='cpu'):
#         self.model = model.to(device)
#         self.user_embeddings = user_embeddings
#         self.product_embeddings = product_embeddings
#         self.device = device
        
#     def recommend(self, user_id, top_k=10):
#         """Generate top-k recommendations for a user"""
#         user_vec = self.user_embeddings[user_id].unsqueeze(0).to(self.device)
#         item_vectors = self.product_embeddings.values.to(self.device)
        
#         # Calculate scores
#         with torch.no_grad():
#             repeated_user = user_vec.repeat(item_vectors.shape[0], 1)
#             scores = self.model(repeated_user, item_vectors).squeeze()
        
#         # Get top-k indices
#         _, indices = torch.topk(scores, top_k)
#         return self.product_embeddings.index[indices.cpu().numpy()]


#### Step 4: Model Evaluation

In [81]:
# ======================
# Evaluation Module
# ======================
class Evaluator:
    """Measures how often a model scores the positive item above the negative one."""

    def __init__(self, model, user_embeddings, product_embeddings, device='cpu'):
        self.model = model.to(device)
        self.user_embeddings = user_embeddings
        self.product_embeddings = product_embeddings
        self.device = device

    def _item_vector(self, item_id):
        """Look up one item embedding as a float32 row tensor on the evaluation device.

        The explicit float32 cast keeps the input dtype independent of the dtype the
        embedding DataFrame happens to hold. Raises KeyError for unknown item ids.
        """
        row = self.product_embeddings.loc[item_id]
        return torch.tensor(row.to_numpy(), dtype=torch.float32).unsqueeze(0).to(self.device)

    def evaluate_pairwise(self, eval_pairs):
        """
        Evaluate pairwise ranking accuracy, skipping users without embeddings.

        Args:
            eval_pairs: Iterable of (user_id, pos_item_id, neg_item_id) triples.

        Returns:
            float: Fraction of scored pairs where the positive item scores strictly
            higher than the negative one; 0.0 when no pair could be scored.
        """
        correct = 0
        total = 0

        # Score in eval mode, then restore whatever mode the model was in before.
        was_training = self.model.training
        self.model.eval()
        try:
            for user_id, pos_pid, neg_pid in eval_pairs:
                if user_id not in self.user_embeddings:
                    continue  # Skip users without embeddings

                try:
                    user_vec = self.user_embeddings[user_id].unsqueeze(0).to(self.device)
                    pos_vec = self._item_vector(pos_pid)
                    neg_vec = self._item_vector(neg_pid)
                except KeyError:
                    continue  # Skip if product embeddings are missing

                with torch.no_grad():
                    pos_score = self.model(user_vec, pos_vec)
                    neg_score = self.model(user_vec, neg_vec)

                correct += int(pos_score.item() > neg_score.item())
                total += 1
        finally:
            self.model.train(was_training)

        return correct / total if total > 0 else 0.0


In [82]:
import json

def run_ablation_experiment(ablation=False):
    """
    Run one embedding configuration end to end and persist its metrics.

    Reads the module-level names clip_model, clip_processor, device, product_df,
    train_df, train_pairs and test_pairs. Only the MLP ranking model is trained;
    the dot-product model is evaluated as constructed.

    Args:
        ablation (bool): Forwarded to the feature extractor; also selects the
            'text_only' or 'multimodal' suffix of the output file name.

    Returns:
        dict: The metrics that were also written to the JSON file.
    """
    # Product embeddings, then user vectors built from the training interactions
    extractor = FeatureExtractor(clip_model, clip_processor, device)
    item_vectors = extractor.get_multimodal_embeddings(product_df, ablation=ablation)
    user_vectors = extractor.create_user_embeddings(train_df, item_vectors, product_df)

    # Model A: MLP-based ranking model, trained on the training pairs
    mlp_model = RankingModel(item_vectors.shape[1])
    Trainer(mlp_model, device).train(train_pairs, user_vectors, item_vectors, epochs=5)

    # Model B: dot-product model
    dot_model = DotProductModel()

    # Pairwise accuracy of both models on the train and the test pairs
    mlp_evaluator = Evaluator(mlp_model, user_vectors, item_vectors, device)
    dot_evaluator = Evaluator(dot_model, user_vectors, item_vectors, device)

    mlp_train_score = mlp_evaluator.evaluate_pairwise(train_pairs)
    dot_train_score = dot_evaluator.evaluate_pairwise(train_pairs)
    mlp_accuracy = mlp_evaluator.evaluate_pairwise(test_pairs)
    dot_accuracy = dot_evaluator.evaluate_pairwise(test_pairs)

    # Console summary
    print("========== Ablation:", ablation, "==========")
    print(f"Train pairs: {len(train_pairs)}")
    print(f"Test pairs: {len(test_pairs)}")
    print(f"MLP Model Test Accuracy: {mlp_accuracy:.2f}")
    print(f"Dot Product Model Test Accuracy: {dot_accuracy:.2f}")

    # Assemble the report section by section
    mlp_report = {
        "Accuracy": mlp_accuracy,
        "Training Accuracy": mlp_train_score,
        "Training pairs": len(train_pairs)
    }
    dot_report = {
        "Accuracy": dot_accuracy,
        "Training Accuracy": dot_train_score,
        "Test pairs": len(test_pairs)
    }
    results_combined = {
        "ablation": ablation,
        "MLP Model": mlp_report,
        "DotProduct Model": dot_report
    }

    # One JSON file per configuration
    filename = f"ablation_results_{'text_only' if ablation else 'multimodal'}.json"
    with open(filename, "w") as out_file:
        json.dump(results_combined, out_file, indent=2)

    return results_combined


In [83]:
import json

# Driver cell: loads the data, builds the train/test pairs and runs both embedding
# configurations. The names user_df, product_df, train_df, train_pairs and test_pairs
# are module-level globals; run_ablation_experiment reads product_df, train_df,
# train_pairs and test_pairs directly, so they must be defined before it is called.

# Load the first 5% of the Amazon Fashion review split together with the full metadata
user_df, product_df = process_data_sample(
    dataset_str="McAuley-Lab/Amazon-Reviews-2023",
    review_key="raw_review_Amazon_Fashion",
    meta_key="raw_meta_Amazon_Fashion",
    sampling_percent=5
)
print(user_df.shape)

# Debug shortcut (disabled): restrict the run to the first 100 interactions
# user_df = user_df.head(100)

# Extract all unique item_ids from the user_df
item_ids = user_df['item_id'].unique()

# Filter product_df to keep only those items that appear in user_df,
# so embeddings are only computed for products that were actually reviewed
product_df = product_df[product_df['item_id'].isin(item_ids)].reset_index(drop=True)
print(product_df.shape)

# Per-user train/test split and (user, positive, negative) pairs, using the function's defaults
train_df, test_df, train_pairs, test_pairs = prepare_pairwise_ranking_data_with_user_split(user_df, product_df)

# Run full multimodal setup
results_mm = run_ablation_experiment(ablation=False)

# Run ablation with text-only embeddings
results_text = run_ablation_experiment(ablation=True)

# Combine the two results under separate keys
combined_results = {
    "Multimodal Embeddings": results_mm,
    "Text-only Embeddings": results_text
}

# Save to a combined JSON file (in addition to the per-run files written by run_ablation_experiment)
with open("ablation_results_combined.json", "w") as f:
    json.dump(combined_results, f, indent=2)


(125047, 10)
(93236, 5)
❌ Failed to load image from https://m.media-amazon.com/images/I/51KhClp4jKL._AC_.jpg — HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=5)
❌ Failed to load image from https://m.media-amazon.com/images/I/41xl5GXTHkL._AC_.jpg — HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=5)
❌ Failed to load image from https://m.media-amazon.com/images/I/317sbhEtEZL._AC_.jpg — cannot identify image file <_io.BytesIO object at 0x7f9bc18d04f0>
❌ Failed to load image from https://m.media-amazon.com/images/I/51nfuaDfBwL._AC_.jpg — HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=5)
❌ Failed to load image from https://m.media-amazon.com/images/I/41p3AOz0j8L._AC_.jpg — HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=5)
❌ Failed to load image from https://m.media-amazon.com/images/I/41EHq1K13bL._AC_.jpg — HTTPSConnectionPool(host

 87%|████████▋ | 4433/5103 [00:18<00:02, 243.04it/s]

In [None]:
# Scratch cell: prints the literal 1.
print(1)