In [1]:
#! pip install transformers datasets peft faiss-cpu

In [2]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel
from peft import get_peft_model, LoraConfig, TaskType
from PIL import Image
from torchvision import transforms
import faiss
import requests
from io import BytesIO

### DATA CLEANING

In [3]:
# Fetch the "raw_meta_All_Beauty" config (split "full") of Amazon Reviews 2023
# and turn it into a DataFrame whose columns all carry a "product_" prefix.
# NOTE(review): trust_remote_code=True allows `datasets` to run loading code
# shipped with the dataset repository — keep it only for trusted sources.
product_meta_data = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_All_Beauty",
    split="full",
    trust_remote_code=True,
)
df_meta = (
    pd.DataFrame
    .from_records(product_meta_data)
    .add_prefix("product_")
)

In [4]:
# Display the (rows, columns) shape of the raw, prefixed metadata frame.
df_meta.shape

(112590, 16)

In [5]:
# Fetch the "raw_review_All_Beauty" config (split "full") of the same dataset
# and turn it into a DataFrame whose columns all carry a "review_" prefix.
# NOTE(review): trust_remote_code=True allows `datasets` to run loading code
# shipped with the dataset repository — keep it only for trusted sources.
review_data = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    split="full",
    trust_remote_code=True,
)
df_review = (
    pd.DataFrame
    .from_records(review_data)
    .add_prefix("review_")
)

In [6]:
# Keep only products whose parent ASIN also occurs in the review table.
reviewed_asins = df_review['review_parent_asin']
has_review = df_meta['product_parent_asin'].isin(reviewed_asins)
df_meta_clean = df_meta[has_review]

In [7]:
# Columns to clean
target_cols = ['product_title', 'product_description', 'product_images']

# 1) Drop rows with a missing value in any target column. pandas treats a
#    literal None as missing alongside NaN, so dropna() removes both; the
#    former element-wise `applymap(lambda x: x is None)` pass could not match
#    any row after this call, and DataFrame.applymap is deprecated (pandas 2.1+).
df_meta_clean = df_meta_clean.dropna(subset=target_cols)


In [8]:
# Display the (rows, columns) shape after rows with missing target columns were dropped.
df_meta_clean.shape

(112565, 16)

In [9]:
# 2) Lower-cased placeholder strings that are treated as "no value"
invalid_strs = {'', 'n/a', 'none', 'na'}

# 3) Validator for product_images
def images_valid(img_dict):
    """Return True when ``img_dict`` holds at least one usable image URL.

    Only the 'hi_res', 'large' and 'thumb' entries are inspected, and only
    when they are lists or tuples. A URL is usable when it is a string that,
    once stripped and lower-cased, is not in ``invalid_strs``. Any value that
    is not a dict is rejected.
    """
    if not isinstance(img_dict, dict):
        return False

    def _usable(candidate):
        return isinstance(candidate, str) and candidate.strip().lower() not in invalid_strs

    url_groups = (img_dict.get(size_key, []) for size_key in ('hi_res', 'large', 'thumb'))
    return any(
        _usable(candidate)
        for group in url_groups
        if isinstance(group, (list, tuple))
        for candidate in group
    )

# 4) General validator for text fields
def text_valid(x):
    """Return True if ``x`` is a string that is not a placeholder value.

    The string is stripped and lower-cased before it is looked up in the
    module-level ``invalid_strs`` set; non-string values are always invalid.
    """
    if not isinstance(x, str):
        return False
    normalized = x.strip().lower()
    return normalized not in invalid_strs

# 5) Apply validators
#   - title must pass text_valid
#   - images must pass images_valid
#   The description check is currently commented out and therefore not applied:
#   df_meta_clean['product_description'].apply(text_valid)
title_ok = df_meta_clean['product_title'].apply(text_valid)
images_ok = df_meta_clean['product_images'].apply(images_valid)
df_meta_clean = df_meta_clean[title_ok & images_ok].reset_index(drop=True)

print(f"After cleaning: {len(df_meta_clean)} rows")

After cleaning: 112553 rows


In [10]:
def extract_first_valid_image(images_dict):
    """Return the first usable image URL from a product's image record.

    Resolutions are tried in order of preference ('hi_res', then 'large',
    then 'thumb'); within each one the URLs are scanned in their stored order.
    A URL is usable when it is a string that, once stripped and lower-cased,
    is not an empty or placeholder value ('', 'none', 'n/a', 'na').

    Args:
        images_dict: Mapping from resolution key to a list or tuple of URLs.

    Returns:
        The stripped URL string, or None when ``images_dict`` is not a dict
        or contains no usable URL.
    """
    if not isinstance(images_dict, dict):
        return None

    # Keys we care about, in order of preference
    image_keys = ('hi_res', 'large', 'thumb')
    invalid_strs = {'', 'none', 'n/a', 'na'}

    for key in image_keys:
        urls = images_dict.get(key, [])
        # Accept tuples as well as lists, matching images_valid: otherwise a
        # row could pass validation and still end up without an extracted URL.
        if not isinstance(urls, (list, tuple)):
            continue
        for url in urls:
            if isinstance(url, str) and url.strip().lower() not in invalid_strs:
                return url.strip()
    return None

# Derive product_image_url: the first usable URL of each product's image record
image_urls = df_meta_clean['product_images'].apply(extract_first_valid_image)
df_meta_clean['product_image_url'] = image_urls

# Optional safeguard: discard rows for which no URL could be extracted
has_image_url = df_meta_clean['product_image_url'].notnull()
df_meta_clean = df_meta_clean[has_image_url].reset_index(drop=True)

print(f"Final dataset with extracted image URLs: {len(df_meta_clean)} rows")


Final dataset with extracted image URLs: 112553 rows


In [11]:
def flatten_description(desc):
    """Collapse a product description into one plain string.

    Args:
        desc: A list or tuple of text fragments, a single string, or any
            other value.

    Returns:
        For a list/tuple: the stripped string fragments joined by single
        spaces. Non-string items and fragments that are empty after stripping
        are skipped, so the result never contains doubled spaces.
        For a string: the stripped string.
        For anything else: the empty string.
    """
    if isinstance(desc, str):
        return desc.strip()
    if isinstance(desc, (list, tuple)):
        fragments = (d.strip() for d in desc if isinstance(d, str))
        # Dropping blank fragments keeps the joined text free of "  " runs.
        return " ".join(fragment for fragment in fragments if fragment)
    return ""

# Replace each description with its flattened single-string form.
flattened_descriptions = df_meta_clean['product_description'].apply(flatten_description)
df_meta_clean['product_description'] = flattened_descriptions

## EMBEDDING APPROACH

In [12]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

from torch.utils.data import Dataset
from transformers import CLIPProcessor,CLIPModel
from PIL import Image
import requests
from io import BytesIO

In [13]:
from utils import load_and_clean_data,get_model,generate_embeddings,save_embeddings,build_faiss_index
# Output directory handed to the model/embedding/index helpers below;
# created up front, and left as-is when it already exists.
SAVE_DIR = "artifacts_zeroshot"
os.makedirs(name=SAVE_DIR, exist_ok=True)

In [14]:
# NOTE(review): no visible cell writes product_data.csv or creates the
# "product_text" column that ProductCLIPDataset reads — presumably
# load_and_clean_data (from utils) takes care of both; confirm.
product_csv_path = "product_data.csv"
prod_data = load_and_clean_data(product_csv_path)

In [15]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import requests
from io import BytesIO
from transformers import CLIPProcessor

class ProductCLIPDataset(Dataset):
    """Pairs each product's text with its downloaded image for CLIP.

    ``__getitem__`` returns the raw text and a PIL image; tokenization and
    image preprocessing are done per batch in ``collate_fn``.
    NOTE(review): ``collate_fn`` is presumably handed to a DataLoader by the
    code that consumes this dataset — confirm against the caller.
    """

    # Blank placeholder returned when an image cannot be downloaded or decoded.
    FALLBACK_SIZE = (224, 224)
    FALLBACK_COLOR = (255, 255, 255)

    def __init__(self, df, model_name="openai/clip-vit-base-patch32", timeout=10):
        """Read texts and image URLs from ``df`` and load the CLIP processor.

        Args:
            df: DataFrame with "product_text" and "product_image_url" columns.
            model_name: Identifier passed to ``CLIPProcessor.from_pretrained``.
            timeout: Seconds to wait on each image request before giving up.
        """
        self.texts = df["product_text"].tolist()
        self.urls = df["product_image_url"].tolist()
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.timeout = timeout

    def __len__(self):
        """Number of products in the dataset."""
        return len(self.texts)

    def _load_image(self, url):
        """Download ``url`` and decode it as RGB, or return the blank fallback."""
        try:
            # The context manager releases the connection; the timeout keeps a
            # stalled server from hanging the whole embedding run.
            with requests.get(url, timeout=self.timeout) as response:
                response.raise_for_status()
                # Decode from .content rather than .raw: .raw skips the
                # content-encoding handling that requests applies to .content.
                return Image.open(BytesIO(response.content)).convert("RGB")
        except Exception:
            # Best-effort by design: any download/decode failure yields a blank
            # image. `Exception` (not a bare except) keeps Ctrl-C working.
            return Image.new("RGB", self.FALLBACK_SIZE, color=self.FALLBACK_COLOR)

    def __getitem__(self, idx):
        """Return ``{"text": str, "image": PIL.Image}`` for product ``idx``."""
        return {
            "text": self.texts[idx],
            "image": self._load_image(self.urls[idx]),
        }

    def collate_fn(self, batch):
        """Turn a list of ``__getitem__`` items into batched model inputs.

        Returns:
            Dict with "input_ids", "attention_mask" and "pixel_values", taken
            from the processor's tokenizer and image processor outputs.
        """
        texts = [ex["text"] for ex in batch]
        images = [ex["image"] for ex in batch]

        # Tokenize text
        tokenized = self.processor.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )

        # Process images
        image_inputs = self.processor.image_processor(
            images,
            return_tensors="pt"
        )

        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "pixel_values": image_inputs["pixel_values"]
        }


In [16]:
# Obtain the zero-shot CLIP model via the project helper and report its class.
model_zs = get_model(approach="zero_shot", save_dir=SAVE_DIR)
model_class_name = model_zs.__class__.__name__
print("Loaded model:", model_class_name)

Loaded model: CLIPModel


In [18]:
# Embed every product's text and image with the zero-shot model.
# The recorded run below took roughly 1h15 for 2753 batches.
dataset = ProductCLIPDataset(prod_data)
text_embs, image_embs = generate_embeddings(
    model_zs,
    dataset,
    batch_size=32,
)
embedding_shapes = (text_embs.shape, image_embs.shape)
print("Generated embeddings:", *embedding_shapes)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Generating embeddings: 100%|██████████████| 2753/2753 [1:14:48<00:00,  1.63s/it]


Generated embeddings: torch.Size([88083, 512]) torch.Size([88083, 512])


In [37]:
# Persist the per-modality embeddings.
save_embeddings(text_embs, image_embs, SAVE_DIR)

# Fuse the modalities: average text and image embeddings per product, then
# normalize along the last dimension before indexing.
combined = F.normalize(
    (text_embs + image_embs).div(2),
    dim=-1,
)
index_path = build_faiss_index(combined, SAVE_DIR)
print("FAISS index saved to:", index_path)

FAISS index saved to: artifacts_zeroshot/faiss.index


In [None]:
## SAMPLE TESTING
# NOTE(review): unified_query is neither defined nor imported in the visible
# cells — presumably it lives in utils next to the other helpers; confirm and
# import it so this cell survives a fresh kernel.

q_text = "photo finish Professional airbrush makeup"
q_img  = "https://temptupro.com/cdn/shop/products/s-one-essential-airbrush-kit-hero_2.jpg?v=1743181132&width=1780"

# Settings shared by the three sample queries, and the columns shown per result.
sample_query_kwargs = {"save_dir": SAVE_DIR, "k": 5}
shown_cols = ["product_title", "product_image_url"]

# Text-only query
recs_text, scores_text = unified_query(
    input_text=q_text,
    input_image_path=None,
    **sample_query_kwargs,
)
print("Text-only recommendations:")
print(recs_text[shown_cols], scores_text)

# Image-only query
recs_img, scores_img = unified_query(
    input_text=None,
    input_image_path=q_img,
    **sample_query_kwargs,
)
print("Image-only recommendations:")
print(recs_img[shown_cols], scores_img)

# Combined text + image query
recs_both, scores_both = unified_query(
    input_text=q_text,
    input_image_path=q_img,
    **sample_query_kwargs,
)
print("Text+Image recommendations:")
print(recs_both[shown_cols], scores_both)

In [None]:
import pandas as pd

# 1) Load the evaluation queries
df_queries = pd.read_excel("Amazon_recom_queries.xlsx")

# 2) Group by unique query text, collecting Amazon's ground-truth rows as lists
amazon_grouped = df_queries.groupby("Queries").agg({
    "Product_title":       list,
    "Product_description": list,
    "Product_link":        list,
    "Image_link":          list
}).reset_index()

# 3) Prepare columns to hold the model's recommendations. Initialising them
#    with None makes them object-dtype, so a whole list fits into one cell.
for rec_col in (
    "Model_rec_titles",
    "Model_rec_descriptions",
    "Model_rec_links",
    "Model_rec_scores",
):
    amazon_grouped[rec_col] = None

# 4) For each unique query, run unified_query and store the top-K recs + scores
# NOTE(review): unified_query is neither defined nor imported in the visible
# cells — presumably it comes from utils; confirm.
for i, row in amazon_grouped.iterrows():
    q = row["Queries"]
    img_url = row["Image_link"][0]  # use the first image for that query

    # Pass save_dir explicitly, as the sample queries do, so the evaluation
    # reads the artifacts in SAVE_DIR instead of whatever the helper defaults to.
    recs, scores = unified_query(
        input_text=q,
        input_image_path=img_url,
        save_dir=SAVE_DIR,
        k=5,
    )

    # Extract the wanted fields from the returned DataFrame
    amazon_grouped.at[i, "Model_rec_titles"]       = recs["product_title"].tolist()
    amazon_grouped.at[i, "Model_rec_descriptions"] = recs["product_description"].tolist()
    # Product links are optional in the metadata: fall back to an empty list
    # (avoids building a dtype-less empty Series just to call .tolist()).
    amazon_grouped.at[i, "Model_rec_links"]        = (
        recs["product_link"].tolist() if "product_link" in recs else []
    )
    amazon_grouped.at[i, "Model_rec_scores"]       = scores.tolist()

# 5) Inspect
amazon_grouped.head()


In [None]:
# Per-query mean of the similarity scores stored for the model's recommendations.
amazon_grouped["mean_similarity@5"] = amazon_grouped["Model_rec_scores"].apply(
    lambda rec_scores: np.mean(rec_scores)
)

In [None]:
# Average the per-query mean similarity over all queries and display it.
per_query_means = amazon_grouped["mean_similarity@5"]
overall_mean = per_query_means.mean()
overall_mean

In [None]:
# Write the grouped queries together with the model's recommendations to SAVE_DIR.
recommendations_csv = os.path.join(SAVE_DIR, "model_recommendations.csv")
amazon_grouped.to_csv(recommendations_csv)