In [1]:
! pip install transformers datasets peft faiss-cpu



In [2]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel
from peft import get_peft_model, LoraConfig, TaskType
from PIL import Image
from torchvision import transforms
import faiss
import requests
from io import BytesIO

In [3]:
# Product metadata: "raw_meta_All_Beauty" config of Amazon Reviews 2023, "full" split.
# NOTE(review): trust_remote_code=True lets the datasets library execute code
# shipped with the dataset repository — confirm that is acceptable here.
product_meta_data = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_All_Beauty",
    split="full",
    trust_remote_code=True,
)

# Materialise as a DataFrame; every column name gets a "product_" prefix.
df_meta = (
    pd.DataFrame
    .from_records(product_meta_data)
    .add_prefix("product_")
)

In [4]:
# Sanity check: (rows, columns) of the raw product metadata frame.
df_meta.shape

(112590, 16)

In [5]:
# Review records: "raw_review_All_Beauty" config of the same dataset, "full" split.
# NOTE(review): trust_remote_code=True lets the datasets library execute code
# shipped with the dataset repository — confirm that is acceptable here.
review_data = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    split="full",
    trust_remote_code=True,
)

# Materialise as a DataFrame; every column name gets a "review_" prefix.
df_review = (
    pd.DataFrame
    .from_records(review_data)
    .add_prefix("review_")
)

In [6]:
# Keep only products whose parent ASIN appears in at least one review row.
has_review = df_meta['product_parent_asin'].isin(df_review['review_parent_asin'])
df_meta_clean = df_meta.loc[has_review]

In [7]:
# Columns to clean
target_cols = ['product_title', 'product_description', 'product_images']

# 1) Drop NaNs and literal None's
df_meta_clean = df_meta_clean.dropna(subset=target_cols)

# Element-wise `is None` test, done column by column with Series.map.
# DataFrame.applymap is deprecated in recent pandas releases; Series.map
# behaves the same here and is available on both old and new versions.
is_none = df_meta_clean[target_cols].apply(
    lambda col: col.map(lambda x: x is None)
)
df_meta_clean = df_meta_clean[~is_none.any(axis=1)]

  .applymap(lambda x: x is None).any(axis=1)]


In [8]:
# Sanity check: (rows, columns) remaining after dropping missing values.
df_meta_clean.shape

(112565, 16)

In [9]:
# 2) Define what an “invalid” string is
#    Placeholder values treated as "no data"; the validators in this cell
#    compare against this set after applying strip().lower().
invalid_strs = {'', 'n/a', 'none', 'na'}

# 3) Validator for product_images
def images_valid(img_dict):
    """Tell whether an images mapping holds at least one usable URL.

    Only the 'hi_res', 'large' and 'thumb' entries are inspected, and only
    when the entry is a list or tuple. A URL is usable when it is a string
    whose stripped, lower-cased form is not in the module-level
    ``invalid_strs`` set.

    Returns:
        True if a usable URL exists, False otherwise (including when
        ``img_dict`` is not a dict).
    """
    if not isinstance(img_dict, dict):
        return False

    for size_key in ('hi_res', 'large', 'thumb'):
        candidates = img_dict.get(size_key, [])
        # Non-sequence payloads (e.g. None or a bare string) are ignored.
        if isinstance(candidates, (list, tuple)) and any(
            isinstance(candidate, str)
            and candidate.strip().lower() not in invalid_strs
            for candidate in candidates
        ):
            return True
    return False

# 4) General validator for text fields
def text_valid(x):
    """Return True when ``x`` is a string that is not a known placeholder.

    The value is stripped and lower-cased before being checked against the
    module-level ``invalid_strs`` set; any non-string yields False.
    """
    if not isinstance(x, str):
        return False
    normalized = x.strip().lower()
    return normalized not in invalid_strs

# 5) Apply validators
#   - title must pass text_valid
#   - images must pass images_valid
#   - the text_valid check on product_description is currently disabled
title_ok = df_meta_clean['product_title'].apply(text_valid)
images_ok = df_meta_clean['product_images'].apply(images_valid)

# Keep rows passing both checks and renumber the index from zero.
df_meta_clean = df_meta_clean.loc[title_ok & images_ok].reset_index(drop=True)

print(f"After cleaning: {len(df_meta_clean)} rows")

After cleaning: 112553 rows


In [10]:
# Preview the three cleaned columns (title, description, images).
df_meta_clean[target_cols]

Unnamed: 0,product_title,product_description,product_images
0,"Howard LC0008 Leather Conditioner, 8-Ounce (4-...",[],"{'hi_res': [None, 'https://m.media-amazon.com/..."
1,Yes to Tomatoes Detoxifying Charcoal Cleanser ...,[],{'hi_res': ['https://m.media-amazon.com/images...
2,Eye Patch Black Adult with Tie Band (6 Per Pack),[],"{'hi_res': [None, None], 'large': ['https://m...."
3,"Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4...",[],{'hi_res': ['https://m.media-amazon.com/images...
4,Precision Plunger Bars for Cartridge Grips – 9...,[The Precision Plunger Bars are designed to wo...,"{'hi_res': [None], 'large': ['https://m.media-..."
...,...,...,...
112548,"TOPREETY 24""120gr 3/4 Full Head clip in hair e...",[],{'hi_res': ['https://m.media-amazon.com/images...
112549,"Pets Playmate Pet Grooming Glove,Gentle Deshed...",[],{'hi_res': ['https://m.media-amazon.com/images...
112550,[10Pack] Makeup Brushes Set Cosmetics Tools Ki...,[],{'hi_res': ['https://m.media-amazon.com/images...
112551,Xcoser Pretty Party Anna Wig Hair Tails Hair S...,[],{'hi_res': ['https://m.media-amazon.com/images...


In [11]:
def extract_first_valid_image(images_dict):
    """Return the first usable image URL from a product's images mapping.

    Entries are searched in preference order 'hi_res', 'large', 'thumb'.
    Within each entry the first string whose stripped, lower-cased form is
    not a placeholder ('', 'none', 'n/a', 'na') wins.

    Args:
        images_dict: Mapping from size key to a list or tuple of URLs.

    Returns:
        The URL with surrounding whitespace removed, or None when
        ``images_dict`` is not a dict or holds no usable URL.
    """
    if not isinstance(images_dict, dict):
        return None

    # Keys we care about, in order of preference
    image_keys = ('hi_res', 'large', 'thumb')
    invalid_strs = {'', 'none', 'n/a', 'na'}

    for key in image_keys:
        urls = images_dict.get(key, [])
        # Accept tuples as well as lists, matching images_valid; otherwise a
        # row that passed validation via a tuple would yield None here and
        # be dropped by the notnull filter that follows.
        if not isinstance(urls, (list, tuple)):
            continue
        for url in urls:
            if isinstance(url, str) and url.strip().lower() not in invalid_strs:
                return url.strip()
    return None

# New column product_image_url: the first usable URL found for each product.
df_meta_clean['product_image_url'] = (
    df_meta_clean['product_images'].apply(extract_first_valid_image)
)

# Safety net: discard rows for which no URL was extracted, then renumber.
df_meta_clean = (
    df_meta_clean
    .dropna(subset=['product_image_url'])
    .reset_index(drop=True)
)

print(f"Final dataset with extracted image URLs: {len(df_meta_clean)} rows")


Final dataset with extracted image URLs: 112553 rows


In [12]:
def flatten_description(desc):
    """Collapse a description value into one stripped string.

    A string is returned stripped. A list has its string items stripped and
    joined with single spaces (non-string items are skipped), and the joined
    result is stripped again. Any other input gives the empty string.
    """
    if isinstance(desc, str):
        return desc.strip()
    if not isinstance(desc, list):
        return ""
    pieces = (part.strip() for part in desc if isinstance(part, str))
    return " ".join(pieces).strip()

# Overwrite product_description with its flattened, single-string form.
df_meta_clean['product_description'] = df_meta_clean['product_description'].apply(flatten_description)

In [13]:
# Three-column view (title, description, image URL) with null rows removed.
# NOTE(review): the later cells in this notebook read df_meta_clean, not final_data.
final_data = df_meta_clean[['product_title','product_description','product_image_url']].dropna()

In [14]:
# Spot-check one random row (no random_state is set, so the row varies per run).
final_data.sample()

Unnamed: 0,product_title,product_description,product_image_url
97378,EZTAT2 Delta Trilateral Premium Tattoo Foot Pe...,,https://m.media-amazon.com/images/I/51ovqLcqhU...


In [15]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import requests
from io import BytesIO
from tqdm import tqdm

# Device setup: use CUDA when torch reports it as available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained CLIP checkpoint and its matching processor.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Move the weights to the chosen device and switch to evaluation mode.
model = model.to(device)
model.eval()
print(device)
# Sample batched embedding extractor
def encode_batch(texts, image_urls):
    """Embed a batch of texts and their images with the loaded CLIP model.

    Each URL is downloaded (10 s timeout) and decoded to RGB. If the
    download or decoding fails, a blank white 224x224 image is substituted
    so the batch keeps one image per text.

    Args:
        texts: Text strings to embed.
        image_urls: Image URLs, one per entry in ``texts``.

    Returns:
        Tuple ``(text_embeds, image_embeds)`` taken from the model output
        and moved to the CPU.
    """
    images = []
    for url in image_urls:
        try:
            response = requests.get(url, timeout=10)
            img = Image.open(BytesIO(response.content)).convert("RGB")
        except Exception:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop a long-running loop.
            img = Image.new("RGB", (224, 224), color='white')  # fallback
        images.append(img)

    inputs = processor(text=texts, images=images, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.text_embeds.cpu(), outputs.image_embeds.cpu()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


cuda


In [16]:
batch_size = 32
text_embeddings, image_embeddings = [], []

# Walk the cleaned frame in consecutive slices of batch_size rows.
for start in tqdm(range(0, len(df_meta_clean), batch_size)):
    chunk = df_meta_clean.iloc[start:start + batch_size]
    # The text fed to CLIP is "<title> <description>" for each product.
    captions = (chunk['product_title'] + " " + chunk['product_description']).tolist()
    chunk_urls = chunk['product_image_url'].tolist()
    chunk_text_emb, chunk_image_emb = encode_batch(captions, chunk_urls)
    text_embeddings.append(chunk_text_emb)
    image_embeddings.append(chunk_image_emb)

# Concatenate the per-batch tensors into one tensor per modality.
text_embeddings = torch.cat(text_embeddings)
image_embeddings = torch.cat(image_embeddings)

100%|█████████████████████████████████████| 3518/3518 [6:59:20<00:00,  7.15s/it]


In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
def recommend(query_text=None, query_image_url=None, alpha=0.5, top_k=5):
    """Rank catalogue products against a text and/or image query.

    The query is embedded with the loaded CLIP model and compared by cosine
    similarity against the precomputed ``text_embeddings`` and
    ``image_embeddings``. The two similarity vectors are blended with
    ``alpha``:

    * text and image given: ``alpha * image_sim + (1 - alpha) * text_sim``,
      each query compared with the bank of its own modality;
    * text only: same formula, the text vector compared with both banks;
    * image only: ``alpha * text_sim + (1 - alpha) * image_sim``, the image
      vector compared with both banks (here ``alpha`` weights the text bank).

    Args:
        query_text: Optional query string.
        query_image_url: Optional URL of a query image (10 s timeout).
        alpha: Blend weight, see above.
        top_k: Number of rows to return.

    Returns:
        DataFrame with ``product_title``, ``product_image_url`` and
        ``similarity_score``, highest score first, index reset.

    Raises:
        AssertionError: if neither ``query_text`` nor ``query_image_url``
            is provided.
    """
    assert query_text or query_image_url, "Need at least text or image"

    text_vec = None
    image_vec = None

    if query_text:
        text_inputs = processor(text=query_text, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            text_vec = model.get_text_features(**text_inputs).cpu().numpy()  # [1, D]

    if query_image_url:
        response = requests.get(query_image_url, timeout=10)
        picture = Image.open(BytesIO(response.content)).convert("RGB")
        image_inputs = processor(images=picture, return_tensors="pt").to(device)
        with torch.no_grad():
            image_vec = model.get_image_features(**image_inputs).cpu().numpy()  # [1, D]

    image_only = text_vec is None

    # When a modality is missing, the available query vector stands in for it.
    query_for_text_bank = image_vec if image_only else text_vec
    query_for_image_bank = text_vec if image_vec is None else image_vec

    sim_text = cosine_similarity(query_for_text_bank, text_embeddings.numpy())[0]
    sim_image = cosine_similarity(query_for_image_bank, image_embeddings.numpy())[0]

    # alpha weights the image bank, except for image-only queries where it
    # weights the text bank.
    if image_only:
        text_weight, image_weight = alpha, 1 - alpha
    else:
        text_weight, image_weight = 1 - alpha, alpha
    sims = image_weight * sim_image + text_weight * sim_text

    # Indices of the top_k highest scores, best first.
    top_idx = sims.argsort()[::-1][:top_k]
    results = df_meta_clean.iloc[top_idx][['product_title', 'product_image_url']].copy()
    results['similarity_score'] = sims[top_idx]
    return results.reset_index(drop=True)

In [19]:
import numpy as np

# Persist the embedding matrices and the matching product table.
# Tensor.numpy() converts each concatenated CPU tensor in one step;
# np.stack(tensor) would iterate the tensor row by row to build the same
# array. This also matches how recommend() converts the tensors.
np.save("product_text_embeddings.npy", text_embeddings.numpy())
np.save("product_image_embeddings.npy", image_embeddings.numpy())
df_meta_clean.to_csv("product_data.csv", index=False)

In [35]:
# Demo: text-only query with the default alpha and top_k.
recommend(query_text="Anti-Aging Cream")

Unnamed: 0,product_title,product_image_url,similarity_score
0,Pure Face Anti-Aging Cream,https://m.media-amazon.com/images/I/41jEY8spj8...,0.622675
1,NaturaCel Anti-Aging Cream,https://m.media-amazon.com/images/I/51WnaCH-g-...,0.60763
2,Enhanced Night Cream,https://m.media-amazon.com/images/I/51adDpjO5C...,0.603435
3,JuvaLux Anti Aging Cream,https://m.media-amazon.com/images/I/31xYwHb56O...,0.602942
4,Natural Being Anti-Aging Oily Skin Night Cream,https://m.media-amazon.com/images/I/517lEwhCpG...,0.598397


In [36]:
# Image used as the query in the image-based demos below.
query_image_url = "https://m.media-amazon.com/images/I/41pH66O0gcL._AC_.jpg"

In [37]:
# Demo: image-only query with the default alpha and top_k.
recommend(query_image_url=query_image_url)

Unnamed: 0,product_title,product_image_url,similarity_score
0,"Brightening Moisturizing Cream, 1.7 fl oz (50 ...",https://m.media-amazon.com/images/I/61zzz+YOM4...,0.537231
1,I WOKE UP LIKE THIS Purifying Skin Balancing C...,https://m.media-amazon.com/images/I/51eziubITB...,0.535954
2,Brightening Dynamics Illuminate,https://m.media-amazon.com/images/I/31zgEeXNAL...,0.531672
3,Ayur-Medic Calming Cream for Polysensitive Skin,https://m.media-amazon.com/images/I/71WoRNwkjF...,0.530686
4,Ayur-Medic Calming Cream for Polysensitive Skin,https://m.media-amazon.com/images/I/71WoRNwkjF...,0.530686


In [38]:
# Demo: combined text + image query with the default alpha and top_k.
recommend(query_text="Anti-Aging Cream", query_image_url= query_image_url)

Unnamed: 0,product_title,product_image_url,similarity_score
0,Pure Face Anti-Aging Cream,https://m.media-amazon.com/images/I/41jEY8spj8...,0.826946
1,Best Wrinkle Cream For Deep Wrinkles - Anti-Wr...,https://m.media-amazon.com/images/I/81TH-WWhHC...,0.815251
2,Selected Cosmetics - Eye Lift Cream,https://m.media-amazon.com/images/I/51QgHsAy7Y...,0.809106
3,NaturaCel Anti-Aging Cream,https://m.media-amazon.com/images/I/51WnaCH-g-...,0.807874
4,Natural Being Anti-Aging Oily Skin Night Cream,https://m.media-amazon.com/images/I/517lEwhCpG...,0.807649
