In [1]:
! pip install transformers datasets peft faiss-cpu



In [14]:
import pandas as pd
import torch
from datasets import load_dataset
from transformers import CLIPProcessor, CLIPModel
from peft import get_peft_model, LoraConfig, TaskType
from PIL import Image
from torchvision import transforms
import faiss
import requests
from io import BytesIO

In [3]:
# Load the metadata split from Amazon Reviews 2023
product_meta_data = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", split="full", trust_remote_code=True)
df_meta = pd.DataFrame.from_records(product_meta_data).add_prefix("product_")

In [4]:
df_meta.shape

(112590, 16)

In [5]:
review_data = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty",  split="full", trust_remote_code=True)
df_review = pd.DataFrame.from_records(review_data).add_prefix("review_")

In [6]:
df_meta_clean = df_meta[df_meta['product_parent_asin'].isin(df_review['review_parent_asin'])]

In [7]:
# Columns to clean
target_cols = ['product_title', 'product_description', 'product_images']

# 1) Drop NaNs and literal None’s
df_meta_clean = df_meta_clean.dropna(subset=target_cols)
df_meta_clean = df_meta_clean[~df_meta_clean[target_cols]
    .applymap(lambda x: x is None).any(axis=1)]

  .applymap(lambda x: x is None).any(axis=1)]


In [8]:
df_meta_clean.shape

(112565, 16)

In [9]:
# 2) Define what an “invalid” string is
invalid_strs = {'', 'n/a', 'none', 'na'}

# 3) Validator for product_images
def images_valid(img_dict):
    if not isinstance(img_dict, dict):
        return False
    # only consider these keys for actual URLs
    for key in ('hi_res', 'large', 'thumb'):
        urls = img_dict.get(key, [])
        if not isinstance(urls, (list, tuple)):
            continue
        for url in urls:
            if isinstance(url, str) and url.strip().lower() not in invalid_strs:
                return True
    return False

# 4) General validator for text fields
def text_valid(x):
    return isinstance(x, str) and x.strip().lower() not in invalid_strs

# 5) Apply validators
#   - title & description must pass text_valid
#   - images must pass images_valid
df_meta_clean = df_meta_clean[
    df_meta_clean['product_title'].apply(text_valid) &
    #df_meta_clean['product_description'].apply(text_valid) &
    df_meta_clean['product_images'].apply(images_valid)
].reset_index(drop=True)

print(f"After cleaning: {len(df_meta_clean)} rows")  

After cleaning: 112553 rows


In [10]:
df_meta_clean[target_cols]

Unnamed: 0,product_title,product_description,product_images
0,"Howard LC0008 Leather Conditioner, 8-Ounce (4-...",[],"{'hi_res': [None, 'https://m.media-amazon.com/..."
1,Yes to Tomatoes Detoxifying Charcoal Cleanser ...,[],{'hi_res': ['https://m.media-amazon.com/images...
2,Eye Patch Black Adult with Tie Band (6 Per Pack),[],"{'hi_res': [None, None], 'large': ['https://m...."
3,"Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4...",[],{'hi_res': ['https://m.media-amazon.com/images...
4,Precision Plunger Bars for Cartridge Grips – 9...,[The Precision Plunger Bars are designed to wo...,"{'hi_res': [None], 'large': ['https://m.media-..."
...,...,...,...
112548,"TOPREETY 24""120gr 3/4 Full Head clip in hair e...",[],{'hi_res': ['https://m.media-amazon.com/images...
112549,"Pets Playmate Pet Grooming Glove,Gentle Deshed...",[],{'hi_res': ['https://m.media-amazon.com/images...
112550,[10Pack] Makeup Brushes Set Cosmetics Tools Ki...,[],{'hi_res': ['https://m.media-amazon.com/images...
112551,Xcoser Pretty Party Anna Wig Hair Tails Hair S...,[],{'hi_res': ['https://m.media-amazon.com/images...


In [11]:
def extract_first_valid_image(images_dict):
    if not isinstance(images_dict, dict):
        return None

    # Keys we care about, in order of preference
    image_keys = ['hi_res', 'large', 'thumb']
    invalid_strs = {'', 'none', 'n/a', 'na'}

    for key in image_keys:
        urls = images_dict.get(key, [])
        if not isinstance(urls, list):
            continue
        for url in urls:
            if isinstance(url, str) and url.strip().lower() not in invalid_strs:
                return url.strip()
    return None

# Apply it to create a new column: product_image_url
df_meta_clean['product_image_url'] = df_meta_clean['product_images'].apply(extract_first_valid_image)

# Optional: drop rows where no valid image could be extracted (just in case)
df_meta_clean = df_meta_clean[df_meta_clean['product_image_url'].notnull()].reset_index(drop=True)

print(f"Final dataset with extracted image URLs: {len(df_meta_clean)} rows")


Final dataset with extracted image URLs: 112553 rows


In [12]:
def flatten_description(desc):
    if isinstance(desc, list):
        return " ".join([d.strip() for d in desc if isinstance(d, str)]).strip()
    elif isinstance(desc, str):
        return desc.strip()
    return ""

df_meta_clean['product_description'] = df_meta_clean['product_description'].apply(flatten_description)

In [13]:
final_data = df_meta_clean[['product_title','product_description','product_image_url']].dropna()

In [14]:
final_data.sample()

Unnamed: 0,product_title,product_description,product_image_url
97378,EZTAT2 Delta Trilateral Premium Tattoo Foot Pe...,,https://m.media-amazon.com/images/I/51ovqLcqhU...


In [15]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import requests
from io import BytesIO
from tqdm import tqdm

# Load CLIP base model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.eval()

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(device)
# Sample batched embedding extractor
def encode_batch(texts, image_urls):
    images = []
    for url in image_urls:
        try:
            img = Image.open(BytesIO(requests.get(url, timeout=10).content)).convert("RGB")
            images.append(img)
        except:
            images.append(Image.new("RGB", (224, 224), color='white'))  # fallback

    inputs = processor(text=texts, images=images, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.text_embeds.cpu(), outputs.image_embeds.cpu()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


cuda


In [16]:
text_embeddings = []
image_embeddings = []

batch_size = 32
for i in tqdm(range(0, len(df_meta_clean), batch_size)):
    batch = df_meta_clean.iloc[i:i+batch_size]
    texts = (batch['product_title'] + " " + batch['product_description']).tolist()
    urls = batch['product_image_url'].tolist()
    text_emb, img_emb = encode_batch(texts, urls)
    text_embeddings.append(text_emb)
    image_embeddings.append(img_emb)

# Concatenate everything
text_embeddings = torch.cat(text_embeddings)
image_embeddings = torch.cat(image_embeddings)

100%|█████████████████████████████████████| 3518/3518 [6:59:20<00:00,  7.15s/it]


In [25]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
def recommend(query_text=None, query_image_url=None, alpha=0.5, top_k=5):
    assert query_text or query_image_url, "Need at least text or image"

    t_emb, i_emb = None, None
    
    if query_text:
        t_in = processor(text=query_text, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            t_emb = model.get_text_features(**t_in).cpu().numpy()  # [1, D]

    if query_image_url:
        img = Image.open(BytesIO(requests.get(query_image_url, timeout=10).content)).convert("RGB")
        i_in = processor(images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            i_emb = model.get_image_features(**i_in).cpu().numpy()  # [1, D]

    sims = None

    if t_emb is not None and i_emb is not None:
        sim_text = cosine_similarity(t_emb, text_embeddings.numpy())[0]
        sim_image = cosine_similarity(i_emb, image_embeddings.numpy())[0]
        sims = alpha * sim_image + (1 - alpha) * sim_text

    elif t_emb is not None:
        sim_text = cosine_similarity(t_emb, text_embeddings.numpy())[0]
        sim_image = cosine_similarity(t_emb, image_embeddings.numpy())[0]
        sims = alpha * sim_image + (1 - alpha) * sim_text

    elif i_emb is not None:
        sim_text = cosine_similarity(i_emb, text_embeddings.numpy())[0]
        sim_image = cosine_similarity(i_emb, image_embeddings.numpy())[0]
        sims = alpha * sim_text + (1 - alpha) * sim_image

    idxs = sims.argsort()[::-1][:top_k]
    results = df_meta_clean.iloc[idxs][['product_title', 'product_image_url']].copy()
    results['similarity_score'] = sims[idxs]
    return results.reset_index(drop=True)

In [19]:
import numpy as np
np.save("product_text_embeddings.npy", np.stack(text_embeddings))
np.save("product_image_embeddings.npy", np.stack(image_embeddings))
df_meta_clean.to_csv("product_data.csv", index=False)

### Testing

In [71]:
import numpy as np
import pandas as pd

# Load embeddings
text_embeddings = np.load("product_text_embeddings.npy")
image_embeddings = np.load("product_image_embeddings.npy")

# Load metadata
df_meta = pd.read_csv("product_data.csv")

# Load Amazon recommendation queries
df_queries = pd.read_excel("Amazon_recom_queries.xlsx")

In [31]:
import pandas as pd

# Load the dataset
df = pd.read_csv("product_data.csv")

# Fill missing values
df["product_title"] = df["product_title"].fillna("")
df["product_description"] = df["product_description"].fillna("")

# Define keywords for filtering product titles and descriptions
keywords = ['nail', 'shampoo', 'conditioner', 'eye', 'lip', 'ear', 'nose', 'beauty', 'cosmetic', 'hair', 'skin','hand', 'leg', 'oil' ,'makeup', 'lotion', 'cream', 'cleanser', 'moisturizer']

# Filter rows where product_title or product_description contains any of the keywords
def contains_keywords(text, keywords):
    return any(keyword.lower() in text.lower() for keyword in keywords)

# Apply the filter
df_filtered = df[df["product_title"].apply(lambda x: contains_keywords(x, keywords)) | 
                 df["product_description"].apply(lambda x: contains_keywords(x, keywords))]

# Clean and format product text
def create_product_text(row):
    title = row["product_title"].strip()
    description = row["product_description"].strip()
    
    # If there's a description, include it; otherwise, just include the title
    if description:
        full_text = f"Product title is: {title}\nProduct description is: {description}"
    else:
        full_text = f"Product title is: {title}"
    
    return full_text[:512]  # limit text to 512 characters (adjust length if needed)

df_filtered["product_text"] = df_filtered.apply(create_product_text, axis=1)

# Keep only rows where product_text and product_image_url are not empty
df_cleaned = df_filtered[
    df_filtered["product_text"].str.strip().astype(bool) & 
    df_filtered["product_image_url"].str.strip().astype(bool)
].reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["product_text"] = df_filtered.apply(create_product_text, axis=1)


In [23]:
import faiss
import torch
import os
import torch.nn.functional as F
import numpy as np

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.eval()

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(device)

# Assuming you already have this:
combined_embs = (text_embeddings + image_embeddings) / 2

# Create FAISS index (inner product == cosine similarity if normalized)
index = faiss.IndexFlatIP(combined_embs.shape[1])
index.add(combined_embs)

# Create save dir if needed
SAVE_DIR = "artifacts_RAG"
os.makedirs(SAVE_DIR, exist_ok=True)

# Save the index
faiss.write_index(index, os.path.join(SAVE_DIR, "faiss_index_combined.index"))

print("FAISS index created and saved.")


cuda
FAISS index created and saved.


In [32]:
MODEL_NAME = "openai/clip-vit-base-patch32"

In [51]:
def unified_query(input_text=None, input_image_path=None, k=5):
    assert input_text or input_image_path, "Provide at least text or image input"

    inputs = {}
    if input_text:
        inputs.update({"text": input_text})
    if input_image_path:
        if input_image_path.startswith("http"):
            response = requests.get(input_image_path)
            image = Image.open(BytesIO(response.content)).convert("RGB")
        else:
            image = Image.open(input_image_path).convert("RGB")
        inputs.update({"images": image})

    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    encoded = processor(return_tensors="pt", padding=True, truncation=True, **inputs)
    encoded = {k: v.to(device) for k, v in encoded.items()}

    with torch.no_grad():
        if input_text and input_image_path:
            text_emb = model.get_text_features(input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"])
            image_emb = model.get_image_features(pixel_values=encoded["pixel_values"])
            query_emb =(text_emb + image_emb) / 2
        elif input_text:
            text_emb = model.get_text_features(input_ids=encoded["input_ids"], attention_mask=encoded["attention_mask"])
            query_emb = text_emb
        else:
            image_emb = model.get_image_features(pixel_values=encoded["pixel_values"])
            query_emb = image_emb

    # FAISS expects NumPy array in float32
    query_emb = F.normalize(query_emb, dim=-1)
    query_np = query_emb.cpu().numpy().astype("float32")
    faiss_index = faiss.read_index("artifacts_RAG/faiss_index_combined.index")
    # Perform the search
    scores, indices = faiss_index.search(query_np, k)
    top_items = df.iloc[indices[0]]
    top_scores = scores[0]

    return top_items, top_scores

In [45]:
unified_query(input_text="photo finish Professional airbrush makeup")

(                                           product_title product_description
 21870  Photo Finish Professional Airbrush Cosmetic Ma...                    
 16937   Color On Professional Eye Shadow Smokey Classics                    
 53700                      Waterproof Long Lasting Color                    
 42615  Fashion Fair Oil Free Perfect FinishÂ Cream-to...                    
 63760  Eye makeup-mascara,essence mascar,Mascara to B...                    ,
 array([0.55085355, 0.5320103 , 0.5204488 , 0.5203953 , 0.51483893],
       dtype=float32))

In [46]:
query_image_url = "https://temptupro.com/cdn/shop/products/s-one-essential-airbrush-kit-hero_2.jpg?v=1743181132&width=1780"

In [47]:
unified_query(input_image_path=query_image_url)

(                                           product_title product_description
 79541  Sephora Favorites Beauty Vault: 12 Days of Mak...                    
 48089  4 pcs Professional Eye Makeup Cosmetic Set Con...                    
 98878     Kapitza x Clinique 7 Piece Gift Set. $77 Value                    
 91989  Sephora Favorites LIP TEMPTATIONS Lip Set Beau...                    
 9033   RoseFlower 26Pcs Makeup Cosmetic All-in-One Ho...                    ,
 array([0.5736082, 0.5610577, 0.559578 , 0.5566915, 0.555905 ],
       dtype=float32))

In [48]:
unified_query(input_text="airbrush makeup kit with compressor", input_image_path= query_image_url)

(                                           product_title product_description
 2616                     ETA Shell All in One Makeup Kit                    
 21870  Photo Finish Professional Airbrush Cosmetic Ma...                    
 49561  Makeup Palette ,Start Makers 6 Colors Contour ...                    
 48089  4 pcs Professional Eye Makeup Cosmetic Set Con...                    
 47380  Deluxe 2.0 Battery - Aeroblend Airbrush Makeup...                    ,
 array([0.6628011 , 0.6554586 , 0.65243113, 0.65198946, 0.6485068 ],
       dtype=float32))

In [64]:
import pandas as pd

# 1) Load your queries
df_queries = pd.read_excel("Amazon_recom_queries.xlsx")

# 2) Group by unique Queries, collecting Amazon’s ground-truth lists
amazon_grouped = df_queries.groupby("Queries").agg({
    "Product_title":       list,
    "Product_description": list,
    "Product_link":        list,
    "Image_link":          list
}).reset_index()

# 3) Prepare columns to hold your model’s recommendations
amazon_grouped["Model_rec_titles"]       = None
amazon_grouped["Model_rec_descriptions"] = None
amazon_grouped["Model_rec_links"]        = None
amazon_grouped["Model_rec_scores"]       = None

# 4) For each unique query, run unified_query and store the top-K recs + scores
for i, row in amazon_grouped.iterrows():
    q = row["Queries"]
    img_url = row["Image_link"][0]  # use the first image for that query
    
    recs, scores = unified_query(input_text=q, input_image_path=img_url, k=5)
    
    # Extract the fields you want from the returned DataFrame
    amazon_grouped.at[i, "Model_rec_titles"]       = recs["product_title"].tolist()
    amazon_grouped.at[i, "Model_rec_descriptions"] = recs["product_description"].tolist()
    # If you also saved product links in your metadata, include them:
    amazon_grouped.at[i, "Model_rec_links"]        = recs.get("product_link", pd.Series()).tolist()
    amazon_grouped.at[i, "Model_rec_scores"]       = scores.tolist()

# 5) Inspect
amazon_grouped.head()


Unnamed: 0,Queries,Product_title,Product_description,Product_link,Image_link,Model_rec_titles,Model_rec_descriptions,Model_rec_links,Model_rec_scores
0,10A Straight Human Hair Bundles with Closure,[10A Straight Bundles with Closure 20 22 24 wi...,[Brand\tWowqueen Beauty Color\tBundles with Cl...,[https://www.amazon.com/Straight-Bundles-Closu...,[https://m.media-amazon.com/images/I/81NdEQli2...,[10A Brazilian Straight Hair Bundles with Clos...,"[, , , , ]",[],"[0.7701902389526367, 0.7669499516487122, 0.766..."
1,A gift set for body,[COBA'S DAUGHTER 3-Piece Body Care Gift Set | ...,"[Item Form\tScrub, Gel Scent\tTurmeric & Honey...",[https://www.amazon.com/COBAS-DAUGHTER-Turmeri...,[https://m.media-amazon.com/images/I/81UxWJWl6...,"[Clean Getaway Gift Set, AM & PM Origins Holid...","[, , , , ]",[],"[0.6938373446464539, 0.6834535598754883, 0.682..."
2,Abercrombie & Fitch RYDER Cedarwood Musk Fragr...,"[Abercrombie & Fitch Fierce Cologne Spray, 6.7...",[Brand Abercrombie & Fitch Item Form ...,[https://www.amazon.com/Abercrombie-Fitch-Fier...,[https://m.media-amazon.com/images/I/71vdx060Q...,[Abercrombie & Fitch Fierce Cologne For Men Tr...,"[, , The light scent of fresh cucumber and coc...",[],"[0.6689027547836304, 0.640284538269043, 0.6353..."
3,Amla Hair Oil,[Dabur Amla Hair Oil - Nourishing Indian Oil f...,[Brand\tDabur Hair Type\tAll Item Weight\t300 ...,[https://www.amazon.com/Dabur-Amla-Hair-Oil-30...,[https://m.media-amazon.com/images/I/71ZKLI829...,"[Dabur Amla Hair Oil, 360ml Bottle, PATANJALI ...","[, , , Parachute Amla Hair Cream 210ml, ]",[],"[0.6740005016326904, 0.6421761512756348, 0.623..."
4,Analgesic Magnesium Sulfate Liniment Cream,[Absorbine Veterinary Liniment Topical Analges...,[Spearmint-scented pain reliefgel contains nat...,[https://www.amazon.com/Absorbine-Veterinary-L...,[https://m.media-amazon.com/images/I/61uyMABIp...,"[Southwest Sunshine Herbal Gel, Robert Researc...","[, , Lotion Lite Plus 10%, , ]",[],"[0.6272090673446655, 0.6250230073928833, 0.618..."


In [69]:
amazon_grouped["mean_similarity@5"] = amazon_grouped["Model_rec_scores"] \
                                            .apply(lambda scores: np.mean(scores))

In [70]:
overall_mean = amazon_grouped["mean_similarity@5"].mean()
overall_mean

np.float64(0.6725095043182374)