In [2]:
# SETUP

!pip install transformers datasets peft accelerate faiss-cpu --quiet

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from peft import get_peft_model, LoraConfig, TaskType
import pandas as pd
import numpy as np
from PIL import Image
import os
import faiss
from tqdm import tqdm
from torch.amp import autocast, GradScaler
import requests
from io import BytesIO
scaler = GradScaler()
import time


In [None]:
# CONFIG
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128  # batch size used by both the training and the embedding DataLoaders
NUM_EPOCHS = 10  # number of passes over the training DataLoader in train_model
MODEL_NAME = "openai/clip-vit-base-patch32"  # checkpoint id passed to CLIPModel / CLIPProcessor.from_pretrained
DATA_PATH = "./product_data.csv"  # CSV file read into `df` by the data-loading cell
N_SAMPLES = 20000  # number of rows sampled from df_cleaned for fine-tuning

In [None]:
import pandas as pd

# Read the raw product table from disk
df = pd.read_csv(DATA_PATH)

# Replace missing titles / descriptions with empty strings so that the
# string operations applied later never see NaN
df = df.assign(
    product_title=df["product_title"].fillna(""),
    product_description=df["product_description"].fillna(""),
)

# Substrings used to filter product titles and descriptions
keywords = [
    'nail', 'shampoo', 'conditioner', 'eye', 'lip', 'ear', 'nose',
    'beauty', 'cosmetic', 'hair', 'skin', 'hand', 'leg', 'oil',
    'makeup', 'lotion', 'cream', 'cleanser', 'moisturizer',
]

# Filter rows where product_title or product_description contains any of the keywords
def contains_keywords(text, keywords):
    """Return True when any keyword occurs in ``text`` as a case-insensitive substring.

    Args:
        text: String to search. Non-string values (for example ``None`` or a
            NaN float coming out of pandas) are treated as containing no
            keyword instead of raising ``AttributeError``.
        keywords: Iterable of keyword strings.

    Returns:
        bool: True if at least one keyword is a substring of ``text``.

    Note:
        Matching is by substring, not by whole word, so the keyword ``'ear'``
        also matches ``'year'``.
    """
    if not isinstance(text, str):
        return False
    haystack = text.lower()  # lowercase once instead of once per keyword
    return any(keyword.lower() in haystack for keyword in keywords)

# Build one boolean mask per text column, then keep the rows where either matches
title_has_keyword = df["product_title"].apply(lambda value: contains_keywords(value, keywords))
description_has_keyword = df["product_description"].apply(lambda value: contains_keywords(value, keywords))
df_filtered = df[title_has_keyword | description_has_keyword]

# Clean and format product text
def create_product_text(row):
    """Format one product row as a single text string.

    The stripped title is always included. The stripped description is
    appended on a second line only when it is non-empty. The result is cut
    to at most 512 characters.

    Args:
        row: Mapping-like object with string values under
            ``"product_title"`` and ``"product_description"``.

    Returns:
        str: The formatted (and possibly truncated) product text.
    """
    max_chars = 512  # limit text to 512 characters (adjust length if needed)
    pieces = [f"Product title is: {row['product_title'].strip()}"]
    body = row["product_description"].strip()
    if body:
        pieces.append(f"Product description is: {body}")
    return "\n".join(pieces)[:max_chars]

# `df_filtered` is a boolean-mask slice of `df`; take an explicit copy before
# adding a column so pandas does not raise SettingWithCopyWarning and the
# assignment is guaranteed to land on this frame.
df_filtered = df_filtered.copy()
df_filtered["product_text"] = df_filtered.apply(create_product_text, axis=1)

# Keep only rows where product_text and product_image_url are not empty.
# A missing URL is NaN, and NaN is truthy under astype(bool), so missing
# values are turned into "" first; otherwise rows without any image URL
# would survive this filter.
has_text = df_filtered["product_text"].str.strip().astype(bool)
has_image_url = (
    df_filtered["product_image_url"].fillna("").astype(str).str.strip().astype(bool)
)
df_cleaned = df_filtered[has_text & has_image_url].reset_index(drop=True)


In [None]:
# Display the shape of the product_text column, i.e. how many rows df_cleaned holds
df_cleaned['product_text'].shape

In [None]:

class ProductDataset(Dataset):
    """Dataset of (product text, product image) pairs read from a DataFrame.

    Texts come from the ``product_text`` column and images are downloaded on
    access from the ``product_image_url`` column. ``collate_fn`` turns a list
    of items into a single batch of model inputs and must be passed to the
    ``DataLoader`` explicitly.

    Args:
        dataframe: Frame with ``product_text`` and ``product_image_url`` columns.
        model_name: Checkpoint id whose processor is used for tokenization
            and image preprocessing.
    """

    def __init__(self, dataframe, model_name="openai/clip-vit-base-patch32"):
        self.texts = dataframe["product_text"].tolist()
        self.image_urls = dataframe["product_image_url"].tolist()
        # Load a single fast processor
        self.processor = CLIPProcessor.from_pretrained(model_name, use_fast=True)

    def __len__(self):
        """Return the number of (text, image) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{"text": str, "image": PIL.Image}`` for position ``idx``.

        Any failure while fetching or decoding the image (network error, HTTP
        error status, invalid image data, bad URL value) is replaced by a
        blank white 224x224 image so a single bad URL does not abort a run.
        """
        text = self.texts[idx]
        url = self.image_urls[idx]
        try:
            response = requests.get(url, timeout=5)
            # Treat 4xx/5xx responses as failures instead of trying to decode
            # an error page as an image.
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert("RGB")
        except Exception:
            # `Exception` (not a bare `except:`) so KeyboardInterrupt and
            # SystemExit still stop the process.
            img = Image.new("RGB", (224, 224), "white")
        return {"text": text, "image": img}

    def collate_fn(self, batch):
        """Tokenize the texts and preprocess the images of ``batch``.

        Args:
            batch: List of items as returned by ``__getitem__``.

        Returns:
            The tokenizer output (``input_ids``, ``attention_mask``) with an
            added ``pixel_values`` entry, all as PyTorch tensors.
        """
        texts = [ex["text"] for ex in batch]
        images = [ex["image"] for ex in batch]

        # 1) Tokenize text
        tokenized = self.processor.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )

        # 2) Preprocess images
        # Depending on the transformers version the image preprocessor is
        # exposed as `.image_processor` or as `.feature_extractor`.
        image_processor = getattr(self.processor, "image_processor", None)
        if image_processor is None:
            image_processor = self.processor.feature_extractor
        image_inputs = image_processor(
            images=images,
            return_tensors="pt"
        )

        # 3) Merge
        tokenized["pixel_values"] = image_inputs["pixel_values"]
        return tokenized


In [None]:
# MODEL + LORA
def get_model_with_lora():
    """Load the ``MODEL_NAME`` CLIP checkpoint and wrap it with LoRA adapters.

    The adapters target the ``q_proj`` and ``v_proj`` modules with rank 8,
    alpha 16 and dropout 0.1; no bias terms are adapted.

    Returns:
        The PEFT-wrapped model, moved to ``device``.
    """
    lora_settings = dict(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.FEATURE_EXTRACTION,
    )
    clip = CLIPModel.from_pretrained(MODEL_NAME)
    peft_model = get_peft_model(clip, LoraConfig(**lora_settings))
    return peft_model.to(device)


In [None]:
def train_model(model, dataloader):
    """Fine-tune ``model`` on image-text pairs with a symmetric contrastive loss.

    For every batch the text and image embeddings are L2-normalized, their
    pairwise cosine similarities are multiplied by a temperature, and
    cross-entropy is applied in both the text->image and image->text
    directions with the matching pair (the diagonal) as the target. The
    normalization matches how embeddings are used at retrieval time; without
    it the logits are unbounded dot products.

    Mixed precision (autocast + gradient scaling) is enabled only when
    ``device`` is a CUDA device; on CPU the loop runs in full precision.

    Args:
        model: Model exposing ``get_text_features`` and ``get_image_features``.
            If it has a ``logit_scale`` attribute, ``exp(logit_scale)`` is used
            as the temperature.
        dataloader: Iterable of mappings with ``input_ids``,
            ``attention_mask`` and ``pixel_values`` tensors.

    Returns:
        None. The mean loss per epoch and the total training time are printed.
    """
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

    use_amp = device.type == "cuda"
    scaler = GradScaler(enabled=use_amp)

    # Learned temperature of the model, when available.
    logit_scale_param = getattr(model, "logit_scale", None)

    start_time = time.time()
    for epoch in range(NUM_EPOCHS):
        total_loss = 0.0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
            batch = {k: v.to(device) for k, v in batch.items()}

            with autocast(device_type=device.type, enabled=use_amp):
                # 1) Encode both modalities
                text_embs = model.get_text_features(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"]
                )
                image_embs = model.get_image_features(
                    pixel_values=batch["pixel_values"]
                )

                # 2) Unit-normalize so the dot product is a cosine similarity
                text_embs = F.normalize(text_embs, dim=-1)
                image_embs = F.normalize(image_embs, dim=-1)

                # 3) Similarity logits, scaled by the temperature
                if logit_scale_param is not None:
                    scale = logit_scale_param.exp()
                else:
                    # NOTE(review): fallback of 1/0.07 is assumed to match the
                    # temperature CLIP is initialised with - confirm.
                    scale = 1.0 / 0.07
                logits_per_text = scale * (text_embs @ image_embs.t())
                logits_per_image = logits_per_text.t()

                # 4) Contrastive loss: the i-th text matches the i-th image
                B = logits_per_text.size(0)
                labels = torch.arange(B, device=device)
                loss_t2i = F.cross_entropy(logits_per_text, labels)
                loss_i2t = F.cross_entropy(logits_per_image, labels)
                loss = (loss_t2i + loss_i2t) / 2

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            total_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty
        print(f"Epoch {epoch+1} Loss: {total_loss / max(len(dataloader), 1):.4f}")
    total_training_time = time.time() - start_time  # Total training time
    print(f"Total Training Time: {total_training_time / 60:.2f} minutes")


In [None]:
def generate_embeddings(model, dataset):
    """Encode every item of ``dataset`` into L2-normalized text and image embeddings.

    The dataset is iterated in order (no shuffling) in batches of
    ``BATCH_SIZE`` with gradients disabled and the model in eval mode.

    Args:
        model: Model exposing ``get_text_features`` and ``get_image_features``.
        dataset: Dataset providing a ``collate_fn`` that yields ``input_ids``,
            ``attention_mask`` and ``pixel_values``.

    Returns:
        tuple: ``(text_embs, image_embs)``, two CPU tensors concatenated
        along dim 0 in dataset order.
    """
    model.eval()
    model.to(device)

    loader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=4,
        pin_memory=True,
        collate_fn=dataset.collate_fn,
    )

    text_chunks = []
    image_chunks = []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Generating embeddings"):
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            text_out = model.get_text_features(
                input_ids=on_device["input_ids"],
                attention_mask=on_device["attention_mask"],
            )
            image_out = model.get_image_features(pixel_values=on_device["pixel_values"])
            # L2-normalize, then move off the device to keep accelerator memory flat
            text_chunks.append(F.normalize(text_out, p=2, dim=-1).cpu())
            image_chunks.append(F.normalize(image_out, p=2, dim=-1).cpu())

    return torch.cat(text_chunks, dim=0), torch.cat(image_chunks, dim=0)

In [None]:
#  FAISS INDEXING
def build_faiss_index(embeddings):
    """Build an exact inner-product FAISS index over ``embeddings``.

    Args:
        embeddings: 2-D ``torch.Tensor`` or array-like with one vector per
            row. Tensors may live on any device and may require grad; they
            are detached and moved to the CPU first.

    Returns:
        faiss.IndexFlatIP: Index containing every row of ``embeddings`` in
        the given order.
    """
    if isinstance(embeddings, torch.Tensor):
        embeddings = embeddings.detach().cpu().numpy()
    # FAISS only accepts C-contiguous float32 arrays.
    vectors = np.ascontiguousarray(embeddings, dtype="float32")
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    return index

In [None]:
# MODEL FINETUNING
# Cap the sample size at the number of available rows (DataFrame.sample raises
# when n exceeds the population without replacement) and fix the seed so the
# training subset is the same on every run.
df_train = df_cleaned.sample(n=min(N_SAMPLES, len(df_cleaned)), random_state=42)
dataset = ProductDataset(df_train, model_name=MODEL_NAME)
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    collate_fn=dataset.collate_fn
)

model = get_model_with_lora()
train_model(model, loader)

In [None]:
# Embed all of df_cleaned with the fine-tuned model, then build the FAISS index
dataset = ProductDataset(df_cleaned, model_name=MODEL_NAME)
text_embs, image_embs = generate_embeddings(model, dataset)

# Fuse the two modalities: sum the embeddings and re-normalize -> [N, D]
combined_embs = text_embs + image_embs
combined_embs = F.normalize(combined_embs, dim=-1)

index = build_faiss_index(combined_embs)

In [None]:
def unified_query(input_text=None, input_image_path=None, k=5):
    """Retrieve the top-``k`` products for a text query, an image query, or both.

    The query is embedded with the global ``model``. Each provided modality
    is L2-normalized on its own; when both are given the two unit vectors are
    summed and re-normalized, which mirrors how the indexed product vectors
    are built (``normalize(text + image)`` over normalized embeddings). The
    FAISS index is read from ``SAVE_DIR`` on every call.

    Args:
        input_text: Query text, or ``None``.
        input_image_path: Local file path or ``http(s)`` URL of the query
            image, or ``None``.
        k: Number of neighbours to request.

    Returns:
        tuple: ``(top_items, top_scores)`` where ``top_items`` holds the
        matching rows of ``df_cleaned`` (the frame the index was built from)
        and ``top_scores`` the corresponding inner-product scores. Fewer than
        ``k`` entries are returned when the index holds fewer than ``k``
        vectors.

    Raises:
        AssertionError: If neither ``input_text`` nor ``input_image_path`` is given.
        requests.HTTPError: If the image URL answers with an error status.
    """
    assert input_text or input_image_path, "Provide at least text or image input"

    # Same checkpoint and fast processor as the one used to build the index,
    # so query preprocessing matches catalogue preprocessing.
    processor = CLIPProcessor.from_pretrained(MODEL_NAME, use_fast=True)

    model.eval()  # make sure LoRA dropout is disabled for inference
    parts = []
    with torch.no_grad():
        if input_text:
            text_inputs = processor.tokenizer(
                input_text, padding=True, truncation=True, return_tensors="pt"
            )
            text_emb = model.get_text_features(
                input_ids=text_inputs["input_ids"].to(device),
                attention_mask=text_inputs["attention_mask"].to(device),
            )
            parts.append(F.normalize(text_emb, dim=-1))

        if input_image_path:
            if input_image_path.startswith("http"):
                # A timeout keeps a stalled server from hanging the notebook.
                response = requests.get(input_image_path, timeout=10)
                response.raise_for_status()
                image = Image.open(BytesIO(response.content)).convert("RGB")
            else:
                image = Image.open(input_image_path).convert("RGB")
            image_inputs = processor.image_processor(images=image, return_tensors="pt")
            image_emb = model.get_image_features(
                pixel_values=image_inputs["pixel_values"].to(device)
            )
            parts.append(F.normalize(image_emb, dim=-1))

        # One modality: the unit vector itself. Two: normalized sum of both.
        query_emb = F.normalize(torch.stack(parts).sum(dim=0), dim=-1)

    # FAISS expects NumPy array in float32
    query_np = query_emb.cpu().numpy().astype("float32")
    faiss_index = faiss.read_index(os.path.join(SAVE_DIR, "faiss_index_mp.index"))

    # Perform the search
    scores, indices = faiss_index.search(query_np, k)

    # FAISS pads the result with -1 when fewer than k vectors are available;
    # drop those slots so iloc does not silently return the last row.
    found = indices[0] >= 0
    # Index positions refer to df_cleaned (the frame that was embedded), not
    # to the unfiltered `df`.
    top_items = df_cleaned.iloc[indices[0][found]]
    top_scores = scores[0][found]

    return top_items, top_scores

In [None]:
import os
import torch
import faiss
import pickle

# Define the save directory
SAVE_DIR = "artifacts_fp_20k_clip"
os.makedirs(SAVE_DIR, exist_ok=True)

# Save the fine-tuned model
model.save_pretrained(os.path.join(SAVE_DIR, "clip_lora_model_mp"))

# Save the processor that belongs to MODEL_NAME next to the model
CLIPProcessor.from_pretrained(MODEL_NAME, use_fast=True).save_pretrained(os.path.join(SAVE_DIR, "clip_processor_mp"))

# Save the text, image and fused embeddings
torch.save(text_embs, os.path.join(SAVE_DIR, "text_embeddings_mp.pt"))
torch.save(image_embs, os.path.join(SAVE_DIR, "image_embeddings_mp.pt"))
torch.save(combined_embs, os.path.join(SAVE_DIR, "combined_embeddings_mp.pt"))

# The embeddings and the FAISS index were computed over df_cleaned, so the
# metadata has to come from df_cleaned as well; otherwise row i of the
# metadata does not describe vector i of the index (df_train is only the
# random training subset).
with open(os.path.join(SAVE_DIR, "product_metadata.pkl"), "wb") as f:
    pickle.dump(df_cleaned.to_dict(), f)

faiss.write_index(index, os.path.join(SAVE_DIR, "faiss_index_mp.index"))

print(f"Model, processor, embeddings, metadata, and FAISS index saved in {SAVE_DIR}")


In [None]:
# Reload the saved embeddings onto `device` and L2-normalize each row.
# The bare name `normalize` is never defined in this notebook (NameError on a
# fresh kernel); the function is available as F.normalize.
text_embs  = F.normalize(torch.load(os.path.join(SAVE_DIR, "text_embeddings_mp.pt")).to(device), dim=-1)
image_embs = F.normalize(torch.load(os.path.join(SAVE_DIR, "image_embeddings_mp.pt")).to(device), dim=-1)

In [None]:
# Text-only example query: only input_text is supplied, k keeps its default of 5.
unified_query(input_text="photo finish Professional airbrush makeup")

In [None]:
# Remote image used as the query image by the example cells that follow.
query_image_url = "https://temptupro.com/cdn/shop/products/s-one-essential-airbrush-kit-hero_2.jpg?v=1743181132&width=1780"

In [None]:
# Image-only example query: the URL starts with "http", so unified_query downloads it.
unified_query(input_image_path= query_image_url)

In [None]:
# Combined example query: both a text description and the image URL are supplied.
unified_query(input_text="airbrush makeup kit with compressor", input_image_path= query_image_url)

In [None]:
import pandas as pd

# 1) Load your queries
df_queries = pd.read_excel("Amazon_recom_queries.xlsx")

# 2) Group by unique Queries, collecting Amazon's ground-truth lists
amazon_grouped = df_queries.groupby("Queries").agg({
    "Product_title":       list,
    "Product_description": list,
    "Product_link":        list,
    "Image_link":          list
}).reset_index()

# 3) Prepare columns to hold your model's recommendations
#    (None makes them object columns, so each cell can hold a list)
amazon_grouped["Model_rec_titles"]       = None
amazon_grouped["Model_rec_descriptions"] = None
amazon_grouped["Model_rec_links"]        = None
amazon_grouped["Model_rec_scores"]       = None

# 4) For each unique query, run unified_query and store the top-K recs + scores
for i, row in amazon_grouped.iterrows():
    q = row["Queries"]
    # Use the first usable image link for that query. Empty spreadsheet cells
    # arrive as NaN (a float), which would crash on `.startswith`; if no link
    # is usable the query falls back to text only.
    img_url = next(
        (link for link in row["Image_link"] if isinstance(link, str) and link.strip()),
        None,
    )

    recs, scores = unified_query(input_text=q, input_image_path=img_url, k=5)

    # Extract the fields you want from the returned DataFrame
    amazon_grouped.at[i, "Model_rec_titles"]       = recs["product_title"].tolist()
    amazon_grouped.at[i, "Model_rec_descriptions"] = recs["product_description"].tolist()
    # Product links are optional in the metadata; store an empty list when
    # the column is absent.
    amazon_grouped.at[i, "Model_rec_links"]        = (
        recs["product_link"].tolist() if "product_link" in recs.columns else []
    )
    amazon_grouped.at[i, "Model_rec_scores"]       = scores.tolist()

# 5) Inspect
amazon_grouped.head()


In [None]:
# Average the stored recommendation scores of each query
amazon_grouped["mean_similarity@5"] = [
    np.mean(score_list) for score_list in amazon_grouped["Model_rec_scores"]
]

In [None]:
# Mean of the per-query mean similarities over all queries; shown as the cell output
overall_mean = amazon_grouped["mean_similarity@5"].mean()
overall_mean

In [None]:
# Write the per-query recommendations into SAVE_DIR as CSV (the DataFrame index is written too, since index=False is not passed)
amazon_grouped.to_csv(os.path.join(SAVE_DIR, "model_recommendations.csv"))