!pip install transformers accelerate peft datasets torchvision bitsandbytes

In [None]:
import os
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import CLIPProcessor
from PIL import Image
import requests
from io import BytesIO

from utils import load_and_clean_data, get_model, save_model_and_processor

In [None]:
# ─── CONFIG ───────────────────────────────────────────────────────────────────
CSV_PATH      = "product_data.csv"
SAVE_DIR      = "artifacts_lora"
BATCH_SIZE    = 128
NUM_EPOCHS    = 10
LR            = 2e-5
DEVICE        = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME    = "openai/clip-vit-base-patch32"

os.makedirs(SAVE_DIR,     exist_ok=True)

In [54]:
# ─── DATASET +  collate_fn ────────────────────────────────────────────────
class ProductCLIPDataset(Dataset):
    def __init__(self, df):
        self.texts     = df["product_text"].tolist()
        self.urls      = df["product_image_url"].tolist()
        proc = CLIPProcessor.from_pretrained(MODEL_NAME, use_fast=True)
        self.tokenizer       = proc.tokenizer
        self.image_processor = proc.image_processor

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        url  = self.urls[idx]
        try:
            resp = requests.get(url, timeout=5)
            img  = Image.open(BytesIO(resp.content)).convert("RGB")
        except:
            img  = Image.new("RGB", (224,224), "white")
        return {"text": text, "image": img}

    def collate_fn(self, batch):
        texts  = [ex["text"]  for ex in batch]
        images = [ex["image"] for ex in batch]

        tok = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        imgs = self.image_processor(
            images=images,
            return_tensors="pt"
        ).pixel_values

        return {
            "input_ids":      tok.input_ids,
            "attention_mask": tok.attention_mask,
            "pixel_values":   imgs
        }


In [28]:
df = load_and_clean_data(CSV_PATH)
df_train = df.sample(20000)

In [6]:
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def collate_fn(batch):
    texts  = [ex["text"]  for ex in batch]
    images = [ex["image"] for ex in batch]
    enc = processor(text=texts,
                    images=images,
                    return_tensors="pt",
                    padding=True,
                    truncation=True)
    return enc

dataset = ProductCLIPDataset(df_train)
loader  = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
model = get_model(approach="lora", save_dir=None)
model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [8]:
import time
start_time = time.time()
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Move all to device
        batch = {k:v.to(device) for k,v in batch.items()}

        # 1) Get embeddings
        text_embs  = model.get_text_features(**{k:batch[k] for k in ["input_ids","attention_mask"]})
        image_embs = model.get_image_features(pixel_values=batch["pixel_values"])

        # 2) Normalize
        text_embs  = F.normalize(text_embs,  p=2, dim=-1)
        image_embs = F.normalize(image_embs, p=2, dim=-1)

        # 3) Similarity logits
        logits_per_text  = text_embs @ image_embs.t()
        logits_per_image = logits_per_text.t()

        # 4) Contrastive loss
        B = logits_per_text.size(0)
        labels = torch.arange(B, device=device)
        loss_t2i = F.cross_entropy(logits_per_text, labels)
        loss_i2t = F.cross_entropy(logits_per_image, labels)
        loss = (loss_t2i + loss_i2t) / 2

        # 5) Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg = total_loss / len(loader)
    print(f"Epoch {epoch+1} avg loss: {avg:.4f}")
print(time.time() -  start_time)

Epoch 1/10: 100%|███████████████████████████| 157/157 [1:10:18<00:00, 26.87s/it]


Epoch 1 avg loss: 4.6583


Epoch 2/10: 100%|███████████████████████████| 157/157 [1:07:04<00:00, 25.63s/it]


Epoch 2 avg loss: 4.4892


Epoch 3/10: 100%|███████████████████████████| 157/157 [1:03:47<00:00, 24.38s/it]


Epoch 3 avg loss: 4.3517


Epoch 4/10: 100%|███████████████████████████| 157/157 [1:06:15<00:00, 25.32s/it]


Epoch 4 avg loss: 4.2874


Epoch 5/10: 100%|███████████████████████████| 157/157 [1:05:50<00:00, 25.16s/it]


Epoch 5 avg loss: 4.2533


Epoch 6/10: 100%|███████████████████████████| 157/157 [1:04:20<00:00, 24.59s/it]


Epoch 6 avg loss: 4.2296


Epoch 7/10: 100%|███████████████████████████| 157/157 [1:04:18<00:00, 24.57s/it]


Epoch 7 avg loss: 4.2136


Epoch 8/10: 100%|███████████████████████████| 157/157 [1:05:54<00:00, 25.19s/it]


Epoch 8 avg loss: 4.1984


Epoch 9/10: 100%|███████████████████████████| 157/157 [1:03:54<00:00, 24.42s/it]


Epoch 9 avg loss: 4.1872


Epoch 10/10: 100%|██████████████████████████| 157/157 [1:03:50<00:00, 24.40s/it]

Epoch 10 avg loss: 4.1785
39334.03454661369





In [25]:
print("Training time taken :" , round(39334.03454661369/60, 2))

Training time taken : 655.57


In [29]:
save_model_and_processor(model, SAVE_DIR)

## Model Testing

In [31]:
from utils import *

In [None]:
# ── Load Model & Generate Embeddings ────────────────────────────────────
tuned_model = get_model(approach="lora", save_dir=SAVE_DIR)
tuned_model.to(device)

In [51]:
from transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", use_fast=True)

def collate_fn(batch):
    texts  = [ex["text"]  for ex in batch]
    images = [ex["image"] for ex in batch]
    enc = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # **All tensors here are on CPU**—no .to(device)!
    return {
        "input_ids":      enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "pixel_values":   enc["pixel_values"],
    }

In [55]:
def generate_embeddings(model, dataset, batch_size=32, num_workers=4):
    """
    Returns two CPU tensors: text_embs [N, D], image_embs [N, D].
    """
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,            # collate_fn returns CPU
        collate_fn=dataset.collate_fn
    )

    model.eval().to(DEVICE)
    all_text, all_image = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Generating embeddings"):
            batch = {k: v.to(DEVICE) for k, v in batch.items()}

            # 4️Get features
            t = model.get_text_features(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )
            i = model.get_image_features(pixel_values=batch["pixel_values"])

            # 5️Normalize & collect on CPU
            all_text.append(F.normalize(t,  dim=-1).cpu())
            all_image.append(F.normalize(i, dim=-1).cpu())

    text_embs  = torch.cat(all_text,  dim=0)
    image_embs = torch.cat(all_image, dim=0)
    return text_embs, image_embs

In [56]:
dataset = ProductCLIPDataset(df)
text_embs, image_embs = generate_embeddings(model, dataset, batch_size=64, num_workers=4)

Generating embeddings:   0%|                           | 0/1377 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after para

In [57]:
print("Embeddings shapes:", text_embs.shape, image_embs.shape)

Embeddings shapes: torch.Size([88083, 512]) torch.Size([88083, 512])


In [58]:
save_embeddings(text_embs, image_embs, SAVE_DIR)

combined = F.normalize((text_embs + image_embs) / 2, dim=-1)
index_path = build_faiss_index(combined, SAVE_DIR)
print("FAISS index saved to:", index_path)

FAISS index saved to: artifacts_lora/faiss.index


In [60]:
def unified_query(
    input_text: str = None,
    input_image_path: str = None,
    approach: str = "zero_shot",
    save_dir: str = None,
    k: int = 5
):
    """
    Query the FAISS index for a given approach (zero_shot, lora, or lora_opt).
    """
    
    model = get_model(approach=approach, save_dir=save_dir)
    model.to(DEVICE).eval()

    df  = pd.read_csv("product_data.csv")
    idx = load_faiss_index(save_dir)

    # 2) Load processor, then split tokenizer and image_processor
    proc = CLIPProcessor.from_pretrained(MODEL_NAME, use_fast=True)
    tokenizer       = proc.tokenizer
    image_processor = proc.image_processor

    # 3) Prepare inputs
    # We batch as a single‐element batch so outputs remain [1, D]
    batch = {}
    if input_text:
        tok = tokenizer(
            [input_text],
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        batch["input_ids"]      = tok.input_ids.to(DEVICE)
        batch["attention_mask"] = tok.attention_mask.to(DEVICE)

    if input_image_path:
        if input_image_path.startswith("http"):
            resp = requests.get(input_image_path, timeout=5)
            img  = Image.open(BytesIO(resp.content)).convert("RGB")
        else:
            img  = Image.open(input_image_path).convert("RGB")

        img_out = image_processor(
            images=[img],
            return_tensors="pt"
        )
        batch["pixel_values"] = img_out.pixel_values.to(DEVICE)

    # 4) Forward pass
    with torch.no_grad():
        if "input_ids" in batch and "pixel_values" in batch:
            t_emb = model.get_text_features(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )
            i_emb = model.get_image_features(
                pixel_values=batch["pixel_values"]
            )
            q_emb = (t_emb + i_emb) / 2
        elif "input_ids" in batch:
            q_emb = model.get_text_features(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )
        else:
            q_emb = model.get_image_features(
                pixel_values=batch["pixel_values"]
            )

    # 5) Normalize, search FAISS
    q_norm = F.normalize(q_emb, dim=-1).cpu().numpy().astype("float32")
    scores, ids = idx.search(q_norm, k)

    top_df     = df.iloc[ids[0]].reset_index(drop=True)
    top_scores = scores[0]
    return top_df, top_scores



In [61]:
## SAMPLE TESTING

q_text = "photo finish Professional airbrush makeup"
q_img  = "https://temptupro.com/cdn/shop/products/s-one-essential-airbrush-kit-hero_2.jpg?v=1743181132&width=1780"

recs_text, scores_text = unified_query(
    input_text=q_text,
    input_image_path=None,
    save_dir=SAVE_DIR,
    k=5
)
print("Text-only recommendations:")
print(recs_text[["product_title","product_image_url"]], scores_text)

recs_img, scores_img = unified_query(
    input_text=None,
    input_image_path=q_img,
    save_dir=SAVE_DIR,
    k=5
)
print("Image-only recommendations:")
print(recs_img[["product_title","product_image_url"]], scores_img)

recs_both, scores_both = unified_query(
    input_text=q_text,
    input_image_path=q_img,
    save_dir=SAVE_DIR,
    k=5
)
print("Text+Image recommendations:")
print(recs_both[["product_title","product_image_url"]], scores_both)

Text-only recommendations:
                                       product_title  \
0  Svanslashes Eyelash Extensions D Curl - Premiu...   
1  Silicone Lotion Bottles Squeezable Leak Proof ...   
2      Uppercut Deluxe Men's Conditioner - Pack of 3   
3  NOKMOPO Women Fashion Keep Warm Knitting Headb...   
4  613 Blonde Deep Wave 3 Bundles 100% Brazilian ...   

                                   product_image_url  
0  https://m.media-amazon.com/images/I/81TEjbcXTj...  
1  https://m.media-amazon.com/images/I/41+SopylN9...  
2  https://m.media-amazon.com/images/I/61YDP21AdP...  
3  https://m.media-amazon.com/images/I/61-zjZBMXL...  
4  https://m.media-amazon.com/images/I/71ZGWHAjlR...   [0.5009656  0.50083387 0.4996715  0.49935097 0.4992208 ]
Image-only recommendations:
                                       product_title  \
0  Detangling Brush Pink - Detangle Brush - No Ta...   
1  RORASA Magnetic Eyelashes with Magnetic Eyelin...   
2  L.O.L Surprise! Townley Girl Jumbo Hair Access... 