!pip install transformers accelerate peft datasets torchvision bitsandbytes

In [11]:
import os
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import CLIPProcessor
from PIL import Image
import requests
from io import BytesIO

from utils import load_and_clean_data, get_model, save_model_and_processor

In [None]:
# ─── CONFIG ───────────────────────────────────────────────────────────────────
CSV_PATH      = "meta_data_beauty.csv"
SAVE_DIR      = "artifacts_lora_beauty/"
BATCH_SIZE    = 128 # batch size for training
NUM_EPOCHS    = 20 # 10 gave best results
LR            = 2e-5
DEVICE        = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME    = "openai/clip-vit-base-patch32"
os.makedirs(SAVE_DIR,     exist_ok=True)

In [13]:
# ─── DATASET +  collate_fn ────────────────────────────────────────────────
class ProductCLIPDataset(Dataset):
    def __init__(self, df):
        self.texts     = df["product_text"].tolist()
        self.urls      = df["product_image_url"].tolist()
        proc = CLIPProcessor.from_pretrained(MODEL_NAME, use_fast=True)
        self.tokenizer       = proc.tokenizer
        self.image_processor = proc.image_processor

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        url  = self.urls[idx]
        try:
            resp = requests.get(url, timeout=5)
            img  = Image.open(BytesIO(resp.content)).convert("RGB")
        except:
            img  = Image.new("RGB", (224,224), "white")
        return {"text": text, "image": img}

    def collate_fn(self, batch):
        texts  = [ex["text"]  for ex in batch]
        images = [ex["image"] for ex in batch]

        tok = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )
        imgs = self.image_processor(
            images=images,
            return_tensors="pt"
        ).pixel_values

        return {
            "input_ids":      tok.input_ids,
            "attention_mask": tok.attention_mask,
            "pixel_values":   imgs
        }


In [14]:
import pandas as pd
#df = load_and_clean_data(CSV_PATH)
#df_train = df.sample(20000)
df_train = pd.read_csv(CSV_PATH).sample(20000)

In [None]:
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def collate_fn(batch):
    texts  = [ex["text"]  for ex in batch]
    images = [ex["image"] for ex in batch]
    enc = processor(text=texts,
                    images=images,
                    return_tensors="pt",
                    padding=True,
                    truncation=True)
    return enc

dataset = ProductCLIPDataset(df_train)
loader  = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
model = get_model(approach="lora", save_dir=None)
model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)

In [7]:
import time
start_time = time.time()
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Move all to device
        batch = {k:v.to(DEVICE) for k,v in batch.items()}

        # 1) Get embeddings
        text_embs  = model.get_text_features(**{k:batch[k] for k in ["input_ids","attention_mask"]})
        image_embs = model.get_image_features(pixel_values=batch["pixel_values"])

        # 2) Normalize
        text_embs  = F.normalize(text_embs,  p=2, dim=-1)
        image_embs = F.normalize(image_embs, p=2, dim=-1)

        # 3) Similarity logits
        logits_per_text  = text_embs @ image_embs.t()
        logits_per_image = logits_per_text.t()

        # 4) Contrastive loss
        B = logits_per_text.size(0)
        labels = torch.arange(B, device=DEVICE)
        loss_t2i = F.cross_entropy(logits_per_text, labels)
        loss_i2t = F.cross_entropy(logits_per_image, labels)
        loss = (loss_t2i + loss_i2t) / 2

        # 5) Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg = total_loss / len(loader)
    print(f"Epoch {epoch+1} avg loss: {avg:.4f}")
print(time.time() -  start_time)
print("Training time taken :" , round(time.time() -  start_time)/60, 2)

Epoch 1/20: 100%|███████████████████████████| 157/157 [1:21:49<00:00, 31.27s/it]


Epoch 1 avg loss: 4.3760


Epoch 2/20: 100%|███████████████████████████| 157/157 [1:08:47<00:00, 26.29s/it]


Epoch 2 avg loss: 4.1852


Epoch 3/20: 100%|███████████████████████████| 157/157 [1:06:32<00:00, 25.43s/it]


Epoch 3 avg loss: 4.1410


Epoch 4/20: 100%|███████████████████████████| 157/157 [1:05:31<00:00, 25.04s/it]


Epoch 4 avg loss: 4.1139


Epoch 5/20: 100%|███████████████████████████| 157/157 [1:03:49<00:00, 24.39s/it]


Epoch 5 avg loss: 4.0960


Epoch 6/20: 100%|███████████████████████████| 157/157 [1:05:57<00:00, 25.21s/it]


Epoch 6 avg loss: 4.0812


Epoch 7/20: 100%|███████████████████████████| 157/157 [1:04:31<00:00, 24.66s/it]


Epoch 7 avg loss: 4.0719


Epoch 8/20: 100%|███████████████████████████| 157/157 [1:03:47<00:00, 24.38s/it]


Epoch 8 avg loss: 4.0624


Epoch 9/20: 100%|███████████████████████████| 157/157 [1:02:38<00:00, 23.94s/it]


Epoch 9 avg loss: 4.0551


Epoch 10/20: 100%|██████████████████████████| 157/157 [1:01:19<00:00, 23.44s/it]


Epoch 10 avg loss: 4.0516


Epoch 11/20: 100%|██████████████████████████| 157/157 [1:03:09<00:00, 24.13s/it]


Epoch 11 avg loss: 4.0468


Epoch 12/20: 100%|██████████████████████████| 157/157 [1:00:24<00:00, 23.09s/it]


Epoch 12 avg loss: 4.0424


Epoch 13/20: 100%|████████████████████████████| 157/157 [59:56<00:00, 22.91s/it]


Epoch 13 avg loss: 4.0418


Epoch 14/20: 100%|████████████████████████████| 157/157 [59:35<00:00, 22.78s/it]


Epoch 14 avg loss: 4.0367


Epoch 15/20: 100%|████████████████████████████| 157/157 [59:05<00:00, 22.58s/it]


Epoch 15 avg loss: 4.0360


Epoch 16/20: 100%|████████████████████████████| 157/157 [59:23<00:00, 22.70s/it]


Epoch 16 avg loss: 4.0340


Epoch 17/20: 100%|████████████████████████████| 157/157 [59:32<00:00, 22.76s/it]


Epoch 17 avg loss: 4.0313


Epoch 18/20: 100%|████████████████████████████| 157/157 [59:34<00:00, 22.77s/it]


Epoch 18 avg loss: 4.0290


Epoch 19/20: 100%|████████████████████████████| 157/157 [59:54<00:00, 22.90s/it]


Epoch 19 avg loss: 4.0270


Epoch 20/20: 100%|██████████████████████████| 157/157 [1:01:00<00:00, 23.32s/it]

Epoch 20 avg loss: 4.0245
75984.39484095573
Training time taken : 1266.4 2





In [8]:
print("Training time taken :" , round(1197.38/60, 2))

Training time taken : 19.96


In [9]:
save_model_and_processor(model, SAVE_DIR)

## Model Testing

In [16]:
from utils import *

In [17]:
# ── Load Model & Generate Embeddings ────────────────────────────────────
tuned_model = get_model(approach="lora", save_dir=SAVE_DIR)
tuned_model.to(DEVICE)



PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): CLIPModel(
      (text_model): CLIPTextTransformer(
        (embeddings): CLIPTextEmbeddings(
          (token_embedding): Embedding(49408, 512)
          (position_embedding): Embedding(77, 512)
        )
        (encoder): CLIPEncoder(
          (layers): ModuleList(
            (0-11): 12 x CLIPEncoderLayer(
              (self_attn): CLIPSdpaAttention(
                (k_proj): Linear(in_features=512, out_features=512, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=512, out_features=512, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=512, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_

In [18]:
from transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", use_fast=True)

def collate_fn(batch):
    texts  = [ex["text"]  for ex in batch]
    images = [ex["image"] for ex in batch]
    enc = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # **All tensors here are on CPU**—no .to(device)!
    return {
        "input_ids":      enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "pixel_values":   enc["pixel_values"],
    }

In [19]:
def generate_embeddings(model, dataset, batch_size=32, num_workers=4):
    """
    Returns two CPU tensors: text_embs [N, D], image_embs [N, D].
    """
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True,            # collate_fn returns CPU
        collate_fn=dataset.collate_fn
    )

    model.eval().to(DEVICE)
    all_text, all_image = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Generating embeddings"):
            batch = {k: v.to(DEVICE) for k, v in batch.items()}

            # 4️Get features
            t = model.get_text_features(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )
            i = model.get_image_features(pixel_values=batch["pixel_values"])

            # 5️Normalize & collect on CPU
            all_text.append(F.normalize(t,  dim=-1).cpu())
            all_image.append(F.normalize(i, dim=-1).cpu())

    text_embs  = torch.cat(all_text,  dim=0)
    image_embs = torch.cat(all_image, dim=0)
    return text_embs, image_embs

In [None]:
import os
import pandas as pd
SAVE_DIR      = "artifacts_lora_beauty/"
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
df_full = pd.read_csv(CSV_PATH)

In [22]:
dataset = ProductCLIPDataset(df_full)
text_embs, image_embs = generate_embeddings(tuned_model, dataset, batch_size=64, num_workers=4)

Generating embeddings: 100%|██████████████| 1760/1760 [1:25:38<00:00,  2.92s/it]


In [23]:
print("Embeddings shapes:", text_embs.shape, image_embs.shape)

Embeddings shapes: torch.Size([112578, 512]) torch.Size([112578, 512])


In [24]:
save_embeddings(text_embs, image_embs, SAVE_DIR)

combined = F.normalize((text_embs + image_embs) / 2, dim=-1)
index_path = build_faiss_index(combined, SAVE_DIR)
print("FAISS index saved to:", index_path)

FAISS index saved to: artifacts_lora_fash/faiss.index
