In [1]:
!pip install transformers accelerate peft datasets torchvision bitsandbytes



In [2]:
import pandas as pd
df_meta_clean = pd.read_csv('product_data.csv')

In [3]:
df_meta_clean['product_title']

0         Howard LC0008 Leather Conditioner, 8-Ounce (4-...
1         Yes to Tomatoes Detoxifying Charcoal Cleanser ...
2          Eye Patch Black Adult with Tie Band (6 Per Pack)
3         Tattoo Eyebrow Stickers, Waterproof Eyebrow, 4...
4         Precision Plunger Bars for Cartridge Grips – 9...
                                ...                        
112548    TOPREETY 24"120gr 3/4 Full Head clip in hair e...
112549    Pets Playmate Pet Grooming Glove,Gentle Deshed...
112550    [10Pack] Makeup Brushes Set Cosmetics Tools Ki...
112551    Xcoser Pretty Party Anna Wig Hair Tails Hair S...
112552    DVIO Men's Voyage Perfume, Spicy woody fragran...
Name: product_title, Length: 112553, dtype: object

In [4]:
df_meta_clean["product_text"] = df_meta_clean.apply(lambda x : str(x["product_title"]) + " " + str(x["product_description"]), axis=1)

df_train = df_meta_clean[
    df_meta_clean["product_text"].str.strip().astype(bool) &
    df_meta_clean["product_image_url"].str.strip().astype(bool)
].reset_index(drop=True)

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from peft import get_peft_model, LoraConfig, TaskType
from PIL import Image
from io import BytesIO
import requests
import torch.nn.functional as F
from tqdm import tqdm

In [6]:
df = df_meta_clean[["product_text","product_image_url"]]

In [7]:
class ProductCLIPDataset(Dataset):
    def __init__(self, df):
        self.texts = df["product_text"].tolist()
        self.urls  = df["product_image_url"].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        url  = self.urls[idx]
        try:
            img = Image.open(BytesIO(requests.get(url, timeout=5).content)).convert("RGB")
        except:
            img = Image.new("RGB",(224,224),"white")
        return {"text": text, "image": img}

In [8]:
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def collate_fn(batch):
    texts  = [ex["text"]  for ex in batch]
    images = [ex["image"] for ex in batch]
    enc = processor(text=texts,
                    images=images,
                    return_tensors="pt",
                    padding=True,
                    truncation=True)
    return enc

dataset = ProductCLIPDataset(df_train)
loader  = DataLoader(dataset, batch_size=256, shuffle=True, collate_fn=collate_fn)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
lora_cfg = LoraConfig(
    r=8, lora_alpha=16,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.FEATURE_EXTRACTION
)
model = get_peft_model(base, lora_cfg).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [None]:
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Move all to device
        batch = {k:v.to(device) for k,v in batch.items()}

        # 1) Get embeddings
        text_embs  = model.get_text_features(**{k:batch[k] for k in ["input_ids","attention_mask"]})
        image_embs = model.get_image_features(pixel_values=batch["pixel_values"])

        # 2) Normalize
        text_embs  = F.normalize(text_embs,  p=2, dim=-1)
        image_embs = F.normalize(image_embs, p=2, dim=-1)

        # 3) Similarity logits
        logits_per_text  = text_embs @ image_embs.t()
        logits_per_image = logits_per_text.t()

        # 4) Contrastive loss
        B = logits_per_text.size(0)
        labels = torch.arange(B, device=device)
        loss_t2i = F.cross_entropy(logits_per_text, labels)
        loss_i2t = F.cross_entropy(logits_per_image, labels)
        loss = (loss_t2i + loss_i2t) / 2

        # 5) Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg = total_loss / len(loader)
    print(f"Epoch {epoch+1} avg loss: {avg:.4f}")

Epoch 1/20: 100%|███████████████████████████| 440/440 [5:53:28<00:00, 48.20s/it]


Epoch 1 avg loss: 5.2189


Epoch 2/20: 100%|███████████████████████████| 440/440 [5:45:18<00:00, 47.09s/it]


Epoch 2 avg loss: 4.9793


Epoch 3/20: 100%|███████████████████████████| 440/440 [6:10:37<00:00, 50.54s/it]


Epoch 3 avg loss: 4.9190


Epoch 4/20: 100%|███████████████████████████| 440/440 [6:30:56<00:00, 53.31s/it]


Epoch 4 avg loss: 4.8887


Epoch 5/20: 100%|███████████████████████████| 440/440 [6:17:37<00:00, 51.49s/it]


Epoch 5 avg loss: 4.8694


Epoch 6/20: 100%|███████████████████████████| 440/440 [6:13:59<00:00, 51.00s/it]


Epoch 6 avg loss: 4.8564


Epoch 7/20: 100%|███████████████████████████| 440/440 [6:14:45<00:00, 51.10s/it]


Epoch 7 avg loss: 4.8463


Epoch 8/20:   0%|▏                            | 2/440 [01:33<5:37:44, 46.27s/it]

In [None]:
model.save_pretrained("clip-lora-beauty-full")
processor.save_pretrained("clip-lora-beauty-full")

In [None]:
from transformers import CLIPProcessor, CLIPModel
from peft import PeftModel
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load base CLIP
base_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# Load your fine-tuned adapter
model = PeftModel.from_pretrained(base_model, "clip-lora-beauty-full")
model = model.to(device).eval()

# Load processor
processor = CLIPProcessor.from_pretrained("clip-lora-beauty-full")


In [None]:
from tqdm import tqdm
import numpy as np

text_embeddings = []
image_embeddings = []

for i, row in tqdm(df_meta_clean.iterrows(), total=len(df_meta_clean)):
    text = row['product_text']
    url  = row['product_image_url']

    # Image loading
    try:
        image = Image.open(BytesIO(requests.get(url, timeout=5).content)).convert("RGB")
    except:
        image = Image.new("RGB", (224, 224), "white")

    inputs = processor(text=text, images=image, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():
        t_emb = model.get_text_features(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        i_emb = model.get_image_features(pixel_values=inputs["pixel_values"])

    text_embeddings.append(t_emb.cpu().numpy())
    image_embeddings.append(i_emb.cpu().numpy())

text_embeddings = np.vstack(text_embeddings)
image_embeddings = np.vstack(image_embeddings)


In [None]:
import numpy as np
np.save("product_text_embeddings_finetuned_full.npy", np.stack(text_embeddings))
np.save("product_image_embeddings_finetuned_full.npy", np.stack(image_embeddings))

In [None]:
def recommend(query_text=None, query_image_url=None, alpha=0.5, top_k=5):
    assert query_text or query_image_url, "Need at least text or image"

    t_emb, i_emb = None, None
    
    if query_text:
        t_in = processor(text=query_text, return_tensors="pt", truncation=True).to(device)
        with torch.no_grad():
            t_emb = model.get_text_features(**t_in).cpu()  # [1, D]

    if query_image_url:
        img = Image.open(BytesIO(requests.get(query_image_url, timeout=10).content)).convert("RGB")
        i_in = processor(images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            i_emb = model.get_image_features(**i_in).cpu()# [1, D]

    sims = None

    if t_emb is not None and i_emb is not None:
        sim_text = cosine_similarity(t_emb, text_embeddings)[0]
        sim_image = cosine_similarity(i_emb, image_embeddings)[0]
        sims = alpha * sim_image + (1 - alpha) * sim_text

    elif t_emb is not None:
        sim_text = cosine_similarity(t_emb, text_embeddings)[0]
        sim_image = cosine_similarity(t_emb, image_embeddings)[0]
        sims = alpha * sim_image + (1 - alpha) * sim_text

    elif i_emb is not None:
        sim_text = cosine_similarity(i_emb, text_embeddings)[0]
        sim_image = cosine_similarity(i_emb, image_embeddings)[0]
        sims = alpha * sim_text + (1 - alpha) * sim_image

    idxs = sims.argsort()[::-1][:top_k]
    results = df_meta_clean.iloc[idxs][['product_title', 'product_image_url']].copy()
    results['similarity_score'] = sims[idxs]
    return results.reset_index(drop=True)

In [None]:
import numpy as np
recommend(query_text="photo finish Professional airbrush makeup")

In [None]:
query_image_url = "https://temptupro.com/cdn/shop/products/s-one-essential-airbrush-kit-hero_2.jpg?v=1743181132&width=1780"

In [None]:
recommend(query_image_url=query_image_url)

In [None]:
recommend(query_text=" temptu airbrush makeup kit with compressor", query_image_url= query_image_url)

In [None]:
list(df_meta_clean['product_title'].sample(50))