# CLIP embeddings

There are models that give you embeddings for both images and text. One of them is [CLIP](https://huggingface.co/openai/clip-vit-base-patch32). It is based on a different architecture than DenseNN or CNN: *transformers*, which we will discuss later.

In [None]:
# to install transformers
# !pip install -qq transformers

## To get embeddings for text/image

In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Loaded (model, processor) pairs keyed by checkpoint name, so repeated calls
# to get_clip_embeddings() do not reload the weights every time.
_CLIP_CACHE = {}


def _load_clip(model_name="openai/clip-vit-base-patch32"):
    """Return a cached ``(model, processor)`` pair for *model_name*."""
    if model_name not in _CLIP_CACHE:
        model = CLIPModel.from_pretrained(model_name)
        model.eval()  # inference only: make sure dropout etc. are disabled
        processor = CLIPProcessor.from_pretrained(model_name)
        _CLIP_CACHE[model_name] = (model, processor)
    return _CLIP_CACHE[model_name]


def get_clip_embeddings(input_data, input_type='text'):
    """Embed a text or an image with CLIP (``openai/clip-vit-base-patch32``).

    Args:
        input_data: For ``input_type='text'``, whatever the CLIP processor
            accepts as ``text`` (e.g. a string). For ``input_type='image'``,
            a file path (``str`` or ``os.PathLike``) or a ``PIL.Image.Image``.
        input_type: Either ``'text'`` or ``'image'``.

    Returns:
        A NumPy array with the embeddings produced by
        ``get_text_features`` / ``get_image_features``.
        (Presumably one row per input; confirm the shape for your checkpoint.)

    Raises:
        ValueError: If ``input_type`` is not ``'text'`` or ``'image'``, or if
            an image input is neither a path nor a PIL image.
    """
    import os

    # Validate cheap things first so a typo does not cost a model load.
    if input_type not in ('text', 'image'):
        raise ValueError("Invalid input_type. Choose 'text' or 'image'")

    image = None
    if input_type == 'image':
        if isinstance(input_data, (str, os.PathLike)):
            # Image.open is lazy; convert() reads the pixels so the file can
            # be closed as soon as the ``with`` block exits.
            with Image.open(input_data) as opened:
                image = opened.convert("RGB")
        elif isinstance(input_data, Image.Image):
            image = input_data
        else:
            raise ValueError("For image input, provide either a file path or a PIL Image object")

    model, processor = _load_clip()

    # No gradients are needed for feature extraction.
    with torch.no_grad():
        if input_type == 'text':
            inputs = processor(text=input_data, return_tensors="pt", padding=True, truncation=True)
            embeddings = model.get_text_features(**inputs)
        else:
            inputs = processor(images=image, return_tensors="pt")
            embeddings = model.get_image_features(**inputs)

    return embeddings.cpu().numpy()

# Demo 1: embed a single sentence and report the resulting array shape.
text_input = "A beautiful sunset over the ocean"
text_embedding = get_clip_embeddings(
    input_data=text_input,
    input_type='text',
)
print("Text embedding shape:", text_embedding.shape)

# Demo 2: embed a single image from disk (replace the placeholder path).
image_path = "path/to/your/image.jpg"
image_embedding = get_clip_embeddings(
    input_data=image_path,
    input_type='image',
)
print("Image embedding shape:", image_embedding.shape)

## To get embeddings for all images in the folder

In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import Dataset, DataLoader
import os
import numpy as np

class ImageDataset(Dataset):
    """Dataset over the image files found directly inside one directory.

    Files are selected by extension (case-insensitively) and kept in sorted
    filename order, so ``image_paths[i]`` always corresponds to item ``i``
    regardless of the order in which the OS lists the directory.

    Args:
        image_dir: Directory to scan (not recursive).
        processor: Callable invoked as ``processor(images=..., return_tensors="pt")``
            whose result is indexed with ``'pixel_values'``.
    """

    # Extensions treated as images; compared against the lower-cased filename.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

    def __init__(self, image_dir, processor):
        self.image_paths = [
            os.path.join(image_dir, name)
            for name in sorted(os.listdir(image_dir))
            if name.lower().endswith(self.IMAGE_EXTENSIONS)
        ]
        self.processor = processor

    def __len__(self):
        """Return the number of image files found."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return the processed ``pixel_values`` entry for image ``idx``."""
        # Image.open is lazy; convert() reads the pixels so the file handle
        # can be released as soon as the ``with`` block exits.
        with Image.open(self.image_paths[idx]) as opened:
            image = opened.convert("RGB")
        # The processor returns a batch of one; take its only element.
        return self.processor(images=image, return_tensors="pt")['pixel_values'][0]

def get_clip_embeddings_batch(image_dir, batch_size=32, device='cuda', *, num_workers=4):
    """Embed every image in ``image_dir`` with CLIP, processing in batches.

    Args:
        image_dir: Directory whose image files are embedded (see ``ImageDataset``).
        batch_size: Number of images per forward pass.
        device: Torch device the model and batches are moved to.
        num_workers: Worker processes used by the ``DataLoader``. Pass ``0``
            to load images in the main process (NOTE(review): likely needed
            where worker processes cannot import a notebook-defined dataset;
            confirm for your platform).

    Returns:
        A 2-D NumPy array with one row per image. Rows follow the order of
        ``ImageDataset(image_dir, ...).image_paths`` because the loader does
        not shuffle. If no images are found, an empty array of shape
        ``(0, model.config.projection_dim)`` is returned.
    """
    # Load the CLIP model and processor
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    dataset = ImageDataset(image_dir, processor)
    if len(dataset) == 0:
        # np.concatenate([]) raises ValueError; return a well-formed empty
        # result instead. float32 assumes the checkpoint's default dtype.
        return np.empty((0, model.config.projection_dim), dtype=np.float32)

    model = model.to(device)
    model.eval()

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,  # keep rows aligned with dataset.image_paths
        num_workers=num_workers,
    )

    all_embeddings = []
    # No gradients are needed for feature extraction.
    with torch.no_grad():
        for batch in dataloader:
            batch = batch.to(device)
            embeddings = model.get_image_features(pixel_values=batch)
            # Move each batch off the device right away to bound GPU memory use.
            all_embeddings.append(embeddings.cpu().numpy())

    return np.concatenate(all_embeddings)

# Settings for the batch run (replace the placeholder folder path).
image_dir = "path/to/your/folder/"
batch_size = 32

# Use the GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Embed every image in the folder and report the resulting array shape.
embeddings = get_clip_embeddings_batch(
    image_dir,
    batch_size=batch_size,
    device=device,
)
print("Embeddings shape:", embeddings.shape)