In [1]:
from clip.model import OnnxClip, softmax, get_similarity_scores
from PIL import Image

images = [Image.open("clip/data/dog.jpg").convert("RGB")]

texts = {"classification": ["a photo of a man", "a photo of a woman", "s photo of a dog"],
         "situational": ["a dog standing up", "a dog running", "a dog laying on grass"],
     }


onnx_model = OnnxClip(batch_size=16, type='siglip')

image_embeddings = onnx_model.get_image_embeddings(images)
text_embeddings_class = onnx_model.get_text_embeddings(texts['classification'])
text_embeddings_situational = onnx_model.get_text_embeddings(texts['situational'])


contexts = {"classification": text_embeddings_class,
            "situational": text_embeddings_situational,
           }

logits = get_similarity_scores(image_embeddings, contexts)
probabilities = softmax(logits['classification'])


for k,v in contexts.items():
    
    print(f'\ncontext: {k}\n')
    probabilities = softmax(logits[k])
    
    for text, p in zip(texts[k], probabilities[0]):
        probabilities = softmax(logits['classification'])
        print(f"Probability that the image is '{text}': {p:.3f}")

INFO:root:Available providers for ONNXRuntime: CPUExecutionProvider
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-384/resolve/main/processor_config.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-384/resolve/main/preprocessor_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-384/resolve/main/preprocessor_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-384/resolve/main/preprocessor_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-384/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-384/resolve/main/processo


context: classification

Probability that the image is 'a photo of a man': 0.212
Probability that the image is 'a photo of a woman': 0.212
Probability that the image is 's photo of a dog': 0.576

context: situational

Probability that the image is 'a dog standing up': 0.219
Probability that the image is 'a dog running': 0.226
Probability that the image is 'a dog laying on grass': 0.555


In [6]:
logits

{'classification': array([[1.7257822e-05, 2.6411062e-06, 9.9998003e-01]], dtype=float32),
 'situational': array([[0.01211032, 0.04465795, 0.9432317 ]], dtype=float32)}

In [11]:
text_embeddings.shape

(3, 768)

In [7]:

import errno
import os
import logging
from pathlib import Path
from typing import List, Tuple, Union, Iterable, Iterator, TypeVar, Optional
import gdown

import numpy as np
import onnxruntime as ort
from PIL import Image

from clip import Preprocessor, Tokenizer

logging.basicConfig(level=logging.DEBUG)


def softmax(x: np.ndarray) -> np.ndarray:
    """
    Computes softmax values for each sets of scores in x.
    This ensures the output sums to 1 for each image (along axis 1).
    """

    # Exponents
    exp_arr = np.exp(x)

    return exp_arr / np.sum(exp_arr, axis=1, keepdims=True)


def cosine_similarity(
    embeddings_1: np.ndarray, embeddings_2: np.ndarray
) -> np.ndarray:
    """Compute the pairwise cosine similarities between two embedding arrays.

    Args:
        embeddings_1: An array of embeddings of shape (N, D).
        embeddings_2: An array of embeddings of shape (M, D).

    Returns:
        An array of shape (N, M) with the pairwise cosine similarities.
    """

    for embeddings in [embeddings_1, embeddings_2]:
        if len(embeddings.shape) != 2:
            raise ValueError(
                f"Expected 2-D arrays but got shape {embeddings.shape}."
            )

    d1 = embeddings_1.shape[1]
    d2 = embeddings_2.shape[1]
    if d1 != d2:
        raise ValueError(
            "Expected second dimension of embeddings_1 and embeddings_2 to "
            f"match, but got {d1} and {d2} respectively."
        )

    def normalize(embeddings):
        return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    embeddings_1 = normalize(embeddings_1)
    embeddings_2 = normalize(embeddings_2)

    return embeddings_1 @ embeddings_2.T


def get_similarity_scores(image_embedding: list,
                           queries: dict):
    """Compute pairwise similarity scores between two arrays of embeddings.

    """

    res_dict = {}

    for key, query in queries.items():
      if not isinstance(query, (np.ndarray, np.generic) ):
        continue

      if image_embedding.ndim == 1:
          # Convert to 2-D array using x[np.newaxis, :]
          # and remove the extra dimension at the end.
          res_dict[key] = softmax(get_similarity_scores(
              image_embedding[np.newaxis, :], query
          )[0])

      if query.ndim == 1:
          # Convert to 2-D array using x[np.newaxis, :]
          # and remove the extra dimension at the end.
          res_dict[key] = softmax(get_similarity_scores(
              image_embedding, query[np.newaxis, :]
          )[:, 0])

      res_dict[key] = softmax(cosine_similarity(image_embedding, query) * 100)


    return res_dict




class OnnxClip2:
    """
    This class can be utilised to predict the most relevant text snippet, given
    an image, without directly optimizing for the task, similarly to the
    zero-shot capabilities of GPT-2 and 3. The difference between this class
    and [CLIP](https://github.com/openai/CLIP) is that here we don't depend on
    `torch` or `torchvision`.
    """


    def __init__(
        self, model: str = "ViT-B/32", batch_size: Optional[int] = None
    ):
        """
        Instantiates the model and required encoding classes.

        Args:
            model: The model to utilise. Currently ViT-B/32 
            batch_size: If set, splits the lists in `get_image_embeddings`
                and `get_text_embeddings` into batches of this size before
                passing them to the model. The embeddings are then concatenated
                back together before being returned. This is necessary when
                passing large amounts of data (perhaps ~100 or more).
            
        """ 

        providers = ort.get_available_providers()

        if providers:
            logging.info(
                "Available providers for ONNXRuntime: %s", ", ".join(providers)
            )
 

        self.embedding_size = 512
        self._model_urls = {'clip_image_model_vitb32.onnx': 'https://drive.google.com/file/d/1WbRBDaBLsVdAZRD_1deq0uYGhIVFNoAi/view?usp=drive_link',
                            'clip_text_model_vitb32.onnx': 'https://drive.google.com/file/d/1EC2ju-gIlLfBJ3un-1G5QFQzYi8DoA9o/view?usp=drive_link'}

        self.image_model, self.text_model = self._load_models(model)
        self._tokenizer = Tokenizer()
        self._preprocessor = Preprocessor()
        self._batch_size = batch_size
       
    
    @property
    def EMBEDDING_SIZE(self):
        raise RuntimeError("OnnxModel.EMBEDDING_SIZE is no longer supported,f please use the instance attribute: onnx_model.embedding_size")


    def _load_models(
        self,
        model: str,
    ) -> Tuple[ort.InferenceSession, ort.InferenceSession]:
      
  
        IMAGE_MODEL_FILE = "clip_image_model_vitb32.onnx"
        TEXT_MODEL_FILE = "clip_text_model_vitb32.onnx"
       
        base_dir = os.path.dirname(os.path.abspath(__file__))

        models = []

        for model_file in [IMAGE_MODEL_FILE, TEXT_MODEL_FILE]:
            path = os.path.join(base_dir, "data", model_file)
            models.append(self._load_model(path))

        return models[0], models[1]

    def _load_model(self, path: str):
        try:
            if os.path.exists(path):
                # `providers` need to be set explicitly since ORT 1.9
                return ort.InferenceSession(
                    path, providers=ort.get_available_providers()
                )
            else:
                raise FileNotFoundError(
                    errno.ENOENT,
                    os.strerror(errno.ENOENT),
                    path,
                )
        except FileNotFoundError:

            gdown.download(url=self._model_urls[os.path.basename(path)], 
                            output=path, 
                            fuzzy=True
                            )
        
            # `providers` need to be set explicitly since ORT 1.9
            return ort.InferenceSession(
                path, providers=ort.get_available_providers()
            )

    def get_image_embeddings(
        self,
        images: Iterable[Union[Image.Image, np.ndarray]],
        with_batching: bool = True,
    ) -> np.ndarray:
        """Compute the embeddings for a list of images.

        Args:
            images: A list of images to run on. Each image must be a 3-channel
                (RGB) image. Can be any size, as the preprocessing step will
                resize each image to size (224, 224).
            with_batching: Whether to use batching - see the `batch_size` param
                in `__init__()`

        Returns:
            An array of embeddings of shape (len(images), embedding_size).
        """
        if not with_batching or self._batch_size is None:
            # Preprocess images
            images = [
                self._preprocessor.encode_image(image) for image in images
            ]
            if not images:
                return self._get_empty_embedding()

            batch = np.concatenate(images)

            return self.image_model.run(None, {"IMAGE": batch})[0]

        else:
            embeddings = []
            for batch in to_batches(images, self._batch_size):
                embeddings.append(
                    self.get_image_embeddings(batch, with_batching=False)
                )

            if not embeddings:
                return self._get_empty_embedding()

            return np.concatenate(embeddings)

    def get_text_embeddings(
        self, texts: Iterable[str], with_batching: bool = True
    ) -> np.ndarray:
        """Compute the embeddings for a list of texts.

        Args:
            texts: A list of texts to run on. Each entry can be at most
                77 characters.
            with_batching: Whether to use batching - see the `batch_size` param
                in `__init__()`

        Returns:
            An array of embeddings of shape (len(texts), embedding_size).
        """
        if not with_batching or self._batch_size is None:
            text = self._tokenizer.encode_text(texts)
            if len(text) == 0:
                return self._get_empty_embedding()

            return self.text_model.run(None, {"TEXT": text})[0]
        else:
            embeddings = []
            for batch in to_batches(texts, self._batch_size):
                embeddings.append(
                    self.get_text_embeddings(batch, with_batching=False)
                )

            if not embeddings:
                return self._get_empty_embedding()

            return np.concatenate(embeddings)

    def _get_empty_embedding(self):
        return np.empty((0, self.embedding_size), dtype=np.float32)
    
    def encode_text_with_prompt_ensemble(self, model, texts, device, prompt_templates=None):

        # using default prompt templates for ImageNet
        if prompt_templates == None:
            prompt_templates = ['a bad photo of a {}.', 'a photo of many {}.', 'a sculpture of a {}.', 'a photo of the hard to see {}.', 'a low resolution photo of the {}.', 'a rendering of a {}.', 'graffiti of a {}.', 'a bad photo of the {}.', 'a cropped photo of the {}.', 'a tattoo of a {}.', 'the embroidered {}.', 'a photo of a hard to see {}.', 'a bright photo of a {}.', 'a photo of a clean {}.', 'a photo of a dirty {}.', 'a dark photo of the {}.', 'a drawing of a {}.', 'a photo of my {}.', 'the plastic {}.', 'a photo of the cool {}.', 'a close-up photo of a {}.', 'a black and white photo of the {}.', 'a painting of the {}.', 'a painting of a {}.', 'a pixelated photo of the {}.', 'a sculpture of the {}.', 'a bright photo of the {}.', 'a cropped photo of a {}.', 'a plastic {}.', 'a photo of the dirty {}.', 'a jpeg corrupted photo of a {}.', 'a blurry photo of the {}.', 'a photo of the {}.', 'a good photo of the {}.', 'a rendering of the {}.', 'a {} in a video game.', 'a photo of one {}.', 'a doodle of a {}.', 'a close-up photo of the {}.', 'a photo of a {}.', 'the origami {}.', 'the {} in a video game.', 'a sketch of a {}.', 'a doodle of the {}.', 'a origami {}.', 'a low resolution photo of a {}.', 'the toy {}.', 'a rendition of the {}.', 'a photo of the clean {}.', 'a photo of a large {}.', 'a rendition of a {}.', 'a photo of a nice {}.', 'a photo of a weird {}.', 'a blurry photo of a {}.', 'a cartoon {}.', 'art of a {}.', 'a sketch of the {}.', 'a embroidered {}.', 'a pixelated photo of a {}.', 'itap of the {}.', 'a jpeg corrupted photo of the {}.', 'a good photo of a {}.', 'a plushie {}.', 'a photo of the nice {}.', 'a photo of the small {}.', 'a photo of the weird {}.', 'the cartoon {}.', 'art of the {}.', 'a drawing of the {}.', 'a photo of the large {}.', 'a black and white photo of a {}.', 'the plushie {}.', 'a dark photo of a {}.', 'itap of a {}.', 'graffiti of the {}.', 'a toy {}.', 'itap of my {}.', 'a photo of a cool {}.', 'a photo of a small {}.', 'a tattoo of the {}.', 'there is a {} in the scene.', 'there is the {} in the scene.', 'this is a {} in the scene.', 'this is the {} in the scene.', 'this is one {} in the scene.']

        text_features = []
        for t in texts:
            prompted_t = [template.format(t) for template in prompt_templates]
            propmpted_t  = self._tokenizer.encode_text(prompted_t)
            class_embeddings = self.text_model.run(None, {"TEXT": text})[0]
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            class_embedding /= class_embedding.norm()
            text_features.append(class_embedding)
        text_features = torch.stack(text_features, dim=1).to(device).t()

        return text_features



T = TypeVar("T")


def to_batches(items: Iterable[T], size: int) -> Iterator[List[T]]:
    """
    Splits an iterable (e.g. a list) into batches of length `size`. Includes
    the last, potentially shorter batch.

    Examples:
        >>> list(to_batches([1, 2, 3, 4], size=2))
        [[1, 2], [3, 4]]
        >>> list(to_batches([1, 2, 3, 4, 5], size=2))
        [[1, 2], [3, 4], [5]]

        # To limit the number of batches returned
        # (avoids reading the rest of `items`):
        >>> import itertools
        >>> list(itertools.islice(to_batches([1, 2, 3, 4, 5], size=2), 1))
        [[1, 2]]

    Args:
        items: The iterable to split.
        size: How many elements per batch.
    """
    if size < 1:
        raise ValueError("Chunk size must be positive.")

    batch = []
    for item in items:
        batch.append(item)

        if len(batch) == size:
            yield batch
            batch = []

    # The last, potentially incomplete batch
    if batch:
        yield batch



def encode_text_with_prompt_ensemble(self, model, texts, device, prompt_templates=None):

    # using default prompt templates for ImageNet
    if prompt_templates == None:
        prompt_templates = ['a bad photo of a {}.', 'a photo of many {}.', 'a sculpture of a {}.', 'a photo of the hard to see {}.', 'a low resolution photo of the {}.', 'a rendering of a {}.', 'graffiti of a {}.', 'a bad photo of the {}.', 'a cropped photo of the {}.', 'a tattoo of a {}.', 'the embroidered {}.', 'a photo of a hard to see {}.', 'a bright photo of a {}.', 'a photo of a clean {}.', 'a photo of a dirty {}.', 'a dark photo of the {}.', 'a drawing of a {}.', 'a photo of my {}.', 'the plastic {}.', 'a photo of the cool {}.', 'a close-up photo of a {}.', 'a black and white photo of the {}.', 'a painting of the {}.', 'a painting of a {}.', 'a pixelated photo of the {}.', 'a sculpture of the {}.', 'a bright photo of the {}.', 'a cropped photo of a {}.', 'a plastic {}.', 'a photo of the dirty {}.', 'a jpeg corrupted photo of a {}.', 'a blurry photo of the {}.', 'a photo of the {}.', 'a good photo of the {}.', 'a rendering of the {}.', 'a {} in a video game.', 'a photo of one {}.', 'a doodle of a {}.', 'a close-up photo of the {}.', 'a photo of a {}.', 'the origami {}.', 'the {} in a video game.', 'a sketch of a {}.', 'a doodle of the {}.', 'a origami {}.', 'a low resolution photo of a {}.', 'the toy {}.', 'a rendition of the {}.', 'a photo of the clean {}.', 'a photo of a large {}.', 'a rendition of a {}.', 'a photo of a nice {}.', 'a photo of a weird {}.', 'a blurry photo of a {}.', 'a cartoon {}.', 'art of a {}.', 'a sketch of the {}.', 'a embroidered {}.', 'a pixelated photo of a {}.', 'itap of the {}.', 'a jpeg corrupted photo of the {}.', 'a good photo of a {}.', 'a plushie {}.', 'a photo of the nice {}.', 'a photo of the small {}.', 'a photo of the weird {}.', 'the cartoon {}.', 'art of the {}.', 'a drawing of the {}.', 'a photo of the large {}.', 'a black and white photo of a {}.', 'the plushie {}.', 'a dark photo of a {}.', 'itap of a {}.', 'graffiti of the {}.', 'a toy {}.', 'itap of my {}.', 'a photo of a cool {}.', 'a photo of a small {}.', 'a tattoo of the {}.', 'there is a {} in the scene.', 'there is the {} in the scene.', 'this is a {} in the scene.', 'this is the {} in the scene.', 'this is one {} in the scene.']

    text_features = []
    for t in texts:
        prompted_t = [template.format(t) for template in prompt_templates]
        propmpted_t  = self._tokenizer.encode_text(prompted_t)
        class_embeddings = self.text_model.run(None, {"TEXT": text})[0]
        class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
        class_embedding = class_embeddings.mean(dim=0)
        class_embedding /= class_embedding.norm()
        text_features.append(class_embedding)
    text_features = torch.stack(text_features, dim=1).to(device).t()

    return text_features


def get_similarity_map(sm, shape):

    # min-max norm
    sm = (sm - sm.min(1, keepdim=True)[0]) / (sm.max(1, keepdim=True)[0] - sm.min(1, keepdim=True)[0])

    # reshape
    side = int(sm.shape[1] ** 0.5) # square output
    sm = sm.reshape(sm.shape[0], side, side, -1).permute(0, 3, 1, 2)

    # interpolate
    sm = torch.nn.functional.interpolate(sm, shape, mode='bilinear')
    sm = sm.permute(0, 2, 3, 1)
    
    return sm


def clip_feature_surgery(image_features, text_features, redundant_feats=None, t=2):

    if redundant_feats != None:
        similarity = image_features @ (text_features - redundant_feats).t()

    else:
        # weights to restrain influence of obvious classes on others
        prob = image_features[:, :1, :] @ text_features.t()
        prob = (prob * 2).softmax(-1)
        w = prob / prob.mean(-1, keepdim=True)

        # element-wise multiplied features
        b, n_t, n_i, c = image_features.shape[0], text_features.shape[0], image_features.shape[1], image_features.shape[2]
        feats = image_features.reshape(b, n_i, 1, c) * text_features.reshape(1, 1, n_t, c)
        feats *= w.reshape(1, 1, n_t, 1)
        redundant_feats = feats.mean(2, keepdim=True) # along cls dim
        feats = feats - redundant_feats
        
        # sum the element-wise multiplied features as cosine similarity
        similarity = feats.sum(-1)

    return similarity


# sm shape N_t
def similarity_map_to_points(sm, shape, t=0.8, down_sample=2):
    side = int(sm.shape[0] ** 0.5)
    sm = sm.reshape(1, 1, side, side)

    # down sample to smooth results
    down_side = side // down_sample
    sm = torch.nn.functional.interpolate(sm, (down_side, down_side), mode='bilinear')[0, 0, :, :]
    h, w = sm.shape
    sm = sm.reshape(-1)

    sm = (sm - sm.min()) / (sm.max() - sm.min())
    rank = sm.sort(0)[1]
    scale_h = float(shape[0]) / h
    scale_w = float(shape[1]) / w

    num = min((sm >= t).sum(), sm.shape[0] // 2)
    labels = np.ones(num * 2).astype('uint8')
    labels[num:] = 0
    points = []

    # positives
    for idx in rank[-num:]:
        x = min((idx % w + 0.5) * scale_w, shape[1] - 1) # +0.5 to center
        y = min((idx // w + 0.5) * scale_h, shape[0] - 1)
        points.append([int(x.item()), int(y.item())])

    # negatives
    for idx in rank[:num]:
        x = min((idx % w + 0.5) * scale_w, shape[1] - 1)
        y = min((idx // w + 0.5) * scale_h, shape[0] - 1)
        points.append([int(x.item()), int(y.item())])

    return points, labels


In [1]:
from clip.model import OnnxClip, softmax, get_similarity_scores
from PIL import Image



all_texts = ['airplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle', 'bird', 'boat', 'book', 'bottle', 'building', 'bus', 'cabinet', 'car', 'cat', 'ceiling', 'chair', 'cloth', 'computer', 'cow', 'cup', 'curtain', 'dog', 'door', 'fence', 'floor', 'flower', 'food', 'grass', 'ground', 'horse', 'keyboard', 'light', 'motorbike', 'mountain', 'mouse', 'person', 'plate', 'platform', 'potted plant', 'road', 'rock', 'sheep', 'shelves', 'sidewalk', 'sign', 'sky', 'snow', 'sofa', 'table', 'track', 'train', 'tree', 'truck', 'tv monitor', 'wall', 'water', 'window', 'wood']
target_texts = ['bench', 'person', 'ground', 'building']

# Your images/texts will get split into batches of this size before being
# passed to CLIP, to limit memory usage
onnx_model = OnnxClip(batch_size=16)

# Unlike the original CLIP, there is no need to run tokenization/preprocessing
# separately - simply run get_image_embeddings directly on PIL images/NumPy
# arrays, and run get_text_embeddings directly on strings.
image_embeddings = onnx_model.get_image_embeddings(images)

encoded_image = onnx_model._preprocessor.encode_image(images[0])
print(encoded_image.shape)
print(image_embeddings.shape)
text_embeddings = onnx_model.get_text_embeddings(target_texts)

# To use the embeddings for zero-shot classification, you can use these two
# functions. Here we run on a single image, but any number is supported.
logits = get_similarity_scores(image_embeddings, {"ct1": text_embeddings})
probabilities = softmax(logits['ct1'])

print("Logits:", logits)

for text, p in zip(target_texts, probabilities[0]):
    print(f"Probability that the image is '{text}': {p:.3f}")

INFO:root:Available providers for ONNXRuntime: CPUExecutionProvider


(1, 3, 224, 224)
(1, 512)
Logits: {'ct1': array([[0.2674303 , 0.4776909 , 0.21659727, 0.03828153]], dtype=float32)}
Probability that the image is 'bench': 0.251
Probability that the image is 'person': 0.310
Probability that the image is 'ground': 0.239
Probability that the image is 'building': 0.200


In [3]:
import numpy as np
from numpy import linalg as LA

images = [Image.open("dog.jpg").convert("RGB")]
image_embeddings = onnx_model.get_image_embeddings(images)
image_embeddings = np.expand_dims(image_embeddings, axis=1)
image_features = image_embeddings / LA.norm(image_embeddings, axis=1, keepdims=True)
# Prompt ensemble for text features with normalization
text_features = onnx_model.encode_text_with_prompt_ensemble(all_texts)


In [4]:
features = image_features @ text_features.T

In [5]:
features.shape

(1, 1, 59)

In [6]:
features[:, 1:, :]

array([], shape=(1, 0, 59), dtype=float32)

In [None]:
features = image_features @ text_features.t()
similarity_map = clip.get_similarity_map(features[:, 1:, :], cv2_img.shape[:2])


In [None]:
image_embeddings = onnx_model.get_image_embeddings(images)
image_embeddings = np.expand_dims(image_embeddings, axis=1)
image_features = image_embeddings / LA.norm(image_embeddings, axis=1, keepdims=True)

In [20]:
text_features.shape

(59, 512)

In [27]:
image_features.shape

(1, 512)

In [44]:
image_embeddings = onnx_model.get_image_embeddings(images)
image_embeddings = np.expand_dims(image_embeddings, axis=1)
image_features = image_embeddings / LA.norm(image_embeddings, axis=1, keepdims=True)

In [18]:
def clip_feature_surgery(image_features, text_features, redundant_feats=None, t=2):

    if redundant_feats != None:
        similarity = image_features @ (text_features - redundant_feats).t()

    else:
        # weights to restrain influence of obvious classes on others
        prob = image_features[:, :1, :] @ text_features.t()
        prob = (prob * 2).softmax(-1)
        w = prob / prob.mean(-1, keepdim=True)

        # element-wise multiplied features
        b, n_t, n_i, c = image_features.shape[0], text_features.shape[0], image_features.shape[1], image_features.shape[2]
        feats = image_features.reshape(b, n_i, 1, c) * text_features.reshape(1, 1, n_t, c)
        feats *= w.reshape(1, 1, n_t, 1)
        redundant_feats = feats.mean(2, keepdim=True) # along cls dim
        feats = feats - redundant_feats
        
        # sum the element-wise multiplied features as cosine similarity
        similarity = feats.sum(-1)

    return similarity


In [42]:
image_features.shape

(1, 512)

In [45]:
similarity = clip_feature_surgery(image_features, text_features)

AttributeError: 'numpy.ndarray' object has no attribute 't'

In [46]:

with torch.no_grad():
    # Extract image features
    image_features = model.encode_image(image)
    image_features = image_features / image_features.norm(dim=1, keepdim=True)

    # Prompt ensemble for text features with normalization
    text_features = clip.encode_text_with_prompt_ensemble(model, all_texts, device)

    # Similarity map from image tokens with min-max norm and resize, B,H,W,N
    features = image_features @ text_features.t()
    similarity_map = clip.get_similarity_map(features[:, 1:, :], cv2_img.shape[:2])

    # Draw similarity map
    for b in range(similarity_map.shape[0]):
        for n in range(similarity_map.shape[-1]):
            if all_texts[n] not in target_texts:
                continue
            vis = (similarity_map[b, :, :, n].cpu().numpy() * 255).astype('uint8')
            vis = cv2.applyColorMap(vis, cv2.COLORMAP_JET)
            vis = cv2_img * 0.4 + vis * 0.6
            vis = cv2.cvtColor(vis.astype('uint8'), cv2.COLOR_BGR2RGB)
            print('CLIP:', all_texts[n])
            plt.imshow(vis)
            plt.show()

NameError: name 'torch' is not defined

In [3]:
from clip.model import OnnxClip, softmax, get_similarity_scores
from PIL import Image
from transformers import AutoProcessor, AutoModel
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")


images = [Image.open("clip/data/dog.jpg").convert("RGB")]
texts = ["a photo of a man", "a photo of a woman", "s photo of a dog"]

# Your images/texts will get split into batches of this size before being
# passed to CLIP, to limit memory usage
onnx_model = OnnxClip(batch_size=16, type='siglip')

# Unlike the original CLIP, there is no need to run tokenization/preprocessing
# separately - simply run get_image_embeddings directly on PIL images/NumPy
# arrays, and run get_text_embeddings directly on strings.
image_embeddings = onnx_model.get_image_embeddings(images)
text_embeddings = onnx_model.get_text_embeddings(texts)

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-224/resolve/main/processor_config.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-224/resolve/main/preprocessor_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-224/resolve/main/preprocessor_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-224/resolve/main/preprocessor_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-224/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /google/siglip-base-patch16-224/resolve/main/processor_config.json HTTP/1.1" 404 0
INFO:root:Available providers for ONNXRuntime: CPUExecutionProvider


NameError: name 'processor' is not defined

In [11]:
image_embeddings.shape

(1, 576, 768)

In [8]:
contexts = {'ct1': 'text_embeddings'}

logits = get_similarity_scores(image_embeddings, contexts)
probabilities = softmax(logits['ct1'])

print("Logits:", logits)

for text, p in zip(texts, probabilities[0]):
    print(f"Probability that the image is '{text}': {p:.3f}")



KeyError: 'ct1'

In [9]:
logits

{}

In [3]:
text_embeddings.shape

(3, 768)

In [8]:
text_embeddings.shape

(3, 64, 768)

In [2]:
import cv2
image = cv2.imread('dog.jpg')
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 


In [None]:
image.

In [None]:
import torch
def get_similarity_map(sm, shape):

    # min-max norm
    print
    sm = (sm - np.min(sm, axis=1, keepdims=True)[0]) / (np.max(sm, axis=1, keepdims=True)[0] - np.min(sm, axis=1, keepdims=True)[0])

    # reshape
    side = int(sm.shape[1] ** 0.5) # square output
    print(side)
    sm = sm.reshape(sm.shape[0], side, side, -1).transpose(0, 3, 1, 2)
    print('shaaape')
    print(sm.shape)
    print(shape)


    # interpolate
    sm = torch.nn.functional.interpolate(torch.Tensor(sm), shape, mode='bilinear')
    sm = sm.numpy()
    sm = np.transpose(sm, (0, 2, 3, 1))
    
    return sm

def clip_feature_surgery(image_features, text_features, redundant_feats=None, t=2):

    if redundant_feats != None:
        similarity = image_features @ (text_features - redundant_feats).T

    else:
        # weights to restrain influence of obvious classes on others
        prob = image_features[:, :1, :] @ text_features.T
        prob = (prob * 2).softmax(-1)
        w = prob / prob.mean(-1, keepdim=True)

        # element-wise multiplied features
        b, n_t, n_i, c = image_features.shape[0], text_features.shape[0], image_features.shape[1], image_features.shape[2]
        feats = image_features.reshape(b, n_i, 1, c) * text_features.reshape(1, 1, n_t, c)
        feats *= w.reshape(1, 1, n_t, 1)
        redundant_feats = feats.mean(2, keepdim=True) # along cls dim
        feats = feats - redundant_feats
        
        # sum the element-wise multiplied features as cosine similarity
        similarity = feats.sum(-1)

    return similarity

In [97]:
import numpy as np
from numpy import linalg as LA

images = [Image.open("demo.jpg").convert("RGB")]
image_embeddings = onnx_model.get_image_embeddings(images)
#image_embeddings = np.expand_dims(image_embeddings, axis=1)
image_features = image_embeddings / LA.norm(image_embeddings, axis=1, keepdims=True)
# Prompt ensemble for text features with normalization
text_features = onnx_model.encode_text_with_prompt_ensemble(all_texts)



In [2]:
image_features = image_embeddings / LA.norm(image_embeddings, axis=1, keepdims=True)
image_features = torch.tensor(image_embeddings) / torch.tensor(image_embeddings).norm(dim=1, keepdim=True)

features = image_features @ text_features.T
similarity_map = get_similarity_map(features[:, 1:, :].numpy(), np.array(images[0]).shape[:2])


# Apply feature surgery
similarity = clip_feature_surgery(image_features, text_features)
#similarity_map = get_similarity_map(similarity[:, 1:, :], cv2_img.shape[:2])
similarity_map = get_similarity_map(similarity[:, 1:, :].numpy(), cv2_img.shape[:2])

NameError: name 'image_embeddings' is not defined

In [3]:
import matplotlib.pyplot as plt

cv2_img = np.array(images[0])
# Draw similarity map
for b in range(similarity_map.shape[0]):
    for n in range(similarity_map.shape[-1]):
        if all_texts[n] not in target_texts:
            continue
        vis = (similarity_map[b, :, :, n] * 255).astype('uint8')
        vis = cv2.applyColorMap(vis, cv2.COLORMAP_JET)
        vis = cv2_img * 0.4 + vis * 0.6
        vis = cv2.cvtColor(vis.astype('uint8'), cv2.COLOR_BGR2RGB)
        print('CLIP:', all_texts[n])
        plt.imshow(vis)
        plt.show()

DEBUG:matplotlib:matplotlib data path: /usr/share/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/home/rhys/.config/matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is linux


DEBUG:matplotlib:CACHEDIR=/home/rhys/.cache/matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /home/rhys/.cache/matplotlib/fontlist-v330.json
DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.
DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.


NameError: name 'np' is not defined

In [None]:
def get_similarity_map(sm, shape):

    # min-max norm
    sm = (sm - sm.min(1, keepdim=True)[0]) / (sm.max(1, keepdim=True)[0] - sm.min(1, keepdim=True)[0])

    # reshape
    side = int(sm.shape[1] ** 0.5) # square output
    sm = sm.reshape(sm.shape[0], side, side, -1).permute(0, 3, 1, 2)

    # interpolate
    sm = torch.nn.functional.interpolate(sm, shape, mode='bilinear')
    sm = sm.permute(0, 2, 3, 1)
    
    return sm


def clip_feature_surgery(image_features, text_features, redundant_feats=None, t=2):

    if redundant_feats != None:
        similarity = image_features @ (text_features - redundant_feats).t()

    else:
        # weights to restrain influence of obvious classes on others
        prob = image_features[:, :1, :] @ text_features.t()
        prob = (prob * 2).softmax(-1)
        w = prob / prob.mean(-1, keepdim=True)

        # element-wise multiplied features
        b, n_t, n_i, c = image_features.shape[0], text_features.shape[0], image_features.shape[1], image_features.shape[2]
        feats = image_features.reshape(b, n_i, 1, c) * text_features.reshape(1, 1, n_t, c)
        feats *= w.reshape(1, 1, n_t, 1)
        redundant_feats = feats.mean(2, keepdim=True) # along cls dim
        feats = feats - redundant_feats
        
        # sum the element-wise multiplied features as cosine similarity
        similarity = feats.sum(-1)

    return similarity


# sm shape N_t
def similarity_map_to_points(sm, shape, t=0.8, down_sample=2):
    side = int(sm.shape[0] ** 0.5)
    sm = sm.reshape(1, 1, side, side)

    # down sample to smooth results
    down_side = side // down_sample
    sm = torch.nn.functional.interpolate(sm, (down_side, down_side), mode='bilinear')[0, 0, :, :]
    h, w = sm.shape
    sm = sm.reshape(-1)

    sm = (sm - sm.min()) / (sm.max() - sm.min())
    rank = sm.sort(0)[1]
    scale_h = float(shape[0]) / h
    scale_w = float(shape[1]) / w

    num = min((sm >= t).sum(), sm.shape[0] // 2)
    labels = np.ones(num * 2).astype('uint8')
    labels[num:] = 0
    points = []

    # positives
    for idx in rank[-num:]:
        x = min((idx % w + 0.5) * scale_w, shape[1] - 1) # +0.5 to center
        y = min((idx // w + 0.5) * scale_h, shape[0] - 1)
        points.append([int(x.item()), int(y.item())])

    # negatives
    for idx in rank[:num]:
        x = min((idx % w + 0.5) * scale_w, shape[1] - 1)
        y = min((idx // w + 0.5) * scale_h, shape[0] - 1)
        points.append([int(x.item()), int(y.item())])

    return points, labels
