<a href="https://colab.research.google.com/github/rohitrnath/LLM-Training-Colab-Sync/blob/main/OpenCLIP_Model_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install ftfy regex tqdm

Collecting ftfy
  Downloading ftfy-6.1.3-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m942.2 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.3


In [4]:
!git clone https://github.com/openai/CLIP

Cloning into 'CLIP'...
remote: Enumerating objects: 251, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 251 (delta 3), reused 3 (delta 0), pack-reused 243[K
Receiving objects: 100% (251/251), 8.93 MiB | 21.51 MiB/s, done.
Resolving deltas: 100% (127/127), done.


In [9]:
%cd CLIP

/content/CLIP/clip


In [27]:
from typing import Callable

import torch
import torchvision

class ClipTextEncoder(torch.nn.Module):
    def __init__(self, net: torch.nn.Module):
        super().__init__()
        """ Wrapper for OpenAI CLIP."""
        self.net = net
        self.eot_token = 49407

    def forward(self, text: torch.Tensor):
        """Forward call on Open AI CLIP model.

        Inputs:
            text: torch.Tensor (Shape: [1, 77] context_length=77)
                Processed text tensor to be tokenized.

        Outputs:
            text_features: torch.Tensor [512 (transformer_width), num_text_prompts]
                Raw text features are returned. When multiplied to image features,
                you can obtain a matrix of cosine similarities between the
                corresponding image and text input.

        """
        clipped_text = torch.clip(text, min=0, max=self.eot_token)
        text_features = self.net.encode_text(clipped_text)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)
        return text_features

    def get_input_spec(
        self,
        batch_size: int = 1,
        text_length: int = 77,
    ) :
        # Get the input specification ordered (name -> (shape, type)) pairs for this model.
        #
        # This can be used with the qai_hub python API to declare
        # the model input specification upon submitting a profile job.
        return {
            "text": ((batch_size, text_length), "int32"),
        }


class ClipImageEncoder(torch.nn.Module):
    def __init__(self, net: torch.nn.Module):
        super().__init__()
        """ Wrapper for OpenAI Clip."""
        self.net = net
        self.eot_token = 49407

    def forward(self, image: torch.Tensor):
        """Forward call on Open AI Clip model.

        Inputs:
            image: torch.Tensor (Shape: [1, 3, 224, 224])
                Processed image tensor with values normalized to be between 0-1.
                Channel Layout: RGB

        Outputs:
            image_features: torch.Tensor [num_images, 512 (transformer_width)]
                Raw image features (multiplied to 100) are returned.
                When multiplied to text features, you can obtain a
                matrix of cosine similarities between the corresponding image and
                text input.

        """
        image_features = self.net.encode_image(image)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        return self.net.logit_scale.exp() * image_features

    def get_input_spec(
        self,
        height: int = 224,
        width: int = 224,
    ) :
        # Get the input specification ordered (name -> (shape, type)) pairs for this model.
        #
        # This can be used with the qai_hub python API to declare
        # the model input specification upon submitting a profile job.
        return {
            "image": ((1, 3, height, width), "float32"),
        }

In [54]:
from typing import Tuple

import torch
from PIL import Image


class ClipApp:
    """
    This class consists of light-weight "app code" that is required to perform end to end inference with Clip.

    The app uses 1 model:
        * Clip

    For a given image input, the app will:
        * pre-process the image
        * pre-process the text
        * Run Clip inference
    """

    def __init__(
        self,
        net: torch.nn.Module,
        preprocess: torchvision.transforms.transforms.Compose,
        tokenizer_func: Callable,
    ):
        # Open AI Clip
        self.text_encoder = ClipTextEncoder(net)
        self.image_encoder = ClipImageEncoder(net)
        # Preprocess Compose function from Open AI clip
        self.preprocess = preprocess
        self.tokenizer = tokenizer_func

    def predict(self, *args, **kwargs):
        # See predict_similarity.
        return self.predict_similarity(*args, **kwargs)

    def predict_similarity(
        self, image: torch.Tensor, text: torch.Tensor
    ) -> torch.Tensor:
        """
        Inputs:
            image: torch.Tensor (Shape: [1, 3, 224, 224])
                Processed image tensor with values normalized to be between 0-1.
            text: torch.Tensor (Shape: [1, 77])
                Processed text tensor to be tokenized.

        Outputs:
            logits_per_image: torch.Tensor (Shape: [num_images, num_text_prompts])

                Given a batch of images and a batch of text tokens, returns a tensor,
                containing the logit scores corresponding to each image per text input.
                The values are cosine similarities between the corresponding image and
                text features, times 100. The logits of text per image can be computed
                by doing a transpose.

        """
        with torch.no_grad():
            image_features = self.image_encoder(image)
            text_features = self.text_encoder(text)
            logits_per_image = image_features @ text_features.t()
        return logits_per_image.cpu().numpy()

    def process_image(self, image: Image) -> torch.Tensor:
        """Process image before calling forward.

        Inputs:
            image: PIL.Image
                Image loaded by Pillow must be provided.
                Example: image = Image.open('<path>')

        Outputs:
            processed_image: torch.Tensor (shape [1, 3, 224, 224])
                Layout: RGB
                The image is converted to torch tensor and normalized
                to be in the range of 0-1.
        """
        return self.preprocess(image).unsqueeze(0)

    def process_text(self, text: str) -> torch.Tensor:
        """Process text into tokens for forward call.

        Input:
            text: str
                Text prompt intended for inference.
                Example: "golden hour"

        Output:
            tokenized_tensor: torch.Tensor (shape: [1, 77])
            Example: tensor([[49406,  3878,  2232, 49407, 0, 0...]])

        """
        return self.tokenizer(text)

    def get_input_spec(
        self,
        image_size: Tuple[int, int] = (224, 224),
        text_size: Tuple[int, int] = (3, 77),
    ):
        # Get the input specification ordered (name -> (shape, type)) pairs for this model.
        #
        # This can be used with the qai_hub python API to declare
        # the model input specification upon submitting a profile job.
        if isinstance(image_size, int):
            image_size = (image_size, image_size)
        return {
            "image": ((1, 3, *image_size), "float32"),
            "text": (text_size, "int32"),
        }

In [50]:
import clip
PRETRAINED_WEIGHTS = "ViT-B/16"
tokenizer_func = clip.tokenize
net, preprocess = clip.load(PRETRAINED_WEIGHTS)

In [55]:
import os

import numpy as np
import torch



# Run Clip on a directory of images with a query text.
# The demo will display similarity score for each image.
def main(is_test: bool = False):
    # Demo parameters
    text_str = "eye retina operated image"
    image_dir = "/content"
    image_names = "image1.jpg,image2.jpg,image3.jpg"
    # Load model
    app = ClipApp(net, preprocess, tokenizer_func)

    image_names = image_names.split(",")
    text = app.process_text(text_str)
    images = []

    # Iterate through images and text provided by user
    for filename in image_names:
        # Make sure the file is an image
        if os.path.splitext(filename)[1].lower() in [".jpg", ".jpeg", ".png"]:
            if image_dir:
                image = os.path.join(image_dir, filename)
            # Preprocess image and text pair
            print(image)
            image = app.process_image(Image.open(image))
            images.append(image)

        else:
            print(f"Skipping file {filename}")

    images = torch.stack(images).squeeze(1)

    # Compute similarity
    predictions = app.predict_similarity(images, text).flatten()

    # Display all the images and their score wrt to the text prompt provided.
    print(f"Searching images by prompt: {text}")
    for i in range(len(predictions)):
        print(
            f"\t Image with name: {image_names[i]} has a similarity score={predictions[i]}"
        )

    # Show image
    print("Displaying the most relevant image")

    print(np.argmax(predictions))

if __name__ == "__main__":
    main()

/content/image1.jpg
/content/image2.jpg
/content/image3.jpg
Searching images by prompt: tensor([[49406,  3272, 36919, 23031,  2867, 49407,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], dtype=torch.int32)
	 Image with name: image1.jpg has a similarity score=26.78359031677246
	 Image with name: image2.jpg has a similarity score=19.889570236206055
	 Image with name: image3.jpg has a similarity score=21.584095001220703
Displaying the most relevant image
0


### Example given by OpenAI

In [56]:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("/content/image1.jpg")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "eye retina operated image"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

Label probs: [[7.2572782e-04 4.6486195e-04 9.9880946e-01]]
