## CLIP LOADING AND INFERENCE TESTING:

In [1]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import time
import urllib

print(f"TORCH VERSION: {torch.__version__}")
print(f"CUDA AVAILABLE: {torch.cuda.is_available()}")

  from .autonotebook import tqdm as notebook_tqdm


TORCH VERSION: 2.9.1+cpu
CUDA AVAILABLE: False


In [2]:
# LOAD CLIP MODEL

print("LOADING CLIP MODEL")
start_time = time.time()
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
load_time = time.time() - start_time

print(f"CLIP MODEL LOADED IN {load_time:.2f} SECONDS")
print(f"MODEL DEVICE: {next(model.parameters()).device}")




LOADING CLIP MODEL


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIP MODEL LOADED IN 2.32 SECONDS
MODEL DEVICE: cpu


In [6]:
# TEST CLIP INFERENCE ON SINGLE IMAGE-TEXT PAIR

# DOWNLOAD IMAGE
image_path = "img/cat_image.jpg" # EXAMPLE SINGLE IMAGE TAKEN FROM WIKIPEDIA (REAL LABEL = CAT)

# LOAD IMAGE
image = Image.open(image_path).convert("RGB")
image.thumbnail((224, 224)) # RESIZE FOR SPEED 

# INFERENCE RUN

print("RUNNING INFERENCE ON CLIP MODEL")
texts = ["a photo of a cat", "a photo of a dog", "a photo of a car"]  # EXAMPLE TEXTS
inputs = processor(text = texts, images = image, return_tensors = "pt", padding = True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# TIME METRICS
start = time.time()
outputs = model(**inputs)
inference_time = time.time() - start

logits_per_image = outputs.logits_per_image  # THIS IS THE IMAGE-TO-TEXT SIMILARITY SCORE
probs = logits_per_image.softmax(dim=1)  # WE CAN TAKE THE SOFTMAX TO GET THE PROBABILITIES OF EACH TEXT

print(f"INFERENCE COMPLETED IN {inference_time:.2f} SECONDS")
print(f"\nSCORES:")
for text, prob in zip(texts, probs[0]):
    print(f"'{text}': {prob.item():.4f}")


RUNNING INFERENCE ON CLIP MODEL
INFERENCE COMPLETED IN 0.07 SECONDS

SCORES:
'a photo of a cat': 0.9973
'a photo of a dog': 0.0027
'a photo of a car': 0.0001


## LLaVA LOADING AND INFERENCE TESTING:


In [7]:
from transformers import AutoProcessor, LlavaForConditionalGeneration

print("LOADING LLavA MODEL")

start_time = time.time()
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",  # MODEL NAME
    torch_dtype=torch.float16, # USE FP16 FOR SPEED
    device_map="auto", # AUTOMATICALLY PLACE ON GPU IF AVAILABLE
)

load_time = time.time() - start_time

print(f"LLaVA MODEL LOADED IN {load_time:.2f} SECONDS")

LOADING LLavA MODEL


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
`torch_dtype` is deprecated! Use `dtype` instead!
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installe

In [9]:
# TEST LLAVA INFERENCE ON IMAGE + TEXT

image = Image.open(image_path).convert("RGB")

# PROMPT
prompt = "Describe the image in detail."
inputs = processor(text=prompt, images= image, return_tensors="pt").to(model.device)

print("RUNNING INFERENCE ON LLaVA MODEL")
start = time.time()

# RESPONSE
output_ids = model.generate(**inputs, max_new_tokens=100, do_sample = False)
inference_time = time.time() - start

response = processor.decode(output_ids[0], skip_special_tokens=True)
print(f"INFERENCE COMPLETED IN {inference_time:.2f} SECONDS")
print(f"\nLLaVA RESPONSE: {response}")

RUNNING INFERENCE ON LLaVA MODEL


ValueError: Image features and image tokens do not match: tokens: 0, features 2359296