## CLIP LOADING AND INFERENCE TESTING:

In [2]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import time
import urllib

# REPORT THE INSTALLED TORCH BUILD AND WHETHER A CUDA DEVICE IS USABLE
print("TORCH VERSION:", torch.__version__)
print("CUDA AVAILABLE:", torch.cuda.is_available())

TORCH VERSION: 2.9.1+cpu
CUDA AVAILABLE: False


In [13]:
# LOAD CLIP MODEL

# SINGLE CHECKPOINT ID SHARED BY MODEL AND PROCESSOR SO THE TWO CANNOT DRIFT APART
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

print("LOADING CLIP MODEL")
# perf_counter IS A MONOTONIC CLOCK, SO THE INTERVAL IS NOT SKEWED BY SYSTEM CLOCK ADJUSTMENTS
start_time = time.perf_counter()
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
device = "cuda" if torch.cuda.is_available() else "cpu"  # FALL BACK TO CPU WHEN NO GPU IS PRESENT
model = model.to(device)
load_time = time.perf_counter() - start_time  # COVERS BOTH LOADS AND THE DEVICE TRANSFER

print(f"CLIP MODEL LOADED IN {load_time:.2f} SECONDS")
# READ THE DEVICE BACK FROM AN ACTUAL PARAMETER TO CONFIRM WHERE THE WEIGHTS ENDED UP
print(f"MODEL DEVICE: {next(model.parameters()).device}")




LOADING CLIP MODEL
CLIP MODEL LOADED IN 2.71 SECONDS
MODEL DEVICE: cpu


In [14]:
# TEST CLIP INFERENCE ON SINGLE IMAGE-TEXT PAIR

# LOCAL IMAGE PATH (NOTHING IS DOWNLOADED IN THIS CELL)
image_path = "img/car_image.jpg" # EXAMPLE SINGLE IMAGE TAKEN FROM WIKIPEDIA (REAL LABEL = CAR)

# LOAD IMAGE
image = Image.open(image_path).convert("RGB")
image.thumbnail((224, 224)) # RESIZE FOR SPEED (MODIFIES THE IMAGE IN PLACE)

# INFERENCE RUN

print("RUNNING INFERENCE ON CLIP MODEL")
texts = ["a photo of a cat", "a photo of a dog", "a photo of a car"]  # EXAMPLE TEXTS
inputs = processor(text = texts, images = image, return_tensors = "pt", padding = True)
# MOVE EVERY INPUT TENSOR TO THE SAME DEVICE AS THE MODEL
inputs = {k: v.to(device) for k, v in inputs.items()}

# TIME METRICS
# GRADIENTS ARE NOT NEEDED FOR INFERENCE; DISABLING THEM AVOIDS BUILDING THE AUTOGRAD GRAPH
with torch.no_grad():
    # perf_counter IS A MONOTONIC CLOCK INTENDED FOR MEASURING SHORT INTERVALS
    start = time.perf_counter()
    outputs = model(**inputs)
    inference_time = time.perf_counter() - start

logits_per_image = outputs.logits_per_image  # THIS IS THE IMAGE-TO-TEXT SIMILARITY SCORE
probs = logits_per_image.softmax(dim=1)  # WE CAN TAKE THE SOFTMAX TO GET THE PROBABILITIES OF EACH TEXT

print(f"INFERENCE COMPLETED IN {inference_time:.2f} SECONDS")
print("\nSCORES:")
# probs[0] IS THE ROW FOR THE SINGLE INPUT IMAGE, ONE ENTRY PER CANDIDATE TEXT
for text, prob in zip(texts, probs[0]):
    print(f"'{text}': {prob.item():.4f}")


RUNNING INFERENCE ON CLIP MODEL
INFERENCE COMPLETED IN 0.24 SECONDS

SCORES:
'a photo of a cat': 0.0011
'a photo of a dog': 0.0006
'a photo of a car': 0.9983


## LLaVA LOADING AND INFERENCE TESTING:


In [16]:
from transformers import AutoProcessor, LlavaForConditionalGeneration

print("LOADING LLavA MODEL")

# SINGLE CHECKPOINT ID SHARED BY PROCESSOR AND MODEL SO THE TWO CANNOT DRIFT APART
LLAVA_CHECKPOINT = "llava-hf/llava-1.5-7b-hf"  # MODEL NAME

# NOTE: THIS CELL REBINDS `processor` AND `model`, REPLACING THE CLIP OBJECTS LOADED EARLIER;
# RE-RUN THE CLIP LOADING CELL BEFORE RE-RUNNING THE CLIP INFERENCE CELL
# perf_counter IS A MONOTONIC CLOCK, SO THE INTERVAL IS NOT SKEWED BY SYSTEM CLOCK ADJUSTMENTS
start_time = time.perf_counter()
processor = AutoProcessor.from_pretrained(LLAVA_CHECKPOINT)
model = LlavaForConditionalGeneration.from_pretrained(
    LLAVA_CHECKPOINT,
    # NOTE(REVIEW): FP16 MAY NOT BE FASTER ON A CPU-ONLY MACHINE - CONFIRM ON THE TARGET HARDWARE
    torch_dtype=torch.float16, # USE FP16 FOR SPEED
    device_map="auto", # AUTOMATICALLY PLACE ON GPU IF AVAILABLE
)

load_time = time.perf_counter() - start_time  # COVERS BOTH THE PROCESSOR AND THE MODEL LOAD

print(f"LLaVA MODEL LOADED IN {load_time:.2f} SECONDS")

LOADING LLavA MODEL


Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  6.43it/s]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


LLaVA MODEL LOADED IN 3.40 SECONDS


In [None]:
from PIL import Image
import torch
import time
from transformers import AutoProcessor

print("TESTING LLaVA INFERENCE (STANDARD AUTO-PROCESSOR)")

# LOAD IMAGE / PROCESSOR
# `image_path` AND `model` COME FROM EARLIER CELLS; THE PROCESSOR IS RELOADED HERE
image = Image.open(image_path).convert("RGB")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

# DEFINE PROMPT
prompt = "USER: <image>\nDescribe this image in detail.\nASSISTANT:"

# INPUTS
# MOVE TO THE MODEL'S DEVICE AND CAST FLOATING-POINT TENSORS (PIXEL VALUES) TO THE MODEL'S DTYPE
# SO THEY MATCH THE FP16 WEIGHTS; INTEGER TENSORS SUCH AS input_ids ARE NOT CAST
inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device, model.dtype)

# INFERENCE RUN
print(f"\nPROMPT: '{prompt}'")
print("GENERATING RESPONSE...")

# perf_counter IS A MONOTONIC CLOCK INTENDED FOR MEASURING INTERVALS
start = time.perf_counter()
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=False  # GREEDY DECODING
    )
inference_time = time.perf_counter() - start

# DECODE RESPONSE
# generate() RETURNS THE PROMPT TOKENS FOLLOWED BY THE NEW TOKENS; DROP THE PROMPT PART SO
# `response` HOLDS ONLY THE MODEL'S ANSWER (THE PROMPT IS ALREADY PRINTED ABOVE)
prompt_length = inputs["input_ids"].shape[1]
response = processor.decode(output_ids[0][prompt_length:], skip_special_tokens=True).strip()

print(f"\n INFERENCE COMPLETED IN {inference_time:.2f} SECONDS")
print(f"\n LLaVA Response:\n{response}")
