# In [None]:
# pip install open_clip_torch pillow torch torchvision

import torch
import open_clip
from PIL import Image

# 1) Load the model and its image preprocessing transform
# Run on the GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# create_model_and_transforms returns three values; the second one is not
# used anywhere in this script, so it is discarded.
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-16",
    pretrained="laion2b_s34b_b79k",
)
model = model.to(device)
model.eval()  # evaluation mode: this script only runs inference

# 2) Prepare inputs
image_path = "your_image.jpg"  # replace with a local image
# Open the file inside a context manager so its handle is released as soon as
# the RGB copy has been made, instead of relying on implicit cleanup.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")
# unsqueeze(0) adds the leading batch dimension the model expects.
image_tensor = preprocess(image).unsqueeze(0).to(device)   # [1,3,224,224]

# Candidate captions; the image is scored against each of them.
texts = [
    "a photo of a cat",
    "a photo of a dog",
    "a bowl of fruit",
    "a scenic mountain landscape",
]
# The tokenizer name must match the model architecture loaded above.
tokenizer = open_clip.get_tokenizer("ViT-B-16")
text_tokens = tokenizer(texts).to(device)                  # [4,token_len]

# 3) Encode to the shared embedding space
with torch.no_grad():  # inference only: no gradients are needed
    img_feat = model.encode_image(image_tensor)            # [1,d]
    txt_feat = model.encode_text(text_tokens)              # [4,d]
    # L2-normalise both sides so the dot product below is a cosine similarity.
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)

    # 4) Similarities & probabilities (zero-shot classification)
    # Cosine similarities lie in [-1, 1]; a softmax over them directly is
    # almost uniform across the prompts. Scale by the model's learned
    # temperature (exp of its logit_scale parameter) first, as CLIP does in
    # its own forward pass, so the probabilities are actually discriminative.
    logit_scale = model.logit_scale.exp()
    logits = logit_scale * (img_feat @ txt_feat.T)         # [1,4]
    probs = logits.softmax(dim=-1).squeeze(0)              # [4]

# 5) Show results
# Convert the probabilities to plain Python floats once, for formatting.
prob_values = probs.tolist()
best_idx = int(probs.argmax().item())

print("Texts (with probs):")
for idx, (label, prob) in enumerate(zip(texts, prob_values)):
    # The caption is padded to 35 characters so the probability column aligns.
    print(f"  {idx}: {label:35s}  p={prob:.3f}")
print(f"\nTop match: {texts[best_idx]}")
