In [1]:
import os
import pprint

import open_clip
import spacy
import torch

from PIL import Image
from nltk.stem import WordNetLemmatizer

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only used on CUDA. On CPU it is slow and, depending on the
# PyTorch build, some operators may not support float16, so the model and its
# inputs stay in float32 there.
dtype = torch.float16 if device == "cuda" else torch.float32
clip_precision = "fp16" if dtype == torch.float16 else "fp32"

# Single source of truth for the architecture name, so the model and the
# tokenizer below can never get out of sync.
CLIP_MODEL_NAME = "EVA02-E-14-plus"

# Load the EVA02 CLIP model together with its image preprocessing transform.
# The middle return value is not needed here and is discarded.
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
    CLIP_MODEL_NAME,
    pretrained="laion2b_s9b_b144k",
    precision=clip_precision,
    device=device,
)
clip_model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
clip_tokenizer = open_clip.get_tokenizer(CLIP_MODEL_NAME)

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()
# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

In [2]:
# Input image, converted to RGB, and the caption whose nouns will be scored
# against it.
img = Image.open("../clean.png").convert("RGB")
caption = "a small bird with yellow and brown plumage stands on a metal surface, with green foliage in the background."

# Run the spaCy pipeline over the caption to get part-of-speech tags.
doc = nlp(caption)

# Keep the surface text of every token tagged as a common or proper noun.
identified_nouns = {
    token.text for token in doc if token.pos_ in ("NOUN", "PROPN")
}

# Lower-case and lemmatize each noun. Note that, despite its name,
# `concept_list` is a set, so duplicates collapse and it has no fixed order.
concept_list = {lemmatizer.lemmatize(noun.lower()) for noun in identified_nouns}

In [3]:
# `concept_list` is a set, so freeze it into a sorted list first: the tokenised
# batch and the probabilities zipped with the concepts below must line up
# index-for-index, and a fixed order makes tie-breaking reproducible.
concepts = sorted(concept_list)
if not concepts:
    raise ValueError("No noun concepts were extracted from the caption.")

# Add a leading batch dimension to the preprocessed image and move it to the
# model's device and dtype; tokenise one text entry per concept.
image = clip_preprocess(img).unsqueeze(0).to(device, dtype)
text = clip_tokenizer(concepts).to(device)

# Inference only: disable autograd so no graph is built for the encoders.
with torch.no_grad():
    # Compute features
    image_features = clip_model.encode_image(image)
    text_features = clip_model.encode_text(text)
    # L2-normalise so the dot product below is a cosine similarity.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    # Convert to probabilities: scaled similarities, softmax over the concepts.
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

# top 5 concepts and their probabilities (row 0 is the single input image)
top_concepts = sorted(
    (
        {"concept": concept, "prob": prob}
        for concept, prob in zip(concepts, text_probs[0].tolist())
    ),
    key=lambda entry: entry["prob"],
    reverse=True,
)[:5]

In [4]:
# Show the ranked concepts, then call out the best match. The guard avoids an
# IndexError when no concept was scored.
pprint.pprint(top_concepts)
if top_concepts:
    print(f"Top concept: {top_concepts[0]['concept']}")
else:
    print("Top concept: <none>")

[{'concept': 'bird', 'prob': 0.9951171875},
 {'concept': 'metal', 'prob': 0.0038204193115234375},
 {'concept': 'plumage', 'prob': 0.001129150390625},
 {'concept': 'background', 'prob': 3.409385681152344e-05},
 {'concept': 'foliage', 'prob': 2.574920654296875e-05}]
Top concept: bird
