In [10]:
import os
import json
import nltk
import torch
import clip
from PIL import Image
from nltk.corpus import wordnet
from transformers import BlipProcessor, BlipForConditionalGeneration
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Download NLTK data if not already downloaded
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")

# Load CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load BLIP model for captioning
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

# File to store image captions and tags
DATA_FILE = "image_tags.json"

# Load or initialize the JSON file for storing processed images
if os.path.exists(DATA_FILE):
    with open(DATA_FILE, "r") as file:
        image_data = json.load(file)
else:
    image_data = {}


# Function to generate caption for an image
def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(image, return_tensors="pt").to(device)
    out = caption_model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption


# Function to extract nouns from text
def extract_nouns(text):
    words = nltk.word_tokenize(text)
    pos_tags = nltk.pos_tag(words)
    nouns = [word for word, pos in pos_tags if pos.startswith("NN")]
    return nouns


# Function to find synonyms of a word using WordNet
def find_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(
                lemma.name().replace("_", " ")
            )  # Replace underscores with spaces for readability
    return list(synonyms)


# Function to generate tags with synonyms for new images
def generate_tags_with_synonyms(image_folder):
    for img_file in os.listdir(image_folder):
        if img_file.lower().endswith(("jpg", "jpeg", "png", "webp")):
            image_path = os.path.join(image_folder, img_file)

            # Skip images that are already processed and stored in the JSON file
            if img_file in image_data:
                print(f"Skipping already processed image: {img_file}")
                continue

            # Step 1: Generate caption for the image
            caption = generate_caption(image_path)
            print(f"Caption for {img_file}: {caption}")

            # Step 2: Extract nouns from the caption
            initial_tags = extract_nouns(caption)
            expanded_tags = set()

            # Step 3: Expand each noun with synonyms and add to tag set
            for tag in initial_tags:
                expanded_tags.add(tag)  # Include the original noun
                synonyms = find_synonyms(tag)
                expanded_tags.update(synonyms)  # Add synonyms to the tag set

            # Store the caption and tags in the image data dictionary
            image_data[img_file] = {"caption": caption, "tags": list(expanded_tags)}
            print(f"Tags for {img_file}: {image_data[img_file]['tags']}")

    # Save updated image data to the JSON file
    with open(DATA_FILE, "w") as file:
        json.dump(image_data, file, indent=4)
    print(f"Updated data saved to {DATA_FILE}")


# Function to retrieve images based on free-text query
def retrieve_images(query):
    # Tokenize and normalize the query (convert to lowercase)
    query_tokens = set(nltk.word_tokenize(query.lower()))
    results = {}

    # Check each image's tags to see if they match the query tokens
    for img_file, data in image_data.items():
        tags = set(tag.lower() for tag in data["tags"])
        common_tags = query_tokens.intersection(tags)
        if common_tags:
            # Count of matching tags can be used as a score
            results[img_file] = len(common_tags)

    # Sort images by the number of matching tags (descending order)
    sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
    top_images = [img for img, score in sorted_results]

    return top_images


# Usage Example
image_folder = "images"  # Replace with your actual image folder path
generate_tags_with_synonyms(image_folder)

# Free-text query example
query = "woman in red dress"
top_images = retrieve_images(query)
print("Top retrieved images:", top_images)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\punya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Caption for 1.webp: two women in red and yellow dresses posing for the camera
Tags for 1.webp: ['crop', 'plume', 'adult female', 'photographic camera', 'women', 'coiffure', 'television camera', 'coiffe', 'trim', 'curry', 'set', 'tv camera', 'dress', 'cut back', 'garb', 'habilitate', 'preen', 'woman', 'get dressed', 'attire', 'clothe', 'arrange', 'wearing apparel', 'tog', 'dress up', 'dresses', 'cleaning lady', 'cleaning woman', 'apparel', 'camera', 'decorate', 'primp', 'frock', 'garnish', 'garment', 'snip', 'womanhood', 'enclothe', 'coif', 'fair sex', 'clip', 'do', 'lop', 'prune', 'groom', 'fit out', 'line up', 'charwoman', 'raiment', 'clothes', 'char', 'dress out']
Caption for 2.webp: a pair of white shoes on a blue and pink background
Tags for 2.webp: ['wild blue yonder', 'juicy', 'bluing', 'pinko', 'dispirited', 'shoe', 'blueing', 'twain', 'twosome', 'brace', 'background knowledge', 'mate', 'couplet', 'geminate', 'backcloth', 'blue air', 'duo', 'ping', 'screen background', 'risque',

In [11]:
query = "female"
top_images = retrieve_images(query)

In [12]:
top_images

[]