In [2]:
import os

# Read the Hugging Face access token from the environment rather than
# embedding it in the notebook: a token saved in a shared file is exposed to
# every reader and should be revoked and regenerated.
# HF_TOKEN is None when the environment variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

In [12]:
import clip as clip
import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import translators as ts
from transformers import AutoTokenizer, AutoModel
import numpy as np
import os

In [13]:
# Step 1: Load CLIP Model for Tagging
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# clip.load hands back the model together with the preprocessing callable
# that is applied to each image before it is passed to the model.
model, preprocess = clip.load("ViT-B/32", device=device)

In [24]:
# Step 2: Function to Generate Tags for Images
def generate_tags(image_folder, candidate_tags=None, threshold=0.5):
    """Tag every image in ``image_folder`` using CLIP zero-shot scoring.

    Each image is scored against ``candidate_tags``; the image-to-text logits
    are turned into probabilities with a softmax over the candidate tags, and
    every tag whose probability is strictly greater than ``threshold`` is kept.
    Because the softmax probabilities sum to 1, a threshold of 0.5 or more
    lets at most one tag through per image, and an image may end up with an
    empty tag list.

    Args:
        image_folder: Directory containing the images to tag.
        candidate_tags: Iterable of tag strings to score. Defaults to
            ``["woman", "cat", "sofa", "shoe"]``.
        threshold: Minimum (exclusive) softmax probability for a tag to be
            selected. Defaults to 0.5.

    Returns:
        dict mapping each readable image's file name to its list of selected
        tags. Sub-directories and files Pillow cannot open are skipped.

    Raises:
        ValueError: If ``candidate_tags`` is given but empty.
    """
    if candidate_tags is None:
        candidate_tags = ["woman", "cat", "sofa", "shoe"]
    text_tags = list(candidate_tags)
    if not text_tags:
        raise ValueError("candidate_tags must contain at least one tag")

    # Keep regular files only (a sub-directory would make Image.open fail) and
    # sort them so the result order does not depend on the OS listing order.
    image_files = sorted(
        name
        for name in os.listdir(image_folder)
        if os.path.isfile(os.path.join(image_folder, name))
    )

    tags_dict = {}
    if not image_files:
        return tags_dict

    # The tag prompts are the same for every image, so tokenize them once
    # instead of once per loop iteration.
    text_inputs = clip.tokenize(text_tags).to(device)

    for img_file in image_files:
        image_path = os.path.join(image_folder, img_file)
        try:
            # The context manager releases the file handle as soon as the
            # pixels have been converted into an independent RGB image.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert("RGB")
        except OSError as exc:
            # Pillow reports unreadable or non-image files with OSError
            # (UnidentifiedImageError is a subclass); skip them, but say so.
            print(f"Skipping {img_file!r}: not a readable image ({exc})")
            continue

        image_tensor = preprocess(image).unsqueeze(0).to(device)

        # Detect objects using CLIP (or substitute with YOLO/Detectron if needed)
        with torch.no_grad():
            logits_per_image, _ = model(image_tensor, text_inputs)
            probs = logits_per_image.softmax(dim=-1).cpu().numpy()[0]

        # Filter tags based on the probability threshold.
        tags_dict[img_file] = [
            tag for tag, prob in zip(text_tags, probs) if prob > threshold
        ]

    return tags_dict

In [25]:
# Step 3: Translate Hinglish Query to English
def translate_query(hindi_text):
    """Translate ``hindi_text`` into English with the ``translators`` package.

    The request goes through the "google" backend with the source language
    fixed to Hindi ("hi") and the target language set to English ("en").
    The value returned by ``ts.translate_text`` is passed back unchanged.

    NOTE(review): the heading mentions Hinglish (romanized) input while the
    source language is declared as "hi" -- confirm this is intended.
    """
    translation_options = {
        "translator": "google",
        "from_language": "hi",
        "to_language": "en",
    }
    return ts.translate_text(hindi_text, **translation_options)

In [26]:
# Step 4: Embedding Model for Multilingual Similarity (LaBSE or Similar)
# Tokenizer and encoder come from the same checkpoint; the encoder is moved
# onto the compute device in a separate step after loading.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/LaBSE")
embedding_model = AutoModel.from_pretrained("sentence-transformers/LaBSE")
embedding_model = embedding_model.to(device)

def get_embedding(text):
    """Embed ``text`` with the module-level LaBSE tokenizer and model.

    The text is tokenized into PyTorch tensors, moved to ``device`` and run
    through ``embedding_model`` with gradient tracking disabled.

    Returns:
        The model's ``pooler_output`` copied to the CPU as a NumPy array.
    """
    encoded = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        model_output = embedding_model(**encoded)
        return model_output.pooler_output.cpu().numpy()

In [27]:
# Step 5: Query Processing and Retrieval
def retrieve_images(query, tags_dict, top_k=1):
    """Rank tagged images against a query and return the best matches.

    The query is translated with ``translate_query`` and embedded with
    ``get_embedding``. Each image is scored by the highest cosine similarity
    between the query embedding and the embeddings of that image's tags.

    Args:
        query: Query text to translate and embed.
        tags_dict: Mapping of image file name to a list of tag strings.
        top_k: Maximum number of image names to return. Defaults to 1.

    Returns:
        List of at most ``top_k`` image file names, best match first. Images
        with no tags are never returned; the list is empty when no image has
        tags or when ``top_k`` is 0.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    # An image with an empty tag list cannot be scored: np.vstack([]) raises
    # ValueError, so such images are left out of the ranking.
    tagged_images = {img: tags for img, tags in tags_dict.items() if tags}
    if not tagged_images or top_k == 0:
        # Nothing to rank, so skip the translation request entirely.
        return []

    # Translate and embed query
    translated_query = translate_query(query)
    query_embedding = get_embedding(translated_query)

    # The same tag usually appears on many images; embed each distinct tag
    # only once and reuse the result.
    tag_embedding_cache = {}

    # Calculate cosine similarity between query and tags
    results = {}
    for img_file, tags in tagged_images.items():
        for tag in tags:
            if tag not in tag_embedding_cache:
                tag_embedding_cache[tag] = get_embedding(tag)
        tag_embeddings = np.vstack([tag_embedding_cache[tag] for tag in tags])
        similarity_scores = cosine_similarity(query_embedding, tag_embeddings).flatten()
        results[img_file] = float(similarity_scores.max())

    # Sort results by highest similarity and keep the best top_k images
    sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
    return [img for img, _ in sorted_results[:top_k]]

In [28]:
# Example Usage
# Raw string literal: the Windows path contains backslash pairs such as \S,
# \C, \P and \i that are invalid escape sequences in a normal string. Python
# warns about them today and plans to reject them in a future release; the
# raw form yields exactly the same path value.
image_folder = r"D:\SNU\Semester VII\CSD358 Information Retrieval\Project\images"
tags_dict = generate_tags(image_folder)
query = "ladki ka photo"
top_images = retrieve_images(query, tags_dict)
print("Top retrieved images:", top_images)

Top retrieved images: ['2.webp']


In [29]:
# Second example: an English query run against the tags computed earlier.
query = "shoes"
top_images = retrieve_images(query=query, tags_dict=tags_dict)
print("Top retrieved images:", top_images)


Top retrieved images: ['2.webp']
