Building an AI Image Caption Recommendation System

In [None]:
# Dependencies

In [None]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
from typing import List, Tuple
import sys
import os


In [None]:
# Append the parent of the current working directory to the import search
# path, then import the recommender class from the local
# `caption_recommendation` module.
# NOTE(review): presumably caption_recommendation.py lives one level above the
# directory the kernel was started in - confirm.
sys.path.append(os.path.dirname(os.path.abspath('.')))
from caption_recommendation import ImageCaptionRecommendationSystem


Function for loading and preprocessing an image

In [None]:
def load_and_preprocess_image(
    image_path: str,
    model_name: str = "openai/clip-vit-base-patch32",
) -> Tuple[dict, CLIPProcessor]:
    """Load an image from disk and preprocess it for a CLIP model.

    Args:
        image_path: Path of the image file to open.
        model_name: Checkpoint identifier passed to
            ``CLIPProcessor.from_pretrained``. It should match the checkpoint
            used for the model that will consume the returned inputs.

    Returns:
        A ``(inputs, processor)`` pair: the processor output for the image
        (requested as PyTorch tensors via ``return_tensors="pt"``) and the
        processor itself, so callers can reuse it to encode text.

    Raises:
        Any error raised by ``Image.open`` or ``from_pretrained`` propagates
        unchanged (nothing is caught here).
    """
    # Image.open is lazy and may keep the file handle open; the context
    # manager guarantees it is released. convert() returns a separate
    # in-memory image, so it stays usable after the file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')

    processor = CLIPProcessor.from_pretrained(model_name)
    inputs = processor(images=image, return_tensors="pt")
    return inputs, processor


Generating Image Embeddings

In [None]:
def generate_image_embeddings(
    inputs: dict,
    model_name: str = "openai/clip-vit-base-patch32",
) -> Tuple[torch.Tensor, CLIPModel]:
    """Compute CLIP image features for already-preprocessed inputs.

    Args:
        inputs: Preprocessed image inputs; unpacked as keyword arguments into
            ``model.get_image_features``.
        model_name: Checkpoint identifier passed to
            ``CLIPModel.from_pretrained``. It should match the checkpoint of
            the processor that produced ``inputs``.

    Returns:
        A ``(image_features, model)`` pair: the image features and the loaded
        model, so callers can reuse it to encode text.
    """
    model = CLIPModel.from_pretrained(model_name)
    # Inference only: switch to eval mode and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    return image_features, model


Function to rank candidate captions against the image

In [None]:


def rank_captions(
    image_features: torch.Tensor, 
    captions: List[str], 
    model: CLIPModel, 
    processor: CLIPProcessor
) -> Tuple[List[str], List[float]]:
    """Rank captions by cosine similarity to an image embedding.

    Args:
        image_features: Image embedding to compare against.
            NOTE(review): the ``squeeze(0)`` below assumes a single image,
            i.e. a leading dimension of size 1 - confirm with callers.
        captions: Candidate caption strings.
        model: Model exposing ``get_text_features``.
        processor: Processor used to tokenize ``captions``.

    Returns:
        ``(sorted_captions, sorted_scores)``: the captions ordered from most
        to least similar, and their similarity scores as Python floats in the
        same order. Both lists are empty when ``captions`` is empty.
    """
    # Nothing to rank; also avoids handing an empty batch to the processor.
    if not captions:
        return [], []

    # Tokenize all captions as one padded/truncated batch.
    text_inputs = processor(
        text=captions, 
        return_tensors="pt", 
        padding=True, 
        truncation=True
    )

    # Encode the captions without tracking gradients.
    with torch.no_grad():
        text_features = model.get_text_features(**text_inputs)

    # Scale both sides to unit L2 norm so the dot product below is the
    # cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # One score per caption after dropping the leading image dimension.
    similarity_scores = torch.matmul(image_features, text_features.T).squeeze(0)

    # Best match first. Indices are converted to plain ints before they are
    # used to index the Python list of captions.
    sorted_indices = torch.argsort(similarity_scores, descending=True)
    sorted_captions = [captions[i] for i in sorted_indices.tolist()]
    sorted_scores = similarity_scores[sorted_indices].tolist()

    return sorted_captions, sorted_scores


driver function


In [None]:

def get_top_captions(
    image_path: str, 
    candidate_captions: List[str], 
    top_n: int = 5
) -> Tuple[List[str], List[float]]:
    """Get the top-n captions for an image from a candidate list.

    Uses the helpers defined in this notebook (``load_and_preprocess_image``,
    ``generate_image_embeddings``, ``rank_captions``) so it runs in a plain
    kernel without depending on a Streamlit session object.

    Args:
        image_path: Path of the image to caption.
        candidate_captions: Captions to rank against the image.
        top_n: Maximum number of captions to return.

    Returns:
        ``(captions, scores)``: up to ``top_n`` captions ordered best-first
        and their similarity scores in the same order. Both lists are empty
        when there are no candidates or ``top_n`` is not positive.
    """
    # Nothing to rank: return early without loading the image or the model.
    if not candidate_captions:
        return [], []

    inputs, processor = load_and_preprocess_image(image_path)
    image_features, model = generate_image_embeddings(inputs)
    ranked_captions, ranked_scores = rank_captions(
        image_features, candidate_captions, model, processor
    )

    # Clamp to [0, number of ranked captions] so a negative top_n cannot
    # turn the slice into "everything except the last k".
    limit = max(0, min(top_n, len(ranked_captions)))
    return ranked_captions[:limit], ranked_scores[:limit]

In [None]:

def generate_and_rank_captions(
    image_path: str, 
    keywords: List[str], 
    top_n: int = 5, 
    num_candidates: int = 10
) -> Tuple[List[str], List[float]]:
    """Build candidate captions from keywords and return the best image matches.

    Args:
        image_path: Path of the image the captions are scored against.
        keywords: Keywords handed to the recommender's ``generate_captions``.
        top_n: Maximum number of captions to return.
        num_candidates: Second argument passed to ``generate_captions``
            (presumably the number of candidates to produce - confirm).

    Returns:
        ``(captions, scores)``: the first ``min(top_n, available)`` entries of
        the ranked captions and scores, or two empty lists when the
        recommender yields no candidates.
    """
    # Ask a fresh recommender instance for keyword-based candidates.
    generated = ImageCaptionRecommendationSystem().generate_captions(
        keywords, num_candidates
    )
    if not generated:
        return [], []

    # Image -> CLIP inputs -> image embedding.
    clip_inputs, clip_processor = load_and_preprocess_image(image_path)
    embedding, clip_model = generate_image_embeddings(clip_inputs)

    # Order the candidates from most to least similar to the image.
    ranked, scores = rank_captions(embedding, generated, clip_model, clip_processor)

    # Never ask for more entries than were ranked.
    keep = min(top_n, len(ranked))
    return ranked[:keep], scores[:keep]


Below is an alternative way

from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): `image_captioning` and `candidate_captions` are not defined in
# this notebook; the snippet assumes `image_captioning` returns
# (captions, similarities) already ordered best-first - confirm before running.
best_match_captions, similarities = image_captioning("path", candidate_captions)

# Keep at most five results, or fewer when fewer captions came back.
top_n = min(5, len(best_match_captions))
top_best_captions = best_match_captions[:top_n]
top_similarities = similarities[:top_n]

print("Most suitable captions")
for i, (caption, similarity) in enumerate(zip(top_best_captions, top_similarities), start=1):
    print(f"{i}.{caption} (Similarity:{similarity:.4f})")
