Project : Build your own intelligent search engine (LLM, google genai, langchain)

Images - Animals

In [2]:
!pip install --upgrade --quiet langchain-google-genai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.3/53.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m713.3/713.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m490.2/490.2 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m234.9/234.9 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires google-auth==2.43.0, but you have google-auth 2.47.0 which is incompatible.[0m[31m
[0m

In [3]:
import langchain_google_genai
print("langchain_google_genai imported successfully, confirming installation.")

langchain_google_genai imported successfully, confirming installation.


In [4]:
# check if google api key is added
import os
from google.colab import userdata

# Access the API key from Colab Secrets
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

print("Google API Key loaded successfully from Colab secrets.")

Google API Key loaded successfully from Colab secrets.


In [5]:
#validate model
import os
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI

# Access the API key from Colab Secrets
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

# Initialize the model
gemini_client = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

# Now you can use the llm to interact with the model
response = gemini_client.invoke(""" What is the capital of germany""")
print(response.content)

The capital of Germany is **Berlin**.


In [6]:
#prompt
image_description_prompt = """
**Role:**
You are an expert Computer Vision Metadata Specialist and SEO Archivist. You possess a deep understanding of visual semantics, image indexing, and natural language retrieval. Your capability involves translating visual data into precise, machine-readable text optimized for search algorithms.

**Context:**
We are building a high-performance multimodal search engine. The metadata you generate will be used to populate a vector database, allowing users to find images using both specific keyword matching and semantic natural language queries (e.g., "a quiet place to work with coffee"). High accuracy and descriptive depth are required to distinguish between similar images.

**Task:**
Analyze the provided image and generate a structured JSON object containing detailed metadata. You must extract the main subject, describe the scene atmospherics, identify visual elements, transcribe specific text, and generate search-optimized keywords.

**Constraints:**

1.  **Description:** Must be objective and factual. Limit to 2-3 sentences. Focus on action, lighting, and mood.
2.  **Text Content:** If no text is visible, return `null`. If text is visible, transcribe it exactly as it appears, preserving case where possible.
3.  **Keywords:** Provide exactly 10-15 keywords. Mix broad concepts (e.g., "urban," "melancholy") with specific objects (e.g., "red umbrella," "taxi").
4.  **Safety:** Do not generate content that violates safety policies. If the image is unclear or blurry, note this in the description.
5.  **Format:** The output must be raw JSON only. Do not include markdown formatting (like ` json ...  `) or conversational filler.

**Examples:**

*Input:* [An image of a golden retriever catching a frisbee in a park on a sunny day]

*Output:*

```json
{
  "main_subject": "Golden Retriever dog",
  "detailed_description": "A golden retriever leaps into the air to catch a red frisbee against a clear blue sky. The lighting is bright and natural, casting sharp shadows on the green grass below. The mood is energetic and playful.",
  "visual_elements": [
    "Golden Retriever",
    "Red Frisbee",
    "Green Grass",
    "Blue Sky",
    "Motion Blur"
  ],
  "text_content": null,
  "search_keywords": [
    "dog",
    "pet",
    "playing",
    "fetch",
    "frisbee",
    "park",
    "sunny",
    "action shot",
    "canine",
    "jumping",
    "outdoor",
    "joyful",
    "golden retriever",
    "summer"
  ]
}
```

**Output Format:**
Generate the response in the following JSON structure:

```json
{
  "main_subject": "String",
  "detailed_description": "String",
  "visual_elements": ["String", "String"],
  "text_content": "String or null",
  "search_keywords": ["String", "String"]
}
```
"""

In [7]:
import os
import base64
import mimetypes
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

# 1. Setup
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

# Initialize the model
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2
)

# 2. Helper function to encode image to Base64
def encode_image(image_path):
    """Encodes a local image to base64 string."""
    # https://en.wikipedia.org/wiki/Base64
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# 3. The Metadata Function
def generate_search_metadata(image_path, llm, prompt_text):
    """
    Generates a detailed description using LangChain and Gemini.
    """
    try:
        # Determine mime type (e.g., 'image/jpeg', 'image/png')
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type:
            mime_type = 'image/jpeg' # Default fallback

        # Get Base64 string
        img_base64 = encode_image(image_path)

        # Create the Multimodal Message
        # LangChain expects a list of dictionaries for multimodal content
        message = HumanMessage(
            content=[
                {"type": "text", "text": prompt_text},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{img_base64}"}
                }
            ]
        )

        # Invoke the LLM
        response = llm.invoke([message])

        return response.content.replace('```json', '').replace('```', '')

    except FileNotFoundError:
        print(f"Error: The file at {image_path} was not found.")
        return None
    except Exception as e:
        print(f"An error occurred during generation: {e}")
        return None

In [8]:
from google.colab import drive # use this code to connect to the drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
#generate meta data
image_path = "/content/drive/MyDrive/Images/pets/animals/animals/antelope/1.jpg"

# Run the function
metadata = generate_search_metadata(image_path, llm, image_description_prompt)

if metadata:
    print(metadata)

An error occurred during generation: Error calling model 'gemini-2.5-flash' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 20, model: gemini-2.5-flash\nPlease retry in 22.256408823s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerMod

In [None]:
import os
import json
import time # Import time module

source_folder = "/content/drive/MyDrive/Images/"
output_folder = "/content/drive/MyDrive/jsons/"

os.makedirs(output_folder, exist_ok=True)

# Supported image extensions to filter for
valid_extensions = ('.jpg', '.jpeg', '.png', '.webp', '.heic')

print(f"Starting processing for images in: {source_folder}\n")

# Collect all image files, including those in subdirectories
all_image_files = []
for root, dirs, files in os.walk(source_folder):
    for file in files:
        if file.lower().endswith(valid_extensions):
            all_image_files.append(os.path.join(root, file))

total_files = len(all_image_files)
processed_count = 0

# Define a delay to respect API rate limits
API_DELAY_SECONDS = 3 # Approximately 20 requests per minute for gemini-2.5-flash

for image_path in all_image_files:
    # Get the relative path from source_folder to maintain directory structure in output
    relative_path = os.path.relpath(image_path, source_folder)

    # Define the output JSON path (same name as image, but .json extension)
    # Ensure output directory structure mirrors input structure
    json_sub_dir = os.path.join(output_folder, os.path.dirname(relative_path))
    os.makedirs(json_sub_dir, exist_ok=True)

    json_filename = f"{os.path.splitext(os.path.basename(image_path))[0]}.json"
    json_path = os.path.join(json_sub_dir, json_filename)

    if os.path.exists(json_path):
        print(f"[{processed_count + 1}/{total_files}] Skipping: {relative_path} (JSON already exists)")
        processed_count += 1
        continue

    print(f"[{processed_count + 1}/{total_files}] Generating metadata for: {relative_path}...")

    try:
        # Call your existing function
        description = generate_search_metadata(image_path, llm, image_description_prompt)

        # Ensure description is parsed to dict if returned as string
        if isinstance(description, str):
            description = json.loads(description)

        if description:
            # Prepare the data structure
            data = {
                "file_path": image_path, # Store the absolute image path
                "description": description
            }

            # Write to JSON file
            with open(json_path, 'w', encoding='utf-8') as json_file:
                json.dump(data, json_file, indent=4)

            print(f"   -> Saved to {os.path.relpath(json_path, output_folder)}")
        else:
            print(f"   -> Skipped {relative_path} (Generation returned empty)")

    except Exception as e:
        print(f"   -> Error processing {relative_path}: {e}")

    processed_count += 1
    # Add a delay to respect API rate limits
    time.sleep(API_DELAY_SECONDS)

print("\n--- Processing Complete ---")

Starting processing for images in: /content/drive/MyDrive/Images/

[1/5400] Skipping: pets/animals/animals/okapi/17c60106d1.jpg (JSON already exists)
[2/5400] Skipping: pets/animals/animals/okapi/74cfa2b9a7.jpg (JSON already exists)
[3/5400] Skipping: pets/animals/animals/okapi/1efedee0ea.jpg (JSON already exists)
[4/5400] Skipping: pets/animals/animals/okapi/151c90d3e9.jpg (JSON already exists)
[5/5400] Skipping: pets/animals/animals/okapi/2f31bf11d7.jpg (JSON already exists)
[6/5400] Generating metadata for: pets/animals/animals/okapi/216ddb19f5.jpg...
An error occurred during generation: Error calling model 'gemini-2.5-flash' (RESOURCE_EXHAUSTED): 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/rate-limit. \n* Quota exceeded for metric: generative

KeyboardInterrupt: 

Embedding

In [None]:
import os
import json
import time
import shutil # Import shutil for copying directories
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from google.colab import userdata

# 1. Setup
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
json_folder = "/content/drive/MyDrive/jsons"

# --- MODIFICATION: Use a local path for the database during creation ---
local_db_path = "/content/chroma_db"
persistent_db_path = "/content/drive/MyDrive/chroma_db"

# Initialize Embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# 2. Check if DB exists before doing heavy lifting
# Forcing recreation of the DB to ensure all documents are processed
# This ensures that if new JSONs were generated or old ones updated, they are included.

# Remove the existing local database to force a full re-ingestion
if os.path.exists(local_db_path):
    shutil.rmtree(local_db_path)
    print(f"Existing local database at {local_db_path} removed to force re-ingestion.")

print("Database not found or removed. Starting fresh ingestion process...")

# Load ALL JSONs into a list of Documents first (No API calls here)
documents = []

if os.path.exists(json_folder):
    json_files_found = []
    for root, dirs, files in os.walk(json_folder):
        for file in files:
            if file.lower().endswith('.json'):
                json_files_found.append(os.path.join(root, file))

    print(f"Found {len(json_files_found)} JSON files. Preparing data...")

    for file_path in json_files_found:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            desc_obj = data.get("description", {})

            # Construct Context
            text_content = f"""
            Subject: {desc_obj.get('main_subject', 'Unknown')}
            Description: {desc_obj.get('detailed_description', '')}
            Keywords: {', '.join(desc_obj.get('search_keywords', []))}
            Visual Elements: {', '.join(desc_obj.get('visual_elements', []))}
            """

            doc = Document(
                page_content=text_content,
                metadata={
                    "file_name": os.path.basename(file_path),
                    "file_path": data.get("file_path", "")
                }
            )
            documents.append(doc)
else:
    print("JSON folder not found.")

# 3. Add to Chroma in Batches with Timeout
if documents:
    print(f"Loaded {len(documents)} documents. Starting ingestion with rate limits...")

    # Initialize the DB using the local path
    vectorstore = Chroma(
        persist_directory=local_db_path,
        embedding_function=embeddings
    )

    BATCH_SIZE = 5
    total_batches = (len(documents) + BATCH_SIZE - 1) // BATCH_SIZE

    for i in range(0, len(documents), BATCH_SIZE):
        batch = documents[i : i + BATCH_SIZE]
        current_batch_num = (i // BATCH_SIZE) + 1

        print(f"Processing Batch {current_batch_num}/{total_batches} ({len(batch)} docs)...")

        # A. Add documents (This triggers the API Call)
        vectorstore.add_documents(batch)

        # B. Wait if this is not the last batch
        if i + BATCH_SIZE < len(documents):
            print("   -> Waiting 60 seconds to respect rate limits...")
            time.sleep(60)

    print(f"\nSuccess! All documents saved locally to: {local_db_path}")

    # --- MODIFICATION: Copy the local database to Google Drive ---
    if os.path.exists(persistent_db_path):
        shutil.rmtree(persistent_db_path)
        print(f"Existing persistent database at {persistent_db_path} removed before copy.")
    shutil.copytree(local_db_path, persistent_db_path)
    print(f"Successfully copied database to persistent location: {persistent_db_path}")

else:
    print("No documents to process.")

Database not found or removed. Starting fresh ingestion process...
Found 8 JSON files. Preparing data...
Loaded 8 documents. Starting ingestion with rate limits...
Processing Batch 1/2 (5 docs)...
   -> Waiting 60 seconds to respect rate limits...


KeyboardInterrupt: 

In [None]:
import os
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from google.colab import userdata
from IPython.display import display, Image


db_path = "/content/drive/MyDrive/chroma_db"

# IMPORTANT: Must use the exact same model name you used to create the DB
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# Load the Database
# We point to the persist_directory to load the existing index
vectorstore = Chroma(
    persist_directory=db_path,
    embedding_function=embeddings
)

print("Database loaded successfully!")

def search_images(query, top_k=3):
    """
    Searches the vector DB for images matching the text query.
    """

    # Perform Similarity Search
    # This embeds the query and finds the 'top_k' closest vectors
    results = vectorstore.similarity_search(query, k=top_k)

    if not results:
        print("No matches found.")
        return

    # Display Results
    for i, doc in enumerate(results):
        file_path = doc.metadata.get("file_path")
        subject = doc.metadata.get("main_subject")
        file_name = doc.metadata.get("file_name")

        print(f"Match #{i+1}")
        print(f"File: {file_name}")
        print(f"Context: {doc.page_content[:100]}...") # Show first 100 chars of description

        # Display the actual image
        if os.path.exists(file_path):
            display(Image(filename=file_path, width=300))
        else:
            print(f"Image file not found at: {file_path}")

        print("-" * 40)

Database loaded successfully!


In [None]:
search_images("")

Output hidden; open in https://colab.research.google.com to view.