# 🖼️ AI-Powered Image Retrieval System

This notebook demonstrates the complete image retrieval pipeline using:
- **Ministral-3:3b** for image description (Vision model)
- **EmbeddingGemma** for vector embeddings (768 dimensions)
- **Cosine similarity** for semantic search

## Step 1: Import Required Libraries

In [None]:
import ollama
import numpy as np
import pandas as pd
import os
import glob
import time

# Reaching this line means every import above succeeded; a missing package
# would have raised ImportError before this message is printed.
print("‚úÖ All libraries imported successfully!")

## Step 2: Configure Image Directory

Update `IMAGE_FOLDER` to point to your frames directory containing .jpg images.

In [None]:
# Configure your image directory
IMAGE_FOLDER = "/Users/sayemaraf/Desktop/BIG Data projects/Image_Retrieval_System/frames"
# Pickle file in which the index DataFrame is persisted (relative to the
# current working directory).
MODEL_FILE = "model.pkl"

# Discover all images. glob returns matches in arbitrary, filesystem-dependent
# order, so sort them to make the preview below and the indexing order
# reproducible from run to run.
image_paths = sorted(glob.glob(os.path.join(IMAGE_FOLDER, "*.jpg")))
print(f"üîé Found {len(image_paths)} images:")
for img in image_paths[:5]:  # Show first 5
    print(f"   - {os.path.basename(img)}")

## Step 3: Build the Image Index

This cell processes all images and creates vector embeddings.

In [None]:
def run_indexing_pipeline():
    """Index every not-yet-indexed image and persist the updated index.

    For each path in the module-level ``image_paths`` that is not already in
    the index, the vision model is asked for a short description, the
    description is embedded, and a (filename, description, embedding) record
    is collected. New records are appended to the DataFrame stored in the
    module-level ``MODEL_FILE`` and the file is rewritten once at the end.

    A failure on a single image is printed and that image is skipped; it does
    not abort the run.

    Returns:
        pandas.DataFrame: the index with columns ``filename``,
        ``description`` and ``embedding`` (previously indexed rows plus any
        rows added by this call).
    """

    # Load existing model or create new
    if os.path.exists(MODEL_FILE):
        # NOTE(review): unpickling can execute arbitrary code; only load a
        # MODEL_FILE that this notebook created.
        df = pd.read_pickle(MODEL_FILE)
        print(f"üìÇ Loaded existing model with {len(df)} images")
    else:
        df = pd.DataFrame(columns=['filename', 'description', 'embedding'])
        print("üìÇ Creating new model")

    # Build the lookup once: a set gives O(1) membership tests instead of
    # rescanning the whole filename column for every candidate image.
    already_indexed = set(df['filename'])

    new_entries = []

    for path in image_paths:
        # Skip already indexed images
        if path in already_indexed:
            continue

        print(f"\nüì∏ Processing: {os.path.basename(path)}...", end=" ")
        start_time = time.time()

        try:
            # Step A: Get image description using Vision model
            response = ollama.chat(
                model='ministral-3:3b',
                messages=[{
                    'role': 'user',
                    'content': 'Describe this image in 5 words.',
                    'images': [path]
                }]
            )
            description = response['message']['content'].strip()

            # Step B: Generate vector embedding (of the description text,
            # not of the image pixels)
            embedding_response = ollama.embed(
                model='embeddinggemma:latest',
                input=description
            )
            vector = np.array(embedding_response['embeddings'][0])

            elapsed = time.time() - start_time
            print(f"‚úÖ Done ({elapsed:.2f}s)")
            print(f"   Description: {description}")

            new_entries.append({
                'filename': path,
                'description': description,
                'embedding': vector
            })

        except Exception as e:
            # Best effort: report the failure and carry on with the next image.
            print(f"‚ùå Error: {e}")

    # Save updated model
    if new_entries:
        df = pd.concat([df, pd.DataFrame(new_entries)], ignore_index=True)
        df.to_pickle(MODEL_FILE)
        print(f"\nüíæ Saved {len(new_entries)} new images. Total: {len(df)} images")
    else:
        print("\n‚úÖ All images already indexed!")

    return df

# Run the indexing now; the returned DataFrame is kept in df_model so the
# inspection cell below can read it.
df_model = run_indexing_pipeline()

## Step 4: Inspect Vector Embeddings

Check the quality and dimensions of your embeddings.

In [None]:
# Report on the index held in df_model: size, columns, one sample entry with
# basic statistics of its vector, and a numbered list of every indexed image.
if len(df_model) == 0:
    print("‚ö†Ô∏è No images indexed yet!")
else:
    banner = "=" * 60
    print(banner)
    print("VECTOR EMBEDDINGS INSPECTOR")
    print(banner)
    print(f"\nüìÅ Total Images Indexed: {len(df_model)}")
    print(f"üìã Columns: {list(df_model.columns)}")

    # The first row serves as the sample entry.
    first_row = df_model.iloc[0]
    vector = np.array(first_row['embedding'])

    print("\nüîç Sample Entry:")
    print(f"   Filename: {os.path.basename(first_row['filename'])}")
    print(f"   Description: {first_row['description']}")
    print(f"   Vector Dimensions: {len(vector)}")
    print(f"   Vector Type: {vector.dtype}")

    print("\nüìä Vector Statistics:")
    for label, stat in (("Min", np.min), ("Max", np.max), ("Mean", np.mean), ("Std", np.std)):
        print(f"   {label}: {stat(vector):.6f}")

    print("\nüìÇ All Indexed Images:")
    for idx, row in df_model.iterrows():
        # Index labels are shown 1-based.
        image_name = os.path.basename(row['filename'])
        print(f"   {idx+1}. {image_name} - {row['description']}")

## Step 5: Search for Images

Use natural language queries to find similar images.

In [None]:
def search_images(query, top_k=3):
    """Rank indexed images by cosine similarity to a text query.

    The query is embedded with the same embedding model used at indexing
    time and compared against the stored description embedding of every
    image in the module-level ``MODEL_FILE``. The best matches are printed.

    Args:
        query: Natural-language search text.
        top_k: Maximum number of matches to print and return.

    Returns:
        pandas.DataFrame | None: up to ``top_k`` rows of the index, sorted by
        descending ``score`` (the added cosine-similarity column), or ``None``
        when ``MODEL_FILE`` does not exist.
    """

    # Load model if not already loaded
    if not os.path.exists(MODEL_FILE):
        print("‚ùå Model file not found. Run indexing first!")
        return None

    # NOTE(review): unpickling can execute arbitrary code; only load a
    # MODEL_FILE that this notebook created.
    df = pd.read_pickle(MODEL_FILE)
    print(f"\nüîç Searching for: '{query}'")
    print(f"üìö Database: {len(df)} images\n")

    # Generate query embedding
    query_response = ollama.embed(
        model='embeddinggemma:latest',
        input=query
    )
    query_vector = np.asarray(query_response['embeddings'][0], dtype=float)
    # The query norm is the same for every row, so compute it once.
    query_norm = np.linalg.norm(query_vector)

    # Calculate cosine similarity
    def cosine_similarity(embedding):
        vec = np.asarray(embedding, dtype=float)
        denominator = np.linalg.norm(vec) * query_norm
        if denominator == 0:
            # A zero-length vector has no direction; score it 0.0 instead of
            # dividing by zero and polluting the ranking with NaN.
            return 0.0
        return float(np.dot(vec, query_vector) / denominator)

    df['score'] = df['embedding'].apply(cosine_similarity)

    # Get top results
    results = df.sort_values('score', ascending=False).head(top_k)

    print("="*60)
    print(f"TOP {top_k} MATCHES")
    print("="*60)

    for i, (idx, row) in enumerate(results.iterrows(), 1):
        print(f"\n{i}. FILE: {os.path.basename(row['filename'])}")
        print(f"   CONFIDENCE: {row['score']:.2%}")
        print(f"   DESCRIPTION: {row['description']}")
        print("-"*60)

    return results

# Example search: top_k is left at its default of 3. `results` holds the
# returned rows, or None if the model file has not been created yet.
search_query = "grizzly bear in nature"
results = search_images(search_query)

## Step 6: Interactive Search

Run custom searches with your own queries.