# Step 5: Accepting Text Query and Searching FAISS Index using CLIP

This notebook allows you to enter a natural language query and find the top matching video scenes based on CLIP embeddings stored in a FAISS index.

In [34]:
import os
import json
import faiss
import torch
import numpy as np
from transformers import CLIPProcessor, CLIPModel

# Prevent OpenMP crashes on Windows
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


## Load FAISS Index and Metadata

In [35]:
# Load FAISS index
index = faiss.read_index("faiss_index.index")

# Load scene metadata
with open("scene_metadata.json", "r") as f:
    metadata = json.load(f)

print(f"Loaded FAISS index with {index.ntotal} scene embeddings.")

Loaded FAISS index with 7 scene embeddings.


## Accept User Query and Convert to CLIP Embedding

In [36]:

# Accept user query
user_query = input("Enter your search query: ")

# Convert text to CLIP embedding
inputs = clip_processor(text=[user_query], return_tensors="pt", padding=True).to(device)
with torch.no_grad():
    query_embedding = clip_model.get_text_features(**inputs)

# Normalize for FAISS search
query_np = query_embedding.cpu().numpy().astype("float32")
faiss.normalize_L2(query_np)

print(f"Query shape: {query_np.shape}")


Query shape: (1, 512)


## Perform Semantic Search

In [37]:
# Search the FAISS index
k = 10  # Top K results
D, I = index.search(query_np, k)

print(f"Top {k} scene IDs:", I[0])


Top 10 scene IDs: [ 3  0  2  4  5  1  6 -1 -1 -1]


## Show Top Matching Scenes

In [38]:
threshold = 0.1  # Minimum similarity score
print(f"\nQuery: '{user_query}'")
print(f"Matching scenes with score ≥ {threshold}:")
found = False

for i, idx in enumerate(I[0]):
    if idx == -1 or idx >= len(metadata):
        continue

    score = D[0][i]
    if score < threshold:
        continue

    data = metadata[idx]
    print(f"\n{i+1}. Video: {data.get('video_name', 'N/A')}")
    print(f"   Scene Path: {data.get('scene_path', 'N/A')}")
    #print(f"   Timestamp: {data.get('timestamp', 'unknown')} seconds")
    print(f"   Similarity Score: {score:.4f}")
    print("-" * 50)
    found = True

if not found:
    print("No matching scenes passed the filters.")


Query: 'bus'
Matching scenes with score ≥ 0.1:

1. Video: vecteezy_third-transportation-ring-of-moscow_28261175_clip_000.mp4
   Scene Path: video_data/scenes\vecteezy_third-transportation-ring-of-moscow_28261175_clip_000.mp4
   Similarity Score: 0.2046
--------------------------------------------------

2. Video: vecteezy_car-and-truck-traffic-on-the-highway-in-europe-poland_7957364_clip_000.mp4
   Scene Path: video_data/scenes\vecteezy_car-and-truck-traffic-on-the-highway-in-europe-poland_7957364_clip_000.mp4
   Similarity Score: 0.1976
--------------------------------------------------

3. Video: vecteezy_car-and-truck-traffic-on-the-highway-in-europe-poland_7957364_clip_002.mp4
   Scene Path: video_data/scenes\vecteezy_car-and-truck-traffic-on-the-highway-in-europe-poland_7957364_clip_002.mp4
   Similarity Score: 0.1974
--------------------------------------------------

4. Video: vecteezy_third-transportation-ring-of-moscow_28261175_clip_001.mp4
   Scene Path: video_data/scenes\ve