# Step 3: Generate CLIP Embeddings for Keyframes
This notebook loads the keyframes extracted in Step 2 and uses OpenAI's CLIP model to generate image embeddings, which will be used for semantic search later.

In [12]:
import json
import os

import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)


Using device: cpu


In [13]:
# Load the CLIP preprocessor and model from Hugging Face. The same checkpoint
# name is used for both, and the model is then moved onto the selected device.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_model = clip_model.to(device)


In [14]:
# Input locations: the scene clips and the keyframe images stored for them.
scene_dir = "video_data/scenes"
keyframe_dir = "video_data/scene_keyframes"

# Accumulators filled by the embedding loop: one vector and one metadata
# record per processed keyframe.
embeddings, metadata = [], []


In [15]:
# Process all keyframes and generate CLIP embeddings.
#
# For every ".mp4" clip in `scene_dir`, the keyframe with the same base name
# ("<name>.jpg" in `keyframe_dir`) is embedded with CLIP. Each L2-normalized
# vector is appended to `embeddings` and a matching record to `metadata`, so
# index i in one list always corresponds to index i in the other.

# Start from empty lists so re-running this cell does not append duplicates
# of scenes that were already processed in this kernel session.
embeddings = []
metadata = []

scene_files = sorted(f for f in os.listdir(scene_dir) if f.endswith(".mp4"))

for scene_file in tqdm(scene_files, desc="Processing scenes"):
    scene_path = os.path.join(scene_dir, scene_file)
    # Swap only the trailing extension; str.replace would also rewrite any
    # ".mp4" that happens to occur earlier in the file name.
    keyframe_name = os.path.splitext(scene_file)[0] + ".jpg"
    keyframe_path = os.path.join(keyframe_dir, keyframe_name)

    if not os.path.exists(keyframe_path):
        print(f"Keyframe not found for: {scene_file}")
        continue

    try:
        # convert() returns a fully loaded copy, so the underlying file can
        # be closed as soon as the context manager exits.
        with Image.open(keyframe_path) as keyframe:
            image = keyframe.convert("RGB")

        inputs = clip_processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():  # inference only; no gradients needed
            embedding = clip_model.get_image_features(**inputs)

        embedding = embedding.cpu().numpy().astype("float32")
        # Scale each row to unit L2 norm so a dot product between two
        # embeddings equals their cosine similarity.
        embedding /= np.linalg.norm(embedding, axis=1, keepdims=True)

        # A single image was passed in, so row 0 is its embedding.
        embeddings.append(embedding[0])
        metadata.append({
            "scene_path": scene_path,
            "video_name": scene_file,
            "timestamp": "unknown"
        })
    except Exception as e:
        # Best effort: report the failing scene and keep going with the rest.
        print(f"Error processing {scene_file}: {e}")


Processing scenes: 100%|██████████| 7/7 [00:02<00:00,  3.07it/s]


In [16]:
# Save embeddings and metadata for Step 4.
#
# Row i of embeddings.npy belongs to entry i of scene_metadata.json, so the
# two lists must have the same length before they are written.
assert len(embeddings) == len(metadata), "embeddings and metadata are out of sync"

# float32 is the dtype produced by the embedding loop; pinning it here keeps
# the saved array's dtype stable even when no keyframe was embedded.
np.save("embeddings.npy", np.asarray(embeddings, dtype="float32"))
with open("scene_metadata.json", "w") as f:
    json.dump(metadata, f, indent=2)

print(f"Saved {len(embeddings)} embeddings and metadata.")


Saved 7 embeddings and metadata.
