Step 3: Extract Key Frames and Generate CLIP Embeddings
    1. Here we process each scene-level video clip: we extract the middle frame as a key visual summary, pass it through the CLIP image encoder, and save the resulting embedding as a .npy file.
    2. These embeddings will be used later for semantic search using a text query.

Setup and configuration

In [3]:
import os
import cv2
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

# --- Paths ---------------------------------------------------------------
SCENE_DIR = 'video_data/scenes'  # Input: folder holding the short scene-level video clips
EMBEDDING_DIR = 'embeddings'     # Output: folder that receives the scene embeddings

# --- Compute device ------------------------------------------------------
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# The output folder must exist before any embedding is saved into it.
os.makedirs(EMBEDDING_DIR, exist_ok=True)

# --- CLIP model and processor (Hugging Face) -----------------------------
# CLIP maps images and texts into the same embedding space.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(DEVICE)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

Function: Extract the middle frame of a video

In [4]:
def extract_middle_frame(video_path):
    """
    Extract the middle frame from a video file.

    Args:
        video_path: Path of the video file to open with OpenCV.

    Returns:
        The frame produced by ``cv2.VideoCapture.read`` (a NumPy array),
        or None if the video can't be opened or the frame can't be read.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Could not open {video_path}")
            return None

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp to zero so an unknown/negative frame count never yields a
        # negative seek position; in that case we try the first frame.
        middle_frame_index = max(total_frames, 0) // 2
        cap.set(cv2.CAP_PROP_POS_FRAMES, middle_frame_index)
        ret, frame = cap.read()
    finally:
        # Always give the capture handle back, including on the early return
        # above and when any of the OpenCV calls raises.
        cap.release()

    if not ret:
        print(f"Failed to read middle frame from {video_path}")
        return None

    return frame

Function: Generate and return CLIP embedding

In [5]:
def generate_clip_embedding(video_path, model, processor):
    """
    Extract the middle frame of a video clip and return its CLIP image embedding.

    Args:
        video_path: Path of the scene clip to embed.
        model: CLIP model used to compute the image features.
        processor: CLIP processor that turns the frame into model inputs.

    Returns:
        The image embedding flattened to a 1D NumPy array, or None if no
        frame could be extracted from the clip.
    """
    frame = extract_middle_frame(video_path)
    if frame is None:
        return None

    # Convert BGR (OpenCV format) to RGB (PIL format)
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Prepare the image for CLIP and place the tensors on the device of the
    # model that was actually passed in, rather than on a module-level global
    # that may not match it.
    inputs = processor(images=image, return_tensors="pt").to(model.device)

    # Inference only: no gradients are needed for the embedding
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)

    # Move to host memory and flatten to a 1D array
    return embedding.cpu().numpy().flatten()

For each scene clip, extract the embedding and save it

In [6]:
# Collect the scene clips in a stable (sorted) order; the extension check is
# case-insensitive, so names like "clip.MP4" are picked up as well.
scene_files = sorted(
    f for f in os.listdir(SCENE_DIR)
    if f.lower().endswith('.mp4')
)

for scene_file in tqdm(scene_files, desc="🔎 Generating CLIP embeddings"):
    scene_path = os.path.join(SCENE_DIR, scene_file)
    embedding = generate_clip_embedding(scene_path, model, processor)

    # Clips whose frame could not be extracted are skipped
    if embedding is None:
        continue

    # Save the embedding as a .npy file with the same base name. splitext
    # strips only the final extension regardless of its case, unlike
    # str.replace('.mp4', ...), which misses ".MP4" and also rewrites any
    # ".mp4" occurring inside the name itself.
    base_name = os.path.splitext(scene_file)[0]
    out_path = os.path.join(EMBEDDING_DIR, base_name + '.npy')
    np.save(out_path, embedding)

🔎 Generating CLIP embeddings: 100%|██████████| 7/7 [00:02<00:00,  2.37it/s]
