# Introduction
Searching for specific scenes in a movie can be time-consuming. By combining video processing, AI-generated captions, and embeddings, we can create a powerful tool that allows users to find scenes based on textual descriptions. This tool extracts key frames from a video, generates captions for each frame, computes embeddings, and allows users to search through the scenes using natural language queries.

## Installing Required Libraries
Install the necessary libraries

In [None]:
%pip install openai
%pip install opencv-python
%pip install scikit-learn
%pip install scenedetect
%pip install PySceneDetect
%pip install boto3
%pip install pandas
%pip install python-dotenv

# Import the necessary modules

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import boto3
from sklearn.metrics.pairwise import cosine_similarity
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
from IPython.display import Image, display
import concurrent.futures
from dotenv import load_dotenv

# Load the environment variables

In [None]:
# Load environment variables via python-dotenv so the os.getenv() lookups
# in the following cells can find the API keys and AWS credentials.
load_dotenv()

# Initializing OpenAI Client
Initialize the OpenAI client using your API key:

In [None]:
from openai import OpenAI
# Create the shared OpenAI client; the API key is read from the
# OPENAI_API_KEY environment variable (None is passed if it is unset).
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# AWS S3 Configuration
Set up your AWS S3 client:

In [None]:
# S3 target: replace these placeholders with your own bucket name and region
s3_bucket_name = 'your-bucket-name'
s3_region = 'your-aws-region'

# AWS credentials are read from the environment
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# Shared S3 client used by the download and upload helpers
s3 = boto3.client(
    's3',
    region_name=s3_region,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

Replace `'your-bucket-name'` and `'your-aws-region'` with your AWS S3 bucket name and region.

# Downloading Video from S3
We'll start by downloading the video file from AWS S3:

In [None]:
# Helper: fetch a single object from S3 onto the local disk
def download_video_from_s3(s3_bucket, s3_key, local_path):
    """Download an S3 object to a local file and print a confirmation.

    Args:
        s3_bucket: Name of the bucket that holds the object.
        s3_key: Key (path within the bucket) of the object.
        local_path: Destination path on the local filesystem.

    Uses the module-level ``s3`` client. Returns nothing.
    """
    s3.download_file(s3_bucket, s3_key, local_path)
    confirmation = f"Downloaded {s3_key} from S3 to {local_path}"
    print(confirmation)

Call the function with the appropriate parameters:

In [None]:
# Key of the video inside the bucket; update if yours is stored elsewhere
s3_video_key = 'movie.mp4'

# Destination on disk (current working directory)
video_local_path = 'movie.mp4'

# Pull the video down from S3
download_video_from_s3(
    s3_bucket=s3_bucket_name,
    s3_key=s3_video_key,
    local_path=video_local_path,
)

# Extracting Key Frames from Video
Next, we'll extract key frames from the video using PySceneDetect:

In [None]:
# Function to extract key frames from video
def extract_key_frames(video_path, output_dir):
    """Detect scene cuts in a video and save the first frame of each scene.

    Scene boundaries are found with PySceneDetect's ContentDetector
    (threshold 30.0). For every detected scene, the frame at the scene's
    start is read with OpenCV and written to ``output_dir`` as
    ``frame_<i>.jpg``, where ``i`` is the scene index.

    Args:
        video_path: Path to the video file.
        output_dir: Directory for the extracted JPEGs; created if missing.

    Returns:
        List of paths of the frames that were actually written, in scene
        order. An empty list is returned if scene detection fails, no
        scenes are found, or the video cannot be opened with OpenCV.
    """
    import os
    import cv2
    from scenedetect import VideoManager, SceneManager
    from scenedetect.detectors import ContentDetector

    video_manager = VideoManager([video_path])
    scene_manager = SceneManager()
    # ContentDetector detects fast cuts
    scene_manager.add_detector(ContentDetector(threshold=30.0))

    try:
        video_manager.start()
        scene_manager.detect_scenes(frame_source=video_manager)
    except Exception as e:
        print(f"Error during scene detection: {e}")
        return []
    else:
        scene_list = scene_manager.get_scene_list()
    finally:
        # Release on every path, including the early returns; the frames
        # themselves are read through a separate cv2.VideoCapture below.
        video_manager.release()

    if not scene_list:
        print("No scenes were detected in the video.")
        return []
    print(f"Detected {len(scene_list)} scenes in video.")

    os.makedirs(output_dir, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    frame_list = []
    try:
        if not cap.isOpened():
            print(f"Failed to open video file {video_path}")
            return []

        total_scenes = len(scene_list)
        for i, scene in enumerate(scene_list):
            print(f"Processing scene {i + 1}/{total_scenes}")
            # scene[0] is the scene's start timecode; seek to that frame
            start_frame = scene[0].get_frames()
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
            ret, frame = cap.read()
            if not ret:
                print(f"Failed to read frame at position {start_frame}")
                continue

            frame_filename = os.path.join(output_dir, f"frame_{i}.jpg")
            # Only report frames that really reached the disk, so later
            # steps never receive a path to a file that does not exist.
            if cv2.imwrite(frame_filename, frame):
                frame_list.append(frame_filename)
            else:
                print(f"Failed to write frame to {frame_filename}")
    finally:
        # Always free the capture handle, even if the loop raises
        cap.release()
    return frame_list

Extract the frames:

In [None]:
# Local directory that will receive the extracted frames
output_dir = 'frames'

# Run scene detection and save one key frame per scene
frame_list = extract_key_frames(video_path=video_local_path, output_dir=output_dir)

# Uploading Images to S3
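We'll define a helper that uploads an extracted frame to S3 and returns its URL. The helper does not change object permissions, so the bucket must already allow public reads for these URLs to be fetchable by the captioning step: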

In [None]:
# Helper: push one image to S3 and build the URL it will be served from
def upload_image_to_s3(image_path, s3_bucket, s3_folder='frames'):
    """Upload a local image to S3 and return its virtual-hosted-style URL.

    Args:
        image_path: Path of the local image file.
        s3_bucket: Destination bucket name.
        s3_folder: Key prefix; the object key is ``<s3_folder>/<file name>``.

    Returns:
        ``https://<bucket>.s3.<region>.amazonaws.com/<key>``, built from the
        module-level ``s3_region``. No ACL is set by this function, so the
        URL is only readable if the bucket already permits it.
    """
    s3_key = f"{s3_folder}/{os.path.basename(image_path)}"
    s3.upload_file(image_path, s3_bucket, s3_key)
    return f"https://{s3_bucket}.s3.{s3_region}.amazonaws.com/{s3_key}"

# Generating Captions with OpenAI
We'll generate a caption for each frame using OpenAI's `gpt-4o-mini` model:

In [None]:
# Helper: caption one image with OpenAI's gpt-4o-mini model
def generate_caption(image_url):
    """Ask gpt-4o-mini for a search-oriented caption of the image at a URL.

    Args:
        image_url: URL of the image, passed to the model as an
            ``image_url`` content part.

    Returns:
        The message content of the first completion choice.

    Uses the module-level ``client``.
    """
    system_prompt = '''
    You are an assistant that generates concise captions for images. These captions will be embedded and stored so 
    people can semantically search for scenes. Ensure your captions include:
    - Physical descriptions of people
    - Identify and name of key actors
    - Descriptions of key scene objects such as their color
    - The mood of the scene
    - The actions or activites taking place in the scene
    '''

    system_message = {"role": "system", "content": system_prompt}
    # The user turn carries only the image; all instructions live in the system prompt
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    user_message = {"role": "user", "content": [image_part]}

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        max_tokens=300,
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Getting Embeddings
Compute embeddings for the captions using OpenAI's embedding models:

In [None]:
# Helper: turn a piece of text into an embedding vector
def get_embedding(value, model="text-embedding-3-large"):
    """Return the embedding of ``value`` from the OpenAI embeddings endpoint.

    Args:
        value: Input to embed.
        model: Embedding model name.

    Returns:
        The ``embedding`` of the first item in the response data,
        requested in ``float`` encoding format.

    Uses the module-level ``client``.
    """
    response = client.embeddings.create(
        input=value,
        model=model,
        encoding_format="float",
    )
    first_item = response.data[0]
    return first_item.embedding

# Searching Frames Based on a Prompt
We'll search for frames that are most similar to a user's query:

In [None]:
# Function to search frames based on a prompt
def search_frames(prompt, df_frames, top_n=5):
    """Rank frames by cosine similarity between their embedding and a prompt.

    Args:
        prompt: Natural-language query; embedded with ``get_embedding``.
        df_frames: DataFrame with an ``embedding`` column holding one
            vector per row. A ``similarity`` column is added to (or
            overwritten on) this DataFrame in place.
        top_n: Maximum number of rows to return.

    Returns:
        The ``top_n`` rows of ``df_frames`` with the highest similarity,
        sorted in descending order. If ``df_frames`` has no rows, an empty
        result with a ``similarity`` column is returned and no embedding
        request is made.
    """
    # Nothing to rank (e.g. frame extraction produced no frames): skip the
    # API call and avoid a KeyError on a missing 'embedding' column.
    if len(df_frames) == 0:
        df_frames['similarity'] = pd.Series(dtype=float)
        return df_frames.head(top_n)

    query = np.asarray(get_embedding(prompt), dtype=float).reshape(1, -1)
    # Stack all frame embeddings into one (n_frames, dim) matrix so the
    # similarities are computed in a single call instead of one per row.
    frame_matrix = np.asarray(df_frames['embedding'].tolist(), dtype=float)
    df_frames['similarity'] = cosine_similarity(frame_matrix, query).ravel()
    return df_frames.sort_values('similarity', ascending=False).head(top_n)

# Putting It All Together
We'll process each frame, generate captions and embeddings, and store the data in a DataFrame:

In [None]:
# Helper: run the upload -> caption -> embedding pipeline for one frame
def process_frame(frame_filename):
    """Upload a frame, caption it, embed the caption, and bundle the results.

    Args:
        frame_filename: Local path of the frame image.

    Returns:
        Dict with keys ``frame_filename``, ``image_url``, ``caption`` and
        ``embedding``.

    The frame is uploaded to the module-level ``s3_bucket_name`` bucket.
    """
    url = upload_image_to_s3(frame_filename, s3_bucket_name)
    text = generate_caption(url)
    return {
        'frame_filename': frame_filename,
        'image_url': url,
        'caption': text,
        'embedding': get_embedding(text),
    }

# Fan the frames out over a pool of 10 threads; executor.map yields the
# results in the same order as frame_list.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    frame_data_list = list(executor.map(process_frame, frame_list))

# One row per frame: filename, URL, caption and embedding
df_frames = pd.DataFrame(frame_data_list)

# User Query and Displaying Results
Now, let's allow the user to search for scenes:

In [None]:
prompt = "Find images of Paul Walker, a blonde hair blue-eyed man, in a car."
results = search_frames(prompt, df_frames, top_n=5)

# Show every match, best first: file, caption, score, then the image itself
for _, match in results.iterrows():
    print(f"Frame: {match['frame_filename']}")
    print(f"Caption: {match['caption']}")
    print(f"Similarity: {match['similarity']}")
    display(Image(url=match['image_url']))