In [1]:
import os
import csv
import torch
import numpy as np
import av
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor
from huggingface_hub import hf_hub_download
from tqdm.notebook import tqdm
import pandas as pd

def read_video_pyav(container, indices):
    """
    Decode selected frames of a video with the PyAV decoder.

    The video stream is decoded sequentially from the beginning and decoding
    stops as soon as the largest requested index has been passed. The result
    contains exactly one frame per element of ``indices``, in the same order,
    so repeated indices simply repeat the frame. A requested index that could
    not be decoded (e.g. the stream holds fewer frames than requested) is
    filled with the highest-indexed requested frame that was decoded.

    Args:
        container (av.container.input.InputContainer): PyAV container.
        indices (List[int]): List of frame indices to decode.
    Returns:
        np.ndarray: Decoded frames of shape (num_frames, height, width, 3).
    Raises:
        ValueError: If ``indices`` is empty, or if none of the requested
            frames could be decoded.
    """
    if len(indices) == 0:
        raise ValueError("No frame indices were requested.")

    # Set membership keeps the per-frame check O(1) and tolerates duplicates.
    wanted = set(indices)
    last_wanted = max(wanted)

    decoded = {}
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_wanted:
            break
        if i in wanted:
            decoded[i] = frame

    if not decoded:
        # Previously this surfaced as an opaque IndexError on frames[-1].
        raise ValueError("None of the requested frames could be decoded from the video.")

    # Pad missing indices with the last requested frame that was decoded.
    fallback = decoded[max(decoded)]
    frames = [decoded.get(i, fallback) for i in indices]
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

def extract_video_embedding(model, processor, video_path, device, num_frames=8):
    """
    Extract embedding for a single video.

    Opens the video, samples ``num_frames`` evenly spaced frames, runs them
    through the model together with a fixed dummy prompt, and reduces
    ``outputs.video_hidden_states`` to a NumPy array by averaging over dim 1.

    Args:
        model (VideoLlavaForConditionalGeneration): The Video-LLaVA model.
        processor (VideoLlavaProcessor): The processor for Video-LLaVA.
        video_path (str): Path to the video file.
        device (torch.device): Device to run the model on.
        num_frames (int): Number of frames to sample from the video. Defaults
            to 8, the value this function always used before it became a
            parameter. NOTE(review): other values are untested against the
            model - confirm before changing.
    Returns:
        np.ndarray: ``video_hidden_states`` averaged over dim 1, squeezed and
            moved to the CPU.
    Raises:
        ValueError: If ``num_frames`` is smaller than 1 or the container
            reports zero frames for its first video stream.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be at least 1, got {num_frames}")

    container = av.open(video_path)
    try:
        total_frames = container.streams.video[0].frames
        if total_frames == 0:
            raise ValueError(f"No frames found in video: {video_path}")
        # Sample frames uniformly across the video.
        step = max(total_frames // num_frames, 1)
        indices = list(range(0, min(total_frames, step * num_frames), step))[:num_frames]
        if len(indices) < num_frames:
            # Video is shorter than num_frames: repeat the last index as padding.
            indices += [indices[-1]] * (num_frames - len(indices))
        video = read_video_pyav(container, indices)
    finally:
        # Always release the file handle, including when decoding fails.
        container.close()

    # Prepare prompt (dummy prompt as we are extracting embeddings)
    prompt = "USER: <video>\nExtract embedding. ASSISTANT:"
    inputs = processor(text=prompt, videos=video, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
        # NOTE(review): the layout of video_hidden_states is defined by the
        # transformers model output and is not visible here - confirm that
        # dim=1 is the axis that should be averaged and that the squeezed
        # result is the 1-D vector the CSV export expects.
        video_hidden_states = outputs.video_hidden_states
        embedding = video_hidden_states.mean(dim=1).squeeze().cpu().numpy()
    return embedding

def main():
    """
    Embed every video in ``videos_new`` and write the results to a CSV file.

    Loads the Video-LLaVA model and processor, extracts one embedding per
    video file found in the directory, and saves a two-column CSV
    (``filename`` without extension, ``embedding`` as a comma-joined string)
    to ``video_embeddings_new.csv``. Videos that fail to process are reported
    on stdout and skipped.

    Raises:
        ValueError: If the video directory does not exist or contains no
            files with a supported extension.
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    use_cuda = device.type == "cuda"

    # Initialize model and processor
    model_name = "LanguageBind/Video-LLaVA-7B-hf"
    model = VideoLlavaForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if use_cuda else torch.float32,
        device_map="auto" if use_cuda else None
    )
    if not use_cuda:
        # With device_map="auto" the weights are already placed by the loader;
        # moving a dispatched model again is redundant and can fail when some
        # modules were offloaded, so only move explicitly on the CPU path.
        model.to(device)
    model.eval()

    processor = VideoLlavaProcessor.from_pretrained(model_name)
    # Set padding_side to "left" as per usage tips
    processor.tokenizer.padding_side = "left"

    # Directory containing videos
    videos_dir = "videos_new"
    if not os.path.isdir(videos_dir):
        raise ValueError(f"Directory '{videos_dir}' does not exist.")

    # Supported video extensions
    video_extensions = {".mp4", ".mov", ".avi", ".mkv"}

    # List all video files; sorted so the CSV row order is reproducible
    # (os.listdir returns entries in arbitrary order).
    video_files = sorted(
        f for f in os.listdir(videos_dir)
        if os.path.splitext(f)[1].lower() in video_extensions
    )

    if not video_files:
        raise ValueError(f"No video files found in directory '{videos_dir}'.")

    # Prepare list to hold filename and embeddings
    data = []

    print("Processing videos and extracting embeddings...")
    for video_file in tqdm(video_files, desc="Embedding Videos"):
        video_path = os.path.join(videos_dir, video_file)
        filename, _ = os.path.splitext(video_file)
        try:
            embedding = extract_video_embedding(model, processor, video_path, device)
            # Convert embedding to list for CSV
            data.append({
                "filename": filename,
                "embedding": embedding.tolist()
            })
        except Exception as e:
            # Best effort: report the failure and continue with the next video.
            print(f"Error processing {video_file}: {e}")

    if not data:
        print("Warning: no embeddings were extracted; the output will contain only a header.")

    # Explicit columns keep the frame well-formed even when every video failed
    # (an empty DataFrame would otherwise have no 'embedding' column).
    df = pd.DataFrame(data, columns=["filename", "embedding"])

    # Save to CSV
    output_csv = "video_embeddings_new.csv"
    # To store embeddings as strings, join the list elements
    df['embedding'] = df['embedding'].apply(lambda x: ",".join(map(str, x)))
    df.to_csv(output_csv, index=False)
    print(f"Embeddings saved to {output_csv}")

# Entry point: run the embedding pipeline when this cell/script is executed
# directly (the captured output below shows it running in the notebook).
if __name__ == "__main__":
    main()


Using device: cpu


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Processing videos and extracting embeddings...


Embedding Videos:   0%|          | 0/13 [00:00<?, ?it/s]

Embeddings saved to video_embeddings_new.csv


In [2]:
import pandas as pd
import os
import ast

def merge_csv_files(embeddings_file, features_file, output_file, merge_type='inner'):
    """
    Merges video_embeddings.csv with video_features.csv based on video filenames.

    The embeddings file identifies videos by filename *without* extension,
    while the features file uses the full filename. The extension is stripped
    from the features side to build the join key, and the merged table keeps
    the features file's original ``filename`` column with ``embedding`` moved
    to the last position.

    Filenames are read as strings so that purely numeric stems (e.g. ``001``)
    are not parsed as integers and silently stop matching ``001.mp4``.

    NOTE: with 'outer' or 'right' merges, rows that exist only in the
    embeddings file end up with a missing ``filename``, because the
    embeddings-side key column is dropped after the merge.

    Args:
        embeddings_file (str): Path to video_embeddings.csv.
        features_file (str): Path to video_features.csv.
        output_file (str): Path to save the merged video_info.csv.
        merge_type (str): Type of merge to perform ('inner', 'outer', 'left', 'right').
    Raises:
        FileNotFoundError: If either input file does not exist.
        ValueError: If a required column is missing from an input file.
    """

    # Check if input files exist
    if not os.path.isfile(embeddings_file):
        raise FileNotFoundError(f"The file {embeddings_file} does not exist.")
    if not os.path.isfile(features_file):
        raise FileNotFoundError(f"The file {features_file} does not exist.")

    # Read video_embeddings.csv; force the key column to str so values such as
    # "001" keep their leading zeros instead of being inferred as integers.
    print(f"Reading {embeddings_file}...")
    embeddings_df = pd.read_csv(embeddings_file, dtype={'filename': str})

    # Ensure necessary columns exist
    if 'filename' not in embeddings_df.columns or 'embedding' not in embeddings_df.columns:
        raise ValueError(f"{embeddings_file} must contain 'filename' and 'embedding' columns.")

    # Convert 'filename' in embeddings_df to string
    embeddings_df['filename'] = embeddings_df['filename'].astype(str)

    # Read video_features.csv with the same string handling for the key column
    print(f"Reading {features_file}...")
    features_df = pd.read_csv(features_file, dtype={'filename': str})

    # Ensure 'filename' column exists
    if 'filename' not in features_df.columns:
        raise ValueError(f"{features_file} must contain a 'filename' column.")

    # Extract filename without extension from features_df
    print("Processing filenames to remove extensions...")
    features_df['filename_no_ext'] = features_df['filename'].apply(lambda x: os.path.splitext(x)[0])

    # Convert 'filename_no_ext' to string
    features_df['filename_no_ext'] = features_df['filename_no_ext'].astype(str)

    # Merge the two DataFrames on 'filename_no_ext' and 'filename'
    print(f"Merging DataFrames with merge type '{merge_type}'...")
    merged_df = pd.merge(
        features_df,
        embeddings_df,
        left_on='filename_no_ext',
        right_on='filename',
        how=merge_type
    )

    # After merging, 'filename_x' comes from features_df and 'filename_y' from
    # embeddings_df. Drop the helper key and the embeddings-side duplicate,
    # then restore the features-side column to its original name.
    merged_df = (
        merged_df
        .drop(columns=['filename_no_ext', 'filename_y'])
        .rename(columns={'filename_x': 'filename'})
    )

    # Reorder columns to place 'embedding' at the end
    cols = [col for col in merged_df.columns if col != 'embedding'] + ['embedding']
    merged_df = merged_df[cols]

    # Save the merged DataFrame to CSV
    print(f"Saving merged data to {output_file}...")
    merged_df.to_csv(output_file, index=False)

    print("Merge completed successfully!")

if __name__ == "__main__":
    # Inputs and output of the merge step.
    embeddings_csv = 'video_embeddings_new.csv'  # embeddings keyed by filename without extension
    features_csv = 'video_features_new.csv'      # features keyed by full filename
    output_csv = 'video_info_new.csv'            # merged result is written here

    # How to join the two tables: 'inner', 'outer', 'left' or 'right'.
    #   'inner' keeps only videos that appear in both CSVs;
    #   'outer' keeps every row and leaves NaN where there is no match.
    merge_type = 'inner'

    # Run the merge with the settings above.
    merge_csv_files(
        embeddings_file=embeddings_csv,
        features_file=features_csv,
        output_file=output_csv,
        merge_type=merge_type,
    )


Reading video_embeddings_new.csv...
Reading video_features_new.csv...
Processing filenames to remove extensions...
Merging DataFrames with merge type 'inner'...
Saving merged data to video_info_new.csv...
Merge completed successfully!
