### 在运行此笔记本之前，请将变量 **VIDEO_FOLDER** 更改为您的视频文件夹名称，并将 **VIDEO_SHEET_CSV** 更改为您的 CSV 表格文件名；该表格需包含每个视频的 ID 及其对应的点赞数（列名为“序号”“标题”“发布时间”“点赞数”）。

### Video Features

In [None]:
import os
import subprocess
import json
import pandas as pd
from tqdm import tqdm

# Input/output locations for the feature-extraction cell.
# VIDEO_FOLDER is scanned for '.mp4' files, VIDEO_SHEET_CSV is the sheet that
# is joined onto the extracted metadata, and OUTPUT_CSV receives the result.
VIDEO_FOLDER = 'videos_new' # change to the name of your own video folder
OUTPUT_CSV = 'video_features_new.csv'
VIDEO_SHEET_CSV = 'video_sheet.csv'  # change to your .csv sheet with the per-video likes info

def get_ffprobe_metadata(video_path):
    """
    Extract container and stream metadata from a video file using ffprobe.

    This is a best-effort helper: any failure (ffprobe missing, non-zero exit
    status, unparsable output, unexpected field values) is reported on stdout
    and turned into a record whose every field except 'filename' is None, so
    a single bad file does not abort a batch run.

    Args:
        video_path (str): Path to the video file.

    Returns:
        dict: Keys 'filename', 'duration_seconds', 'file_size_MB', 'fps',
            'width', 'height', 'codec', 'has_audio', 'audio_fps',
            'audio_channels' and 'num_frames'.
    """
    metadata = {}
    try:
        # Ask ffprobe for the format section and every stream, as JSON.
        cmd = [
            'ffprobe',
            '-v', 'error',
            '-print_format', 'json',
            '-show_format',
            '-show_streams',
            video_path
        ]

        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        # Without this check, a failed probe whose stdout still parses as JSON
        # would be recorded as a valid video with duration 0 and size 0.
        if result.returncode != 0:
            raise RuntimeError(
                f"ffprobe exited with code {result.returncode}: {result.stderr.strip()}"
            )

        ffprobe_output = json.loads(result.stdout)

        # Container-level information.
        format_info = ffprobe_output.get('format', {})
        metadata['filename'] = os.path.basename(video_path)
        metadata['duration_seconds'] = float(format_info.get('duration', 0))
        metadata['file_size_MB'] = float(format_info.get('size', 0)) / (1024 * 1024)

        # Defaults used when the file lacks a video or an audio stream.
        metadata['fps'] = None
        metadata['width'] = None
        metadata['height'] = None
        metadata['codec'] = None
        metadata['has_audio'] = False
        metadata['audio_fps'] = None
        metadata['audio_channels'] = None
        metadata['num_frames'] = 0

        # If several streams of the same type exist, the last one wins.
        for stream in ffprobe_output.get('streams', []):
            codec_type = stream.get('codec_type')
            if codec_type == 'video':
                metadata['codec'] = stream.get('codec_name', 'Unknown')
                metadata['width'] = stream.get('width', 0)
                metadata['height'] = stream.get('height', 0)
                # r_frame_rate is a "numerator/denominator" string; leave fps
                # as None when the denominator is zero.
                nums = stream.get('r_frame_rate', '0/0').split('/')
                if len(nums) == 2 and int(nums[1]) != 0:
                    metadata['fps'] = float(nums[0]) / float(nums[1])
                # nb_frames is optional in ffprobe's output.
                if 'nb_frames' in stream:
                    metadata['num_frames'] = int(stream['nb_frames'])
            elif codec_type == 'audio':
                metadata['has_audio'] = True
                metadata['audio_fps'] = float(stream.get('sample_rate', 0))
                metadata['audio_channels'] = int(stream.get('channels', 0))

    except Exception as e:
        # Deliberate per-file boundary: report the problem and keep going.
        print(f"Error processing {video_path}: {e}")
        metadata = {
            'filename': os.path.basename(video_path),
            'duration_seconds': None,
            'fps': None,
            'width': None,
            'height': None,
            'num_frames': None,
            'file_size_MB': None,
            'codec': None,
            'has_audio': None,
            'audio_fps': None,
            'audio_channels': None
        }

    return metadata

def merge_video_metadata(df_metadata, info_csv):
    """
    Merge extracted video metadata with the per-video information sheet.

    The part of each 'filename' before the extension is parsed as an integer
    and used as a temporary 'id' join key against the sheet's '序号' column.
    The sheet's '标题', '发布时间' and '点赞数' columns are attached as
    'title', 'publish_time' and 'likes'. The join is a left join, so every
    metadata row is kept; rows without a match get missing values and their
    ids are reported on stdout.

    Args:
        df_metadata (pd.DataFrame): Metadata table with a 'filename' column.
            The caller's DataFrame is not modified.
        info_csv (str): Path to the CSV sheet holding the '序号', '标题',
            '发布时间' and '点赞数' columns.

    Returns:
        pd.DataFrame: The metadata columns (minus any pre-existing 'title',
            'publish_time', 'likes') followed by the three sheet columns.
            An empty DataFrame if `info_csv` does not exist.

    Raises:
        ValueError: If a filename stem cannot be converted to an integer.
    """
    # Build the join key as a separate Series so the caller's frame is untouched.
    try:
        ids = df_metadata['filename'].apply(
            lambda x: int(os.path.splitext(x)[0])
        ).rename('id')
    except ValueError as ve:
        print("Error extracting 'id' from filenames. Ensure all filenames before '.mp4' are integers.")
        print(ve)
        raise

    # Debugging aid: show a few ids so a bad filename pattern is easy to spot.
    print("Sample 'id' values:")
    print(ids.head())

    # Drop stale sheet columns so the merge does not produce suffixed
    # duplicates; drop() and assign() both return new frames.
    columns_to_drop = ['title', 'publish_time', 'likes']
    df_metadata = df_metadata.drop(
        columns=[col for col in columns_to_drop if col in df_metadata.columns]
    ).assign(id=ids)

    # Read the additional video info CSV.
    try:
        df_info = pd.read_csv(info_csv)
    except FileNotFoundError:
        print(f"Error: The file '{info_csv}' does not exist.")
        return pd.DataFrame()

    # Map the sheet's Chinese headers onto the names used downstream.
    df_info = df_info.rename(columns={
        '序号': 'id',
        '标题': 'title',
        '发布时间': 'publish_time',
        '点赞数': 'likes'
    })

    # Both join keys must share a dtype for the merge to match.
    df_info['id'] = df_info['id'].astype(int)

    merged_df = pd.merge(
        df_metadata,
        df_info[['id', 'title', 'publish_time', 'likes']],
        on='id',
        how='left'  # keep every metadata row, matched or not
    )

    # A missing title is used as the signal that no sheet row matched.
    missing_info = merged_df[merged_df['title'].isnull()]
    if not missing_info.empty:
        print(f"Warning: The following video IDs did not have matching entries in '{info_csv}':")
        print(missing_info['id'].tolist())

    # The join key was only needed for the merge.
    merged_df = merged_df.drop(columns=['id'])

    return merged_df

def main():
    """
    Run the feature-extraction pipeline for this cell.

    Probes every '.mp4' in VIDEO_FOLDER with ffprobe, joins the resulting
    table with the sheet at VIDEO_SHEET_CSV, and writes the combined table to
    OUTPUT_CSV. Each precondition failure (ffprobe missing, folder missing,
    no videos, sheet missing, merge error, empty merge) prints a message and
    returns without writing anything.
    """
    import shutil

    # ffprobe must be callable before anything else is attempted.
    if shutil.which('ffprobe') is None:
        print("Error: ffprobe is not installed or not found in PATH.")
        print("Please install ffmpeg (which includes ffprobe) and ensure it's accessible from the command line.")
        print("Refer to the installation instructions provided earlier.")
        return
    print("ffprobe is found in PATH.")

    # Collect the '.mp4' entries of the video folder (case-insensitive).
    try:
        folder_entries = os.listdir(VIDEO_FOLDER)
    except FileNotFoundError:
        print(f"Error: The directory '{VIDEO_FOLDER}' does not exist.")
        return
    video_files = [name for name in folder_entries if name.lower().endswith('.mp4')]

    if not video_files:
        print(f"No '.mp4' files found in the directory '{VIDEO_FOLDER}'.")
        return

    # One metadata record per video, in directory-listing order.
    metadata_list = [
        get_ffprobe_metadata(os.path.join(VIDEO_FOLDER, name))
        for name in tqdm(video_files, desc="Extracting metadata with ffprobe")
    ]

    # reindex() both orders the columns and creates any that are absent.
    feature_columns = [
        'filename',
        'duration_seconds',
        'fps',
        'width',
        'height',
        'num_frames',
        'file_size_MB',
        'codec',
        'has_audio',
        'audio_fps',
        'audio_channels'
    ]
    df_metadata = pd.DataFrame(metadata_list).reindex(columns=feature_columns)

    # The sheet with title / publish time / likes must exist before merging.
    if not os.path.exists(VIDEO_SHEET_CSV):
        print(f"Error: The file '{VIDEO_SHEET_CSV}' does not exist.")
        print("Please ensure the additional CSV file is present in the working directory.")
        return

    try:
        merged_df = merge_video_metadata(df_metadata, VIDEO_SHEET_CSV)
    except Exception as e:
        print(f"An error occurred during merging: {e}")
        return

    if merged_df.empty:
        print("Merged DataFrame is empty. Exiting.")
        return

    # Final layout: the ffprobe features followed by the sheet columns.
    final_columns_order = feature_columns + ['title', 'publish_time', 'likes']

    # Create any expected column the merge did not produce, filled with NA.
    absent_columns = [col for col in final_columns_order if col not in merged_df.columns]
    if absent_columns:
        print(f"Warning: The following expected columns are missing in the merged DataFrame: {absent_columns}")
        for col in absent_columns:
            merged_df[col] = pd.NA

    merged_df = merged_df[final_columns_order]

    # Written as UTF-8 with a BOM ('utf-8-sig').
    merged_df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
    print(f"Metadata extraction and merging complete. Metadata saved to '{OUTPUT_CSV}'.")


if __name__ == "__main__":
    main()

ffprobe is found in PATH.


Extracting metadata with ffprobe: 100%|██████████| 13/13 [00:00<00:00, 19.75it/s]

Sample 'id' values:
0    269
1    282
2    283
3    268
4    280
Name: id, dtype: int64
Metadata extraction and merging complete. Metadata saved to 'video_features_new.csv'.





### Video Embeddings

In [2]:
import os
import csv
import torch
import numpy as np
import av
from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor
from huggingface_hub import hf_hub_download
from tqdm.notebook import tqdm
import pandas as pd

def read_video_pyav(container, indices):
    """
    Decode selected frames of a video with the PyAV decoder.

    The first video stream is decoded from the start, and decoding stops as
    soon as the largest requested index has been passed. The result holds one
    frame per entry of `indices`, in the same order, so duplicate or unsorted
    indices are honoured. An index beyond the last decodable frame is filled
    with the last requested frame that could be decoded.

    Args:
        container (av.container.input.InputContainer): PyAV container.
        indices (List[int]): Frame indices to decode; must not be empty.

    Returns:
        np.ndarray: Stack of the decoded frames converted with
            ``to_ndarray(format="rgb24")``, one per requested index.

    Raises:
        ValueError: If `indices` is empty or none of the requested frames
            could be decoded.
    """
    # len() rather than truthiness so numpy index arrays are accepted too.
    if len(indices) == 0:
        raise ValueError("indices must contain at least one frame index.")

    wanted = [int(i) for i in indices]
    wanted_set = set(wanted)
    last_wanted = max(wanted_set)

    decoded = {}
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_wanted:
            break
        if i in wanted_set:
            decoded[i] = frame

    if not decoded:
        raise ValueError("None of the requested frame indices could be decoded.")

    # Handles videos with fewer frames than the requested indices assume.
    fallback = decoded[max(decoded)]
    frames = [decoded.get(i, fallback) for i in wanted]
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

def extract_video_embedding(model, processor, video_path, device):
    """
    Extract an embedding for a single video.

    Eight frames are sampled at a uniform stride from the first video stream,
    passed through the processor together with a fixed dummy prompt, and the
    model's `video_hidden_states` output is averaged over dim 1.

    Args:
        model (VideoLlavaForConditionalGeneration): The Video-LLaVA model.
        processor (VideoLlavaProcessor): The processor for Video-LLaVA.
        video_path (str): Path to the video file.
        device (torch.device): Device to run the model on.

    Returns:
        np.ndarray: Embedding for the video.

    Raises:
        ValueError: If the container reports zero frames for the video stream.
    """
    num_samples = 8

    # The context manager closes the container even if sampling fails;
    # all decoding happens inside the block.
    with av.open(video_path) as container:
        total_frames = container.streams.video[0].frames
        if total_frames == 0:
            raise ValueError(f"No frames found in video: {video_path}")

        # Uniform stride over the video, capped at num_samples indices.
        step = max(total_frames // num_samples, 1)
        indices = list(range(0, min(total_frames, step * num_samples), step))[:num_samples]
        if len(indices) < num_samples:
            # Short video: repeat the last index to reach num_samples entries.
            indices += [indices[-1]] * (num_samples - len(indices))
        video = read_video_pyav(container, indices)

    # Dummy prompt: only the video features are used, not the generated text.
    prompt = "USER: <video>\nExtract embedding. ASSISTANT:"

    inputs = processor(text=prompt, videos=video, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True, return_dict=True)
        video_hidden_states = outputs.video_hidden_states
        # NOTE(review): the layout of video_hidden_states is not established
        # here and may depend on the transformers version — confirm that
        # dim=1 is the axis that should be averaged.
        embedding = video_hidden_states.mean(dim=1).squeeze().cpu().numpy()
    return embedding

def main():
    """
    Embed every video in the 'videos_new' folder and save the results.

    Loads the Video-LLaVA model and processor, extracts one embedding per
    video file (.mp4/.mov/.avi/.mkv), and writes 'video_embeddings_new.csv'
    with a 'filename' column (name without extension) and an 'embedding'
    column (values joined by commas). A video that fails to process is
    reported on stdout and skipped.

    Raises:
        ValueError: If the folder does not exist, contains no video files, or
            no video could be embedded.
    """
    # Use the GPU when available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Half precision and automatic placement on GPU, full precision on CPU.
    model_name = "LanguageBind/Video-LLaVA-7B-hf"
    model = VideoLlavaForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
        device_map="auto" if device.type == "cuda" else None
    )
    # NOTE(review): with device_map="auto" the model may already be placed
    # by from_pretrained — confirm this extra move is still wanted on GPU.
    model.to(device)
    model.eval()

    processor = VideoLlavaProcessor.from_pretrained(model_name)

    # Set explicitly to silence a deprecation warning.
    # NOTE(review): 14 and "mean" were introduced as example values — verify
    # them against the checkpoint's configuration.
    processor.patch_size = 14
    processor.vision_feature_select_strategy = "mean"

    # Left padding, as recommended by the model's usage tips.
    processor.tokenizer.padding_side = "left"

    videos_dir = "videos_new"
    if not os.path.isdir(videos_dir):
        raise ValueError(f"Directory '{videos_dir}' does not exist.")

    # Files are selected by extension, case-insensitively.
    video_extensions = {".mp4", ".mov", ".avi", ".mkv"}
    video_files = [
        f for f in os.listdir(videos_dir)
        if os.path.splitext(f)[1].lower() in video_extensions
    ]

    if not video_files:
        raise ValueError(f"No video files found in directory '{videos_dir}'.")

    # One {"filename", "embedding"} record per successfully processed video.
    data = []

    print("Processing videos and extracting embeddings...")
    for video_file in tqdm(video_files, desc="Embedding Videos"):
        video_path = os.path.join(videos_dir, video_file)
        filename, _ = os.path.splitext(video_file)
        try:
            embedding = extract_video_embedding(model, processor, video_path, device)
            # Lists serialise more predictably than arrays.
            embedding_list = embedding.tolist()
            data.append({
                "filename": filename,
                "embedding": embedding_list
            })
        except Exception as e:
            # Best effort: report the failing video and continue.
            print(f"Error processing {video_file}: {e}")

    # With no records the DataFrame would have no 'embedding' column and the
    # lookup below would raise an opaque KeyError; fail clearly instead.
    if not data:
        raise ValueError(f"No embeddings could be extracted from the videos in '{videos_dir}'.")

    df = pd.DataFrame(data)

    output_csv = "video_embeddings_new.csv"
    # Store each embedding as one comma-separated string per row.
    df['embedding'] = df['embedding'].apply(lambda x: ",".join(map(str, x)))
    df.to_csv(output_csv, index=False)
    print(f"Embeddings saved to {output_csv}")


if __name__ == "__main__":
    main()

Using device: cpu


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Processing videos and extracting embeddings...


Embedding Videos:   0%|          | 0/13 [00:00<?, ?it/s]

Embeddings saved to video_embeddings_new.csv


### Merge into one .csv file

In [3]:
import pandas as pd
import os
import ast

def merge_csv_files(embeddings_file, features_file, output_file, merge_type='inner'):
    """
    Merge the video embeddings CSV with the video features CSV by filename.

    The features file stores filenames with their extension and the
    embeddings file stores them without, so the join key is the features
    filename with its extension removed. Filenames are read as text, so
    numeric-looking names such as "007" keep their leading zeros and still
    match. The merged table keeps the features file's 'filename', places
    'embedding' last, and is written to `output_file`.

    For merge types that keep rows found only in the embeddings file
    ('outer', 'right'), the missing 'filename' is filled with the embeddings
    file's extension-less name instead of being left empty.

    Args:
        embeddings_file (str): Path to the CSV with 'filename' and
            'embedding' columns.
        features_file (str): Path to the CSV with a 'filename' column.
        output_file (str): Path where the merged CSV is saved.
        merge_type (str): Type of merge to perform ('inner', 'outer',
            'left', 'right').

    Raises:
        FileNotFoundError: If either input file does not exist.
        ValueError: If a required column is missing from an input file.
    """
    # Fail early with a clear message if an input is missing.
    if not os.path.isfile(embeddings_file):
        raise FileNotFoundError(f"The file {embeddings_file} does not exist.")
    if not os.path.isfile(features_file):
        raise FileNotFoundError(f"The file {features_file} does not exist.")

    # dtype=str stops pandas from parsing numeric-looking filenames as
    # numbers, which would drop leading zeros and break the join.
    print(f"Reading {embeddings_file}...")
    embeddings_df = pd.read_csv(embeddings_file, dtype={'filename': str})

    if 'filename' not in embeddings_df.columns or 'embedding' not in embeddings_df.columns:
        raise ValueError(f"{embeddings_file} must contain 'filename' and 'embedding' columns.")

    embeddings_df['filename'] = embeddings_df['filename'].astype(str)

    print(f"Reading {features_file}...")
    features_df = pd.read_csv(features_file, dtype={'filename': str})

    if 'filename' not in features_df.columns:
        raise ValueError(f"{features_file} must contain a 'filename' column.")

    # Join key: the features filename without its extension, as text.
    print("Processing filenames to remove extensions...")
    features_df['filename_no_ext'] = (
        features_df['filename'].apply(lambda x: os.path.splitext(x)[0]).astype(str)
    )

    # Both frames have a 'filename' column, so the merge suffixes them:
    # 'filename_x' comes from features_df and 'filename_y' from embeddings_df.
    print(f"Merging DataFrames with merge type '{merge_type}'...")
    merged_df = pd.merge(
        features_df,
        embeddings_df,
        left_on='filename_no_ext',
        right_on='filename',
        how=merge_type
    )

    # Rows present only in the embeddings file have no features-side name;
    # fall back to the embeddings-side name so the row stays identifiable.
    merged_df['filename_x'] = merged_df['filename_x'].fillna(merged_df['filename_y'])

    # Drop the helper key and the redundant name, restore the original header.
    merged_df = merged_df.drop(columns=['filename_no_ext', 'filename_y'])
    merged_df = merged_df.rename(columns={'filename_x': 'filename'})

    # Place 'embedding' at the end.
    cols = [col for col in merged_df.columns if col != 'embedding'] + ['embedding']
    merged_df = merged_df[cols]

    print(f"Saving merged data to {output_file}...")
    merged_df.to_csv(output_file, index=False)

    print("Merge completed successfully!")

if __name__ == "__main__":
    # Inputs produced by the two previous cells, and the combined output file.
    embeddings_csv = 'video_embeddings_new.csv'
    features_csv = 'video_features_new.csv'
    output_csv = 'video_info_new.csv'

    # Join strategy, one of 'inner', 'outer', 'left', 'right':
    #   'inner' keeps only filenames present in both CSVs;
    #   'outer' keeps every row and leaves gaps where there is no match.
    merge_type = 'inner'

    merge_csv_files(
        embeddings_file=embeddings_csv,
        features_file=features_csv,
        output_file=output_csv,
        merge_type=merge_type,
    )

Reading video_embeddings_new.csv...
Reading video_features_new.csv...
Processing filenames to remove extensions...
Merging DataFrames with merge type 'inner'...
Saving merged data to video_info_new.csv...
Merge completed successfully!
