In [2]:
import os
import subprocess
import json
import pandas as pd
from tqdm import tqdm

# Define the paths to the video folder and CSV files
VIDEO_FOLDER = 'videos_new' 
OUTPUT_CSV = 'video_features_new.csv'
VIDEO_SHEET_CSV = 'video_sheet.csv'  # Ensure this path is correct

def get_ffprobe_metadata(video_path):
    """
    Extracts metadata from a video file using ffprobe.

    Args:
        video_path (str): Path to the video file.

    Returns:
        dict: A dictionary containing video metadata.
    """
    metadata = {}
    try:
        # Construct ffprobe command
        cmd = [
            'ffprobe',
            '-v', 'error',
            '-print_format', 'json',
            '-show_format',
            '-show_streams',
            video_path
        ]

        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        # Parse the JSON output
        ffprobe_output = json.loads(result.stdout)

        # Extract general format info
        format_info = ffprobe_output.get('format', {})
        metadata['filename'] = os.path.basename(video_path)
        metadata['duration_seconds'] = float(format_info.get('duration', 0))
        metadata['file_size_MB'] = float(format_info.get('size', 0)) / (1024 * 1024)

        # Initialize fields
        metadata['fps'] = None
        metadata['width'] = None
        metadata['height'] = None
        metadata['codec'] = None
        metadata['has_audio'] = False
        metadata['audio_fps'] = None
        metadata['audio_channels'] = None
        metadata['num_frames'] = 0

        # Iterate over streams
        streams = ffprobe_output.get('streams', [])
        for stream in streams:
            if stream.get('codec_type') == 'video':
                metadata['codec'] = stream.get('codec_name', 'Unknown')
                metadata['width'] = stream.get('width', 0)
                metadata['height'] = stream.get('height', 0)
                # Calculate fps
                r_frame_rate = stream.get('r_frame_rate', '0/0')
                nums = r_frame_rate.split('/')
                if len(nums) == 2 and int(nums[1]) != 0:
                    metadata['fps'] = float(nums[0]) / float(nums[1])
                # Number of frames
                if 'nb_frames' in stream:
                    metadata['num_frames'] = int(stream['nb_frames'])
            elif stream.get('codec_type') == 'audio':
                metadata['has_audio'] = True
                metadata['audio_fps'] = float(stream.get('sample_rate', 0))
                metadata['audio_channels'] = int(stream.get('channels', 0))

    except Exception as e:
        print(f"Error processing {video_path}: {e}")
        metadata = {
            'filename': os.path.basename(video_path),
            'duration_seconds': None,
            'fps': None,
            'width': None,
            'height': None,
            'num_frames': None,
            'file_size_MB': None,
            'codec': None,
            'has_audio': None,
            'audio_fps': None,
            'audio_channels': None
        }

    return metadata

def merge_video_metadata(df_metadata, info_csv):
    """
    Merges the extracted metadata with additional video information.

    Args:
        df_metadata (pd.DataFrame): DataFrame containing video metadata.
        info_csv (str): Path to 'video_info.csv'.

    Returns:
        pd.DataFrame: Merged DataFrame.
    """
    # Extract 'id' by removing the extension and converting to integer
    try:
        df_metadata['id'] = df_metadata['filename'].apply(lambda x: int(os.path.splitext(x)[0]))
    except ValueError as ve:
        print("Error extracting 'id' from filenames. Ensure all filenames before '.mp4' are integers.")
        print(ve)
        # Optionally, handle or remove problematic rows
        raise

    # Debugging: Print some IDs to verify
    print("Sample 'id' values:")
    print(df_metadata['id'].head())

    # Drop existing 'title', 'publish_time', 'likes' columns if present
    columns_to_drop = ['title', 'publish_time', 'likes']
    df_metadata = df_metadata.drop(columns=[col for col in columns_to_drop if col in df_metadata.columns])

    # Read the additional video info CSV
    try:
        df_info = pd.read_csv(info_csv)
    except FileNotFoundError:
        print(f"Error: The file '{info_csv}' does not exist.")
        return pd.DataFrame()  # Return empty DataFrame or handle as needed

    # Rename relevant columns for clarity and consistency
    df_info = df_info.rename(columns={
        '序号': 'id',
        '标题': 'title',
        '发布时间': 'publish_time',
        '点赞数': 'likes'
    })

    # Ensure 'id' in df_info is of integer type
    df_info['id'] = df_info['id'].astype(int)

    # Merge the DataFrames on 'id'
    merged_df = pd.merge(
        df_metadata,
        df_info[['id', 'title', 'publish_time', 'likes']],
        on='id',
        how='left'  # Use left join to retain all metadata entries
    )

    # Check for any missing merges
    missing_info = merged_df[merged_df['title'].isnull()]
    if not missing_info.empty:
        print("Warning: The following video IDs did not have matching entries in 'video_info.csv':")
        print(missing_info['id'].tolist())

    # Drop the temporary 'id' column as it's no longer needed
    merged_df = merged_df.drop(columns=['id'])

    return merged_df

def main():
    # Verify that ffprobe is accessible
    from shutil import which
    if which('ffprobe') is None:
        print("Error: ffprobe is not installed or not found in PATH.")
        print("Please install ffmpeg (which includes ffprobe) and ensure it's accessible from the command line.")
        print("Refer to the installation instructions provided earlier.")
        return
    else:
        print("ffprobe is found in PATH.")

    # Get list of video files
    try:
        video_files = [f for f in os.listdir(VIDEO_FOLDER) if f.lower().endswith('.mp4')]
    except FileNotFoundError:
        print(f"Error: The directory '{VIDEO_FOLDER}' does not exist.")
        return

    if not video_files:
        print(f"No '.mp4' files found in the directory '{VIDEO_FOLDER}'.")
        return

    video_paths = [os.path.join(VIDEO_FOLDER, f) for f in video_files]

    # Initialize a list to store metadata dictionaries
    metadata_list = []

    # Iterate over each video and extract metadata
    for video_path in tqdm(video_paths, desc="Extracting metadata with ffprobe"):
        metadata = get_ffprobe_metadata(video_path)
        metadata_list.append(metadata)

    # Convert the list of dictionaries to a pandas DataFrame
    df_metadata = pd.DataFrame(metadata_list)

    # Reorder columns for better readability
    columns_order = [
        'filename',
        'duration_seconds',
        'fps',
        'width',
        'height',
        'num_frames',
        'file_size_MB',
        'codec',
        'has_audio',
        'audio_fps',
        'audio_channels'
    ]
    
    # Ensure all columns are present before reordering
    df_metadata = df_metadata.reindex(columns=columns_order)

    # ================================
    # Merge with Additional Video Info
    # ================================

    # Define the path to the additional CSV file
    # VIDEO_SHEET_CSV = 'video_info.csv'  # Already defined at the top

    # Check if the additional CSV file exists
    if not os.path.exists(VIDEO_SHEET_CSV):
        print(f"Error: The file '{VIDEO_SHEET_CSV}' does not exist.")
        print("Please ensure the additional CSV file is present in the working directory.")
        return

    # Perform the merge
    try:
        merged_df = merge_video_metadata(df_metadata, VIDEO_SHEET_CSV)
    except Exception as e:
        print(f"An error occurred during merging: {e}")
        return

    if merged_df.empty:
        print("Merged DataFrame is empty. Exiting.")
        return

    # Reorder columns to include the new fields
    final_columns_order = [
        'filename',
        'duration_seconds',
        'fps',
        'width',
        'height',
        'num_frames',
        'file_size_MB',
        'codec',
        'has_audio',
        'audio_fps',
        'audio_channels',
        'title',
        'publish_time',
        'likes'
    ]
    
    # Ensure all final columns are present
    missing_final_cols = [col for col in final_columns_order if col not in merged_df.columns]
    if missing_final_cols:
        print(f"Warning: The following expected columns are missing in the merged DataFrame: {missing_final_cols}")
        # Optionally, handle missing columns, e.g., fill with NaN
        for col in missing_final_cols:
            merged_df[col] = pd.NA

    merged_df = merged_df[final_columns_order]

    # Save the merged DataFrame to a CSV file
    merged_df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
    print(f"Metadata extraction and merging complete. Metadata saved to '{OUTPUT_CSV}'.")

if __name__ == "__main__":
    main()


ffprobe is found in PATH.


Extracting metadata with ffprobe: 100%|██████████| 13/13 [00:00<00:00, 20.98it/s]

Sample 'id' values:
0    269
1    282
2    283
3    268
4    280
Name: id, dtype: int64
Metadata extraction and merging complete. Metadata saved to 'video_features_new.csv'.



