In [1]:
data_dir = "G:/ModelscopeTraining/Datasets/subset_v20230528_1848" # the input directory path

In [2]:
# Crop black bars

import os
import subprocess


# Input
toCrop_dir = f"{data_dir}/toCrop"

# Output
cropped_dir = f"{data_dir}/cropped"

input_dir = toCrop_dir
output_dir = cropped_dir

# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Iterate over all video files in the input directory
for filename in os.listdir(input_dir):
    # Skip files that are not videos
    if not (filename.endswith(".mp4") or filename.endswith(".webm") or filename.endswith(".mkv") or filename.endswith(".avi") or filename.endswith(".mov")):
        continue

    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + ".mp4")

    # Step 1: Use FFmpeg to detect black bars
    command = ["ffmpeg", "-i", input_file, "-vf", "cropdetect=24:16:0", "-f", "null", "-"]
    cropdetect_result = subprocess.run(command, capture_output=True, text=True)

    # Parse the cropdetect result to get the crop parameters
    try:
        crop_params = [line for line in cropdetect_result.stderr.split('\n') if 'crop=' in line][-1].split('crop=')[1].split(' ')[0]
    except IndexError:
        crop_params = None

    # Step 2: If black bars are detected, crop them out
    if crop_params:
        command = ["ffmpeg", "-i", input_file, "-vf", f"crop={crop_params}", output_file]
    else:
        # If no black bars are detected, just convert the video to .mp4
        command = ["ffmpeg", "-i", input_file, "-c:v", "libx264", "-crf", "23", "-preset", "veryfast", output_file]
    
    subprocess.run(command)


Exception in thread Thread-4 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 2074: invalid continuation byte


AttributeError: 'NoneType' object has no attribute 'split'

In [4]:
# Filter and crop

import os
import subprocess
import re
from concurrent.futures import ThreadPoolExecutor
from moviepy.editor import VideoFileClip

# Input
toFinalize = f"{data_dir}/toCrop"

# Output
finished = f"{data_dir}/finished"

input_dir = toCrop_dir
output_dir = cropped_dir

# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Step 1: Setup environment
min_res_width = 640  # minimum resolution width
min_res_height = 360  # minimum resolution height
min_duration = 60  # minimum duration in seconds
out_res_width = 576  # output resolution width
out_res_height = 320  # output resolution height
num_threads = 8  # number of threads to use for processing

# Step 2: Iterate through all files
file_list = []

for file_name in os.listdir(input_dir):
    if file_name.endswith((".mp4", ".mov", ".webm", ".mkv", ".avi")):
        file_path = os.path.join(input_dir, file_name)

        cmd_width = f'ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
        cmd_height = f'ffprobe -v error -select_streams v:0 -show_entries stream=height -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
        
        try:
            width = int(float(subprocess.check_output(cmd_width, shell=True).decode("utf-8").strip()))
            height = int(float(subprocess.check_output(cmd_height, shell=True).decode("utf-8").strip()))
            
            # Try getting the duration with ffprobe first
            try:
                cmd_duration = f'ffprobe -v error -select_streams v:0 -show_entries stream=duration -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
                duration = int(float(subprocess.check_output(cmd_duration, shell=True).decode("utf-8").strip()))
            except:
                # If ffprobe fails, use moviepy
                clip = VideoFileClip(file_path)
                duration = int(clip.duration)
                clip.close()  # Ensure to close the clip to free up system resources
            
            if width >= min_res_width and height >= min_res_height and duration >= min_duration:
                file_list.append(file_name)
        except ValueError as e:
            print(f"Error: Could not retrieve width, height, or duration for file {file_name}. Skipping...")
        except subprocess.CalledProcessError as e:
            print(f"Error processing file {file_name}: {e}")


# Step 3: Write to .txt file
with open(os.path.join(output_dir, "video_list.txt"), "w") as f:
    for file_name in file_list:
        f.write("%s\n" % file_name)

# Step 4 & 5: Scale, crop videos and save (multithreaded)
def process_video(file_name):
    clean_name = re.sub(r"\W+", "", file_name)
    input_path = os.path.join(input_dir, file_name)
    output_path = os.path.join(output_dir, clean_name)
    output_path = os.path.splitext(output_path)[0] + ".mp4"
    cmd = f'ffmpeg -i "{input_path}" -vf "scale={out_res_width}:{out_res_height}:force_original_aspect_ratio=increase,crop={out_res_width}:{out_res_height},setsar=1" -b:v 1M "{output_path}"'
    try:
        subprocess.run(cmd, shell=True, check=True)
        print(f"Processing complete for file: {file_name}")
    except subprocess.CalledProcessError as e:
        print(f"Error processing file {file_name}: {e}")

# Use ThreadPoolExecutor to process videos in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    executor.map(process_video, file_list)


Processing complete for file: 2015 Kid Fest Recap-151918644.mp4
Processing complete for file: 16bit - Dinosaurs (Official Video) [Kevdt1T9daA].webm
Processing complete for file: 'Left, Right, Left, Right'-79727688.mp4
Processing complete for file: 20syl - Back & Forth (Official Music Video) [NtjnO-ge6s0].webm
Processing complete for file: 2 Days in Doha-114674220.mov
Processing complete for file: 2Pac - Brenda's Got A Baby [NRWUs0KtB-I].webm
Processing complete for file: 2Pac - Changes ft. Talent [eXvBjCO19QY].webmProcessing complete for file: 2Pac - If My Homie Calls [HWMsWO0vPa8].webm
Processing complete for file: 112 - Anywhere (Official Music Video) [fLghl1M-a1I].webm
Processing complete for file: 30x30 (Aktion)-84528250.mp4
Processing complete for file: 2Pac, The Outlawz - Baby Don't Cry (Keep Ya Head Up II) [ZgQZENuDrBY].webm
Processing complete for file: 2Pac - I Wonder If Heaven Got A Ghetto [_fCoK2OVJAc].webm

Processing complete for file: 4K MYSTIEYED-90443453.mp4
Processing 

In [8]:
#Split clips

import os
import subprocess

# Input
toSplit_dir = f"{data_dir}/toSplit"

# Output
split_dir = f"{data_dir}/splitted"

input_dir = toSplit_dir
output_dir = split_dir

# Get a list of video files in the input directory
video_files = [
    file for file in os.listdir(input_dir)
    if file.endswith((".mkv", ".mp4", ".avi", ".webm"))
]

# Iterate over each video file
for video_file in video_files:
    # Construct the full paths for input and output files
    input_path = os.path.join(input_dir, video_file)
    output_path = os.path.join(output_dir, os.path.splitext(video_file)[0])

    # Create the output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Run SceneDetect CLI via subprocess
    subprocess.run(
        [
            "scenedetect",
            "-i",
            input_path,
            "-o",
            output_path,
            "detect-content",
            "-t",
            "27",  # Set detection threshold (adjust as needed)
            "list-scenes",
            "split-video",
        ],
        capture_output=True,
        text=True
    )

    print(f"Scenes detected and clips split for {video_file}")


Scenes detected and clips split for 'Left, Right, Left, Right'-79727688.mp4
Scenes detected and clips split for 112 - Anywhere (Official Music Video) [fLghl1M-a1I].webm
Scenes detected and clips split for 16bit - Dinosaurs (Official Video) [Kevdt1T9daA].webm
Scenes detected and clips split for 20syl - Back & Forth (Official Music Video) [NtjnO-ge6s0].webm
Scenes detected and clips split for 2Pac, The Outlawz - Baby Don't Cry (Keep Ya Head Up II) [ZgQZENuDrBY].webm
Scenes detected and clips split for 30x30 (Aktion)-84528250.mp4
Scenes detected and clips split for A Family Christmas-155074322.mp4
Scenes detected and clips split for A Tribe Called Quest - Can I Kick It？ (Official HD Video) [O3pyCGnZzYA].webm
Scenes detected and clips split for A Tribe Called Quest - Jazz (We've Got) Buggin' Out (Official HD Video) [cxN4nKk2cfk].webm
Scenes detected and clips split for A Tribe Called Quest - Scenario (Official HD Video) [Q6TLWqn82J4].webm
Scenes detected and clips split for a-ha - Take On 

KeyboardInterrupt: 