In [None]:
# Dataset locations for the 100-clip subset.
# NOTE: the main processing cell further down assigns input_dir and output_dir
# again, so these two values are overridden whenever that cell is run.
input_dir = "G:/ModelscopeTraining/Datasets/subset100_052423" # the input directory path
output_dir = "G:/ModelscopeTraining/Datasets/subset100_clean" # the output directory path

In [None]:
%pip install moviepy

In [None]:
import os
import subprocess
import re
from concurrent.futures import ThreadPoolExecutor
from moviepy.editor import VideoFileClip

# Step 1: Setup environment
# Raw strings keep the Windows backslashes literal. In a plain string "\M",
# "\D" and "\s" are invalid escape sequences, which newer Python versions
# warn about; the resulting path values are unchanged.
input_dir = r"G:\ModelscopeTraining\Datasets\subset_v20230525"  # the input directory path
output_dir = r"G:\ModelscopeTraining\Datasets\subset_v20230525_prepped"  # the output directory path
min_res_width = 640  # minimum resolution width
min_res_height = 360  # minimum resolution height
min_duration = 60  # minimum duration in seconds
out_res_width = 576  # output resolution width
out_res_height = 320  # output resolution height
num_threads = 8  # number of threads to use for processing

# Step 2: Iterate through all files
_VIDEO_EXTENSIONS = (".mp4", ".mov", ".webm")


def _probe_video_stream(path, entry):
    """Return ffprobe's text output for one field of the first video stream.

    The command is passed as an argument list (no shell), so file names that
    contain quotes or shell metacharacters cannot break or alter it.

    Raises:
        subprocess.CalledProcessError: ffprobe exited with a non-zero status.
        OSError: the ffprobe executable could not be started.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", f"stream={entry}",
        "-of", "default=noprint_wrappers=1:nokey=1",
        path,
    ]
    return subprocess.check_output(cmd).decode("utf-8").strip()


def _probe_duration(path):
    """Return the duration of *path* in whole seconds.

    ffprobe is tried first; when it fails or prints something that is not a
    number, the duration is read through moviepy instead.
    """
    try:
        return int(float(_probe_video_stream(path, "duration")))
    except (subprocess.CalledProcessError, ValueError, OSError):
        clip = VideoFileClip(path)
        try:
            return int(clip.duration)
        finally:
            clip.close()  # release the reader even if reading the duration fails


file_list = []

# Sorted so the resulting list has the same order on every run.
for file_name in sorted(os.listdir(input_dir)):
    # Compare case-insensitively so files such as "CLIP.MP4" are not skipped.
    if not file_name.lower().endswith(_VIDEO_EXTENSIONS):
        continue
    file_path = os.path.join(input_dir, file_name)

    try:
        width = int(float(_probe_video_stream(file_path, "width")))
        height = int(float(_probe_video_stream(file_path, "height")))
        duration = _probe_duration(file_path)
    except ValueError:
        print(f"Error: Could not retrieve width, height, or duration for file {file_name}. Skipping...")
        continue
    except subprocess.CalledProcessError as e:
        print(f"Error processing file {file_name}: {e}")
        continue
    except Exception as e:
        # Per-file boundary of a best-effort batch scan: a failure in the
        # moviepy fallback or a missing ffprobe executable skips this file
        # instead of aborting the whole scan.
        print(f"Error processing file {file_name}: {e}")
        continue

    # Keep only videos that meet the minimum resolution and duration.
    if width >= min_res_width and height >= min_res_height and duration >= min_duration:
        file_list.append(file_name)


# Step 3: Write to .txt file
# open() does not create missing parent directories, so make sure the output
# directory exists before writing into it.
os.makedirs(output_dir, exist_ok=True)
# Explicit UTF-8 so file names with non-ASCII characters can always be written,
# whatever the platform's default encoding is.
with open(os.path.join(output_dir, "video_list.txt"), "w", encoding="utf-8") as f:
    f.writelines(f"{file_name}\n" for file_name in file_list)

# Step 4 & 5: Scale, crop videos and save (multithreaded)
def process_video(file_name):
    """Scale and center-crop one input video to the output resolution with ffmpeg.

    The result is written to ``output_dir`` as ``<sanitized stem>.mp4``, where
    the stem is the input file name without its extension and with every
    non-word character removed. Failures are reported with ``print`` rather
    than raised.

    Args:
        file_name: Name of a file inside ``input_dir``.
    """
    # Strip the extension before sanitizing. Sanitizing the full name first
    # removes the dot as well, which turned "a.mp4" into "amp4.mp4".
    stem = os.path.splitext(file_name)[0]
    clean_stem = re.sub(r"\W+", "", stem)
    if not clean_stem:
        # Nothing usable left in the stem: fall back to the sanitized full name.
        clean_stem = re.sub(r"\W+", "", file_name)

    input_path = os.path.join(input_dir, file_name)
    output_path = os.path.join(output_dir, clean_stem + ".mp4")

    # Scale up to cover the target frame, crop the overflow, force square pixels.
    video_filter = (
        f"scale={out_res_width}:{out_res_height}:force_original_aspect_ratio=increase,"
        f"crop={out_res_width}:{out_res_height},setsar=1"
    )
    # Argument list (no shell) so unusual file names cannot break the command.
    # -nostdin/-n: never wait on the interactive overwrite prompt from a worker
    # thread; an existing output file makes ffmpeg exit with an error instead.
    cmd = [
        "ffmpeg", "-nostdin", "-n",
        "-i", input_path,
        "-vf", video_filter,
        "-b:v", "1M",
        output_path,
    ]
    try:
        subprocess.run(cmd, check=True)
        print(f"Processing complete for file: {file_name}")
    except subprocess.CalledProcessError as e:
        print(f"Error processing file {file_name}: {e}")
    except OSError as e:
        # Raised when the ffmpeg executable itself cannot be started.
        print(f"Error processing file {file_name}: {e}")

# Use ThreadPoolExecutor to process videos in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Keep each future so that an exception raised inside a worker is reported;
    # the results of executor.map() were never read, which silently discarded
    # any such exception.
    pending = [(name, executor.submit(process_video, name)) for name in file_list]
    for name, future in pending:
        error = future.exception()  # blocks until this job has finished
        if error is not None:
            print(f"Error processing file {name}: {error}")
