In [2]:
data_dir = "G:/ModelscopeTraining/Datasets/subset_v20230528_1848" # the input directory path

Crop Black Bars in folder /toCrop

In [4]:
# Detect and Crop black bars in /toCrop

import os
import subprocess


# Input
toCrop_dir = f"{data_dir}/toCrop"

# Output
cropped_dir = f"{data_dir}/cropped"

input_dir = toCrop_dir
output_dir = cropped_dir

# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Iterate over all video files in the input directory
for filename in os.listdir(input_dir):
    # Skip files that are not videos
    if not (filename.endswith(".mp4") or filename.endswith(".webm") or filename.endswith(".mkv") or filename.endswith(".avi") or filename.endswith(".mov")):
        continue

    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + ".mp4")

    # Step 1: Use FFmpeg to detect black bars
    command = ["ffmpeg", "-i", input_file, "-vf", "cropdetect=24:16:0", "-f", "null", "-"]
    try:
        cropdetect_result = subprocess.run(command, capture_output=True, text=True, errors='ignore')
    except Exception as e:
        print(f"Error executing command {command}. Error: {e}")
        continue

    # Parse the cropdetect result to get the crop parameters
    try:
        if cropdetect_result.stderr is not None:
            crop_params = [line for line in cropdetect_result.stderr.split('\n') if 'crop=' in line][-1].split('crop=')[1].split(' ')[0]
        else:
            crop_params = None
    except IndexError:
        crop_params = None

    # Step 2: If black bars are detected, crop them out
    if crop_params:
        command = ["ffmpeg", "-i", input_file, "-vf", f"crop={crop_params}", output_file]
    else:
        # If no black bars are detected, just convert the video to .mp4
        command = ["ffmpeg", "-i", input_file, "-c:v", "libx264", "-crf", "23", "-preset", "veryfast", output_file]
    
    try:
        subprocess.run(command)
    except Exception as e:
        print(f"Error executing command {command}. Error: {e}")



Cut N Crop (process files in /toCutNcrop)

In [None]:
#Split clips in /toCutnCrop

import os
import subprocess

# Input
toSplit_dir = f"{data_dir}/toCutNcrop

# Output
split_dir = f"{data_dir}/splittedb4crop"

input_dir = toSplit_dir
output_dir = split_dir

# Get a list of video files in the input directory
video_files = [
    file for file in os.listdir(input_dir)
    if file.endswith((".mkv", ".mp4", ".avi", ".webm", ".mov"))
]

# Iterate over each video file
for video_file in video_files:
    # Construct the full paths for input and output files
    input_path = os.path.join(input_dir, video_file)
    output_path = os.path.join(output_dir, os.path.splitext(video_file)[0])

    # Create the output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Run SceneDetect CLI via subprocess
    subprocess.run(
        [
            "scenedetect",
            "-i",
            input_path,
            "-o",
            output_path,
            "detect-content",
            "-t",
            "27",  # Set detection threshold (adjust as needed)
            "list-scenes",
            "split-video",
        ],
        capture_output=True,
        text=True
    )

    print(f"Scenes detected and clips split for {video_file}")


In [None]:
# Crop black bars in /toCutnCrop

import os
import subprocess


# Input
toCrop_dir = f"{data_dir}/splittedb4crop"

# Output
cropped_dir = f"{data_dir}/splitNcropped"

input_dir = toCrop_dir
output_dir = cropped_dir

# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Iterate over all video files in the input directory
for filename in os.listdir(input_dir):
    # Skip files that are not videos
    if not (filename.endswith(".mp4") or filename.endswith(".webm") or filename.endswith(".mkv") or filename.endswith(".avi") or filename.endswith(".mov")):
        continue

    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, os.path.splitext(filename)[0] + ".mp4")

    # Step 1: Use FFmpeg to detect black bars
    command = ["ffmpeg", "-i", input_file, "-vf", "cropdetect=24:16:0", "-f", "null", "-"]
    try:
        cropdetect_result = subprocess.run(command, capture_output=True, text=True, errors='ignore')
    except Exception as e:
        print(f"Error executing command {command}. Error: {e}")
        continue

    # Parse the cropdetect result to get the crop parameters
    try:
        if cropdetect_result.stderr is not None:
            crop_params = [line for line in cropdetect_result.stderr.split('\n') if 'crop=' in line][-1].split('crop=')[1].split(' ')[0]
        else:
            crop_params = None
    except IndexError:
        crop_params = None

    # Step 2: If black bars are detected, crop them out
    if crop_params:
        command = ["ffmpeg", "-i", input_file, "-vf", f"crop={crop_params}", output_file]
    else:
        # If no black bars are detected, just convert the video to .mp4
        command = ["ffmpeg", "-i", input_file, "-c:v", "libx264", "-crf", "23", "-preset", "veryfast", output_file]
    
    try:
        subprocess.run(command)
    except Exception as e:
        print(f"Error executing command {command}. Error: {e}")



Split Clean Clips

In [7]:
#Split clips in /clean

import os
import subprocess

# Input
toSplit_dir = f"{data_dir}/clean"

# Output
split_dir = f"{data_dir}/splitted"

input_dir = toSplit_dir
output_dir = split_dir

# Get a list of video files in the input directory
video_files = [
    file for file in os.listdir(input_dir)
    if file.endswith((".mkv", ".mp4", ".avi", ".webm", ".mov"))
]

# Iterate over each video file
for video_file in video_files:
    # Construct the full paths for input and output files
    input_path = os.path.join(input_dir, video_file)
    output_path = os.path.join(output_dir, os.path.splitext(video_file)[0])

    # Create the output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Run SceneDetect CLI via subprocess
    subprocess.run(
        [
            "scenedetect",
            "-i",
            input_path,
            "-o",
            output_path,
            "detect-content",
            "-t",
            "27",  # Set detection threshold (adjust as needed)
            "list-scenes",
            "split-video",
        ],
        capture_output=True,
        text=True
    )

    print(f"Scenes detected and clips split for {video_file}")


Scenes detected and clips split for 'Left, Right, Left, Right'-79727688.mp4
Scenes detected and clips split for 'Machine'  -30-100912126.mov
Scenes detected and clips split for 'minibeer licor43'-114865390.mov
Scenes detected and clips split for 00 Publix KK 30-122682021.mov
Scenes detected and clips split for 08 PYM 30 Taggable SY-132834069.mov
Scenes detected and clips split for 10 Second Holiday Video-55491848.mov
Scenes detected and clips split for 112 - Anywhere (Official Music Video) [fLghl1M-a1I].webm
Scenes detected and clips split for 12month。-122514986.mov
Scenes detected and clips split for 150306 12ZOO0003-00 ZOO 20s NL Planckendael-121935891.mov
Scenes detected and clips split for 16bit - Dinosaurs (Official Video) [Kevdt1T9daA].webm
Scenes detected and clips split for 2 Days in Doha-114674220.mov
Scenes detected and clips split for 2015 01 27 BolgerNaglForum ImagesOverNagl-119148197.mov
Scenes detected and clips split for 20syl - Back & Forth (Official Music Video) [NtjnO

Exception in thread Thread-1370 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 5679: invalid start byte


Scenes detected and clips split for p-FUNDACIO  N DONDE TV GENERICO-30fps prores-104426946.mov
Scenes detected and clips split for Panda Bear - Boys Latin (Official Video) [prBaZzYmQrI].webm
Scenes detected and clips split for Pat Lok - WYG ( 4 ME ) (feat. JONES) [Official Lyric Video] [gKIpZYH7rns].webm
Scenes detected and clips split for Pearl Jam - Even Flow (Official Video) [CxKWTzr-k6s].webm
Scenes detected and clips split for Pensées positives-77641129.mp4
Scenes detected and clips split for Perturbator - ＂Sentient＂ [Music Video - UNCENSORED - ＂The Uncanny Valley＂] [oTN6cGmH2yM].webm
Scenes detected and clips split for Pet Turkey AXMT3779000H ProRes-80556741.mov
Scenes detected and clips split for Philosophy Rough V2-107201812.mp4
Scenes detected and clips split for Pils Hellas-125829199.mov
Scenes detected and clips split for Plurabelle - Our Fires [1zTr1Zn-h7I].webm
Scenes detected and clips split for Poncho - Tiki tiki ft. Drear Mar I (video oficial) [tHfkjjhWK8c].webm
Scenes

Exception in thread Thread-1724 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 4838: invalid continuation byte


Scenes detected and clips split for Vichy Neovadiol Elixir-90860884.mov
Scenes detected and clips split for VISA - Antman-134110643.mov
Scenes detected and clips split for VISA - Hulk Hands-125691155.mov
Scenes detected and clips split for VISA - Little Mind Readers-134110644.mov
Scenes detected and clips split for VLAM dir cut 36 fr,-prores-119764911.mov
Scenes detected and clips split for Vodafone Blendle - 'Kiosk' (NL)-94507446.mov
Scenes detected and clips split for Volvo Funny OscarRospide Prores-121838294.mov
Scenes detected and clips split for Von Pariahs - Skywalking (Official) [SnwytlUUAYU].webm
Scenes detected and clips split for VW GLI 'Sidewalk'-148431970.mov
Scenes detected and clips split for Website Intro Video H264 1080p 25-121327555.mov
Scenes detected and clips split for Website Slider-130394245.mov
Scenes detected and clips split for Welcome to our website!-122497110.mov
Scenes detected and clips split for WGSS0125000H-100445787.mov
Scenes detected and clips split fo

In [8]:
#Split clips in /cropped

import os
import subprocess

# Input
toSplit_dir = f"{data_dir}/cropped"

# Output
split_dir = f"{data_dir}/splitted"

input_dir = toSplit_dir
output_dir = split_dir

# Get a list of video files in the input directory
video_files = [
    file for file in os.listdir(input_dir)
    if file.endswith((".mkv", ".mp4", ".avi", ".webm", ".mov"))
]

# Iterate over each video file
for video_file in video_files:
    # Construct the full paths for input and output files
    input_path = os.path.join(input_dir, video_file)
    output_path = os.path.join(output_dir, os.path.splitext(video_file)[0])

    # Create the output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Run SceneDetect CLI via subprocess
    subprocess.run(
        [
            "scenedetect",
            "-i",
            input_path,
            "-o",
            output_path,
            "detect-content",
            "-t",
            "27",  # Set detection threshold (adjust as needed)
            "list-scenes",
            "split-video",
        ],
        capture_output=True,
        text=True
    )

    print(f"Scenes detected and clips split for {video_file}")


Scenes detected and clips split for 2Pac - Brenda's Got A Baby [NRWUs0KtB-I].mp4
Scenes detected and clips split for 2Pac - I Wonder If Heaven Got A Ghetto [_fCoK2OVJAc].mp4
Scenes detected and clips split for 2Pac - If My Homie Calls [HWMsWO0vPa8].mp4
Scenes detected and clips split for 4 Non Blondes - What's Up (Official Music Video) [6NXnxTNIWkc].mp4
Scenes detected and clips split for 4K MYSTIEYED-90443453.mp4
Scenes detected and clips split for 4K _ Hooked Blackmagic Harbour-99668293.mp4
Scenes detected and clips split for Aerosmith - Amazing (Official Music Video) [zSmOvYzSeaQ].mp4
Scenes detected and clips split for Aerosmith - Crazy (Official Music Video) [NMNgbISmF4I].mp4
Scenes detected and clips split for Aerosmith - Cryin' (Official Music Video) [qfNmyxV2Ncw].mp4
Scenes detected and clips split for Aerosmith - Livin' On The Edge (Official Music Video) [7nqcL0mjMjw].mp4
Scenes detected and clips split for Aerosmith - The Other Side [zkGfPrst29Y].mp4
Scenes detected and clips

Exception in thread Thread-1900 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\cerspense\.conda\envs\t2vdataprep\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 4016: invalid continuation byte


Scenes detected and clips split for Cacique - Mojito-153015875.mp4
Scenes detected and clips split for Canon 7D raw test 1-93346398.mp4
Scenes detected and clips split for CONCORDE - Sons [ZeBrnuQxEsQ].mp4
Scenes detected and clips split for Dance Gavin Dance - Son Of Robot (Official Music Video) [H-IoyUQ5Dp0].mp4
Scenes detected and clips split for Dance Gavin Dance - Young Robot [Vl2Rmw5hsao].mp4
Scenes detected and clips split for David Gilmour - Rattle That Lock (Official Music Video) [L1v7hXEQhsQ].mp4
Scenes detected and clips split for Deep Blue Something - Breakfast At Tiffany's (Official Music Video) [1ClCpfeIELw].mp4
Scenes detected and clips split for Def Leppard - Have You Ever Needed Someone So Bad？ [BJOC0zIjln8].mp4
Scenes detected and clips split for Divinyls - I Touch Myself (Official Music Video) [wv-34w8kGPM].mp4
Scenes detected and clips split for DMX - How's It Goin' Down [4AognXgM9FQ].mp4
Scenes detected and clips split for DMX - Ruff Ryders' Anthem [ThlhSnRk21E].mp

Filter and Crop to Size

In [24]:
# Filter and crop Splitted 

import os
import subprocess
import re
from concurrent.futures import ThreadPoolExecutor
from moviepy.editor import VideoFileClip

# Input
clean_dir = f"{data_dir}/splitted"

# Output
finished_dir = f"{data_dir}/finished"

input_dir = clean_dir
output_dir = finished_dir

# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Step 1: Setup environment
min_res_width = 640  # minimum resolution width
min_res_height = 360  # minimum resolution height
min_duration = 1  # minimum duration in seconds
max_duration = 2  # maximum duration in seconds
out_res_width = 320  # output resolution width
out_res_height = 320  # output resolution height
num_threads = 8  # number of threads to use for processing

# Step 2: Iterate through all files
file_list = []

# Step 2: Iterate through all files
file_list = []

# Instead of using os.listdir(), use os.walk() to also search in subdirectories
for root, dirs, files in os.walk(input_dir):
    for file_name in files:
        if file_name.endswith((".mp4", ".mov", ".webm", ".mkv", ".avi")):
            file_path = os.path.join(root, file_name)

            cmd_width = f'ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
            cmd_height = f'ffprobe -v error -select_streams v:0 -show_entries stream=height -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
            
            try:
                width_output = subprocess.check_output(cmd_width, shell=True).decode("utf-8").strip()
                height_output = subprocess.check_output(cmd_height, shell=True).decode("utf-8").strip()

                # If width or height could not be determined, skip this file
                if width_output == '' or height_output == '':
                    print(f"Width or height could not be determined for file {file_name}. Skipping...")
                    continue

                width = int(float(width_output))
                height = int(float(height_output))

                            # Try getting the duration with ffprobe first
                try:
                    cmd_duration = f'ffprobe -v error -select_streams v:0 -show_entries stream=duration -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
                    duration = int(float(subprocess.check_output(cmd_duration, shell=True).decode("utf-8").strip()))
                except:
                    # If ffprobe fails, use moviepy
                    clip = VideoFileClip(file_path)
                    duration = int(clip.duration)
                    clip.close()  # Ensure to close the clip to free up system resources
                
                if width >= min_res_width and height >= min_res_height and duration >= min_duration:
                    file_list.append(file_name)

            except subprocess.CalledProcessError as e:
                print(f"Error processing file {file_name}: {e}")

            try:
                if width >= min_res_width and height >= min_res_height and min_duration <= duration <= max_duration:
                    file_list.append(os.path.join(dirpath, file_name))
            except ValueError as e:
                print(f"Error: Could not retrieve width, height, or duration for file {file_name}. Skipping...")


# Step 3: Write to .txt file
with open(os.path.join(output_dir, "video_list.txt"), "w") as f:
    for file_name in file_list:
        f.write("%s\n" % file_name)

# Step 4 & 5: Scale, crop videos and save (multithreaded)
def process_video(file_name):
    clean_name = re.sub(r"\W+", "", file_name)
    input_path = os.path.join(input_dir, file_name)
    output_path = os.path.join(output_dir, clean_name)
    output_path = os.path.splitext(output_path)[0] + ".mp4"
    cmd = f'ffmpeg -i "{input_path}" -vf "scale={out_res_width}:{out_res_height}:force_original_aspect_ratio=increase,crop={out_res_width}:{out_res_height},setsar=1" -b:v 1M "{output_path}"'
    try:
        subprocess.run(cmd, shell=True, check=True)
        print(f"Processing complete for file: {file_name}")
    except subprocess.CalledProcessError as e:
        print(f"Error processing file {file_name}: {e}")

# Use ThreadPoolExecutor to process videos in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    executor.map(process_video, file_list)


Width or height could not be determined for file 'Machine'  -30-100912126-Scene-030.mp4. Skipping...
Error processing file 'minibeer licor43'-114865390-Scene-001.mp4: Command 'ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=noprint_wrappers=1:nokey=1 "G:/ModelscopeTraining/Datasets/subset_v20230528_1848/splitted\'minibeer licor43'-114865390\'minibeer licor43'-114865390-Scene-001.mp4"' returned non-zero exit status 1.
Error processing file 00 Publix KK 30-122682021-Scene-001.mp4: Command 'ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=noprint_wrappers=1:nokey=1 "G:/ModelscopeTraining/Datasets/subset_v20230528_1848/splitted\00 Publix KK 30-122682021\00 Publix KK 30-122682021-Scene-001.mp4"' returned non-zero exit status 1.
Error processing file 08 PYM 30 Taggable SY-132834069-Scene-001.mp4: Command 'ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=noprint_wrappers=1:nokey=1 "G:/ModelscopeTraining/Datasets

In [None]:
# Filter and crop Splitted old

import os
import subprocess
import re
from concurrent.futures import ThreadPoolExecutor
from moviepy.editor import VideoFileClip

# Input
clean_dir = f"{data_dir}/splitted"

# Output
finished_dir = f"{data_dir}/finished"

input_dir = clean_dir
output_dir = finished_dir

# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Step 1: Setup environment
min_res_width = 640  # minimum resolution width
min_res_height = 360  # minimum resolution height
min_duration = 60  # minimum duration in seconds
out_res_width = 320  # output resolution width
out_res_height = 320  # output resolution height
num_threads = 8  # number of threads to use for processing

# Step 2: Iterate through all files
file_list = []

for file_name in os.listdir(input_dir):
    if file_name.endswith((".mp4", ".mov", ".webm", ".mkv", ".avi")):
        file_path = os.path.join(input_dir, file_name)

        cmd_width = f'ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
        cmd_height = f'ffprobe -v error -select_streams v:0 -show_entries stream=height -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
        
        try:
            width = int(float(subprocess.check_output(cmd_width, shell=True).decode("utf-8").strip()))
            height = int(float(subprocess.check_output(cmd_height, shell=True).decode("utf-8").strip()))
            
            # Try getting the duration with ffprobe first
            try:
                cmd_duration = f'ffprobe -v error -select_streams v:0 -show_entries stream=duration -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
                duration = int(float(subprocess.check_output(cmd_duration, shell=True).decode("utf-8").strip()))
            except:
                # If ffprobe fails, use moviepy
                clip = VideoFileClip(file_path)
                duration = int(clip.duration)
                clip.close()  # Ensure to close the clip to free up system resources
            
            if width >= min_res_width and height >= min_res_height and duration >= min_duration:
                file_list.append(file_name)
        except ValueError as e:
            print(f"Error: Could not retrieve width, height, or duration for file {file_name}. Skipping...")
        except subprocess.CalledProcessError as e:
            print(f"Error processing file {file_name}: {e}")


# Step 3: Write to .txt file
with open(os.path.join(output_dir, "video_list.txt"), "w") as f:
    for file_name in file_list:
        f.write("%s\n" % file_name)

# Step 4 & 5: Scale, crop videos and save (multithreaded)
def process_video(file_name):
    clean_name = re.sub(r"\W+", "", file_name)
    input_path = os.path.join(input_dir, file_name)
    output_path = os.path.join(output_dir, clean_name)
    output_path = os.path.splitext(output_path)[0] + ".mp4"
    cmd = f'ffmpeg -i "{input_path}" -vf "scale={out_res_width}:{out_res_height}:force_original_aspect_ratio=increase,crop={out_res_width}:{out_res_height},setsar=1" -b:v 1M "{output_path}"'
    try:
        subprocess.run(cmd, shell=True, check=True)
        print(f"Processing complete for file: {file_name}")
    except subprocess.CalledProcessError as e:
        print(f"Error processing file {file_name}: {e}")

# Use ThreadPoolExecutor to process videos in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    executor.map(process_video, file_list)


Processing complete for file: 2015 Kid Fest Recap-151918644.mp4
Processing complete for file: 16bit - Dinosaurs (Official Video) [Kevdt1T9daA].webm
Processing complete for file: 'Left, Right, Left, Right'-79727688.mp4
Processing complete for file: 20syl - Back & Forth (Official Music Video) [NtjnO-ge6s0].webm
Processing complete for file: 2 Days in Doha-114674220.mov
Processing complete for file: 2Pac - Brenda's Got A Baby [NRWUs0KtB-I].webm
Processing complete for file: 2Pac - Changes ft. Talent [eXvBjCO19QY].webmProcessing complete for file: 2Pac - If My Homie Calls [HWMsWO0vPa8].webm
Processing complete for file: 112 - Anywhere (Official Music Video) [fLghl1M-a1I].webm
Processing complete for file: 30x30 (Aktion)-84528250.mp4
Processing complete for file: 2Pac, The Outlawz - Baby Don't Cry (Keep Ya Head Up II) [ZgQZENuDrBY].webm
Processing complete for file: 2Pac - I Wonder If Heaven Got A Ghetto [_fCoK2OVJAc].webm

Processing complete for file: 4K MYSTIEYED-90443453.mp4
Processing 

In [None]:
# Filter and crop splitNCropped

import os
import subprocess
import re
from concurrent.futures import ThreadPoolExecutor
from moviepy.editor import VideoFileClip

# Input
clean_dir = f"{data_dir}/splitNcropped"

# Output
finished_dir = f"{data_dir}/finished"

input_dir = clean_dir
output_dir = finished_dir

# Make sure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Step 1: Setup environment
min_res_width = 640  # minimum resolution width
min_res_height = 360  # minimum resolution height
min_duration = 60  # minimum duration in seconds
out_res_width = 576  # output resolution width
out_res_height = 320  # output resolution height
num_threads = 8  # number of threads to use for processing

# Step 2: Iterate through all files
file_list = []

for file_name in os.listdir(input_dir):
    if file_name.endswith((".mp4", ".mov", ".webm", ".mkv", ".avi")):
        file_path = os.path.join(input_dir, file_name)

        cmd_width = f'ffprobe -v error -select_streams v:0 -show_entries stream=width -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
        cmd_height = f'ffprobe -v error -select_streams v:0 -show_entries stream=height -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
        
        try:
            width = int(float(subprocess.check_output(cmd_width, shell=True).decode("utf-8").strip()))
            height = int(float(subprocess.check_output(cmd_height, shell=True).decode("utf-8").strip()))
            
            # Try getting the duration with ffprobe first
            try:
                cmd_duration = f'ffprobe -v error -select_streams v:0 -show_entries stream=duration -of default=noprint_wrappers=1:nokey=1 "{file_path}"'
                duration = int(float(subprocess.check_output(cmd_duration, shell=True).decode("utf-8").strip()))
            except:
                # If ffprobe fails, use moviepy
                clip = VideoFileClip(file_path)
                duration = int(clip.duration)
                clip.close()  # Ensure to close the clip to free up system resources
            
            if width >= min_res_width and height >= min_res_height and duration >= min_duration:
                file_list.append(file_name)
        except ValueError as e:
            print(f"Error: Could not retrieve width, height, or duration for file {file_name}. Skipping...")
        except subprocess.CalledProcessError as e:
            print(f"Error processing file {file_name}: {e}")


# Step 3: Write to .txt file
with open(os.path.join(output_dir, "video_list.txt"), "w") as f:
    for file_name in file_list:
        f.write("%s\n" % file_name)

# Step 4 & 5: Scale, crop videos and save (multithreaded)
def process_video(file_name):
    clean_name = re.sub(r"\W+", "", file_name)
    input_path = os.path.join(input_dir, file_name)
    output_path = os.path.join(output_dir, clean_name)
    output_path = os.path.splitext(output_path)[0] + ".mp4"
    cmd = f'ffmpeg -i "{input_path}" -vf "scale={out_res_width}:{out_res_height}:force_original_aspect_ratio=increase,crop={out_res_width}:{out_res_height},setsar=1" -b:v 1M "{output_path}"'
    try:
        subprocess.run(cmd, shell=True, check=True)
        print(f"Processing complete for file: {file_name}")
    except subprocess.CalledProcessError as e:
        print(f"Error processing file {file_name}: {e}")

# Use ThreadPoolExecutor to process videos in parallel
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    executor.map(process_video, file_list)
