In [1]:
import os
import shutil
import random
import cv2
# Dataset locations (raw strings, so Windows backslashes need no escaping)
SOURCE_DIR = r"F:\vimeo_septuplet_full\sequences"  # root of the original septuplet folders; edit if needed
TARGET_DIR = r"F:\vimeo_septuplet_full\Arranged_50"  # where the rearranged dataset is written; edit to relocate


In [2]:
#!pip install --upgrade numpy opencv-python


In [16]:
!pip install --upgrade opencv-python-headless




In [3]:

# Create the lr/hr output folders of both splits (nothing happens if they already exist)
for split_name in ("train", "val"):
    for resolution in ("lr", "hr"):
        os.makedirs(os.path.join(TARGET_DIR, f"{split_name}/{resolution}"), exist_ok=True)



In [None]:

# Collect all sequence paths: SOURCE_DIR/<folder>/<clip>/ containing im1.png ... im7.png
frame_files = [f"im{i}.png" for i in range(1, 8)]  # identical for every clip, so build it once
all_sequences = []
for seq_folder in sorted(os.listdir(SOURCE_DIR)):  # e.g., 00001, 00002, ...
    seq_path = os.path.join(SOURCE_DIR, seq_folder)
    if not os.path.isdir(seq_path):
        continue
    for sub_folder in sorted(os.listdir(seq_path)):  # e.g., 0001, 0002, ...
        full_path = os.path.join(seq_path, sub_folder)
        # Keep only clip directories in which all seven frames are present
        if os.path.isdir(full_path) and all(
            os.path.exists(os.path.join(full_path, f)) for f in frame_files
        ):
            all_sequences.append(full_path)

# Shuffle and split dataset (90% train, 10% val).
# The listing above is sorted and the shuffle uses its own seeded generator, so
# re-running the cell on the same data reproduces exactly the same partition
# instead of scattering sequences over different split folders on every run.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(all_sequences)
split_index = int(0.9 * len(all_sequences))
train_sequences = all_sequences[:split_index]
val_sequences = all_sequences[split_index:]


In [None]:

# Function to process dataset
def process_sequences(sequences, split_type):
    """Write HR frames and size-matched degraded LR frames for one split.

    For every sequence directory the frames im1.png ... im7.png are read and
    saved as frame_00001.png ... frame_00007.png under
    TARGET_DIR/<split_type>/hr/<seq_name>/ (unchanged) and
    TARGET_DIR/<split_type>/lr/<seq_name>/ (4x bicubic downscale followed by a
    bicubic upscale back to the original size, so LR and HR share dimensions).

    Args:
        sequences: iterable of sequence directory paths.
        split_type: name of the split sub-folder under TARGET_DIR, e.g. "train".
    """
    lr_folder = os.path.join(TARGET_DIR, f"{split_type}/lr")
    hr_folder = os.path.join(TARGET_DIR, f"{split_type}/hr")

    for seq_path in sequences:
        seq_name = "_".join(seq_path.split(os.sep)[-2:])  # Format: 00001_0001
        os.makedirs(os.path.join(lr_folder, seq_name), exist_ok=True)
        os.makedirs(os.path.join(hr_folder, seq_name), exist_ok=True)

        for i in range(1, 8):  # Frames im1.png to im7.png
            img_path = os.path.join(seq_path, f"im{i}.png")

            # Load HR image. cv2.imread returns None (it does not raise) when the
            # file is missing or cannot be decoded, so guard before touching .shape.
            hr_img = cv2.imread(img_path)
            if hr_img is None:
                print(f"Skipping corrupted/missing image: {img_path}")
                continue

            h, w = hr_img.shape[:2]

            # Generate LR image (downscale + upscale for alignment)
            lr_img = cv2.resize(hr_img, (w // 4, h // 4), interpolation=cv2.INTER_CUBIC)
            lr_img = cv2.resize(lr_img, (w, h), interpolation=cv2.INTER_CUBIC)

            # Save images under the same file name in the hr and lr trees
            frame_filename = f"frame_{i:05d}.png"

            cv2.imwrite(os.path.join(hr_folder, seq_name, frame_filename), hr_img)
            cv2.imwrite(os.path.join(lr_folder, seq_name, frame_filename), lr_img)

# Run the export for each split in turn, announcing it first
for banner, split_sequences, split_name in (
    ("Processing training set...", train_sequences, "train"),
    ("Processing validation set...", val_sequences, "val"),
):
    print(banner)
    process_sequences(split_sequences, split_name)

print("Dataset preprocessing complete! 🎉")

In [2]:
import os
import shutil
import random
import cv2

# Define paths
SOURCE_DIR = "F:\\vimeo_septuplet_full\\sequences"  # Original dataset
TARGET_DIR = "F:\\vimeo_septuplet_full\\Vime90_arranged"   # New structured dataset

# Number of folders to process at a time (Set to 10 for testing, later increase)
num_folders_to_process = 10

# Ensure directories exist
for split in ["train", "val", "test"]:
    os.makedirs(os.path.join(TARGET_DIR, f"{split}/lr"), exist_ok=True)
    os.makedirs(os.path.join(TARGET_DIR, f"{split}/hr"), exist_ok=True)

# Collect only a subset of folders for processing (first N in sorted order).
# Non-directories are dropped *before* slicing so that a stray file in
# SOURCE_DIR cannot use up one of the N slots.
all_folders = [
    name
    for name in sorted(os.listdir(SOURCE_DIR))
    if os.path.isdir(os.path.join(SOURCE_DIR, name))
][:num_folders_to_process]

# Collect all valid sequences: sub-folders that contain im1.png ... im7.png
frame_files = [f"im{i}.png" for i in range(1, 8)]  # identical for every clip, so build it once
all_sequences = []
for seq_folder in all_folders:  # Only process limited folders
    seq_path = os.path.join(SOURCE_DIR, seq_folder)
    for sub_folder in sorted(os.listdir(seq_path)):
        full_path = os.path.join(seq_path, sub_folder)
        if os.path.isdir(full_path) and all(
            os.path.exists(os.path.join(full_path, f)) for f in frame_files
        ):
            all_sequences.append(full_path)


In [3]:

# Shuffle and split dataset (80% train, 10% val, 10% test).
# Sort first, then shuffle with a private seeded generator: the partition is the
# same on every run, even when this cell is re-executed on a list that an
# earlier run already shuffled in place.
SPLIT_SEED = 42
all_sequences.sort()
random.Random(SPLIT_SEED).shuffle(all_sequences)
train_end = int(0.8 * len(all_sequences))
val_end = int(0.9 * len(all_sequences))

train_sequences = all_sequences[:train_end]
val_sequences = all_sequences[train_end:val_end]
test_sequences = all_sequences[val_end:]

# Function to process sequences
def process_sequences(sequences, split_type):
    """Export every sequence of one split as paired HR / LR frame folders.

    Each readable frame im1.png ... im7.png is written twice below
    TARGET_DIR/<split_type>/: unchanged into hr/<seq_name>/ and, after a 4x
    bicubic downscale followed by a bicubic upscale back to the source size,
    into lr/<seq_name>/. Frames cv2.imread cannot load are reported and skipped.
    """
    lr_folder = os.path.join(TARGET_DIR, f"{split_type}/lr")
    hr_folder = os.path.join(TARGET_DIR, f"{split_type}/hr")

    for seq_path in sequences:
        # Name built from the last two path components, e.g. 00001_0001
        path_parts = seq_path.split(os.sep)
        seq_name = "_".join(path_parts[-2:])
        seq_lr_dir = os.path.join(lr_folder, seq_name)
        seq_hr_dir = os.path.join(hr_folder, seq_name)
        os.makedirs(seq_lr_dir, exist_ok=True)
        os.makedirs(seq_hr_dir, exist_ok=True)

        for frame_no in range(1, 8):
            src_path = os.path.join(seq_path, f"im{frame_no}.png")

            frame = cv2.imread(src_path)
            if frame is None:
                print(f"Skipping corrupted/missing image: {src_path}")
                continue

            height, width, _ = frame.shape

            # Shrink by 4, then enlarge again so the LR frame keeps the HR dimensions
            shrunk = cv2.resize(frame, (width // 4, height // 4), interpolation=cv2.INTER_CUBIC)
            degraded = cv2.resize(shrunk, (width, height), interpolation=cv2.INTER_CUBIC)

            # Both trees use the same output file name
            out_name = f"frame_{frame_no:05d}.png"
            cv2.imwrite(os.path.join(seq_hr_dir, out_name), frame)
            cv2.imwrite(os.path.join(seq_lr_dir, out_name), degraded)


In [4]:

# Run the export for each split in turn, announcing it first
for banner, split_sequences, split_name in (
    ("Processing training set...", train_sequences, "train"),
    ("Processing validation set...", val_sequences, "val"),
    ("Processing test set...", test_sequences, "test"),
):
    print(banner)
    process_sequences(split_sequences, split_name)

print("Dataset preprocessing complete! 🎉")


Processing training set...
Processing validation set...
Processing test set...
Dataset preprocessing complete! 🎉


In [5]:
import os
import shutil
import random
import cv2

# Define paths
SOURCE_DIR = "F:\\vimeo_septuplet_full\\sequences"  # Original dataset
TARGET_DIR = "F:\\vimeo_septuplet_full\\Arranged"   # New structured dataset

# Number of folders to process at a time (Set small first, then increase)
num_folders_to_process = 10

# Seed for the shuffle below: with a fixed value, re-running on the same input
# reproduces the same train/val/test partition.
SPLIT_SEED = 42

# Ensure directories exist
for split in ["train", "val", "test"]:
    os.makedirs(os.path.join(TARGET_DIR, f"{split}/lr"), exist_ok=True)
    os.makedirs(os.path.join(TARGET_DIR, f"{split}/hr"), exist_ok=True)

# Collect only a subset of folders for processing (first N in sorted order).
# Non-directories are dropped *before* slicing so that a stray file in
# SOURCE_DIR cannot use up one of the N slots.
all_folders = [
    name
    for name in sorted(os.listdir(SOURCE_DIR))
    if os.path.isdir(os.path.join(SOURCE_DIR, name))
][:num_folders_to_process]

# Collect all valid sequences: sub-folders that contain im1.png ... im7.png
frame_files = [f"im{i}.png" for i in range(1, 8)]  # identical for every clip, so build it once
all_sequences = []
for seq_folder in all_folders:
    seq_path = os.path.join(SOURCE_DIR, seq_folder)
    for sub_folder in sorted(os.listdir(seq_path)):
        full_path = os.path.join(seq_path, sub_folder)
        if os.path.isdir(full_path) and all(
            os.path.exists(os.path.join(full_path, f)) for f in frame_files
        ):
            all_sequences.append(full_path)

# Shuffle and split dataset (80% train, 10% val, 10% test).
# A private seeded generator is used instead of the global one, so the result
# does not depend on whatever consumed random numbers earlier in the session.
random.Random(SPLIT_SEED).shuffle(all_sequences)
train_end = int(0.8 * len(all_sequences))
val_end = int(0.9 * len(all_sequences))

train_sequences = all_sequences[:train_end]
val_sequences = all_sequences[train_end:val_end]
test_sequences = all_sequences[val_end:]

# Function to process sequences
def process_sequences(sequences, split_type):
    """Save each frame as HR and a 4x bicubic-downscaled copy as LR for one split.

    Output goes to TARGET_DIR/<split_type>/hr/<seq_name>/ and
    TARGET_DIR/<split_type>/lr/<seq_name>/ as frame_00001.png ... frame_00007.png.
    Frames cv2.imread cannot load are reported and skipped.
    """
    lr_folder = os.path.join(TARGET_DIR, f"{split_type}/lr")
    hr_folder = os.path.join(TARGET_DIR, f"{split_type}/hr")

    for seq_path in sequences:
        # Name built from the last two path components, e.g. 00001_0001
        path_parts = seq_path.split(os.sep)
        seq_name = "_".join(path_parts[-2:])
        seq_lr_dir = os.path.join(lr_folder, seq_name)
        seq_hr_dir = os.path.join(hr_folder, seq_name)
        os.makedirs(seq_lr_dir, exist_ok=True)
        os.makedirs(seq_hr_dir, exist_ok=True)

        for frame_no in range(1, 8):
            src_path = os.path.join(seq_path, f"im{frame_no}.png")

            # The decoded source frame is what gets stored as HR
            frame = cv2.imread(src_path)
            if frame is None:
                print(f"Skipping corrupted/missing image: {src_path}")
                continue

            height, width, _ = frame.shape

            # LR is a pure downscale: a quarter of the width and height
            lr_size = (width // 4, height // 4)
            small = cv2.resize(frame, lr_size, interpolation=cv2.INTER_CUBIC)

            # Same file name in both trees; HR is written first, then LR
            out_name = f"frame_{frame_no:05d}.png"
            cv2.imwrite(os.path.join(seq_hr_dir, out_name), frame)
            cv2.imwrite(os.path.join(seq_lr_dir, out_name), small)

# Run the export for each split in turn, announcing it first
for banner, split_sequences, split_name in (
    ("Processing training set...", train_sequences, "train"),
    ("Processing validation set...", val_sequences, "val"),
    ("Processing test set...", test_sequences, "test"),
):
    print(banner)
    process_sequences(split_sequences, split_name)

print("Dataset preprocessing complete! 🎉")


Processing training set...


KeyboardInterrupt: 

In [6]:
import os
import shutil
import random
import cv2

# Define paths
SOURCE_DIR = "F:\\vimeo_septuplet_full\\sequences"  # Original dataset
TARGET_DIR = "F:\\vimeo_septuplet_full\\vime90k_10folders"   # Output dataset (only LR)

# Number of folders to process at a time (set to small first, then increase)
num_folders_to_process = 10  # Change this to process more later

# Seed for the shuffle below: with a fixed value, re-running on the same input
# reproduces the same train/val/test partition.
SPLIT_SEED = 42

# Ensure output directories exist (this variant only writes an lr tree)
for split in ["train", "val", "test"]:
    os.makedirs(os.path.join(TARGET_DIR, f"{split}/lr"), exist_ok=True)

# Collect only a subset of folders for processing (first N in sorted order).
# Non-directories are dropped *before* slicing so that a stray file in
# SOURCE_DIR cannot use up one of the N slots.
all_folders = [
    name
    for name in sorted(os.listdir(SOURCE_DIR))
    if os.path.isdir(os.path.join(SOURCE_DIR, name))
][:num_folders_to_process]

# Collect all valid sequences: sub-folders that contain im1.png ... im7.png
frame_files = [f"im{i}.png" for i in range(1, 8)]  # identical for every clip, so build it once
all_sequences = []
for seq_folder in all_folders:
    seq_path = os.path.join(SOURCE_DIR, seq_folder)
    for sub_folder in sorted(os.listdir(seq_path)):
        full_path = os.path.join(seq_path, sub_folder)
        if os.path.isdir(full_path) and all(
            os.path.exists(os.path.join(full_path, f)) for f in frame_files
        ):
            all_sequences.append(full_path)

# Shuffle and split dataset (80% train, 10% val, 10% test).
# A private seeded generator is used instead of the global one, so the result
# does not depend on whatever consumed random numbers earlier in the session.
random.Random(SPLIT_SEED).shuffle(all_sequences)
train_end = int(0.8 * len(all_sequences))
val_end = int(0.9 * len(all_sequences))

train_sequences = all_sequences[:train_end]
val_sequences = all_sequences[train_end:val_end]
test_sequences = all_sequences[val_end:]

# Function to process sequences
def process_sequences(sequences, split_type):
    """Re-save the frames of each sequence under TARGET_DIR/<split_type>/lr/<seq_name>/.

    Every readable frame im1.png ... im7.png is decoded with cv2.imread and
    written back, without any resizing, as frame_00001.png ... frame_00007.png.
    No HR tree is produced. Unreadable frames are reported and skipped.
    """
    lr_folder = os.path.join(TARGET_DIR, f"{split_type}/lr")

    for seq_path in sequences:
        # Name built from the last two path components, e.g. 00001_0001
        path_parts = seq_path.split(os.sep)
        seq_name = "_".join(path_parts[-2:])
        seq_out_dir = os.path.join(lr_folder, seq_name)
        os.makedirs(seq_out_dir, exist_ok=True)

        for frame_no in range(1, 8):
            src_path = os.path.join(seq_path, f"im{frame_no}.png")

            frame = cv2.imread(src_path)
            if frame is None:
                print(f"Skipping corrupted/missing image: {src_path}")
                continue

            # The frame goes into the lr tree exactly as decoded (no downscaling here)
            out_name = f"frame_{frame_no:05d}.png"
            cv2.imwrite(os.path.join(seq_out_dir, out_name), frame)

# Run the export for each split in turn, announcing it first
for banner, split_sequences, split_name in (
    ("Processing training set...", train_sequences, "train"),
    ("Processing validation set...", val_sequences, "val"),
    ("Processing test set...", test_sequences, "test"),
):
    print(banner)
    process_sequences(split_sequences, split_name)

print("Dataset preprocessing complete! 🎉")


Processing training set...


KeyboardInterrupt: 

In [None]:
import os
import shutil
import random
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm

# Define paths
# Raw strings take single backslashes: doubling them inside r"..." would put two
# literal backslashes between the path components.
source_dir = r"F:\vimeo_septuplet_full\sequences"
output_dir = r"F:\vimeo_mobilenetv3_dataset"

# Process only a limited number of folders
max_folders_to_process = 10  # Adjust this number as needed

# Define split ratios (fractions of the selected top-level folders)
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15  # NOTE(review): not read by the code visible here; test gets whatever train and val leave over

# Downsampling factor for creating LR images
scale_factor = 4  # Common scale factor for super-resolution (4x)

# Create output directories
def create_directory_structure():
    """Create the flat output folders <split>-<res> (train/val/test x lr/hr) under output_dir."""
    folder_names = [
        f"{split}-{res}"
        for split in ('train', 'val', 'test')
        for res in ('lr', 'hr')
    ]
    for folder_name in folder_names:
        # exist_ok keeps repeated runs from failing on folders that are already there
        os.makedirs(os.path.join(output_dir, folder_name), exist_ok=True)

    print("Directory structure created successfully.")

# Function to downsample an image
def downsample_image(image_path, scale):
    """Read an image and return a bicubically downscaled copy.

    Args:
        image_path: path of the image file, read with cv2.imread.
        scale: integer factor; the result is (h // scale, w // scale) pixels,
            clamped to at least 1 pixel per side.

    Returns:
        The downscaled image array produced by cv2.resize.

    Raises:
        ValueError: if the image cannot be read (cv2.imread returned None).
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing/undecodable file by returning None; fail
        # with a message naming the file instead of an AttributeError on .shape
        raise ValueError(f"Could not read image: {image_path}")
    h, w = img.shape[:2]

    # Calculate new dimensions (never 0, which cv2.resize would reject)
    new_h, new_w = max(1, h // scale), max(1, w // scale)

    # Resize down
    lr_img = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    # For training purposes, we sometimes resize back to original size to match dimensions
    # Uncomment below if you want LR images to be the same size as HR but lower quality
    # lr_img = cv2.resize(lr_img, (w, h), interpolation=cv2.INTER_CUBIC)

    return lr_img

# Function to process and split the dataset
def split_dataset(seed=42):
    """Pick a subset of top-level folders, split it into train/val/test and process each split.

    The directories in source_dir are listed in sorted order, shuffled, and cut
    down to max_folders_to_process. The first train_ratio share goes to train,
    the next val_ratio share to val and the remainder to test; each group is
    handed to process_split.

    Args:
        seed: seed for the shuffle. With the default the selected subset and the
            split are the same on every run; pass None for a different random
            draw each time.
    """
    # Sorted listing + seeded generator = reproducible selection and split
    sequence_folders = sorted(
        folder
        for folder in os.listdir(source_dir)
        if os.path.isdir(os.path.join(source_dir, folder))
    )

    # Shuffle and limit to max_folders_to_process
    random.Random(seed).shuffle(sequence_folders)
    sequence_folders = sequence_folders[:max_folders_to_process]

    print(f"Processing {len(sequence_folders)} folders out of the full dataset")

    # Calculate split indices (int() truncates, so small subsets can leave val empty)
    total_folders = len(sequence_folders)
    train_count = int(total_folders * train_ratio)
    val_count = int(total_folders * val_ratio)

    train_folders = sequence_folders[:train_count]
    val_folders = sequence_folders[train_count:train_count+val_count]
    test_folders = sequence_folders[train_count+val_count:]

    print(f"Dataset will be split into: {len(train_folders)} train, {len(val_folders)} val, {len(test_folders)} test folders")

    # Process each split
    process_split(train_folders, 'train')
    process_split(val_folders, 'val')
    process_split(test_folders, 'test')

# Process a specific split (train/val/test)
def process_split(folders, split_name, max_sub_per_folder=5):
    """Write the HR copies and downsampled LR versions of one split's images.

    For each folder (relative to source_dir) the first max_sub_per_folder
    sub-folders in sorted order are used. Sub-folders with fewer than three
    images are skipped. Every image is copied to output_dir/<split_name>-hr and
    its downsampled version is written to output_dir/<split_name>-lr, both as
    <folder>_<sub_folder>_<image name>.

    Args:
        folders: names of the top-level folders belonging to this split.
        split_name: 'train', 'val' or 'test'; selects the output folders.
        max_sub_per_folder: how many sub-folders to use per folder; None means
            no limit.
    """
    print(f"Processing {split_name} split...")

    # Track sequence count per split
    sequence_count = 0
    image_count = 0
    image_exts = ('.png', '.jpg', '.jpeg')

    # Use tqdm for progress tracking
    for folder in tqdm(folders, desc=f"{split_name.capitalize()} Split"):
        folder_path = os.path.join(source_dir, folder)

        # Sorted before limiting, so the same sub-folders are chosen on every
        # run instead of whatever order os.listdir happens to return
        sub_folders = sorted(
            f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))
        )[:max_sub_per_folder]

        for sub_folder in sub_folders:
            sub_folder_path = os.path.join(folder_path, sub_folder)
            images = sorted(f for f in os.listdir(sub_folder_path) if f.endswith(image_exts))

            # For video super resolution, we typically use sequences
            if len(images) < 3:  # Skip if not enough images for a sequence
                continue

            sequence_count += 1

            # Create unique identifier for this sequence
            sequence_id = f"{folder}_{sub_folder}"

            # Process each image in the sequence
            for img_name in images:
                img_path = os.path.join(sub_folder_path, img_name)
                out_name = f"{sequence_id}_{img_name}"

                # Build the LR image first: if the source cannot be decoded this
                # fails before anything is written, so no HR file is left
                # behind without its LR partner
                lr_img = downsample_image(img_path, scale_factor)

                # Copy original image to HR directory
                hr_dest_path = os.path.join(output_dir, f"{split_name}-hr", out_name)
                shutil.copy(img_path, hr_dest_path)

                # Save downsampled version to LR directory
                lr_dest_path = os.path.join(output_dir, f"{split_name}-lr", out_name)
                cv2.imwrite(lr_dest_path, lr_img)

                image_count += 1

    print(f"Processed {sequence_count} sequences with {image_count} total images for {split_name} split")

# Calculate dataset statistics
def calculate_stats():
    """Print per-split HR/LR image counts and the shapes of one matching HR/LR pair.

    Counts the .png/.jpg/.jpeg files in output_dir/<split>-hr and
    output_dir/<split>-lr for train, val and test. If the train split has HR
    images, the first one (sorted by name) and the LR file with the same name
    are loaded and their shapes printed.
    """
    print("\nCalculating dataset statistics...")

    image_exts = ('.png', '.jpg', '.jpeg')
    stats = {}
    train_hr_files = []
    for split in ['train', 'val', 'test']:
        hr_dir = os.path.join(output_dir, f"{split}-hr")
        lr_dir = os.path.join(output_dir, f"{split}-lr")

        hr_files = sorted(f for f in os.listdir(hr_dir) if f.endswith(image_exts))
        lr_files = sorted(f for f in os.listdir(lr_dir) if f.endswith(image_exts))
        if split == 'train':
            train_hr_files = hr_files

        hr_count = len(hr_files)
        lr_count = len(lr_files)
        stats[split] = {'hr': hr_count, 'lr': lr_count}

        print(f"{split.capitalize()} set: {hr_count} HR images, {lr_count} LR images")

    # Verify a sample image to confirm downsampling
    if stats['train']['hr'] > 0:
        # Use the same file name on both sides so the two shapes belong to the
        # same frame (two independent os.listdir()[0] calls need not agree)
        sample_name = train_hr_files[0]
        sample_hr = os.path.join(output_dir, "train-hr", sample_name)
        sample_lr = os.path.join(output_dir, "train-lr", sample_name)

        hr_img = cv2.imread(sample_hr)
        lr_img = cv2.imread(sample_lr)

        # cv2.imread returns None for a missing/unreadable file
        if hr_img is None or lr_img is None:
            print(f"\nCould not read sample pair: {sample_name}")
            return

        print(f"\nSample HR image shape: {hr_img.shape}")
        print(f"Sample LR image shape: {lr_img.shape}")
        print(f"Resolution reduction: {scale_factor}x")

# Main execution
def main():
    """Print the run configuration, then create folders, split/process the data and report statistics."""
    banner_lines = (
        "Starting Vimeo-90K SUBSET preparation for MobileNetV3 video super resolution",
        f"Source directory: {source_dir}",
        f"Output directory: {output_dir}",
        f"Processing only {max_folders_to_process} folders to save time",
        f"Downsampling factor: {scale_factor}x",
    )
    for line in banner_lines:
        print(line)

    # Pipeline stages, executed in order: directories -> split + processing -> statistics
    for stage in (create_directory_structure, split_dataset, calculate_stats):
        stage()

    print("\nSubset dataset preparation complete!")


# Run the pipeline when executed as a script or as a notebook cell
if __name__ == "__main__":
    main()

In [None]:
################################## Use the cell below ##################################

In [None]:
import os
import shutil
import random
import cv2

# Define paths
SOURCE_DIR = "F:\\vimeo_septuplet_full\\sequences"  # Original dataset
TARGET_DIR = "F:\\vimeo_septuplet_full\\Arranged_full"   # Output dataset (LR + HR)

# Number of folders to process at a time (set to small first, then increase)
num_folders_to_process = 95  # Change this to process more later

# Seed for the shuffle below: with a fixed value, re-running on the same input
# reproduces the same train/val/test partition.
SPLIT_SEED = 42

# Ensure output directories exist
for split in ["train", "val", "test"]:
    os.makedirs(os.path.join(TARGET_DIR, f"{split}/lr"), exist_ok=True)
    os.makedirs(os.path.join(TARGET_DIR, f"{split}/hr"), exist_ok=True)

# Collect only a subset of folders for processing (first N in sorted order).
# Non-directories are dropped *before* slicing so that a stray file in
# SOURCE_DIR cannot use up one of the N slots.
all_folders = [
    name
    for name in sorted(os.listdir(SOURCE_DIR))
    if os.path.isdir(os.path.join(SOURCE_DIR, name))
][:num_folders_to_process]

# Collect all valid sequences: sub-folders that contain im1.png ... im7.png
frame_files = [f"im{i}.png" for i in range(1, 8)]  # identical for every clip, so build it once
all_sequences = []
for seq_folder in all_folders:
    seq_path = os.path.join(SOURCE_DIR, seq_folder)
    for sub_folder in sorted(os.listdir(seq_path)):
        full_path = os.path.join(seq_path, sub_folder)
        if os.path.isdir(full_path) and all(
            os.path.exists(os.path.join(full_path, f)) for f in frame_files
        ):
            all_sequences.append(full_path)

# Shuffle and split dataset (80% train, 10% val, 10% test).
# A private seeded generator is used instead of the global one, so the result
# does not depend on whatever consumed random numbers earlier in the session.
random.Random(SPLIT_SEED).shuffle(all_sequences)
train_end = int(0.8 * len(all_sequences))
val_end = int(0.9 * len(all_sequences))

train_sequences = all_sequences[:train_end]
val_sequences = all_sequences[train_end:val_end]
test_sequences = all_sequences[val_end:]

# Function to process sequences
def process_sequences(sequences, split_type, scale=4):
    """Save each frame as HR and a bicubically downscaled copy as LR for one split.

    For every sequence directory the frames im1.png ... im7.png are decoded and
    written as frame_00001.png ... frame_00007.png to
    TARGET_DIR/<split_type>/hr/<seq_name>/ (original size) and
    TARGET_DIR/<split_type>/lr/<seq_name>/ (width and height divided by scale).
    Frames that cannot be read are reported and skipped; failed writes are
    reported.

    Args:
        sequences: iterable of sequence directory paths.
        split_type: name of the split sub-folder under TARGET_DIR, e.g. "train".
        scale: integer downscaling factor for the LR frames (default 4).

    Raises:
        ValueError: if scale is smaller than 1.
    """
    if scale < 1:
        raise ValueError(f"scale must be >= 1, got {scale}")

    lr_folder = os.path.join(TARGET_DIR, f"{split_type}/lr")
    hr_folder = os.path.join(TARGET_DIR, f"{split_type}/hr")

    for seq_path in sequences:
        seq_name = "_".join(seq_path.split(os.sep)[-2:])  # e.g., 00001_0001
        seq_lr_dir = os.path.join(lr_folder, seq_name)
        seq_hr_dir = os.path.join(hr_folder, seq_name)
        os.makedirs(seq_lr_dir, exist_ok=True)
        os.makedirs(seq_hr_dir, exist_ok=True)

        for i in range(1, 8):  # Frames im1.png to im7.png
            img_path = os.path.join(seq_path, f"im{i}.png")

            # Load HR image (keep original); cv2.imread returns None on failure
            hr_img = cv2.imread(img_path)
            if hr_img is None:
                print(f"Skipping corrupted/missing image: {img_path}")
                continue

            h, w = hr_img.shape[:2]

            # Generate LR image (downscale only); clamp so tiny frames never
            # produce a 0-pixel side, which cv2.resize rejects
            lr_size = (max(1, w // scale), max(1, h // scale))
            lr_img = cv2.resize(hr_img, lr_size, interpolation=cv2.INTER_CUBIC)

            # HR and LR share the file name; HR is written first, then LR
            frame_filename = f"frame_{i:05d}.png"
            for out_dir, image in ((seq_hr_dir, hr_img), (seq_lr_dir, lr_img)):
                out_path = os.path.join(out_dir, frame_filename)
                # cv2.imwrite reports failure through its boolean return value
                if not cv2.imwrite(out_path, image):
                    print(f"Failed to write image: {out_path}")

# Run the export for each split in turn, announcing it first
for banner, split_sequences, split_name in (
    ("Processing training set...", train_sequences, "train"),
    ("Processing validation set...", val_sequences, "val"),
    ("Processing test set...", test_sequences, "test"),
):
    print(banner)
    process_sequences(split_sequences, split_name)

print("Dataset preprocessing complete! 🎉")


Processing training set...
