In [None]:
import os
import shutil
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split

random_seed = 42
num_images_per_bird = [50, 100, 150, 200, 250, 300, 350, 400, 450]  # instances to extract per bird
base_directory = "/gpfs/data/fs72607/juarezs98/masked_frames/"
output_root = "/gpfs/data/fs72607/juarezs98/subsets_finetune/"

# File-name suffixes (matched case-insensitively) that are treated as images.
image_extensions = ('.jpg', '.png')

# train_test_split raises ValueError when either side of a split would be
# empty. With the chained 0.3 / 0.3333 splits used below, the remainder of the
# first split must hold at least 2 files, which requires at least 4 selected
# images per bird.
min_images_for_split = 4


def copy_images(file_list, src_dir, dst_dir):
    """Copy every file named in ``file_list`` from ``src_dir`` to ``dst_dir``.

    Files keep their names; copying is done with ``shutil.copy2``, so an
    existing destination file with the same name is replaced.
    """
    for img_name in file_list:
        src_path = os.path.join(src_dir, img_name)
        dst_path = os.path.join(dst_dir, img_name)
        shutil.copy2(src_path, dst_path)


def list_bird_images(root):
    """Return ``{bird_id: [image file names]}`` for the sub-directories of ``root``.

    Both the bird ids and the file names are sorted: ``os.listdir`` returns
    entries in arbitrary order, and without a stable order the fixed random
    seed would not make the shuffles and splits reproducible. Entries of
    ``root`` that are not directories, and directories without any matching
    image, are left out.
    """
    images_by_bird = {}
    for bird_id in sorted(os.listdir(root)):
        bird_directory = os.path.join(root, bird_id)
        if not os.path.isdir(bird_directory):
            continue
        image_files = sorted(
            f for f in os.listdir(bird_directory)
            if f.lower().endswith(image_extensions)
        )
        if image_files:
            images_by_bird[bird_id] = image_files
    return images_by_bird


random.seed(random_seed)

os.makedirs(output_root, exist_ok=True)

# The source tree is the same for every subset size, so list it only once.
images_by_bird = list_bird_images(base_directory)

for num_images in tqdm(num_images_per_bird, desc="Processing subsets"):
    subset_directory = os.path.join(output_root, str(num_images))
    os.makedirs(subset_directory, exist_ok=True)

    # Within each subset, create train, val, and test folders
    train_dir = os.path.join(subset_directory, "train")
    val_dir = os.path.join(subset_directory, "val")
    test_dir = os.path.join(subset_directory, "test")
    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(split_dir, exist_ok=True)

    for bird_id, image_files in images_by_bird.items():
        bird_directory = os.path.join(base_directory, bird_id)

        # Shuffle a copy so the cached, sorted listing stays intact for the
        # next subset size, then keep the first `num_images` files (all of
        # them when the bird has fewer).
        shuffled_files = list(image_files)
        random.shuffle(shuffled_files)
        selected_images = shuffled_files[:num_images]

        if len(selected_images) < min_images_for_split:
            # Too few images to fill train, val and test; skip this bird
            # instead of letting train_test_split abort the whole run.
            tqdm.write(
                f"Skipping {bird_id} for subset {num_images}: only "
                f"{len(selected_images)} image(s), need at least "
                f"{min_images_for_split} to split."
            )
            continue

        # For a 70-20-10% split:
        # First split 70% train, 30% remainder
        train_files, remainder = train_test_split(
            selected_images,
            test_size=0.3,
            random_state=random_seed
        )
        # Then split the remainder ~2/3 vs ~1/3 to get 20% val, 10% test overall
        val_files, test_files = train_test_split(
            remainder,
            test_size=0.3333,  # 1/3 of 30% -> ~10% of total
            random_state=random_seed
        )

        # Create the bird-specific directory inside each split and copy the
        # corresponding files into it.
        for split_dir, split_files in (
            (train_dir, train_files),
            (val_dir, val_files),
            (test_dir, test_files),
        ):
            bird_split_dir = os.path.join(split_dir, bird_id)
            os.makedirs(bird_split_dir, exist_ok=True)
            copy_images(split_files, bird_directory, bird_split_dir)

print("Done!")

Processing subsets: 100%|██████████| 5/5 [02:28<00:00, 29.78s/it]

Done!



