In [2]:
import os
from collections import defaultdict

# Root of the viewpoint-specific dataset: <BASE_DIR>/<viewpoint>/<bird_id>/<image>
BASE_DIR = "/lisc/data/scratch/becogbio/juarez/thesis/4.3_Viewpoint_full_dataset/2_Vp_specific_full_dataset"
VIEWPOINTS = ["front", "back", "left_side", "right_side", "side_view"]
IMG_EXT = ".png"

# counts[viewpoint][bird_id] -> number of image files found for that bird.
# A viewpoint key only appears once at least one bird folder was found for it.
counts = defaultdict(dict)

for vp in VIEWPOINTS:
    vp_path = os.path.join(BASE_DIR, vp)
    if os.path.isdir(vp_path):
        for bird_id in os.listdir(vp_path):
            bird_path = os.path.join(vp_path, bird_id)
            if os.path.isdir(bird_path):
                # Case-insensitive suffix match, so ".PNG" files are counted too.
                counts[vp][bird_id] = sum(
                    1
                    for fname in os.listdir(bird_path)
                    if fname.lower().endswith(IMG_EXT)
                )

# Report: one section per viewpoint, birds listed alphabetically by ID.
for vp, per_bird in counts.items():
    print(f"\nViewpoint: {vp}")
    for bird_id in sorted(per_bird):
        print(f"  {bird_id}: {per_bird[bird_id]}")


Viewpoint: front
  BEU-RPM: 72
  BNU-RPM: 2435
  BNY-RPM: 81
  BRG-YOM: 2770
  BRK-NOM: 97
  EYB-RPM: 1240
  GBM-ORY: 830
  GBY-ORM: 904
  OEB-RPM: 1066
  OGY-BRM: 1595
  ORB-UYM: 514
  ORG-BYM: 194
  OUB-RPM: 341
  OYR-BGM: 254
  PUE-ONM: 25
  RGY-BOM: 176
  RYO-BGM: 671
  YGO-M: 289
  YM-OBR: 544
  YRU-POM: 604

Viewpoint: back
  BEU-RPM: 85
  BNU-RPM: 5396
  BNY-RPM: 288
  BRG-YOM: 8672
  BRK-NOM: 716
  EYB-RPM: 3250
  GBM-ORY: 3040
  GBY-ORM: 1368
  OEB-RPM: 1521
  OGY-BRM: 1732
  ORB-UYM: 646
  ORG-BYM: 63
  OUB-RPM: 974
  OYR-BGM: 750
  PUE-ONM: 29
  RGY-BOM: 248
  RYO-BGM: 1068
  YGO-M: 88
  YM-OBR: 758
  YRU-POM: 376

Viewpoint: left_side
  BEU-RPM: 131
  BNU-RPM: 3077
  BNY-RPM: 236
  BRG-YOM: 3814
  BRK-NOM: 651
  EYB-RPM: 5052
  GBM-ORY: 1996
  GBY-ORM: 845
  OEB-RPM: 1253
  OGY-BRM: 1546
  ORB-UYM: 505
  ORG-BYM: 33
  OUB-RPM: 510
  OYR-BGM: 747
  PUE-ONM: 36
  RGY-BOM: 165
  RYO-BGM: 1110
  YGO-M: 366
  YM-OBR: 669
  YRU-POM: 391

Viewpoint: right_side
  BEU-RPM: 105
  BN

In [3]:
import os
import shutil
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# One seed drives both the stdlib RNG below and sklearn's train_test_split.
random_seed = 42
rng = random.Random(random_seed)

# Subset sizes (images per bird and viewpoint):
# 50..500 in steps of 50, then 600..1000 in steps of 100.
NUM_IMAGES_PER_BIRD = [*range(50, 501, 50), *range(600, 1001, 100)]

# Source tree, read as <BASE_DIR>/<viewpoint>/<bird_id>/<image>.
BASE_DIR = "/lisc/data/scratch/becogbio/juarez/thesis/4.3_Viewpoint_full_dataset/2_Vp_specific_full_dataset"

# Destination root; each subset size gets its own folder underneath.
OUTPUT_ROOT = "/lisc/data/scratch/becogbio/juarez/thesis/4.4_Viewpoint_subsets/Data"

VIEWPOINTS = ["front", "back", "left_side", "right_side", "side_view"]

# Birds eligible for the subsets; the build loop treats None as "no filtering".
ALLOWED_BIRD_IDS = {
    "BRG-YOM",
    "EYB-RPM",
    "BNU-RPM",
    "GBM-ORY",
    "OGY-BRM",
    "GBY-ORM",
    "RYO-BGM",
    "OYR-BGM",
    "OEB-RPM",
    "ORB-UYM",
    "YM-OBR",
    "BNY-RPM",
    "OUB-RPM",
    "YRU-POM",
    "RGY-BOM",
    "BRK-NOM",
    "YGO-M",
}

# A plain string suffix (str.endswith accepts either a str or a tuple of str).
IMG_EXT = ".png"

# Create the output root up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(OUTPUT_ROOT, exist_ok=True)

def list_images(dir_path, exts=None):
    """Return the sorted names of the image files directly inside ``dir_path``.

    The result is sorted because directory listings come back in arbitrary,
    filesystem-dependent order; feeding that order into a seeded shuffle or
    sample would make the "deterministic" selection differ between runs.

    Parameters
    ----------
    dir_path : str
        Directory to scan (not recursive).
    exts : str or tuple of str, optional
        Lower-case suffix(es) to accept; the file name is lower-cased before
        matching. Defaults to the module-level ``IMG_EXT``.

    Returns
    -------
    list of str
        Matching file names (not full paths) in sorted order. Empty when
        ``dir_path`` is not an existing directory.
    """
    if not os.path.isdir(dir_path):
        return []
    if exts is None:
        exts = IMG_EXT
    with os.scandir(dir_path) as entries:
        # is_file() filters out sub-directories that happen to carry an image
        # suffix; shutil.copy2 would fail on those later.
        names = [
            entry.name
            for entry in entries
            if entry.is_file() and entry.name.lower().endswith(exts)
        ]
    return sorted(names)

def copy_images(file_list, src_dir, dst_dir):
    """Copy the named files from ``src_dir`` into ``dst_dir``.

    ``dst_dir`` is created (including parents) if it does not exist yet.
    Files are copied with ``shutil.copy2`` under their original names.

    Parameters
    ----------
    file_list : iterable of str
        File names relative to ``src_dir``.
    src_dir : str
        Directory the files are read from.
    dst_dir : str
        Directory the files are written to.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for filename in file_list:
        source = os.path.join(src_dir, filename)
        target = os.path.join(dst_dir, filename)
        shutil.copy2(source, target)

# Build one dataset per subset size, laid out as
#   OUTPUT_ROOT/<num_images>/<viewpoint>/{train,val}/<bird_id>/<image>
#
# Reproducibility: os.listdir() yields entries in arbitrary order, so bird
# folders and image names are sorted before the seeded RNG touches them.
# Otherwise the same seed could select different images on another run.
#
# NOTE: files are only ever added (exist_ok=True, no cleanup). Re-running on top
# of output produced with a different selection merges both selections, which
# can put the same image in train and val. Start from an empty OUTPUT_ROOT.

# Scan the source tree once; it does not change between subset sizes.
# available[viewpoint][bird_id] -> sorted list of image file names
available = {}
for vp in VIEWPOINTS:
    vp_src = os.path.join(BASE_DIR, vp)
    if not os.path.isdir(vp_src):
        continue  # skip missing viewpoint

    # Discover bird IDs present in this viewpoint (sorted for a stable order)
    candidate_birds = sorted(
        d for d in os.listdir(vp_src)
        if os.path.isdir(os.path.join(vp_src, d))
    )
    if ALLOWED_BIRD_IDS is not None:
        candidate_birds = [b for b in candidate_birds if b in ALLOWED_BIRD_IDS]

    available[vp] = {
        bird_id: sorted(list_images(os.path.join(vp_src, bird_id)))
        for bird_id in candidate_birds
    }

for num_images in tqdm(NUM_IMAGES_PER_BIRD, desc="Building subsets"):
    subset_dir = os.path.join(OUTPUT_ROOT, str(num_images))
    os.makedirs(subset_dir, exist_ok=True)

    for vp, birds in available.items():
        vp_src = os.path.join(BASE_DIR, vp)

        # Create viewpoint-level split dirs for this subset
        vp_out = os.path.join(subset_dir, vp)
        train_dir = os.path.join(vp_out, "train")
        val_dir = os.path.join(vp_out, "val")
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(val_dir, exist_ok=True)

        for bird_id, images in birds.items():
            # Only proceed if enough images are available for this subset size
            if not images or len(images) < num_images:
                continue

            # Seeded draw without replacement; sample() returns a new list, so
            # the cached listing stays sorted for the next subset size.
            selected = rng.sample(images, num_images)

            # 70 / 30 split (deterministic)
            train_files, val_files = train_test_split(
                selected, test_size=0.3, random_state=random_seed
            )

            # Class folders are created only for birds that are actually included
            bird_src = os.path.join(vp_src, bird_id)
            copy_images(train_files, bird_src, os.path.join(train_dir, bird_id))
            copy_images(val_files, bird_src, os.path.join(val_dir, bird_id))

print("Done!")

Building subsets: 100%|██████████| 15/15 [19:13<00:00, 76.87s/it] 

Done!



