In [None]:
from pathlib import Path
import shutil
import random
from collections import defaultdict
import pandas as pd

# Hold-out split: move a 10% sample of every target bird's videos from
# `src_root/<bird>/` into `dst_root/<bird>/`, record what was moved in a CSV,
# and print a per-bird summary.
#
# NOTE: this cell is not idempotent. Every run samples again from whatever is
# still in `src_root` and overwrites `metadata_path` with only the rows moved
# during that run.
src_root = Path("per_bird_videos")
dst_root = Path("test_videos")
metadata_path = dst_root / "test_videos_metadata.csv"

# Fixed seed; combined with the sorted candidate list produced by
# `_find_videos`, the same files are selected regardless of the order in
# which the filesystem enumerates them.
rng = random.Random(42)

dst_root.mkdir(parents=True, exist_ok=True)

# NOTE(review): "YM-OBR" is the only ID that does not follow the
# three-letter/three-letter form of the others -- confirm it is not a typo.
target_birds = {
    "BRG-YOM", "EYB-RPM", "OYR-BGM", "GBY-ORM", "OGY-BRM", "ORB-UYM",
    "BNY-RPM", "GBM-ORY", "BNU-RPM", "RGY-BOM", "RYO-BGM", "OUB-RPM",
    "OEB-RPM", "YRU-POM", "YM-OBR", "BRK-NOM", "ORG-BYM"
}


def _find_videos(directory):
    """Return all MP4 files below *directory* (recursive), sorted by path.

    The suffix comparison is case-insensitive, so ``.mp4``, ``.MP4`` and
    mixed-case variants are each matched exactly once. A missing directory
    yields an empty list.
    """
    if not directory.is_dir():
        return []
    return sorted(
        p for p in directory.rglob("*")
        if p.is_file() and p.suffix.lower() == ".mp4"
    )


def _free_destination(out_dir, stem):
    """Return an unused ``<stem>.MP4`` path inside *out_dir*.

    If ``<stem>.MP4`` already exists, ``<stem>__dup1.MP4``, ``<stem>__dup2.MP4``
    and so on are tried until a free name is found.
    """
    candidate = out_dir / f"{stem}.MP4"  # force uppercase extension
    i = 1
    while candidate.exists():
        candidate = out_dir / f"{stem}__dup{i}.MP4"
        i += 1
    return candidate


moved_counts = defaultdict(int)
moved_rows = []  # rows for CSV
total_moved = 0

# Move 10% of the videos of each bird into the test set.
for bird in sorted(target_birds):
    bird_dir = src_root / bird
    if not bird_dir.is_dir():
        print(f"Warning: {bird} folder not found in {src_root}")
        continue

    # Sorted list: rglob order is filesystem-dependent, and the seeded sample
    # is only reproducible if the population order is stable.
    vids = _find_videos(bird_dir)
    n = len(vids)
    if n == 0:
        continue

    k = max(1, int(n * 0.10))  # 10% or at least 1
    sample = rng.sample(vids, k)

    out_dir = dst_root / bird
    out_dir.mkdir(parents=True, exist_ok=True)

    for src in sample:
        # Videos from different subfolders may share a stem; never overwrite.
        dest = _free_destination(out_dir, src.stem)

        shutil.move(str(src), str(dest))

        moved_counts[bird] += 1
        total_moved += 1
        moved_rows.append({"Bird_ID": bird, "Video_ID": dest.name})

# Explicit columns so the CSV keeps its header row even when nothing was moved.
pd.DataFrame(moved_rows, columns=["Bird_ID", "Video_ID"]).to_csv(metadata_path, index=False)

print("Moved per bird (10% sample):")
for bird in sorted(moved_counts, key=lambda b: moved_counts[b], reverse=True):
    print(f"{bird}: {moved_counts[bird]}")

print(f"\nTotal moved: {total_moved}")
print(f"Metadata written to: {metadata_path}")

print("\nFinal counts per bird (after move):")
header = f"{'Bird':<10} {'Test set':>10} {'Remaining':>12} {'Total':>10}"
print(header)
print("-" * len(header))

grand_test, grand_remain = 0, 0

for bird in sorted(target_birds):
    # Count with the same matcher that was used for selection, so videos in
    # subfolders are included and no file is counted twice.
    test_count = len(_find_videos(dst_root / bird))
    remain_count = len(_find_videos(src_root / bird))
    total = test_count + remain_count
    grand_test += test_count
    grand_remain += remain_count
    print(f"{bird:<10} {test_count:>10} {remain_count:>12} {total:>10}")

print("-" * len(header))
print(f"{'TOTAL':<10} {grand_test:>10} {grand_remain:>12} {grand_test+grand_remain:>10}")
