#### SPDX-License-Identifier: Apache-2.0
#### Copyright (c) 2025 Onciul Alexandra and Klinovsky Sebastian
#### This code was written with the assistance of an AI (e.g. ChatGPT).


In [1]:
import os
import shutil

# === DATASET CONFIGURATIONS ===
# One entry per camera / gîte; each has a matching "cropped_bats_<name>" folder.
dataset_folders = [
    "Anthisnes_Chateau_de_Xhos_Camera_1_HIT",
    "Bornival_PHOTO_2023CAM03",
    "Bornival_PHOTO_2023CAM04",
    "Chaumont_Gistoux_Camera_1",
    "Chaumont_Gistoux_Camera_2",
    "Jenneret_Camera_1_PHOTO",
    "Modave_Camera_3_toiture_PHOTO",
    "Modave_Camera_plancher_PHOTO",
    "Pont_de_Bousval_Photos_2022_PHOTO",
    "Pont_de_Bousval_Photos_2023_PHOTO_2022CAM12",
    "Pont_de_Bousval_Photos_2023_PHOTO_2023CAM05",
    "Pont_de_Bousval_Photos_2023_PHOTO_2023CAM06",
    "Pont_de_Bousval_Photos_2023_PHOTO_WK6HDBOUSVAL",
]

cropped_dir = "cropped_bats"
subdivision_dir = "subdivision_result_6_expanded"

# Move every entry of cropped_bats/cropped_bats_<gîte> into
# subdivision_result_6_expanded/<gîte>/bats, reporting a count per gîte.
for folder_name in dataset_folders:
    source = os.path.join(cropped_dir, "cropped_bats_" + folder_name)
    dest = os.path.join(subdivision_dir, folder_name, "bats")

    # Nothing to move for this gîte: report and go on with the next one.
    if not os.path.exists(source):
        print(f"❌ Source folder does not exist: {source}")
        continue

    if not os.path.exists(dest):
        print(f"⚠️ Destination folder missing: {dest} — creating it.")
        os.makedirs(dest, exist_ok=True)

    files_moved = 0
    for file_name in os.listdir(source):
        src_file = os.path.join(source, file_name)
        dst_file = os.path.join(dest, file_name)
        try:
            shutil.move(src_file, dst_file)
        except Exception as e:
            # Best effort: one failing entry must not abort the whole gîte.
            print(f"❌ Failed to move {file_name}: {e}")
        else:
            files_moved += 1

    print(f"✅ Moved {files_moved} files from {source} to {dest}")


✅ Moved 7153 files from cropped_bats/cropped_bats_Anthisnes_Chateau_de_Xhos_Camera_1_HIT to subdivision_result_6_expanded/Anthisnes_Chateau_de_Xhos_Camera_1_HIT/bats
✅ Moved 245 files from cropped_bats/cropped_bats_Bornival_PHOTO_2023CAM03 to subdivision_result_6_expanded/Bornival_PHOTO_2023CAM03/bats
✅ Moved 98 files from cropped_bats/cropped_bats_Bornival_PHOTO_2023CAM04 to subdivision_result_6_expanded/Bornival_PHOTO_2023CAM04/bats
✅ Moved 53 files from cropped_bats/cropped_bats_Chaumont_Gistoux_Camera_1 to subdivision_result_6_expanded/Chaumont_Gistoux_Camera_1/bats
✅ Moved 221 files from cropped_bats/cropped_bats_Chaumont_Gistoux_Camera_2 to subdivision_result_6_expanded/Chaumont_Gistoux_Camera_2/bats
✅ Moved 968 files from cropped_bats/cropped_bats_Jenneret_Camera_1_PHOTO to subdivision_result_6_expanded/Jenneret_Camera_1_PHOTO/bats
✅ Moved 2111 files from cropped_bats/cropped_bats_Modave_Camera_3_toiture_PHOTO to subdivision_result_6_expanded/Modave_Camera_3_toiture_PHOTO/bats
✅

In [6]:

def count_images(root_dir):
    """Return how many image files live under ``root_dir``, recursively.

    A file counts as an image when its name, compared case-insensitively,
    ends with one of the extensions listed below.
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')  # Add more if needed
    return sum(
        1
        for _dirpath, _dirnames, names in os.walk(root_dir)
        for name in names
        if name.lower().endswith(image_extensions)
    )

# Per-class image counts for the local LOO dataset.
bat_background_count = count_images("LOO_local/background")
bat_image_count = count_images("LOO_local/bats")
groupbats_image_count = count_images("LOO_local/groupbats")

# Total image count
total_image_count = bat_image_count + groupbats_image_count + bat_background_count

# The printed values are shares of the total (in percent), not raw counts,
# so the labels say so. An empty dataset reports 0.0 instead of raising
# ZeroDivisionError.
for class_label, class_count in (
    ("background", bat_background_count),
    ("bats", bat_image_count),
    ("groupbats", groupbats_image_count),
):
    share = class_count / total_image_count * 100 if total_image_count else 0.0
    print(f"Percentage of images in LOO_local/{class_label}: {share}")
print(f"Total number of images: {total_image_count}")


Number of images in LOO_local/background: 79.03974813065722
Number of images in LOO_local/bats: 18.2353404171586
Number of images in LOO_local/groupbats: 2.7249114521841795
Total number of images: 63525


In [10]:
# Show which entries exist under the groupbats class folder
# (order is whatever os.listdir returns).
lst = os.listdir("LOO_local/temp_groupbats")
print(lst)

['Pont_de_Bousval_Photos_2022_PHOTO', 'Modave_Camera_3_toiture_PHOTO', 'Bornival_PHOTO_2023CAM04', 'Pont_de_Bousval_Photos_2023_PHOTO_WK6HDBOUSVAL', 'Pont_de_Bousval_Photos_2023_PHOTO_2022CAM12', 'Pont_de_Bousval_Photos_2023_PHOTO_2023CAM06', 'Bornival_PHOTO_2023CAM03', 'Chaumont_Gistoux_Camera_2', 'Chaumont_Gistoux_Camera_1', 'Pont_de_Bousval_Photos_2023_PHOTO_2023CAM05', 'Anthisnes_Chateau_de_Xhos_Camera_1_HIT', 'Jenneret_Camera_1_PHOTO', 'Modave_Camera_plancher_PHOTO']


### FOLDER CONTAINS DIFFERENT GÎTES, EACH WITH 3 CLASS SUBFOLDERS

Each folder inside `LOOCV_ALL_FROM_SUB` corresponds to a **gîte** (e.g., `Chaumont_Gistoux_Camera_1`, `Bornival_PHOTO_2023CAM03`, etc.).

Inside each gîte folder, there are 3 subfolders representing the **classes**:

#### 1. background  
#### 2. bats  
#### 3. groupbats  

Each class folder contains the image data for that class and gîte. In the LOOCV (Leave-One-Out Cross-Validation) pipeline, each split holds out one gîte as the test set and trains on all the others.


In [None]:
import os
import shutil
import time

# Classes to include
CLASS_NAMES = ["background", "bats", "groupbats"]

# Base path where all gîte folders are located
BASE_GITES_PATH = "subdivision_result_6_expanded"

# List of gîtes to use for LOOCV (each one is held out once as the test set)
chosen_gites = [
    'Pont_de_Bousval_Photos_2022_PHOTO',
    'Modave_Camera_3_toiture_PHOTO',
    'Bornival_PHOTO_2023CAM04',
    'Pont_de_Bousval_Photos_2023_PHOTO_WK6HDBOUSVAL',
    'Pont_de_Bousval_Photos_2023_PHOTO_2022CAM12',
    'Pont_de_Bousval_Photos_2023_PHOTO_2023CAM06',
    'Bornival_PHOTO_2023CAM03',
    'Chaumont_Gistoux_Camera_2',
    'Chaumont_Gistoux_Camera_1',
    'Pont_de_Bousval_Photos_2023_PHOTO_2023CAM05',
    'Anthisnes_Chateau_de_Xhos_Camera_1_HIT',
    'Jenneret_Camera_1_PHOTO',
    'Modave_Camera_plancher_PHOTO'
]

# Output base path
drive_base_path = "LOOCV_from_back_substraction_test_3"
os.makedirs(drive_base_path, exist_ok=True)

for chosen_gite in chosen_gites:
    print(f"🚀 Processing LOOCV for gite: {chosen_gite}")
    start_time = time.time()

    # train/ and test/ for this split live in <output>/loo_temp_<gîte>/
    BASE_TEMP = os.path.join(drive_base_path, "loo_temp_" + chosen_gite)
    train_dir = os.path.join(BASE_TEMP, "train")
    test_dir = os.path.join(BASE_TEMP, "test")

    # Start from a clean slate: drop whatever a previous run left behind ...
    for split_dir in (train_dir, test_dir):
        shutil.rmtree(split_dir, ignore_errors=True)

    # ... then recreate one empty folder per class in both splits.
    for class_name in CLASS_NAMES:
        for split_dir in (train_dir, test_dir):
            os.makedirs(os.path.join(split_dir, class_name), exist_ok=True)

    # Every gîte folder goes to test when it is the held-out one, else to train.
    for gite_folder in os.listdir(BASE_GITES_PATH):
        gite_path = os.path.join(BASE_GITES_PATH, gite_folder)
        if not os.path.isdir(gite_path):
            continue

        dest_root = test_dir if gite_folder == chosen_gite else train_dir

        for class_name in CLASS_NAMES:
            class_folder = os.path.join(gite_path, class_name)
            if not os.path.isdir(class_folder):
                continue

            for filename in os.listdir(class_folder):
                src_file = os.path.join(class_folder, filename)
                # Only regular files are copied; notebook checkpoint entries are skipped.
                if filename != ".ipynb_checkpoints" and os.path.isfile(src_file):
                    shutil.copy2(src_file, os.path.join(dest_root, class_name, filename))

    print(f"✅ Done with {chosen_gite} in {time.time() - start_time:.2f}s")

print("🎉 All LOOCV splits are complete!")


In [3]:
import os
import subprocess

# Class folders to flatten; these must exist under each split's train/ folder
CLASS_NAMES = ["background", "bats"]
LOOCV_BASE  = "LOOCV_from_back_substraction_test_3"
# where to dump your flattened train crops
DEST_BASE   = "flat_train"

# make the two flat class‐folders up front
for cls in CLASS_NAMES:
    os.makedirs(os.path.join(DEST_BASE, cls), exist_ok=True)

# walk each loo_temp_<gite>/train/<class> and rsync into flat folders
for temp in os.listdir(LOOCV_BASE):
    temp_path = os.path.join(LOOCV_BASE, temp)
    if temp.startswith("loo_temp_") and os.path.isdir(temp_path):
        train_dir = os.path.join(temp_path, "train")
        for cls in CLASS_NAMES:
            # Trailing separator: rsync then copies the folder's contents,
            # not the folder itself.
            src = os.path.join(train_dir, cls) + os.sep
            if not os.path.isdir(src):
                continue
            dst = os.path.join(DEST_BASE, cls) + os.sep

            # -a preserves timestamps/permissions and recurses
            # --ignore-existing skips files already in dst
            rsync_cmd = ["rsync", "-a", "--ignore-existing", src, dst]
            subprocess.run(rsync_cmd, check=True)

print("✅ All train‐split crops have been merged into flat_train/{background,bats} without duplication.")


✅ All train‐split crops have been merged into flat_train/{background,bats} without duplication.


In [None]:
# with mask
"""
Leave-One-Out cross-validation splitter **with images + masks** folders.

Given :
    BASE_GITES_PATH/
        <gite>/
            images/
                background/*.png
                bats/*.png
                groupbats/*.png
            masks/
                background/*_mask.png
                bats/*_mask.png
                groupbats/*_mask.png

Produces (per left-out gite):
    drive_base_path/loo_temp_<gite>/
        train_images/{background,bats,groupbats}/*.png
        train_masks/{background,bats,groupbats}/*_mask.png
        test_images/{background,bats,groupbats}/*.png
        test_masks/{background,bats,groupbats}/*_mask.png

Run exactly like the old script – only the folder names change.
"""
from __future__ import annotations

import os
import shutil
import time
from pathlib import Path
from typing import List

CLASS_NAMES: List[str] = ["background", "bats", "groupbats"]

# -----------------------------------------------------------------------------
# Helper
# -----------------------------------------------------------------------------

def ensure_dir(path: Path):
    """Create ``path`` (and any missing parents) if needed and return it.

    An already existing directory is not an error.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path


def copy_tree(src: Path, dst: Path):
    """Copy the ``*.png`` entries directly inside ``src`` into ``dst``.

    The copy is flat (no recursion) and the suffix check is
    case-insensitive. When ``src`` is not a directory nothing happens.
    ``dst`` is expected to exist already.
    """
    if src.is_dir():
        for entry in src.iterdir():
            wanted = entry.suffix.lower() == ".png" and entry.name != ".ipynb_checkpoints"
            if wanted:
                shutil.copy(entry, dst / entry.name)

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------

def loocv_split(
    base_gites_path: str | Path,
    chosen_gites: List[str],
    drive_base_path: str | Path,
):
    """Build one leave-one-gîte-out split (images + masks) per chosen gîte.

    For every name in ``chosen_gites`` a folder
    ``<drive_base_path>/loo_temp_<name>`` is (re)populated: the gîte whose
    folder name equals ``name`` goes to ``test_images`` / ``test_masks``,
    all other gîte folders under ``base_gites_path`` go to
    ``train_images`` / ``train_masks``.
    """
    source_root = Path(base_gites_path)
    output_root = Path(drive_base_path)
    ensure_dir(output_root)

    # The list of gîte folders is taken once and reused for every split.
    gite_folders = [entry for entry in source_root.iterdir() if entry.is_dir()]
    split_kinds = ("train_images", "train_masks", "test_images", "test_masks")

    for chosen in chosen_gites:
        print(f"🚀 Processing LOOCV for gite: {chosen}")
        t0 = time.time()

        # temp root for this split, with its four image/mask sub-roots
        split_root = output_root / f"loo_temp_{chosen}"
        roots = {kind: ensure_dir(split_root / kind) for kind in split_kinds}

        # (Re)create class subfolders so stale files from older runs vanish
        for cls in CLASS_NAMES:
            for root in roots.values():
                class_dir = root / cls
                shutil.rmtree(class_dir, ignore_errors=True)
                ensure_dir(class_dir)

        # Iterate gites → copy
        for gite_path in gite_folders:
            side = "test" if gite_path.name == chosen else "train"
            images_out = roots[f"{side}_images"]
            masks_out = roots[f"{side}_masks"]

            for cls in CLASS_NAMES:
                copy_tree(gite_path / "images" / cls, images_out / cls)
                copy_tree(gite_path / "masks" / cls, masks_out / cls)

        print(f"✅ Done with {chosen} in {time.time() - t0:.1f}s")

    print("🎉 All LOOCV splits are complete!")


# -----------------------------------------------------------------------------
# CLI entry point (optional)
# -----------------------------------------------------------------------------
# Script entry point: runs loocv_split once with the hard-coded paths and
# gîte list below (there is no argument parsing).
if __name__ == "__main__":
    # quick & dirty CLI so we can run as script without extra args parsing
    BASE_GITES_PATH = "subdivision_MASK"
    DRIVE_BASE_PATH = "LOOCV_from_back_substraction_test_5"
    # Each gîte listed here is held out once as the test set.
    CHOSEN_GITES = [
        'Pont_de_Bousval_Photos_2022_PHOTO',
        'Modave_Camera_3_toiture_PHOTO',
        'Bornival_PHOTO_2023CAM04',
        'Pont_de_Bousval_Photos_2023_PHOTO_WK6HDBOUSVAL',
        'Pont_de_Bousval_Photos_2023_PHOTO_2022CAM12',
        'Pont_de_Bousval_Photos_2023_PHOTO_2023CAM06',
        'Bornival_PHOTO_2023CAM03',
        'Chaumont_Gistoux_Camera_2',
        'Chaumont_Gistoux_Camera_1',
        'Pont_de_Bousval_Photos_2023_PHOTO_2023CAM05',
        'Anthisnes_Chateau_de_Xhos_Camera_1_HIT',
        'Jenneret_Camera_1_PHOTO',
        'Modave_Camera_plancher_PHOTO',
    ]

    loocv_split(BASE_GITES_PATH, CHOSEN_GITES, DRIVE_BASE_PATH)


### FOLDER CONTAINS 3 DIFFERENT CLASSES

#### 1. background  
#### 2. bats  
#### 3. groupbats  

Each of these class folders contains subfolders named after the **gîte** (e.g., `Pont_de_Bousval_Photos_2022_PHOTO`, `Chaumont_Gistoux_Camera_1`, etc.).  
These gîte subfolders hold the actual image data used for Leave-One-Out Cross-Validation (LOOCV).


In [None]:
import os
import shutil
import time

# Define paths to original class folders. Each class folder holds one
# subfolder per gîte; the dict key is the class name used in the output.
CLASS_FOLDERS = {
    "background": "LOO_local/temp_background",
    "bat": "LOO_local/temp_bat",
    "groupbats": "LOO_local/temp_groupbats",
}

# List of gites to loop through for LOOCV
chosen_gites = [
    'Pont_de_Bousval_Photos_2022_PHOTO', 
    'Modave_Camera_3_toiture_PHOTO', 
    'Bornival_PHOTO_2023CAM04', 
    'Pont_de_Bousval_Photos_2023_PHOTO_WK6HDBOUSVAL', 
    'Pont_de_Bousval_Photos_2023_PHOTO_2022CAM12', 
    'Pont_de_Bousval_Photos_2023_PHOTO_2023CAM06', 
    'Bornival_PHOTO_2023CAM03', 
    'Chaumont_Gistoux_Camera_2', 
    'Chaumont_Gistoux_Camera_1', 
    'Pont_de_Bousval_Photos_2023_PHOTO_2023CAM05', 
    'Anthisnes_Chateau_de_Xhos_Camera_1_HIT', 
    'Jenneret_Camera_1_PHOTO', 
    'Modave_Camera_plancher_PHOTO'
]

# Base path under which every LOOCV split folder is written
drive_base_path = "LOOCV_from_back_substraction_test"
os.makedirs(drive_base_path, exist_ok=True)

for chosen_gite in chosen_gites:
    print(f"🚀 Processing LOOCV for gite: {chosen_gite}")
    start_time = time.time()

    # Temporary train/test folders. They are placed inside drive_base_path:
    # previously the base folder was created but never used, so the splits
    # ended up in the current working directory instead.
    BASE_TEMP = os.path.join(drive_base_path, f"loo_temp_{chosen_gite}")
    train_dir = os.path.join(BASE_TEMP, "train")
    test_dir  = os.path.join(BASE_TEMP, "test")

    # Clean previous temp folders
    shutil.rmtree(train_dir, ignore_errors=True)
    shutil.rmtree(test_dir, ignore_errors=True)

    # Recreate temp folders
    for class_name in CLASS_FOLDERS:
        os.makedirs(os.path.join(train_dir, class_name), exist_ok=True)
        os.makedirs(os.path.join(test_dir, class_name), exist_ok=True)

    # Copy files: chosen_gite to test, others to train
    for class_name, class_folder_path in CLASS_FOLDERS.items():
        subfolders = [f for f in os.listdir(class_folder_path) if f != ".ipynb_checkpoints"]

        for subl in subfolders:
            subf_path = os.path.join(class_folder_path, subl)
            if not os.path.isdir(subf_path):
                continue

            dest = os.path.join(test_dir if subl == chosen_gite else train_dir, class_name)

            for item in os.listdir(subf_path):
                if item == ".ipynb_checkpoints":
                    continue
                source_file = os.path.join(subf_path, item)
                dest_file = os.path.join(dest, item)
                if os.path.isfile(source_file):
                    shutil.copy2(source_file, dest_file)

    # Per-split timing, matching the other LOOCV split cell of this notebook.
    print(f"✅ Done with {chosen_gite} in {time.time() - start_time:.2f}s")


print("🎉 All LOOCV splits are complete!")


In [None]:
# Imported here so the cell runs on its own: no earlier cell imports random.
import os
import random

folder = "LOO_local/temp_background/just_background"


# randomly delete 50% of the images in the folder
# Only regular files are candidates; sampling from every directory entry
# would spend part of the 50% quota on subfolders that are never deleted.
files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
files_to_delete = random.sample(files, len(files) // 2)
for file in files_to_delete:
    file_path = os.path.join(folder, file)
    try:
        os.remove(file_path)
    except FileNotFoundError:
        # The file disappeared between listing and deletion.
        print(f"File not found: {file_path}")
    else:
        print(f"Deleted {file_path}")

In [None]:
# Print the on-disk size of every split folder, as reported by `du -sh`.
base_path = "LOOCV_from_back_substraction_test_2"
folders = []
for f in os.listdir(base_path):
    if os.path.isdir(os.path.join(base_path, f)):
        folders.append(f)

for folder in folders:
    folder_path = os.path.join(base_path, folder)
    du_output = subprocess.check_output(['du', '-sh', folder_path])
    # du prints "<size><whitespace><path>"; keep only the size token.
    size_token = du_output.split()[0]
    size = size_token.decode('utf-8')
    print(f"Folder: {folder}, Size: {size}")

### 🔄 MERGE `groupbats` INTO `bats` FOR EACH LOOCV FOLDER

Each folder inside `LOOCV_from_back_substraction_test_xx` is a LOOCV split (e.g., `loo_temp_Anthisnes_Chateau_de_Xhos_Camera_1_HIT`).

Inside each LOOCV folder, there are two subdirectories:
- `train/`
- `test/`

Each of these contains the class folders:
- `background/`
- `bats/`
- `groupbats/`

This script performs the following operations:
1. For every LOOCV folder and for both `train` and `test`:
   - Moves all files from `groupbats/` into the corresponding `bats/` folder.
   - Deletes the now-empty `groupbats/` folder.
   
✅ After running the script, there will be no `groupbats` folders left — their contents will be merged into `bats`.


In [None]:
import os
import shutil

# Path to your LOOCV output root folder
base_loocv_path = "LOOCV_from_back_substraction_test_3"

# Loop through each loo_temp_* folder
for entry in os.listdir(base_loocv_path):
    temp_path = os.path.join(base_loocv_path, entry)
    if not entry.startswith("loo_temp_") or not os.path.isdir(temp_path):
        continue

    print(f"📂 Processing: {entry}")

    for split in ["train", "test"]:
        bats_dir = os.path.join(temp_path, split, "bats")
        groupbats_dir = os.path.join(temp_path, split, "groupbats")

        if not os.path.isdir(groupbats_dir):
            continue

        # A split may have groupbats but no bats folder yet; moving into a
        # missing folder would fail.
        os.makedirs(bats_dir, exist_ok=True)

        # Move files from groupbats to bats. The listing is materialised and
        # sorted first so the folder is not mutated while being iterated and
        # clash suffixes are assigned in a stable order.
        for filename in sorted(os.listdir(groupbats_dir)):
            src_file = os.path.join(groupbats_dir, filename)
            if not os.path.isfile(src_file):
                continue

            # Never overwrite an existing bats file: on a name clash append
            # "_dup" (then "_dup1", "_dup2", ...) to the stem.
            dst_file = os.path.join(bats_dir, filename)
            if os.path.exists(dst_file):
                stem, ext = os.path.splitext(filename)
                attempt = 0
                while os.path.exists(dst_file):
                    tag = "_dup" if attempt == 0 else f"_dup{attempt}"
                    dst_file = os.path.join(bats_dir, f"{stem}{tag}{ext}")
                    attempt += 1

            shutil.move(src_file, dst_file)

        # Remove the groupbats directory (anything left in it that is not a
        # regular file is removed with it).
        shutil.rmtree(groupbats_dir, ignore_errors=True)
        print(f"✅ Merged 'groupbats' into 'bats' and removed: {split}/groupbats")

print("🎉 All groupbats folders merged and deleted.")


In [None]:
# Mask 
"""
Utility: **merge *groupbats* into *bats*** for the new LOOCV folder layout
-------------------------------------------------------------------------

After running ``loocv_split_with_masks.py`` you may decide to collapse
``groupbats`` into ``bats`` for both images *and* masks.  This script walks
through every ``loo_temp_*`` split and moves the files accordingly:

<loo_temp_*>/
    train_images/{bats,groupbats}
    train_masks/{bats,groupbats}
    test_images/{bats,groupbats}
    test_masks/{bats,groupbats}

Result: all files from ``groupbats`` land in the matching ``bats`` folder and
the empty ``groupbats`` directories are removed.
"""
from __future__ import annotations

import os
import shutil
from pathlib import Path

# -----------------------------------------------------------------------------
# Config – change if needed
# -----------------------------------------------------------------------------
BASE_LOOCV_PATH = Path("LOOCV_from_back_substraction_test_5")
SPLIT_DIRS = ["train_images", "train_masks", "test_images", "test_masks"]

# Mask files are named "<image stem>_mask<ext>"; the clash marker has to go
# in front of this tag so a renamed mask still pairs with its renamed image.
_MASK_TAG = "_mask"

# -----------------------------------------------------------------------------
# Main
# -----------------------------------------------------------------------------

def _free_destination(bats_dir: Path, src_file: Path) -> Path:
    """Return a path inside ``bats_dir`` for ``src_file`` that does not exist yet.

    Without a clash the original name is kept. Otherwise ``_dup`` (then
    ``_dup1``, ``_dup2``, ...) is appended to the stem; for ``*_mask`` files
    the marker is inserted before ``_mask``.
    """
    candidate = bats_dir / src_file.name
    if not candidate.exists():
        return candidate

    stem, tail = src_file.stem, ""
    if stem.endswith(_MASK_TAG):
        stem, tail = stem[: -len(_MASK_TAG)], _MASK_TAG

    attempt = 0
    while True:
        marker = "_dup" if attempt == 0 else f"_dup{attempt}"
        candidate = bats_dir / f"{stem}{marker}{tail}{src_file.suffix}"
        if not candidate.exists():
            return candidate
        attempt += 1


def merge_groupbats(base_path: Path):
    """Move every ``groupbats`` file into the sibling ``bats`` folder.

    Walks each ``loo_temp_*`` directory directly under ``base_path`` and,
    for every folder named in ``SPLIT_DIRS``, moves the regular files of
    ``groupbats`` into ``bats`` (created if missing) and then removes the
    ``groupbats`` directory. Existing files in ``bats`` are never
    overwritten; clashing names get a ``_dup`` marker.

    NOTE(review): image and mask stay paired after a rename only if both
    clash in the same way; this is assumed, not checked.
    """
    for temp_dir in base_path.iterdir():
        if not (temp_dir.is_dir() and temp_dir.name.startswith("loo_temp_")):
            continue
        print(f"📂 Processing: {temp_dir.name}")

        for split in SPLIT_DIRS:
            split_root = temp_dir / split
            bats_dir = split_root / "bats"
            group_dir = split_root / "groupbats"
            if not group_dir.is_dir():
                continue

            # ensure bats dir exists
            bats_dir.mkdir(parents=True, exist_ok=True)

            # Materialise and sort the listing first: the folder is modified
            # while we work through it, and a stable order keeps the choice
            # of clash marker deterministic.
            for src_file in sorted(group_dir.iterdir()):
                if src_file.is_file():
                    dst_file = _free_destination(bats_dir, src_file)
                    shutil.move(str(src_file), str(dst_file))

            # Drops the directory together with any non-file leftovers.
            shutil.rmtree(group_dir, ignore_errors=True)
            print(f"  ✅ Merged {split}/groupbats → {split}/bats")

    print("🎉 All groupbats folders merged and deleted.")


if __name__ == "__main__":
    merge_groupbats(BASE_LOOCV_PATH)


In [1]:
import os

sup_dir = "LOOCV_from_back_substraction_test_3"
classes = ["background", "bats"]

def count_images(dir_path):
    """Return the number of regular files directly inside ``dir_path``.

    Subdirectories are not descended into and an entry named
    ``.ipynb_checkpoints`` is never counted.
    """
    total = 0
    for entry in os.listdir(dir_path):
        if entry == ".ipynb_checkpoints":
            continue
        if os.path.isfile(os.path.join(dir_path, entry)):
            total += 1
    return total

# List all LOO folders in the parent directory
lst_LOOCV = [f for f in os.listdir(sup_dir) if os.path.isdir(os.path.join(sup_dir, f))]

# For every split: per-class train/test counts, the class mix of the test
# set, and the overall train/test proportion.
for loo_folder in sorted(lst_LOOCV):
    base_dir = os.path.join(sup_dir, loo_folder)

    print("="*60)
    print(f"LOO FOLDER: {loo_folder}")
    
    total_train = 0
    total_test = 0
    # Test-set counts per class, reset for every split.
    test_bats = 0
    test_back = 0
    for cls in classes:
        train_path = os.path.join(base_dir, "train", cls)
        test_path  = os.path.join(base_dir, "test", cls)

        # Count files in each subfolder (a missing folder counts as empty)
        train_count = count_images(train_path) if os.path.exists(train_path) else 0
        test_count  = count_images(test_path) if os.path.exists(test_path) else 0
        total_count = train_count + test_count

        total_train += train_count
        total_test  += test_count

        if total_count > 0:
            train_pct = (train_count / total_count) * 100
            test_pct  = (test_count / total_count) * 100
        else:
            train_pct = 0
            test_pct  = 0

        # The background count used to be kept in a variable named
        # test_groupbats, while the initialised test_back was never used.
        if cls == "bats":
            test_bats = test_count
        elif cls == "background":
            test_back = test_count

        print(f"Class: {cls}")
        print(f"  Train: {train_count} ({train_pct:.1f}%)")
        print(f"  Test:  {test_count} ({test_pct:.1f}%)")
        print()
    
    # An empty test split reports 0.0% instead of raising ZeroDivisionError.
    if total_test > 0:
        test_back_pct = test_back / total_test * 100
        test_bats_pct = test_bats / total_test * 100
    else:
        test_back_pct = 0.0
        test_bats_pct = 0.0

    print("="*60)
    print(f"TEST BACKGROUND: {test_back_pct:.1f}%")
    print(f"TEST BATS: {test_bats_pct:.1f}%")

    total_images = total_train + total_test
    if total_images > 0:
        overall_train_pct = (total_train / total_images) * 100
        overall_test_pct  = (total_test / total_images) * 100
    else:
        overall_train_pct = 0
        overall_test_pct  = 0

    print("-"*60)
    print("OVERALL for this LOO")
    print(f"  Train: {total_train} ({overall_train_pct:.1f}%)")
    print(f"  Test:  {total_test} ({overall_test_pct:.1f}%)")
    print(f"  Total images: {total_images}")
    print("="*60)
    print()


LOO FOLDER: loo_temp_Anthisnes_Chateau_de_Xhos_Camera_1_HIT


Class: background
  Train: 35214 (92.9%)
  Test:  2682 (7.1%)

Class: bats
  Train: 10825 (67.6%)
  Test:  5197 (32.4%)

TEST BACKGROUND: 34.0%
TEST BATS: 66.0%
------------------------------------------------------------
OVERALL for this LOO
  Train: 46039 (85.4%)
  Test:  7879 (14.6%)
  Total images: 53918

LOO FOLDER: loo_temp_Bornival_PHOTO_2023CAM03
Class: background
  Train: 31866 (84.1%)
  Test:  6030 (15.9%)

Class: bats
  Train: 22664 (98.8%)
  Test:  266 (1.2%)

TEST BACKGROUND: 95.8%
TEST BATS: 4.2%
------------------------------------------------------------
OVERALL for this LOO
  Train: 54530 (89.6%)
  Test:  6296 (10.4%)
  Total images: 60826

LOO FOLDER: loo_temp_Bornival_PHOTO_2023CAM04
Class: background
  Train: 30886 (81.5%)
  Test:  7010 (18.5%)

Class: bats
  Train: 23004 (99.7%)
  Test:  73 (0.3%)

TEST BACKGROUND: 99.0%
TEST BATS: 1.0%
------------------------------------------------------------
OVERALL for this LOO
  Train: 53890 (88.4%)
  Test:  7083 (11.6%)
  T

In [None]:
import os

# Root directory that holds the loo_temp_* folders.
# Files whose name contains "GT" (ground truth) are deleted from every test
# folder because they do not reflect the real data.
root_dir = "LOOCV_from_back_substraction_test_3"  # Replace this with the path where the loo_temp_ folders are

# Loop over all folders in root_dir
for folder in os.listdir(root_dir):
    if not folder.startswith("loo_temp_"):
        continue

    test_path = os.path.join(root_dir, folder, "test")
    if not os.path.exists(test_path):
        continue

    # Walk the whole test tree, class subfolders included.
    for subdir, _, files in os.walk(test_path):
        for file in files:
            if "GT" not in file:
                continue
            file_path = os.path.join(subdir, file)
            try:
                os.remove(file_path)
            except Exception as e:
                # Best effort: report the failure and carry on.
                print(f"Failed to delete {file_path}: {e}")
            else:
                print(f"Deleted: {file_path}")


In [None]:
# undersample_background.py
"""
Undersample *background* images in every LOOCV split so that – for **each
camera / gîte** – the number of background images kept is at most
``RATIO × (# bats images from that camera)``.

Everything is done in-place and **reproducibly** thanks to a fixed RNG seed.

Expected layout (per split):

<loo_temp_*>
    train_images/
        background/*.png
        bats/*.png
    train_masks/
        background/*_mask.png
        bats/*_mask.png
    test_images/...
    test_masks/...

Only the *train* folders are touched.
"""
from __future__ import annotations

import random
import re
from pathlib import Path
from typing import Dict, List

# ─────────────────────────────────────────────────────────────────────────────
# ✏️ Config – adjust if needed
# ─────────────────────────────────────────────────────────────────────────────
BASE_LOOCV_PATH = Path("LOOCV_from_back_substraction_test_5")  # root holding loo_temp_* dirs
RATIO: float = 1.0      # keep at most RATIO × (# bats) background images PER CAMERA
RNG_SEED: int = 42      # change for a different deterministic sample
IMG_SUFFIX = ".png"     # image extension (compared against the lower-cased file suffix)
# Non-greedy and case-insensitive: captures everything before the FIRST "_IM_".
CAM_RE = re.compile(r"^(.*?)_IM_", re.IGNORECASE)  # grabs camera id before “…_IM_”

# ─────────────────────────────────────────────────────────────────────────────
# 🔧 Helpers
# ─────────────────────────────────────────────────────────────────────────────
def camera_id(fname: str) -> str:
    """Return the camera / gîte prefix encoded in a file name.

    The prefix is what ``CAM_RE`` captures. If the pattern does not match,
    the last three underscore-separated segments are dropped; names with
    three or fewer segments are returned unchanged.
    """
    found = CAM_RE.match(fname)
    if found is not None:
        return found.group(1)

    # Fallback: splitting from the right at most three times leaves the
    # wanted prefix in the first piece whenever there are more than three
    # segments.
    pieces = fname.rsplit("_", 3)
    if len(pieces) == 4:
        return pieces[0]
    return fname

def list_pngs(folder: Path) -> List[Path]:
    """Return the entries of ``folder`` whose suffix is ``IMG_SUFFIX``, sorted.

    The suffix comparison is case-insensitive. The result is sorted because
    ``Path.iterdir`` yields entries in arbitrary order, and a seeded random
    sample over the list is only repeatable when the list order is stable.
    """
    return sorted(p for p in folder.iterdir() if p.suffix.lower() == IMG_SUFFIX)

def undersample_split(split_root: Path, rng: random.Random):
    """Undersample background images inside one <loo_temp_*> directory.

    For every camera (as given by ``camera_id``) at most
    ``int(RATIO * number_of_bats_images)`` background images are kept in
    ``train_images/background``; the others are deleted together with
    their ``<stem>_mask`` file in ``train_masks/background``. Cameras
    without any bats image keep all their background images.

    Args:
        split_root: the ``loo_temp_*`` directory to process in place.
        rng: random generator used to pick the images that are kept.
    """
    bg_img_dir   = split_root / "train_images" / "background"
    bats_img_dir = split_root / "train_images" / "bats"
    bg_msk_dir   = split_root / "train_masks" / "background"

    if not (bg_img_dir.is_dir() and bats_img_dir.is_dir()):
        print(f"⚠️  Skipping {split_root.name}: missing expected folders")
        return

    # 1️⃣ Count bats images per camera
    bats_per_cam: Dict[str, int] = {}
    for img in list_pngs(bats_img_dir):
        cam = camera_id(img.name)
        bats_per_cam[cam] = bats_per_cam.get(cam, 0) + 1

    # 2️⃣ Group background images per camera
    bg_by_cam: Dict[str, List[Path]] = {}
    for img in list_pngs(bg_img_dir):
        cam = camera_id(img.name)
        bg_by_cam.setdefault(cam, []).append(img)

    # 3️⃣ Decide which background images to keep.
    # Cameras and their images are visited in sorted order: directory
    # listings come back in arbitrary order, and without sorting the same
    # seed could select different files from one run or machine to the next.
    keep_set: set[Path] = set()
    for cam in sorted(bg_by_cam):
        bg_imgs = sorted(bg_by_cam[cam])
        bats_cnt = bats_per_cam.get(cam, 0)
        if bats_cnt == 0:
            # no bats from this camera → keep all background (or tweak logic here)
            keep_set.update(bg_imgs)
            continue

        max_keep = int(bats_cnt * RATIO)
        if len(bg_imgs) <= max_keep:
            keep_set.update(bg_imgs)
        else:
            keep_set.update(rng.sample(bg_imgs, k=max_keep))

    # 4️⃣ Delete the rest (both image & matching mask)
    removed = 0
    for bg_imgs in bg_by_cam.values():
        for img in bg_imgs:
            if img in keep_set:
                continue
            img.unlink(missing_ok=True)
            # A missing mask is tolerated.
            mask = bg_msk_dir / f"{img.stem}_mask{IMG_SUFFIX}"
            mask.unlink(missing_ok=True)
            removed += 1
    print(f"  ✅ {split_root.name}: removed {removed} background images")

# ─────────────────────────────────────────────────────────────────────────────
# 🚀 Main
# ─────────────────────────────────────────────────────────────────────────────
def undersample_all(base_path: Path = BASE_LOOCV_PATH):
    """Undersample the background class of every ``loo_temp_*`` split.

    One generator seeded with ``RNG_SEED`` is shared by all splits. The
    splits are therefore processed in sorted order: ``Path.iterdir`` has no
    defined order, and the random numbers a split receives depend on how
    many were consumed by the splits handled before it.

    Args:
        base_path: directory that contains the ``loo_temp_*`` folders.
    """
    rng = random.Random(RNG_SEED)
    for temp_dir in sorted(base_path.iterdir()):
        if temp_dir.is_dir() and temp_dir.name.startswith("loo_temp_"):
            undersample_split(temp_dir, rng)
    print("🎉 Undersampling completed for all LOOCV splits.")

if __name__ == "__main__":
    undersample_all()


### KNN per GITE


In [None]:
import os
import random
import numpy as np
from PIL import Image
import cv2
from tqdm import tqdm

import torch
import torch.nn as nn
import torchvision
from torchvision import transforms

from sklearn.cluster import KMeans
from concurrent.futures import ThreadPoolExecutor, as_completed

# -------------------------------
# CONFIGURATION
# -------------------------------
# Root folder holding the loo_temp_* splits, and the class subfolders read from it.
sup_dir = "LOOCV_from_back_substraction_test_3"
classes = ["background", "bats"]

# Train/test ratio (e.g. 80/20)
# NOTE(review): the original comment here also mentioned a total image count
# for the final dataset, but no such setting is defined at this point.
train_ratio = 0.8

# Where to save the final balanced datasets (created right away if missing)
balanced_root = "PBAS_balanced_LOOC_datasets_3"
os.makedirs(balanced_root, exist_ok=True)

# We'll only run the heavy ResNet on SUBSET_SIZE images if we have more than 'desired_count'
# in a subset. Adjust as needed.
# NOTE(review): 'desired_count' is not defined in this part of the file — confirm where it comes from.
SUBSET_SIZE = 500
N_CLUSTERS = 80  # how many clusters we form in deep-feature space

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ResNet-50 with its final fully connected layer replaced by an identity,
# so a forward pass returns the features that would feed that layer.
# The model is put in eval mode and moved to `device` once, at import time.
# NOTE(review): `pretrained=True` is reported as deprecated in newer
# torchvision releases in favour of `weights=` — confirm the installed version.
feature_extractor = torchvision.models.resnet50(pretrained=True)
feature_extractor.fc = nn.Identity()
feature_extractor = feature_extractor.eval().to(device)

# Preprocessing applied before feature extraction: resize to 224x224,
# convert to a tensor, then normalise each channel with the given mean / std
# (presumably the ImageNet statistics used with torchvision's pretrained
# models — verify against the torchvision docs).
feature_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

def extract_deep_feature(img_path):
    """
    Extract a 2048-dim embedding from ResNet-50 for a single image.

    The image at ``img_path`` is converted to RGB, preprocessed with
    ``feature_transform`` and passed through ``feature_extractor`` without
    gradient tracking. The result is returned as a NumPy array on the CPU.
    """
    # Keep the file-backed image under the context manager so its handle is
    # released deterministically; convert() returns an independent copy that
    # stays usable after the file is closed.
    with Image.open(img_path) as raw:
        img = raw.convert("RGB")

    x = feature_transform(img).unsqueeze(0).to(device)
    with torch.no_grad():
        feat = feature_extractor(x)  # shape: [1, 2048]
    return feat.squeeze().cpu().numpy()

def compute_color_histogram(img_path, bins=32):
    """
    Build a flattened, normalized per-channel histogram of an image with OpenCV.

    The three channels (in the order OpenCV loads them) each contribute
    ``bins`` values over the range [0, 256), so bins=32 gives 96 values.
    The vector is divided by its sum unless that sum is zero.
    Returns None when OpenCV cannot read the image.
    """
    image = cv2.imread(img_path)
    if image is None:
        return None

    per_channel = [
        cv2.calcHist([image], [channel], None, [bins], [0, 256]).flatten()
        for channel in range(3)
    ]
    hist = np.concatenate(per_channel)

    total = np.sum(hist)
    return hist / total if total > 0 else hist

def l2_distance(a, b):
    """
    Return the norm (as computed by `np.linalg.norm`) of the difference a - b.
    """
    difference = a - b
    return np.linalg.norm(difference)

# -------------------------------
# AUGMENTATION PIPELINES
# -------------------------------
def add_gaussian_noise(img, mean=0, std=15):
    """
    Return a new PIL image equal to `img` plus per-pixel Gaussian noise.

    The noise is drawn from N(mean, std), truncated to int16, added to the
    pixel values in int16 (so the sum cannot wrap around), clipped to
    [0, 255] and converted back to uint8.
    """
    pixels = np.array(img)
    jitter = np.random.normal(mean, std, pixels.shape).astype(np.int16)
    summed = pixels.astype(np.int16) + jitter
    clipped = np.clip(summed, 0, 255)
    return Image.fromarray(clipped.astype(np.uint8))

def random_augmentation_2(img):
    """
    Apply a fixed chain of random torchvision transforms to a PIL image.

    The chain is: horizontal flip, vertical flip, random resized crop back to
    the image's own (height, width), colour jitter, and a rotation of up to
    20 degrees. Returns the transformed image.
    """
    width, height = img.size
    steps = [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        # Crop a random region, then resize it back to the original size.
        transforms.RandomResizedCrop(
            size=(height, width),
            scale=(0.1, 1.0),
            ratio=(0.5, 2.0),
        ),
        transforms.ColorJitter(
            brightness=0.5,
            contrast=0.5,
            saturation=0.5,
            hue=0.1,
        ),
        transforms.RandomRotation(degrees=20),
    ]
    return transforms.Compose(steps)(img)

def random_augmentation(img):
    """
    Apply a randomly assembled augmentation pipeline to a PIL image.

    Always applied: horizontal flip (p=0.5), a random affine transform and
    colour jitter. A perspective distortion is included with probability 0.5
    and a random-erasing patch with probability 0.2. The image is converted
    to a tensor (where erasing, if any, runs) and back to a PIL image, which
    is returned.
    """
    # Geometric stage (operates on the PIL image).
    geometric = [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomAffine(
            degrees=20,
            translate=(0.1, 0.1),
            scale=(0.8, 1.2),
            shear=10,
        ),
    ]
    if random.random() < 0.5:
        geometric.append(
            transforms.RandomPerspective(distortion_scale=0.2, p=1.0)
        )

    # Photometric stage.
    colour = transforms.ColorJitter(
        brightness=0.3,
        contrast=0.3,
        saturation=0.3,
        hue=0.05,
    )

    # Tensor stage: erasing happens between ToTensor and ToPILImage.
    tensor_stage = [transforms.ToTensor()]
    if random.random() < 0.2:
        tensor_stage.append(
            transforms.RandomErasing(
                p=1.0,
                scale=(0.02, 0.10),
                ratio=(0.3, 3.3),
            )
        )
    tensor_stage.append(transforms.ToPILImage())

    return transforms.Compose([*geometric, colour, *tensor_stage])(img)

def save_image(img, out_path):
    """
    Write `img` to `out_path` by delegating to the image's own `save` method.

    The callers visible in this file pass PIL images; any exception raised by
    `save` propagates to the caller.
    """
    img.save(out_path)

# -------------------------------
# FILE LOADING UTILS
# -------------------------------
def load_image_paths(base_dir, subset, cls):
    """
    List the regular files found directly inside base_dir/subset/cls.

    Entries whose name starts with a dot (which covers .ipynb_checkpoints)
    and anything that is not a regular file are skipped. Returns an empty
    list when the directory does not exist. Paths come back in the order
    reported by os.listdir.
    """
    folder = os.path.join(base_dir, subset, cls)
    if not os.path.exists(folder):
        return []

    candidates = (
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if not entry.startswith(".")
    )
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

def get_subset_images(loo_dir, cls):
    """
    Return (train_paths, test_paths) for class `cls` inside one fold folder,
    reading loo_dir/train/cls and loo_dir/test/cls respectively.
    """
    return (
        load_image_paths(loo_dir, "train", cls),
        load_image_paths(loo_dir, "test", cls),
    )

# -------------------------------
# HYBRID TWO-LEVEL DOWNSAMPLING (deep-feature KMeans + colour-histogram assignment)
# -------------------------------
def hybrid_downsample(images, desired_count):
    """
    Pick `desired_count` image paths out of `images` with a two-level scheme.

    1. A random subset of at most SUBSET_SIZE images is embedded with
       `extract_deep_feature` and clustered with KMeans.
    2. Each cluster gets a mean colour histogram; every image outside the
       subset is attached to the cluster whose mean histogram is closest
       (L2 distance). Images OpenCV cannot read are left unassigned.
    3. Images are drawn from each cluster in proportion to its size, and the
       result is trimmed or topped up to exactly `desired_count`.

    Args:
        images: list of image paths.
        desired_count: number of paths wanted.

    Returns:
        `images` itself when it already holds `desired_count` items or fewer,
        an empty list when `desired_count` is not positive, otherwise a list
        of `desired_count` paths.
    """
    current_count = len(images)
    if current_count <= desired_count:
        return images
    if desired_count <= 0:
        # Nothing to keep: skip the expensive feature extraction entirely.
        return []

    print(f" -> Hybrid approach on {current_count} images, target = {desired_count}")
    subset_size = min(SUBSET_SIZE, current_count)
    print(f"    Randomly sampling {subset_size} images for deep feature extraction...")
    deep_subset = random.sample(images, subset_size)

    print(f"    Extracting ResNet features from subset of {subset_size} images...")
    subset_features = np.vstack([
        extract_deep_feature(p)
        for p in tqdm(deep_subset, desc="Extracting deep features")
    ])  # shape: (subset_size, 2048)

    # KMeans refuses to fit more clusters than samples, which used to crash
    # whenever fewer than N_CLUSTERS images were available.
    n_clusters = min(N_CLUSTERS, subset_size)
    print(f"    Running KMeans (k={n_clusters}) on subset embeddings...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(subset_features)

    clusters_dict = {}
    for idx, lbl in enumerate(cluster_labels):
        clusters_dict.setdefault(lbl, []).append(deep_subset[idx])

    print("    Computing average color-hist for each cluster...")
    # (label, mean histogram) pairs in ascending label order; clusters without
    # a single readable image get no centroid and cannot receive new members.
    centroid_items = []
    for lbl in sorted(clusters_dict):
        hist_list = []
        for img_path in clusters_dict[lbl]:
            hist = compute_color_histogram(img_path, bins=32)
            if hist is not None:
                hist_list.append(hist)
        if hist_list:
            centroid_items.append((lbl, np.mean(hist_list, axis=0)))

    remaining = set(images) - set(deep_subset)
    print(f"    Assigning the other {len(remaining)} images by color-hist distance...")
    if centroid_items:
        for p in tqdm(remaining, desc="Assigning remaining"):
            hist = compute_color_histogram(p, bins=32)
            if hist is None:
                continue
            # min() keeps the first (lowest-label) centroid on ties.
            best_label, _ = min(
                centroid_items,
                key=lambda item: l2_distance(hist, item[1]),
            )
            clusters_dict[best_label].append(p)

    # Proportional pick from each cluster
    all_count = sum(len(v) for v in clusters_dict.values())
    selected = []
    for paths_in_cluster in clusters_dict.values():
        csize = len(paths_in_cluster)
        pick = int(round(csize / all_count * desired_count))
        if pick >= csize:
            selected.extend(paths_in_cluster)
        else:
            selected.extend(random.sample(paths_in_cluster, pick))

    # Adjust if rounding made us overshoot/undershoot
    if len(selected) > desired_count:
        selected = random.sample(selected, desired_count)
    elif len(selected) < desired_count:
        shortfall = desired_count - len(selected)
        # Top up only from images not chosen yet, so the result holds no
        # duplicate paths (duplicates would overwrite each other on save).
        already_chosen = set(selected)
        pool = [p for p in images if p not in already_chosen]
        if len(pool) >= shortfall:
            selected.extend(random.sample(pool, shortfall))
        else:
            # Only reachable when `images` itself contains repeated paths.
            selected.extend(pool)
            selected.extend(random.sample(images, desired_count - len(selected)))

    print(f"    Final selection: {len(selected)} (desired {desired_count}).")
    return selected

# -------------------------------
# balance_subset: bring one list of images to a target size
# -------------------------------
def balance_subset(original_list, desired_count, do_augmentation=True):
    """
    Bring one list of image paths to exactly `desired_count` entries.

    - More images than wanted: delegate the selection to hybrid_downsample.
    - Exactly enough: keep everything.
    - Too few: keep everything and pad with randomly chosen source paths
      (drawn with replacement); padded entries are flagged for augmentation
      when `do_augmentation` is truthy, otherwise they are plain repeats.

    Returns a list of (path, is_augmented) tuples, or an empty list (with a
    warning) when there are no images but some are wanted.
    """
    available = len(original_list)

    if available == 0 and desired_count > 0:
        print("WARNING: No images found but desired_count > 0. Can't do much.")
        return []

    if available > desired_count:
        print(f"  - {available} images > {desired_count}, using HYBRID approach.")
        kept = hybrid_downsample(original_list, desired_count)
        return [(path, False) for path in kept]

    if available == desired_count:
        print(f"  - Exactly the number needed ({available}). No downsampling.")
        return [(path, False) for path in original_list]

    # Fewer images than desired: keep the originals, then pad.
    print(f"  - Fewer images than desired ({available} < {desired_count}). Will augment.")
    missing = desired_count - available
    pad_flag = bool(do_augmentation)
    result = [(path, False) for path in original_list]
    result.extend((random.choice(original_list), pad_flag) for _ in range(missing))
    return result

# -------------------------------
# Per-gite balancing: get_gite_name + balance_subset_per_gite
# -------------------------------
def get_gite_name(image_path):
    """
    Derive the gite (site/camera) name from an image file name.

    The directory part and the extension are dropped, then everything from
    the first '_IM_' onwards is cut off. For example
      'Anthisnes_Chateau_de_Xhos_Camera_1_HIT_IM_00329_crop_4.png'
    gives
      'Anthisnes_Chateau_de_Xhos_Camera_1_HIT'.
    When '_IM_' does not occur, the whole extension-less name is returned.
    Adjust this function if your naming is different.
    """
    stem, _ext = os.path.splitext(os.path.basename(image_path))
    # partition() splits on the first occurrence and returns the whole string
    # as the head when the marker is absent.
    gite, _marker, _tail = stem.partition("_IM_")
    return gite

def balance_subset_per_gite(original_list, desired_count, do_augmentation=True):
    """
    Balance a list of image paths to `desired_count` while keeping each
    gite's share of the data.

    1) Group `original_list` by gite name (see get_gite_name).
    2) Give every gite a target of round(share * desired_count).
    3) Run balance_subset on each gite group with that target.
    4) Fix the rounding mismatch: randomly drop surplus entries, or pad with
       randomly chosen source paths flagged with `do_augmentation`.
    5) Concatenate the per-gite results.

    Args:
        original_list: list of image paths.
        desired_count: total number of entries wanted.
        do_augmentation: whether padded entries are marked for augmentation.

    Returns:
        List of (path, is_augmented) tuples; empty (with a warning) when
        there are no images but some are wanted.
    """
    if len(original_list) == 0 and desired_count > 0:
        print("WARNING: No images found but desired_count > 0. Can't do much.")
        return []

    # 1) Group by gite
    gite_dict = {}
    for p in original_list:
        gite_dict.setdefault(get_gite_name(p), []).append(p)

    # 2) + 3) Proportional target per gite, balanced independently
    total_images = len(original_list)
    gite_picks = {}
    for gite, paths in gite_dict.items():
        proportion = len(paths) / total_images
        gite_desired = int(round(proportion * desired_count))
        gite_picks[gite] = balance_subset(
            paths, gite_desired, do_augmentation=do_augmentation
        )
    sum_picked = sum(len(items) for items in gite_picks.values())

    # 4) If there's a rounding mismatch, fix it
    if sum_picked > desired_count:
        over = sum_picked - desired_count
        print(f"  [GITE] Overshoot by {over} images; removing randomly.")
        # Drop by position, not by value: padded entries are frequently
        # identical (path, True) tuples, and removing by value would discard
        # every copy at once and leave fewer than desired_count entries.
        positions = [
            (g, i) for g, items in gite_picks.items() for i in range(len(items))
        ]
        to_drop = set(random.sample(positions, over))
        gite_picks = {
            g: [item for i, item in enumerate(items) if (g, i) not in to_drop]
            for g, items in gite_picks.items()
        }

    elif sum_picked < desired_count:
        shortfall = desired_count - sum_picked
        print(f"  [GITE] Undershoot by {shortfall} images; adding randomly.")
        # Pad from the whole original list; entries are marked as augmented
        # (or plain duplicates) according to do_augmentation.
        for _ in range(shortfall):
            src_path = random.choice(original_list)
            gite_picks[get_gite_name(src_path)].append((src_path, do_augmentation))

    # 5) Combine final
    final_combined = []
    for items in gite_picks.values():
        final_combined.extend(items)
    return final_combined

# -------------------------------
# MAIN PROCESS
# -------------------------------
def process_and_save(img_info, out_path, job_index):
    """
    Write one balanced-dataset entry into the folder `out_path`.

    Args:
        img_info: (src_path, is_augmented) tuple; a 1-tuple is treated as
            not augmented.
        out_path: destination directory.
        job_index: number unique within the current fold, used to build the
            file name of augmented images.

    Non-augmented images keep their file name. Augmented images are run
    through random_augmentation and saved as "<name>_aug_<job_index><ext>";
    the unique index replaces a random 4-digit suffix that could collide and
    silently overwrite an earlier augmentation of the same source image.
    Open/save errors are printed; errors raised by the augmentation itself
    propagate to the caller (the future) so they can be reported there.
    """
    # unpack defensively
    src_path = img_info[0]
    is_aug = img_info[1] if len(img_info) > 1 else False

    try:
        src_img = Image.open(src_path)
    except Exception as e:
        print(f"Error opening {src_path}: {e}")
        return

    # `with` closes the source file once the image has been written.
    with src_img:
        if is_aug:
            img = random_augmentation(src_img)
            name, ext = os.path.splitext(os.path.basename(src_path))
            out_name = f"{name}_aug_{job_index:06d}{ext}"
        else:
            img = src_img
            out_name = os.path.basename(src_path)

        final_path = os.path.join(out_path, out_name)
        try:
            save_image(img, final_path)
        except Exception as e:
            print(f"Error saving {final_path}: {e}")


lst_LOOCV = [f for f in os.listdir(sup_dir) if os.path.isdir(os.path.join(sup_dir, f))]
for loo_folder in sorted(lst_LOOCV):
    print("="*70)
    print(f"Processing LOO folder: {loo_folder}")

    loo_dir = os.path.join(sup_dir, loo_folder)

    # 1) Gather images
    train_bg, test_bg = get_subset_images(loo_dir, "background")
    train_bat, test_bat = get_subset_images(loo_dir, "bats")

    # 2) Report the class distribution of both splits
    total_train = len(train_bg) + len(train_bat)
    total_test = len(test_bg) + len(test_bat)
    if total_train > 0:
        print(f" - Train BG: {len(train_bg)} ({len(train_bg)/total_train:.2f}), "
              f"Train Bats: {len(train_bat)} ({len(train_bat)/total_train:.2f})")
    else:
        print(" - Train BG: 0, Train Bats: 0")
    if total_test > 0:
        print(f" - Test BG: {len(test_bg)} ({len(test_bg)/total_test:.2f}), "
              f"Test Bats: {len(test_bat)} ({len(test_bat)/total_test:.2f})")
    else:
        print(" - Test BG: 0, Test Bats: 0")

    # 3) Balance: both train classes are brought up to the size of the
    #    larger one; the test split is passed through unchanged.
    max_count = max(len(train_bg), len(train_bat))

    print("Balancing Train BG, per-gite ...")
    balanced_train_bg = balance_subset_per_gite(train_bg, max_count, do_augmentation=True)

    print("Balancing Train Bats, per-gite ...")
    balanced_train_bat = balance_subset_per_gite(train_bat, max_count, do_augmentation=True)

    print("Balancing Test BG (no augmentation, no per-gite needed) ...")
    balanced_test_bg   = balance_subset(test_bg, len(test_bg), do_augmentation=False)

    print("Balancing Test Bats (no augmentation, no per-gite needed) ...")
    balanced_test_bat  = balance_subset(test_bat, len(test_bat), do_augmentation=False)

    # 4) Create output dirs just once
    out_dir = os.path.join(balanced_root, f"balanced_{loo_folder}")
    for split in ("train", "test"):
        for cls in classes:
            os.makedirs(os.path.join(out_dir, split, cls), exist_ok=True)

    # 5) Submit one job per output image; map each future to its source path
    #    so failures can be attributed.
    jobs = {}
    with ThreadPoolExecutor(max_workers=os.cpu_count() or 4) as pool:
        for lst, split, cls in [
            (balanced_train_bg, "train", "background"),
            (balanced_train_bat, "train", "bats"),
            (balanced_test_bg,  "test",  "background"),
            (balanced_test_bat, "test",  "bats"),
        ]:
            dest = os.path.join(out_dir, split, cls)
            for img_info in lst:
                # len(jobs) grows with every submission -> unique per fold.
                fut = pool.submit(process_and_save, img_info, dest, len(jobs))
                jobs[fut] = img_info[0]

        # 6) Wait for all of them and surface worker failures instead of
        #    silently losing images.
        for fut in as_completed(jobs):
            err = fut.exception()
            if err is not None:
                print(f"Error processing {jobs[fut]}: {err}")

    print(f"Finished balancing '{loo_folder}'. Output at: {out_dir}\n")