The script iterates through the folders to find videos for each bird:
* Extracts frames with OpenCV at a specified interval
* Filters out redundant frames through perceptual hashing (pHash). I decided to start with pHash because it is faster and less computationally expensive, and because I don't need clustering. If it fails, I will consider other feature-matching alternatives such as ORB, SIFT, or SURF (within a k-means algorithm) for more advanced comparisons.
* Saves the selected frames into bird-specific folders

It also generates a CSV recording:
* Original video name
* Timestamp of the extracted frame in the original video
* Filename of the frame

This is debatable, but for now I have decided to filter similar frames (using pHash) before they are mask-segmented. That way, the similarity check takes into account how close to or far from the camera the birds are, and where in the frame they appear, rather than focusing strictly on posture, which I assume would be the case if I filtered for similar frames after the images had been cropped.
In general, pHash works best when applied to full images with consistent backgrounds, as it captures overall image structure. Removing the background would probably lead to incorrectly marking unique frames as duplicates

## Frame extraction

In [None]:
import os
from pathlib import Path
import cv2
import csv
from tqdm import tqdm

# Frame extraction: walk <input_dir>/<folder>/<bird>/*.MP4, save one frame every
# `sampling_interval` frames as PNG under <output_dir>/<bird>/, and record one
# metadata row per saved frame in raw_frames_metadata.csv.

input_dir = "/Volumes/Sarequi" # path to the hard drive containing all valid videos 
output_dir = "/Users/sarah/Bowerbird-ID/3_Frame_sampling/1_Raw_extracted_frames"
sampling_interval = 120  # Extract a frame every 120 frames
# NOTE(review): despite its name, this cap is enforced per *video* (the counter
# below is reset for every video), so a bird can end up with up to
# max_videos_per_bird * max_frames_per_bird frames. Confirm which is intended.
max_frames_per_bird = 3
max_videos_per_bird = 3  # Process only 3 videos per bird folder
relevant_folders = ["2_hdd", "3_hdd"]

os.makedirs(output_dir, exist_ok=True)
csv_data = []  # rows of [bird_name, video file name, timestamp (s), frame file name]

for folder_name in relevant_folders:
    folder_path = Path(input_dir) / folder_name

    # The drive may be unmounted or a folder may be absent; skip it instead of
    # letting iterdir() abort the whole run with FileNotFoundError.
    if not folder_path.is_dir():
        print(f"Warning: folder not found, skipping: {folder_path}")
        continue

    # Sorted so the subset of birds/videos that gets sampled is reproducible.
    for bird_folder in sorted(folder_path.iterdir()):
        if not bird_folder.is_dir():
            continue

        bird_name = bird_folder.name  # Extract Bird ID (e.g., "B02")
        bird_output_dir = os.path.join(output_dir, bird_name)
        os.makedirs(bird_output_dir, exist_ok=True)

        video_count = 0
        for video_file in sorted(bird_folder.glob("*.MP4")):
            if video_count >= max_videos_per_bird:
                break

            cap = cv2.VideoCapture(str(video_file))
            if not cap.isOpened():
                # Unreadable files must not use up the per-bird video budget.
                print(f"Warning: could not open video, skipping: {video_file}")
                cap.release()
                continue

            video_name = video_file.stem
            frame_count = 0  # index of the frame currently being examined
            saved_count = 0  # frames written so far for this video

            try:
                while saved_count < max_frames_per_bird:
                    ret, frame = cap.read()
                    if not ret:  # end of stream or decode failure
                        break

                    if frame_count % sampling_interval == 0:
                        # Capture position reported after the read, in seconds.
                        timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
                        frame_name = f"{video_name}_frame{frame_count}.png"
                        frame_path = os.path.join(bird_output_dir, frame_name)

                        # Only record frames that were actually written, so the
                        # CSV never references a file that does not exist.
                        if cv2.imwrite(frame_path, frame):
                            csv_data.append([bird_name, video_file.name, timestamp, frame_name])
                            saved_count += 1
                        else:
                            print(f"Warning: failed to write frame {frame_path}")

                    frame_count += 1
            finally:
                # Always free the decoder, even if something above raises.
                cap.release()

            video_count += 1

# Save metadata to CSV
metadata_csv = os.path.join(output_dir, "raw_frames_metadata.csv")
with open(metadata_csv, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["Bird ID", "Video Name", "Timestamp", "Frame Name"])  # Added "Bird ID"
    writer.writerows(csv_data)

print(f"Frame extraction complete. Metadata saved to {metadata_csv}")

FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/Sarequi/2_hdd'

## YOLO filtering

In [None]:
from ultralytics import YOLO
import pandas as pd
import os
import shutil
from tqdm import tqdm

# YOLO filtering: run the detector on every raw frame listed in the metadata CSV,
# copy frames with a bird detection to <filtered_frames_dir>/<bird>/ and the rest
# to <filtered_out_frames_dir>/, then write the metadata of the kept frames.

yolo_model = YOLO('yolo11m.pt') 

# The pretrained checkpoint detects many object classes; restrict prediction to
# the class named "bird" so that a detection of something else does not count as
# "bird present". If the loaded model has no such class, fall back to accepting
# any detection (the previous behaviour).
bird_class_ids = [cls_id for cls_id, cls_name in yolo_model.names.items() if cls_name == "bird"]
if not bird_class_ids:
    print("Warning: model has no 'bird' class; accepting detections of any class.")

raw_metadata_csv = "/Users/sarah/Bowerbird-ID/3_Frame_sampling/1_Raw_extracted_frames/raw_frames_metadata.csv"
raw_frames_dir = "/Users/sarah/Bowerbird-ID/3_Frame_sampling/1_Raw_extracted_frames"  # Directory containing raw frames with subfolders
filtered_frames_dir = "/Users/sarah/Bowerbird-ID/3_Frame_sampling/2_YOLO_filtered_frames"  # Directory for valid frames
filtered_out_frames_dir = "/Users/sarah/Bowerbird-ID/3_Frame_sampling/Filtered_out_frames_no_bird"  # Directory for invalid frames

os.makedirs(filtered_frames_dir, exist_ok=True)
os.makedirs(filtered_out_frames_dir, exist_ok=True)

# Read the path components as strings so IDs/names that look numeric keep their
# exact spelling (e.g. leading zeros) and can be joined into paths.
metadata = pd.read_csv(raw_metadata_csv, dtype={"Bird ID": str, "Frame Name": str})

filtered_metadata = []  # metadata rows of frames in which a bird was detected

print("Running YOLO on raw frames...")
for _, row in tqdm(metadata.iterrows(), total=len(metadata)):
    frame_name = row["Frame Name"]  
    bird_id = row["Bird ID"]  

    frame_path = os.path.join(raw_frames_dir, bird_id, frame_name)
    if not os.path.exists(frame_path):
        print(f"Warning: Frame not found at {frame_path}")
        continue

    # Run YOLO det
    results = yolo_model.predict(
        frame_path,
        conf=0.3,
        classes=bird_class_ids or None,
        verbose=False,
    )

    if len(results[0].boxes) > 0:  # If a bird is detected...
        # Create subdirectory structure in the filtered frames directory
        filtered_subfolder_dir = os.path.join(filtered_frames_dir, bird_id)
        os.makedirs(filtered_subfolder_dir, exist_ok=True)

        # Copy valid frame to the corresponding subdirectory
        valid_frame_path = os.path.join(filtered_subfolder_dir, frame_name)
        shutil.copy(frame_path, valid_frame_path)  # Copy valid frame

        # Add row to new metadata
        filtered_metadata.append(row)
    else:  # If no bird is detected...
        # Copy invalid frame directly to the "filtered out" directory (no bird ID subfolder)
        # NOTE(review): frames from different birds that share a file name would
        # overwrite each other here.
        invalid_frame_path = os.path.join(filtered_out_frames_dir, frame_name)
        shutil.copy(frame_path, invalid_frame_path)  # Copy invalid frame directly

# Save updated metadata for valid frames
# Passing the column names keeps the header row even when no frame was kept, so
# the next step can still read the file.
filtered_metadata_csv = os.path.join(filtered_frames_dir, "yolodet_frames_metadata.csv")
pd.DataFrame(filtered_metadata, columns=metadata.columns).to_csv(filtered_metadata_csv, index=False)

print("Done!")
print(f"Filtered frames (with birds) saved in: {filtered_frames_dir}")
print(f"Filtered-out frames (no birds) saved in: {filtered_out_frames_dir}")
print(f"Updated metadata for valid frames saved at: {filtered_metadata_csv}")

Running YOLO on raw frames...


  0%|          | 0/63 [00:00<?, ?it/s][W NNPACK.cpp:64] Could not initialize NNPACK! Reason: Unsupported hardware.
100%|██████████| 63/63 [01:10<00:00,  1.12s/it]

Done!
Filtered frames (with birds) saved in: /Users/sarah/Bowerbird-ID/3_Frame_sampling/YOLO_filtered_frames
Filtered-out frames (no birds) saved in: /Users/sarah/Bowerbird-ID/3_Frame_sampling/Filtered_out_frames_no_bird
Updated metadata for valid frames saved at: /Users/sarah/Bowerbird-ID/3_Frame_sampling/YOLO_filtered_frames/yolodet_frames_metadata.csv





## pHASH filtering

In [None]:
import os
import shutil

import imagehash
import pandas as pd
from PIL import Image
from tqdm import tqdm

# pHash filtering: for each bird, keep a frame only if its perceptual hash
# differs from every previously kept frame of that bird by more than
# `similarity_threshold`; copy kept frames to <phash_filtered_frames_dir>/<bird>/
# and write their metadata.

filtered_metadata_csv = "/Users/sarah/Bowerbird-ID/3_Frame_sampling/2_YOLO_filtered_frames/yolodet_frames_metadata.csv"
filtered_frames_dir = "/Users/sarah/Bowerbird-ID/3_Frame_sampling/2_YOLO_filtered_frames"  # Directory for valid YOLO-filtered frames
phash_filtered_frames_dir = "/Users/sarah/Bowerbird-ID/3_Frame_sampling/3_pHASH_filtered_frames"  # Directory for pHASH-filtered frames
os.makedirs(phash_filtered_frames_dir, exist_ok=True)

# Perceptual Hashing Parameters
# Frames whose hash distance to an already kept frame is <= this value are
# treated as duplicates.
similarity_threshold = 5

# Load metadata
# Path components are read as strings so they keep their exact spelling.
metadata = pd.read_csv(filtered_metadata_csv, dtype={"Bird ID": str, "Frame Name": str})

unique_metadata = []  # metadata rows of the frames that were kept
unique_hashes = {}    # bird_id -> list of hashes of the frames kept for that bird

total_frames = len(metadata)
deleted_frames = 0  # frames dropped as near-duplicates
missing_frames = 0  # frames listed in the metadata but absent on disk

print("Filtering similar frames across bird folders using perceptual hashing...")
for _, row in tqdm(metadata.iterrows(), total=total_frames):
    frame_name = row["Frame Name"]  
    bird_id = row["Bird ID"]  

    frame_path = os.path.join(filtered_frames_dir, bird_id, frame_name)
    if not os.path.exists(frame_path):
        print(f"Warning: Frame not found at {frame_path}")
        missing_frames += 1
        continue

    # Close the image file as soon as the hash has been computed.
    with Image.open(frame_path) as img:
        img_hash = imagehash.phash(img)

    bird_hashes = unique_hashes.setdefault(bird_id, [])

    # Check if the hash is unique for this bird
    if any(img_hash - seen <= similarity_threshold for seen in bird_hashes):
        # Too close to a frame that was already kept: count it as removed.
        deleted_frames += 1
        continue

    # Add to unique list
    unique_metadata.append(row)
    bird_hashes.append(img_hash)

    # Create subdirectory structure for pHASH-filtered frames
    phash_subfolder_dir = os.path.join(phash_filtered_frames_dir, bird_id)
    os.makedirs(phash_subfolder_dir, exist_ok=True)

    # Copy unique frame to the corresponding subdirectory
    unique_frame_path = os.path.join(phash_subfolder_dir, frame_name)
    shutil.copy(frame_path, unique_frame_path)

# Save updated metadata for pHASH-filtered frames
# Passing the column names keeps the header row even when nothing was kept.
phash_metadata_csv = os.path.join(phash_filtered_frames_dir, "phash_frames_metadata.csv")
pd.DataFrame(unique_metadata, columns=metadata.columns).to_csv(phash_metadata_csv, index=False)

# Count what was actually kept; frames missing on disk are neither kept nor
# deleted, so subtracting from the total would over-report the kept frames.
kept_frames = len(unique_metadata)
deleted_percentage = (deleted_frames / total_frames) * 100 if total_frames > 0 else 0

print("\nFrame Filtering Summary:")
print(f"Total frames processed: {total_frames}")
print(f"Frames kept: {kept_frames}")
print(f"Frames deleted: {deleted_frames}")
if missing_frames:
    print(f"Frames missing on disk (skipped): {missing_frames}")
print(f"Percentage of frames deleted: {deleted_percentage:.2f}%")
print(f"Unique frames saved in: {phash_filtered_frames_dir}")
print(f"Updated metadata for unique frames saved at: {phash_metadata_csv}")

Filtering similar frames across bird folders using perceptual hashing...


100%|██████████| 41/41 [00:04<00:00,  9.57it/s]


Frame Filtering Summary:
Total frames processed: 41
Frames kept: 17
Frames deleted: 24
Percentage of frames deleted: 58.54%
Unique frames saved in: /Users/sarah/Bowerbird-ID/3_Frame_sampling/pHASH_filtered_frames
Updated metadata for unique frames saved at: /Users/sarah/Bowerbird-ID/3_Frame_sampling/pHASH_filtered_frames/phash_frames_metadata.csv



