This notebook detects the bird in each image, crops it out, and generates a segmentation mask of the cropped bird, using a YOLO11 segmentation model (`yolo11x-seg.pt`) for both the detection and the mask step.

In [1]:
import os
import cv2
import numpy as np
from tqdm import tqdm
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
from detectron2 import model_zoo
import matplotlib.pyplot as plt
from ultralytics import YOLO

In [74]:
import os
from tqdm import tqdm
import cv2
import numpy as np
from ultralytics import YOLO
from scipy.ndimage import label

# --- Mask clean-up parameters -------------------------------------------
# Connected mask regions smaller than this many pixels are discarded.
MIN_BLOB_PIXELS = 5_000
# Share of the mask height (measured from the bottom edge) that the
# sparse-row filter looks at.
BOTTOM_FRACTION_ROW = 0.25
# Share of the mask height (measured from the bottom edge) that the
# narrow-run filter looks at.
BOTTOM_FRACTION_NARROW = 0.5
# A row is cleared when at least this fraction of its pixels is background.
HORIZONTAL_THRESHOLD = 0.8
# Horizontal runs of mask pixels this wide or narrower are cleared.
NARROW_SEGMENT_THRESHOLD = 100

# --- Input frames and segmentation model --------------------------------
# One sub-directory per bird ID is expected below input_dir.
input_dir = r"D:\Bowerbird-ID\3_Frame_sampling\Fully_processed_frames"
yolo_version = 'x'  # size suffix of the YOLO11 segmentation checkpoint
yolo_model = YOLO(f'yolo11{yolo_version}-seg.pt')

# --- Output location -----------------------------------------------------
# Masked crops are written to <output_root>/mask_<yolo_version>/<bird_id>/.
output_root = r"D:\Bowerbird-ID\4_Run_YOLOv11_det_seg\SegProcessedFramesX"
mask_dir = os.path.join(output_root, f"mask_{yolo_version}")
os.makedirs(mask_dir, exist_ok=True)

def _clear_sparse_bottom_rows(mask, bottom_fraction, black_threshold):
    """Blank mostly-empty rows in the bottom part of ``mask`` (in place).

    Args:
        mask: 2-D writable boolean array.
        bottom_fraction: share of the mask height, counted from the bottom
            edge, that is examined.
        black_threshold: a row is cleared when its fraction of False pixels
            is greater than or equal to this value.
    """
    first_row = int(mask.shape[0] * (1 - bottom_fraction))
    bottom = mask[first_row:, :]  # view, so edits land directly in `mask`
    if bottom.size == 0:
        return
    black_fraction = 1 - bottom.sum(axis=1) / bottom.shape[1]
    bottom[black_fraction >= black_threshold, :] = False


def _remove_narrow_runs(mask, bottom_fraction, max_width):
    """Clear short horizontal runs in the bottom part of ``mask`` (in place).

    Args:
        mask: 2-D writable boolean array.
        bottom_fraction: share of the mask height, counted from the bottom
            edge, that is examined.
        max_width: runs of consecutive True pixels whose length is less than
            or equal to this value are set to False.
    """
    first_row = int(mask.shape[0] * (1 - bottom_fraction))
    for row in mask[first_row:]:  # each `row` is a view into `mask`
        # Pad with False on both sides so runs touching an edge still
        # produce one rising (+1) and one falling (-1) transition.
        edges = np.diff(np.concatenate(([False], row, [False])).astype(np.int8))
        run_starts = np.flatnonzero(edges == 1)
        run_ends = np.flatnonzero(edges == -1)  # exclusive end index
        for start, end in zip(run_starts, run_ends):
            if end - start <= max_width:
                row[start:end] = False


def _keep_large_blobs(mask, min_pixels):
    """Return a copy of ``mask`` holding only components of >= ``min_pixels``.

    Components are found with ``scipy.ndimage.label`` using its default
    structuring element. Sizes come from a single ``np.bincount`` pass
    instead of one full-image comparison per label.
    """
    labeled, _ = label(mask)
    sizes = np.bincount(labeled.ravel(), minlength=1)
    keep = sizes >= min_pixels
    keep[0] = False  # label 0 is the background
    return keep[labeled]


print("Processing images by bird ID")
for bird_id in tqdm(os.listdir(input_dir)):
    bird_id_path = os.path.join(input_dir, bird_id)
    if not os.path.isdir(bird_id_path):
        continue

    bird_mask_dir = os.path.join(mask_dir, bird_id)
    os.makedirs(bird_mask_dir, exist_ok=True)

    for image_name in tqdm(os.listdir(bird_id_path), desc=f"Processing {bird_id}"):
        if not image_name.lower().endswith('.png'):
            continue
        image_path = os.path.join(bird_id_path, image_name)

        # Stage 1: detect on the full frame and crop to the best box.
        # NOTE(review): the call is not restricted to a bird class, so the
        # top-scoring box may belong to any class the model knows - confirm
        # whether a `classes=` filter is wanted.
        detection = yolo_model.predict(image_path, conf=0.6, verbose=False)[0]
        if len(detection.boxes) == 0:  # Skip if no detections
            continue

        # Pick the highest-confidence box explicitly rather than relying on
        # the order in which results are returned.
        best_box = int(detection.boxes.conf.argmax())
        x1, y1, x2, y2 = map(int, detection.boxes.xyxy[best_box])
        cropped = detection.orig_img[y1:y2, x1:x2]
        if cropped.size == 0:  # Degenerate box after integer truncation
            continue

        # Stage 2: segment the crop. retina_masks=True makes masks.data match
        # the crop resolution; without it the mask is at the padded inference
        # size and a plain resize would shift it relative to the crop.
        segmentation = yolo_model.predict(cropped, conf=0.8, retina_masks=True, verbose=False)[0]
        if segmentation.masks is None:  # Skip if no masks
            continue

        best_mask = int(segmentation.boxes.conf.argmax())
        mask = segmentation.masks.data[best_mask].cpu().numpy().astype(bool)

        # Safety net: make sure the mask and the crop have identical height/width.
        if mask.shape[:2] != cropped.shape[:2]:
            mask = cv2.resize(mask.astype(np.uint8), (cropped.shape[1], cropped.shape[0]),
                              interpolation=cv2.INTER_NEAREST).astype(bool)

        # Clean-up, in the same order as before: sparse rows near the bottom,
        # then narrow runs in the lower part, then small connected components.
        _clear_sparse_bottom_rows(mask, BOTTOM_FRACTION_ROW, HORIZONTAL_THRESHOLD)
        _remove_narrow_runs(mask, BOTTOM_FRACTION_NARROW, NARROW_SEGMENT_THRESHOLD)
        filtered_mask = _keep_large_blobs(mask, MIN_BLOB_PIXELS)

        # Nothing survived the filters: do not write an all-black image.
        if not filtered_mask.any():
            continue

        # Save masked image (pixels outside the mask stay black)
        mask_rgb = np.zeros_like(cropped)
        mask_rgb[filtered_mask] = cropped[filtered_mask]
        mask_filename = os.path.join(bird_mask_dir, f"{os.path.splitext(image_name)[0]}_mask.png")
        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(mask_filename, mask_rgb):
            tqdm.write(f"Warning: could not write {mask_filename}")

print("Complete")


Processing images by bird ID


Processing B02: 100%|██████████| 20/20 [00:51<00:00,  2.59s/it]
Processing B03: 100%|██████████| 20/20 [00:46<00:00,  2.35s/it]
Processing B04: 100%|██████████| 20/20 [00:46<00:00,  2.35s/it]
Processing B05: 100%|██████████| 20/20 [00:52<00:00,  2.65s/it]
Processing B07: 100%|██████████| 20/20 [00:52<00:00,  2.63s/it]
Processing B11: 100%|██████████| 20/20 [00:49<00:00,  2.46s/it]
Processing B18: 100%|██████████| 20/20 [00:49<00:00,  2.46s/it]
Processing B23: 100%|██████████| 20/20 [00:51<00:00,  2.57s/it]
Processing B26: 100%|██████████| 20/20 [00:51<00:00,  2.56s/it]
Processing B29: 100%|██████████| 20/20 [00:48<00:00,  2.41s/it]
Processing B30: 100%|██████████| 20/20 [00:50<00:00,  2.52s/it]
Processing B31: 100%|██████████| 20/20 [00:42<00:00,  2.14s/it]
Processing B47: 100%|██████████| 20/20 [00:57<00:00,  2.86s/it]
Processing B49:  45%|████▌     | 9/20 [00:32<00:39,  3.63s/it]
 76%|███████▋  | 13/17 [11:23<03:30, 52.55s/it]


KeyboardInterrupt: 

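The following cell differs from the one above in that it is driven by the frame metadata CSV: it processes only the frames listed there and writes a new metadata CSV that additionally records the file name of each masked frame.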

In [82]:
import os
import pandas as pd
from tqdm import tqdm
import cv2
import numpy as np
from ultralytics import YOLO
from scipy.ndimage import label

# Input and output directories
input_dir = r"D:\Bowerbird-ID\3_Frame_sampling\Fully_processed_frames"
input_metadata_csv = r"D:\Bowerbird-ID\3_Frame_sampling\Fully_processed_frames\processed_frames_metadata.csv"
output_dir = r"D:\Bowerbird-ID\4_Run_YOLOv11_det_seg\Output_masked_frames"
new_metadata_csv = os.path.join(output_dir, "masked_frames_metadata.csv")

yolo_version = 'x'
yolo_model = YOLO(f'yolo11{yolo_version}-seg.pt')

# Filtering parameters
MIN_BLOB_PIXELS = 5000  # Connected mask regions smaller than this are dropped
BOTTOM_FRACTION_ROW = 1 / 5  # Process only the bottom fifth of the image for horizontal row filtering
BOTTOM_FRACTION_NARROW = 1 / 3  # Process only the bottom third of the image for narrow structure filtering
HORIZONTAL_THRESHOLD = 0.8  # Rows with >=80% black pixels will be removed
NARROW_SEGMENT_THRESHOLD = 100  # Maximum width of narrow structures to remove

# The output directory must exist before the metadata file can be created in it;
# without this, a first run fails on the open() call below.
os.makedirs(output_dir, exist_ok=True)

# Ensure the new metadata CSV has proper headers.
# An existing file is left untouched, so re-running the notebook appends to it.
if not os.path.exists(new_metadata_csv):
    with open(new_metadata_csv, 'w') as f:
        f.write("Bird ID,Video Name,Frame Name,Timestamp (s),Masked Image\n")

metadata = pd.read_csv(input_metadata_csv)

def _clear_sparse_bottom_rows(mask, bottom_fraction, black_threshold):
    """Blank mostly-empty rows in the bottom part of ``mask`` (in place).

    Args:
        mask: 2-D writable boolean array.
        bottom_fraction: share of the mask height, counted from the bottom
            edge, that is examined.
        black_threshold: a row is cleared when its fraction of False pixels
            is greater than or equal to this value.
    """
    first_row = int(mask.shape[0] * (1 - bottom_fraction))
    bottom = mask[first_row:, :]  # view, so edits land directly in `mask`
    if bottom.size == 0:
        return
    black_fraction = 1 - bottom.sum(axis=1) / bottom.shape[1]
    bottom[black_fraction >= black_threshold, :] = False


def _remove_narrow_runs(mask, bottom_fraction, max_width):
    """Clear short horizontal runs in the bottom part of ``mask`` (in place).

    Args:
        mask: 2-D writable boolean array.
        bottom_fraction: share of the mask height, counted from the bottom
            edge, that is examined.
        max_width: runs of consecutive True pixels whose length is less than
            or equal to this value are set to False.
    """
    first_row = int(mask.shape[0] * (1 - bottom_fraction))
    for mask_row in mask[first_row:]:  # each `mask_row` is a view into `mask`
        # Pad with False on both sides so runs touching an edge still
        # produce one rising (+1) and one falling (-1) transition.
        edges = np.diff(np.concatenate(([False], mask_row, [False])).astype(np.int8))
        run_starts = np.flatnonzero(edges == 1)
        run_ends = np.flatnonzero(edges == -1)  # exclusive end index
        for start, end in zip(run_starts, run_ends):
            if end - start <= max_width:
                mask_row[start:end] = False


def _keep_large_blobs(mask, min_pixels):
    """Return a copy of ``mask`` holding only components of >= ``min_pixels``.

    Components are found with ``scipy.ndimage.label`` using its default
    structuring element. Sizes come from a single ``np.bincount`` pass
    instead of one full-image comparison per label.
    """
    labeled, _ = label(mask)
    sizes = np.bincount(labeled.ravel(), minlength=1)
    keep = sizes >= min_pixels
    keep[0] = False  # label 0 is the background
    return keep[labeled]


print("Processing metadata entries")
for idx, row in tqdm(metadata.iterrows(), total=len(metadata), desc="Processing metadata"):
    # Convert row to a dictionary to avoid NumPy indexing issues
    row = row.to_dict()

    # str() keeps os.path.join working even if pandas parsed an ID as a number.
    bird_id = str(row["Bird ID"])
    frame_name = str(row["Frame Name"])
    frame_path = os.path.join(input_dir, bird_id, frame_name)

    if not os.path.exists(frame_path):  # Skip if the frame file does not exist
        continue

    bird_mask_dir = os.path.join(output_dir, bird_id)
    os.makedirs(bird_mask_dir, exist_ok=True)

    # Stage 1: detect on the full frame and crop to the best box.
    # NOTE(review): the call is not restricted to a bird class, so the
    # top-scoring box may belong to any class the model knows - confirm
    # whether a `classes=` filter is wanted.
    detection = yolo_model.predict(frame_path, conf=0.6, verbose=False)[0]
    if len(detection.boxes) == 0:  # Skip if no detections
        continue

    # Pick the highest-confidence box explicitly rather than relying on
    # the order in which results are returned.
    best_box = int(detection.boxes.conf.argmax())
    x1, y1, x2, y2 = map(int, detection.boxes.xyxy[best_box])
    cropped = detection.orig_img[y1:y2, x1:x2]
    if cropped.size == 0:  # Degenerate box after integer truncation
        continue

    # Stage 2: segment the crop. retina_masks=True makes masks.data match the
    # crop resolution; without it the mask is at the padded inference size and
    # a plain resize would shift it relative to the crop.
    segmentation = yolo_model.predict(cropped, conf=0.8, retina_masks=True, verbose=False)[0]
    if segmentation.masks is None:  # Skip if no masks
        continue

    best_mask = int(segmentation.boxes.conf.argmax())
    mask = segmentation.masks.data[best_mask].cpu().numpy().astype(bool)

    # Safety net: make sure the mask and the crop have identical height/width.
    if mask.shape[:2] != cropped.shape[:2]:
        mask = cv2.resize(mask.astype(np.uint8), (cropped.shape[1], cropped.shape[0]),
                          interpolation=cv2.INTER_NEAREST).astype(bool)

    # Clean-up, in the same order as before: sparse rows in the bottom
    # BOTTOM_FRACTION_ROW of the mask, narrow runs in the bottom
    # BOTTOM_FRACTION_NARROW, then small connected components.
    _clear_sparse_bottom_rows(mask, BOTTOM_FRACTION_ROW, HORIZONTAL_THRESHOLD)
    _remove_narrow_runs(mask, BOTTOM_FRACTION_NARROW, NARROW_SEGMENT_THRESHOLD)
    filtered_mask = _keep_large_blobs(mask, MIN_BLOB_PIXELS)

    # Skip this frame if no blob is larger than the minimum blob size
    if not filtered_mask.any():
        continue

    # Save masked image (pixels outside the mask stay black)
    mask_rgb = np.zeros_like(cropped)
    mask_rgb[filtered_mask] = cropped[filtered_mask]
    masked_frame_name = f"{os.path.splitext(frame_name)[0]}_mask.png"
    mask_path = os.path.join(bird_mask_dir, masked_frame_name)
    # cv2.imwrite reports failure through its return value; do not record a
    # metadata row for an image that was never written.
    if not cv2.imwrite(mask_path, mask_rgb):
        tqdm.write(f"Warning: could not write {mask_path}")
        continue

    # Append entry to new metadata. Going through to_csv (instead of joining
    # the fields with commas by hand) quotes any field that itself contains a
    # comma or quote, so the file stays parseable. Appending per frame keeps
    # the rows written so far if the run is interrupted.
    pd.DataFrame(
        [[bird_id, row["Video Name"], frame_name, row["Timestamp (s)"], masked_frame_name]]
    ).to_csv(new_metadata_csv, mode="a", header=False, index=False)

print(f"Metadata saved at: {new_metadata_csv}")


Processing metadata entries


Processing metadata: 100%|██████████| 3200/3200 [3:37:26<00:00,  4.08s/it]  

Metadata saved at: D:\Bowerbird-ID\4_Run_YOLOv11_det_seg\Output_masked_frames\masked_frames_metadata.csv



