In [9]:
import os
from pathlib import Path
import cv2
from tqdm import tqdm
from ultralytics import YOLO
from PIL import Image
import imagehash


INPUT_DIR = "/gpfs/data/fs72607/juarezs98/extracted_frames/Extracted_test_videos"
OUTPUT_DIR = "/home/fs72607/juarezs98/Bowerbird-ID/7_Classify_bowerbird_ID/Extracted_frames"
YOLO_MODEL_PATH = "yolo11x-seg.pt"

# Parameters
SAMPLING_INTERVAL = 60
IOU_THRESHOLD = 0.5
SIMILARITY_THRESHOLD = 5
MAX_VIDEOS_PER_FOLDER = 5

# Load YOLO model
yolo_model = YOLO(YOLO_MODEL_PATH)

def calculate_iou(box1, box2):
    """Calculates Intersection over Union (IoU) for two bounding boxes"""
    x1, y1, x2, y2 = box1
    x1_, y1_, x2_, y2_ = box2  
    inter_x1, inter_y1 = max(x1, x1_), max(y1, y1_)
    inter_x2, inter_y2 = min(x2, x2_), min(y2, y2_)
    
    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
    union_area = (x2 - x1) * (y2 - y1) + (x2_ - x1_) * (y2_ - y1_) - inter_area
    
    return inter_area / union_area if union_area else 0

def process_video(video_path, output_subdir):
    cap = cv2.VideoCapture(str(video_path))
    frame_count = 0
    unique_hashes = set()

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if frame_count % SAMPLING_INTERVAL == 0:
            results = yolo_model.predict(frame, conf=0.3, verbose=False)
            detections = results[0].boxes

            if len(detections) > 1:
                cap.release()
                return  # Skip video if multiple birds are detected

            filtered_detections = []
            for i, box in enumerate(detections.xyxy):
                x1, y1, x2, y2 = map(int, box)
                score = detections.conf[i]

                if not any(calculate_iou((x1, y1, x2, y2), det[:4]) > IOU_THRESHOLD for det in filtered_detections):
                    filtered_detections.append((x1, y1, x2, y2, score))

            if filtered_detections:
                x1, y1, x2, y2, _ = max(filtered_detections, key=lambda d: d[-1])
                cropped = frame[y1:y2, x1:x2]
                pil_image = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
                frame_hash = imagehash.phash(pil_image)

                if all(abs(frame_hash - h) > SIMILARITY_THRESHOLD for h in unique_hashes):
                    frame_path = output_subdir / f"{video_path.stem}_frame{frame_count}.png"
                    if cv2.imwrite(str(frame_path), frame):
                        unique_hashes.add(frame_hash)

        frame_count += 1

    cap.release()

for subdir in Path(INPUT_DIR).iterdir():
    if subdir.is_dir():
        output_subdir = Path(OUTPUT_DIR) / subdir.name
        output_subdir.mkdir(parents=True, exist_ok=True)

        video_files = list(subdir.glob("*.MP4"))[:MAX_VIDEOS_PER_FOLDER]
        for video in tqdm(video_files, desc=f"Processing {subdir.name}"):
            process_video(video, output_subdir)

print("Done extracting frames")

Processing B26: 100%|██████████| 5/5 [00:29<00:00,  5.97s/it]
Processing B52: 100%|██████████| 5/5 [01:16<00:00, 15.25s/it]
Processing B02: 100%|██████████| 5/5 [01:17<00:00, 15.42s/it]
Processing B03: 100%|██████████| 5/5 [02:07<00:00, 25.47s/it]
Processing B04: 100%|██████████| 5/5 [01:40<00:00, 20.17s/it]
Processing B30: 100%|██████████| 5/5 [02:07<00:00, 25.50s/it]
Processing B05: 100%|██████████| 5/5 [00:42<00:00,  8.40s/it]
Processing B47: 100%|██████████| 5/5 [01:10<00:00, 14.06s/it]
Processing B31: 100%|██████████| 5/5 [01:02<00:00, 12.58s/it]
Processing B23: 100%|██████████| 5/5 [00:55<00:00, 11.10s/it]
Processing B11: 100%|██████████| 5/5 [00:35<00:00,  7.16s/it]
Processing B18: 100%|██████████| 5/5 [01:41<00:00, 20.34s/it]
Processing B50: 100%|██████████| 5/5 [01:57<00:00, 23.54s/it]
Processing B07: 100%|██████████| 5/5 [01:24<00:00, 16.83s/it]
Processing B49: 100%|██████████| 5/5 [01:11<00:00, 14.22s/it]
Processing B29: 100%|██████████| 5/5 [01:27<00:00, 17.53s/it]

Done extracting frames





In [10]:
import torch
from torchvision import models, transforms
from collections import Counter
import torch.nn.functional as F

OUTPUT_DIR = "/home/fs72607/juarezs98/Bowerbird-ID/7_Classify_bowerbird_ID/Extracted_frames"
MODEL_PATH = "/home/fs72607/juarezs98/Bowerbird-ID/6_Train_ResNet50/Full dataset (minus test videos)/best_model.pth"

CLASS_NAMES = {i: name for i, name in enumerate([
    'B02', 'B03', 'B04', 'B05', 'B07', 'B11', 'B18', 'B23',
    'B26', 'B29', 'B30', 'B31', 'B47', 'B49', 'B50', 'B52'
])}

# loads model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = models.resnet50(pretrained=False)
model.fc = torch.nn.Linear(model.fc.in_features, len(CLASS_NAMES))  # Ensure correct output size
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.to(device).eval()

# data transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# process each folder
for subdir in Path(OUTPUT_DIR).iterdir():
    if subdir.is_dir():
        predictions = []
        confidences = []

        for frame_path in tqdm(subdir.glob("*.png"), desc=f"Classifying {subdir.name}"):
            image = Image.open(frame_path).convert("RGB")
            input_tensor = transform(image).unsqueeze(0).to(device)

            with torch.no_grad():
                outputs = model(input_tensor)
                probabilities = F.softmax(outputs, dim=1)  # Convert logits to probabilities
                top_class = probabilities.argmax().item()
                top_confidence = probabilities[0, top_class].item()

                predictions.append(top_class)
                confidences.append(top_confidence)

        if predictions:
            # find the most common predicted class
            most_common_class, most_common_count = Counter(predictions).most_common(1)[0]

            # calculate the average confidence score for the most common class
            avg_confidence = sum(c for p, c in zip(predictions, confidences) if p == most_common_class) / most_common_count

            # get the top 3 predicted classes and their counts
            top_3 = Counter(predictions).most_common(3)
            top_3_str = ", ".join(f"{CLASS_NAMES[cls]} ({count} frames)" for cls, count in top_3)

            print(f"\n{subdir.name} - Most likely bird: {CLASS_NAMES[most_common_class]} "
                  f"({most_common_count} frames, Avg confidence: {avg_confidence:.2%})")
            print(f"    Top 3 predictions: {top_3_str}\n")


Classifying B26: 19it [00:02,  8.74it/s]



B26 - Most likely bird: B52 (19 frames, Avg confidence: 100.00%)
    Top 3 predictions: B52 (19 frames)



Classifying B52: 65it [00:07,  8.99it/s]



B52 - Most likely bird: B52 (54 frames, Avg confidence: 98.47%)
    Top 3 predictions: B52 (54 frames), B03 (6 frames), B04 (5 frames)



Classifying B02: 35it [00:03,  9.74it/s]



B02 - Most likely bird: B02 (20 frames, Avg confidence: 79.41%)
    Top 3 predictions: B02 (20 frames), B52 (11 frames), B29 (4 frames)



Classifying B03: 56it [00:06,  9.00it/s]



B03 - Most likely bird: B03 (43 frames, Avg confidence: 80.61%)
    Top 3 predictions: B03 (43 frames), B31 (8 frames), B04 (5 frames)



Classifying B04: 90it [00:09,  9.49it/s]



B04 - Most likely bird: B03 (66 frames, Avg confidence: 93.63%)
    Top 3 predictions: B03 (66 frames), B52 (21 frames), B31 (2 frames)



Classifying B30: 127it [00:13,  9.32it/s]



B30 - Most likely bird: B52 (123 frames, Avg confidence: 93.69%)
    Top 3 predictions: B52 (123 frames), B04 (4 frames)



Classifying B05: 28it [00:03,  9.12it/s]



B05 - Most likely bird: B52 (22 frames, Avg confidence: 82.12%)
    Top 3 predictions: B52 (22 frames), B04 (3 frames), B03 (2 frames)



Classifying B47: 42it [00:04,  9.33it/s]



B47 - Most likely bird: B52 (42 frames, Avg confidence: 99.20%)
    Top 3 predictions: B52 (42 frames)



Classifying B31: 14it [00:01, 10.14it/s]



B31 - Most likely bird: B52 (11 frames, Avg confidence: 88.42%)
    Top 3 predictions: B52 (11 frames), B31 (3 frames)



Classifying B23: 34it [00:03,  9.37it/s]



B23 - Most likely bird: B03 (25 frames, Avg confidence: 90.41%)
    Top 3 predictions: B03 (25 frames), B49 (7 frames), B52 (2 frames)



Classifying B11: 20it [00:02,  9.62it/s]



B11 - Most likely bird: B31 (15 frames, Avg confidence: 86.76%)
    Top 3 predictions: B31 (15 frames), B52 (5 frames)



Classifying B18: 67it [00:06,  9.71it/s]



B18 - Most likely bird: B52 (67 frames, Avg confidence: 94.32%)
    Top 3 predictions: B52 (67 frames)



Classifying B50: 97it [00:10,  9.35it/s]



B50 - Most likely bird: B52 (97 frames, Avg confidence: 99.98%)
    Top 3 predictions: B52 (97 frames)



Classifying B07: 65it [00:06,  9.54it/s]



B07 - Most likely bird: B52 (29 frames, Avg confidence: 99.88%)
    Top 3 predictions: B52 (29 frames), B07 (18 frames), B50 (9 frames)



Classifying B49: 58it [00:05,  9.69it/s]



B49 - Most likely bird: B03 (22 frames, Avg confidence: 72.78%)
    Top 3 predictions: B03 (22 frames), B52 (14 frames), B49 (10 frames)



Classifying B29: 26it [00:02,  9.58it/s]


B29 - Most likely bird: B52 (25 frames, Avg confidence: 99.59%)
    Top 3 predictions: B52 (25 frames), B03 (1 frames)




