In [1]:
import numpy as np
import cv2 as cv
import os
from tqdm import tqdm

In [2]:
# Initializing parameters
confThreshold = 0.7  # detections are kept only when their confidence score is > 0.7 (70%)
maskThreshold = 0.4  # mask pixels are kept only when their value is > 0.4 (40%)

The model was trained on the MS COCO dataset, which contains many object categories. In this notebook every detected object's mask is drawn in one fixed color. The following cell:
* Loads the names of those object categories into a list
* Sets the mask color (hardcoded rather than loaded from a file, since only one class is expected)
* Loads the pre-trained TensorFlow model (Mask R-CNN) into OpenCV's DNN module
* Sets the backend and target device

In [3]:
# --- model files -----------------------------------------------------------
classesFile = "/Users/sarah/Mask_RCNN/mscoco_labels.names"  # class names, split on newlines below
textGraph = "/Users/sarah/Mask_RCNN/mask_rcnn_inception_v2_coco_2018_01_28.pbtxt"  # text description of the network structure
modelWeights = "/Users/sarah/Mask_RCNN/frozen_inference_graph.pb"  # frozen graph holding the pre-trained weights

# --- class names -----------------------------------------------------------
# The trailing newline is stripped before splitting so the list does not end with an empty entry.
classes = None
with open(classesFile, 'rt') as namesFile:
    classes = namesFile.read().rstrip('\n').split('\n')

# --- mask color ------------------------------------------------------------
# A single hardcoded color is used because only one class is expected.
color = [255, 0, 0]

# --- network ---------------------------------------------------------------
# The TensorFlow-format Mask R-CNN is loaded through OpenCV's DNN module,
# then pinned to the OpenCV backend running on the CPU.
net = cv.dnn.readNetFromTensorflow(modelWeights, textGraph)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv.dnn.DNN_TARGET_CPU)

In [4]:
# draws the predicted bounding box, colorize and show the mask on the image
def drawBox(frame, classId, conf, left, top, right, bottom, classMask):
    """Draw one detection on `frame` in place: box, confidence label, tinted mask and contours.

    Args:
        frame: image array; it is modified in place.
        classId: class index of the detection (accepted for interface compatibility, not used for drawing).
        conf: confidence score, rendered as the label with two decimals.
        left, top, right, bottom: inclusive pixel coordinates of the bounding box.
        classMask: mask for this detection; it is resized to the box size and
            thresholded against the module-level `maskThreshold`.

    Returns:
        None.
    """
    cv.rectangle(frame, (left, top), (right, bottom), (255, 178, 50), 3)  # box outline, thickness 3
    label = '%.2f' % conf  # label text is the confidence score

    # The text is measured at font scale 0.5 but drawn at 0.75, hence the 1.5x factors below.
    labelSize, baseLine = cv.getTextSize(label, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)

    # Keep the label on screen when the box touches the top edge of the frame.
    # A separate variable is used on purpose: overwriting `top` here would shrink and
    # shift the mask region below so that it no longer matches the drawn box.
    labelTop = max(top, labelSize[1])
    cv.rectangle(frame, (left, labelTop - round(1.5*labelSize[1])), (left + round(1.5*labelSize[0]), labelTop + baseLine), (255, 255, 255), cv.FILLED)  # white label background
    cv.putText(frame, label, (left, labelTop), cv.FONT_HERSHEY_SIMPLEX, 0.75, (0,0,0), 1)  # black text

    boxW = right - left + 1
    boxH = bottom - top + 1
    if boxW <= 0 or boxH <= 0:
        # Degenerate box: cv.resize cannot produce a non-positive size, so skip the mask overlay.
        return

    # resizes the mask to the box, thresholds it, and tints the selected pixels
    classMask = cv.resize(classMask, (boxW, boxH))
    mask = (classMask > maskThreshold)
    region = frame[top:bottom+1, left:right+1]  # view into frame, so writes below land in frame
    roi = region[mask]

    color = [255, 0, 0]  # fixed color (blue in OpenCV's BGR channel order)

    # blend: 30% mask color + 70% original pixels
    region[mask] = ([0.3*color[0], 0.3*color[1], 0.3*color[2]] + 0.7 * roi).astype(np.uint8)

    # outlines the mask; contour coordinates are relative to the box, so they are drawn on the box view
    mask = mask.astype(np.uint8)
    contours, hierarchy = cv.findContours(mask, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)
    cv.drawContours(region, contours, -1, color, 3, cv.LINE_8, hierarchy, 100)

### Post-processing function
For each frame, extracts the bounding box and mask for each detected object. Output size of masks is NxCxHxW where:
* N - number of detected boxes
* C - number of classes (excluding background)
* HxW - segmentation mask size (e.g., 15x15 in this model)

In [5]:
def postprocess(boxes, masks):
    """Draw every detection whose confidence is above `confThreshold`.

    Note: the image is not a parameter. This function reads the module-level
    `frame` and passes it to `drawBox`, which draws on it in place.

    Args:
        boxes: detection output, indexed as boxes[0, 0, i]; each row is read as
            [_, classId, score, left, top, right, bottom], with the four
            coordinates multiplied by the frame width/height.
        masks: mask output, indexed as masks[i][classId].
    """
    detectionCount = boxes.shape[2]
    imgH, imgW = frame.shape[:2]

    def clamp(value, upper):
        # keeps a pixel coordinate inside [0, upper]
        return max(0, min(value, upper))

    for idx in range(detectionCount):
        detection = boxes[0, 0, idx]
        objectMasks = masks[idx]
        confidence = detection[2]

        # skip anything that is not above the confidence threshold
        if not (confidence > confThreshold):
            continue

        classId = int(detection[1])

        # box corners scaled to pixels, then clamped to the frame
        left = clamp(int(imgW * detection[3]), imgW - 1)
        top = clamp(int(imgH * detection[4]), imgH - 1)
        right = clamp(int(imgW * detection[5]), imgW - 1)
        bottom = clamp(int(imgH * detection[6]), imgH - 1)

        # only the mask belonging to the detected class is drawn
        drawBox(frame, classId, confidence, left, top, right, bottom, objectMasks[classId])

Input handling and output initialization

This cell processes every `.png` image in the input folder and saves each annotated result as a `.png` in the output folder. Video input (and the video writer needed for `.mp4` output) is not handled here.

In [6]:
# input folder (containing the images) and output folder (for saving processed images)
inputFolder = "/Users/sarah/Bowerbird-ID/MegaDetector/output_cropped_frames"
outputFolder = "/Users/sarah/Bowerbird-ID/Mask_RCNN/predictions"

# cv.imwrite does not create missing directories, so make sure the output folder exists
os.makedirs(outputFolder, exist_ok=True)

# sorted() makes the processing order reproducible (os.listdir order is arbitrary)
for imageFile in tqdm(sorted(os.listdir(inputFolder))):
    if not imageFile.lower().endswith('.png'):
        # tqdm.write prints without breaking the progress bar
        tqdm.write(f"{imageFile} ain't an image")
        continue
    imagePath = os.path.join(inputFolder, imageFile)

    # NOTE: `frame` must stay a module-level name; postprocess() reads it as a global
    frame = cv.imread(imagePath)
    if frame is None:
        # cv.imread returns None instead of raising when a file cannot be decoded
        tqdm.write(f"{imageFile} could not be read, skipping")
        continue

    # output name is the input name without its extension, followed by 'pred.png'
    outputFile = os.path.join(outputFolder, os.path.splitext(imageFile)[0] + 'pred.png')

    # creates a 4D blob from the image for Mask R-CNN
    blob = cv.dnn.blobFromImage(frame, swapRB=True, crop=False)
    net.setInput(blob)

    # runs the forward pass to get output from the output layers
    boxes, masks = net.forward(['detection_out_final', 'detection_masks'])

    # post-process the detections (draws boxes and masks on `frame`)
    postprocess(boxes, masks)

    # prints efficiency info (inference time)
    t, _ = net.getPerfProfile()
    label = 'Mask-RCNN : Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
    cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

    # cv.imwrite reports failure through its return value rather than an exception
    if not cv.imwrite(outputFile, frame.astype(np.uint8)):
        tqdm.write(f"could not save {outputFile}")

cv.destroyAllWindows()

 27%|██▋       | 3/11 [00:04<00:11,  1.47s/it]

.DS_Store ain't an image


100%|██████████| 11/11 [00:14<00:00,  1.33s/it]
