In [1]:
# Import libraries
import numpy as np
import cv2
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Run Mask R-CNN instance segmentation on every frame of a video and display the result.
# NOTE: despite the variable name, the source below is a video file, not a live webcam.
webcam_video_stream = cv2.VideoCapture('images/video_sample.mp4')

# Detections at or below this confidence are ignored
CONFIDENCE_THRESHOLD = 0.5
# Per-pixel mask probabilities above this value count as "object"
MASK_THRESHOLD = 0.3

# ---- Loop-invariant setup: done once, not once per frame ----

# The 90 class labels, in the order the model's class index refers to them
class_labels = ["person","bicycle","car","motorbike","aeroplane","bus","train","truck","boat","traffic light","fire hydrant","street sign","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","hat","backpack","umbrella","shoe","eye glasses","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","plate","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","sofa","pottedplant","bed","mirror","diningtable","window","desk","toilet","door","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","blender","book","clock","vase","scissors","teddy bear","hair drier","toothbrush"]

# Six base colors, repeated so that every class index has a color row.
# The values are applied directly to frames returned by cv2.VideoCapture, which are in
# BGR channel order, so these render as: green, red, blue, cyan, yellow, magenta.
base_colors = np.array([
    [0, 255, 0],
    [0, 0, 255],
    [255, 0, 0],
    [255, 255, 0],
    [0, 255, 255],
    [255, 0, 255],
], dtype="int")
# ceil(len(class_labels) / len(base_colors)) repeats -> 15 repeats / 90 rows for 90 labels
color_repeats = -(-len(class_labels) // len(base_colors))
class_colors = np.tile(base_colors, (color_repeats, 1))

# Load the pre-trained model from the TensorFlow graph and its text config.
# This reads from disk, so it must stay outside the frame loop.
maskrcnn = cv2.dnn.readNetFromTensorflow('datasets/maskrcnn_buffermodel.pb','datasets/maskrcnn_bufferconfig.txt')

try:
    while True:
        ret, current_frame = webcam_video_stream.read()
        # read() reports failure when the video ends or the source could not be opened;
        # stop cleanly instead of crashing on a None frame
        if not ret or current_frame is None:
            break

        img_to_detect = current_frame
        # get height and width of image
        img_height = img_to_detect.shape[0]
        img_width = img_to_detect.shape[1]

        # convert to blob to pass into model (default scale and size, channels swapped)
        img_blob = cv2.dnn.blobFromImage(img_to_detect, swapRB=True, crop=False)

        # Input preprocessed blob into model and pass through the model
        maskrcnn.setInput(img_blob)

        # obtain the detection predictions (both box and mask) by the model using forward() method
        (obj_detections_boxes, obj_detections_masks) = maskrcnn.forward(["detection_out_final","detection_masks"])

        # Loop over the detections
        no_of_detections = obj_detections_boxes.shape[2]

        for index in range(no_of_detections):
            prediction_confidence = obj_detections_boxes[0,0,index,2]
            # take only predictions with confidence more than the threshold
            if prediction_confidence <= CONFIDENCE_THRESHOLD:
                continue

            # get the predicted class index
            predicted_class_index = int(obj_detections_boxes[0,0,index,1])

            # scale the box (multiplied by frame width/height here) to frame pixel coordinates
            bounding_box = obj_detections_boxes[0,0,index,3:7] * np.array([img_width, img_height, img_width, img_height])
            (start_x_pt, start_y_pt, end_x_pt, end_y_pt) = bounding_box.astype("int")

            # Clip the box to the frame. Without this, a box that extends past the frame
            # border produces an ROI slice smaller than the resized mask and the boolean
            # indexing below fails with a shape mismatch.
            start_x_pt = int(np.clip(start_x_pt, 0, img_width))
            end_x_pt = int(np.clip(end_x_pt, 0, img_width))
            start_y_pt = int(np.clip(start_y_pt, 0, img_height))
            end_y_pt = int(np.clip(end_y_pt, 0, img_height))

            # obtain width and height of bounding box
            bounding_box_width = end_x_pt - start_x_pt
            bounding_box_height = end_y_pt - start_y_pt
            # cv2.resize cannot produce an empty image; skip degenerate boxes
            if bounding_box_width <= 0 or bounding_box_height <= 0:
                continue

            # mask for this detection and its predicted class (masks are indexed [detection, class])
            object_mask = obj_detections_masks[index, predicted_class_index]
            # resize mask to bounding_box_width and bounding_box_height
            object_mask = cv2.resize(object_mask, (bounding_box_width, bounding_box_height))
            # threshold the float mask into a boolean mask
            object_mask = (object_mask > MASK_THRESHOLD)

            # view of the frame covered by the bounding box (the region of interest)
            box_region = img_to_detect[start_y_pt:end_y_pt, start_x_pt:end_x_pt]
            # pixels of the roi that belong to the object according to the mask
            object_region_of_interest = box_region[object_mask]

            # color assigned to the predicted class
            mask_color = class_colors[predicted_class_index]

            # blend the class color over the object pixels
            # (weights sum to 0.8, so the masked area is also slightly darkened)
            roi_color_transparent_cover = ((0.3 * mask_color) + (0.5 * object_region_of_interest)).astype("uint8")
            # box_region is a view, so this writes the blend back into the frame
            box_region[object_mask] = roi_color_transparent_cover

            # convert the color numpy array to a list of plain ints for the drawing calls
            mask_color = [int(c) for c in mask_color]

            # Print the prediction in console
            predicted_class_label = "{}: {:.2f}%".format(class_labels[predicted_class_index],prediction_confidence*100)
            print("predicted object {}: {}".format(index+1,predicted_class_label))

            # Draw rectangle and text in the image; keep the label baseline inside the
            # frame when the box touches the top edge
            label_y_pt = max(start_y_pt - 5, 15)
            cv2.rectangle(img_to_detect, (start_x_pt,start_y_pt), (end_x_pt,end_y_pt), mask_color,2)
            cv2.putText(img_to_detect, predicted_class_label, (start_x_pt,label_y_pt), cv2.FONT_HERSHEY_COMPLEX, 0.5, mask_color,1)

        # Show the output image
        cv2.imshow("Detection Output", img_to_detect)
        # terminate while loop if 'q' key is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # always release the stream and close all opencv windows, even after an error or interrupt
    webcam_video_stream.release()
    cv2.destroyAllWindows()
    

predicted object 1: car: 90.03%
predicted object 2: truck: 71.87%
predicted object 3: truck: 67.83%
predicted object 4: car: 65.36%
predicted object 5: person: 64.22%
predicted object 6: person: 60.21%
predicted object 7: truck: 57.48%
predicted object 8: person: 54.34%
predicted object 1: car: 89.77%
predicted object 2: person: 72.44%
predicted object 3: car: 70.63%
predicted object 4: person: 69.97%
predicted object 5: bus: 60.29%
predicted object 6: truck: 56.57%
predicted object 7: truck: 55.74%
predicted object 8: person: 53.53%
predicted object 1: car: 88.94%
predicted object 2: car: 80.35%
predicted object 3: person: 77.88%
predicted object 4: bus: 61.79%
predicted object 5: person: 61.60%
predicted object 6: car: 60.26%
predicted object 7: person: 56.30%
predicted object 1: car: 87.21%
predicted object 2: person: 79.73%
predicted object 3: truck: 76.70%
predicted object 4: car: 69.71%
predicted object 5: bus: 68.61%
predicted object 6: person: 59.77%
predicted object 7: person:

predicted object 1: car: 91.22%
predicted object 2: car: 87.17%
predicted object 3: person: 84.92%
predicted object 4: truck: 77.61%
predicted object 5: person: 71.51%
predicted object 6: person: 65.15%
predicted object 7: person: 56.04%
predicted object 8: person: 51.45%
predicted object 9: person: 50.37%
predicted object 1: car: 90.72%
predicted object 2: car: 88.47%
predicted object 3: person: 85.28%
predicted object 4: truck: 83.45%
predicted object 5: person: 72.33%
predicted object 6: person: 58.22%
predicted object 7: person: 51.81%
predicted object 8: person: 51.58%
predicted object 9: person: 50.82%
predicted object 10: truck: 50.57%
predicted object 1: car: 91.52%
predicted object 2: car: 88.79%
predicted object 3: truck: 80.66%
predicted object 4: person: 80.00%
predicted object 5: person: 66.15%
predicted object 6: truck: 64.07%
predicted object 7: person: 59.00%
predicted object 1: car: 92.52%
predicted object 2: car: 88.32%
predicted object 3: person: 81.74%
predicted obj