In [1]:
# Import libraries
import numpy as np
import cv2
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Run YOLOv3 object detection on every frame of a video file, apply
# non-maximum suppression (NMS) and show the annotated frames in an OpenCV window.
# Press 'q' in the window to stop early; the loop also stops when the video ends.

# ---- Configuration (everything below is loop-invariant, so it is built once) ----
VIDEO_SOURCE = 'images/video_sample.mp4'
YOLO_CONFIG_PATH = 'datasets/yolov3.cfg'
YOLO_WEIGHTS_PATH = 'datasets/yolov3.weights'
# Network input size. Common choices are 320x320, 416x416 and 608x608:
# a larger size generally means more accuracy but less speed.
BLOB_SIZE = (416, 416)
# Scale factor 0.003922 ~= 1/255 maps pixel values into the 0..1 range.
BLOB_SCALE = 0.003922
# Minimum class score for a detection to be kept (also used as the NMS score threshold).
CONFIDENCE_THRESHOLD = 0.5
# Overlap threshold used by NMS to suppress weaker overlapping boxes
# (adjust and try for better performance).
NMS_THRESHOLD = 0.4

# The 80 class labels, in the order of the model's class-score outputs.
class_labels = ["person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","trafficlight","firehydrant",
                "stopsign","parkingmeter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra",
                "giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sportsball",
                "kite","baseballbat","baseballglove","skateboard","surfboard","tennisracket","bottle","wineglass","cup",
                "fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hotdog","pizza",
                "donut","cake","chair","sofa","pottedplant","bed","diningtable","toilet","tvmonitor","laptop","mouse",
                "remote","keyboard","cellphone","microwave","oven","toaster","sink","refrigerator","book","clock","vase",
                "scissors","teddybear","hairdrier","toothbrush"]

# Five base colours, given as "B,G,R" strings because OpenCV draws in BGR order:
# green, red, blue, cyan, yellow.
# Each string is split on ',' and converted to an int array; the 5 rows are then
# tiled 16 times so that there is one colour row per class id (5 * 16 = 80).
class_colors = ["0,255,0","0,0,255","255,0,0","255,255,0","0,255,255"]
class_colors = [np.array(every_color.split(",")).astype("int") for every_color in class_colors]
class_colors = np.array(class_colors)
class_colors = np.tile(class_colors,(16,1))

# Load the pre-trained YOLOv3 network from the Darknet cfg and weights files.
# This is done once, outside the frame loop, because reloading per frame is very slow.
yolo_model = cv2.dnn.readNetFromDarknet(YOLO_CONFIG_PATH, YOLO_WEIGHTS_PATH)

# Resolve the names of the unconnected (output) layers of the network.
# getUnconnectedOutLayers() returns 1-based indices, shaped Nx1 in older OpenCV
# releases and flat in newer ones; flattening handles both.
yolo_layers = yolo_model.getLayerNames()
yolo_output_layer = [yolo_layers[layer_index - 1]
                     for layer_index in np.array(yolo_model.getUnconnectedOutLayers()).flatten()]

# Open the video source (a file here; the name is kept for compatibility with other cells)
webcam_video_stream = cv2.VideoCapture(VIDEO_SOURCE)

try:
    if not webcam_video_stream.isOpened():
        raise IOError("Could not open video source: {}".format(VIDEO_SOURCE))

    while True:
        ret,current_frame = webcam_video_stream.read()
        # read() reports failure at end of stream (or on a read error); stop cleanly
        # instead of crashing on a None frame.
        if not ret or current_frame is None:
            break
        img_to_detect = current_frame
        # get height and width of image
        img_height = img_to_detect.shape[0]
        img_width = img_to_detect.shape[1]
        # convert to blob to pass into model (scaled, resized, BGR->RGB swapped, not cropped)
        img_blob = cv2.dnn.blobFromImage(img_to_detect,BLOB_SCALE,BLOB_SIZE,swapRB=True, crop=False)

        # Input preprocessed blob into model and pass through the model
        yolo_model.setInput(img_blob)
        # obtain the detection predictions by the model using forward() method
        obj_detection_layers = yolo_model.forward(yolo_output_layer)

        # Per-frame accumulators used as NMS input:
        # class id, [start x, start y, width, height] and confidence of each kept detection
        class_ids_list = []
        boxes_list = []
        confidences_list = []

        # loop over each of the layer outputs
        for object_detection_layer in obj_detection_layers:
            # loop over the detections
            for object_detection in object_detection_layer:

                # object_detection[0:4] => box center x, center y, width, height,
                #                          relative to the image size (scaled below)
                # object_detection[5:]  => scores for every class
                all_scores = object_detection[5:]
                predicted_class_id = np.argmax(all_scores)
                prediction_confidence = all_scores[predicted_class_id]

                # take only predictions with confidence above the threshold
                if prediction_confidence > CONFIDENCE_THRESHOLD:
                    # scale the relative box to pixel co-ordinates of the actual frame
                    bounding_box = object_detection[0:4] * np.array([img_width, img_height, img_width, img_height])
                    (box_center_x_pt, box_center_y_pt, box_width, box_height) = bounding_box.astype("int")
                    # convert the box center to its top-left corner
                    start_x_pt = int(box_center_x_pt - (box_width/2))
                    start_y_pt = int(box_center_y_pt - (box_height/2))

                    # NMSBoxes needs confidences as float and box values as plain ints
                    class_ids_list.append(predicted_class_id)
                    confidences_list.append(float(prediction_confidence))
                    boxes_list.append([start_x_pt, start_y_pt, int(box_width), int(box_height)])

        # NMS returns the indices of the boxes to keep while suppressing weaker
        # overlapping ones. Skip the call when nothing was detected in this frame.
        if boxes_list:
            max_value_ids = cv2.dnn.NMSBoxes(boxes_list, confidences_list, CONFIDENCE_THRESHOLD, NMS_THRESHOLD)
        else:
            max_value_ids = []

        # loop through the final set of detections remaining after NMS and draw
        # bounding box and write text. The result is Nx1 in older OpenCV releases
        # and flat in newer ones, so it is flattened before iterating.
        for max_class_id in np.array(max_value_ids, dtype=int).flatten():
            box = boxes_list[max_class_id]
            start_x_pt = box[0]
            start_y_pt = box[1]
            box_width = box[2]
            box_height = box[3]

            # get the predicted class id and confidence
            predicted_class_id = class_ids_list[max_class_id]
            prediction_confidence = confidences_list[max_class_id]

            # obtain the bounding box end co-ordinates
            end_x_pt = start_x_pt + box_width
            end_y_pt = start_y_pt + box_height

            # pick the colour assigned to this class id and convert it to a list
            # of plain ints, which is what the OpenCV drawing calls expect
            box_color = [int(c) for c in class_colors[predicted_class_id]]

            # Print the prediction in console
            predicted_class_label = "{}: {:.2f}%".format(class_labels[predicted_class_id],prediction_confidence*100)
            print("predicted object {}".format(predicted_class_label))

            # Draw rectangle and text in the image
            cv2.rectangle(img_to_detect, (start_x_pt,start_y_pt), (end_x_pt,end_y_pt), box_color,2)
            cv2.putText(img_to_detect, predicted_class_label, (start_x_pt,start_y_pt-5), cv2.FONT_HERSHEY_COMPLEX, 0.5, box_color,1)

        cv2.imshow("Detection Output", img_to_detect)

        # terminate while loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the video source and close all OpenCV windows,
    # even if an error occurred while processing a frame.
    webcam_video_stream.release()
    cv2.destroyAllWindows()


predicted object person: 98.51%
predicted object person: 98.11%
predicted object car: 97.10%
predicted object car: 96.56%
predicted object person: 96.14%
predicted object car: 93.36%
predicted object person: 93.17%
predicted object car: 91.64%
predicted object car: 85.13%
predicted object truck: 84.26%
predicted object person: 83.35%
predicted object trafficlight: 81.29%
predicted object truck: 79.48%
predicted object person: 74.93%
predicted object person: 73.36%
predicted object person: 70.86%
predicted object person: 69.52%
predicted object truck: 66.78%
predicted object car: 63.64%
predicted object car: 62.47%
predicted object car: 62.33%
predicted object car: 59.97%
predicted object person: 59.26%
predicted object car: 56.76%
predicted object person: 54.18%
predicted object car: 52.85%
predicted object trafficlight: 50.12%
predicted object person: 50.04%
predicted object person: 98.25%
predicted object person: 97.90%
predicted object car: 96.91%
predicted object person: 95.74%
pre

predicted object person: 98.67%
predicted object person: 98.49%
predicted object car: 97.38%
predicted object person: 97.11%
predicted object car: 97.10%
predicted object person: 94.71%
predicted object car: 94.01%
predicted object person: 92.08%
predicted object person: 91.03%
predicted object car: 89.62%
predicted object person: 88.99%
predicted object car: 88.46%
predicted object truck: 88.21%
predicted object person: 86.74%
predicted object person: 86.66%
predicted object person: 80.08%
predicted object person: 76.66%
predicted object person: 76.37%
predicted object truck: 76.14%
predicted object trafficlight: 73.31%
predicted object car: 70.32%
predicted object car: 67.22%
predicted object car: 65.99%
predicted object person: 55.00%
predicted object car: 54.59%
predicted object car: 54.19%
predicted object car: 98.18%
predicted object person: 97.96%
predicted object person: 97.78%
predicted object car: 96.97%
predicted object person: 96.92%
predicted object person: 94.82%
predicte

predicted object person: 99.16%
predicted object person: 98.77%
predicted object car: 96.94%
predicted object person: 95.80%
predicted object person: 95.44%
predicted object car: 92.33%
predicted object person: 90.84%
predicted object person: 89.26%
predicted object car: 87.59%
predicted object person: 85.38%
predicted object car: 84.99%
predicted object truck: 84.09%
predicted object person: 84.00%
predicted object person: 79.36%
predicted object car: 76.43%
predicted object car: 75.52%
predicted object person: 71.68%
predicted object car: 71.49%
predicted object car: 70.19%
predicted object person: 69.73%
predicted object trafficlight: 69.07%
predicted object person: 64.57%
predicted object car: 64.50%
predicted object car: 58.96%
predicted object car: 57.04%
predicted object car: 56.81%
predicted object person: 51.55%
predicted object person: 99.00%
predicted object person: 98.14%
predicted object car: 97.01%
predicted object person: 96.86%
predicted object person: 95.54%
predicted 

predicted object person: 98.50%
predicted object person: 98.35%
predicted object person: 96.12%
predicted object car: 95.98%
predicted object person: 95.02%
predicted object person: 93.66%
predicted object person: 90.54%
predicted object car: 90.29%
predicted object person: 88.91%
predicted object truck: 88.26%
predicted object person: 87.27%
predicted object person: 86.45%
predicted object person: 86.17%
predicted object car: 78.94%
predicted object car: 76.49%
predicted object car: 73.37%
predicted object car: 73.05%
predicted object car: 72.38%
predicted object trafficlight: 71.97%
predicted object person: 70.41%
predicted object car: 69.04%
predicted object car: 66.77%
predicted object person: 61.58%
predicted object person: 61.04%
predicted object car: 60.25%
predicted object person: 59.70%
predicted object trafficlight: 52.65%
predicted object person: 98.76%
predicted object person: 98.28%
predicted object person: 97.57%
predicted object car: 96.09%
predicted object person: 95.32