In [1]:
# Import libraries
import numpy as np
import cv2
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Stream frames from a video file, run YOLOv2-tiny object detection on every
# frame, and show the annotated frames in an OpenCV window until the video
# ends or 'q' is pressed.

# ---- Configuration -------------------------------------------------------
VIDEO_SOURCE = 'images/video_sample.mp4'
YOLO_CONFIG_PATH = 'datasets/yolov2-tiny.cfg'
YOLO_WEIGHTS_PATH = 'datasets/yolov2-tiny.weights'

# Pixel scale factor 0.003922 ~= 1/255 maps 8-bit pixel values into [0, 1].
BLOB_SCALE_FACTOR = 0.003922
# Network input size. Commonly used YOLO sizes are 320x320, 416x416 and 608x608;
# a larger size generally means more accuracy but less speed.
BLOB_SIZE = (416, 416)

# Detections whose best class score is not above this value are dropped
# before non-maximum suppression (NMS).
MIN_CLASS_CONFIDENCE = 0.2
# Score threshold and overlap threshold passed to cv2.dnn.NMSBoxes
# (adjust and try for better performance).
NMS_SCORE_THRESHOLD = 0.5
NMS_OVERLAP_THRESHOLD = 0.4

# ---- Loop-invariant setup (done once, not once per frame) ------------------
# The 80 class labels, in the index order used by the model's class scores.
class_labels = ["person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","trafficlight","firehydrant",
                "stopsign","parkingmeter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra",
                "giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sportsball",
                "kite","baseballbat","baseballglove","skateboard","surfboard","tennisracket","bottle","wineglass","cup",
                "fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hotdog","pizza",
                "donut","cake","chair","sofa","pottedplant","bed","diningtable","toilet","tvmonitor","laptop","mouse",
                "remote","keyboard","cellphone","microwave","oven","toaster","sink","refrigerator","book","clock","vase",
                "scissors","teddybear","hairdrier","toothbrush"]

# Five base colours given as "B,G,R" strings (OpenCV draws in BGR order):
# green, red, blue, cyan, yellow.
# Each string is split on ',' and converted to an int array, then the five
# rows are tiled so that every class id has a colour row.
class_colors = ["0,255,0","0,0,255","255,0,0","255,255,0","0,255,255"]
class_colors = [np.array(every_color.split(",")).astype("int") for every_color in class_colors]
class_colors = np.array(class_colors)
# Ceiling division: 80 labels / 5 colours -> 16 repetitions -> 80 rows.
color_repeats = -(-len(class_labels) // len(class_colors))
class_colors = np.tile(class_colors, (color_repeats, 1))

# Load the pre-trained Darknet model from its .cfg and .weights files.
yolo_model = cv2.dnn.readNetFromDarknet(YOLO_CONFIG_PATH, YOLO_WEIGHTS_PATH)

# Resolve the names of the network's output layers.
# getUnconnectedOutLayers() returns 1-based layer indices; depending on the
# OpenCV version they come back as an Nx1 array or a flat array, so flatten
# before indexing to support both.
yolo_layers = yolo_model.getLayerNames()
yolo_output_layer = [
    yolo_layers[int(layer_index) - 1]
    for layer_index in np.array(yolo_model.getUnconnectedOutLayers()).flatten()
]

# ---- Per-frame detection loop ----------------------------------------------
webcam_video_stream = cv2.VideoCapture(VIDEO_SOURCE)

try:
    if not webcam_video_stream.isOpened():
        raise IOError("Unable to open video source: {}".format(VIDEO_SOURCE))

    while True:
        ret, current_frame = webcam_video_stream.read()
        # read() reports failure at the end of the video (or on a read error);
        # stop instead of dereferencing a missing frame.
        if not ret or current_frame is None:
            break

        # Boxes and text are drawn directly onto this frame.
        img_to_detect = current_frame
        # get height and width of image
        img_height = img_to_detect.shape[0]
        img_width = img_to_detect.shape[1]

        # convert to blob to pass into model
        img_blob = cv2.dnn.blobFromImage(img_to_detect, BLOB_SCALE_FACTOR, BLOB_SIZE, swapRB=True, crop=False)

        # Input preprocessed blob into model and pass through the model
        yolo_model.setInput(img_blob)
        # obtain the detection predictions by the model using forward() method
        obj_detection_layers = yolo_model.forward(yolo_output_layer)

        # Candidates collected for non-maximum suppression (NMS), which removes
        # weaker boxes that overlap a stronger one:
        # [class id], [start x, start y, width, height], [confidence]
        class_ids_list = []
        boxes_list = []
        confidences_list = []

        # loop over each of the layer outputs
        for object_detection_layer in obj_detection_layers:
            # loop over the detections
            for object_detection in object_detection_layer:

                # object_detection[0:4] => box centre x, centre y, width, height
                #                          (scaled to the frame size below)
                # object_detection[5:]  => per-class scores
                all_scores = object_detection[5:]
                predicted_class_id = int(np.argmax(all_scores))
                prediction_confidence = all_scores[predicted_class_id]

                # keep only predictions whose best class score passes the pre-NMS filter
                if prediction_confidence > MIN_CLASS_CONFIDENCE:
                    # scale the box from network output to frame pixels
                    bounding_box = object_detection[0:4] * np.array([img_width, img_height, img_width, img_height])
                    (box_center_x_pt, box_center_y_pt, box_width, box_height) = bounding_box.astype("int")
                    start_x_pt = int(box_center_x_pt - (box_width/2))
                    start_y_pt = int(box_center_y_pt - (box_height/2))

                    # NMSBoxes needs confidences as floats and box values as ints
                    class_ids_list.append(predicted_class_id)
                    confidences_list.append(float(prediction_confidence))
                    boxes_list.append([start_x_pt, start_y_pt, int(box_width), int(box_height)])

        # NMS returns the indices (into boxes_list) of the boxes to keep.
        # Skip the call when there are no candidates, and flatten the result
        # because its shape (Nx1, flat, or empty tuple) depends on the OpenCV version.
        if boxes_list:
            max_value_ids = cv2.dnn.NMSBoxes(boxes_list, confidences_list, NMS_SCORE_THRESHOLD, NMS_OVERLAP_THRESHOLD)
            max_value_ids = np.array(max_value_ids).flatten()
        else:
            max_value_ids = []

        # loop through the final set of detections remaining after NMS and draw bounding box and write text
        for max_valueid in max_value_ids:
            kept_index = int(max_valueid)
            start_x_pt, start_y_pt, box_width, box_height = boxes_list[kept_index]

            # get the predicted class id and confidence
            predicted_class_id = class_ids_list[kept_index]
            prediction_confidence = confidences_list[kept_index]

            # obtain the bounding box end co-ordinates
            end_x_pt = start_x_pt + box_width
            end_y_pt = start_y_pt + box_height

            # look up the colour assigned to this class id and convert it to a
            # list of plain ints for the drawing calls
            box_color = [int(c) for c in class_colors[predicted_class_id]]

            # Print the prediction in console
            predicted_class_label = "{}: {:.2f}%".format(class_labels[predicted_class_id], prediction_confidence*100)
            print("predicted object {}".format(predicted_class_label))

            # Draw rectangle and text in the image
            cv2.rectangle(img_to_detect, (start_x_pt, start_y_pt), (end_x_pt, end_y_pt), box_color, 2)
            cv2.putText(img_to_detect, predicted_class_label, (start_x_pt, start_y_pt-5), cv2.FONT_HERSHEY_COMPLEX, 0.5, box_color, 1)

        cv2.imshow("Detection Output", img_to_detect)

        # terminate while loop if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the stream and close all OpenCV windows, even when the
    # loop exits through an exception or a kernel interrupt.
    webcam_video_stream.release()
    cv2.destroyAllWindows()


predicted object person: 68.27%
predicted object car: 67.21%
predicted object person: 61.31%
predicted object person: 70.57%
predicted object car: 64.74%
predicted object person: 61.44%
predicted object car: 60.11%
predicted object person: 75.18%
predicted object car: 68.15%
predicted object person: 60.90%
predicted object person: 72.06%
predicted object car: 68.74%
predicted object person: 60.42%
predicted object person: 71.75%
predicted object car: 68.26%
predicted object person: 60.47%
predicted object person: 72.05%
predicted object car: 65.81%
predicted object person: 60.97%
predicted object person: 66.65%
predicted object car: 64.16%
predicted object car: 72.42%
predicted object person: 68.23%
predicted object car: 64.93%
predicted object car: 63.12%
predicted object person: 69.06%
predicted object car: 66.73%
predicted object car: 64.22%
predicted object car: 60.82%
predicted object person: 69.67%
predicted object car: 65.46%
predicted object person: 62.34%
predicted object pers

predicted object car: 71.10%
predicted object car: 61.54%
predicted object car: 61.02%
predicted object car: 72.87%
predicted object car: 63.82%
predicted object car: 62.60%
predicted object car: 60.43%
predicted object car: 73.50%
predicted object car: 64.35%
predicted object car: 64.19%
predicted object person: 62.95%
predicted object car: 73.06%
predicted object car: 65.08%
predicted object car: 63.78%
predicted object person: 62.52%
predicted object car: 60.33%
predicted object car: 73.86%
predicted object car: 65.56%
predicted object car: 61.60%
predicted object car: 72.85%
predicted object car: 65.05%
predicted object car: 61.03%
predicted object car: 72.98%
predicted object car: 65.40%
predicted object car: 61.05%
predicted object car: 72.86%
predicted object car: 63.40%
predicted object car: 75.87%
predicted object car: 64.05%
predicted object person: 62.76%
predicted object car: 76.35%
predicted object car: 65.34%
predicted object person: 61.32%
predicted object car: 76.63%
pr