# Week 10

This week and next week, we will move beyond image classification and focus on a new task: object detection. This requires us to localize and classify objects from multiple classes in an image.

# Lecture 18 - Classical Object Detection

Below we import some libraries and write some utility functions.

**Note**: Some code here is adapted from https://www.pyimagesearch.com/2020/06/22/turning-any-cnn-image-classifier-into-an-object-detector-with-keras-tensorflow-and-opencv/.

In [1]:
import imutils
import numpy as np

def sliding_window(image, stride, wh):
    """Yield every window of size ``wh`` that fits entirely inside ``image``.

    Args:
        image: array indexed as ``image[row, column, ...]``.
        stride: step between consecutive window positions, used for both
            the vertical and the horizontal direction.
        wh: ``(width, height)`` of the window.

    Yields:
        ``(x, y, window)`` where ``(x, y)`` is the upper-left corner of the
        window in ``image`` and ``window`` is the slice
        ``image[y:y + height, x:x + width]``.
    """
    win_w, win_h = wh[0], wh[1]

    # The last corner at which a full window still fits is (size - window),
    # so the exclusive range bound must be one past it. Without the + 1 an
    # image exactly the size of the window would produce no windows at all.
    # slide window vertically
    for y in range(0, image.shape[0] - win_h + 1, stride):

        # slide window horizontally
        for x in range(0, image.shape[1] - win_w + 1, stride):

            # yield the upper-left corner of the window and the window
            yield (x, y, image[y:y + win_h, x:x + win_w])
            
def image_pyramid(image, scale = 2, minSize = (224, 224)):
    """Yield ``image`` followed by successively smaller resized copies.

    Args:
        image: array indexed as ``image[row, column, ...]``.
        scale: factor by which the width is divided between consecutive
            layers. Must be greater than 1, otherwise the layers never
            shrink and the generator would never terminate.
        minSize: ``(width, height)``; generation stops at the first layer
            that is smaller than this in either dimension.

    Yields:
        The original image first, then each downscaled layer that is still
        at least ``minSize``.

    Raises:
        ValueError: if ``scale`` is not greater than 1 (raised when
            iteration starts, since this is a generator).
    """
    if scale <= 1:
        raise ValueError('scale must be greater than 1, got {}'.format(scale))

    # yield the original image
    yield image

    while True:
        # find the width of the next image in the pyramid
        w = int(image.shape[1] / scale)

        # resize the image while maintaining aspect ratio
        image = imutils.resize(image, width = w)

        # if the image is below the min size, stop
        if image.shape[0] < minSize[1] or image.shape[1] < minSize[0]:
            break

        # yield the next image in the pyramid
        yield image
        
#  Felzenszwalb et al.
def non_max_suppression_slow(boxes, overlapThresh):
    """Greedy non-maximum suppression using an explicit Python loop.

    Boxes are visited in order of decreasing bottom-right y-coordinate. Each
    visited box is kept, and every remaining box whose overlap with it
    exceeds ``overlapThresh`` is discarded. The overlap is the intersection
    area divided by the area of the *remaining* box (not intersection over
    union). Widths and heights are computed as ``x2 - x1 + 1``.

    Args:
        boxes: numpy array with one row per box, columns
            ``[x1, y1, x2, y2]``.
        overlapThresh: overlap ratio above which a box is suppressed.

    Returns:
        The rows of ``boxes`` that were kept, in the order they were picked,
        or an empty list when ``boxes`` is empty.
    """
    # nothing to do without boxes
    if len(boxes) == 0:
        return []

    # split the array into its coordinate columns
    left, top = boxes[:, 0], boxes[:, 1]
    right, bottom = boxes[:, 2], boxes[:, 3]

    # per-box area, and processing order by bottom-right y-coordinate
    areas = (right - left + 1) * (bottom - top + 1)
    remaining = np.argsort(bottom)

    # indexes of the boxes that survive
    kept = []

    while len(remaining) > 0:
        # the box with the largest bottom edge sits at the end of the order
        tail = len(remaining) - 1
        current = remaining[tail]
        kept.append(current)

        # positions (within `remaining`) to drop; the current box always goes
        doomed = [tail]

        for position, other in enumerate(remaining[:tail]):
            # extent of the intersection rectangle, clamped at zero
            inter_w = max(0, min(right[current], right[other])
                          - max(left[current], left[other]) + 1)
            inter_h = max(0, min(bottom[current], bottom[other])
                          - max(top[current], top[other]) + 1)

            # fraction of the other box covered by the intersection
            ratio = float(inter_w * inter_h) / areas[other]

            if ratio > overlapThresh:
                doomed.append(position)

        # remove the current box and everything it suppressed
        remaining = np.delete(remaining, doomed)

    # return the bounding boxes that were picked
    return boxes[kept]

In [2]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications import imagenet_utils
from imutils.object_detection import non_max_suppression
import numpy as np
import time
import cv2

## Transfer Classifier



In [3]:
# initialize variables used for the object detection procedure
# width the input image is resized to before detection
WIDTH = 800
# scale factor between consecutive layers of the image pyramid
P_SCALE = 1.5
# step of the sliding window, in pixels of the current pyramid layer
WIN_STRIDE = 4
# (width, height) of the sliding window; also the minimum pyramid layer size
# alternative window size kept for experimentation:
#ROI_SIZE = (550, 350)
ROI_SIZE = (250, 300)
# size every ROI is resized to before being passed to the classifier
INPUT_SIZE = (224, 224)
# path of the image to run detection on
IMAGE = 'cat_dog.jpg'
# when True, show each sliding-window position while the ROIs are generated
VIZ = False
# minimum top-1 probability for a ROI to be kept as a detection
MINCONF = 0.8
# overlap ratio above which non-maximum suppression discards a box
OVERLAP_THRESHOLD = 0.5

In [4]:
# load ResNet pretrained on ImageNet
print("loading network...")
model = ResNet50(weights = 'imagenet', include_top=True)

# load the input image; cv2.imread returns None rather than raising when the
# file cannot be read, so fail here with a clear message instead of crashing
# later inside the resize call
original = cv2.imread(IMAGE)
if original is None:
    raise FileNotFoundError('could not read image: {}'.format(IMAGE))

# resize to specified width, and find the new dimensions
original = imutils.resize(original, width = WIDTH)
H, W = original.shape[:2]

# initialize image pyramid
pyramid = image_pyramid(original, scale = P_SCALE, minSize = ROI_SIZE)

# initialize a list for ROIs from the pyramid and sliding windows
rois = []

# initialize a list for coordinates in the original image for the ROIs
locs = []

# start a timer
start = time.time()

# preprocess images from pyramid
for image in pyramid:
    print('new pyramid image')
    # find the scale between the original image and current layer of pyramid
    scale = W / float(image.shape[1])

    # size of the window in original-image coordinates; it depends only on
    # the layer's scale, so compute it once per layer
    w = int(ROI_SIZE[0] * scale)
    h = int(ROI_SIZE[1] * scale)

    # loop over sliding window locations
    for x, y, roiOriginal in sliding_window(image, WIN_STRIDE, ROI_SIZE):
        # map the window corner back to original-image coordinates
        x = int(x * scale)
        y = int(y * scale)

        # preprocess ROI: resize to the classifier input size and apply the
        # ResNet input preprocessing
        roi = cv2.resize(roiOriginal, INPUT_SIZE)
        roi = img_to_array(roi)
        roi = preprocess_input(roi)

        # update list of ROIs and coordinates (x1, y1, x2, y2)
        rois.append(roi)
        locs.append((x, y, x + w, y + h))

        if VIZ:
            # clone image and draw a bounding box
            clone = original.copy()
            cv2.rectangle(clone, (x, y), (x + w, y + h), (0, 255, 0), 2)

            # show visualization of current ROI; blocks until a key is pressed
            cv2.imshow('Visualization', clone)
            cv2.imshow('ROI', roiOriginal)
            cv2.waitKey(0)

end = time.time()

print('looping over pyramid/windows took', end - start, 's')

# stack the ROIs into a single float32 batch for the network
rois = np.array(rois, dtype = 'float32')

# run ResNet over every proposal and time the forward pass
print('classifying ROIs...')
start = time.time()
preds = model.predict(rois)
end = time.time()

print('classifying ROIs took', end - start, 's')

# keep only the top-1 decoded ImageNet prediction for each ROI
predictions = imagenet_utils.decode_predictions(preds, top = 1)

# maps each predicted label to its list of (box, probability) detections
labels = {}

for i, p in enumerate(predictions):
    # unpack the single top prediction of the current ROI
    imagenetID, label, prob = p[0]

    # skip weak detections
    if prob < MINCONF:
        continue

    # coordinates of this ROI in the original image
    box = locs[i]

    # append to this label's detections, creating the list on first use
    L = labels.setdefault(label, [])
    L.append((box, prob))
        
# visualize the detections of each label separately, before and after NMS
for label, detections in labels.items():
    print('showing results for', label)

    # draw every raw detection for this label on a fresh copy of the image
    clone = original.copy()
    for box, prob in detections:
        startX, startY, endX, endY = box
        cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)

    # show the results before NMS
    cv2.imshow('Before NMS', clone)

    # start again from a clean copy for the post-NMS picture
    clone = original.copy()

    # split the detections into parallel arrays of boxes and probabilities
    boxes = np.array([d[0] for d in detections])
    proba = np.array([d[1] for d in detections])

    # apply NMS (this variant does not use the probabilities)
    print('applying NMS...')
    boxes = non_max_suppression_slow(boxes, OVERLAP_THRESHOLD)

    # draw each surviving box together with its label
    for (startX, startY, endX, endY) in boxes:
        cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)

        # put the text above the box, or below its top edge when the box
        # is too close to the top of the image
        if startY - 10 > 10:
            y = startY - 10
        else:
            y = startY + 10

        cv2.putText(clone, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 255, 0), 2)

    # show the output after non-maxima suppression and wait for a key press
    cv2.imshow('After NMS', clone)
    cv2.waitKey(0)

loading network...
new pyramid image
looping over pyramid/windows took 1.654421091079712 s
classifying ROIs...
classifying ROIs took 7.001559734344482 s
showing results for Brabancon_griffon
applying NMS...
showing results for Bouvier_des_Flandres
applying NMS...
showing results for Labrador_retriever
applying NMS...
showing results for Egyptian_cat
applying NMS...
