# Week 11

This week, we will move beyond image classification and focus on a new task: object detection. This requires us to localize and classify objects from multiple classes in an image.

Let's do a GPU check before training.

In [1]:
import tensorflow as tf
from tensorflow.python.client import device_lib

# count the GPUs that TensorFlow can see
numGPUs = len(tf.config.experimental.list_physical_devices('GPU'))

print('Num GPUs Available: ', numGPUs)

if numGPUs > 0:
    print(tf.test.gpu_device_name())

    # describe every GPU that is actually present; indexing fixed positions of
    # the device list raises IndexError when there is only one GPU and may
    # select entries that are not GPUs at all
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            print(device.physical_device_desc)

Num GPUs Available:  0


# Lecture 18 - Classical Object Detection

Below we import some libraries and write some utility functions.

**Note**: Some code here is adapted from https://www.pyimagesearch.com/2020/06/22/turning-any-cnn-image-classifier-into-an-object-detector-with-keras-tensorflow-and-opencv/.

In [2]:
import imutils
import numpy as np

def sliding_window(image, stride, wh):
    """Yield every full-size window of an image, scanning row by row.

    Args:
        image: array indexed as image[row, column, ...]; only its first two
            dimensions are read.
        stride: step in pixels between consecutive window origins, used on
            both axes.
        wh: (width, height) of the window in pixels.

    Yields:
        (x, y, window) tuples, where (x, y) is the top-left corner of the
        window and window is image[y:y + height, x:x + width].
    """
    win_w, win_h = wh[0], wh[1]

    # the last valid origin is `dimension - window size`; the +1 makes range()
    # include it, so a window flush with the far edge (or an image exactly the
    # size of the window) is no longer skipped
    # slide window vertically
    for y in range(0, image.shape[0] - win_h + 1, stride):

        # slide window horizontally
        for x in range(0, image.shape[1] - win_w + 1, stride):

            # yield the top-left corner of the window and the window itself
            yield (x, y, image[y:y + win_h, x:x + win_w])
            
def image_pyramid(image, scale = 2, minSize = (224, 224)):
    """Yield an image followed by successively smaller copies of it.

    Args:
        image: array indexed as image[row, column, ...].
        scale: factor by which the width is divided at each level; must be
            greater than 1 so that the pyramid terminates.
        minSize: (width, height); generation stops as soon as a level is
            narrower or shorter than this.

    Yields:
        The original image, then each resized level.

    Raises:
        ValueError: if scale is not greater than 1.
    """
    # a scale of 1 or less never shrinks the image, so the loop below would
    # never fall under the minimum size and would run forever
    if scale <= 1:
        raise ValueError('scale must be greater than 1, got {}'.format(scale))

    # yield the original image
    yield image

    while True:
        # find the width of the next image in the pyramid
        w = int(image.shape[1] / scale)

        # resize the image while maintaining aspect ratio
        image = imutils.resize(image, width = w)

        # if the image is below the min size, stop
        if image.shape[0] < minSize[1] or image.shape[1] < minSize[0]:
            break

        # yield the next image in the pyramid
        yield image
        
#  Felzenszwalb et al.
def non_max_suppression_slow(boxes, overlapThresh):
    """Greedy non-maximum suppression over axis-aligned boxes.

    Boxes are visited from the largest bottom edge (y2) downwards. Each
    visited box is kept, and every remaining box whose intersection with it
    covers more than `overlapThresh` of that remaining box's own area is
    discarded.

    Args:
        boxes: 2-D array whose columns 0..3 are x1, y1, x2, y2.
        overlapThresh: overlap ratio above which a box is suppressed.

    Returns:
        An empty list when `boxes` is empty, otherwise the kept rows of
        `boxes` in the order they were picked.
    """
    # nothing to do without boxes
    if len(boxes) == 0:
        return []

    left, top = boxes[:, 0], boxes[:, 1]
    right, bottom = boxes[:, 2], boxes[:, 3]

    # per-box area, and the visiting order (ascending bottom edge)
    box_area = (right - left + 1) * (bottom - top + 1)
    order = np.argsort(bottom)

    kept = []

    while len(order) > 0:
        # the box with the largest bottom edge among those still in play
        tail = len(order) - 1
        current = order[tail]
        kept.append(current)

        # positions (within `order`) to drop this round; the picked box always goes
        doomed = [tail]

        for pos, other in enumerate(order[:tail]):
            # extent of the intersection rectangle, clamped at zero
            inter_w = max(0, min(right[current], right[other]) - max(left[current], left[other]) + 1)
            inter_h = max(0, min(bottom[current], bottom[other]) - max(top[current], top[other]) + 1)

            # intersection relative to the area of the candidate box
            ratio = float(inter_w * inter_h) / box_area[other]

            if ratio > overlapThresh:
                doomed.append(pos)

        order = np.delete(order, doomed)

    # return the bounding boxes that were picked
    return boxes[kept]

In [3]:
from tensorflow.keras.applications import ResNet101
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications import imagenet_utils
from imutils.object_detection import non_max_suppression
import cv2
import numpy as np
import pickle
import time

## Transfer Classifier



In [9]:
# configuration for the sliding-window object detection procedure

# input image and the width it is resized to before searching
IMAGE = 'cat_dog.jpg'
WIDTH = 800

# image pyramid: the width is divided by this factor at every level
P_SCALE = 1.25

# sliding window: step in pixels and the (width, height) window shapes to try
WIN_STRIDE = 10
ROI_SIZES = [(200, 100), (150, 100), (100, 100), (100, 150), (100, 200)]

# size every ROI is resized to before it is classified
INPUT_SIZE = (224, 224)

# show each window while searching (waits for a key press per window)
VIZ = False

# detection filtering: minimum class probability and NMS overlap threshold
MINCONF = 0.8
OVERLAP_THRESHOLD = 0.6

In [10]:
# load ResNet pretrained on ImageNet
print('[INFO] Loading network...')

if numGPUs <= 1:
    # zero or one GPU: build the model normally
    model = ResNet101(weights = 'imagenet')

else:
    # several GPUs: build the model under a mirrored strategy so batches are spread over all of them
    strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())

    with strategy.scope():
        model = ResNet101(weights = 'imagenet')

# load the input image, resize to specified width, and find the new dimensions
original = cv2.imread(IMAGE)

# cv2.imread returns None instead of raising when the file cannot be read
if original is None:
    raise FileNotFoundError('could not read image: {}'.format(IMAGE))

original = imutils.resize(original, width = WIDTH)
H, W = original.shape[:2]

# one float32 array of preprocessed ROIs per entry of ROI_SIZES
rois = []

# for each entry of ROI_SIZES, the (startX, startY, endX, endY) box of every ROI in `original`
locs = []

for ROI_SIZE in ROI_SIZES:
    print('[INFO] Searching ROI size', ROI_SIZE)
    new_rois = []
    new_locs = []

    # initialize image pyramid
    pyramid = image_pyramid(original, scale = P_SCALE, minSize = ROI_SIZE)

    # start a timer
    start = time.time()

    # preprocess images from pyramid
    for image in pyramid:
        print('[INFO] New pyramid image of shape', image.shape)
        # find the scale between the original image and current layer of pyramid
        scale = W / float(image.shape[1])

        # loop over sliding window locations
        for x, y, roiOriginal in sliding_window(image, WIN_STRIDE, ROI_SIZE):
            # scale coordinates of ROI back to the original image
            x = int(x * scale)
            y = int(y * scale)
            w = int(ROI_SIZE[0] * scale)
            h = int(ROI_SIZE[1] * scale)

            # OpenCV loads images as BGR, but preprocess_input expects RGB input
            roi = cv2.cvtColor(roiOriginal, cv2.COLOR_BGR2RGB)

            # preprocess ROI
            roi = cv2.resize(roi, INPUT_SIZE)
            roi = img_to_array(roi)
            roi = preprocess_input(roi)

            # update list of ROIs and coordinates
            new_rois.append(roi)
            new_locs.append((x, y, x + w, y + h))

            if VIZ:
                # clone image and draw a bounding box
                clone = original.copy()
                cv2.rectangle(clone, (x, y), (x + w, y + h), (0, 255, 0), 2)

                # show visualization of current ROI (imshow expects BGR, so use the unconverted window)
                cv2.imshow('Visualization', clone)
                cv2.imshow('ROI', roiOriginal)
                cv2.waitKey(0)

    end = time.time()

    print('[INFO] Looping over pyramid/windows took', end - start, 's')

    new_rois = np.array(new_rois, dtype = 'float32')

    rois.append(new_rois)
    locs.append(new_locs)

# save the ROIs and their boxes; the with-block closes the file even if dump fails
with open('rois.pkl', 'wb') as output:
    pickle.dump([rois, locs], output)
    
for roi, loc, ROI_SIZE in zip(rois, locs, ROI_SIZES):

    # clear GPU memory
    # NOTE(review): clear_session() resets Keras global state while `model` is still in use - confirm this is intended
    tf.keras.backend.clear_session()

    # run the classifier over every window collected for this ROI size
    print('[INFO] Classifying ROIs...')
    aspect = np.round(ROI_SIZE[0]/ROI_SIZE[1], 2)
    print('[INFO] Number of ROIs of aspect ratio', aspect, 'to 1:', roi.shape[0])
    start = time.time()
    preds = model.predict(roi, batch_size = 128, verbose = 1)
    end = time.time()
    print('[INFO] Classifying ROIs took', end - start, 's')

    predictions = imagenet_utils.decode_predictions(preds, top = 1)

    # group confident detections by label: label -> [(box, probability), ...]
    labels = {}

    for i, top1 in enumerate(predictions):
        # the single best guess for the current ROI
        imagenetID, label, prob = top1[0]

        # keep only detections at or above the confidence threshold
        if prob >= MINCONF:
            labels.setdefault(label, []).append((loc[i], prob))

    # show the detections of each label before and after NMS
    for label, detections in labels.items():
        #print('[INFO] Showing results for', label)

        # draw every raw detection on a fresh copy of the image
        clone = original.copy()

        for (startX, startY, endX, endY), prob in detections:
            cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)

        cv2.imshow('Before NMS', clone)

        # start over on a clean copy for the suppressed result
        clone = original.copy()

        # split the detections into boxes and probabilities
        boxes = np.array([box for box, _ in detections])
        proba = np.array([score for _, score in detections])

        #print('[INFO] Applying NMS...')
        boxes = non_max_suppression_slow(boxes, OVERLAP_THRESHOLD)

        for startX, startY, endX, endY in boxes:
            cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)

            # put the label above the box unless that would run off the top of the image
            if startY - 10 > 10:
                y = startY - 10
            else:
                y = startY + 10

            cv2.putText(clone, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 255, 0), 2)

        # show the output after non-maxima suppression and wait for a key press
        cv2.imshow('After NMS', clone)
        cv2.waitKey(0)

# close windows
cv2.destroyAllWindows()

[INFO] Loading network...
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
[INFO] Searching ROI size (200, 100)
[INFO] New pyramid image of shape (450, 800, 3)
[INFO] New pyramid image of shape (360, 640, 3)
[INFO] New pyramid image of shape (288, 512, 3)
[INFO] New pyramid image of shape (230, 409, 3)
[INFO] New pyramid image of shape (183, 327, 3)
[INFO] New pyramid image of shape (146, 261, 3)
[INFO] New pyramid image of shape (116, 208, 3)
[INFO] Looping over pyramid/windows took 1.311359167098999 s
[INFO] Searching ROI size (150, 100)
[INFO] New pyramid image of shape (450, 800, 3)
[INFO] New pyramid image of shape (360, 640, 3)
[INFO] New pyramid image of shape (288, 512, 3)
[INFO] New pyramid image of shape (230, 409, 3)
[INFO] New pyramid image of shape (183, 327, 3)
[INFO] New pyramid image of shape (146, 261, 3)
[INFO] New pyramid image of shape (116, 208, 3)
[INFO] Looping ove

# Selective Search and R-CNN

## Selective Search

Selective Search is a method for generating region proposals as an alternative to sliding windows and image pyramids. The sliding-window approach is brute-force: it is very slow, and it is very sensitive to hyperparameters such as the window size and the pyramid scaling factor. Selective Search tries to make this process more intelligent and more automatic.

In [4]:
# import libraries
import random
import time
import cv2

def selective_search(image_path, method = 'fast', viz = False):
    """Run OpenCV Selective Search on an image and return its region proposals.

    Uses cv2.ximgproc, which is part of the opencv-contrib-python build.

    Args:
        image_path: path of the image file, read with cv2.imread.
        method: 'fast' selects the fast mode; any other value selects the
            quality mode.
        viz: if True, show the proposals in an OpenCV window in chunks of
            100, advancing on any key press and stopping when `q` is pressed.

    Returns:
        The value returned by the segmentation's process() call; each entry
        unpacks as (x, y, w, h).

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    # number of proposals drawn per visualization frame
    chunk_size = 100

    # import the image; cv2.imread returns None instead of raising on failure
    image = cv2.imread(image_path)

    if image is None:
        raise FileNotFoundError('could not read image: {}'.format(image_path))

    # initialize OpenCV's selective search implementation and set the input image
    ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()
    ss.setBaseImage(image)

    # check to see if we are using the *fast* but *less accurate* version of selective search
    if method == 'fast':
        print('using *fast* selective search')
        ss.switchToSelectiveSearchFast()

    # otherwise we are using the *slower* but *more accurate* version
    else:
        print('using *quality* selective search')
        ss.switchToSelectiveSearchQuality()

    # run selective search on the input image
    start = time.time()
    rects = ss.process()
    end = time.time()

    # show how long selective search took to run along with the total
    # number of returned region proposals
    print('selective search took', end - start, 'seconds')
    print('selective search generated', len(rects), 'region proposals')

    if viz:
        # loop over the region proposals in chunks (so we can visualize them)
        for i in range(0, len(rects), chunk_size):
            # clone the original image so we can draw on it
            output = image.copy()

            # loop over the current chunk; the slice must end at i + chunk_size,
            # otherwise every chunk after the first is empty
            for (x, y, w, h) in rects[i:i + chunk_size]:
                # draw the region proposal bounding box on the image
                color = [random.randint(0, 255) for j in range(0, 3)]
                cv2.rectangle(output, (x, y), (x + w, y + h), color, 2)

            # show the output image
            cv2.imshow("Output", output)
            key = cv2.waitKey(0) & 0xFF

            # if the `q` key was pressed, break from the loop
            if key == ord("q"):
                break

        cv2.destroyAllWindows()

    return rects

In [29]:
# quality mode with visualization; the returned proposals are displayed as the cell output
selective_search('cat_dog.jpg', method = 'quality', viz = True)

using *quality* selective search
selective search took 34.400309562683105 seconds
selective search generated 16930 region proposals


array([[1253,  391,   28,  105],
       [ 247,  563,   14,   37],
       [ 450,  230,   56,  116],
       ...,
       [ 480,    0, 1020,  300],
       [ 605,    0,  895,  354],
       [  84,    0, 1416,  300]])

In [5]:
# fast mode with visualization; cv2.ximgproc is only available in the opencv-contrib-python build
selective_search('cat_dog.jpg', method = 'fast', viz = True)

AttributeError: module 'cv2' has no attribute 'ximgproc'

In [39]:
# image to run the selective-search detector on
IMAGE = 'dog.jpg'

# mode passed to selective_search: 'fast', or any other value for the quality mode
METHOD = 'fast'

# minimum class probability for a detection to be kept
MINCONF = 0.8

In [40]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.applications import imagenet_utils
from tensorflow.keras.preprocessing.image import img_to_array
from imutils.object_detection import non_max_suppression
import numpy as np

# load ResNet from disk with weights pre-trained on ImageNet
# NOTE(review): this cell imports ResNet50 but builds ResNet101 - confirm which model is intended
print('loading ResNet...')

if numGPUs <= 1:
    # zero or one GPU: build the model normally
    model = ResNet101(weights = 'imagenet')

else:
    # several GPUs: build the model under a mirrored strategy so batches are spread over all of them
    strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())

    with strategy.scope():
        model = ResNet101(weights = 'imagenet')

# load the input image from disk and get its dimensions
image = cv2.imread(IMAGE)

# cv2.imread returns None instead of raising when the file cannot be read
if image is None:
    raise FileNotFoundError('could not read image: {}'.format(IMAGE))

(H, W) = image.shape[:2]

# run selective search on the input image
rects = selective_search(IMAGE, METHOD)

# initialize the list of region proposals to classify and bounding boxes
rois = []
locs = []

# preprocess boxes from selective search
for (x, y, w, h) in rects:
    # filter out rois narrower or shorter than 10% of the image
    if w / float(W) < 0.1 or h / float(H) < 0.1:
        continue

    # extract the region from the input image
    roi = image[y:y + h, x:x + w]

    # convert it from BGR to RGB channel ordering
    roi = cv2.cvtColor(roi, cv2.COLOR_BGR2RGB)

    # resize roi to 224x224 (input dimensions required by our pre-trained CNN)
    roi = cv2.resize(roi, (224, 224))

    # further preprocess the ROI
    roi = img_to_array(roi)
    roi = preprocess_input(roi)

    # update proposals and bounding box lists; boxes are kept as (x, y, w, h)
    rois.append(roi)
    locs.append((x, y, w, h))

# convert ROIs to numpy array
rois = np.array(rois)

# classify each proposal with ResNet
print('classifying ROIs...')
start = time.time()
preds = model.predict(rois)
end = time.time()

print('classifying ROIs took', end - start, 's')

predictions = imagenet_utils.decode_predictions(preds, top = 1)

# group confident detections by label: label -> [(box, probability), ...]
labels = {}

for i, top1 in enumerate(predictions):
    # the single best guess for the current ROI
    imagenetID, label, prob = top1[0]

    # keep only detections at or above the confidence threshold
    if prob >= MINCONF:
        # convert the stored (x, y, w, h) into corner coordinates
        x, y, w, h = locs[i]
        labels.setdefault(label, []).append(((x, y, x + w, y + h), prob))

# show the detections of each label before and after NMS
for label, detections in labels.items():
    print('showing results for', label)

    # draw every raw detection on a fresh copy of the image
    clone = image.copy()

    for (startX, startY, endX, endY), prob in detections:
        cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)

    cv2.imshow('Before NMS', clone)

    # start over on a clean copy for the suppressed result
    clone = image.copy()

    # split the detections into boxes and probabilities
    boxes = np.array([box for box, _ in detections])
    proba = np.array([score for _, score in detections])

    # apply NMS
    print('applying NMS...')
    boxes = non_max_suppression(boxes, proba)

    for startX, startY, endX, endY in boxes:
        cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)

        # put the label above the box unless that would run off the top of the image
        if startY - 10 > 10:
            y = startY - 10
        else:
            y = startY + 10

        cv2.putText(clone, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 255, 0), 2)

    # show the output after non-maxima suppression and wait for a key press
    cv2.imshow('After NMS', clone)
    cv2.waitKey(0)

# close windows
cv2.destroyAllWindows()

loading ResNet...
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
using *fast* selective search
selective search took 4.033909559249878 seconds
selective search generated 1321 region proposals
classifying ROIs...
classifying ROIs took 6.421988010406494 s
showing results for golden_retriever
applying NMS...
showing results for Dandie_Dinmont
applying NMS...
showing results for llama
applying NMS...
showing results for Brittany_spaniel
applying NMS...
