# Hands on Computer Vision Exercise Notebook

## General Setup

Make sure the selected kernel for the notebook is "venv".

In [None]:
import cv2
import numpy as np
import math
import matplotlib.pyplot as plt

### Exercise 2: Computer Images and Processing

In [None]:
# Load two test images from disk

# NOTE: cv2.imread returns None (it does not raise) when a file cannot be read,
# so check right away instead of failing later with a confusing AttributeError.
aerial = cv2.imread('../data/rareplanes6.jpg')
dog_raw = cv2.imread('../data/dog.jpg')  # default: bgr channel order
assert aerial is not None, "could not read '../data/rareplanes6.jpg'"
assert dog_raw is not None, "could not read '../data/dog.jpg'"
print(f"dog_raw shape (H x W x Channels) or (Rows x Cols x Channels): {dog_raw.shape}")

In [None]:
# Dump the raw pixel array; each innermost triple is one pixel in B, G, R order
print(dog_raw)

In [None]:
# Display the aerial image exactly as loaded (still in OpenCV's BGR channel order)
plt.imshow(aerial)
plt.show()

In [None]:
# Display the dog image exactly as loaded (still in OpenCV's BGR channel order)
plt.imshow(dog_raw)
plt.show()

It looks a little...blue?

In [None]:
# cv2.imread() defaults to BGR channel order
# Use OpenCV to convert BGR -> RGB before handing the array to matplotlib
plt.imshow(cv2.cvtColor(dog_raw, cv2.COLOR_BGR2RGB))
plt.show()

In [None]:
# Convert channels a different way: reverse the last axis with NumPy slicing
dog = dog_raw[..., ::-1]                  # reverses order of last dim of array bgr -> rgb
# NOTE(review): `aerial` is overwritten with its own channel-reversed view, so
# re-running this cell flips it back to BGR. Run it once per kernel session.
aerial = aerial[..., ::-1]
plt.imshow(dog)
plt.show()

Manipulating the image is achieved by manipulating the array!

In [None]:
# cropping the image is as simple as indexing the array:
# first 100 rows, first 100 columns, all channels
cropped = dog[:100,:100,:]
plt.imshow(cropped)
plt.show()

In [None]:
# Create your own crop
# full dims = 576 x 768 (rows x cols); row 0 is the TOP edge of the image
# Replace the example bounds below with your own (TOP < BOTTOM, LEFT < RIGHT).
TOP = 100
BOTTOM = 400
LEFT = 150
RIGHT = 600
# axis 0 selects rows (top..bottom), axis 1 selects columns (left..right),
# axis 2 keeps every colour channel
cropped2 = dog[TOP:BOTTOM, LEFT:RIGHT, :]
plt.imshow(cropped2)
plt.show()

In [None]:
# edit color channels
# Channel order: RGB
no_green = dog.copy()       # work on a copy so `dog` itself is left untouched
no_green[:,:,1] = 0         # index 1 on the last axis is the green channel

plt.imshow(no_green)
plt.show()

In [None]:
# dimming is a matter of division!
# true division produces floats, so cast back to integers for display
dimmed = (dog.copy() / 2).astype(int)

plt.imshow(dimmed)
plt.show()

In [None]:
# inspect the first 20 elements of the first row of each image's Green channel
# (dimmed values first, then the originals they were derived from)
print(dimmed[0,:20,1])
print(dog[0,:20,1])

In [None]:
# can you think of your own image transformation, and then implement it?
custom = dog.copy()

# suggestions
# set some subset of pixels to 0
# add/subtract some constant from all or some pixels
#
# Replace the example below with your own transformation.
custom[:50, :, :] = 0     # example: black out the top 50 rows

plt.imshow(custom)
plt.show()

In [None]:
# What if we multiply?
# Same recipe as dimming, but every value is multiplied by 2 instead
doubled = (dog.copy() * 2).astype(int)

plt.imshow(doubled)
plt.show()

In [None]:
# huh? Some are brighter but some are dimmer
# inspect the first 15 elements of the first row of each image's Green channel
print(dog[0,:15,1])
print(doubled[0,:15,1])


In [None]:
# inspect elements 200-214 (15 values) of the first row of each image's Green channel
print(dog[0,200:215,1])
print(doubled[0,200:215,1])

In [None]:
# Reproduce the effect by hand on a few sample values:
# compare true doubling with doubling followed by "modulo 256"
original = np.array([188, 204, 240, 236, 224, 224, 218, 203, 186, 231])
doubled_for_real = original * 2
double_modulo = (original * 2) % 256
print(original)
print(doubled_for_real)
print(double_modulo)

8-bit RGB can't represent values over 255, so instead we're seeing (`dog` * 2) modulo 256   
How does the computer know to do this automatically??

In [None]:
# Compare the element type of the image array with that of the hand-made array
print(dog.dtype)
print(original.dtype)

The answer is the data type! The datatype of `dog` is 'uint8', which stands for "unsigned 8-bit integer". When we initially read this image into the variable `dog_raw` with the code `dog_raw = cv2.imread('../data/dog.jpg')`, it was automatically encoded as 'uint8'.  This data type represents integers with 8 binary digits (bits). It ranges from 00000000 to 11111111 (which in decimal is 255). In other words, this data type is only expressive enough to represent integers in the range [0,255]. Furthermore, if an operation results in a value greater than 255, then uint8 will "wrap around" using modulo arithmetic, as we observed.

Computer vision models require their inputs to be in very specific formats. Often it is necessary to modify an image to meet these input specifications. For example, the model we'll be using today (tiny-yolov3) requires its input image array values all to be scaled to the range [0,1] and to have the dimensions (1x3x416x416).

https://github.com/onnx/models/tree/main/validated/vision/object_detection_segmentation/tiny-yolov3

In [None]:
# letterbox procedure
def letterbox(src, dest_shape):
    """Resize an image to fit ``dest_shape`` without changing its aspect ratio.

    The source is scaled by the smaller of the width/height ratios and pasted
    into the centre of a gray canvas; the unused border is the "letterbox".

    Args:
        src: image array indexed as (rows, cols, channels).
        dest_shape: tuple (height, width, channels) of the letterboxed result.

    Returns:
        Tuple ``(dest, rsz_origin, rsz_dims)`` where
        dest       - uint8 letterboxed image array of shape ``dest_shape``,
        rsz_origin - ``(dx, dy)`` offset of the resized image inside ``dest``,
        rsz_dims   - ``(width, height)`` of the resized image.
    """
    # img.shape is (rows, cols, chan): rows => height; cols => width
    src_height, src_width = src.shape[0], src.shape[1]

    # gray canvas (128 in every channel); if a 4th channel is requested it is
    # filled with 255
    dest = np.full(dest_shape, 128, dtype=np.uint8)
    if dest_shape[2] > 3:
        dest[:, :, 3] = 255
    dest_height, dest_width = dest.shape[0], dest.shape[1]

    # NOTE: ratios are float values
    width_ratio = dest_width / src_width
    height_ratio = dest_height / src_height

    # The smaller ratio is the limiting one: that dimension fills the canvas
    # and the other is scaled by the same factor (integer truncation). The
    # max(1, ...) keeps extremely elongated inputs from truncating to 0 pixels.
    rsz_width = dest_width
    rsz_height = dest_height
    if width_ratio < height_ratio:
        rsz_height = max(1, int(src_height * width_ratio))
    else:
        rsz_width = max(1, int(src_width * height_ratio))

    # Resize with bi-linear interpolation. `interpolation` must be passed by
    # keyword: the positional slots after dsize are dst, fx and fy.
    rsz_dims = (rsz_width, rsz_height)
    rsz = cv2.resize(src, rsz_dims, interpolation=cv2.INTER_LINEAR)

    # embed rsz into the center of dest
    dx = (dest_width - rsz_width) // 2
    dy = (dest_height - rsz_height) // 2
    dest[dy:dy+rsz_height, dx:dx+rsz_width, :] = rsz
    rsz_origin = (dx, dy)

    # letterboxing complete, return dest
    return (dest, rsz_origin, rsz_dims)

In [None]:
# letterbox the image to resize for NN input (size: (height, width, chan))
# letterbox() returns (image, origin, dims); [0] keeps only the image
letterboxed_dog = letterbox(dog, (416, 416, 3))[0]
letterboxed_aerial = letterbox(aerial, (416,416,3))[0]
plt.imshow(letterboxed_dog)
plt.show()

In [None]:
# Echo the (RGB, integer-valued) dog array for comparison with the packed buffer below
dog

In [None]:
# pack_buffer procedure, ONNX model expects normalized float32 NCHW tensor
def pack_buffer(src):
    """Pack an (H, W, C) image into a normalized float32 (1, C, H, W) tensor.

    The input is copied (never modified), any channels beyond the first three
    (e.g. alpha) are dropped, values are divided by 255, and the axes are
    reordered to batch-first / channel-first layout. The channel ORDER is left
    as given (no BGR <-> RGB swap).
    """
    pixels = np.array(src, dtype=np.float32)        # float32 copy of the input
    if pixels.shape[2] > 3:
        pixels = pixels[:, :, :3]                   # drop alpha
    scaled = pixels / 255.0                         # normalize vals
    channel_first = scaled.transpose(2, 0, 1)       # HWC -> CHW
    return channel_first[np.newaxis, ...]           # CHW -> NCHW with N == 1

In [None]:
# Shape before packing: (rows, cols, channels)
dog.shape

In [None]:
# Shape after packing: a batch axis is added and channels come before rows/cols
buffered_dog = pack_buffer(dog)
buffered_dog.shape

In [None]:
# Note that the values are no longer integers (pack_buffer divided them by 255)
buffered_dog

In [None]:
# Element type of the packed buffer
buffered_dog.dtype

"Float" or floating-point numbers are used to represent real numbers (as opposed to integers--think: fractions and decimals). "32" refers to the number of bits allocated to store each floating-point number. 32 bits = 4 bytes.  
Representing floating point numbers with a fixed number of bits entails a tradeoff between the range of values you can represent and their precision.  
TODO: briefly talk about how floating point numbers are stored (sign, exponent, significand/mantissa)

### Exercise 3: PTNNS and Convolutional Neural Nets

#### Convolution

In [None]:
from PIL import Image
from scipy import signal

In [None]:
# Demonstration of manual convolution
print("\nManual Convolution Demonstration:")
# A tiny 3x3 single-channel "image"
sample_image = np.array([[50, 60, 70],
                         [80, 90, 100],
                         [110, 120, 130]])

# 3x3 kernel: the four direct neighbours get weight 1, the centre gets -4
sample_kernel = np.array([[0, 1, 0],
                          [1, -4, 1],
                          [0, 1, 0]])

print("Sample Image:")
print(sample_image)
print("\nSample Kernel:")
print(sample_kernel)

In [None]:
# Perform manual convolution for the center pixel:
# multiply each pixel by the kernel weight at the same position and add up
# (the four corner pixels have weight 0 and are left out)
result = (60 * 1 + 80 * 1 + 100 * 1 + 90 * -4 + 120 * 1)
print(f"\nManual calculation for center pixel: {result}")

In [None]:
# Perform convolution using scipy
# mode='same' keeps the output the same size as the input;
# boundary='symm' mirrors the image at its edges to supply the missing neighbours
convolved = signal.convolve2d(sample_image, sample_kernel, mode='same', boundary='symm')
print("\nFull convolution result:")
print(convolved.astype(int))

#### Types of kernels used in image processing
1. Identity: doesn't change the image
2. Blur: averages nearby pixels
3. Sharpen: enhances edges by increasing contrast with neighboring pixels
4. Edge Detection: highlights edges in the image

In [None]:
def apply_kernel(image, kernel):
    """Convolve every channel of an image with a 2-D kernel.

    Args:
        image: array indexed as (rows, cols, channels).
        kernel: 2-D array of kernel weights.

    Returns:
        uint8 array with the same shape as ``image``.
    """
    # Apply the kernel to each color channel independently; 'same' keeps the
    # spatial size and 'symm' mirrors the image at its borders.
    channels = [
        signal.convolve2d(image[:, :, c], kernel, mode='same', boundary='symm')
        for c in range(image.shape[2])
    ]

    # Stack the channels back together. Kernels with negative weights (sharpen,
    # edge detection) produce values below 0 or above 255; clip them so the
    # uint8 cast saturates instead of wrapping around (e.g. -1 -> 255).
    stacked = np.stack(channels, axis=2)
    return np.clip(stacked, 0, 255).astype(np.uint8)

In [None]:
def show_kernel_effect(image, kernel, kernel_name):
    """Plot an image next to its convolved version, plus a 5x5 pixel close-up.

    Top row: the original and the convolved image. Bottom row: the red-channel
    values of rows/cols 100-104 of each, drawn as a grayscale grid with the
    numbers written in the cells. The kernel itself is printed afterwards.
    """
    filtered = apply_kernel(image, kernel)

    fig, axs = plt.subplots(2, 2, figsize=(12, 10))
    fig.suptitle(f'Effect of {kernel_name} Kernel', fontsize=16)

    # Top row: full images
    full_panels = (
        (axs[0, 0], image, 'Original Image'),
        (axs[0, 1], filtered, 'Convolved Image'),
    )
    for ax, picture, title in full_panels:
        ax.imshow(picture)
        ax.set_title(title)
        ax.axis('off')

    # Bottom row: the same small red-channel patch of each image, annotated
    patch = (slice(100, 105), slice(100, 105), 0)
    value_panels = (
        (axs[1, 0], image[patch], 'Original Pixel Values (5x5 region)'),
        (axs[1, 1], filtered[patch], 'Convolved Pixel Values (5x5 region)'),
    )
    for ax, values, title in value_panels:
        ax.imshow(values, cmap='gray', vmin=0, vmax=255)
        ax.set_title(title)
        for (row, col), value in np.ndenumerate(values):
            ax.text(col, row, int(value), ha='center', va='center')

    plt.tight_layout()
    plt.show()

    # Print the kernel
    print(f"\n{kernel_name} Kernel:")
    print(kernel)

In [None]:
# Image used for the kernel demos (any (rows, cols, 3) image array works here)
image = dog

# Convolution kernels keyed by display name
kernels = {
    # centre weight 1, everything else 0
    'Identity': np.array([
        [0, 0, 0],
        [0, 1, 0],
        [0, 0, 0],
    ]),

    # 3x3 box filter: nine equal weights of 1/9
    'Average Blur': np.ones((3, 3)) / 9,

    # 5x5 kernel: outer product of [1, 4, 6, 4, 1] with itself, divided by 256
    'Gaussian Blur': np.outer([1, 4, 6, 4, 1], [1, 4, 6, 4, 1]) / 256,

    # centre 5, direct neighbours -1
    'Sharpen': np.array([
        [ 0, -1,  0],
        [-1,  5, -1],
        [ 0, -1,  0],
    ]),

    # centre 4, direct neighbours -1
    'Edge Detection': np.array([
        [ 0, -1,  0],
        [-1,  4, -1],
        [ 0, -1,  0],
    ]),
}

In [None]:
# Apply each kernel and show detailed results (one figure per kernel)
for name, kernel in kernels.items():
    show_kernel_effect(image, kernel, name)

The edge detection here is pretty bad! There is lots of noise in the photo, which makes detecting true edges difficult. How might we resolve this?

In [None]:
# Blur first, then run edge detection on the blurred image
show_kernel_effect(apply_kernel(dog, kernels['Gaussian Blur']), kernels['Edge Detection'], "Blur + Edge Detection")

In [None]:
# Print a kernel to understand its structure (rows of weights)
print("Structure of the Sharpen kernel:")
print(kernels['Sharpen'])

In [None]:
# Create your own kernel

# Edit this kernel (as written it is the identity kernel, which leaves the image unchanged)
custom_kernel = np.array([[0, 0, 0],
                          [0, 1, 0],
                          [0, 0, 0]])

# register it alongside the built-in kernels under the name 'Custom'
kernels['Custom'] = custom_kernel

show_kernel_effect(dog, kernels['Custom'], "Custom Kernel")

#### Dimension change with Stride and Pooling Layers

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def apply_convolution(input_array, kernel, stride):
    """Slide a kernel over a 2-D array with the given stride (no padding).

    Each output value is the sum of the element-wise product of the kernel and
    the input window under it. The kernel is applied as given (it is not
    flipped) and may be rectangular.

    Args:
        input_array: 2-D array.
        kernel: 2-D array of weights, no larger than ``input_array``.
        stride: step between consecutive windows, in both directions.

    Returns:
        2-D float array of shape
        ``((H - kh) // stride + 1, (W - kw) // stride + 1)``.
    """
    input_height, input_width = input_array.shape
    kernel_height, kernel_width = kernel.shape

    output_height = (input_height - kernel_height) // stride + 1
    output_width = (input_width - kernel_width) // stride + 1

    output = np.zeros((output_height, output_width))

    for out_i in range(output_height):
        top = out_i * stride
        for out_j in range(output_width):
            left = out_j * stride
            window = input_array[top:top + kernel_height, left:left + kernel_width]
            output[out_i, out_j] = np.sum(window * kernel)

    return output

def apply_pooling(input_array, pool_size, stride, mode='max'):
    """Pool a 2-D array with a square window (no padding).

    Args:
        input_array: 2-D array.
        pool_size: side length of the square pooling window.
        stride: step between consecutive windows, in both directions.
        mode: 'max' (largest value in the window) or 'average' (window mean).

    Returns:
        2-D float array of shape
        ``((H - pool_size) // stride + 1, (W - pool_size) // stride + 1)``.

    Raises:
        ValueError: if ``mode`` is not 'max' or 'average'.
    """
    reducers = {'max': np.max, 'average': np.mean}
    if mode not in reducers:
        # previously an unknown mode silently produced an all-zero output
        raise ValueError(f"unknown pooling mode {mode!r}; expected 'max' or 'average'")
    reduce_window = reducers[mode]

    input_height, input_width = input_array.shape

    output_height = (input_height - pool_size) // stride + 1
    output_width = (input_width - pool_size) // stride + 1

    output = np.zeros((output_height, output_width))

    for out_i in range(output_height):
        top = out_i * stride
        for out_j in range(output_width):
            left = out_j * stride
            window = input_array[top:top + pool_size, left:left + pool_size]
            output[out_i, out_j] = reduce_window(window)

    return output

In [None]:
# Create a sample input array
input_array = np.array([
    [1, 1, 1, 0, 0],
    [0, 1, 1, 1, 0],
    [0, 0, 1, 1, 1],
    [0, 0, 1, 1, 0],
    [0, 1, 1, 0, 0]
])

# Alternative kernel to try: a 3x3 average (blur) kernel
# kernel = np.array([[1, 1, 1],
#                       [1, 1, 1],
#                       [1, 1, 1]]) / 9

# Identity kernel: each output value is the centre element of its 3x3 window
kernel = np.array([[0,0,0],
                   [0,1,0],
                   [0,0,0]])
# Demonstrate convolution with different strides
strides = [1, 2]
fig, axs = plt.subplots(1, len(strides) + 1, figsize=(15, 5))
fig.suptitle('Convolution with Different Strides', fontsize=16)

# first panel: the unmodified input
axs[0].imshow(input_array, cmap='gray')
axs[0].set_title('Input Array')
axs[0].axis('off')

# remaining panels: one convolution result per stride
for i, stride in enumerate(strides, 1):
    output = apply_convolution(input_array, kernel, stride)
    axs[i].imshow(output, cmap='gray')
    axs[i].set_title(f'Stride = {stride}\nOutput Shape: {output.shape}')
    axs[i].axis('off')

plt.tight_layout()
plt.show()

# Demonstrate pooling
pool_size = 2
strides = [1, 2]
pooling_modes = ['max', 'average']

# one row per pooling mode; column 0 is the input, then one column per stride
fig, axs = plt.subplots(len(pooling_modes), len(strides) + 1, figsize=(15, 10))
fig.suptitle('Pooling with Different Strides and Modes', fontsize=16)

for i, mode in enumerate(pooling_modes):
    axs[i, 0].imshow(input_array, cmap='gray')
    axs[i, 0].set_title('Input Array')
    axs[i, 0].axis('off')
    
    for j, stride in enumerate(strides, 1):
        output = apply_pooling(input_array, pool_size, stride, mode)
        axs[i, j].imshow(output, cmap='gray')
        axs[i, j].set_title(f'{mode.capitalize()} Pooling\nStride = {stride}\nOutput Shape: {output.shape}')
        axs[i, j].axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Print numerical values for better understanding
# (same input and kernel as the figures above)
print("Input Array:")
print(input_array)

print("\nConvolution Output (Stride = 1):")
print(apply_convolution(input_array, kernel, 1))

print("\nConvolution Output (Stride = 2):")
print(apply_convolution(input_array, kernel, 2))

print("\nMax Pooling Output (2x2, Stride = 1):")
print(apply_pooling(input_array, 2, 1, 'max'))

print("\nMax Pooling Output (2x2, Stride = 2):")
print(apply_pooling(input_array, 2, 2, 'max'))

print("\nAverage Pooling Output (2x2, Stride = 2):")
print(apply_pooling(input_array, 2, 2, 'average'))

#### Pre-Trained Neural Networks (PTNNs)
    a.	PTNNs have architecture and trained weights.  
    b.	Getting trained Tiny YOLOv3 from ONNX model zoo  
    c.	Consider NETRON model viewer (https://github.com/lutzroeder/netron)  
    d.	onnx2torch module  
    e.	Loading ONNX model into pytorch  
    f.	Run on a test image look at output: bbox center and extent, objectness, classifications.  


In [None]:
import onnx
import onnxruntime as ort

In [None]:
# Read the model classes
def read_model_classes(pathname = '../model/coco.names'):
    """Read class names from a text file, one name per line.

    Surrounding whitespace is stripped from every name. Reading stops at the
    first blank line or at the end of the file, whichever comes first.

    Args:
        pathname: path of the class-names file.

    Returns:
        List of class-name strings in file order.
    """
    classes = []
    # `with` guarantees the file is closed even if reading raises
    with open(pathname, 'r') as file:
        for line in file:
            class_name = line.strip()
            if not class_name:
                break
            classes.append(class_name)
    return classes

def run_inference(model, image_array):
    """Pack an image with pack_buffer and run it through the model.

    Args:
        model: object exposing ``run(output_names, input_feed)``.
        image_array: image array accepted by ``pack_buffer``.

    Returns:
        Whatever ``model.run`` returns when asked for all outputs (``None``).
    """
    # The packed image is the only input fed to the model, under 'input_1'.
    feeds = {'input_1': pack_buffer(image_array)}
    return model.run(None, feeds)

In [None]:
# Load the class names (default path) and list them; a class's position in
# this list is the index later used to look up its label
coco_names = read_model_classes()
for item in coco_names:
    print(item)

In [None]:
# Construct the ONNX Runtime session for the (modified) Tiny YOLOv3 model
model   = ort.InferenceSession('../model/modified_yolov3-tiny.onnx')
# Alternative: the unmodified model file
# model = ort.InferenceSession('../model/yolov3-tiny.onnx')

In [None]:
# Run the model on both letterboxed (416x416) images
dog_results1 = run_inference(model, letterboxed_dog)
aerial_results1 = run_inference(model, letterboxed_aerial)

In [None]:
# Let's take a look at our results! (raw model output for the dog image)
dog_results1

In [None]:
# How many outputs did we get, and what shape is each one?
print(len(dog_results1))     
print(dog_results1[0].shape)
print(dog_results1[1].shape)

In [None]:
# Raw model output for the dog image, shown again now that we know its structure
dog_results1

### Exercise 5: Bounding Boxes for Object Detection

In [None]:
# draw_annos procedure (fixed ONNX anno scaling in unscale_annos proc)

# don't need to understand this code in detail but i think its important you see it
def draw_annos(src, annos, classes):
    """Return a copy of ``src`` with a labelled box drawn for every annotation.

    Args:
        src: image array to draw on (it is copied, not modified).
        annos: iterable of ``(box, score, class_index)`` where ``box`` holds
            ``[x1, y1, x2, y2]`` pixel coordinates.
        classes: sequence mapping class index -> class name.

    Returns:
        The annotated image copy.
    """
    canvas = np.copy(src)
    box_color = (0, 255, 0)      # green
    text_color = (0, 0, 0)       # black
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    stroke = 1
    for anno in annos:
        box = anno[0]
        top_left = (box[0], box[1])
        bottom_right = (box[2], box[3])
        label = f'{classes[anno[2]]}: {anno[1]:.2f}'
        (label_w, label_h), _ = cv2.getTextSize(label, font, font_scale, stroke)
        # filled label background sits directly above the box's top-left corner
        label_top_left = (top_left[0], top_left[1] - label_h)
        label_bottom_right = (top_left[0] + label_w, top_left[1])
        canvas = cv2.rectangle(canvas, top_left, bottom_right, box_color)
        canvas = cv2.rectangle(canvas, label_top_left, label_bottom_right,
                               box_color, cv2.FILLED)
        canvas = cv2.putText(canvas, label, top_left, font, font_scale,
                             text_color, stroke)
    return canvas

# unscale_annos procedure (fixes ONNX anno scaling)
def unscale_annos(annos, dw, dh, w0, h0, w1, h1):
    """Map annotation boxes from letterboxed coordinates to the original image.

    Args:
        annos: iterable of ``(box, score, class_index)``; ``box`` is stored as
            ``[y1, x1, y2, x2]`` (x and y transposed).
        dw, dh: x / y offset of the resized image inside the letterboxed one.
        w0, h0: width / height of the resized image.
        w1, h1: width / height of the original image.

    Returns:
        List of ``(box, score, class_index)`` with ``box`` an int32 array
        ``[x1, y1, x2, y2]`` in original-image pixels.
    """
    x_scale = float(w1) / float(w0)
    y_scale = float(h1) / float(h0)
    unscaled = []
    for anno in annos:
        box = anno[0]
        # undo the transposition: stored order is y1, x1, y2, x2
        x_lo, y_lo = int(box[1]), int(box[0])
        x_hi, y_hi = int(box[3]), int(box[2])
        # remove the letterbox offset, then scale up (truncating to int)
        corners = [
            int(float(x_lo - dw) * x_scale),
            int(float(y_lo - dh) * y_scale),
            int(float(x_hi - dw) * x_scale),
            int(float(y_hi - dh) * y_scale),
        ]
        unscaled.append((np.array(corners, dtype='int32'), anno[1], anno[2]))
    return unscaled

#+BEGIN_EXAMPLE

# sigmoid procedure
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x) for a scalar, returned as a float.

    Uses the algebraically equivalent form e^x / (1 + e^x) for negative inputs
    so that large-magnitude negatives cannot overflow ``math.exp`` (the naive
    form raises OverflowError for x below roughly -709).
    """
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    exp_x = math.exp(x)          # x < 0, so this is in (0, 1) and cannot overflow
    return exp_x / (1.0 + exp_x)

# (redefined) proc_results procedure
def proc_results(res, classes, pobj_thresh = 0.1, pcls_thresh = 0.5, orig_img_size = 416,
                 anchors = np.array([[[81,82], [135,169], [344,319]],
                                     [[10,14], [ 23, 27], [ 37, 58]]],
                                    dtype='int32')):
    """Decode raw YOLO block outputs into a flat list of candidate detections.

    Each output block is viewed as a grid of cells; every cell holds one
    candidate per anchor box laid out as
    ``[x, y, w, h, pobj, pcls_0, ..., pcls_{n-1}]``.

    Args:
        res: sequence with one output array per YOLO block; axes 1 and 2 are
            read as grid height and width (presumably shape
            (1, H, W, anchors * params) - confirm against the model).
        classes: sequence of class names (only its length is used).
        pobj_thresh: minimum objectness probability to keep a candidate.
        pcls_thresh: minimum class probability to emit a detection.
        orig_img_size: side length of the (square) network input image.
        anchors: int array (blocks, anchors, 2) of anchor (width, height).

    Returns:
        List of tuples ``(pobj, pcls, class_index, x1, y1, x2, y2)``; one
        candidate can yield several detections (one per passing class).
    """
    box_params = 5                                  # x, y, w, h, pobj
    class_count = len(classes)
    params_per_det = box_params + class_count
    block_count, anchor_count = anchors.shape[0], anchors.shape[1]
    assert len(res) == block_count

    dets = []
    for blk in range(block_count):
        raw = res[blk]
        grid_h, grid_w = raw.shape[1], raw.shape[2]
        cell_px = orig_img_size / grid_w            # ASSUMES square image
        grid = np.reshape(raw, (grid_h, grid_w, anchor_count, params_per_det))
        # visit rows, then cols, then anchors (last index varies fastest)
        for row, col, anchor in np.ndindex(grid_h, grid_w, anchor_count):
            cand = grid[row][col][anchor]
            pobj = sigmoid(cand[4])
            if not pobj > pobj_thresh:
                continue
            # box centre: cell index plus a sigmoid offset, scaled to pixels
            cx = cell_px * (col + sigmoid(cand[0]))
            cy = cell_px * (row + sigmoid(cand[1]))
            # box size: anchor size scaled by exp of the raw prediction
            bw = math.exp(cand[2]) * anchors[blk][anchor][0]
            bh = math.exp(cand[3]) * anchors[blk][anchor][1]
            x1, y1 = cx - (bw / 2.0), cy - (bh / 2.0)
            x2, y2 = cx + (bw / 2.0), cy + (bh / 2.0)
            for ci in range(class_count):
                pcls = sigmoid(cand[box_params + ci])
                if pcls > pcls_thresh:
                    dets.append((pobj, pcls, ci, x1, y1, x2, y2))
    return dets

# overlap procedure, find bbox overlap length along a dim
def overlap(lo1, hi1, lo2, hi2):
    """Signed overlap length of intervals [lo1, hi1] and [lo2, hi2].

    The result is positive when the intervals overlap and zero or negative
    (the size of the gap) when they do not.
    """
    return min(hi1, hi2) - max(lo1, lo2)

def iou(bbox1, bbox2):
    """Intersection-over-union of two boxes given as ``[xl, yl, xh, yh]``.

    Returns 0.0 when the boxes do not overlap, and also when the union has no
    area (e.g. two zero-area boxes), instead of dividing by zero.
    """
    area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])   # bbox1 area
    area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])   # bbox2 area
    wo = overlap(bbox1[0], bbox1[2], bbox2[0], bbox2[2])    # overlap x dim
    ho = overlap(bbox1[1], bbox1[3], bbox2[1], bbox2[3])    # overlap y dim
    i_area = (wo * ho) if (wo > 0.0 and ho > 0.0) else 0.0  # intersection area
    u_area = area1 + area2 - i_area                         # union area
    if u_area <= 0.0:
        return 0.0
    return i_area / u_area

# basic_nms procedure (non-maximum supression); det: (pobj,pcls,ci,x1,y1,x2,y2)
def basic_nms(dets, iou_thresh = 0.5):
    """Basic non-maximum suppression.

    Detections are visited from highest to lowest (tuples compared
    lexicographically, so objectness first). Each visited detection is kept,
    and every remaining detection of the same class whose IoU with it exceeds
    ``iou_thresh`` is discarded.

    Args:
        dets: list of ``(pobj, pcls, ci, x1, y1, x2, y2)`` tuples. The list is
            not modified.
        iou_thresh: IoU above which a same-class detection is suppressed.

    Returns:
        List of kept detections, highest first.
    """
    remaining = sorted(dets, reverse=True)      # sorted copy; caller's list untouched
    filtered_dets = []
    while remaining:                            # any remaining dets to check?
        c = remaining[0]                        # current best det
        filtered_dets.append(c)
        # Drop `c` itself explicitly (slice from 1) rather than relying on it
        # suppressing itself: with iou_thresh >= 1 or a degenerate box its
        # self-IoU is not > iou_thresh and the loop would never terminate.
        remaining = [
            d for d in remaining[1:]
            if not (c[2] == d[2] and iou(c[3:], d[3:]) > iou_thresh)
        ]
    return filtered_dets

# make_annos procedure
def make_annos(dets):
    """Convert detections into ``(box, score, class_index)`` annotations.

    Each detection ``(pobj, pcls, ci, x1, y1, x2, y2)`` becomes
    ``([y1, x1, y2, x2], pobj * pcls, ci)``. The box is deliberately stored
    with x and y transposed (replicating the ONNX output convention).
    """
    return [
        ([y1, x1, y2, x2], pobj * pcls, ci)
        for pobj, pcls, ci, x1, y1, x2, y2, *_ in dets
    ]

In [None]:
# original image size (rows, cols, channels); c1 is not used below
(h1, w1, c1) = dog.shape
# letterbox again, this time keeping the padding offset (dw, dh) and resized dims (w0, h0)
(letterboxed_dog, (dw, dh), (w0, h0)) = letterbox(dog, (416, 416, 3))
# decode raw model output with low objectness / class thresholds (0.04 each)
dog_dets1 = proc_results(dog_results1, coco_names,0.04, 0.04)
# NMS is intentionally skipped for now
#filtered_dets = basic_nms(dog_dets1, 0.5)
dog_annos1 = make_annos(dog_dets1)

# unscale annotations to draw in original image frame
dog_unscaled1 = unscale_annos(dog_annos1, dw, dh, w0, h0, w1, h1)

# draw list of annotations on the original image as loaded (`dog_raw`, BGR order)
dog_annotated1 = draw_annos(dog_raw, dog_unscaled1, coco_names)


In [None]:
# annotations were drawn on the BGR image, so reverse the channels for display
plt.imshow(dog_annotated1[..., ::-1]  )
plt.show()

In [None]:
# number of annotations drawn above
print(f"There are {len(dog_unscaled1)} detections!")

In [None]:
def inference_wrapper(img, labels, pobj_thresh=0, pcls_thresh=0,iou_thresh=0,
                      NMS=False):
    """Run the full detection pipeline on one image and return it annotated.

    Steps: letterbox to 416x416x3, run the notebook-level ``model``, decode the
    output, optionally apply NMS, map boxes back to the original image, and
    draw them. Prints the number of detections (and a notice when NMS runs).

    Args:
        img: image array indexed (rows, cols, channels).
        labels: sequence of class names.
        pobj_thresh: objectness threshold passed to ``proc_results``.
        pcls_thresh: class-probability threshold passed to ``proc_results``.
        iou_thresh: IoU threshold passed to ``basic_nms`` (only used with NMS).
        NMS: when True, filter detections with ``basic_nms`` before drawing.

    Returns:
        Annotated copy of ``img``.
    """
    img_h, img_w, _ = img.shape
    boxed, (pad_x, pad_y), (boxed_w, boxed_h) = letterbox(img, (416, 416, 3))

    raw_output = run_inference(model, boxed)
    dets = proc_results(raw_output, labels, pobj_thresh, pcls_thresh)
    if NMS:
        print("Applying NMS")
        dets = basic_nms(dets, iou_thresh)

    annos = unscale_annos(make_annos(dets), pad_x, pad_y, boxed_w, boxed_h, img_w, img_h)
    print(f"There are {len(annos)} detections!")
    return draw_annos(img, annos, labels)

In [None]:
# Aerial image, thresholds of 0.05 for objectness and class probability, no NMS
annotated_aerial = inference_wrapper(aerial, coco_names, .05, .05)
plt.imshow(annotated_aerial)
plt.show()

### Exercise 6: Post-processing YOLO Results with Non-Max Suppression (NMS)

In [None]:
# We could have done even worse:
# with both thresholds at 0, every candidate/class combination is reported
dets = proc_results(dog_results1, coco_names, pobj_thresh=0, pcls_thresh=0)
print(f"When we set the thresholds to 0, there are {len(dets)} detections!")

In [None]:
# Why?
# two output grids (13 x 13 cells and 26 x 26 cells),
# each cell with 3 anchor boxes * 80 classes
(13 * 13 * 3 * 80) + (26 * 26 * 3 * 80)

Can we do better on images by playing around with the various post-processing thresholds we control?

In [None]:
# let's set new thresholds and add NMS to try to reduce the number of detections for the dog

OBJ_THRESH = 0.05       # minimum objectness probability
CLASS_THRESH = 0.05     # minimum class probability
IOU_THRESH = 0.9        # same-class boxes overlapping more than this are suppressed

better_dog = inference_wrapper(dog, coco_names, pobj_thresh=OBJ_THRESH,
                               pcls_thresh=CLASS_THRESH, iou_thresh=IOU_THRESH, NMS=True)
plt.imshow(better_dog)
plt.show()

What about the aerial image?

In [None]:
# and the aerial image? (lower probability thresholds, stricter IoU threshold)

OBJ_THRESH = 0.01       # minimum objectness probability
CLASS_THRESH = 0.01     # minimum class probability
IOU_THRESH = 0.7        # same-class boxes overlapping more than this are suppressed

better_aerial = inference_wrapper(aerial, coco_names, pobj_thresh=OBJ_THRESH,
                               pcls_thresh=CLASS_THRESH, iou_thresh=IOU_THRESH, NMS=True)
plt.imshow(better_aerial)
plt.show()

Let's try doing detection on other images!

In [None]:
# load in a new image first
# NOTE(review): `arr1` is also bound to the raw BGR array here but is not read
# anywhere else in this notebook; it looks like a leftover name.
kite = arr1 = cv2.imread('../data/kite.jpg')
kite = kite[..., ::-1] # BGR --> RGB
plt.imshow(kite)
plt.show()

In [None]:
# Same thresholds as for the aerial image, applied to the kite image
OBJ_THRESH = 0.01
CLASS_THRESH = 0.01
IOU_THRESH = 0.7

annotated_kite = inference_wrapper(kite, coco_names, pobj_thresh=OBJ_THRESH,
                               pcls_thresh=CLASS_THRESH, iou_thresh=IOU_THRESH, NMS=True)
plt.imshow(annotated_kite)
plt.show()

In [None]:
# The `data` subdirectory contains the following images:
#   StopSign_Blue.jpg       horses.jpg              rareplanes4.jpg
#   StopSign_Green.jpg      kite.jpg                rareplanes5.jpg
#   StopSign_Red.jpg        rareplanes1.jpg         rareplanes6.jpg
#   StopSign_Yellow.jpg     rareplanes10.jpg        rareplanes7.jpg
#   dog.jpg                 rareplanes2.jpg         rareplanes8.jpg
#   eagle.jpg               rareplanes3.jpg         rareplanes9.jpg

Try loading in and doing inference on these!