In [1]:
import yolov2
from draw_utils import DrawBoundingBoxes

import nnabla as nn
import nnabla.functions as F

import time
import numpy as np
from nnabla.utils.image_utils import imread, imresize, imsave

import clip

2021-09-14 08:49:11,205 [nnabla][INFO]: Initializing CPU extension...


In [2]:
width = 608
weights = 'yolov2.h5'
classes = 80
class_names = 'coco.names'
thresh = .22
nms = .45
nms_per_class = True
num_anchors = 5
anchors = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828]
anchors = np.array(anchors).reshape(-1, 2)

args_input = 'dog.jpg'


context = 'cudnn'
dev_id = '0'

In [3]:
text = 'a person looking down'


with nn.parameter_scope('clip'):
    clip.load('data/ViT-cpu.h5')

In [4]:
def _normalize(img, mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711), max_pixel_value=255.0):
    mean = np.array(mean, dtype=np.float32)
    mean = nn.Variable.from_numpy_array(mean)
    mean *= max_pixel_value

    std = np.array(std, dtype=np.float32)
    std = nn.Variable.from_numpy_array(std)
    std *= max_pixel_value

    denominator = F.r_div_scalar(std)

    mean = F.reshape(mean, (3, 1, 1))
    denominator = F.reshape(denominator, (3, 1, 1))
    
    img -= mean
    img *= denominator
    return img

def clip_logits(image, text):
    with nn.parameter_scope('clip'):
        with nn.auto_forward():
            h, w, c = image.shape

            # big box
            box = np.zeros((224, 224, 3), dtype=np.uint8)
            
            if h > w:
                res_w = 224 * w // h
                image = imresize(image, (res_w, 224), interpolate='bicubic')
                s = int(round(224 - res_w) / 2.)
                box[:, s:s+res_w, :] = image

            else:
                res_h = 224 * h // w
                image = imresize(image, (224, res_h), interpolate='bicubic')
                s = int(round(224 - res_h) / 2.)
                box[s:s+res_h, :, :] = image
            
            image = box
            # ceter crop
#             if h < w:
#                 res_w = 224 * w // h
#                 crop_left = int(round((res_w - 224) / 2.))
#                 image = imresize(image, (res_w, 224), interpolate='bicubic')
#                 image = image[:, crop_left:crop_left+224, :]

#             else:
#                 res_h = 224 * h // w
#                 crop_top = int(round((res_h - 224) / 2.))
#                 image = imresize(image, (224, res_h), interpolate='bicubic')
#                 image = image[crop_top:crop_top+224, :, :]

            
            image = image.transpose(2, 1, 0)
            
            
            image = nn.Variable.from_numpy_array(image)
            
    
            image = _normalize(image)

            text = clip.tokenize(text)

            img_logits, _ = clip.logits(image, text)
            similarity = img_logits / 100
    
    return similarity


In [5]:
names = np.genfromtxt(class_names, dtype=str, delimiter='?')
rng = np.random.RandomState(1223)
colors = rng.randint(0, 256, (classes, 3)).astype(np.uint8)
colors = [tuple(c.tolist()) for c in colors]


# Set context
from nnabla.ext_utils import get_extension_context
ctx = get_extension_context(
    context, device_id=dev_id, type_config='float')
nn.set_default_context(ctx)

_ = nn.load_parameters(weights)

2021-09-14 08:49:13,069 [nnabla][INFO]: Initializing CUDA extension...
2021-09-14 08:49:13,082 [nnabla][INFO]: Initializing cuDNN extension...


In [6]:
def draw_bounding_boxes(img, bboxes, im_w, im_h, names, colors, sub_w, sub_h, thresh, text):
    draw = DrawBoundingBoxes(img, colors)
    for bb in bboxes:
        if bb[4] <= 0:
            continue
        # x, y, w, h = bb[:4] * np.array([im_w, im_h, im_w, im_h])
        x, y, w, h = bb[:4]
        x = (x - (1 - sub_w) / 2.) / sub_w * im_w
        y = (y - (1 - sub_h) / 2.) / sub_h * im_h
        w = w * im_w / sub_w
        h = h * im_h / sub_h
        dw = w / 2.
        dh = h / 2.
        x0 = int(np.clip(x - dw, 0, im_w))
        y0 = int(np.clip(y - dh, 0, im_h))
        x1 = int(np.clip(x + dw, 0, im_w))
        y1 = int(np.clip(y + dh, 0, im_h))
        

        # prob check
        det_ind = np.where(bb[5:] > 0.1)[0]
        if len(det_ind) == 0:
            continue
        
#         print('PASS 1')
        
        cand = img[y0:y1, x0:x1, :]
        
        print(x0, x1, y0, y1)

        prob = clip_logits(cand, text)
        
        print(f'prob: {prob.d[0, 0]}')
        
#         if prob.d[0, 0] <= thresh:
#             continue
            
#         print('PASS 2')
            
        label = ''
        draw.draw((x0, y0, x1, y1), 0, label)
        
#         # prob check
#         det_ind = np.where(bb[5:] > thresh)[0]
#         if len(det_ind) == 0:
#             continue
#         prob = bb[5 + det_ind]
        # Object detection with deep learning and OpenCV
        # https://goo.gl/q4RdcZ
#         label = ', '.join("{}: {:.2f}%".format(
#             names[det_ind[j]], prob[j] * 100) for j in range(len(det_ind)))
#         print("[INFO] {}".format(label))
#         draw.draw((x0, y0, x1, y1), det_ind[0], label)
    return draw.get()

In [7]:
# Build a YOLO v2 network
feature_dict = {}
x = nn.Variable((1, 3, width, width))
y = yolov2.yolov2(x, num_anchors, classes,
                    test=True, feature_dict=feature_dict)
y = yolov2.yolov2_activate(y, num_anchors, anchors)
y = F.nms_detection2d(y, thresh, nms, nms_per_class)

In [8]:
# Read image
img_orig = imread(args_input, num_channels=3)
im_h, im_w, _ = img_orig.shape
# letterbox
w = width
h = width



if (w * 1.0 / im_w) < (h * 1. / im_h):
    new_w = w
    new_h = int((im_h * w) / im_w)
else:
    new_h = h
    new_w = int((im_w * h) / im_h)

patch = imresize(img_orig, (new_w, new_h)) / 255.
img = np.ones((h, w, 3), np.float32) * 0.5
# resize
x0 = int((w - new_w) / 2)
y0 = int((h - new_h) / 2)
img[y0:y0 + new_h, x0:x0 + new_w] = patch

# Execute YOLO v2
print("forward")
in_img = img.transpose(2, 0, 1).reshape(1, 3, width, width)
x.d = in_img
y.forward(clear_buffer=True)
print("done")

forward
done


In [9]:
bboxes = y.d[0]

In [10]:
bboxes.shape

(1805, 85)

In [11]:
img_draw = draw_bounding_boxes(
    img_orig, bboxes, im_w, im_h, names, colors, new_w * 1.0 / w, new_h * 1.0 / h, thresh, text)

132 320 232 520
prob: 0.21061177551746368
466 680 84 168
prob: 0.2581664025783539
94 589 122 449
prob: 0.21361331641674042


In [12]:
img_draw.shape

(576, 768, 3)

In [13]:
bboxes.shape

(1805, 85)

In [14]:
imsave('tada.jpg', img_draw)

In [15]:
img_orig.shape

(576, 768, 3)

In [16]:
imsave('fff.jpg', img_orig[122:449, 94:589, :])

In [17]:
bike = img_orig[122:449, 94:589, :]

In [18]:
h, w, c = bike.shape

In [19]:
box = np.zeros((224, 224, 3), dtype=np.uint8)

In [20]:
res_h = 224 * h // w

In [21]:
bike = imresize(bike, (224, res_h), interpolate='bicubic')

In [22]:
bike.shape

(147, 224, 3)

In [23]:
s = int(round(224 - res_h) / 2.)

In [24]:
box[s:s+res_h, :, :] = bike

In [25]:
imsave('fff.jpg', box)