In [None]:
# verify the library installation for Mega Detector
from utils.general import non_max_suppression, scale_coords, xyxy2xywh
from utils.augmentations import letterbox
import visualization.visualization_utils as viz_utils
import numpy as np
import ct_utils
import os
import torchvision.transforms as T
import torch
from PIL import Image
import cv2
from IPython.display import Image, display
import torch.backends.cudnn as cudnn

In [None]:
# Select the compute device used by every later cell.
# `device` is always a torch.device (on both the GPU and the CPU path) so that
# downstream code can rely on a single type.
if torch.cuda.is_available():
    # torch.device("cuda") targets the current CUDA device; with several GPUs,
    # pick one explicitly, e.g. torch.device("cuda:1").
    device = torch.device("cuda")
    print(f"PyTorch detected: {torch.cuda.get_device_name(0)} for processing.. ")
    print(f"Number of GPUs found: {torch.cuda.device_count()}")
    # Let cuDNN benchmark and pick convolution algorithms for the input sizes it sees.
    cudnn.benchmark = True
else:
    device = torch.device("cpu")

In [None]:
torch.cuda.empty_cache() # release cached CUDA memory held by PyTorch before the model is loaded

In [None]:
# All inputs and outputs are resolved relative to the notebook's working directory.
current_path = os.getcwd()

# Image to analyse and the folder that will receive the cropped result.
input_image_path = os.path.join(current_path, 'Cat_Selfie.JPG')
output_dir_path = os.path.join(current_path, 'Cat_Selfie_Crop')

# MegaDetector checkpoint file.
md_model_path = os.path.join(current_path, 'md_model', 'md_v5b.0.0.pt')

# Preview the input image inline (Image here is the IPython.display class).
display(
    Image(filename=input_image_path, width=500)
)

In [None]:
# --- MegaDetector inference settings ---
IMAGE_SIZE = 1280           # image size used in training
STRIDE = 64                 # stride passed to letterbox() when resizing
DETECTION_THRESHOLD = 0.2   # confidence threshold passed to non-maximum suppression

# Class index -> label name for MegaDetector (index 0 is 'animal').
DEFAULT_DETECTOR_LABEL_MAP = ['animal', 'person', 'vehicle']

# --- Containers that collect the detection results ---
labels, bbox, scores, tot_detections = [], [], [], []
max_conf = 0.0
final_output = {'input_file': os.path.basename(input_image_path)}

In [None]:
# Load the MegaDetector checkpoint and prepare the network for inference.
print(f'Loading a model onto the {device} for inference analysis.. ')
try:
    # SECURITY: torch.load unpickles the checkpoint, which can execute arbitrary
    # code. Only load checkpoint files obtained from a trusted source.
    # NOTE(review): newer PyTorch releases may require weights_only=False to load
    # a fully pickled model object like this one -- confirm against the installed version.
    model_checkpoint = torch.load(md_model_path, map_location=device)
except OSError as oer:
    # The notebook cannot continue without a model; report the problem and stop
    # here instead of failing later with an unrelated NameError on `model`.
    print(f'[error]: Unable to read the model file {md_model_path}: {oer}')
    raise

# The checkpoint stores the network under the 'model' key. Cast it to float32,
# fuse layers, and switch to evaluation mode.
model = model_checkpoint['model'].float().fuse().eval()
print(f'[info]: Model is loaded to the {device}')

In [None]:
import torch.nn as nn

# Reset `recompute_scale_factor` on every nn.Upsample layer of the loaded model.
# NOTE(review): presumably a compatibility workaround between the pickled
# Upsample modules in the checkpoint and the installed PyTorch version -- confirm.
for m in filter(lambda module: isinstance(module, nn.Upsample), model.modules()):
    m.recompute_scale_factor = None

In [None]:
# Read the input image and turn it into a normalized NCHW float tensor on `device`.
print(f'Pre-processing {os.path.basename(input_image_path)} file.. ')

# cv2.imread does not raise on a missing or unreadable file; it returns None.
img_data = cv2.imread(input_image_path)
if img_data is None:
    raise FileNotFoundError(f'Could not read the image file: {input_image_path}')

# Independent BGR copy kept for cropping and saving with OpenCV later on.
cpy_img_data = img_data.copy()

# OpenCV decodes images as BGR. The rest of this cell was written for an RGB
# image (the earlier loader was viz_utils.load_image, i.e. PIL), so convert the
# channel order before the pixels are handed to the model.
img_array = cv2.cvtColor(img_data, cv2.COLOR_BGR2RGB)

# Resize/pad the image to the shape expected by the MegaDetector model.
img_resize = letterbox(img_array, new_shape=IMAGE_SIZE, stride=STRIDE, auto=True)[0]
img_transpose = img_resize.transpose((2, 0, 1))  # HWC to CHW
img = np.ascontiguousarray(img_transpose)
img = torch.from_numpy(img).to(device).float()
img /= 255  # scale pixel values from 0-255 to 0-1
if len(img.shape) == 3:  # always true for now, TODO add inference using larger batch size
    img = torch.unsqueeze(img, 0)  # add the batch dimension

In [None]:
print('Mega detector look for objects in the image.. ')
# Forward pass only: gradient tracking is switched off for inference.
with torch.no_grad():
    model_output = model(img)
# Only the first element of the model output is used by the following cells.
result = model_output[0]

In [None]:
print('Apply non-maximum supression to the results of Mega Detector... ')
# Drop overlapping / low-confidence candidates from the raw model predictions.
result_nms = non_max_suppression(
    prediction=result,
    conf_thres=DETECTION_THRESHOLD,
)
# Normalization gain in (width, height, width, height) order, taken from the
# original image array; used to express boxes relative to the image size.
normalization = torch.tensor([img_array.shape[1], img_array.shape[0]] * 2)

In [None]:
# Convert the NMS output into label / score / bounding-box records.
#
# The accumulators are reset here so that re-running this cell does not append
# the same detections a second time or keep a stale max_conf.
tot_detections = []
labels = []
bbox = []
scores = []
max_conf = 0.0

for detection in result_nms:
    if not len(detection):
        continue
    # Work on a copy: the slice assignment below would otherwise overwrite the
    # tensors stored in result_nms, and a second run of this cell would rescale
    # boxes that had already been rescaled.
    detection = detection.clone()
    # Map box corners from the letterboxed tensor size back to the original image size.
    detection[:, :4] = scale_coords(img.shape[2:], detection[:, :4], img_array.shape).round()
    for *xyxy, conf, cls in reversed(detection):
        # normalized center-x, center-y, width and height
        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / normalization).view(-1).tolist()
        api_box = ct_utils.truncate_float_array(ct_utils.convert_yolo_to_xywh(xywh), precision=4)
        conf = ct_utils.truncate_float(conf.tolist(), precision=3)
        # The class index is kept 0-based here because it indexes
        # DEFAULT_DETECTOR_LABEL_MAP directly.
        cls = int(cls.tolist())
        labels.append(DEFAULT_DETECTOR_LABEL_MAP[cls])
        scores.append(conf)
        bbox.append(api_box)
        tot_detections.append({'category': cls, 'conf': conf, 'bbox': api_box})
        max_conf = max(max_conf, conf)

final_output['max_detection_conf'] = max_conf
final_output['detections'] = tot_detections
print(final_output)

In [None]:
# Pick the bounding box with the highest confidence score; it is used to crop
# the region of interest from the image.
detections = final_output['detections']
if not detections:
    # Without this guard `bboxes` would be left undefined (or keep a stale value
    # from an earlier run) and the crop step would fail or use the wrong box.
    raise ValueError(
        f"No detections above the confidence threshold for {final_output['input_file']}; nothing to crop."
    )

# max() over 'conf' avoids comparing floats for equality. reversed() makes the
# last of several equally confident detections win, as the earlier loop did.
bboxes = max(reversed(detections), key=lambda det: det['conf'])['bbox']

In [None]:
# Crop the selected bounding box out of the original image and save it.
#
# The bounding box is [x1, y1, w_box, h_box] (format defined in
# visualization_utils.py), expressed as fractions of the image width / height.
x1, y1, w_box, h_box = bboxes[0], bboxes[1], bboxes[2], bboxes[3]

# convert the bbox to corner co-ordinates (still normalized)
ymin, xmin, ymax, xmax = y1, x1, y1 + h_box, x1 + w_box

# find the height and width of the image as loaded by cv2
im_height, im_width = img_data.shape[0], img_data.shape[1]

# Scale to pixel co-ordinates and clamp to the image bounds. A negative index
# would otherwise wrap around in the slice below and yield a wrong or empty crop.
left = min(max(int(xmin * im_width), 0), im_width)
right = min(max(int(xmax * im_width), 0), im_width)
top = min(max(int(ymin * im_height), 0), im_height)
bottom = min(max(int(ymax * im_height), 0), im_height)
if right <= left or bottom <= top:
    raise ValueError(f'Bounding box {bboxes} does not cover any pixels of the image.')

# extract the region of interest (rows = y, columns = x)
roi_image = cpy_img_data[top:bottom, left:right]

# File name without its extension. splitext also copes with names that contain
# no dot or several dots; `tail` is the extension including the leading dot.
head, tail = os.path.splitext(os.path.basename(input_image_path))

# cv2.imwrite does not create missing folders, so make sure the target exists.
os.makedirs(output_dir_path, exist_ok=True)
output_image_path = os.path.join(output_dir_path, f'{head}.JPG')

# cv2.imwrite reports failure through its return value rather than an exception.
if not cv2.imwrite(output_image_path, roi_image):
    raise OSError(f'Failed to write the cropped image to {output_image_path}')