In [1]:
import torch
from torch.backends import cudnn
import cv2
import numpy as np
from Object_Detection_XenoLidar.objectdetection.efficientnet.backbone import EfficientDetBackbone
from Object_Detection_XenoLidar.objectdetection.efficientnet.efficientdet.utils import BBoxTransform, ClipBoxes
from Object_Detection_XenoLidar.objectdetection.efficientnet.utils.utils import preprocess, invert_affine, postprocess, preprocess_video

class ObjectDetector:
    def __init__(self) -> None:
        # EfficientDet configuration

        self.compound_coef = 7
        self.force_input_size = None  # set None to use default size

        self.threshold = 0.3
        self.iou_threshold = 0.3

        # Gets the GPU if there is one, otherwise the cpu
        self.use_cuda = torch.cuda.is_available()
        self.use_float16 = False
        
        cudnn.fastest = True
        cudnn.benchmark = True

        # list of object categories that can be detected by the AI object detection
        self.obj_list = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
                    'fire hydrant', '', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
                    'cow', 'elephant', 'bear', 'zebra', 'giraffe', '', 'backpack', 'umbrella', '', '', 'handbag', 'tie',
                    'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
                    'skateboard', 'surfboard', 'tennis racket', 'bottle', '', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
                    'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
                    'cake', 'chair', 'couch', 'potted plant', 'bed', '', 'dining table', '', '', 'toilet', '', 'tv',
                    'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
                    'refrigerator', '', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
                    'toothbrush']

        # list of object categories allowed to be outputted
        self.out_class_id_dict = {
            'person'     : 1,
            'bicycle'    : 2,
            'motorcycle' : 3,
            'car'        : 4,
            'truck'      : 6,
            'bus'        : 7
        }
        
        # tf bilinear interpolation is different from any other's, just make do
        self.input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
        self.input_size = self.input_sizes[self.compound_coef] if self.force_input_size is None else self.force_input_size

        # load model
        self.model = EfficientDetBackbone(compound_coef=self.compound_coef, num_classes=len(self.obj_list))
        self.model.load_state_dict(torch.load(f'aiannotator/objectdetection/efficientnet/weights/efficientdet-d{self.compound_coef}.pth'))
        self.model.requires_grad_(False)
        self.model.eval()

        if self.use_cuda:
            self.model = self.model.cuda()
        if self.use_float16:
            self.model = self.model.half()

        # Box
        self.regressBoxes = BBoxTransform()
        self.clipBoxes    = ClipBoxes()

    def Detect(self, image):

        if image.shape[2] == 4:
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
        if image.shape[2] == 1:
            rgb_image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        if image.shape[2] == 3:
            rgb_image = image
        
        ori_imgs, framed_imgs, framed_metas = preprocess_video(rgb_image, max_size=self.input_size)
        
        if self.use_cuda:
            x = torch.stack([torch.from_numpy(fi).cuda() for fi in framed_imgs], 0)
        else:
            x = torch.stack([torch.from_numpy(fi) for fi in framed_imgs], 0)

        x = x.to(torch.float32 if not self.use_float16 else torch.float16).permute(0, 3, 1, 2)

        # model predict
        with torch.no_grad():
            features, regression, classification, anchors = self.model(x)

            out = postprocess(x,
                            anchors, regression, classification,
                            self.regressBoxes, self.clipBoxes,
                            self.threshold, self.iou_threshold)

        # result
        out = invert_affine(framed_metas, out)

        # function for display
        def display(preds, imgs):
            for i in range(len(imgs)):
                if len(preds[i]['rois']) == 0:
                    return imgs[i]

                for j in range(len(preds[i]['rois'])):
                    (x1, y1, x2, y2) = preds[i]['rois'][j].astype(np.int)
                    cv2.rectangle(imgs[i], (x1, y1), (x2, y2), (255, 255, 0), 2)
                    obj = self.obj_list[preds[i]['class_ids'][j]]
                    score = float(preds[i]['scores'][j])

                    cv2.putText(imgs[i], '{}, {:.3f}'.format(obj, score),
                                (x1, y1 + 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                (255, 255, 0), 1)
                
                return imgs[i]

        def output_shapes(preds, imgs):
            """ label, points """
            shapes = []
            for i in range(len(imgs)):
                if len(preds[i]['rois']) == 0:
                    return shapes

                for j in range(len(preds[i]['rois'])):
                    (x_min, y_min, x_max, y_max) = preds[i]['rois'][j].astype(np.int)
                    points = [(x_min, y_min), (x_max, y_min), (x_max, y_max), (x_min, y_max)]
                    label = self.obj_list[preds[i]['class_ids'][j]]

                    if label in self.out_class_id_dict.keys():
                        shapes.append((label, points, None, None, True))
                
                return shapes

        # show frame by frame
        shapes = output_shapes(out, ori_imgs)

        return shapes

In [2]:
Annotator = ObjectDetector()

In [3]:
import glob
import XenoWareFormat as xw
import numpy as np
import json
import tqdm
def scale_image_base(image, ceil, floor):

    a = 255/(ceil-floor)
    b = floor*255/(floor-ceil)
    out = np.maximum(0,np.minimum(255,image*a+b)).astype(np.uint8)

    return out

In [5]:
def changeToJSON(txtfile, file_name):
    '''
    txt_file - file path to XL annotation file
    ptc_file - file path to the XPC point cloud file
    '''
    json_file = [{
        "image"   : None,
        "verified"   :False,
        "annotations" : []
       
              }]
    
    #file = open(txtfile, "r+")
    list_add = []
    json_file[0]["image"]= file_name
    #print(os.path.basename(txtfile))
    
    for idx, i in enumerate(txtfile):
        annotation = {"label"       : None,
                      "coordinates":{
                                      'x':None,
                                      'y':None,
                                      'width':None,
                                      'height':None
                      }}
        

        [x1,y1] = i[1][0]
        [x2,y2] = i[1][2]
        
        #class_id = int(i.split(",")[4].replace('\n',""))
        annotation ["label"] = i[0]
        annotation["coordinates"]['x'] = int((x1+x2)/2)
        annotation["coordinates"]['y'] = int((y1+y2)/2)
        annotation["coordinates"]['width'] = int(x2-x1)
        annotation["coordinates"]['height'] = int(y2-y1)
        
        json_file [0]["annotations"].append(annotation)
    return json_file

In [6]:

visual_image_dir = r'C:\Users\Adarsh Kuzhipathalil\Applications_team\Datasets\XW_Output_Arvoo\Parking Bodart\VisualImages'
visual_image_filenames = glob.glob(os.path.join(visual_image_dir,'xl_visual*.xim'))
visual_image_filenames.sort()
print('Found {} visual images'.format(len(visual_image_filenames)))
visual_image_filenames[0].split('/')[-1].replace('.vis','.txt')
out_dir = r'C:\Users\Adarsh Kuzhipathalil\Applications_team\Datasets\XW_Output_Arvoo\Parking Bodart\Annotations'

Found 329 visual images


In [8]:


for visual_image_filename in tqdm.tqdm(visual_image_filenames):

    [c1,_] = xw.XW_ReadFile(visual_image_filename) 
    visual_image = c1['data'] 

    frame = scale_image_base(visual_image,120,0)

    frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
    shapes = Annotator.Detect(frame)
    json_annotations = changeToJSON(shapes,os.path.basename(visual_image_filename))
    out_file = os.path.join(out_dir, os.path.basename(visual_image_filename).replace('.xim','.json'))
    with open(out_file, 'w') as outfile:
        json.dump(json_annotations, outfile,ensure_ascii=False, indent=2)
        
    
    

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  (x_min, y_min, x_max, y_max) = preds[i]['rois'][j].astype(np.int)
100%|██████████| 329/329 [05:13<00:00,  1.05it/s]


In [45]:
json_annotations

[{'image': 'xl_visual00004292.xim',
  'verified': False,
  'annotations': [{'label': 'car',
    'coordinates': {'x': 1015.5, 'y': 69.5, 'width': 143, 'height': 45}}]}]

In [34]:
import os
os.path.basename(visual_image_filename)

'xl_visual00004292.xim'