In [None]:
%pip show onnx onnxruntime
%pip show torch

In [6]:
#All required functions and classes
#RUN ONCE

import io, imageio
import os
import time

import torch, onnx
import onnxruntime as ort
import cv2
import numpy as np
from PIL import Image
import torchvision
import tensorflow.compat.v1 as tf
from PIL import Image, ImageDraw, ImageFont

tf.disable_v2_behavior()

def letterbox(im, new_shape=(416, 416), color=(114, 114, 114), auto=True, scaleup=True, stride=32):
    '''Scale an image to fit inside ``new_shape`` and pad the remainder with ``color``.

    Args:
        im: image array whose first two dimensions are (height, width).
        new_shape: target (height, width); an int or a one-element list means a square.
        color: constant border value handed to ``cv2.copyMakeBorder``.
        auto: when True, the padding is reduced modulo ``stride`` (minimum rectangle),
            so the result is not necessarily ``new_shape``.
        scaleup: when False, the image is only ever shrunk (ratio capped at 1.0).
        stride: modulus applied to the padding when ``auto`` is True.

    Returns:
        (padded image, scale ratio, (left padding, top padding)).
    '''
    src_shape = im.shape[:2]  # (height, width) of the input
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    elif isinstance(new_shape, list) and len(new_shape) == 1:
        new_shape = (new_shape[0], new_shape[0])

    # One ratio for both axes keeps the aspect ratio intact.
    ratio = min(new_shape[0] / src_shape[0], new_shape[1] / src_shape[1])
    if not scaleup:
        ratio = min(ratio, 1.0)

    # cv2 wants sizes as (width, height).
    resized_wh = int(round(src_shape[1] * ratio)), int(round(src_shape[0] * ratio))
    pad_w = new_shape[1] - resized_wh[0]
    pad_h = new_shape[0] - resized_wh[1]

    if auto:
        pad_w, pad_h = np.mod(pad_w, stride), np.mod(pad_h, stride)

    # Half of the padding goes on each side.
    pad_w /= 2
    pad_h /= 2

    if src_shape[::-1] != resized_wh:
        im = cv2.resize(im, resized_wh, interpolation=cv2.INTER_LINEAR)

    # The -0.1 / +0.1 nudges send an odd pixel of padding to the bottom / right side.
    top = int(round(pad_h - 0.1))
    bottom = int(round(pad_h + 0.1))
    left = int(round(pad_w - 0.1))
    right = int(round(pad_w + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)

    return im, ratio, (left, top)

def process_image(img_src, img_size, stride, half):
    '''Turn a raw HWC image into a normalised CHW tensor for inference.

    Args:
        img_src: source image array (height, width, channels).
        img_size: target size forwarded to ``letterbox``.
        stride: stride forwarded to ``letterbox``.
        half: when True the tensor is fp16, otherwise fp32.

    Returns:
        (tensor scaled to 0.0 - 1.0, the untouched ``img_src``).
    '''
    padded = letterbox(img_src, img_size, stride=stride)[0]

    # HWC -> CHW, then reverse the channel axis (BGR -> RGB).
    chw = padded.transpose((2, 0, 1))[::-1]
    # The reversed view has negative strides; torch needs a contiguous buffer.
    tensor = torch.from_numpy(np.ascontiguousarray(chw))

    if half:
        tensor = tensor.half()
    else:
        tensor = tensor.float()
    tensor /= 255  # 0 - 255 to 0.0 - 1.0

    return tensor, img_src

# with open(test_path, 'rb') as f:
#     img_np = np.asarray(Image.open(io.BytesIO(f.read())))
def xywh2xyxy(x):
    '''Convert [n, 4] boxes from centre format [x, y, w, h] to corner format [x1, y1, x2, y2].

    (x1, y1) is the top-left corner and (x2, y2) the bottom-right one. The input is
    left untouched; a tensor input yields a tensor, anything else goes through ``np.copy``.
    '''
    corners = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)

    centre_x, centre_y = x[:, 0], x[:, 1]
    half_w, half_h = x[:, 2] / 2, x[:, 3] / 2

    corners[:, 0] = centre_x - half_w  # top left x
    corners[:, 1] = centre_y - half_h  # top left y
    corners[:, 2] = centre_x + half_w  # bottom right x
    corners[:, 3] = centre_y + half_h  # bottom right y
    return corners

def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, agnostic=False, multi_label=False, max_det=50):
    """Runs Non-Maximum Suppression (NMS) on inference results.
    This code is borrowed from: https://github.com/ultralytics/yolov5/blob/47233e1698b89fc437a4fb9463c815e9171be955/utils/general.py#L775
    Args:
        prediction: (tensor), with shape [batch, N, 5 + num_classes], N is the number of candidate boxes per image.
            Each row is [center x, center y, width, height, objectness, class scores...].
        conf_thres: (float) confidence threshold.
        iou_thres: (float) iou threshold.
        agnostic: (bool), when it is set to True, we do class-independent nms, otherwise, different class would do nms respectively.
        multi_label: (bool), when it is set to True, one box can have multi labels, otherwise, one box only has one label.
        max_det: (int) maximum number of detections kept per image.

    Returns:
         list of detections (one entry per image), each item is one tensor with shape (num_boxes, 6), 6 is for [xyxy, conf, cls].

    Raises:
        AssertionError: if ``conf_thres`` or ``iou_thres`` is outside [0, 1].
    """

    # Check the parameters before they are used for filtering.
    assert 0 <= conf_thres <= 1, f'conf_thresh must be in 0.0 to 1.0, however {conf_thres} is provided.'
    assert 0 <= iou_thres <= 1, f'iou_thres must be in 0.0 to 1.0, however {iou_thres} is provided.'

    num_classes = prediction.shape[2] - 5  # number of classes
    # A box is a candidate only if both its objectness and its best class score pass the threshold.
    pred_candidates = torch.logical_and(prediction[..., 4] > conf_thres, torch.max(prediction[..., 5:], dim=-1)[0] > conf_thres)

    # Function settings.
    max_wh = 4096  # maximum box width and height
    max_nms = 30000  # maximum number of boxes put into torchvision.ops.nms()
    time_limit = 10.0  # quit the function when nms cost time exceed the limit time.
    multi_label &= num_classes > 1  # multiple labels per box

    tik = time.time()
    # One independent empty tensor per image (a multiplied list would alias a single object).
    output = [torch.zeros((0, 6), device=prediction.device) for _ in range(prediction.shape[0])]
    for img_idx, x in enumerate(prediction):  # image index, image inference
        # Boolean indexing copies, so the in-place scaling below never touches `prediction`.
        x = x[pred_candidates[img_idx]]  # confidence

        # If no box remains, skip the next process.
        if not x.shape[0]:
            continue

        # confidence multiply the objectness
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix's shape is  (n,6), each row represents (xyxy, conf, cls)
        if multi_label:
            box_idx, class_idx = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[box_idx], x[box_idx, class_idx + 5, None], class_idx[:, None].float()), 1)
        else:  # Only keep the class with highest scores.
            conf, class_idx = x[:, 5:].max(1, keepdim=True)
            x = torch.cat((box, conf, class_idx.float()), 1)[conf.view(-1) > conf_thres]

        # Check shape
        num_box = x.shape[0]  # number of boxes
        if not num_box:  # no boxes kept.
            continue
        elif num_box > max_nms:  # excess max boxes' number.
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

        # Batched NMS: shifting each class by a large offset keeps boxes of different
        # classes from suppressing each other in a single nms() call.
        class_offset = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + class_offset, x[:, 4]  # boxes (offset by class), scores
        keep_box_idx = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if keep_box_idx.shape[0] > max_det:  # limit detections
            keep_box_idx = keep_box_idx[:max_det]

        output[img_idx] = x[keep_box_idx]
        if (time.time() - tik) > time_limit:
            print(f'WARNING: NMS cost time exceed the limited {time_limit}s.')
            break  # time limit exceeded

    return output

def button_candidates(boxes, scores, image):
    """Crop every detected button out of ``image`` and resize it to 180x180.

    Box coordinates are truncated to integers and clamped to the image bounds, because
    a negative index would otherwise wrap around in NumPy slicing and select the wrong
    region. Boxes that end up with no area are skipped, since ``cv2.resize`` cannot
    handle an empty crop.

    Args:
        boxes: iterable of [x_min, y_min, x_max, y_max] in pixel coordinates of ``image``.
        scores: iterable of confidences, aligned with ``boxes``.
        image: image array whose first two dimensions are (height, width).

    Returns:
        (button_patches, button_positions, button_scores): three equally long lists with
        the 180x180 crops, the clamped integer [x_min, y_min, x_max, y_max] boxes and the
        matching scores.
    """
    img_h, img_w = image.shape[:2]

    button_scores = []  # stores the score of each button (confidence)
    button_patches = []  # stores the cropped image that encloses the button
    button_positions = []  # stores the coordinates of the bounding box on buttons

    for box, score in zip(boxes, scores):
        x_min = min(max(int(box[0]), 0), img_w)
        y_min = min(max(int(box[1]), 0), img_h)
        x_max = min(max(int(box[2]), 0), img_w)
        y_max = min(max(int(box[3]), 0), img_h)

        # Nothing left to crop once the box is clipped to the image.
        if x_max <= x_min or y_max <= y_min:
            continue

        button_patch = image[y_min: y_max, x_min: x_max]
        button_patch = cv2.resize(button_patch, (180, 180))

        button_scores.append(score)
        button_patches.append(button_patch)
        button_positions.append([x_min, y_min, x_max, y_max])
    return button_patches, button_positions, button_scores


# Character -> class index used by the OCR model. The index of each character is its
# position in this string; note that 'Q', 'W' and 'Y' are not part of the alphabet.
# <nul> = +
charset = {
    symbol: index
    for index, symbol in enumerate('0123456789' 'ABCDEFGHIJKLMNOP' 'RSTUVXZ' '<>()$#^s-*%?!+')
}

class CharacterRecognizer:
  """Reads the label on a 180x180 button crop using a frozen TensorFlow graph.

  The graph is loaded once in ``init_recognizer`` (called from ``__init__``); ``predict``
  then feeds one image at a time through the session and decodes the predicted
  character codes with the inverse of the module-level ``charset`` mapping.
  """

  def __init__(self, graph_path=None, verbose=False):
    """Create the recognizer and load the graph.

    Args:
      graph_path: path to the frozen graph file; None selects './models/ocr_graph.pb'.
      verbose: when True, ``predict`` also pops up the annotated image.

    Raises:
      IOError: if the graph file does not exist.
    """
    self.graph_path = graph_path #path to the model which is loaded as a graph
    self.session = None
    self.input = None
    self.output = []
    self.class_num = 1
    self.verbose = verbose

    # Inverse of charset: class index -> character.
    self.idx_lbl = {index: label for label, index in charset.items()}
    self.init_recognizer()

  def init_recognizer(self):
    """Load the frozen graph, open a session and look up the input/output tensors.

    Safe to call more than once: a previous session is closed and the list of output
    tensors is rebuilt instead of being appended to.
    """
    # load graph from default folder
    if self.graph_path is None:
      self.graph_path = './models/ocr_graph.pb'

    # check existence of the graph file
    if not os.path.exists(self.graph_path):
      raise IOError('Invalid ocr_graph path! {}'.format(self.graph_path))

    # Release a session left over from an earlier initialisation.
    self.clear_session()

    # load frozen graph
    recognition_graph = tf.Graph()
    with recognition_graph.as_default():
      od_graph_def = tf.GraphDef()
      with tf.gfile.GFile(self.graph_path, 'rb') as fid:
        od_graph_def.ParseFromString(fid.read())
      tf.import_graph_def(od_graph_def, name='')
    self.session = tf.Session(graph=recognition_graph)

    # prepare input and output request; predict() unpacks exactly these two outputs.
    self.input = recognition_graph.get_tensor_by_name('ocr_input:0')
    self.output = [
      recognition_graph.get_tensor_by_name('predicted_chars:0'),
      recognition_graph.get_tensor_by_name('predicted_scores:0'),
    ]

  def clear_session(self):
    """Close the TensorFlow session, if one is open."""
    if self.session is not None:
      self.session.close()
      self.session = None

  def predict(self, image_np, draw=False):
    """Recognise the text on one button crop.

    Args:
      image_np: image array of shape (180, 180, 3).
      draw: when True, the third return value has the recognised text drawn on it.

    Returns:
      (text, score_ave, image): the decoded string with the null symbol '+' removed,
      the mean score of the kept characters (0.0 when nothing was recognised), and
      the input image (annotated if ``draw``) as a NumPy array.
    """
    assert image_np.shape == (180, 180, 3)
    img_in = np.expand_dims(image_np, axis=0)
    # returns codes and scores for each code (single letter)
    codes, scores = self.session.run(self.output, feed_dict={self.input: img_in})
    # Flatten instead of squeezing so a length-1 sequence stays iterable.
    codes, scores = np.ravel(codes), np.ravel(scores)

    score_sum = 0
    text = ''
    for char, score in zip(codes, scores):
      label = self.idx_lbl[char]
      if label != '+':  # '+' is the null symbol
        score_sum += score
        text += label
    # Guard against an all-null prediction, which used to divide by zero.
    score_ave = score_sum / len(text) if text else 0.0

    if self.verbose:
      self.visualize_recognition_result(image_np, text, score_ave)

    img_show = self.draw_result(image_np, text, score_ave) if draw else image_np

    return text, score_ave, np.array(img_show)

  @staticmethod
  def _render_text(image_np, text, font_size):
    """Return a PIL copy of ``image_np`` with ``text`` drawn in magenta at (45, 60)."""
    img_pil = Image.fromarray(image_np)
    canvas = ImageDraw.Draw(img_pil)
    # The font file is looked up relative to the current working directory.
    font = ImageFont.truetype('./Arial.ttf', font_size)
    canvas.text((45, 60), text=text, font=font, fill=(255, 0, 255))
    return img_pil

  @staticmethod
  def visualize_recognition_result(image_np, text, scores):
    """Open the annotated image in the default viewer (``scores`` is unused)."""
    CharacterRecognizer._render_text(image_np, text, 100).show()

  @staticmethod
  def draw_result(image_np, text, scores):
    """Return the annotated image as a PIL image (``scores`` is unused)."""
    return CharacterRecognizer._render_text(image_np, text, 60)

In [None]:
import torch, onnx
import onnxruntime as ort
import cv2,os
import numpy as np
import io
from PIL import Image
import time
import torchvision
# Batch run: YOLOv6 button detection + OCR for every image in ./test_imgs/.
recognizer = CharacterRecognizer(verbose=False)

# Load and validate the detector once. The model is the same for every image, so
# rebuilding the session per file only added overhead to the timing printed below.
onnx_model = onnx.load('models/yolov6/best_ckpt.onnx')
onnx.checker.check_model(onnx_model)
ort_sess = ort.InferenceSession('models/yolov6/best_ckpt.onnx')

st = time.time()
# Sorted so the printed results come out in the same order on every run.
for file_name in sorted(os.listdir("./test_imgs/")):
    test_path = "./test_imgs/" + file_name
    print(test_path)
    img_np = cv2.imread(test_path)
    if img_np is None:  # cv2.imread returns None for files it cannot decode
        print("skipped (not a readable image)")
        continue
    img_n, _ = process_image(img_np, (416, 416), 2, False)
    img_n = img_n.numpy()[np.newaxis]  # add the batch dimension

    preds = ort_sess.run(None, {'images': img_n})
    preds = np.array(preds)
    # 3549 rows of [cx, cy, w, h, objectness, class score] — the layout non_max_suppression reads.
    preds = preds.reshape(1, 3549, 6)
    preds = torch.tensor(preds)
    dets = non_max_suppression(preds, conf_thres=0.25, iou_thres=0.45, agnostic=False)[0]

    dets = dets.tolist()
    boxes = [row[:4] for row in dets]
    scores = [row[4] for row in dets]

    # NOTE(review): the boxes are in the coordinates of the letterboxed network input but
    # are applied to img_np unscaled; presumably the test images are already 416x416 — confirm.
    button_patches, button_positions, _ = button_candidates(boxes, scores, img_np)

    for button_img in button_patches:
        # get button text and button_score for each of the images in button_patches
        button_text, button_score, _ = recognizer.predict(button_img)
        print(button_text)

# Elapsed time for pre-processing, detection and OCR over all images (model loading excluded).
print(time.time()-st)


In [11]:
import os
# Single-image demo: detect buttons with YOLOv6, read them with the OCR model and paste
# the recognised text back onto the image.
test_path = "test_imgs/15_jpg.rf.7e4ba4c0c0bdb3beea120118c56fd793.jpg"
img_np = cv2.imread(test_path)
if img_np is None:  # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError(f"Could not read image: {test_path}")
img_n, _ = process_image(img_np, (416, 416), 2, False)
img_n = img_n.numpy()[np.newaxis]  # add the batch dimension

#load models
onnx_model = onnx.load('models/yolov6/best_ckpt.onnx')
onnx.checker.check_model(onnx_model)
ort_sess = ort.InferenceSession('models/yolov6/best_ckpt.onnx')

#get preds
preds = ort_sess.run(None, {'images': img_n})
preds = np.array(preds)
# 3549 rows of [cx, cy, w, h, objectness, class score] — the layout non_max_suppression reads.
preds = preds.reshape(1, 3549, 6)
preds = torch.tensor(preds)
dets = non_max_suppression(preds, conf_thres=0.25, iou_thres=0.45, agnostic=False)[0]
dets = dets.tolist()
boxes = [row[:4] for row in dets]
scores = [row[4] for row in dets]

recognizer = CharacterRecognizer(verbose=False)
# NOTE(review): the boxes are in the coordinates of the letterboxed network input but are
# applied to img_np unscaled; presumably the test image is already 416x416 — confirm.
button_patches, button_positions, _ = button_candidates(boxes, scores, img_np)

for button_img, button_pos in zip(button_patches, button_positions):
    button_text, button_score, button_draw = recognizer.predict(button_img, draw=True)
    x_min, y_min, x_max, y_max = button_pos
    print(x_min, y_min, x_max, y_max)
    # Shrink the annotated 180x180 crop back to the box size, then paste it with a
    # 6-pixel inset so the border of the original button stays visible.
    button_rec = cv2.resize(button_draw, (x_max-x_min, y_max-y_min))
    img_np[y_min+6:y_max-6, x_min+6:x_max-6] = button_rec[6:-6, 6:-6]

cv2.imshow('IMage',img_np)
cv2.waitKey(0)
# Close the window after the key press so it does not linger once the cell has finished.
cv2.destroyAllWindows()


270 307 316 340
95 232 141 264
182 307 229 339
269 232 316 264
270 268 315 302
268 64 318 102
270 193 316 227
183 68 231 102
103 135 140 161
184 192 232 227
96 269 143 302
272 135 311 162
96 194 142 227
97 65 144 99
95 306 142 340


113

In [10]:
#Convert yolov8 to onnx
# Exports the trained YOLOv8 weights to ONNX with a 416-pixel input size, the same
# size the inference cells in this notebook pass to process_image.
from ultralytics import YOLO

# Load a model
model = YOLO('models/yolov8/saved_models/yolov8nano.pt')  # load model

# Export the model
# NOTE(review): the recorded output reports the file saved under
# models/yolov8/saved_models/, while the inference cell loads
# models/yolov8/yolov8nano.onnx — confirm the file is moved or copied in between.
model.export(format='onnx',imgsz=416)

Ultralytics YOLOv8.0.126 🚀 Python-3.9.16 torch-2.0.1+cu117 CPU
Model summary (fused): 168 layers, 3005843 parameters, 0 gradients, 8.1 GFLOPs

[34m[1mPyTorch:[0m starting from models/yolov8/saved_models/yolov8nano.pt with input shape (1, 3, 416, 416) BCHW and output shape(s) (1, 5, 3549) (6.0 MB)

[34m[1mONNX:[0m starting export with onnx 1.14.0 opset 17...
[34m[1mONNX:[0m export success ✅ 0.7s, saved as models/yolov8/saved_models/yolov8nano.onnx (11.6 MB)

Export complete (1.0s)
Results saved to [1m/home/satarw/optimization/models/yolov8/saved_models[0m
Predict:         yolo predict task=detect model=models/yolov8/saved_models/yolov8nano.onnx imgsz=416 
Validate:        yolo val task=detect model=models/yolov8/saved_models/yolov8nano.onnx imgsz=416 data=/content/datasets/Elevator-buttons-2/data.yaml 
Visualize:       https://netron.app


verbose: False, log level: Level.ERROR



'models/yolov8/saved_models/yolov8nano.onnx'

In [None]:
#infer from yolov8onnx+ocr
import os
# Single-image demo: detect buttons with the exported YOLOv8 ONNX model, read them with
# the OCR model and paste the recognised text back onto the image.
test_path = "test_imgs/15_jpg.rf.7e4ba4c0c0bdb3beea120118c56fd793.jpg"
img_np = cv2.imread(test_path)
if img_np is None:  # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError(f"Could not read image: {test_path}")
img_n, _ = process_image(img_np, (416, 416), 2, False)
img_n = img_n.numpy()[np.newaxis]  # add the batch dimension

#load models
# NOTE(review): the export cell's recorded output says the ONNX file was written to
# models/yolov8/saved_models/yolov8nano.onnx — confirm this path is where it ends up.
onnx_model = onnx.load('models/yolov8/yolov8nano.onnx')
onnx.checker.check_model(onnx_model)
ort_sess = ort.InferenceSession('models/yolov8/yolov8nano.onnx')

#get preds
preds = ort_sess.run(None, {'images': img_n})
preds = np.array(preds)
# The export log reports an output of shape (1, 5, 3549): five channels for each of
# 3549 candidates. non_max_suppression expects (batch, candidates, 5 + classes), so put
# the candidates on axis 1.
preds = preds.reshape(1, 5, 3549).transpose(0, 2, 1)  # -> (1, 3549, 5)
# YOLOv8 has no separate objectness channel: channels are [cx, cy, w, h, class score].
# Insert an objectness of 1.0 so the NMS product obj * cls equals the class score.
npreds = np.ones((1, 3549, 6), dtype=np.float32)
npreds[:, :, :4] = preds[:, :, :4]
npreds[:, :, 5] = preds[:, :, 4]
preds = torch.tensor(npreds)
print(preds.shape)
dets = non_max_suppression(preds, conf_thres=0.25, iou_thres=0.45, agnostic=False)[0]
dets = dets.tolist()
boxes = [row[:4] for row in dets]
print(boxes)
scores = [row[4] for row in dets]

recognizer = CharacterRecognizer(verbose=False)
# NOTE(review): the boxes are in the coordinates of the letterboxed network input but are
# applied to img_np unscaled; presumably the test image is already 416x416 — confirm.
button_patches, button_positions, _ = button_candidates(boxes, scores, img_np)

for button_img, button_pos in zip(button_patches, button_positions):
    button_text, button_score, button_draw = recognizer.predict(button_img, draw=True)
    x_min, y_min, x_max, y_max = button_pos
    print(x_min, y_min, x_max, y_max)
    # Shrink the annotated 180x180 crop back to the box size, then paste it with a
    # 6-pixel inset so the border of the original button stays visible.
    button_rec = cv2.resize(button_draw, (x_max-x_min, y_max-y_min))
    img_np[y_min+6:y_max-6, x_min+6:x_max-6] = button_rec[6:-6, 6:-6]

cv2.imshow('IMage',img_np)
cv2.waitKey(0)
# Close the window after the key press so it does not linger once the cell has finished.
cv2.destroyAllWindows()