# PaddleOCR with OpenVINO

This demo shows how to run the PaddleOCR (Lite) model on OpenVINO natively. Instead of exporting the PaddlePaddle model to ONNX and then creating the Intermediate Representation (IR) format through the OpenVINO Model Optimizer, we can now read directly from the Paddle model without any conversion.

Authors: 
Zhuo Wu, PhD (OpenVINO Edge AI Software Evangelist - Intel)

## Run Paddle Detection with OpenVINO

In [None]:
import os, os.path
import sys
import json
import urllib.request
import cv2
import numpy as np
import paddle
import math
import time
import collections

from openvino.inference_engine import IENetwork, IECore, ExecutableNetwork
from IPython import display
from PIL import Image, ImageDraw
import copy

import logging
import imghdr
from shapely.geometry import Polygon
import pyclipper

sys.path.append("/home/wu/openvino_notebooks/notebooks/utils")
import notebook_utils as utils
from pre_post_processing import *

### Load the Network for Paddle Detection

In [None]:
# Directory holding the exported PaddleOCR text detection inference model
det_model_dir = "/home/wu/PaddleOCR/inference/ch_ppocr_mobile_v2.0_det_infer"
det_model_file_path = f"{det_model_dir}/inference.pdmodel"
det_params_file_path = f"{det_model_dir}/inference.pdiparams"

# Read the Paddle detection model directly with the Inference Engine;
# only the .pdmodel path is passed to read_network here
det_ie = IECore()
det_net = det_ie.read_network(det_model_file_path)

### Load the Network for Paddle Recognition

In [None]:
# Directory holding the exported PaddleOCR text recognition inference model
rec_model_dir = "/home/wu/PaddleOCR/inference/ch_ppocr_mobile_v2.0_rec_infer"
rec_model_file_path = f"{rec_model_dir}/inference.pdmodel"
rec_params_file_path = f"{rec_model_dir}/inference.pdiparams"

# Read the Paddle recognition model directly with the Inference Engine;
# only the .pdmodel path is passed to read_network here
rec_ie = IECore()
rec_net = rec_ie.read_network(rec_model_file_path)

### Image preprocessing and postprocessing functions for text detection and recognition

In [None]:
def image_preprocess(input_image, size):
    """Resize, rescale and normalize an image for the text detection network.

    Parameters:
        input_image: image array in HWC layout with 3 channels.
        size: side length of the square the image is resized to.

    Returns:
        float32 array of shape (1, 3, size, size): pixel values divided by 255,
        then normalized per channel with the mean/std constants below.
    """
    # NormalizeImage: {mean: [0.485, 0.456, 0.406], std: [0.229, 0.224, 0.225], is_scale: True}
    channel_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
    channel_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))

    resized = cv2.resize(input_image, (size, size))
    # HWC -> CHW, scale to [0, 1], then add the leading batch axis
    batch = np.expand_dims(np.transpose(resized, [2, 0, 1]) / 255, 0)
    # in-place so the channel axis must match the 3-element mean/std exactly
    batch -= channel_mean
    batch /= channel_std
    return batch.astype(np.float32)

In [None]:
def draw_text_det_res(dt_boxes, img_path):
    """Draw the outline of every detected text box onto an image.

    Parameters:
        dt_boxes: iterable of boxes; each box must be convertible to an
            array that reshapes to (-1, 2) integer points.
        img_path: despite its name, the image array itself (not a file path);
            the polygons are drawn directly onto it.

    Returns:
        The same image object that was passed in.
    """
    canvas = img_path
    for points in dt_boxes:
        polygon = np.array(points).astype(np.int32).reshape(-1, 2)
        # closed polygon, 2 px thick
        cv2.polylines(canvas, [polygon], True, color=(255, 255, 0), thickness=2)
    return canvas

In [None]:
#Preprocess for Paddle Recognition
def resize_norm_img(img, max_wh_ratio):
    """Resize a cropped text image to height 32 and right-pad it with zeros.

    Parameters:
        img: image array in HWC layout; must have 3 channels.
        max_wh_ratio: largest width/height ratio in the current batch; the
            padded output width is int(32 * max_wh_ratio).

    Returns:
        float32 array of shape (3, 32, int(32 * max_wh_ratio)) whose left part
        holds the resized image scaled to [-1, 1] and whose remainder is zero.

    Raises:
        AssertionError: if the image does not have 3 channels.
    """
    channels, target_h = 3, 32
    assert channels == img.shape[2]
    # the padded width always follows the widest crop of the batch
    target_w = int((32 * max_wh_ratio))

    src_h, src_w = img.shape[:2]
    aspect = src_w / float(src_h)
    # keep the aspect ratio, but never exceed the padded width
    resized_w = min(target_w, int(math.ceil(target_h * aspect)))

    scaled = cv2.resize(img, (resized_w, target_h)).astype('float32')
    # HWC -> CHW, map [0, 255] to [-1, 1]
    normalized = (scaled.transpose((2, 0, 1)) / 255 - 0.5) / 0.5

    canvas = np.zeros((channels, target_h, target_w), dtype=np.float32)
    canvas[:, :, :resized_w] = normalized
    return canvas

### Main processing function for PaddleOCR

In [None]:
# Define main function for PaddleOCR
def run_paddle_ocr(source=0, flip=False, use_popup=False):
    """Run text detection and recognition on every frame of a video source.

    For each frame: the detection network locates text boxes, the boxes and
    the mean detection inference time are drawn on the frame, the frame is
    displayed, and every detected box is cropped and passed through the
    recognition network. The recognition results of each frame are printed.

    Parameters:
        source: video source handed to ``utils.VideoPlayer``.
        flip: handed to ``utils.VideoPlayer``. Note that every frame is
            additionally mirrored with ``cv2.flip(frame, 1)`` before processing.
        use_popup: if True, show frames in an OpenCV window (ESC exits);
            otherwise render them as JPEG images in the notebook output.

    Returns:
        None. Stops when the source ends, ESC is pressed in popup mode, or on
        KeyboardInterrupt; a RuntimeError is printed instead of propagated.
        The player is always stopped on exit.
    """
    # create video player to play with target fps
    player = utils.VideoPlayer(source=source, flip=flip, fps=30)

    # Start video capturing
    player.start()
    try:
        if use_popup:
            title = "Press ESC to Exit"
            cv2.namedWindow(winname=title, flags=cv2.WINDOW_GUI_NORMAL | cv2.WINDOW_AUTOSIZE)

        # only the processing times of the last 200 frames are kept
        processing_times = collections.deque(maxlen=200)

        # names of the (first) input of each network; separate names so the
        # recognition key can never shadow the detection key
        det_input_key = list(det_net.input_info.items())[0][0]  # 'inputs'
        rec_input_key = list(rec_net.input_info.items())[0][0]  # 'inputs'

        # The detection network is compiled lazily and only re-compiled when
        # the input shape changes, instead of on every single frame.
        det_exec_net = None
        det_input_shape = None
        rec_batch_num = 6

        while True:
            # grab the frame
            frame1 = player.next()
            if frame1 is None:
                print("Source ended")
                break

            # Flip the image otherwise the recognition result is wrong
            frame = cv2.flip(frame1, 1)
            test_image = image_preprocess(frame, 640)

            # pdmodel might be dynamic shape, this will reshape based on the input
            if det_exec_net is None or det_input_shape != test_image.shape:
                det_net.reshape({det_input_key: test_image.shape})
                det_exec_net = det_ie.load_network(det_net, 'CPU')
                det_input_shape = test_image.shape

            # measure processing time of the detection inference step
            start_time = time.time()
            output = det_exec_net.infer({det_input_key: test_image})
            stop_time = time.time()

            # Postprocessing for Paddle Detection
            ori_im = frame.copy()  # untouched copy used later for cropping
            data = {'image': frame}
            data_resize = DetResizeForTest(data)
            # NOTE(review): result unused; call kept in case the helper has
            # side effects on `data` - confirm against pre_post_processing
            NormalizeImage(data_resize)
            shape_list = np.expand_dims(data['shape'], axis=0)

            pred = list(output.values())[0]
            if isinstance(pred, paddle.Tensor):
                pred = pred.numpy()
            pred = pred[:, 0, :, :]
            segmentation = pred > 0.3

            boxes_batch = []
            for batch_index in range(pred.shape[0]):
                src_h, src_w, _, _ = shape_list[batch_index]
                boxes, _ = boxes_from_bitmap(pred[batch_index], segmentation[batch_index], src_w, src_h)
                boxes_batch.append({'points': boxes})
            dt_boxes = boxes_batch[0]['points']

            dt_boxes = filter_tag_det_res(dt_boxes, ori_im.shape)
            # Draw boxes on detected text
            src_im = draw_text_det_res(dt_boxes, frame)

            processing_times.append(stop_time - start_time)

            # Visualize Paddle detection results
            f_width = frame.shape[1]
            # mean processing time [ms]
            processing_time = np.mean(processing_times) * 1000
            cv2.putText(img=src_im, text=f"Inference time: {processing_time:.1f}ms", org=(20, 40),
                        fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=f_width / 1000,
                        color=(0, 0, 255), thickness=1, lineType=cv2.LINE_AA)

            # use this workaround if there is flickering
            if use_popup:
                cv2.imshow(winname=title, mat=src_im)
                key = cv2.waitKey(1)
                # escape = 27
                if key == 27:
                    break
            else:
                # encode numpy array to jpg
                _, encoded_img = cv2.imencode(ext=".jpg", img=src_im,
                                              params=[cv2.IMWRITE_JPEG_QUALITY, 100])
                # create IPython image
                i = display.Image(data=encoded_img)
                # display the image in this notebook
                display.clear_output(wait=True)
                display.display(i)

            # Preprocess detection results for recognition
            dt_boxes = sorted_boxes(dt_boxes)
            if len(dt_boxes) == 0:
                # nothing detected in this frame, nothing to recognize
                continue

            img_crop_list = [
                get_rotate_crop_image(ori_im, copy.deepcopy(box)) for box in dt_boxes
            ]

            # Recognition starts from here
            img_num = len(img_crop_list)
            # Calculate the aspect ratio of all text bars
            width_list = [crop.shape[1] / float(crop.shape[0]) for crop in img_crop_list]
            # Sorting can speed up the recognition process
            indices = np.argsort(np.array(width_list))
            # entries are replaced (never mutated), so sharing the placeholder is safe
            rec_res = [['', 0.0]] * img_num

            # Run text recognition on the crops in batches of rec_batch_num
            for beg_img_no in range(0, img_num, rec_batch_num):
                end_img_no = min(img_num, beg_img_no + rec_batch_num)
                batch_crops = [img_crop_list[indices[ino]] for ino in range(beg_img_no, end_img_no)]

                # all crops of a batch are padded to the widest aspect ratio
                max_wh_ratio = 0
                for crop in batch_crops:
                    h, w = crop.shape[0:2]
                    max_wh_ratio = max(max_wh_ratio, w * 1.0 / h)

                norm_img_batch = np.concatenate(
                    [resize_norm_img(crop, max_wh_ratio)[np.newaxis, :] for crop in batch_crops]
                )

                # pdmodel might be dynamic shape, this will reshape based on the input;
                # batch size and padded width differ between batches
                rec_net.reshape({rec_input_key: norm_img_batch.shape})
                # Load the Paddle recognition network on CPU
                rec_exec_net = rec_ie.load_network(rec_net, 'CPU')

                # One inference call handles the whole batch
                output = rec_exec_net.infer({rec_input_key: norm_img_batch})
                preds = list(output.values())[0]

                # Postprocessing recognition results
                postprocess_op = build_post_process(postprocess_params)
                rec_result = postprocess_op(preds)
                # write results back in the original (unsorted) box order
                for rno, text_score in enumerate(rec_result):
                    rec_res[indices[beg_img_no + rno]] = text_score
            print(rec_res)

    # ctrl-c
    except KeyboardInterrupt:
        print("Interrupted")
    # any different error
    except RuntimeError as e:
        print(e)
    finally:
        # stop capturing
        player.stop()
        if use_popup:
            cv2.destroyAllWindows()

## Run Live PaddleOCR with OpenVINO

In [None]:
# Run OCR on video source 0 (presumably the default webcam - verify on your machine)
# and show the annotated frames in a popup window; press ESC in that window to stop.
run_paddle_ocr(source=0, flip=True, use_popup=True)

In [None]:
#Test OCR results on video file

#video_file = "test1.mp4"
#source = video_file
#player = utils.VideoPlayer(source=source, flip=False, fps=30)