<font size=5> Optical Character Recognition with help of R-tree </font>

Aim:

steps:
1. convert pdf to image, if multiple pdf pages then each page into the individual image file.
2. convert the colour image into a grayscale image
3. read/create target bounding boxes
4. with help of tesseract to recognize the character in the image
5. create an r-tree index for each bounding box of tesseract output data.
6. find the intersection of the target bounding box in an r-tree index.
7. get the required target index from the data frame, continue processing text if necessary.
8. repeat above step remaining pages.

# Import library

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import rtree
import geopandas as gpd
from shapely.geometry import Polygon,LineString
import pytesseract
from pytesseract import Output
import cv2
from pdf2image import convert_from_path
from PIL import Image

import os
import re

pd.options.display.max_columns = 100

# Import file

In [2]:
file = "notebook/input/invoice3.pdf"
path = os.path.join(os.getcwd(),file)

target_path = os.path.join(os.getcwd(),'input')

In [3]:
os.getcwd()

'/home/sudhir/AI/practice/ocr/notebook'

# FileManager

1. PDF to Image convert
2. read image
3. show image

In [4]:
class FileManager:
    """
    File manager
    
    """
    
    def read_image(self, file, path):
        """
        Read image with opencv library
        """
        # file path + name
        filename = os.path.join(path,file)

        # img = Image.open(filename)
        img = cv2.imread(filename)
        
        # print image size
        print('Image size: ',img.shape)
        
        return img
    
    def show_image(self, img, small_size = (1000, 600)):
        # show image file with imshow method open cv 
        # cv2.namedWindow("img", cv2.WINDOW_NORMAL)
        # resize and show image
        if small_size:
            cv2.imshow('img',cv2.resize(img,small_size))
        else:
            cv2.imshow('img',img)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
    
    def pdf_2_image(self,file, path):
        """
        PDF to image converter
        """
        # target_
        filename = os.path.join(path,file)

        # convert pdf to image
        pages = convert_from_path(filename,dpi=200)        

        # save each page with proper file name
        list_pagefilename = []
        for img_cnt,page in enumerate(pages):
            
            # new file name for image
            pagefilename = file+"_page_"+str(img_cnt+1)+".jpg"
            pagefilename = os.path.join(path,pagefilename)
            list_pagefilename.append(pagefilename)

            # Save the image of the page in system
            page.save(pagefilename, 'JPEG')

        return list_pagefilename

In [5]:
filename = "page_1.jpg"
img = FileManager().read_image(filename, target_path)

Image size:  (2339, 1654, 3)


In [6]:
FileManager().show_image(img)

# Tesseract

Among the data returned by pytesseract.image_to_data():
* left is the distance from the upper-left corner of the bounding box, to the left border of the image.
* top is the distance from the upper-left corner of the bounding box, to the top border of the image.
* width and height are the width and height of the bounding box.
* conf is the model's confidence for the prediction for the word within that bounding box. If conf is -1, that means that the corresponding bounding box contains a block of text, rather than just a single word.

In [7]:
class OCRwithRtree:
    """
    Optical Character Recognition with help of R-tree.
    
    """
    
    def grayscale(self,img):
        img2 = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        return img2
    
    def apply_ocr(self,img):
        """
        Apply OCR on image 
        """
        data = pytesseract.image_to_data(img, output_type=Output.DICT)
        
        return data
    
    def bbox_on_ocr(self,data,img):
        
        # len
        n_boxes = len(data['level'])
        for i in range(n_boxes):
            if data['conf'][i] != '-1':
                (x,y,w,h) = (data['left'][i],data['top'][i],data['width'][i],data['height'][i])
                cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), 2)
        return img

In [8]:
data = OCRwithRtree().apply_ocr(img)

In [9]:
img = OCRwithRtree().bbox_on_ocr(data,img)
FileManager().show_image(img)

In [10]:
def convert_to_dataframe(data):
    # dict to dataframe
    df = pd.DataFrame(data)
    
    # remove lower confidence data
    df_st= df.query('conf != "-1"').reset_index(drop=True)
    
    print('Average performance of ocr:',round(df_st['conf'].mean(),3))
    
    # bounding box xmax,ymax calculation
    df_st['wloc'] = df_st['left'] + df_st['width'] 
    df_st['hloc'] = df_st['top'] + df_st['height']
    
    return df_st

In [11]:
df = convert_to_dataframe(data)

Average performance of ocr: 90.331


# Rtree index

In [12]:
def create_rtree_index(df):
    
    # intialize rtree 
    rindex = rtree.index.Index()
    
    # insert bbox to  rtree from data frame
    for idx in range(df.shape[0]):
        # (left, bottom, right, top)
        loc = tuple(df.loc[idx,['left','top','wloc','hloc']])
        rindex.insert(idx,loc)
    
    return rindex

In [13]:
def intersection_of_bbox(rindex,loc,df,normalize=None):
    inter_idx = [w for w in rindex.intersection(loc)]
    
    print('inersection index: ',inter_idx)
    df_tmp = df[df.index.isin(inter_idx)]
    
    # normalize the data
    text = ''
    if normalize:
        df_sort = df_tmp.sort_values(by=['top','left'])
        text = ' '.join(df_sort['text'].values)
        print('The text intersect with bbox is: ',text)
    return df_tmp,text

In [14]:
rindex = create_rtree_index(df)

In [15]:
# from
loc = (0,0, 10,10)
df_tmp,text = intersection_of_bbox(rindex,loc,df,normalize=True)

inersection index:  []
The text intersect with bbox is:  


## Query target bounding box
1. positive target bouding box
2. enclosed target bounding box

In [16]:
# from
loc = (120,346,272,435)
df_tmp,text = intersection_of_bbox(rindex,loc,df,normalize=True)

inersection index:  [2, 3, 5, 4, 34, 35]
The text intersect with bbox is:  From: - DEMO Sliced Suite 5A-1204


In [17]:
# from
loc = (118,651, 273, 809)
df_tmp,text = intersection_of_bbox(rindex,loc,df,normalize=True)

inersection index:  [58, 59, 60, 62, 61, 65, 64, 67]
The text intersect with bbox is:  To: Test Business Somewhere 123 Melbourne, VIC test@test.com


In [18]:
df_tmp

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,wloc,hloc
58,5,1,28,1,5,1,118,651,36,19,96,To:,154,670
59,5,1,28,1,6,1,118,686,47,19,96,Test,165,705
60,5,1,28,1,6,2,174,686,97,19,96,Business,271,705
61,5,1,28,1,7,1,120,722,38,18,96,123,158,740
62,5,1,28,1,7,2,167,721,128,19,96,Somewhere,295,740
64,5,1,28,1,8,1,120,755,118,21,96,"Melbourne,",238,776
65,5,1,28,1,8,2,248,755,39,19,96,VIC,287,774
67,5,1,28,1,9,1,118,790,155,19,91,test@test.com,273,809


## Problem 2

In [19]:
# filename = 'sales_invoice.pdf'
# target_path = os.path.join(os.getcwd(),'input')
# list_pagenames = FileManager().pdf_2_image(filename,target_path)

In [20]:
filename = "sales_invoice.pdf_page_1.jpg"
target_path = os.path.join(os.getcwd(),'input')
img = FileManager().read_image(filename, target_path)

Image size:  (2200, 1700, 3)


In [21]:
FileManager().show_image(img)

In [22]:
data = OCRwithRtree().apply_ocr(img)

In [23]:
df = convert_to_dataframe(data)

Average performance of ocr: 91.039


In [24]:
rindex = create_rtree_index(df)

In [25]:
# from
loc = (109,147, 380,344)
df_tmp,text = intersection_of_bbox(rindex,loc,df,normalize=True)

inersection index:  [15, 16, 14, 19, 18, 22, 26, 27, 32, 31, 33, 34]
The text intersect with bbox is:  Ltd ABC Pvt 100,10" main Bangalore Phone: 123456789 12345 Fax: Website: www.abc.com


In [26]:
df_tmp

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,wloc,hloc
14,5,1,15,1,1,1,109,147,74,27,96,ABC,183,174
15,5,1,15,1,1,2,198,147,51,26,94,Pvt,249,173
16,5,1,15,1,1,3,262,146,47,28,90,Ltd,309,174
18,5,1,15,2,1,1,111,187,92,23,72,"100,10""",203,210
19,5,1,15,2,1,2,214,189,55,18,96,main,269,207
22,5,1,15,3,1,1,111,218,115,32,96,Bangalore,226,250
26,5,1,15,4,1,1,110,257,76,19,95,Phone:,186,276
27,5,1,15,4,1,2,198,257,129,19,95,123456789,327,276
31,5,1,15,5,1,1,110,293,45,18,96,Fax:,155,311
32,5,1,15,5,1,2,167,292,70,19,96,12345,237,311


In [51]:
df.query('text == "SUBTOTAL" ')

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,wloc,hloc
86,5,1,15,13,2,1,1115,1355,122,32,96,SUBTOTAL,1237,1387


In [54]:
df.query(' 890< top > 780')['text'].values

array([' ', ' ', ' ', 'SUBTOTAL', '2,325.00', 'Other', 'Comments', 'or',
       'Special', 'Instructions', 'TAXABLE', '2,325.00', '1.', 'Total',
       'payment', 'due', 'in', '30', 'days', 'TAX', 'RATE', '6.875%',
       '2.', 'Please', 'include', 'the', 'invoice', 'number', 'on',
       'your', 'check', 'TAX', '159.84', 'S&H', '-', 'OTHER', '-',
       'TOTAL', '$', '2,484.84', ' ', 'Make', 'all', 'checks', 'payable',
       'to', 'ABC', 'Pvt', 'Ltd', 'If', 'you', 'have', 'any', 'questions',
       'about', 'this', 'invoice,', 'please', 'contact', '[Name,',
       'Phone', '#,', 'E-mail]', 'Thank', 'You', 'For', 'Your',
       'Business!'], dtype=object)

In [37]:
# from
loc = (567,790, 1237, 1387)
df_tmp,text = intersection_of_bbox(rindex,loc,df,normalize=True)

inersection index:  [124, 7, 66, 67, 68, 69, 75, 76, 83, 82, 11, 86]
The text intersect with bbox is:      DESCRIPTION QTY UNIT PRICE 15 150.00 75.00 1   SUBTOTAL


In [29]:
FileManager().show_image(img,small_size=None)

Refrence:

1. [bbox stackoverflow](https://stackoverflow.com/questions/20831612/getting-the-bounding-box-of-the-recognized-words-using-python-tesseract)