In [27]:
import cv2 
import pytesseract
import numpy as np

#### Bounding boxes around letters
È possibile individuare il bounding box che circonda ogni lettera riconosciuta da Tesseract durante il processo di OCR.

In [39]:
# Draw a green rectangle around every character Tesseract recognises on the page.
image_path = 'temp/page_1.jpg'  # forward slash works on Windows and POSIX alike
img = cv2.imread(image_path)
if img is None:
    # cv2.imread reports an unreadable/missing file by returning None, not by raising.
    raise FileNotFoundError(f"Could not read image: {image_path}")

height = img.shape[0]
width = img.shape[1]
custom_config = r'--oem 3 --psm 6'

d = pytesseract.image_to_boxes(img, output_type=pytesseract.Output.DICT, config=custom_config)
# .get() keeps the cell working when no character was recognised and the key is absent.
n_boxes = len(d.get('char', []))
for i in range(n_boxes):
    # 'top'/'bottom' are flipped with `height - y` below so they become OpenCV
    # row coordinates (measured from the top edge of the image).
    x1, y2, x2, y1 = d['left'][i], d['top'][i], d['right'][i], d['bottom'][i]
    cv2.rectangle(img, (x1, height - y1), (x2, height - y2), (0, 255, 0), 2)

cv2.imshow('img', img)
cv2.waitKey(0)
# Close the preview window so the kernel is not left with a dangling GUI window.
cv2.destroyAllWindows()

-1

#### Bounding boxes around words

In alternativa è possibile individuare le singole parole presenti nell'immagine. Utilizzando la funzione image_to_data è possibile ottenere un dizionario contenente le parole individuate, i rispettivi bounding box, il testo e il confidence score di ogni parola.<br>

Among the data returned by pytesseract.image_to_data():

- left is the distance from the upper-left corner of the bounding box, to the left border of the image.
- top is the distance from the upper-left corner of the bounding box, to the top border of the image.
- width and height are the width and height of the bounding box.
- conf is the model's confidence for the prediction for the word within that bounding box. If conf is -1, that means that the corresponding bounding box contains a block of text, rather than just a single word.

In [42]:
# Draw a green rectangle around every word recognised with confidence above 60.
image_path = 'invoice-sample.jpg'
img = cv2.imread(image_path)
if img is None:
    # cv2.imread reports an unreadable/missing file by returning None, not by raising.
    raise FileNotFoundError(f"Could not read image: {image_path}")

d = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
print(d.keys())

# .get() keeps the cell working when the OCR result carries no rows at all.
n_boxes = len(d.get('text', []))
for i in range(n_boxes):
    # float() instead of int(): the confidence may arrive as an int, a float or a
    # decimal string such as '96.06', and int('96.06') raises ValueError.
    if float(d['conf'][i]) > 60:
        x, y, w, h = d['left'][i], d['top'][i], d['width'][i], d['height'][i]
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

cv2.imshow('img', img)
cv2.waitKey(0)
# Close the preview window so the kernel is not left with a dangling GUI window.
cv2.destroyAllWindows()

dict_keys(['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text'])


-1

### Estrazione dati

Utilizzando la funzione sopra indicata (image_to_data) è possibile estrarre testo dal corpo dell'immagine. Nell'esempio riportato sotto si estrae la data presente nel documento sulla base di un'espressione regolare (regexp).

In [35]:
import re
import cv2
import pytesseract
from pytesseract import Output

# Find words that look like a dd/mm/yyyy date, print them and box them on the image.
image_path = 'invoice-sample.jpg'
img = cv2.imread(image_path)
if img is None:
    # cv2.imread reports an unreadable/missing file by returning None, not by raising.
    raise FileNotFoundError(f"Could not read image: {image_path}")

d = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

# Raw string: '\d' in a normal string literal is an invalid escape sequence
# (a warning today, an error in future Python versions). The pattern is unchanged.
date_pattern = r'^(0[1-9]|[12][0-9]|3[01])/(0[1-9]|1[012])/(19|20)\d\d$'
# Compile once instead of re-parsing the pattern for every word.
date_regex = re.compile(date_pattern)

# .get() keeps the cell working when the OCR result carries no rows at all.
n_boxes = len(d.get('text', []))
for i in range(n_boxes):
    # float() instead of int(): the confidence may arrive as an int, a float or a
    # decimal string such as '96.06', and int('96.06') raises ValueError.
    if float(d['conf'][i]) > 60 and date_regex.match(d['text'][i]):
        print(d['text'][i])
        x, y, w, h = d['left'][i], d['top'][i], d['width'][i], d['height'][i]
        img = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

cv2.imshow('img', img)
cv2.waitKey(0)
# Close the preview window so the kernel is not left with a dangling GUI window.
cv2.destroyAllWindows()

12/12/2001


-1

In [19]:
# get grayscale image
def get_grayscale(image):
    """Return ``image`` converted from BGR colour to grayscale.

    Args:
        image: Input image, passed unchanged to ``cv2.cvtColor``.

    Returns:
        The result of ``cv2.cvtColor`` with ``cv2.COLOR_BGR2GRAY``.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grayscale

# noise removal
def remove_noise(image):
    """Return ``image`` smoothed with a median blur of aperture size 5.

    Args:
        image: Input image, passed unchanged to ``cv2.medianBlur``.

    Returns:
        The result of ``cv2.medianBlur(image, 5)``.
    """
    aperture_size = 5
    denoised = cv2.medianBlur(image, aperture_size)
    return denoised
 
#thresholding
def thresholding(image):
    """Return ``image`` binarised with a binary + Otsu threshold.

    Args:
        image: Input image, passed unchanged to ``cv2.threshold``.

    Returns:
        The second element of the ``cv2.threshold`` result (the thresholded
        image); the first element is discarded.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    result = cv2.threshold(image, 0, 255, mode)
    binary = result[1]
    return binary

#dilation
def dilate(image):
    """Return ``image`` dilated once with a 5x5 all-ones ``uint8`` kernel.

    Args:
        image: Input image, passed unchanged to ``cv2.dilate``.

    Returns:
        The result of ``cv2.dilate`` with ``iterations=1``.
    """
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    dilated = cv2.dilate(image, structuring_element, iterations=1)
    return dilated
    
#erosion
def erode(image):
    """Return ``image`` eroded once with a 5x5 all-ones ``uint8`` kernel.

    Args:
        image: Input image, passed unchanged to ``cv2.erode``.

    Returns:
        The result of ``cv2.erode`` with ``iterations=1``.
    """
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    eroded = cv2.erode(image, structuring_element, iterations=1)
    return eroded

#opening - erosion followed by dilation
def opening(image):
    """Return the morphological opening of ``image`` with a 5x5 all-ones kernel.

    Args:
        image: Input image, passed unchanged to ``cv2.morphologyEx``.

    Returns:
        The result of ``cv2.morphologyEx`` with ``cv2.MORPH_OPEN``.
    """
    structuring_element = np.ones((5, 5), dtype=np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened

#canny edge detection
def canny(image):
    """Return the Canny edge map of ``image`` using thresholds 100 and 200.

    Args:
        image: Input image, passed unchanged to ``cv2.Canny``.

    Returns:
        The result of ``cv2.Canny(image, 100, 200)``.
    """
    lower_threshold, upper_threshold = 100, 200
    edges = cv2.Canny(image, lower_threshold, upper_threshold)
    return edges

#skew correction
def deskew(image):
    """Rotate ``image`` by an angle derived from its non-zero pixels.

    The (row, column) indices of every pixel greater than zero are fed to
    ``cv2.minAreaRect``; the angle of that rectangle determines the rotation
    applied around the image centre.

    Args:
        image: Array-like image supporting ``image > 0`` and ``image.shape``.

    Returns:
        The image warped with ``cv2.warpAffine`` at the original width and
        height, using cubic interpolation and replicated borders.
    """
    foreground = np.column_stack(np.where(image > 0))
    rect_angle = cv2.minAreaRect(foreground)[-1]
    # NOTE(review): this branch presumes minAreaRect reports angles in [-90, 0);
    # newer OpenCV releases may use a different range - confirm against the
    # installed version.
    angle = -(90 + rect_angle) if rect_angle < -45 else -rect_angle
    height, width = image.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)
    return cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

#template matching
def match_template(image, template):
    """Return the normalised correlation-coefficient match of ``template`` in ``image``.

    Args:
        image: Image to search, passed unchanged to ``cv2.matchTemplate``.
        template: Template to look for, passed unchanged as well.

    Returns:
        The result of ``cv2.matchTemplate`` with ``cv2.TM_CCOEFF_NORMED``.
    """
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(image, template, method)
    return scores

In [20]:
# Show the grayscale, thresholded, opened and edge-detected versions of the page.
image_path = 'temp/page_1.jpg'  # forward slash works on Windows and POSIX alike
img = cv2.imread(image_path)
if img is None:
    # cv2.imread reports an unreadable/missing file by returning None, not by raising.
    raise FileNotFoundError(f"Could not read image: {image_path}")

gray = get_grayscale(img)
thresh = thresholding(gray)
# The results get their own names: assigning them to `opening` / `canny` would
# overwrite the helper functions, so a second run of this cell would fail with
# "'numpy.ndarray' object is not callable".
opened = opening(gray)
edges = canny(gray)

cv2.imshow('gray', gray)
cv2.imshow('thresh', thresh)
cv2.imshow('opening', opened)
cv2.imshow('canny', edges)
cv2.waitKey(0)
# Close the preview windows so the kernel is not left with dangling GUI windows.
cv2.destroyAllWindows()

-1