-
Notifications
You must be signed in to change notification settings - Fork 153
/
tesseract2.py
92 lines (80 loc) · 2.8 KB
/
tesseract2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
Demo script using MuPDF OCR.
Extract text of a page and interpret unrecognized characters using Tesseract.
MuPDF codes unrecognizable characters as 0xFFFD = 65533.
The extraction option is "dict", which delivers contiguous pieces of text within
one line that share the same font properties (color, fontsize, etc.). Together
with the language parameter, this helps Tesseract find the correct character.
The basic approach is to invoke OCR only if the span text contains
chr(65533). Because Tesseract's response ignores leading spaces and appends
line break characters, some adjustments are made.
--------------
This demo will OCR only text that is known to be text. This means it
does not look at parts of a page containing images or text encoded as drawings.
--------------
Dependencies:
PyMuPDF v1.19.0
"""
import fitz
import time, os
# Print the docstring of the imported fitz module.
print(fitz.__doc__)

# Report the value of TESSDATA_PREFIX (None when the variable is not set).
# NOTE(review): presumably this is where Tesseract's language data lives -
# confirm against the PyMuPDF OCR documentation.
tessdata = os.environ.get("TESSDATA_PREFIX")
print("Found tessdata here:", tessdata)
print()

# Marker character that MuPDF emits for glyphs it could not map to Unicode.
INVALID_UNICODE = "\ufffd"  # code point 0xFFFD = 65533

# Scaling matrix (factor 5 in both directions) used when rendering a span
# to an image for OCR.
mat = fitz.Matrix(5, 5)

# Accumulated seconds spent rendering pixmaps / running OCR; both are
# updated by get_tessocr().
pix_time = 0
ocr_time = 0
def get_tessocr(page, bbox):
    """Return OCR-ed span text using Tesseract.

    The area ``bbox`` of ``page`` is rendered to a pixmap using the
    module-level matrix ``mat``, the pixmap is converted to an OCR-ed PDF
    via ``Pixmap.pdfocr_tobytes()``, and the text of that PDF's first page
    is extracted.

    Args:
        page: fitz.Page
        bbox: fitz.Rect or its tuple

    Returns:
        The OCR-ed text of the bbox. If the extracted text ends with a line
        break, exactly one trailing "\\n" is removed.

    Side effects:
        Adds the elapsed rendering time to the global ``pix_time`` and the
        elapsed OCR / text extraction time to the global ``ocr_time``.
    """
    # Only the two timers are rebound here; "mat" is merely read, so it
    # needs no global declaration.
    global ocr_time, pix_time

    # Step 1: Make a high-resolution image of the bbox.
    t0 = time.perf_counter()
    pix = page.get_pixmap(
        matrix=mat,
        clip=bbox,
    )
    t1 = time.perf_counter()

    # Step 2: OCR the image into an in-memory PDF and read back its text.
    ocrpdf = fitz.open("pdf", pix.pdfocr_tobytes())
    try:
        text = ocrpdf[0].get_text()
    finally:
        # Release the temporary document even if extraction fails; this
        # function is called once per affected span.
        ocrpdf.close()

    # The extracted text carries an appended line break - drop one.
    if text.endswith("\n"):
        text = text[:-1]
    t2 = time.perf_counter()

    ocr_time += t2 - t1
    pix_time += t1 - t0
    return text
# Walk through all text spans of the document and re-read via OCR every
# span that contains at least one "Invalid Unicode" character.
doc = fitz.open("v110-changes.pdf")
ocr_count = 0  # number of spans that were sent to OCR
for page in doc:
    # NOTE(review): every block is expected to have a "lines" key -
    # presumably flags=0 keeps image blocks out of the result; confirm
    # against the PyMuPDF text extraction flags documentation.
    blocks = page.get_text("dict", flags=0)["blocks"]
    for block in blocks:
        for line in block["lines"]:
            for span in line["spans"]:
                text = span["text"]
                if INVALID_UNICODE not in text:
                    continue  # span is fine, no OCR needed

                # invalid characters encountered: invoke OCR
                ocr_count += 1
                print("before: '%s'" % text)

                # The OCR result carries no surrounding whitespace, so
                # re-create the span's leading / trailing whitespace as
                # the same number of spaces.
                sb = " " * (len(text) - len(text.lstrip()))  # leading spaces
                sa = " " * (len(text) - len(text.rstrip()))  # trailing spaces
                new_text = sb + get_tessocr(page, span["bbox"]) + sa
                print(" after: '%s'" % new_text)
                print("-------------------------")
doc.close()

print("OCR invocations: %i." % ocr_count)

# Avoid ZeroDivisionError when no span needed OCR: report averages of 0.
avg_pix_time = pix_time / ocr_count if ocr_count else 0.0
avg_ocr_time = ocr_time / ocr_count if ocr_count else 0.0
print(
    "Pixmap time: %g (avg %g) seconds."
    % (round(pix_time, 5), round(avg_pix_time, 5))
)
print(
    "OCR time: %g (avg %g) seconds."
    % (round(ocr_time, 5), round(avg_ocr_time, 5))
)