## This notebook illustrates various Pytesseract operations 

In [1]:
try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract


### How to convert an Image to String 

In [2]:
# Open the image through a context manager so the underlying file handle is
# closed as soon as OCR has finished, then print the text recognised with the
# English ('eng') language model.
with Image.open('invoice.jpg') as invoice_image:
    print(pytesseract.image_to_string(invoice_image, lang='eng'))

ABC Ventures

BILL TO INVOICE # 101
XYZ InfoTech INVOICE DATE 28/06/2020
DLF Phase 3

Gurgaon

Invoice Total ¥ 52,000.00

DESCRIPTION AMOUNT
Laptop 40,000.00
Headphone 2,000.00
Bluetooth Speaker 8,000.00
Screen Guard 2,000.00

TERMS & CONDITIONS

Payment is due within 15 days



### How to time out / terminate a Tesseract job

In [3]:
# A file path can be passed directly instead of a PIL image.
# `timeout` is given in seconds; pytesseract raises RuntimeError when the
# Tesseract process exceeds it, so catch that and report it rather than
# leaving a raw traceback in the notebook.
try:
    print(pytesseract.image_to_string('invoice.jpg', timeout=20))  # Timeout after 20 seconds
except RuntimeError as timeout_error:
    print('Tesseract processing was terminated:', timeout_error)

ABC Ventures

BILL TO INVOICE # 101
XYZ InfoTech INVOICE DATE 28/06/2020
DLF Phase 3

Gurgaon

Invoice Total ¥ 52,000.00

DESCRIPTION AMOUNT
Laptop 40,000.00
Headphone 2,000.00
Bluetooth Speaker 8,000.00
Screen Guard 2,000.00

TERMS & CONDITIONS

Payment is due within 15 days



### Get Bounding Box Estimates

In [4]:
# Print one line per recognised character together with its bounding-box
# coordinates. The context manager closes the image file once OCR is done.
with Image.open('invoice.jpg') as invoice_image:
    print(pytesseract.image_to_boxes(invoice_image))

A 158 2050 170 2074 0
B 170 2050 182 2074 0
C 190 2049 220 2074 0
V 240 2050 247 2074 0
e 247 2050 262 2074 0
n 266 2049 279 2068 0
t 284 2050 299 2068 0
u 302 2049 311 2073 0
r 319 2049 330 2068 0
e 337 2049 356 2068 0
s 364 2049 381 2068 0
B 160 1826 172 1850 0
I 184 1826 190 1850 0
L 194 1826 209 1850 0
L 214 1826 231 1850 0
T 242 1826 261 1850 0
O 263 1825 287 1850 0
I 982 1826 988 1850 0
N 995 1826 1011 1850 0
V 1014 1826 1030 1850 0
O 1030 1826 1036 1850 0
I 1047 1825 1060 1850 0
C 1066 1825 1083 1850 0
E 1097 1826 1109 1850 0
# 1126 1825 1145 1850 0
1 1453 1828 1461 1848 0
0 1467 1827 1474 1848 0
1 1484 1828 1491 1848 0
X 158 1773 168 1793 0
Y 168 1773 176 1793 0
Z 185 1773 206 1793 0
I 222 1773 225 1793 0
n 229 1773 239 1788 0
f 243 1773 252 1793 0
o 252 1772 265 1788 0
T 266 1773 280 1793 0
e 284 1772 293 1788 0
c 299 1772 306 1788 0
h 314 1773 326 1793 0
I 982 1768 988 1792 0
N 994 1768 1011 1792 0
V 1014 1768 1033 1792 0
O 1037 1767 1049 1792 0
I 1049 1767 1060 1792 0
C 1065

### Get verbose data like boxes, confidences, line and page numbers

In [6]:
# Print the verbose tab-separated report (level, page/block/paragraph/line/word
# numbers, box geometry, confidence and text). The context manager closes the
# image file once OCR is done.
with Image.open('invoice.jpg') as invoice_image:
    print(pytesseract.image_to_data(invoice_image))

level	page_num	block_num	par_num	line_num	word_num	left	top	width	height	conf	text
1	1	0	0	0	0	0	0	1653	2339	-1	
2	1	1	0	0	0	158	265	1337	387	-1	
3	1	1	1	0	0	158	265	223	25	-1	
4	1	1	1	1	0	158	265	223	25	-1	
5	1	1	1	1	1	158	265	71	25	95	ABC
5	1	1	1	1	2	240	265	141	25	96	Ventures
3	1	1	2	0	0	158	489	1337	117	-1	
4	1	1	2	1	0	160	489	1331	25	-1	
5	1	1	2	1	1	160	489	71	24	96	BILL
5	1	1	2	1	2	242	489	45	25	95	TO
5	1	1	2	1	3	982	489	134	25	93	INVOICE
5	1	1	2	1	4	1126	489	19	25	93	#
5	1	1	2	1	5	1453	491	38	21	96	101
4	1	1	2	2	0	158	546	1337	26	-1	
5	1	1	2	2	1	158	546	54	20	94	XYZ
5	1	1	2	2	2	222	546	104	21	94	InfoTech
5	1	1	2	2	3	982	547	134	25	95	INVOICE
5	1	1	2	2	4	1128	547	88	24	96	DATE
5	1	1	2	2	5	1358	549	137	21	96	28/06/2020
4	1	1	2	3	0	160	586	159	20	-1	
5	1	1	2	3	1	160	586	50	20	91	DLF
5	1	1	2	3	2	220	586	76	20	96	Phase
5	1	1	2	3	3	306	586	13	20	95	3
3	1	1	3	0	0	160	626	104	26	-1	
4	1	1	3	1	0	160	626	104	26	-1	
5	1	1	3	1	1	160	626	104	26	96	Gurgaon
2	1	2	0	0	0	163	774	1321	69	-1	
3	1	

### Get information about orientation and script detection

In [7]:
# Print Tesseract's orientation and script detection (OSD) report.
# The context manager closes the image file once detection is done.
with Image.open('invoice.jpg') as invoice_image:
    print(pytesseract.image_to_osd(invoice_image))

Page number: 0
Orientation in degrees: 0
Rotate: 0
Orientation confidence: 8.98
Script: Latin
Script confidence: 2.42



### Get a searchable PDF

In [8]:
# Ask Tesseract for a searchable PDF rendering of the invoice; the result is
# returned as bytes.
pdf = pytesseract.image_to_pdf_or_hocr('invoice.jpg', extension='pdf')

# The file is opened in binary mode because `pdf` is bytes, not text.
with open('image.pdf', 'w+b') as pdf_file:
    pdf_file.write(pdf)

### Get hOCR output

In [9]:
# Same entry point as the PDF export, but requesting hOCR markup instead.
# The result is a bytes object holding the XHTML document.
hocr = pytesseract.image_to_pdf_or_hocr(
    'image.jpg',
    extension='hocr',
)

# Leave the value as the cell's last expression so the notebook displays it.
hocr

b'<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n <head>\n  <title></title>\n<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\n  <meta name=\'ocr-system\' content=\'tesseract v4.0.0.20181030\' />\n  <meta name=\'ocr-capabilities\' content=\'ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf\'/>\n</head>\n<body>\n  <div class=\'ocr_page\' id=\'page_1\' title=\'image "C:\\Users\\amrul\\programming\\udemy_courses\\python_ocr\\pytesseract\\image.jpg"; bbox 0 0 620 1219; ppageno 0\'>\n   <div class=\'ocr_carea\' id=\'block_1_1\' title="bbox 41 90 591 220">\n    <p class=\'ocr_par\' id=\'par_1_1\' lang=\'eng\' title="bbox 41 90 591 220">\n     <span class=\'ocr_line\' id=\'line_1_1\' title="bbox 41 90 591 130; baseline 0.005 -6; x_size 20.888889; x_descenders 4.8888888; x_ascende

### How to add additional options

In [10]:
# Extra Tesseract command-line flags are forwarded through `config`:
#   --oem selects the OCR engine mode, --psm the page segmentation mode
#   (per the Tesseract CLI documentation, psm 11 is "sparse text").
custom_oem_psm_config = r'--oem 3 --psm 11'

# Run OCR with the custom flags, then print the recognised text.
sparse_text = pytesseract.image_to_string('invoice.jpg', config=custom_oem_psm_config)
print(sparse_text)

[

ABC Ventures

BILL TO

INVOICE #

101

XYZ InfoTech

INVOICE DATE

28/06/2020

DLF Phase 3

Gurgaon

Invoice Total

~ 52,000.00

DESCRIPTION

AMOUNT

Laptop

40,000.00

2,000.00

Headphone

Bluetooth Speaker

8,000.00

Screen Guard

2,000.00

TERMS & CONDITIONS

Payment is due within 15 days

[oT

