In [1]:
import sys

# Put the notebook's parent directory on the import path (presumably where the
# local digi_leap package lives -- confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# stacks up duplicate '..' entries in sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import csv
from io import StringIO
from pathlib import Path

import easyocr
import enchant
import pytesseract
from PIL import Image

from digi_leap.ocr import ocr_label
from digi_leap.ocr_score import score_easyocr

In [3]:
# Root of the data tree, relative to the directory the notebook runs from.
DATA_DIR = Path('..', 'data')

# Label images to OCR; swap the comment to switch to the handwritten set.
LABELS_DIR = DATA_DIR.joinpath('labels', 'typewritten')
# LABELS_DIR = DATA_DIR.joinpath('labels', 'handwritten')

In [4]:
# Every JPEG label in the chosen directory, in sorted path order so the
# index below picks the same file on each run.
IMAGES = sorted(LABELS_DIR.glob('*.jpg'))

# Position (in IMAGES) of the single label used for the OCR comparison below.
LABEL_INDEX = 1

# Fail with an actionable message instead of a bare "list index out of range"
# when the directory is missing, empty, or holds too few images.
if len(IMAGES) <= LABEL_INDEX:
    raise IndexError(
        f'Need at least {LABEL_INDEX + 1} *.jpg files in {LABELS_DIR}, '
        f'found {len(IMAGES)}'
    )

LABEL = IMAGES[LABEL_INDEX]

In [5]:
# One shared EasyOCR reader, configured for English only; reused by the
# readtext() call further down.
EASY_OCR = easyocr.Reader(lang_list=['en'])

In [6]:
# Open the selected label; the same PIL image is handed to both OCR engines.
image = Image.open(fp=LABEL)
# Uncomment the next line to display the label inline:
# image

In [7]:
# Run Tesseract on the label. The result is a tab-separated string with a
# header row (level, page_num, ..., conf, text), printed here for inspection.
data1 = pytesseract.image_to_data(image=image)
print(data1)

level	page_num	block_num	par_num	line_num	word_num	left	top	width	height	conf	text
1	1	0	0	0	0	0	0	1335	786	-1	
2	1	1	0	0	0	132	127	988	90	-1	
3	1	1	1	0	0	132	127	988	90	-1	
4	1	1	1	1	0	132	127	988	40	-1	
5	1	1	1	1	1	132	135	297	32	96	APPALACHIAN
5	1	1	1	1	2	449	132	134	31	95	STATE
5	1	1	1	1	3	604	129	250	34	95	UNIVERSITY
5	1	1	1	1	4	873	127	247	33	96	HERBARIUM
4	1	1	1	2	0	283	183	536	34	-1	
5	1	1	1	2	1	283	179	163	45	96	PLANTS
5	1	1	1	2	2	464	185	55	31	89	OF
5	1	1	1	2	3	589	183	230	30	93	Tennessee
2	1	2	0	0	0	248	261	591	54	-1	
3	1	2	1	0	0	248	261	591	54	-1	
4	1	2	1	1	0	248	261	591	54	-1	
5	1	2	1	1	1	248	270	176	33	85	plenium
5	1	2	1	1	2	459	261	302	54	88	ruta-muraria
5	1	2	1	1	3	794	263	45	32	59	die
2	1	3	0	0	0	196	281	83	37	-1	
3	1	3	1	0	0	196	281	83	37	-1	
4	1	3	1	1	0	196	281	83	37	-1	
5	1	3	1	1	1	196	281	83	37	95	 
2	1	4	0	0	0	132	418	999	266	-1	
3	1	4	1	0	0	197	418	879	42	-1	
4	1	4	1	1	0	197	418	879	42	-1	
5	1	4	1	1	1	197	426	229	32	90	Limestone
5	1	4	1	1	2	460	421	148	33	96	clif

In [8]:
# Parse Tesseract's TSV output (data1) into one dict per recognised word:
# the text, a confidence scaled to 0..1, and an inclusive pixel bounding box.
table1 = []
with StringIO(data1) as str_file:
    # The text column is written verbatim, without CSV quoting. With the
    # csv module's default quoting, a word containing a double quote would be
    # read as the start of a quoted field and swallow the rows that follow,
    # so quoting is disabled.
    reader = csv.DictReader(
        str_file, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        conf = float(row['conf'])
        # Negative confidence marks rows that are not recognised words (the
        # page/block/paragraph/line rows in the printed output have conf -1).
        if conf < 0:
            continue
        left = int(row['left'])
        top = int(row['top'])
        table1.append({
            'text': row['text'],
            # Tesseract reports 0..100; rescale to 0..1.
            'conf': conf / 100.0,
            'left': left,
            'top': top,
            # width/height are pixel counts, so the last covered pixel is
            # at offset size - 1 from the origin.
            'right': left + int(row['width']) - 1,
            'bottom': top + int(row['height']) - 1,
        })
table1

[{'text': 'APPALACHIAN',
  'conf': 0.96,
  'left': 132,
  'top': 135,
  'right': 428,
  'bottom': 166},
 {'text': 'STATE',
  'conf': 0.95,
  'left': 449,
  'top': 132,
  'right': 582,
  'bottom': 162},
 {'text': 'UNIVERSITY',
  'conf': 0.95,
  'left': 604,
  'top': 129,
  'right': 853,
  'bottom': 162},
 {'text': 'HERBARIUM',
  'conf': 0.96,
  'left': 873,
  'top': 127,
  'right': 1119,
  'bottom': 159},
 {'text': 'PLANTS',
  'conf': 0.96,
  'left': 283,
  'top': 179,
  'right': 445,
  'bottom': 223},
 {'text': 'OF',
  'conf': 0.89,
  'left': 464,
  'top': 185,
  'right': 518,
  'bottom': 215},
 {'text': 'Tennessee',
  'conf': 0.93,
  'left': 589,
  'top': 183,
  'right': 818,
  'bottom': 212},
 {'text': 'plenium',
  'conf': 0.85,
  'left': 248,
  'top': 270,
  'right': 423,
  'bottom': 302},
 {'text': 'ruta-muraria',
  'conf': 0.88,
  'left': 459,
  'top': 261,
  'right': 760,
  'bottom': 314},
 {'text': 'die',
  'conf': 0.59,
  'left': 794,
  'top': 263,
  'right': 838,
  'bottom': 2

In [9]:
# Run EasyOCR on the same label. Each result is a
# (four corner points, text, confidence) tuple, as the output below shows.
data2 = EASY_OCR.readtext(image=image)
data2

[([[120, 118], [1128, 118], [1128, 174], [120, 174]],
  'APPALACHIAV STATE UNIVERSITY HERBARIUM',
  0.8233845794609377),
 ([[275, 179], [521, 179], [521, 223], [275, 223]],
  'PLAVTS OF',
  0.6838808285523854),
 ([[584, 178], [827, 178], [827, 223], [584, 223]],
  'Tennessee',
  0.9627254164393629),
 ([[182, 262], [432, 262], [432, 318], [182, 318]],
  'Asplenium',
  0.5222644995959352),
 ([[450, 258], [770, 258], [770, 312], [450, 312]],
  'ruta_muraria',
  0.9767573634667366),
 ([[787, 259], [823, 259], [823, 299], [787, 299]], 'L', 0.41991813890533436),
 ([[187, 418], [435, 418], [435, 470], [187, 470]],
  'Limestone',
  0.9995719531047783),
 ([[450, 411], [746, 411], [746, 466], [450, 466]],
  'cliffs just',
  0.9949124980652062),
 ([[765, 413], [903, 413], [903, 455], [765, 455]],
  'above',
  0.999736048101681),
 ([[926, 422], [980, 422], [980, 454], [926, 454]], 'Wa', 0.8444103066260022),
 ([[973, 410], [1086, 410], [1086, 463], [973, 463]],
  'ter ,',
  0.820725203986585),
 ([[

In [10]:
# Convert the EasyOCR results (data2) into the same dict layout as table1:
# text, confidence, and a left/top/right/bottom bounding box.
table2 = []
for corners, text, conf in data2:
    # corners holds four [x, y] points. Taking min/max over all of them,
    # rather than reading fixed corner positions, yields the enclosing
    # rectangle even when the detected box is skewed or its corner order
    # differs; for axis-aligned boxes the values are unchanged.
    xs = [point[0] for point in corners]
    ys = [point[1] for point in corners]
    table2.append({
        'text': text,
        'conf': conf,
        'left': min(xs),
        'top': min(ys),
        'right': max(xs),
        'bottom': max(ys),
    })
table2

[{'text': 'APPALACHIAV STATE UNIVERSITY HERBARIUM',
  'conf': 0.8233845794609377,
  'left': 120,
  'top': 118,
  'right': 1128,
  'bottom': 174},
 {'text': 'PLAVTS OF',
  'conf': 0.6838808285523854,
  'left': 275,
  'top': 179,
  'right': 521,
  'bottom': 223},
 {'text': 'Tennessee',
  'conf': 0.9627254164393629,
  'left': 584,
  'top': 178,
  'right': 827,
  'bottom': 223},
 {'text': 'Asplenium',
  'conf': 0.5222644995959352,
  'left': 182,
  'top': 262,
  'right': 432,
  'bottom': 318},
 {'text': 'ruta_muraria',
  'conf': 0.9767573634667366,
  'left': 450,
  'top': 258,
  'right': 770,
  'bottom': 312},
 {'text': 'L',
  'conf': 0.41991813890533436,
  'left': 787,
  'top': 259,
  'right': 823,
  'bottom': 299},
 {'text': 'Limestone',
  'conf': 0.9995719531047783,
  'left': 187,
  'top': 418,
  'right': 435,
  'bottom': 470},
 {'text': 'cliffs just',
  'conf': 0.9949124980652062,
  'left': 450,
  'top': 411,
  'right': 746,
  'bottom': 466},
 {'text': 'above',
  'conf': 0.9997360481016