In [1]:
import sys
# Add the parent directory to the import path.
# NOTE(review): presumably this is what makes the `digi_leap` package importable
# from this notebook's directory -- confirm against the repository layout.
sys.path.append('..')

In [2]:
import sqlite3
import warnings
from collections import defaultdict
from pathlib import Path

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from PIL import Image, ImageDraw

from digi_leap.pylib import db
from digi_leap.pylib import label_transforms as lt
from digi_leap.pylib import ocr_results
from digi_leap.pylib import vocab

In [3]:
# Sanity check: number of entries in the vocabulary exposed as vocab.WORDS.
len(vocab.WORDS)

1679018

In [21]:
# Dataset locations, all relative to the parent directory. The commented-out
# lines are the alternate 'label-babel-2' dataset; DATA and DB must be
# switched together.
DATA = Path('..') / 'data' / 'sernec'
# DATA = Path('..') / 'data' / 'label-babel-2'
# NOTE(review): a later cell rebinds SHEETS to the result of
# db.select_sheets(DB), so this directory path is overwritten once that cell
# runs; which value SHEETS holds depends on cell execution order.
SHEETS = DATA / 'sheets'
# DB = DATA / 'label-babel-2.sqlite'
DB = DATA / 'sernec.sqlite'

# Directory where build_ocr() writes its image/text output when save is true.
TEMP = Path('..') / 'data' / 'temp'

In [5]:
# NOTE(review): this rebinds SHEETS, which the configuration cell defines as
# the sheets directory path (DATA / 'sheets'); after this cell it holds
# whatever db.select_sheets(DB) returns instead. Consider a distinct name.
SHEETS = db.select_sheets(DB)

In [6]:
# Index the OCR records by the label they belong to: label_id -> list of records.
OCR = defaultdict(list)
ocr = db.select_ocr(DB)
for o in ocr:
    label_key = o['label_id']
    OCR[label_key].append(o)
# OCR[7]

In [7]:
# Keep only labels that have OCR records and are classified as 'Typewritten',
# converted to plain dicts and ordered by label_id.
LABELS = sorted(
    (
        record
        for record in (
            dict(row) for row in db.select_labels(DB) if row['label_id'] in OCR
        )
        if record['class'] == 'Typewritten'
    ),
    key=lambda record: record['label_id'],
)

# Only the last expression of a cell is displayed, so the count is evaluated
# but not shown.
len(LABELS)
LABELS[7]

{'label_id': 35,
 'offset': 2,
 'class': 'Typewritten',
 'sheet_id': 6,
 'label_run': '',
 'label_left': 2138,
 'label_top': 4771,
 'label_right': 3928,
 'label_bottom': 5881,
 'path': 'data/sernec/sheets/895d87af-3e67-4efd-a9d9-1d80d99af765.jpg',
 'width': 4000,
 'height': 6000}

In [8]:
def get_label(idx):
    """Load the sheet image for ``LABELS[idx]`` and crop it to the label's box.

    Args:
        idx: Position of the label record in the module-level ``LABELS`` list.

    Returns:
        A ``(label, image)`` tuple: the label record and the PIL image cropped
        to the record's ``label_left``/``label_top``/``label_right``/
        ``label_bottom`` box.

    Raises:
        IndexError: If ``idx`` is outside ``LABELS``.
        OSError: If the sheet image cannot be opened (raised by PIL).
    """
    label = LABELS[idx]
    # The stored path is resolved against the parent directory.
    path = Path('..') / label['path']
    box = (label["label_left"], label["label_top"],
           label["label_right"], label["label_bottom"])

    with warnings.catch_warnings():  # Turn off EXIF warnings
        warnings.filterwarnings("ignore", category=UserWarning)
        # Open the sheet in a context manager so its file handle is released
        # as soon as the crop is taken; the cropped image does not need the
        # file afterwards (crop is eager in Pillow >= 3.4).
        with Image.open(path) as sheet:
            image = sheet.crop(box)

    return label, image

In [9]:
def transform_label(label, image):
    """Run the 'deskew' label transform on ``image`` and return it in RGB mode.

    ``label`` is accepted for call-site symmetry but is not used here.
    """
    return lt.transform_label('deskew', image).convert('RGB')

In [10]:
# Outline color used by display_boxes() for each (pipeline, engine) pair
# found on an OCR record.
COLORS = {
    ('deskew', 'tesseract'): 'red',
    ('deskew', 'easy'): 'blue',
    ('binarize', 'tesseract'): 'green',
    ('binarize', 'easy'): 'orange',
}


def display_boxes(ocr_boxes, image):
    """Outline every OCR box on ``image``, drawing directly onto it.

    Each record supplies its rectangle through the ``ocr_left``/``ocr_top``/
    ``ocr_right``/``ocr_bottom`` keys; the outline color is looked up in
    ``COLORS`` by the record's ``(pipeline, engine)`` pair. Returns nothing.
    """
    edges = ('ocr_left', 'ocr_top', 'ocr_right', 'ocr_bottom')
    pen = ImageDraw.Draw(image)

    for record in ocr_boxes:
        rect = [record[edge] for edge in edges]
        outline = COLORS[record['pipeline'], record['engine']]
        pen.rectangle(rect, outline=outline, width=2)

In [11]:
def trim_boxes(ocr_boxes, label):
    """Adjust the top/bottom of OCR records using a projection of each box.

    For every record in ``OCR[ocr_boxes['label_id']]`` the box is cropped out of
    ``label``, ``iu.profile_projection`` is computed on the crop, and the
    record's ``'top'``/``'bottom'`` entries are moved to the first/last index
    where the projection is positive. Appears intended to trim empty rows
    above and below the text -- TODO confirm.

    NOTE(review): this function does not run as written, and its only call
    site in this notebook is commented out:
      * ``iu`` and ``np`` are not imported in any cell shown here.
      * ``ocr_boxes`` is indexed with ``'label_id'`` (so it must be a mapping),
        while the commented-out caller passes the list of OCR boxes.
      * ``label`` must provide ``.crop(box)`` (presumably a PIL image --
        confirm), while the caller's ``label`` is the label dict.
      * It reads and writes ``o['top']``/``o['bottom']`` although the box is
        built from ``o['ocr_top']``/``o['ocr_bottom']``; likely a stale schema.
    """
    ocr = OCR[ocr_boxes['label_id']]

    for o in ocr:
        box = [o['ocr_left'], o['ocr_top'], o['ocr_right'], o['ocr_bottom']]
        box = label.crop(box)
        # Skip degenerate crops with zero width or height.
        if box.size[0] == 0 or box.size[1] == 0:
            continue
        proj = iu.profile_projection(box)
        # Indices where the projection is positive.
        above = np.where(proj > 0)
        if above and len(above[0]) > 0:
            # 'bottom' is set before 'top' is shifted so both offsets are
            # relative to the original top.
            o['bottom'] = o['top'] + above[0][-1]
            o['top'] += above[0][0]

In [24]:
def build_ocr(idx, save=False):
    """Build the consensus OCR text for one label, then print and display it.

    The label image is loaded with ``get_label`` and run through
    ``transform_label``. The label's OCR boxes are filtered and grouped into
    rows by ``ocr_results``; each row with more than one copy is passed, in
    order, through ``sort_copies``, ``align_copies``, ``consensus``,
    ``substitute``, ``spaces`` and ``spell_correct``. The resulting lines are
    joined with newlines.

    Args:
        idx: Position of the label in the module-level ``LABELS`` list.
        save: When true, also write ``<idx>.jpg`` and ``<idx>.txt`` into
            ``TEMP``. Defaults to False so ``build_ocr(idx)`` works on its own.

    Returns:
        None. The text is printed and the image is shown with ``display``.
    """
    label, image = get_label(idx)
    image = transform_label(label, image)

    # For debugging, display_boxes(ocr_boxes, image) can be called after this
    # line to overlay the boxes on the image.
    ocr_boxes = ocr_results.filter_boxes(OCR[label['label_id']])

    lines = []
    for row in ocr_results.get_lines(ocr_boxes):
        copies = ocr_results.get_copies(row)

        # A row with zero or one copy is skipped.
        if len(copies) <= 1:
            continue

        copies = ocr_results.sort_copies(copies)
        aligned = ocr_results.align_copies(copies)
        cons = ocr_results.consensus(aligned)

        ln = ocr_results.substitute(cons)
        ln = ocr_results.spaces(ln)
        ln = ocr_results.spell_correct(ln)
        lines.append(ln)

    text = '\n'.join(lines)
    print(text)
    display(image)

    if save:
        # Create the output directory on first use instead of failing.
        TEMP.mkdir(parents=True, exist_ok=True)
        image.save(TEMP / f'{idx}.jpg')
        # Explicit UTF-8 so non-ASCII OCR text is written the same way on
        # every platform rather than with the locale's default encoding.
        with open(TEMP / f'{idx}.txt', 'w', encoding='utf-8') as txt_file:
            txt_file.write(text)


# interact(build_ocr, idx=(0, len(LABELS) - 1), save=False);
# build_ocr(3)
# build_ocr(7625)