In [1]:
import sys

# Make the parent directory importable so the project package (digi_leap)
# resolves from this notebook.  Guarded so that re-running the cell does not
# keep appending duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
import difflib as dl
import regex as re
import statistics as stat
import string
import subprocess
import tempfile
import unicodedata
import warnings
from collections import Counter, defaultdict, namedtuple
from dataclasses import dataclass, field
from functools import reduce
from itertools import groupby, combinations
from pathlib import Path
from pprint import pp
from textblob import Word
from typing import Union
from types import SimpleNamespace

import numpy as np
import pandas as pd
from spellchecker import SpellChecker
from IPython.display import display
from ipywidgets import interact
from PIL import Image, ImageDraw
from textblob import Word

from digi_leap.pylib import (
    db,
    image_util as iu,
    label_transforms as lt,
    line_align_subs as subs,
    line_align_py as la,
    ocr_results,
    vocab,
)

In [3]:
# Locations of the SERNEC data, relative to the notebook's directory.
DATA = Path('..', 'data', 'sernec')
# NOTE(review): SHEETS is rebound to the list of sheet records by the next cell.
SHEETS = DATA.joinpath('sheets')
DB = DATA.joinpath('sernec.sqlite')

In [4]:
# Every sheet record from the database, converted to a plain dict.
SHEETS = list(map(dict, db.select_sheets(DB)))

In [5]:
# Every OCR record from the database, converted to a plain dict.
ocr = list(map(dict, db.select_ocr(DB)))

# Index the OCR records by the label they belong to: label_id -> [records].
OCR = defaultdict(list)
for o in ocr:
    OCR[o['label_id']].append(o)

In [6]:
# Keep only the labels that have OCR records and are classed as typewritten.
LABELS = [
    dict(row)
    for row in db.select_labels(DB)
    if row['label_id'] in OCR and row['class'] == 'Typewritten'
]

In [7]:
def get_label(idx):
    """Fetch a label record and the matching crop of its sheet image.

    Args:
        idx: Position of the label in LABELS.

    Returns:
        A ``(label, image)`` pair: the label record and the sheet image
        cropped to the label's left/top/right/bottom bounds.
    """
    record = LABELS[idx]
    sheet_path = Path('..') / record['path']
    bounds = (record["left"], record["top"], record["right"], record["bottom"])

    # UserWarnings raised while reading the sheet (EXIF warnings) are muted.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning)
        cropped = Image.open(sheet_path).crop(bounds)

    return record, cropped

In [8]:
def transform_label(label, image):
    """Apply the 'deskew' label transform and return the result as RGB.

    ``label`` is accepted for call-site symmetry but is not used here.
    """
    return lt.transform_label('deskew', image).convert('RGB')

In [9]:
# Outline color for each (pipeline, engine) combination of OCR boxes.
COLORS = {
    (pipeline, engine): color
    for pipeline, engine, color in (
        ('deskew', 'tesseract', 'red'),
        ('deskew', 'easy', 'blue'),
        ('binarize', 'tesseract', 'green'),
        ('binarize', 'easy', 'orange'),
    )
}


def display_boxes(ocr_boxes, image):
    """Outline each OCR box on ``image``, colored by its pipeline and engine.

    The image is drawn on in place; nothing is returned.
    """
    canvas = ImageDraw.Draw(image)

    for item in ocr_boxes:
        corners = [item[side] for side in ('left', 'top', 'right', 'bottom')]
        outline = COLORS[item['pipeline'], item['engine']]
        canvas.rectangle(corners, outline=outline, width=2)

In [10]:
def trim_boxes(ocr_boxes, label):
    """Shrink OCR boxes vertically to the rows where the projection is positive.

    The box dicts are modified in place: ``top`` and ``bottom`` are moved
    inward to the first and last rows whose profile projection is above zero.

    Args:
        ocr_boxes: Either a list/tuple of box dicts (as used by the rest of
            this notebook), or a single record with a ``'label_id'`` key, in
            which case all boxes stored in ``OCR`` for that label are trimmed
            (the original behavior).
        label: The label *image*; it must support ``crop(box)``.  Passing the
            label record dict will not work.

    Returns:
        None.
    """
    # A plain list of boxes used to fail on ocr_boxes['label_id'] with a
    # TypeError; accept it directly and keep the lookup for a single record.
    if isinstance(ocr_boxes, (list, tuple)):
        boxes = ocr_boxes
    else:
        boxes = OCR[ocr_boxes['label_id']]

    for o in boxes:
        box = label.crop([o['left'], o['top'], o['right'], o['bottom']])
        # Nothing to measure in a degenerate (zero width or height) crop.
        if box.size[0] == 0 or box.size[1] == 0:
            continue
        # Assumes the projection has one entry per pixel row of the crop,
        # counted from the crop's top edge -- TODO confirm against image_util.
        proj = iu.profile_projection(box)
        above = np.where(proj > 0)
        if above and len(above[0]) > 0:
            # Compute the new bottom first: it is relative to the old top.
            o['bottom'] = o['top'] + above[0][-1]
            o['top'] += above[0][0]

In [11]:
# Spell checker limited to an edit distance of 1, with the plant taxa
# vocabulary loaded into its word list.
SPELL = SpellChecker(distance=1)
_WORDS = vocab.get_word_set(vocab.VOCAB_DIR / "plant_taxa.txt")
SPELL.word_frequency.load_words([*_WORDS])


def misspellings(ln):
    """Print spelling candidates for the out-of-vocabulary words in a line.

    Each whitespace-separated word that ``vocab.in_any_vocab`` rejects is
    stripped of its leading and trailing non-word characters.  When more than
    two characters remain, the word, its parts, and the ``SPELL`` candidates
    for the stripped base are printed.  No correction is applied to the text.

    Args:
        ln: The line of text to check.

    Returns:
        The words of ``ln``, unchanged, joined by single spaces.
    """
    words = ln.split()

    for word in words:
        if vocab.in_any_vocab(word):
            continue

        pre, suff = '', ''
        if match := re.match(r'^\W+', word):
            pre = match.group(0)
        # search(), not match(): match() only tries at the start of the word,
        # so trailing punctuation was never detected.
        if match := re.search(r'\W+$', word):
            suff = match.group(0)
        base = word.removeprefix(pre).removesuffix(suff)

        if len(base) > 2:
            cand = SPELL.candidates(base)
            print(f'{word=} {base=} {pre=} {suff=}')
            # candidates() may return None when nothing is found in some
            # pyspellchecker versions; treat that as "no candidates".
            for c in cand or ():
                print(c)
            print()

    return ' '.join(words)

In [13]:
# 2**16.  Not read by build_ocr below, whose threshold parameter is commented out.
threshold = 1 << 16


def build_ocr(idx):  # , threshold=2**16):
    """Print the reconstructed OCR text for one label and show its image.

    The label at position ``idx`` is cropped and transformed, its OCR boxes
    are filtered and grouped into rows, and each row with more than one copy
    is run through ``ocr_results`` sort_copies, align_copies, consensus,
    substitute and spaces to produce one line of text.
    """
    label, image = get_label(idx)
    image = transform_label(label, image)

    boxes = OCR[label['label_id']]
    boxes = ocr_results.filter_boxes(boxes, image.size[1])
    # Optional debugging steps, currently disabled:
    # trim_boxes(boxes, label)
    # display_boxes(boxes, image)

    text = []
    for row in ocr_results.get_lines(boxes):
        copies = ocr_results.get_copies(row)

        # Rows with at most one copy are skipped.
        if len(copies) > 1:
            aligned = ocr_results.align_copies(ocr_results.sort_copies(copies))
            # print('\n'.join(aligned))
            # print()
            merged = ocr_results.consensus(aligned)
            cleaned = ocr_results.spaces(ocr_results.substitute(merged))
            # cleaned = misspellings(cleaned)
            text.append(cleaned)

    print('\n'.join(text))
    display(image)


# Browse the labels with a slider over every index in LABELS; the trailing
# semicolon suppresses the cell's return-value output.
interact(build_ocr, idx=(0, len(LABELS) - 1));
# Labels with known problems, kept as quick manual checks:
# build_ocr(0)  # Dropped char in Det.
# build_ocr(2)  # The date is wrong
# build_ocr(11112)  # wes t of Charleston -- fixed
# build_ocr(10647)  # Changed character in Baccharis

interactive(children=(IntSlider(value=5561, description='idx', max=11122), Output()), _dom_classes=('widget-in…