# OCR Labels

This is just a rough draft of how the chain of events may go for the OCR process. Most of these steps can be replaced with more advanced methods later.

## Setup

### Imports

In [1]:
import sys

sys.path.append('..')

In [2]:
import re
import string
import textwrap
from pathlib import Path
from dataclasses import dataclass, field

import enchant
import numpy as np
import pytesseract
from ipywidgets import interact
from PIL import Image, ImageDraw, ImageEnhance, ImageOps
from scipy import ndimage
from skimage import morphology as morph
from skimage import measure, exposure, util, io, color, filters
from skimage import transform as xform

from digi_leap import label_image as li

### File locations

In [3]:
# Root of the data tree, one level above the current working directory
DATA_DIR = Path('..', 'data')

# Directory holding the label images to OCR. Swap in the commented-out
# line below to work on the handwritten labels instead.
LABELS_DIR = DATA_DIR.joinpath('labels', 'typewritten')
# LABELS_DIR = DATA_DIR / 'labels' / 'handwritten'

### Constants for image processing

In [4]:
# Fewest spell-checked words a label must contain before its OCR is accepted
MIN_WORDS = 20

# window_size and k arguments given to filters.threshold_sauvola
WINDOW_SIZE, K = 11, 0.032

# min_size argument given to morphology.remove_small_objects
MIN_SIZE = 64

# Scale factor given to ImageOps.scale when enlarging a label
FACTOR = 2.0

### Setup for the spell checker

The spell checker is used as a proxy for scoring the quality of the OCR output. It will be either replaced or augmented later.

In [5]:
# Dictionary language and the project's extra word list for the spell checker
LANG = 'en_US'
EXTRA_VOCAB = DATA_DIR / 'custom_vocab.txt'

# Tokenizer: split on runs of whitespace and/or punctuation. The capture
# group makes re.split keep the separator runs in its result.
PUNCT = re.escape(string.punctuation)
SPLIT = re.compile(rf'([\s{PUNCT}]+)')

# Punctuation runs that score_ocr counts as found words
ALLOW = {'.)', '.]'}


# Labels narrower or shorter than this (in pixels) continue at the scaled size
MIN_DIM = 512

# Percent-of-words-found threshold for an acceptable OCR result
OK = 90.0
# Lower percent threshold; not referenced by the code visible in this notebook
BAD = 20.0

In [6]:
def spell_checker(lang, extra_vocab=''):
    """Build the enchant dictionary used to check OCR'd words.

    Args:
        lang: Language tag handed to enchant, e.g. 'en_US'.
        extra_vocab: Optional path to a personal word list file. When it is
            falsy only the stock dictionary for ``lang`` is used.

    Returns:
        An ``enchant.DictWithPWL`` when ``extra_vocab`` is given, otherwise
        a plain ``enchant.Dict``.
    """
    if not extra_vocab:
        return enchant.Dict(lang)
    return enchant.DictWithPWL(lang, str(extra_vocab))


VOCAB = spell_checker(LANG, EXTRA_VOCAB)

### Target images

In [7]:
# Every JPEG label image, sorted by path
IMAGES = sorted(LABELS_DIR.glob('*.jpg'))

# Index into IMAGES of a sample label of interest. Keep exactly one line
# active; an earlier active line would just be overwritten by a later one.
# IDX = 3240   # A target image with plenty of underlines
# IDX = 46    # Lots of odd colors
# IDX = 2880  # Underlines
IDX = 2939  # Rotated
# IDX = 28970  # Border

## A structure that holds an OCR score

In [8]:
@dataclass(order=True)
class OCRScore:
    """Spell-check score for one OCR attempt on a label image.

    ``order=True`` makes instances compare field by field in declaration
    order, so ``found`` is the first thing compared, then ``total``, etc.
    """

    found: int = 0  # number of words that passed the check
    total: int = 0  # number of words examined
    file: str = ''  # full path of the image, as a string
    stem: str = ''  # file name of the image without its suffix
    method: list[str] = field(default_factory=list)  # processing steps applied
    text: str = ''  # the OCR output text

    @property
    def percent(self):
        """Percentage of words found, rounded to two places (0.0 if no words)."""
        if self.total == 0:
            return 0.0
        return round(self.found / self.total * 100.0, 2)

    @property
    def score(self):
        """The (found, percent) pair that summarizes this result."""
        return (self.found, self.percent)

    @property
    def is_ok(self):
        """True when enough words were found and enough of them checked out."""
        return self.found >= MIN_WORDS and self.percent >= OK

    def then_i(self, action):
        """Record a processing step, skipping an immediate repeat of the last."""
        if self.method and self.method[-1] == action:
            return
        self.method.append(action)

    def __str__(self):
        report = f"""
        {self.score=}
        {self.found=}
        {self.percent=}
        {self.total=}
        {self.stem=}
        {self.method=}
        """
        return textwrap.dedent(report)

### Score OCR

In [9]:
def score_ocr(image):
    """OCR an image and count how many of its tokens pass the spell check.

    The OCR text has runs of three or more blank-ish lines collapsed to one
    blank line. It is then split on whitespace/punctuation runs; every
    non-empty piece (separator runs included) counts as a word. A word is
    "found" if the dictionary accepts it, it is a single character, or it is
    one of the allowed punctuation pairs.

    Args:
        image: The image handed to ``pytesseract.image_to_string``.

    Returns:
        An ``OCRScore`` with ``found``, ``total`` and ``text`` filled in.
    """
    raw = pytesseract.image_to_string(image, config=li.TESS_CONFIG)
    text = re.sub(r'(\n\s*){3,}', '\n\n', raw)

    tokens = []
    for piece in SPLIT.split(text):
        piece = piece.strip()
        if piece:
            tokens.append(piece)

    def accepted(token):
        return VOCAB.check(token) or len(token) == 1 or token in ALLOW

    hits = len([token for token in tokens if accepted(token)])

    return OCRScore(found=hits, total=len(tokens), text=text)

### Save a score record

In [10]:
def save_score(score, path, method):
    """Stamp a score with its source file and last step, then print a report.

    Args:
        score: The score record to finalize; it is modified in place.
        path: Path of the image the score belongs to.
        method: The final processing step, recorded via ``score.then_i``.

    Returns:
        The same ``score`` object that was passed in.
    """
    score.stem = path.stem
    score.file = str(path)
    score.then_i(method)

    rule = '-' * 80
    print(
        '',
        f'{score.score=}',
        f'{score.method=}',
        rule,
        score.text,
        rule,
        sep='\n',
    )
    return score

## Find label index

In [11]:
def get_label_index(key, images=None):
    """Return the index of the first image whose path contains ``key``.

    Args:
        key: Substring to look for in each image path.
        images: Sequence of paths to search. Defaults to the module-level
            ``IMAGES`` list.

    Returns:
        The index of the first matching path, or None when ``key`` is empty
        or no path contains it. Index 0 is a valid result, so callers should
        test the result with ``is None`` rather than by truthiness.
    """
    # An empty key would match every path, so bail out before searching.
    if not key:
        return None
    if images is None:
        images = IMAGES
    # Only the index is needed, so the image itself is never opened here.
    return next(
        (i for i, path in enumerate(images) if key in str(path)),
        None,
    )

## Process a label

In [12]:
def _score_candidate(candidate, best, action):
    """Score a candidate image; return its score only if it beats ``best``.

    A winning score inherits the steps already recorded on ``best`` before
    ``action`` is appended, so its ``method`` lists every step that led to
    the candidate image rather than just the last one.

    Returns:
        The candidate's ``OCRScore`` when it compares greater than ``best``,
        otherwise None.
    """
    score = score_ocr(candidate)
    print(score.score, action)
    if not score > best:
        return None
    score.method = list(best.method)
    score.then_i(action)
    return score


def _osd_angle(image):
    """Return the orientation angle reported by Tesseract's OSD, or 0.

    0 is returned when OSD raises a TesseractError or when its output has no
    'degrees:' entry, so an undetectable orientation skips the rotation step
    instead of aborting the whole pipeline.
    """
    try:
        osd = pytesseract.image_to_osd(image)
    except pytesseract.TesseractError:
        return 0
    found = re.search(r'degrees:\s*(\d+)', osd)
    return int(found.group(1)) if found else 0


def ocr_label(key):
    """Run the OCR improvement pipeline on the first label matching ``key``.

    Steps are tried in order: the unmodified label, scaling, rotating to the
    detected orientation, Sauvola thresholding, removing small objects, and a
    binary opening. A step's result is kept only when its score compares
    greater than the best so far, and the pipeline stops as soon as the best
    score ``is_ok``.

    Args:
        key: Substring of the label's path; an empty or unmatched key does
            nothing.

    Returns:
        The ``OCRScore`` returned by ``save_score`` (the first acceptable
        score, or the best one found tagged with 'fail'), or None when no
        label matches ``key``.
    """
    idx = get_label_index(key)
    # Index 0 is a real label, so only a missing match (None) stops here.
    if idx is None:
        return None
    path = IMAGES[idx]
    # NOTE(review): the image keeps its native mode; confirm the thresholding
    # step below behaves as intended for color (multi-channel) labels.
    image = Image.open(path)

    display(image)

    # ################################################################
    # Try the unmodified label

    action = 'did nothing'

    best = score_ocr(image)
    print(best.score, action)

    if best.is_ok:
        return save_score(best, path, action)

    # ################################################################
    # Try a bigger label

    action = f'scaled by: {FACTOR}'

    bigger = ImageOps.scale(image, FACTOR)

    # Small labels continue at the larger size even if scaling did not
    # improve the score, so the step is recorded on the current best too.
    if image.width < MIN_DIM or image.height < MIN_DIM:
        image = bigger
        best.then_i(action)

    score = _score_candidate(bigger, best, action)
    if score is not None:
        image = bigger
        best = score
        if best.is_ok:
            return save_score(best, path, action)

    # ################################################################
    # Try to orient the label

    angle = _osd_angle(image)

    if angle != 0:
        action = f'rotated by: {angle}'

        data = np.asarray(image).copy()
        rotated = li.to_pil(ndimage.rotate(data, angle, mode='nearest'))

        score = _score_candidate(rotated, best, action)
        if score is not None:
            image = rotated
            best = score
            if best.is_ok:
                return save_score(best, path, action)

    # ################################################################
    # Try converting the image to binary

    action = f'sauvola threshold window size = {WINDOW_SIZE} K = {K}'

    binary = np.asarray(image).copy()
    threshold = filters.threshold_sauvola(binary, window_size=WINDOW_SIZE, k=K)
    binary = binary < threshold  # True where a pixel is below its local threshold

    temp = li.to_pil(binary)
    score = _score_candidate(temp, best, action)
    if score is not None:
        image = temp
        best = score
        if best.is_ok:
            return save_score(best, path, action)

    # ################################################################
    # Try to remove small objects

    action = f'removed small objects min_size = {MIN_SIZE}'

    cleaned = morph.remove_small_objects(binary, min_size=MIN_SIZE)
    temp = li.to_pil(cleaned)
    score = _score_candidate(temp, best, action)
    if score is not None:
        image = temp
        binary = cleaned
        best = score
        if best.is_ok:
            return save_score(best, path, action)

    # ################################################################
    # Try opening holes

    action = 'binary opening'

    cleaned = morph.binary_opening(binary)
    # Score the opened image, not the array it was derived from.
    temp = li.to_pil(cleaned)
    score = _score_candidate(temp, best, action)
    if score is not None:
        # image/binary are kept current for the steps still to be added below.
        image = temp
        binary = cleaned
        best = score
        display(image)  # already a PIL image, so no further conversion
        if best.is_ok:
            return save_score(best, path, action)

    # ################################################################
    # Try to correct skew

    # ################################################################
    # Try to remove horizontal lines

    # ################################################################
    # Nothing worked

    print(best.score, 'fail')

    return save_score(best, path, 'fail')


interact(ocr_label, key='')

interactive(children=(Text(value='', description='key'), Output()), _dom_classes=('widget-interact',))

<function __main__.ocr_label(key)>