In [1]:
import sys

# Put the parent directory on the import path.
# NOTE(review): presumably this is what lets `digi_leap` be imported from
# the notebook's folder -- confirm against the repository layout.
sys.path.append('..')

In [15]:
from pathlib import Path

import numpy as np
import pytesseract
from ipywidgets import interact
from PIL import Image, ImageDraw
from scipy import ndimage
from skimage import morphology as morph
from skimage import measure, exposure, util, io, color
from skimage import transform as xform
from scipy.ndimage import interpolation as inter

from digi_leap import label_image as li
from digi_leap.label_image import NEAR_HORIZ, NEAR_VERT

In [3]:
# Data folder, resolved relative to the current working directory.
DATA_DIR = Path('..', 'data')
# Folder of label images to work on; switch to the commented line below to
# use the handwritten set instead of the typewritten one.
LABELS_DIR = DATA_DIR.joinpath('labels', 'typewritten')
# LABELS_DIR = DATA_DIR.joinpath('labels', 'handwritten')

In [4]:
# Sorted paths of every `*.jpg` file directly inside LABELS_DIR (the glob
# pattern is not recursive). `get_label` searches this list by substring.
LABELS = sorted(LABELS_DIR.glob('*.jpg'))

In [5]:
def get_label(key):
    """Load the first label image whose path contains ``key``, as grayscale.

    Parameters
    ----------
    key : str
        Substring to look for in the string form of each path in ``LABELS``.
        An empty or ``None`` key matches nothing.

    Returns
    -------
    numpy.ndarray or None
        The grayscale image as a float array, or ``None`` when ``key`` is
        empty or no path in ``LABELS`` contains it. When several paths match,
        the first one in ``LABELS`` order is used.
    """
    # Validate the key before scanning: a None key would otherwise raise a
    # TypeError inside the substring search.
    if not key:
        return None

    matches = [path for path in LABELS if key in str(path)]
    if not matches:
        return None

    label = io.imread(matches[0])

    # Recent scikit-image versions make rgb2gray raise on images that are
    # already single-channel, so only convert when a channel axis is present.
    if label.ndim == 3:
        label = color.rgb2gray(label)
    else:
        # Match rgb2gray's floating-point output for grayscale files.
        label = util.img_as_float(label)

    return label

In [6]:
def test1(key):
    """Binarize a label, remove horizontal rules, and display the result.

    The label selected by ``key`` is upscaled 2x, binarized, cleaned of the
    near-horizontal lines found by ``li.find_lines``, and has small holes
    filled. The detected lines are then drawn in red over the displayed image
    so the detection can be checked by eye.

    Parameters
    ----------
    key : str
        Substring identifying the label image (see ``get_label``).

    Returns
    -------
    None
        The image is shown with ``display``; nothing is returned.
    """
    label = get_label(key)
    if label is None:
        # `interact` re-runs this on each change to the key, so partial keys
        # that match nothing must not raise.
        print(f'No label image matches key {key!r}')
        return

    label = xform.rescale(label, 2.0, anti_aliasing=True)

    label = li.binarize(label, window_size=9, k=0.02)

    # NOTE(review): the image is inverted both before and after the boolean
    # cast; kept as-is -- confirm whether both inversions are needed.
    label = util.invert(label)
    # Plain `bool` replaces the deprecated `np.bool8` alias (removed in NumPy 2).
    label = label.astype(bool)
    label = util.invert(label)

    h_lines = li.find_lines(label, NEAR_HORIZ, line_length=100, line_gap=5)
    # Called for its effect on `label`; the return value is unused here.
    # NOTE(review): presumably this edits `label` in place -- confirm.
    li.remove_horiz_lines(label, h_lines, line_width=6,
                          window=8, threshold=1.0)

    # remove_small_holes returns a new array by default; the original code
    # discarded it, which made this step a no-op.
    label = morph.remove_small_holes(label, area_threshold=16)

    image = li.to_pil(label)
    image = image.convert('RGB')
    draw = ImageDraw.Draw(image)

    # Overlay each detected line in red, using the same width as the removal.
    for line in h_lines:
        draw.line(line, width=6, fill=(255, 0, 0))

    display(image)


# Build a text-box widget for `key` (initial value '4128356') that re-runs
# test1 with the current contents of the box.
interact(test1, key='4128356')

interactive(children=(Text(value='4128356', description='key'), Output()), _dom_classes=('widget-interact',))

<function __main__.test1(key)>

In [40]:
def test2(key):
    """OCR a whole label, then split it into regions and OCR each one.

    Steps:
      1. OCR the original image and print the text.
      2. Upscale 2x, binarize, and remove near-horizontal lines.
      3. Merge nearby marks into blobs with closing and dilation, dropping
         small objects, then label the connected regions.
      4. For each region: crop it from the upscaled grayscale image, apply
         gamma correction, rotate by the skew angle estimated from the
         binarized crop, display it, and print its OCR text.

    Parameters
    ----------
    key : str
        Substring identifying the label image (see ``get_label``).

    Returns
    -------
    None
        Images are shown with ``display`` and text is printed.
    """
    label1 = get_label(key)
    if label1 is None:
        # `interact` re-runs this on each change to the key, so partial keys
        # that match nothing must not raise.
        print(f'No label image matches key {key!r}')
        return

    # Whole-image OCR, for comparison with the per-region results below.
    label1_text = li.ocr_text(label1)
    print(label1_text)

    # Alternative pre-processing that was tried:
    # label1 = exposure.adjust_gamma(label1, gamma=1.5)
    label1 = xform.rescale(label1, 2.0, anti_aliasing=True)
    # label1 = exposure.equalize_adapthist(label1)

    label2 = label1.copy()
    label2 = li.binarize(label2)

    h_lines = li.find_lines(label2, NEAR_HORIZ, line_length=100, line_gap=5)
    # Called for its effect on `label2`; the return value is unused here.
    # NOTE(review): presumably this edits `label2` in place -- confirm.
    li.remove_horiz_lines(label2, h_lines, line_width=6, window=8, threshold=2.0)

    # Fuse neighbouring marks into blobs and discard small objects.
    label3 = label2.copy()
    label3 = morph.binary_closing(label3, morph.disk(10))
    label3 = morph.remove_small_objects(label3, min_size=256)
    label3 = morph.binary_dilation(label3, morph.disk(8))
    label3 = morph.remove_small_objects(label3, min_size=16)
    # display(li.to_pil(label3))

    label4 = label3.copy()
    label4, count = measure.label(label4, return_num=True)
    print(count)
    props = measure.regionprops(label4)

    for prop in props:
        r1, c1, r2, c2 = prop.bbox
        # Crop from the grayscale image, not the binary mask, so OCR sees
        # the full tonal range.
        part = label1[r1:r2, c1:c2]
        # The skew is estimated on a binarized copy of the untouched crop.
        binary = li.binarize(part)
        part = exposure.adjust_gamma(part, gamma=2.0)
        # part = ndimage.gaussian_filter(part, sigma=1)
        angle = li.find_skew(binary)
        # `scipy.ndimage.interpolation` is a deprecated namespace; the same
        # function is available as `scipy.ndimage.rotate`.
        part = ndimage.rotate(part, angle, mode='nearest')
        display(li.to_pil(part))
        # The original had a bare `print`, which is a no-op expression in
        # Python 3; a blank separator line was intended.
        print()
        part_text = li.ocr_text(part)
        print(part_text)


# Build a text-box widget for `key` (initial value '4128356') that re-runs
# test2 with the current contents of the box.
interact(test2, key='4128356')

interactive(children=(Text(value='4128356', description='key'), Output()), _dom_classes=('widget-interact',))

<function __main__.test2(key)>