In [1]:
import sys
sys.path.append('..')

In [2]:
import io
import re
from functools import partial
from pathlib import Path
from tempfile import NamedTemporaryFile

import cv2 as cv
import pytesseract
from ipywidgets import interact
from IPython.display import HTML
import numpy as np
from skimage import filters, morphology, restoration, util
import matplotlib.pyplot as plt
from PIL import Image as PImage, ImageDraw, ImageOps
from wand.image import Image
from wand.drawing import Drawing

In [3]:
# Data directory, resolved relative to the current working directory.
DATA_DIR = Path('..', 'data')
# Folder of label images to process; the commented line selects the
# handwritten set instead of the typewritten one.
LABELS_DIR = DATA_DIR.joinpath('labels', 'typewritten')
# LABELS_DIR = DATA_DIR.joinpath('labels', 'handwritten')

In [4]:
# Option string handed to pytesseract via its `config` argument:
# an English language flag followed by a character blacklist.
CONFIG = (
    '-l eng'
    ' '
    "-c tessedit_char_blacklist='€«¢»£®'"
)

In [5]:
# All '*.jpg' files directly inside LABELS_DIR, in sorted path order.
IMAGES = list(LABELS_DIR.glob('*.jpg'))
IMAGES.sort()

In [6]:
def raw_image(idx):
    """Show the unprocessed label image stored at ``IMAGES[idx]``.

    Prints the image path and the index, then displays the image's EXIF
    data followed by the image itself.

    Args:
        idx: Position of the image in the module-level ``IMAGES`` list.

    Raises:
        IndexError: If ``idx`` is outside the bounds of ``IMAGES``.
    """
    # The context manager closes the underlying file once the image has been
    # rendered, so repeated (e.g. interactive) calls do not leak handles.
    with PImage.open(IMAGES[idx]) as image:
        print(IMAGES[idx])
        print(idx)
        display(image.getexif())
        display(image)


# raw_image(46)
# raw_image(46)  # Lots of odd colors
# raw_image(288)  # Underlines
# raw_image(293)  # Rotated
# raw_image(2897)  # Border
# interact(raw_image, idx=(0, len(IMAGES) - 1))

In [7]:
# Side length of the local neighbourhood used for thresholding.
BLOCK_SIZE = 11
# Offset for the (currently commented-out) `threshold_local` alternative.
OFFSET = 11
# `k` parameter passed to the Sauvola threshold.
SAUVOLA_K = 0.032


def scikit_image(idx, angle):
    """Binarize a label image with scikit-image and OCR the result.

    The image at ``IMAGES[idx]`` is converted to grayscale, rotated by
    ``angle`` (white fill, canvas expanded), binarized with a Sauvola
    threshold, and passed to Tesseract. The recognized text is printed and
    the binarized image is displayed.

    Args:
        idx: Position of the image in the module-level ``IMAGES`` list.
        angle: Rotation passed to ``PIL.Image.rotate``.

    Raises:
        IndexError: If ``idx`` is outside the bounds of ``IMAGES``.
    """
    path = IMAGES[idx]

    with PImage.open(path) as orig:
        image = orig.convert('L')
        image = image.rotate(angle, fillcolor='white', expand=True)
        data = np.array(image)

    # threshold = filters.threshold_local(data, block_size=BLOCK_SIZE, offset=OFFSET)
    # The window size is the neighbourhood size, so it comes from BLOCK_SIZE
    # rather than from the unrelated OFFSET constant.
    threshold = filters.threshold_sauvola(
        data, window_size=BLOCK_SIZE, k=SAUVOLA_K)
    binary = data > threshold

    text = pytesseract.image_to_string(binary, config=CONFIG)
    # Collapse runs of blank lines in the OCR output into a single blank line.
    text = re.sub(r'\n\s*\n(\s*\n)+', '\n\n', text)

    print(text)
    display(PImage.fromarray(binary))


# scikit_image(46, 0)  # Lots of odd colors
# scikit_image(288, 0)  # Underlines
# scikit_image(293, 9)  # Rotated
# scikit_image(2897, 0)  # Border
# interact(scikit_image, idx=(0, len(IMAGES) - 1), angle=(-180, 180))

In [8]:
# Width and height of the adaptive-threshold neighbourhood.
radius = 11


def wand_image(idx, angle):
    """Clean up a label image with Wand (ImageMagick) and OCR the result.

    The image at ``IMAGES[idx]`` is converted to grayscale, rotated by
    ``angle`` on a white background, deskewed, adaptively thresholded,
    despeckled and sharpened. The processed image is then passed to
    Tesseract; the recognized text is printed and the image displayed.

    Args:
        idx: Position of the image in the module-level ``IMAGES`` list.
        angle: Rotation passed to ``wand.image.Image.rotate``.

    Raises:
        IndexError: If ``idx`` is outside the bounds of ``IMAGES``.
    """
    path = IMAGES[idx]
    with Image(filename=path) as image:
        image.transform_colorspace('gray')
        image.rotate(angle, background='white')

        # The deskew threshold is expressed in quantum units, like the
        # adaptive-threshold offset below, so the 40% value must be scaled
        # by the quantum range instead of being passed as a bare 0.4.
        image.deskew(0.4 * image.quantum_range)
        image.adaptive_threshold(
            width=radius, height=radius, offset=-0.04 * image.quantum_range)
        image.despeckle()
        image.sharpen()

        # image.evaluate(operator='thresholdblack', value=1, channel='gray')

        # Hand the pixels to PIL through a lossless PNG so that no JPEG
        # compression artifacts are introduced right before OCR.
        encoded = image.make_blob('png')
        with PImage.open(io.BytesIO(encoded)) as ocr_input:
            # Use the same Tesseract options as scikit_image so the two
            # pipelines can be compared on equal terms.
            text = pytesseract.image_to_string(ocr_input, config=CONFIG)
        # Collapse runs of blank lines in the OCR output into a single blank line.
        text = re.sub(r'\n\s*\n(\s*\n)+', '\n\n', text)

        print(text)
        display(image)


# wand_image(46, 0)  # Lots of odd colors
# wand_image(288, 0)  # Underlines
# wand_image(293, 90)  # Rotated
# wand_image(2897, 0)  # Border
# interact(wand_image, idx=(0, len(IMAGES) - 1), angle=(-180, 180))

### Adjust output font

In [9]:
%%javascript
// Set the font size of every element with the `output_label` class to 16pt.
// NOTE(review): relies on a global jQuery `$` -- presumably supplied by the
// notebook front end; confirm it is available where this notebook is run.
$('.output_label').css('font-size', '16pt')

<IPython.core.display.Javascript object>