# Convert book pages to text

In [1]:
from itertools import cycle
from pathlib import Path

import matplotlib.patches as patches
import matplotlib.pyplot as plt
import pytesseract
import regex as re
import skimage
from ipywidgets import interact
from PIL import Image
from tqdm import tqdm
from skimage import (
    color,
    filters,
    io,
    measure,
    util,
)

In [2]:
# Root of the data tree, given relative to the notebook's working directory.
DATA_DIR = Path('..', 'data')
# Holds one sub-directory of page images per document (the OCR input).
IMAGE_DIR = DATA_DIR.joinpath('images')
# Receives one text file per document (the OCR output).
TEXT_DIR = DATA_DIR.joinpath('text')

This constant is used for determining which column a fragment of text belongs to. Columns in real documents are not always aligned around the actual center so we adjust it to handle those cases. The algorithm is:
- find the center of the page
- subtract the constant
- anything to the left of this line is in column 1 and everything else is in column 2

In [3]:
# Offset from the page's horizontal midpoint; used as the default `pad` of
# too_wide() (added to half the page width) and sort_regions() (subtracted from it).
# NOTE(review): presumably measured in pixels, like the bounding boxes - confirm.
CENTER_LINE_PAD = 50

## Get the image directories

In [4]:
# One entry per document: every sub-directory of IMAGE_DIR, in sorted order.
# Plain files sitting next to the document folders (e.g. OS metadata files) are
# skipped so they are not mistaken for documents and parsed into empty text files.
DOCS = sorted(path for path in IMAGE_DIR.glob('*') if path.is_dir())
DOCS

[PosixPath('../data/images/Barneby_1991_Sensitivae_Censitae'),
 PosixPath('../data/images/Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III'),
 PosixPath('../data/images/Barneby_and_Grimes_1996_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_I'),
 PosixPath('../data/images/Barneby_and_Grimes_1997_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_II'),
 PosixPath('../data/images/Ebinger_Seigler_Clarke_2000_Taxonomic_Revision_of_South_American_species_of_the_genus_Acacia_subgenus_Acacia_Fabaceae_Mimosoideae'),
 PosixPath('../data/images/flora_australia_11a_mimosaceae_acacia_1_2'),
 PosixPath('../data/images/flora_australia_11b_mimosaceae_acacia_2'),
 PosixPath('../data/images/flora_australia_12_mimosaceae_exacacia_caesalpiniaceae')]

We also need to determine which books use a one column layout vs a two column layout.

In [5]:
# Documents whose path mentions 'flora_australia' use a single-column layout;
# every other document is treated as two-column.
ONE_COLUMN = [doc_dir for doc_dir in DOCS if 'flora_australia' in str(doc_dir)]
ONE_COLUMN

[PosixPath('../data/images/flora_australia_11a_mimosaceae_acacia_1_2'),
 PosixPath('../data/images/flora_australia_11b_mimosaceae_acacia_2'),
 PosixPath('../data/images/flora_australia_12_mimosaceae_exacacia_caesalpiniaceae')]

## Segment pages into columns and other parts

Many pages have multiple columns which will confuse the OCR engines.

Having a clear border around the text is important for the OCR engines to work.

In [6]:
# Margin added around every region's bounding box so the OCR engine sees a
# clear border around the text.
BORDER = 6


def get_bbox(region):
    """Return the region's bounding box grown by ``BORDER`` on every side.

    Args:
        region: Object exposing ``bbox`` as ``(top, left, bottom, right)``.

    Returns:
        Tuple ``(top, left, bottom, right)`` of the padded box. ``top`` and
        ``left`` never go below 0.
    """
    top, left, bottom, right = region.bbox
    # Clamp the origin: a negative value would wrap around when used as a NumPy
    # slice start and would floor-divide to column -1 when sorting regions.
    top = max(top - BORDER, 0)
    left = max(left - BORDER, 0)
    # The far edges are not clamped because the image size is unknown here;
    # array slicing truncates an end index that runs past the image.
    return top, left, bottom + BORDER, right + BORDER

We don't want smaller bounding boxes that overlap with bigger bounding boxes.

In [7]:
def inside(smaller_region, bigger_region):
    """Tell whether any corner of the smaller box lies strictly within the bigger box.

    Both arguments expose ``bbox`` as ``(top, left, bottom, right)``. A corner
    sitting exactly on the bigger box's edge does not count as inside.

    Returns:
        True when at least one of the smaller box's four corners falls in the
        interior of the bigger box, otherwise False.
    """
    s_top, s_left, s_bottom, s_right = smaller_region.bbox
    b_top, b_left, b_bottom, b_right = bigger_region.bbox

    # Testing the four corners one by one is the same as asking: does either
    # horizontal edge fall between the bigger box's rows, and does either
    # vertical edge fall between its columns?
    row_hit = b_top < s_top < b_bottom or b_top < s_bottom < b_bottom
    col_hit = b_left < s_left < b_right or b_left < s_right < b_right

    return row_hit and col_hit

Get rid of headers and footers and other small stray marks.

In [8]:
def too_small(region, min_height=50, min_width=100):
    """Flag regions whose padded bounding box is under either minimum dimension.

    Args:
        region: Region whose padded box is obtained through ``get_bbox``.
        min_height: Smallest acceptable box height.
        min_width: Smallest acceptable box width.

    Returns:
        True when the box is narrower than ``min_width`` or shorter than
        ``min_height``.
    """
    top, left, bottom, right = get_bbox(region)
    tall_enough = (bottom - top) >= min_height
    wide_enough = (right - left) >= min_width
    return not (wide_enough and tall_enough)

Another method to get rid of headers, footers, and figure captions is to remove text that spans both columns, provided that the document is laid out with two columns of text per page.

In [9]:
def too_wide(region, image_width, pad=CENTER_LINE_PAD):
    """Flag regions wider than half the page plus a tolerance.

    Args:
        region: Region whose padded box is obtained through ``get_bbox``.
        image_width: Width of the full page image.
        pad: Extra width allowed beyond half the page before a region counts
            as spanning both columns.

    Returns:
        True when the padded box is wider than ``image_width // 2 + pad``.
    """
    _, left, _, right = get_bbox(region)
    widest_allowed = (image_width // 2) + pad
    return (right - left) > widest_allowed

Chop the processed page into regions of text.

In [10]:
def get_regions(labeled, is_two_columns):
    """Extract the text regions worth OCR-ing from a labeled page.

    Args:
        labeled: Labeled image passed to ``measure.regionprops``; its second
            axis is taken as the page width.
        is_two_columns: When True, regions that ``too_wide`` flags as spanning
            both columns are dropped as well.

    Returns:
        List of region objects, largest area first, with small regions and
        regions overlapping an already-kept larger region removed.
    """
    candidates = sorted(
        measure.regionprops(labeled), key=lambda r: r.area, reverse=True)
    candidates = [r for r in candidates if not too_small(r)]

    # Walk from largest to smallest so each region is only ever compared with
    # bigger regions that were already accepted.
    kept = []
    for region in candidates:
        if not any(inside(region, bigger) for bigger in kept):
            kept.append(region)

    if is_two_columns:
        page_width = labeled.shape[1]
        kept = [r for r in kept if not too_wide(r, page_width)]

    return kept

Sometimes you want to see how the processing pipeline is dealing with the images.

In [11]:
def show_image(image):
    """Display a single image with a gray colormap and no axes (debugging aid)."""
    figure, axes = plt.subplots(
        nrows=1,
        ncols=1,
        figsize=(10, 10),
        sharex=True,
        sharey=True,
    )

    axes.imshow(image, cmap='gray')
    axes.axis('off')

    figure.tight_layout()
    plt.show()

It is useful to see the regions for debugging.

In [12]:
# Outline colors, reused cyclically when there are more regions than colors.
COLORS = ['red', 'blue', 'green', 'cyan', 'magenta', 'yellow', 'brown', 'purple']


def show_regions(image, regions):
    """Draw the page with each region's padded bounding box outlined (debugging aid).

    A gray vertical guide is drawn at the horizontal midpoint of the page.

    Args:
        image: Page image; its first two axes are taken as height and width.
        regions: Regions whose padded boxes (via ``get_bbox``) are outlined.

    Returns:
        The ``regions`` argument, unchanged, so the call can sit inside a pipeline.
    """
    figure, axes = plt.subplots(
        nrows=1, ncols=1, figsize=(24, 20), sharex=True, sharey=True)
    axes.imshow(image)
    axes.axis('off')

    page_height, page_width = image.shape[0], image.shape[1]

    # Thin full-height rectangle marking the middle of the page.
    center_guide = patches.Rectangle(
        (page_width // 2, 0),
        1,
        page_height,
        fill=False,
        edgecolor="gray",
        linewidth=2,
    )
    axes.add_patch(center_guide)

    for region, edge_color in zip(regions, cycle(COLORS)):
        top, left, bottom, right = get_bbox(region)
        outline = patches.Rectangle(
            (left, top),
            right - left,
            bottom - top,
            fill=False,
            edgecolor=edge_color,
            linewidth=2,
        )
        axes.add_patch(outline)

    figure.tight_layout()
    plt.show()

    return regions

Sort regions of the page so that the text on the page will flow.

In [13]:
def region_key(region, threshold):
    """Build the sort key that puts regions in reading order.

    Args:
        region: Region whose padded box is obtained through ``get_bbox``.
        threshold: Horizontal position of the column divider.

    Returns:
        Tuple ``(column, top, left)`` where ``column`` is the floor division of
        the box's left edge by ``threshold``.
    """
    top, left, *_ = get_bbox(region)
    column = left // threshold
    return column, top, left


def sort_regions(image, regions, pad=CENTER_LINE_PAD):
    """Order regions column by column, then top to bottom within a column.

    Args:
        image: Page image; its second axis is taken as the page width.
        regions: Regions to order.
        pad: Amount by which the column divider is shifted left of the page's
            horizontal midpoint.

    Returns:
        New list of the regions sorted by ``region_key``.
    """
    divider = (image.shape[1] // 2) - pad

    def reading_order(region):
        return region_key(region, divider)

    return sorted(regions, key=reading_order)

OCR each region of text.

In [14]:
def regions_to_text(regions, gray):
    """OCR every region of a page and keep the ones that look like real text.

    Args:
        regions: Regions to crop, in the order their text should appear.
        gray: 2-D page image indexed as ``[row, column]``; values are scaled
            by 255 before OCR (assumes floats in the 0..1 range - confirm
            against the caller).

    Returns:
        List of OCR'd strings, one per region that produced more than
        ``MIN_WORDS`` whitespace-separated words.
    """
    texts = []

    for region in regions:
        top, left, bottom, right = get_bbox(region)
        # A negative slice start counts from the far edge of the array, which
        # would yield an empty or wrong crop for regions touching the page edge.
        top, left = max(top, 0), max(left, 0)

        cropped = gray[top:bottom, left:right] * 255.0
        cropped = Image.fromarray(cropped).convert('RGB')

        text = pytesseract.image_to_string(cropped)
        if len(text.split()) > MIN_WORDS:
            # Debugging hook: show_image(cropped)
            texts.append(text)

    return texts

This is the primary processing pipeline for converting an image of a page into text.

In [15]:
# Window size for a Sauvola local threshold; unused while pipeline() relies on
# the global Otsu threshold, kept so the alternative can be re-enabled.
WINDOW_SIZE = 11
# A region's OCR text is kept only when it has more words than this.
MIN_WORDS = 8
# Standard deviation of the Gaussian blur applied before thresholding.
SIGMA = 11
# Sauvola `k` parameter; unused for the same reason as WINDOW_SIZE.
K = 0.032


def pipeline(image, is_two_columns):
    """Convert one page image into a list of text fragments in reading order.

    Steps: convert to grayscale floats, blur, binarize with Otsu's threshold,
    label the dark connected areas, filter and sort the resulting regions, and
    OCR each one.

    Args:
        image: Page image as an array, either with a trailing color-channel
            axis or already single-channel.
        is_two_columns: Whether the page uses a two-column layout; forwarded to
            ``get_regions``.

    Returns:
        List of OCR'd strings as produced by ``regions_to_text``.
    """
    image = skimage.img_as_float(image)
    # Single-channel scans are already grayscale and have no channel axis for
    # rgb2gray to collapse.
    gray = color.rgb2gray(image) if image.ndim == 3 else image

    blurred = filters.gaussian(gray, sigma=SIGMA)

    # Only the Otsu threshold is computed: a Sauvola threshold evaluated here
    # would be overwritten before use, costing time without affecting the result.
    threshold = filters.threshold_otsu(blurred)
    binary = blurred > threshold

    # Invert so the dark areas become the foreground that gets labeled.
    inverted = util.invert(binary)
    labeled = measure.label(inverted)

    regions = get_regions(labeled, is_two_columns)
    regions = sort_regions(image, regions)

    texts = regions_to_text(regions, gray)

    # Debugging hook: show_regions(image, regions)

    return texts

In [16]:
def parse_doc(doc):
    """OCR every page of one document and write the text to a single file.

    Pages are the ``*.jpg`` files in ``IMAGE_DIR / doc.stem``, processed in
    sorted filename order. Output goes to ``TEXT_DIR / '<doc.stem>.txt'``,
    overwriting any existing file.

    Args:
        doc: Path of the document's image directory; only its ``stem`` is used
            to locate the pages and name the output file.
    """
    print(doc.stem)
    dir_ = IMAGE_DIR / doc.stem

    is_two_columns = doc not in ONE_COLUMN

    pages = sorted(dir_.glob('*.jpg'))

    # Make sure the output directory exists on a fresh checkout.
    TEXT_DIR.mkdir(parents=True, exist_ok=True)

    # OCR output routinely contains non-ASCII characters, so do not rely on the
    # platform's default encoding.
    with open(TEXT_DIR / f'{doc.stem}.txt', 'w', encoding='utf-8') as text_file:
        for page in tqdm(pages):
            image = io.imread(page)

            texts = pipeline(image, is_two_columns)
            text_file.writelines(texts)

In [17]:
# Run the OCR pipeline over every document; each parse_doc() call prints the
# document name and writes one text file under TEXT_DIR.
for doc in DOCS:
    parse_doc(doc)

Barneby_1991_Sensitivae_Censitae


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 775/775 [50:02<00:00,  3.87s/it]


Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 202/202 [11:35<00:00,  3.44s/it]


Barneby_and_Grimes_1996_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_I


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 252/252 [14:56<00:00,  3.56s/it]


Barneby_and_Grimes_1997_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_II


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 135/135 [08:19<00:00,  3.70s/it]


Ebinger_Seigler_Clarke_2000_Taxonomic_Revision_of_South_American_species_of_the_genus_Acacia_subgenus_Acacia_Fabaceae_Mimosoideae


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [01:12<00:00,  3.30s/it]


flora_australia_11a_mimosaceae_acacia_1_2


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 502/502 [23:29<00:00,  2.81s/it]


flora_australia_11b_mimosaceae_acacia_2


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 351/351 [18:47<00:00,  3.21s/it]


flora_australia_12_mimosaceae_exacacia_caesalpiniaceae


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 143/143 [07:37<00:00,  3.20s/it]
