# Convert PDFs to images

Most OCR engines work on images and not PDFs.

In [1]:
import shutil
import subprocess
from pathlib import Path

import regex as re

In [2]:
# Root of the data tree, relative to the current working directory.
DATA_DIR = Path('..', 'data')

# Source PDFs, and the images produced from them.
PDF_DIR = DATA_DIR.joinpath('pdf')
IMAGE_DIR = DATA_DIR.joinpath('images')

## Rename PDFs to something sane

### Get old PDF names

In [3]:
# List the entries of PDF_DIR in pure Python instead of shelling out to `ls`.
# A missing directory now raises FileNotFoundError here, rather than the shell's
# error text being captured and handed to later cells as if it were a file name.
# Dot-files are skipped to match what a plain `ls` shows.
old_names = sorted(
    entry.name for entry in PDF_DIR.iterdir() if not entry.name.startswith('.')
)
old_names

['Barneby_1991_Sensitivae_Censitae.pdf',
 'Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III.pdf',
 'Barneby_and_Grimes_1996_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_I.pdf',
 'Barneby_and_Grimes_1997_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_II.pdf',
 'Ebinger_Seigler_Clarke_2000_Taxonomic_Revision_of_South_American_species_of_the_genus_Acacia_subgenus_Acacia_Fabaceae_Mimosoideae.pdf',
 'flora_australia_11a_mimosaceae_acacia_1_2.pdf',
 'flora_australia_11b_mimosaceae_acacia_2.pdf',
 'flora_australia_12_mimosaceae_exacacia_caesalpiniaceae.pdf']

### Rename ugly file names

It's easier, not required but much easier, to work with sane file names. Remove characters that cause problems for command-line utilities: spaces, commas, parentheses, etc. are each replaced with an underscore `_`.

In [4]:
# Sanitized file names (as Path objects), one per entry of old_names, in order.
DOCS = []

for old_name in old_names:
    path = Path(old_name)

    # Clean only the stem so the file suffix is left untouched.
    stem = path.stem
    stem = re.sub(r'[^\w.]', '_', stem)  # anything but word chars and dots -> '_'
    stem = re.sub(r'__+', '_', stem)     # collapse runs of underscores
    stem = re.sub(r'_+$', '', stem)      # drop trailing underscores

    new_name = path.with_stem(stem)
    DOCS.append(new_name)

    # Compare as strings: old_name is a str and new_name is a Path, so comparing
    # them directly is always unequal and every file would be "moved" each run.
    if new_name.name != old_name:
        old = PDF_DIR / old_name
        new = PDF_DIR / new_name
        # Two different ugly names can sanitize to the same clean name; refuse
        # to overwrite an existing file rather than silently losing one.
        if new.exists():
            raise FileExistsError(f'Cannot rename {old} to {new}: target already exists')
        shutil.move(old, new)

DOCS

[PosixPath('Barneby_1991_Sensitivae_Censitae.pdf'),
 PosixPath('Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III.pdf'),
 PosixPath('Barneby_and_Grimes_1996_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_I.pdf'),
 PosixPath('Barneby_and_Grimes_1997_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_II.pdf'),
 PosixPath('Ebinger_Seigler_Clarke_2000_Taxonomic_Revision_of_South_American_species_of_the_genus_Acacia_subgenus_Acacia_Fabaceae_Mimosoideae.pdf'),
 PosixPath('flora_australia_11a_mimosaceae_acacia_1_2.pdf'),
 PosixPath('flora_australia_11b_mimosaceae_acacia_2.pdf'),
 PosixPath('flora_australia_12_mimosaceae_exacacia_caesalpiniaceae.pdf')]

### Convert all PDF pages to an image

Poppler utils has a nice utility for converting PDF pages to images (`pdftocairo`), one image per page.

`sudo apt install poppler-utils`

In [5]:
def pdf2images(name):
    """Convert the pages of one PDF to JPEG images with Poppler's `pdftocairo`.

    The output directory is ``IMAGE_DIR/<stem>/`` and ``<stem>`` is also passed
    to `pdftocairo` as the output file prefix, where ``<stem>`` is the PDF's
    file name without its suffix.

    Args:
        name: File name of the PDF, relative to ``PDF_DIR``.

    Raises:
        FileNotFoundError: If the `pdftocairo` executable cannot be found.
        subprocess.CalledProcessError: If `pdftocairo` exits with a non-zero
            status.
    """
    print(name)

    src = PDF_DIR / name

    stem = Path(name).stem
    dir_ = IMAGE_DIR / stem
    dst = dir_ / stem

    # Pure-Python equivalent of `mkdir -p`.
    dir_.mkdir(parents=True, exist_ok=True)

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact, and check=True surfaces a failed conversion
    # instead of letting the notebook carry on without images.
    subprocess.run(['pdftocairo', '-jpeg', str(src), str(dst)], check=True)

In [6]:
# Batch conversion of every document is left disabled here; the next cell
# converts a single document instead.
# for name in DOCS:
#     pdf2images(name)

In [7]:
# Convert only the second entry of DOCS (index 1).
pdf2images(DOCS[1])

Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III.pdf


## Examine conversion results

Look at the results to see how well the conversion worked and what the format of the pages is.
- Did the conversion work well?
- Are we dealing with pages with one or two columns, or a mix?
- Are there pages that can be removed? I have moved them to a `backup` folder.