# Clean text to improve parser results

In [1]:
import sys

# Put the parent directory on the import path so the `mimosa` package imported
# below can be found when the notebook runs from its own folder. The guard
# keeps repeated runs of this cell from appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
from pathlib import Path

import regex as re
import traiter.util
from tqdm import tqdm

from mimosa.pylib.pipelines import sentence_pipeline as sp

In [3]:
# Working directories, all relative to the folder this notebook runs in
PDF_DIR = Path('..', 'data', 'pdf')  # source PDFs (globbed for *.pdf)
RAW_DIR = Path('..', 'data', 'text')  # raw text produced by pdftotext
TEXT_DIR = Path('..', 'mimosa', 'text')  # cleaned text, one sentence per line

Get all PDF files

In [4]:
# Regex flags: case-insensitive matching combined with verbose-mode patterns.
# NOTE(review): FLAGS is not referenced by any other cell shown in this
# notebook -- confirm whether it is still needed.
FLAGS = re.IGNORECASE | re.VERBOSE

In [5]:
# Lazily enumerate the *.pdf files directly inside PDF_DIR (non-recursive).
# The result is a one-shot iterator; the next cell turns it into a set.
PDFS = PDF_DIR.glob('*.pdf')

Skip PDFs that already have a matching raw text file, so they are not converted again

In [6]:
# Stems of the raw text files that already exist in RAW_DIR
already_text = {txt_path.stem for txt_path in RAW_DIR.glob('*.txt')}

# Keep only the PDFs whose stem has no raw text counterpart yet
PDFS = set(
    pdf_path
    for pdf_path in PDFS
    if pdf_path.stem not in already_text
)
PDFS

{PosixPath('../data/pdf/Barneby_1991_Sensitivae_Censitae.pdf'),
 PosixPath('../data/pdf/Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III.pdf'),
 PosixPath('../data/pdf/Barneby_and_Grimes_1996_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_I.pdf'),
 PosixPath('../data/pdf/Barneby_and_Grimes_1997_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_II.pdf'),
 PosixPath('../data/pdf/Ebinger_Seigler_Clarke_2000_Taxonomic_Revision_of_South_American_species_of_the_genus_Acacia_subgenus_Acacia_Fabaceae_Mimosoideae.pdf'),
 PosixPath('../data/pdf/flora_australia_11a_mimosaceae_acacia_1_2.pdf'),
 PosixPath('../data/pdf/flora_australia_11b_mimosaceae_acacia_2.pdf'),
 PosixPath('../data/pdf/flora_australia_12_mimosaceae_exacacia_caesalpiniaceae.pdf')}

In [7]:
for pdf in PDFS:
    raw = RAW_DIR / pdf.name
    raw = raw.with_suffix('.txt')

    !pdftotext $pdf $raw



In [8]:
# PDF-specific mojibake: curly braces in the extracted text are mapped back
# to the parentheses they stand for.
MOJIBAKE = dict(zip('{}', '()'))

# Translation table built from the mapping above, for use with str.translate
TRANS = str.maketrans(MOJIBAKE)

In [10]:
# Stems of the cleaned text files that already exist in TEXT_DIR
already_cleaned = {clean_path.stem for clean_path in TEXT_DIR.glob('*.txt')}

# Raw text files in RAW_DIR that have no cleaned counterpart yet
RAW_TEXT = set(
    raw_path_
    for raw_path_ in RAW_DIR.glob('*.txt')
    if raw_path_.stem not in already_cleaned
)
RAW_TEXT

{PosixPath('../data/text/Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III.txt'),
 PosixPath('../data/text/Barneby_and_Grimes_1996_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_I.txt'),
 PosixPath('../data/text/Barneby_and_Grimes_1997_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_II.txt'),
 PosixPath('../data/text/Ebinger_Seigler_Clarke_2000_Taxonomic_Revision_of_South_American_species_of_the_genus_Acacia_subgenus_Acacia_Fabaceae_Mimosoideae.txt'),
 PosixPath('../data/text/flora_australia_11a_mimosaceae_acacia_1_2.txt'),
 PosixPath('../data/text/flora_australia_11b_mimosaceae_acacia_2.txt'),
 PosixPath('../data/text/flora_australia_12_mimosaceae_exacacia_caesalpiniaceae.txt')}

Most of the heavy lifting for cleaning the text is done by the `traiter.util.clean_text` utility function; the cell below then splits the cleaned text into sentences, one per line.

In [11]:
def clean_text(raw_path, nlp=None):
    """Clean one raw text file and write it back out one sentence per line.

    The file is read, passed through ``traiter.util.clean_text`` together with
    the module-level ``TRANS`` translation table, split into sentences by the
    sentence pipeline, and written to ``TEXT_DIR`` under the same file name.

    Args:
        raw_path: Path of the raw text file to clean. Only its ``.name`` is
            used to build the output path, so a ``pathlib.Path`` is expected.
        nlp: Optional sentence pipeline to reuse across calls. When omitted,
            a new one is built with ``sp.pipeline()`` on every call. Note that
            ``max_length`` is set on whichever pipeline is used.

    Returns:
        None. The output is the file ``TEXT_DIR / raw_path.name``.
    """
    # Read with an explicit encoding instead of the platform default.
    # Assumes the raw files are UTF-8 (e.g. written by `pdftotext -enc UTF-8`)
    # -- confirm if the text comes from another extractor.
    with open(raw_path, encoding='utf-8') as raw_file:
        text = raw_file.read()

    text = traiter.util.clean_text(text, trans=TRANS)

    # Break into sentences
    if nlp is None:
        nlp = sp.pipeline()
    # Allow documents of up to 4,000,000 characters to be processed in one call
    nlp.max_length = 4_000_000

    doc = nlp(text)

    lines = [s.text + '\n' for s in doc.sents]

    # Write output, again with an explicit encoding so non-ASCII characters
    # survive regardless of the locale the notebook runs under
    clean_path = TEXT_DIR / raw_path.name
    with open(clean_path, 'w', encoding='utf-8') as clean_file:
        clean_file.writelines(lines)

In [12]:
# Walk the raw text files that still need cleaning, in sorted order.
# NOTE(review): the clean_text() call is commented out, so as written this
# cell only prints each file stem and writes nothing to TEXT_DIR. Re-enable
# the call to actually produce the cleaned files.
for raw_path in sorted(RAW_TEXT):
    print(raw_path.stem)
#     clean_text(raw_path)

Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III
Barneby_and_Grimes_1996_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_I
Barneby_and_Grimes_1997_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_II
Ebinger_Seigler_Clarke_2000_Taxonomic_Revision_of_South_American_species_of_the_genus_Acacia_subgenus_Acacia_Fabaceae_Mimosoideae
flora_australia_11a_mimosaceae_acacia_1_2
flora_australia_11b_mimosaceae_acacia_2
flora_australia_12_mimosaceae_exacacia_caesalpiniaceae
