# Clean text to improve parser results

In [1]:
import sys

sys.path.append('..')

In [2]:
from pathlib import Path

import regex as re
from traiter import util

In [3]:
# Directory layout, expressed relative to the current working directory.
DATA_DIR = Path('..', 'data')
TEXT_DIR = DATA_DIR.joinpath('text')

# Raw input text lives in one sub-directory, cleaned output in another.
RAW_DIR = TEXT_DIR.joinpath('raw')
CLEANED_DIR = TEXT_DIR.joinpath('cleaned')

In [4]:
# Default regex flags: verbose pattern syntax combined with case-insensitive matching.
FLAGS = re.VERBOSE | re.IGNORECASE

In [5]:
# Collect every .txt file directly inside RAW_DIR, in sorted path order.
RAW_FILES = list(RAW_DIR.glob('*.txt'))
RAW_FILES.sort()
RAW_FILES

[PosixPath('../data/text/raw/Barneby_1991_Sensitivae_Censitae.txt'),
 PosixPath('../data/text/raw/Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III.txt'),
 PosixPath('../data/text/raw/Barneby_and_Grimes_1996_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_I.txt'),
 PosixPath('../data/text/raw/Barneby_and_Grimes_1997_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_II.txt'),
 PosixPath('../data/text/raw/Ebinger_Seigler_Clarke_2000_Taxonomic_Revision_of_South_American_species_of_the_genus_Acacia_subgenus_Acacia_Fabaceae_Mimosoideae.txt'),
 PosixPath('../data/text/raw/flora_australia_11a_mimosaceae_acacia_1_2.txt'),
 PosixPath('../data/text/raw/flora_australia_11b_mimosaceae_acacia_2.txt'),
 PosixPath('../data/text/raw/flora_australia_12_mimosaceae_exacacia_caesalpiniaceae.txt')]

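Most of the heavy lifting for cleaning the text is done by the `traiter` utility function `util.clean_text`; the regular expressions below then tidy up line breaks and surrounding whitespace.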

In [14]:
# Fix application-specific mojibake: curly braces are mapped to parentheses.
MOJIBAKE = {
    '{': '(',
    '}': ')',
}
TRANS = str.maketrans(MOJIBAKE)

# Make sure the output directory exists; otherwise the first write below
# fails with FileNotFoundError.
CLEANED_DIR.mkdir(parents=True, exist_ok=True)

for raw_path in RAW_FILES:
    print(raw_path.stem)

    # Read with an explicit encoding so the result does not depend on the
    # platform's default locale.
    text = raw_path.read_text(encoding='utf-8')

    # NOTE(review): clean_text comes from traiter and its exact behavior is
    # not visible here; the steps below operate on whatever it returns.
    text = util.clean_text(text)

    # Replace a newline that is directly followed by a non-whitespace
    # character with a single space.
    text = re.sub(r'\n(\S)', r' \1', text, flags=FLAGS)
    # Trim whitespace at the start and at the end of every line.
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)
    # Join across remaining newlines unless the character on either side of
    # the break is '.', '?' or '"'.
    text = re.sub(r'([^.?"])\n+([^.?"])', r'\1 \2', text, flags=FLAGS)
    text = text.translate(TRANS)

    # The cleaned file keeps the raw file's name, under CLEANED_DIR.
    clean_path = CLEANED_DIR / raw_path.name
    clean_path.write_text(text, encoding='utf-8')

Barneby_1991_Sensitivae_Censitae
Barneby_1998_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_III
Barneby_and_Grimes_1996_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_I
Barneby_and_Grimes_1997_Silk_Tree_Guanacaste_Monkey_s_Earring_Part_II
Ebinger_Seigler_Clarke_2000_Taxonomic_Revision_of_South_American_species_of_the_genus_Acacia_subgenus_Acacia_Fabaceae_Mimosoideae
flora_australia_11a_mimosaceae_acacia_1_2
flora_australia_11b_mimosaceae_acacia_2
flora_australia_12_mimosaceae_exacacia_caesalpiniaceae
