# Clean text to improve parser results

In [1]:
import sys

sys.path.append('..')

In [2]:
from pathlib import Path

import regex as re
from tqdm import tqdm
from traiter import util

from mimosa.pylib.pipelines import sentence_pipeline as sp

In [3]:
# All locations are relative paths rooted one level above the working directory.
DATA_DIR = Path('..', 'data')
TEXT_DIR = DATA_DIR.joinpath('text')

# RAW_DIR holds the input *.txt files; CLEANED_DIR receives the cleaned output.
RAW_DIR = TEXT_DIR.joinpath('manual')
CLEANED_DIR = TEXT_DIR.joinpath('cleaned')

In [4]:
# Shared flags for the regex substitutions in the cleaning cell:
# case-insensitive matching plus verbose pattern syntax.
FLAGS = re.VERBOSE | re.IGNORECASE

In [5]:
# Gather every *.txt file in RAW_DIR and put the paths in sorted order
# so the files are always processed in the same sequence.
RAW_FILES = list(RAW_DIR.glob('*.txt'))
RAW_FILES.sort()
RAW_FILES

[PosixPath('../data/text/manual/Barneby_1991_Sensitivae_Censitae.txt')]

Most of the heavy lifting for cleaning the text is done by the `clean_text` utility function from the traiter package.

In [6]:
# Fix PDF specific mojibake: curly braces are translated to parentheses.
MOJIBAKE = {
    '{': '(',
    '}': ')',
}
TRANS = str.maketrans(MOJIBAKE)

# Build the sentence pipeline once and reuse it for every file; constructing
# it inside the loop repeats the same setup work for each input.
# NOTE(review): assumes sp.pipeline() returns a reusable pipeline object
# (its result is called on text and exposes max_length) -- confirm.
nlp = sp.pipeline()
nlp.max_length = 4_000_000  # raise the pipeline's limit so long texts are accepted

# Make sure the output directory exists before any file is written into it.
CLEANED_DIR.mkdir(parents=True, exist_ok=True)

for raw_path in RAW_FILES:
    print(raw_path.stem)

    # Explicit encoding so the result does not depend on the platform locale.
    with open(raw_path, encoding='utf-8') as raw_file:
        text = raw_file.read()

    text = util.clean_text(text)

    # The order of these substitutions matters; each works on the previous result.
    # 1. A newline directly followed by a non-space character becomes a space.
    text = re.sub(r'\n(\S)', r' \1', text, flags=FLAGS)
    # 2. Strip whitespace at the start and at the end of every line.
    text = re.sub(r'^\s+|\s+$', '', text, flags=re.MULTILINE)
    # 3. Join lines whose break is not adjacent to '.', '?' or '"'.
    text = re.sub(r'([^.?"])\n+([^.?"])', r'\1 \2', text, flags=FLAGS)
    text = text.translate(TRANS)

    print("parsting started")
    doc = nlp(text)
    print("parsting finished\n")

    # Write one detected sentence per output line.
    lines = [s.text + '\n' for s in doc.sents]

    clean_path = CLEANED_DIR / raw_path.name
    with open(clean_path, 'w', encoding='utf-8') as clean_file:
        clean_file.writelines(lines)

Barneby_1991_Sensitivae_Censitae
parsting started
parsting finished

