# Check printed text

Find the expected printed text in each certificate, replace near-miss (fuzzy) matches with the correct phrase, and save the corrected texts.

In [None]:
import os
import pandas as pd
import re
import regex
import sys
from spacy import displacy
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files, printed_text, utils

In [None]:
def render_text(text, entities):
    """Show `text` with `entities` highlighted via displaCy's manual entity mode.

    Each newline is replaced by a single space before rendering, so the
    character offsets in `entities` still line up with the displayed text.
    Entities labelled "fuzzy_match" are coloured yellow.

    `entities` is presumably a list of dicts with "start", "end" and "label"
    keys, as built by find_phrases_in_text -- confirm against displaCy's
    manual-mode input format.
    """
    single_line_text = re.sub("\\n", " ", text)
    document = {"text": single_line_text, "ents": entities}
    render_options = {"colors": {"fuzzy_match": "yellow"}}
    displacy.render(document, options=render_options, style="ent", manual=True)

## 1. Find missed printed text

In [None]:
# Directory passed to the Transkribus reader (presumably the page-level
# export of the 100-certificate three-column sample -- verify the layout).
data_dir = "../../data/Overlijden/x-samples/three-columns-100/page"

# `texts` is used by later cells as a mapping from text id to transcribed text;
# `metadata` and `textregions` are not used elsewhere in this notebook view.
texts, metadata, textregions = read_transkribus_files.read_files(data_dir)

In [None]:
def get_printed_text_year(text_id):
    """Return the key of `printed_text.PRINTED_TEXT` that applies to `text_id`.

    The first four characters of `text_id` are parsed as the certificate year.
    The result is the latest template year that is not later than the
    certificate year; if every template year is later, the earliest template
    year is returned as a fallback.

    Raises:
        ValueError: if the first four characters of `text_id` are not an integer.
        IndexError: if `printed_text.PRINTED_TEXT` is empty.
    """
    text_year = int(text_id[:4])
    template_years = sorted(printed_text.PRINTED_TEXT)
    # Start from the smallest year rather than the first-inserted key, so the
    # result does not depend on the order in which PRINTED_TEXT was declared.
    printed_text_year = template_years[0]
    for year in template_years:
        if year <= text_year:
            printed_text_year = year
    return printed_text_year

In [None]:
def find_phrases_in_text(text, phrases):
    """Locate each expected phrase in `text`, exactly or approximately.

    For every phrase:
      * exactly one hit from `utils.find_text_patterns` -> that hit is kept
        and its "label" is set to the phrase;
      * more than one hit -> the phrase is skipped;
      * no hit -> a fuzzy search is tried on the lower-cased text, allowing
        1, then 2, then 3 character errors, stopping at the first error
        budget that yields any match. The match is kept only when it is
        unique, labelled "fuzzy_match" and carrying the expected phrase in
        "correct_phrase".

    Returns a list of entity dicts in the order of `phrases`.

    NOTE(review): the phrase is inserted into the fuzzy pattern unescaped, so
    regex metacharacters in a phrase are interpreted as regex syntax.
    NOTE(review): fuzzy offsets refer to `text.lower()`; presumably the same
    length as `text` for this data -- confirm for unusual characters.
    """
    found = []
    for phrase in phrases:
        exact_hits = utils.find_text_patterns(phrase, text)
        if len(exact_hits) == 1:
            hit = exact_hits[0]
            hit["label"] = phrase
            found.append(hit)
            continue
        if len(exact_hits) != 0:
            # Ambiguous exact matches are ignored.
            continue

        fuzzy_hits = exact_hits
        for allowed_errors in range(1, 4):
            query = f"({phrase.lower()})"+"{"+f"e<={allowed_errors}"+"}"
            fuzzy_hits = list(regex.finditer(query, text.lower()))
            if len(fuzzy_hits) != 0:
                break

        if len(fuzzy_hits) == 1:
            match = fuzzy_hits[0]
            found.append({
                "start": match.start(),
                "end": match.end(),
                "label": "fuzzy_match",
                "correct_phrase": phrase,
            })
    return found

In [None]:
def correct_text(text_in, entities):
    """Return `text_in` with every fuzzy match replaced by its correct phrase.

    Only entities whose "label" is "fuzzy_match" are applied; each replaces
    the span text_in[start:end] with the entity's "correct_phrase". Other
    entities are ignored. `text_in` and `entities` are not modified.

    The "start"/"end" offsets all refer to the original `text_in`.
    Replacements are therefore applied from right to left: a replacement whose
    length differs from the span it replaces only shifts text to its right,
    so the offsets of the spans still to be processed remain valid.
    """
    fuzzy_entities = [
        entity for entity in entities if entity["label"] == "fuzzy_match"
    ]
    text_out = text_in
    for entity in sorted(fuzzy_entities, key=lambda item: item["start"], reverse=True):
        text_out = (
            text_out[:entity["start"]]
            + entity["correct_phrase"]
            + text_out[entity["end"]:]
        )
    return text_out

In [None]:
# Build the corrected version of every transcription, keyed by text id.
# For each text: pick the printed-text template for its year, locate the
# expected phrases, then replace the fuzzy matches with the correct phrase.
corrected_text = {}
for text_id in sorted(texts):
    original_text = texts[text_id]
    printed_text_year = get_printed_text_year(text_id)
    expected_phrases = printed_text.PRINTED_TEXT[printed_text_year]
    entities = find_phrases_in_text(original_text, expected_phrases)
    corrected_text[text_id] = correct_text(original_text, entities)

In [None]:
# One row per text id (the dict keys become the index), written to CSV.
corrected_frame = pd.DataFrame.from_dict(corrected_text, orient="index")
corrected_frame.to_csv("three_columns_100_corrected.csv")