# Check printed text

Find the expected printed text in a certificate

In [None]:
import copy
import os
import pandas as pd
import re
import regex
import sys
from spacy import displacy
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files, printed_text, utils

In [None]:
def render_text(text, entities):
    """ show text with displacy, highlighting the given entities

    Newlines in text are replaced by spaces before rendering. entities is
    handed to displacy unchanged as the "ents" list of a manually built
    document; entities labelled "fuzzy_match" are coloured yellow.
    """
    document = {"text": text.replace("\n", " "), "ents": entities}
    highlight_options = {"colors": {"fuzzy_match": "yellow"}}
    displacy.render(document, style="ent", manual=True, options=highlight_options)

## 1. Find missed printed text

In [None]:
# directory that is handed to read_transkribus_files.read_files (relative to this notebook)
data_dir = "../../data/Overlijden/x-samples/three-columns-100/page"

# read_files returns three values; texts is used below as a dict keyed by text id
texts, metadata, textregions = read_transkribus_files.read_files(data_dir)

In [None]:
def get_printed_text_year(text_id):
    """ finds appropriate index of text format printed_text.PRINTED_TEXT for a certificate

    The first four characters of text_id are read as the year of the
    certificate. The result is the latest key of printed_text.PRINTED_TEXT
    that is not later than that year. When every key is later than the
    certificate year, the earliest key is returned.
    """
    text_year = int(text_id[:4])
    # sort once, so that the result does not depend on the insertion order of the dict
    years = sorted(printed_text.PRINTED_TEXT.keys())
    printed_text_year = years[0]
    for year in years:
        if year > text_year:
            break
        printed_text_year = year
    return printed_text_year

In [None]:
def same_number_of_words(phrase, search_text, positions):
    """ check if the first match in positions covers as many words as phrase

    positions holds match objects with start() and end() methods; only the
    first one is inspected. Words are counted by splitting on whitespace.
    """
    first_match = positions[0]
    matched_words = search_text[first_match.start():first_match.end()].split()
    expected_words = phrase.split()
    return len(matched_words) == len(expected_words)

In [None]:
def find_match(text, phrase, start=0, end=None, level=0, max_diff=3):
    """ find the single occurrence of phrase in text[start:end]

    The search ignores case. First utils.find_text_patterns is tried; when it
    finds nothing, a fuzzy regex search is run with an increasing number of
    allowed character errors (0 up to and including max_diff).

    Returns a dict with positions relative to the complete text:
    - exact match: "start", "end" and "label" (the phrase)
    - fuzzy match: "start", "end", "label" ("fuzzy_match") and
      "correct_phrase" (the phrase); a fuzzy match is only accepted when it
      has as many words as the phrase
    An empty dict is returned when the phrase is too short (at most
    2 - level characters), when nothing was found or when the match is not
    unique.
    """
    match = {}
    # slicing with end=None runs to the end of the text, no special case needed
    search_text = text[start:end]
    if len(phrase) <= 2 - level:
        return match
    lowered_phrase = phrase.lower()
    lowered_text = search_text.lower()
    positions = utils.find_text_patterns(lowered_phrase, lowered_text)
    if len(positions) == 1:
        match = { "start": positions[0]["start"] + start,
                  "end": positions[0]["end"] + start,
                  "label": phrase }
    elif len(positions) == 0:
        # escape the phrase: characters like "." or "(" in a printed phrase are
        # literal text and must not be interpreted as regex syntax
        escaped_phrase = regex.escape(lowered_phrase)
        character_errors = 0
        while len(positions) == 0 and character_errors <= max_diff:
            query = f"({escaped_phrase})"+"{"+f"e<={character_errors}"+"}"
            positions = list(regex.finditer(query, lowered_text))
            character_errors += 1
        if len(positions) == 1 and same_number_of_words(phrase, search_text, positions):
            match = { "start": positions[0].start() + start,
                      "end": positions[0].end() + start,
                      "label": "fuzzy_match",
                      "correct_phrase": phrase }
    return match

In [None]:
def find_phrases_in_text(text, phrases):
    """ search every phrase in text with find_match; the result has one item per phrase, in the same order """
    return [find_match(text, phrase) for phrase in phrases]

In [None]:
def get_min_char_pos(entities, index):
    """ find the first character position that may be used for the entity at index

    Looks backwards from index for the nearest entity that has an "end" and
    returns the position one past that end. Returns 0 when no earlier entity
    has an "end".
    """
    # the range stops at -1 so that the entity at position 0 is inspected as well
    for counter in range(index - 1, -1, -1):
        if "end" in entities[counter]:
            return entities[counter]["end"] + 1
    return 0

In [None]:
def get_max_char_pos(entities, index):
    """ find the character position where the search for the entity at index has to stop

    Looks forward from index for the nearest entity that has a "start" and
    returns that start. Returns None when no later entity has a "start".
    """
    position = index + 1
    while position < len(entities):
        candidate = entities[position]
        if "start" in candidate:
            return candidate["start"]
        position += 1
    return None

In [None]:
def find_phrases_in_text_with_entities(text, phrases, entities, max_diff=3):
    """ search the phrases that have no match yet, between their matched neighbours

    entities holds one item per phrase; an empty item means the phrase was
    not found. For each empty item the search is repeated with find_match
    (level=1), limited to the text between the nearest earlier entity with
    an end and the nearest later entity with a start. The list entities is
    updated in place and returned.
    """
    for position, phrase in enumerate(phrases):
        if len(entities[position]) > 0:
            continue
        lower_bound = get_min_char_pos(entities, position)
        # upper_bound is None when no later match exists: search up to the end of the text
        upper_bound = get_max_char_pos(entities, position)
        entities[position] = find_match(text, phrase, start=lower_bound, end=upper_bound,
                                        level=1, max_diff=max_diff)
    return entities

In [None]:
def update_entities(entities, entity_replaced):
    """ adjust start and end point of entities after replacing a text

    entity_replaced describes a span that was replaced by its
    "correct_phrase". Every entity starting after the start of that span has
    its start shifted, and every entity ending at or after the end of that
    span has its end shifted, by the change in length. Entities without
    "start" or "end" are left alone. The list is updated in place and
    returned.
    """
    # take the boundaries before the loop: entity_replaced can be one of the
    # entities, and its own end is shifted while looping
    replaced_start = entity_replaced["start"]
    replaced_end = entity_replaced["end"]
    delta = len(entity_replaced["correct_phrase"]) - (replaced_end - replaced_start)
    for entity in entities:
        if "start" in entity and entity["start"] > replaced_start:
            entity["start"] += delta
        if "end" in entity and entity["end"] >= replaced_end:
            entity["end"] += delta
    return entities

In [None]:
def correct_text(text_in, entities):
    """ replace fuzzy matches in text by correct phrases

    The entities are processed from the last to the first. Each entity with
    label "fuzzy_match" has its span in the text replaced by its
    "correct_phrase"; when this changes the length of the text, the
    positions of the entities are adjusted with update_entities. Returns
    the corrected text and the entities.
    """
    text_out = text_in
    for entity in reversed(entities):
        if entity.get("label") != "fuzzy_match":
            continue
        begin, finish = entity["start"], entity["end"]
        replacement = entity["correct_phrase"]
        text_out = "".join([text_out[:begin], replacement, text_out[finish:]])
        if len(replacement) != finish - begin:
            entities = update_entities(entities, entity)
    return text_out, entities

In [None]:
# corrected_text: text id -> text with the fuzzy matches replaced by the correct phrases
# text_entities: text id -> entities found in the original (uncorrected) text
corrected_text = {}
text_entities = {}
for text_id in sorted(texts.keys()):
    printed_text_year = get_printed_text_year(text_id)
    # first pass: search each phrase in the complete text
    entities = find_phrases_in_text(texts[text_id], printed_text.PRINTED_TEXT[printed_text_year])
    # second and third pass: search the missing phrases between the phrases already found,
    # first with the default max_diff and then with max_diff=5
    entities = find_phrases_in_text_with_entities(texts[text_id], printed_text.PRINTED_TEXT[printed_text_year], entities)
    entities = find_phrases_in_text_with_entities(texts[text_id], printed_text.PRINTED_TEXT[printed_text_year], entities, max_diff=5)
    text_entities[text_id] = entities
    # correct_text changes the positions of the entities it receives, so it gets a copy:
    # the positions stored in text_entities keep referring to the original text
    corrected_text[text_id], entities = correct_text(texts[text_id], copy.deepcopy(entities))
#    render_text(corrected_text[text_id], [ entity for entity in entities if "label" in entity ])

In [None]:
# save the corrected texts in a csv file: one row per text id (orient="index")
pd.DataFrame.from_dict(corrected_text, orient="index").to_csv("three_columns_100_corrected.csv")

## 2. Correct XML text

In [None]:
# sanity check: print every fuzzy match of which the matched text and the
# correct phrase do not have the same number of words
for text_id in text_entities:
    # empty entities (phrases that were not found) have no "start" and are left out
    for entity in sorted([x for x in text_entities[text_id] if len(x) > 0 ], key=lambda x: x["start"]):
        # only fuzzy matches have a "correct_phrase"
        if "correct_phrase" in entity:
            guessed_phrase = texts[text_id][entity["start"]:entity["end"]]
            correct_phrase = entity["correct_phrase"]
            if not (len(guessed_phrase.split()) == 
                    len(correct_phrase.split())):
                print(text_id, guessed_phrase, "#", correct_phrase)

In [None]:
# NOTE(review): scratch cell, it only shows the documentation of the builtin sorted
help(sorted)

In [None]:
import xml.etree.ElementTree as ET

In [None]:
def get_text_from_xml(root):
    """ collect the text and the metadata of all TextLine elements below root

    For each TextLine the processed attributes are added to metadata with
    expand_metadata, together with the length of the text collected so far.
    Each non-empty Unicode text of the line is passed through
    remove_strikethroughs and added to the text, followed by a newline.
    Returns the text and the metadata.

    NOTE(review): expand_metadata, process_textline_attrib, make_custom_dict
    and remove_strikethroughs are not defined in the visible cells; they have
    to be in scope when this function is called.
    """
    metadata = {}
    pieces = []
    text_length = 0
    for textline in root.findall(".//{*}TextLine"):
        expand_metadata(metadata, process_textline_attrib(textline.attrib), text_length)
        custom_dict = make_custom_dict(textline.attrib)
        for unicode_element in textline.findall("./{*}TextEquiv/{*}Unicode"):
            if unicode_element.text is None:
                continue
            piece = remove_strikethroughs(unicode_element.text, custom_dict) + "\n"
            pieces.append(piece)
            text_length += len(piece)
    return "".join(pieces), metadata

In [None]:
def get_text_from_file(file_name):
    """ parse the XML file file_name and return its text, metadata and text regions

    The text and metadata come from get_text_from_xml, the text regions from
    get_textregions_from_xml; both receive the root element of the file.
    """
    root = ET.parse(file_name).getroot()
    text, metadata = get_text_from_xml(root)
    return text, metadata, get_textregions_from_xml(root)

In [None]:
# single XML file from the sample directory, inspected in the next cell
file_name = "../../data/Overlijden/x-samples/three-columns-100/page/O.R. 1831 Stad 027.xml"

In [None]:
# print the text of every Word element of the file, preceded by a running
# offset: the total length of the earlier words plus one per word
tree = ET.parse(file_name)
root = tree.getroot()
counter = 0
for textline in root.findall(".//{*}TextLine"):
    for unicode in textline.findall("./{*}Word/{*}TextEquiv/{*}Unicode"):
        # an empty Unicode element has text None: skip it, len(None) would raise a TypeError
        if unicode.text is None:
            continue
        print(counter, unicode.text)
        counter += 1 + len(unicode.text)