# Check printed text

Find the expected printed text in a certificate, check it for errors, and suggest corrections.

In [None]:
import copy
import os
import pandas as pd
import re
import regex
import sys
from spacy import displacy
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files, printed_text, utils

In [None]:
def render_text(text, entities):
    """ show the text with its entities highlighted; newlines are shown as spaces """
    single_line_text = re.sub("\\n", " ", text)
    document = { "text": single_line_text, "ents": entities }
    # entities labelled "fuzzy_match" get a yellow background
    display_options = { "colors": { "fuzzy_match": "yellow"} }
    displacy.render(document, style="ent", manual=True, options=display_options)

## 1. Find missed printed text

In [None]:
# directory with the sample files that are read below
data_dir = "../../data/Overlijden/x-samples/three-columns-100/page"

# read_files returns three values; texts is later indexed by text_id
# NOTE(review): the structure of metadata and textregions is not visible here
texts, metadata, textregions = read_transkribus_files.read_files(data_dir)

In [None]:
def get_printed_text_year(text_id):
    """ finds appropriate index of text format printed_text.PRINTED_TEXT for a certificate

    The first four characters of text_id are read as the year of the certificate.
    The result is the latest key of printed_text.PRINTED_TEXT that is not later
    than that year; when every key is later, the earliest key is returned.
    """
    text_year = int(text_id[:4])
    known_years = sorted(printed_text.PRINTED_TEXT.keys())
    # start from the earliest year instead of the first inserted key, so the
    # result does not depend on the order in which PRINTED_TEXT was written
    printed_text_year = known_years[0]
    for year in known_years:
        if text_year >= year:
            printed_text_year = year
    return printed_text_year

In [None]:
def same_number_of_words(phrase, search_text, positions):
    """ tell whether the text covered by the first match has as many words as the phrase """
    first_match = positions[0]
    matched_words = search_text[first_match.start():first_match.end()].split()
    expected_words = phrase.split()
    return len(expected_words) == len(matched_words)

In [None]:
# phrases that are not searched for in the complete text (start == 0, no end);
# they are only looked up inside a restricted window
SKIP_PHRASES = [ "des jaars een duizend acht honderd", "op dit eiland", "op den", ]

def find_match(text, phrase, start=0, end=None, level=0, max_diff=3):
    """ find a phrase in the text allowing for some (max_diff) non-matching characters

    Only text[start:end] is searched; the returned positions are offsets in text.
    An exact, case-insensitive match is tried first. It is accepted only when it
    is the single hit in the window and is not directly preceded or followed by
    a letter a-z. When there is no exact hit at all, a fuzzy search is run with
    0, 1, ... max_diff allowed character errors, stopping at the first error
    count that gives a hit. A fuzzy hit is accepted only when it is the single
    hit and covers the same number of words as the phrase.

    Phrases of at most (2 - level) characters are never searched.

    Returns an empty dict when no acceptable match is found. Otherwise a dict
    with "start", "end" and "label" ("match" or "fuzzy_match"); fuzzy matches
    also carry the expected phrase under "correct_phrase".
    """
    match = {}
    search_text = text[start: end]
    phrase_lower = phrase.lower()
    if phrase_lower in SKIP_PHRASES and start == 0 and end is None:
        return match
    if len(phrase) <= 2 - level:
        return match
    search_lower = search_text.lower()
    positions = utils.find_text_patterns(phrase_lower, search_lower)
    if len(positions) == 1:
        hit_start = positions[0]["start"]
        hit_end = positions[0]["end"]
        # the hit must not be glued to a letter on either side
        free_before = (hit_start == 0 or
                       not regex.search("[a-z]", search_text[hit_start-1].lower()))
        free_after = (hit_end == len(search_text) or
                      not regex.search("[a-z]", search_text[hit_end].lower()))
        if free_before and free_after:
            # the phrase is recorded on the position dict; the returned match
            # itself carries the fixed label "match"
            positions[0]["label"] = phrase
            match = { "start": hit_start + start,
                      "end": hit_end + start,
                      "label": "match" }
    elif len(positions) == 0:
        # escape the phrase so that characters such as "." or "(" are matched
        # literally instead of being read as regular expression syntax
        escaped_phrase = regex.escape(phrase_lower)
        character_errors = 0
        while len(positions) == 0 and character_errors <= max_diff:
            query = f"({escaped_phrase})"+"{"+f"e<={character_errors}"+"}"
            positions = list(regex.finditer(query, search_lower))
            character_errors += 1
        if len(positions) == 1 and same_number_of_words(phrase, search_text, positions):
            match = { "start": positions[0].start() + start,
                      "end": positions[0].end() + start,
                      "label": "fuzzy_match",
                      "correct_phrase": phrase }
            # a fuzzy hit may swallow a neighbouring space; leave it out
            if positions[0].group()[0] == " ":
                match["start"] += 1
            if positions[0].group()[-1] == " ":
                match["end"] -= 1
    return match

In [None]:
def find_phrases_in_text(text, phrases):
    """ search each phrase in the complete text; the result has one entry per phrase """
    return [ find_match(text, phrase) for phrase in phrases ]

In [None]:
def get_min_char_pos(entities, index):
    """ get the final position of the last preceding phrase with a match

    Walks back from index-1 down to and including 0 and returns the "end" of the
    first entity that has one, plus one. Returns 0 when no preceding entity
    has a match.
    """
    # the range must stop at -1 so that the entity at position 0 is inspected too
    for counter in range(index - 1, -1, -1):
        if "end" in entities[counter]:
            return entities[counter]["end"] + 1
    return 0

In [None]:
def get_max_char_pos(entities, index):
    """ give the start of the nearest matched phrase after index, or None if there is none """
    following_starts = ( entities[position]["start"]
                         for position in range(index + 1, len(entities))
                         if "start" in entities[position] )
    return next(following_starts, None)

In [None]:
def find_phrases_in_text_with_entities(text, phrases, entities, max_diff=3):
    """ retry the phrases without a match, searching only between their matched neighbours

    The entities list is updated in place and returned, so a match found for
    one phrase narrows the search window of the phrases that follow.
    """
    for position, phrase in enumerate(phrases):
        if len(entities[position]) != 0:
            continue
        window = { "start": get_min_char_pos(entities, position) }
        upper_bound = get_max_char_pos(entities, position)
        if upper_bound is not None:
            # without a following match the search runs to the end of the text
            window["end"] = upper_bound
        entities[position] = find_match(text, phrase, level=1, max_diff=max_diff, **window)
    return entities

In [None]:
def update_entities(entities, entity_replaced):
    """ adjust start and end point of entities after replacing a text

    entity_replaced needs the keys "start", "end" and "correct_phrase". Every
    entity that starts after the replaced one has its start shifted, and every
    entity that ends at or after the end of the replaced one (including the
    replaced entity itself) has its end shifted. The shift is the length
    difference between the correct phrase and the replaced span. The entities
    are changed in place and the list is returned.
    """
    # entity_replaced is normally one of the entities and is changed by the loop;
    # keep its original boundaries so all comparisons use the same reference
    replaced_start = entity_replaced["start"]
    replaced_end = entity_replaced["end"]
    delta = len(entity_replaced["correct_phrase"]) - (replaced_end - replaced_start)
    for entity in entities:
        if "start" in entity and entity["start"] > replaced_start:
            entity["start"] += delta
        if "end" in entity and entity["end"] >= replaced_end:
            entity["end"] += delta
    return entities

In [None]:
def correct_text(text_in, entities):
    """ build a copy of the text in which each fuzzy match is replaced by its correct phrase

    Entities are handled from the last to the first. Returns the new text
    together with the entities; their positions are adjusted when a
    replacement changes the length of the text.
    """
    text_out = text_in
    for entity in reversed(entities):
        if entity.get("label") != "fuzzy_match":
            continue
        begin, finish = entity["start"], entity["end"]
        replacement = entity["correct_phrase"]
        text_out = text_out[:begin] + replacement + text_out[finish:]
        if len(replacement) != finish - begin:
            entities = update_entities(entities, entity)
    return text_out, entities

In [None]:
# phrases that are always kept as a whole, also when they have no match yet
NO_SPLIT_PHRASES = [ "des jaars een duizend acht honderd",
                     "laatstelijk gewoond",
                     "niet te kunnen schrijven",
                     "op dit eiland" ]

def printed_text_split_in_words(printed_text_in, entities_in):
    """ split expected phrase in word before searching word-by-word

    A phrase is kept as it is, together with its entity, when it already has a
    match, consists of a single word or is listed in NO_SPLIT_PHRASES. Every
    other phrase is replaced by its separate words, each with an empty entity.

    Returns the new list of phrases and the new list of entities; both lists
    have the same length.
    """
    printed_text_out = []
    entities_out = []
    for i, phrase in enumerate(printed_text_in):
        words = phrase.split()
        # use the entities passed as argument, not a notebook-level variable
        if len(entities_in[i]) > 0 or len(words) == 1 or phrase.lower() in NO_SPLIT_PHRASES:
            printed_text_out.append(phrase)
            entities_out.append(entities_in[i])
        else:
            for word in words:
                printed_text_out.append(word)
                entities_out.append({})
    return printed_text_out, entities_out

In [None]:
def sanity_check_entities(entities, text_id):
    """ look for entities that share a start or an end position

    A repeated start position is only reported; a repeated end position is
    reported and then the program is stopped with sys.exit().
    """
    known_starts = set()
    known_ends = set()
    for entity in entities:
        if "start" not in entity:
            continue
        if entity["start"] in known_starts:
            utils.print_with_color(f"duplicate start: {entity['start']} for text_id {text_id}!!\n")
        if entity["end"] in known_ends:
            utils.print_with_color(f"duplicate end: {entity['end']} for text_id {text_id}!\n")
            sys.exit()
        known_starts.add(entity["start"])
        known_ends.add(entity["end"])

In [None]:
# corrected_text: text_id -> text with the fuzzy matches replaced
# text_entities: text_id -> entities with positions in the uncorrected text
corrected_text = {}
text_entities = {}
for text_id in sorted(texts.keys()):
    # NOTE(review): string comparison; presumably a filter to start at a given id - confirm
    if text_id > "0":
        printed_text_year = get_printed_text_year(text_id)
        printed_text_text = printed_text.PRINTED_TEXT[printed_text_year]
        # first pass: search every phrase in the complete text
        entities = find_phrases_in_text(texts[text_id], printed_text_text)
        # two more passes for unmatched phrases, limited to the space between matched neighbours
        entities = find_phrases_in_text_with_entities(texts[text_id], printed_text_text, entities)
        entities = find_phrases_in_text_with_entities(texts[text_id], printed_text_text, entities)
        # phrases that are still unmatched are split in words and searched again, twice
        printed_text_text, entities = printed_text_split_in_words(printed_text_text, entities)
        entities = find_phrases_in_text_with_entities(texts[text_id], printed_text_text, entities)
        entities = find_phrases_in_text_with_entities(texts[text_id], printed_text_text, entities)
        sanity_check_entities(entities, text_id)
        text_entities[text_id] = entities
        # correct_text gets a deep copy, so the stored entities keep their original positions
        corrected_text[text_id], entities = correct_text(texts[text_id], copy.deepcopy(entities))
        print(text_id)
        render_text(corrected_text[text_id], [ entity for entity in entities if "label" in entity ])
        # NOTE(review): this break ends the loop after the first processed text,
        # presumably a debugging leftover - confirm before processing all texts
        break

In [None]:
# save the corrected texts, one row per text_id, in a csv file in the current directory
pd.DataFrame.from_dict(corrected_text, orient="index").to_csv("three_columns_100_corrected.csv")

## 2. Correct XML text

In [None]:
import xml.etree.ElementTree as ET

In [None]:
def check_text_entities(text_entities):
    """ print every text correction that has a different number of words than the text it replaces """
    for text_id in text_entities:
        matched = [ entity for entity in text_entities[text_id] if len(entity) > 0 ]
        matched.sort(key=lambda entity: entity["start"])
        for entity in matched:
            if "correct_phrase" not in entity:
                continue
            # the positions refer to the uncorrected text of the certificate
            guessed_phrase = texts[text_id][entity["start"]:entity["end"]]
            correct_phrase = entity["correct_phrase"]
            if len(guessed_phrase.split()) != len(correct_phrase.split()):
                print(text_id, guessed_phrase, "#", correct_phrase)

In [None]:
def convert_text_entities(entities):
    """ make a lookup table from character offset to corrected word

    Only fuzzy matches are used. The first word of a correct phrase gets the
    start of the entity as offset; every next word follows at the length of
    the previous corrected word plus one.
    """
    corrected_tokens = {}
    for entity in entities:
        if entity.get("label") != "fuzzy_match":
            continue
        position = entity["start"]
        for word in entity["correct_phrase"].split():
            corrected_tokens[position] = word
            position += 1 + len(word)
    return corrected_tokens

In [None]:
# a single sample xml file from the page directory
# NOTE(review): file_name is assigned again in the loop at the end of this notebook;
# this value looks like a leftover for testing one file - confirm
file_name = "../../data/Overlijden/x-samples/three-columns-100/page/O.R. 1831 Stad 027.xml"

In [None]:
def write_file(file_name, tree):
    """ write an xml tree to file_name as bytes, with encoding name "utf8"

    The with block closes the file, so no separate close call is needed.
    """
    with open(file_name, "wb") as f:
        tree.write(f, encoding="utf8")

In [None]:
def cleanup_xml_file(file_name):
    """ rewrite an xml file in place: fix the xml declaration and drop the ns0 prefix

    In every line, encoding=<any char>utf8<any char> becomes
    encoding="UTF-8" standalone="yes", and all occurrences of "ns0:" and
    ":ns0" are removed. The file is read and written as UTF-8, and one extra
    newline is written at the end of the file.
    """
    cleaned_lines = []
    with open(file_name, "r", encoding="utf-8") as file_handle_in:
        for line in file_handle_in:
            line = re.sub("encoding=.utf8.", 'encoding="UTF-8" standalone="yes"', line)
            # literal texts, so plain string replacement is enough
            line = line.replace("ns0:", "").replace(":ns0", "")
            cleaned_lines.append(line)
    with open(file_name, "w", encoding="utf-8") as file_handle_out:
        # the final newline is kept so the output stays the same as before
        file_handle_out.write("".join(cleaned_lines) + "\n")
In [None]:
def correct_file(file_name, text_entities):
    """ apply the word corrections of one text to its xml file and save the result

    The corrections are taken from text_entities[text_id], where text_id is
    derived from file_name. The corrected tree is written to the path obtained
    by replacing "/page/" with "/corrected/" in file_name, and that file is
    cleaned up afterwards.
    NOTE(review): presumably the target directory has to exist already - confirm
    """
    text_id = read_transkribus_files.make_file_id(file_name)
    print(text_id)
    # lookup table: character offset -> corrected word
    corrected_tokens = convert_text_entities(text_entities[text_id])
    tree = ET.parse(file_name)
    root = tree.getroot()
    # running character offset of the current word; it is not reset per line
    # NOTE(review): assumes the offsets in text_entities were computed on a text in which
    # consecutive words are one character apart - confirm against read_transkribus_files
    counter = 0
    for textline in root.findall(".//{*}TextLine"):
        changed = False
        changed_line = ""
        for unicode in textline.findall("./{*}Word/{*}TextEquiv/{*}Unicode"):
            # offset of the next word: length of this word plus one separator character
            # NOTE(review): unicode.text is None for an empty element, which makes len() fail
            next_counter = counter + 1 + len(unicode.text)
            corrected_token = ""
            # replace the word only when the correction differs, ignoring case
            if counter in corrected_tokens and corrected_tokens[counter].lower() != unicode.text.lower():
                corrected_token = corrected_tokens[counter]
                utils.print_with_color(f"{counter} {unicode.text} {corrected_token}\n")
                unicode.text = corrected_token
                changed = True
            # rebuild the text of the line from its (possibly corrected) words
            if len(changed_line) == 0:
                changed_line = unicode.text
            else:
                changed_line += " " + unicode.text
            counter = next_counter
        # the text stored at line level is only overwritten when a word was changed
        for unicode in textline.findall("./{*}TextEquiv/{*}Unicode"):
            if changed:
                unicode.text = changed_line
    file_name_corrected = regex.sub("/page/", "/corrected/", file_name)
    write_file(file_name_corrected, tree)
    cleanup_xml_file(file_name_corrected)

In [None]:
# directory with the xml files that are to be corrected
base_dir = "../../data/Overlijden/x-samples/three-columns-100/page/"

# write a corrected copy of the xml file of every text that has entities
for text_id in text_entities:
    file_name = base_dir + read_transkribus_files.make_file_name(text_id, base_dir)
    correct_file(file_name, text_entities)