# Evaluate Hand-written Text Recognition

In [None]:
from Levenshtein import distance
import itertools
import os
import matplotlib.pyplot as plt
import re
import sys
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files, printed_text

In [None]:
def print_with_color(string, color_code=1):
    """Print ``string`` between ANSI escape sequences, without a newline.

    The output is ``ESC[3<color_code>m``, then the text, then the reset
    sequence ``ESC[m``. No line break is added.
    """
    start_sequence = "\x1b[3{}m".format(color_code)
    reset_sequence = "\x1b[m"
    print("{}{}{}".format(start_sequence, string, reset_sequence), end="")

## 1. Estimate HTR accuracy by examining printed text

In [None]:
# Directory that is handed to read_transkribus_files.read_files below
# (presumably Transkribus page exports of the sample -- TODO confirm).
data_dir = "../../data/Overlijden/x-samples/three-columns-100/page"
#data_dir = "../../data/Overlijden/x-samples/Training_set_V2/page"

# `texts` is used throughout this notebook as a mapping from text id
# (e.g. "1867-2e-035") to the recognised text of that document.
texts, metadata, textregions = read_transkribus_files.read_files(data_dir)

In [None]:
def cleanup_text(text):
    return " ".join(text.strip().split())

In [None]:
# Number of prompts make_prompt_texts prints before it emits a blank separator.
BATCHSIZE = 1

# Prompt asking for the name of the deceased person.
PROMPT_DECEASED_NAME = """Here is a Dutch death certificate. 
Can you extract the name of the deceased person from this document? 
Only give the deceased name as result. 
Other information from the document is not necessary.
""" 

# Prompt asking for the death date in zero-padded numeric form.
PROMPT_DEATH_DATE = """Here is a Dutch death certificate. 
Can you extract the death date from this document? 
Please present the result in numeric form, with a zero prepending day and month numbers smaller than 10, so for example like: 01-01-1900.
Only give the death date as result. 
Other information from the document is not necessary.
""" 

# Prompt asking for the name of the mother of the deceased person.
PROMPT_MOTHER_NAME = """Here is a Dutch death certificate. 
Can you extract the name of the mother of the deceased person from this document? 
Only give the mother's name as result. 
Other information from the document is not necessary.
""" 

def make_prompt_texts(texts, prompt=None, batch_size=None):
    """Print a prompt followed by the document text for every text.

    For each text id (in sorted order) the id is printed on one line and the
    prompt plus the text, with line breaks replaced by spaces, on the next.
    After every ``batch_size`` texts a blank separator is printed.

    Args:
        texts: mapping from text id to the text of the document.
        prompt: text placed in front of every document. Defaults to
            ``PROMPT_MOTHER_NAME``.
        batch_size: number of documents per batch. Defaults to ``BATCHSIZE``.
    """
    # Resolve the defaults lazily so callers that pass explicit values do not
    # depend on the notebook-level constants.
    if prompt is None:
        prompt = PROMPT_MOTHER_NAME
    if batch_size is None:
        batch_size = BATCHSIZE
    counter = 0
    for text_id in sorted(texts):
        print(text_id)
        print(prompt + texts[text_id].replace("\n", " "))
        counter += 1
        if counter >= batch_size:
            counter = 0
            print("\n")
            
#make_prompt_texts(texts)

In [None]:
def compare_strings(string1, text):
    """Tell whether ``string1`` occurs in ``text`` after normalisation.

    Both arguments are lower-cased, "ç" is replaced by "c" and every
    character other than a-z and 0-9 is removed before the substring test,
    so case, spacing and punctuation are ignored.

    Args:
        string1: the string to look for.
        text: the text to search in.

    Returns:
        True when the normalised ``string1`` is a substring of the
        normalised ``text``.
    """
    def normalize(value):
        # Same normalisation for needle and haystack.
        return re.sub("[^a-z0-9]", "", value.lower().replace("ç", "c"))

    return normalize(string1) in normalize(text)

In [None]:
def get_template_id(PRINTED_TEXT, year):
    """Return the largest positive key of ``PRINTED_TEXT`` not above ``year``.

    ``year`` is converted with ``int``. When no key lies in the range
    ``0 < key <= int(year)`` the result is 0.
    """
    return max(
        (candidate for candidate in PRINTED_TEXT if 0 < candidate <= int(year)),
        default=0,
    )

In [None]:
# For every text: the share (measured in characters) of the expected printed
# template strings that are found in the recognised text.
scores = {}
for key in sorted(texts.keys()):
    # Text ids consist of three dash-separated parts; only the year is used.
    year, district, folio_nbr = key.split("-")
    template_id = get_template_id(printed_text.PRINTED_TEXT, year)
    correct_text = ""
    missed_text = ""
    for string in printed_text.PRINTED_TEXT[template_id]:
        if compare_strings(string, texts[key]):
            correct_text += string
        else:
            missed_text += string
    scores[key] = len(correct_text)/len(correct_text + missed_text)
# Show the five highest-scoring texts as (score, text id) pairs, ascending.
[(key[1], key[0]) for key in sorted(scores.items() ,key=lambda item: item[1])][-5:]

## 2. Read gold names

In [None]:
import pandas as pd

In [None]:
def fix_year(text_id, date_in):
    """Make the year of ``date_in`` agree with the year in ``text_id``.

    The first four characters of ``text_id`` and the last four characters of
    ``date_in`` are read as years. The date is returned unchanged when its
    year equals the text id year or the year before it; otherwise its last
    four characters are replaced by the text id year and a coloured notice
    is printed. Both arguments are always printed first.
    """
    print(text_id, date_in)
    expected_year = int(text_id[:4])
    found_year = int(date_in[-4:])
    if found_year in (expected_year, expected_year - 1):
        return date_in
    print_with_color(f"changing year {found_year} to {expected_year}!\n")
    return date_in[:-4] + str(expected_year)

Source of this gold data: table used by Lisa

In [None]:
# Path of the CSV file with the gold data; loaded with pandas.read_csv below.
GOLD_DATA = "../../data/Overlijden/x-samples/three-columns-100.csv"

In [None]:
def extract_gold_names_from_gold_data(gold_data):
    """Collect the gold names per scan from the gold data table.

    Args:
        gold_data: table with the columns "scans", "first_names" and
            "last_name" and an ``index`` attribute listing its row labels
            (a pandas DataFrame in this notebook).

    Returns:
        A dict mapping the file id of each scan (as produced by
        ``read_transkribus_files.make_file_id``) to the list of names found
        for it. A name is "<first names> <last name>", or only the part
        that is a string when the other one is missing. Scans without any
        name map to an empty list.
    """
    scans = gold_data["scans"]
    first_names = gold_data["first_names"]
    last_names = gold_data["last_name"]
    gold_names = {}
    for key in gold_data.index:
        gold_names_key = read_transkribus_files.make_file_id(scans[key])
        # Missing cells are not strings, so they are dropped here.
        name_parts = [part for part in (first_names[key], last_names[key])
                      if isinstance(part, str)]
        gold_name = " ".join(name_parts)
        # setdefault registers the scan without discarding names that an
        # earlier row already collected for it.
        names = gold_names.setdefault(gold_names_key, [])
        if gold_name != "":
            names.append(gold_name)
    return gold_names

In [None]:
# Load the gold data table from the CSV file named by GOLD_DATA.
gold_data = pd.read_csv(GOLD_DATA)

In [None]:
# Mapping from file id to the list of gold names found for that scan.
gold_names = extract_gold_names_from_gold_data(gold_data)

## 3. Check and correct gold names

Some names in the gold data are different from the ones on the certificates. For example, certificate 1867-2e-035 mentions Klarisse Roduma as deceased person but in the gold data the associated deceased name is Klarisse Roduma Isabella. The surname of the mother was added.

In [None]:
# Manually determined names per text id. A value is either one name or a
# list of alternative names (see "1867-5e-017").
corrections = { "1833-1e-046": "Maria Dolor",
                "1848-1e-049": "Cicilia Magdalena",
                "1850-3e-010": "Ivaela Bernardina",
                "1851-1e-048": "Roselfin Longina",
                "1851-1e-096": "Maria Gerarda",
                "1862-1e-014": "Josef Theodorus",
                "1862-1e-029": "Abrammenina Gysbertha Veeris",
                "1863-5e-002": "Nicolaas Johannes",
                "1863-5e-024": "Ermesilia Eleonora",
                "1863-6e-028": "Anthonia Pieternella",
                "1863-7e-002": "George Alvarez Correa",
                "1864-1e-272": "Cornelia Nersilia",
                "1864-2e-030": "Maria Gracia",
                "1864-2e-062": "Bartolomeuw Apostel",
                "1865-2e-021": "Lucas Martis",
                "1865-4e-006": "Elwin Martes Jansen",
                "1865-5e-019": "Johannes Balentinus",
                "1866-1e-051": "Juana Francisca",
                "1866-1e-155": "Stephanus Mercelino",
                "1866-2e-004": "Domingo Lingker",
                "1867-1e-057": "Hoze Leon",
                "1867-1e-194": "Anna Wilhelmina",
                "1867-2e-035": "Klarisse Roduma",
                "1867-5e-017": ["Juan Rimon", "Juantje Aland" ]
              }

# Report the number of text ids that have a correction.
print(f"There are {len(corrections)} corrections")

In [None]:
def add_correction_to_gold_names(gold_names, corrections):
    """Append corrected names to the gold names, skipping duplicates.

    ``corrections`` maps a text id to one name or to a list of names. Each
    name that is not yet in ``gold_names[text_id]`` is appended to that
    list. ``gold_names`` is modified in place and also returned.
    """
    for text_id, correction in corrections.items():
        # Treat a single name as a one-element list of alternatives.
        extra_names = correction if isinstance(correction, list) else [correction]
        for extra_name in extra_names:
            if extra_name not in gold_names[text_id]:
                gold_names[text_id].append(extra_name)
    return gold_names

In [None]:
def print_names_for_checking(gold_names):
    """Print every key of ``gold_names`` with its value, in sorted key order."""
    for text_id in sorted(gold_names):
        names = gold_names[text_id]
        print(text_id, names)

In [None]:
# Extend the gold names with the manual corrections (the dict is updated in place).
gold_names = add_correction_to_gold_names(gold_names, corrections)

## 4. Compare names with gold data

In [None]:
def cleanup_name(name):
    """Normalise a name for comparison.

    Periods and commas are removed, every run of space characters is reduced
    to a single space and surrounding whitespace is stripped.
    """
    without_punctuation = re.sub("[.,]", "", name)
    single_spaced = re.sub("  *", " ", without_punctuation)
    return single_spaced.strip()

In [None]:
def add_text_ids(names_in):
    """Map the sorted ids of the global ``texts`` to ``names_in`` by position.

    The first sorted text id receives ``names_in[0]``, the second
    ``names_in[1]`` and so on.
    """
    return {
        text_id: names_in[position]
        for position, text_id in enumerate(sorted(texts.keys()))
    }

In [None]:
# First column of the header-less CSV file, assigned by position to the
# sorted text ids (assumes the rows are in that order -- TODO confirm).
chatgpt_deceased_names = add_text_ids(list(pd.read_csv("deceased_names_gpt35.csv", header=None)[0]))

In [None]:
# A guessed name counts as correct when, after cleanup and lower-casing, it
# equals at least one of the gold names of the text.
names_correct = dict.fromkeys(texts, False)
for text_id in sorted(texts):
    guessed_name = cleanup_name(chatgpt_deceased_names[text_id]).lower()
    for gold_name in gold_names[text_id]:
        gold_name = cleanup_name(gold_name).lower()
        if names_correct[text_id]:
            continue
        names_correct[text_id] = gold_name == guessed_name

In [None]:
# Per decade (first three characters of the text id plus "0"): how many
# names were correct (key True) and how many were not (key False).
decade_scores = {}
for text_id in sorted(texts):
    decade = text_id[:3] + "0"
    is_correct = names_correct[text_id]
    tally = decade_scores.setdefault(decade, {is_correct: 0, not is_correct: 0})
    tally[is_correct] += 1

In [None]:
# For every text: the smallest Levenshtein distance between the guessed name
# and any of its gold names (999999 when there is no gold name), a count of
# texts per distance, and a flag for distances of at most 3.
names_almost_correct = {}
name_distance_counts = {}
name_distances = dict.fromkeys(texts, 999999)
for text_id in sorted(texts):
    for gold_name in gold_names[text_id]:
        levenshtein_distance = distance(
            cleanup_name(chatgpt_deceased_names[text_id]).lower(),
            cleanup_name(gold_name).lower(),
        )
        name_distances[text_id] = min(name_distances[text_id], levenshtein_distance)
    closest_distance = name_distances[text_id]
    name_distance_counts[closest_distance] = name_distance_counts.get(closest_distance, 0) + 1
    names_almost_correct[text_id] = closest_distance <= 3

In [None]:
# Per decade (first three characters of the text id plus "0"): how many
# names were almost correct (key True) and how many were not (key False).
decade_almost_scores = {}
for text_id in sorted(names_almost_correct):
    decade = text_id[:3] + "0"
    is_almost_correct = names_almost_correct[text_id]
    tally = decade_almost_scores.setdefault(
        decade, {is_almost_correct: 0, not is_almost_correct: 0}
    )
    tally[is_almost_correct] += 1

In [None]:
# List every name that was not reproduced exactly: distance to the closest
# gold name, guessed name, gold names and text id.
for text_id in sorted(texts.keys()):
    if names_correct[text_id]:
        continue
    try:
        print(f"{name_distances[text_id]} {chatgpt_deceased_names[text_id]} {gold_names[text_id]} {text_id}")
    except (KeyError, UnicodeError):
        # Best effort: skip a line that cannot be built or written instead
        # of aborting the report; any other error is a real problem.
        continue

correct_total = sum(1 for text_id in names_correct if names_correct[text_id])
# Distances that never occurred have no entry in name_distance_counts, so
# they are read with a default of 0.
partial_total = sum(name_distance_counts.get(name_distance, 0) for name_distance in range(4))
print(f"correct: {correct_total};",
      f"partially: {partial_total};",
      f"counts per distance: {name_distance_counts}")

In [None]:
# Plot, per decade, the percentage of almost correct names and the
# percentage of exactly correct names.
almost_percentages = [
    100*tally[True]/(tally[True] + tally[False])
    for tally in decade_almost_scores.values()
]
plt.plot(decade_almost_scores.keys(), almost_percentages, label="partial (Levenshtein<=3)")

perfect_percentages = [
    100*tally[True]/(tally[True] + tally[False])
    for tally in decade_scores.values()
]
plt.plot(decade_scores.keys(), perfect_percentages, label="perfect (Levenshtein=0)")

plt.title("percentage correct names per decade")
plt.xlabel("decades")
plt.ylabel("percentages")
plt.legend()
plt.show()

## 5. Compare guessed names to known names

In [None]:
# Table of known certificates; the columns "Achternaam", "Voornamen" and
# "Jaar" are read from it below.
known_certificates = pd.read_csv("../../data/Overlijden/x-misc/Overlijden 1831-1950 JESSYv2-1831-1929.csv", low_memory=False)

In [None]:
# name_types: name part -> "first", "last" or "both", depending on the
# column(s) in which it was seen. name_counts: name part -> occurrences.
# Only certificates from MINIMUM_YEAR onwards are used.
name_types = {}
name_counts = {}
MINIMUM_YEAR = 1869


def _register_name_parts(full_name, seen_as):
    """Count the parts of ``full_name`` and record that they occur as ``seen_as``.

    ``seen_as`` is "first" or "last". A part that was recorded earlier as
    the other kind becomes "both", whatever the order of the sightings.
    """
    other_kind = "first" if seen_as == "last" else "last"
    for name_part in full_name.strip().split():
        clean_name = cleanup_name(name_part).lower()
        if clean_name not in name_types:
            name_types[clean_name] = seen_as
        elif name_types[clean_name] == other_kind:
            name_types[clean_name] = "both"
        name_counts[clean_name] = name_counts.get(clean_name, 0) + 1


for index, row in known_certificates.iterrows():
    surname = row["Achternaam"]
    firstnames = row["Voornamen"]
    try:
        year = int(row["Jaar"])
    except (TypeError, ValueError, OverflowError):
        # Rows without a usable year are skipped.
        continue
    if year < MINIMUM_YEAR:
        continue
    # Missing cells are not strings and are ignored.
    if isinstance(surname, str):
        _register_name_parts(surname, "last")
    if isinstance(firstnames, str):
        _register_name_parts(firstnames, "first")

In [None]:
# Known names seen fewer times than this are not offered as alternatives.
MINIMUM_FREQUENCY = 10

def get_closest_name(name):
    """Find the known names with the smallest Levenshtein distance to ``name``.

    Returns a tuple ``(names, distance)``. A name that is a key of
    ``name_types`` yields ``([name], 0)``. Otherwise all known names with at
    least ``MINIMUM_FREQUENCY`` occurrences in ``name_counts`` are compared;
    every name at the smallest distance is returned. When no name
    qualifies the result is ``([], 999999)``.
    """
    if name in name_types:
        return [name], 0
    best_names = []
    min_distance = 999999
    for known_name in name_types:
        if name_counts[known_name] < MINIMUM_FREQUENCY:
            continue
        this_distance = distance(cleanup_name(known_name), name)
        if this_distance > min_distance:
            continue
        if this_distance < min_distance:
            # Strictly better: forget the candidates collected so far.
            min_distance = this_distance
            best_names = []
        best_names.append(known_name)
    return best_names, min_distance

In [None]:
# Flag every text whose guessed name contains a part that is not a key of
# name_types, and print the type pattern of each guessed name.
has_unknown_name = dict.fromkeys(texts, False)
for text_id, raw_name in chatgpt_deceased_names.items():
    name_format = []
    for name_part in raw_name.strip().split():
        clean_name = cleanup_name(name_part).lower()
        if clean_name not in name_types:
            print(f"unknown name: {clean_name}, {get_closest_name(clean_name)}")
            name_format.append("UNKNOWN")
            has_unknown_name[text_id] = True
            continue
        name_format.append(name_types[clean_name])
    print(name_format)

In the ChatGPT3.5 output there are 64 names with unknown parts (not found in the 1869+ data). Of these names 61 (95%) are wrong while 3 (5%) are correct. This analysis is different for the partially correct names: 34 of the 64 names have a Levenshtein distance of 3 or smaller to the correct name while for the other 30 the distance is larger.

In [None]:
# Among the texts whose guessed name has an unknown part: count the correct
# and wrong names, and the almost correct and not almost correct names.
flagged_ids = [text_id for text_id in has_unknown_name if has_unknown_name[text_id]]
correct_count = sum(1 for text_id in flagged_ids if names_correct[text_id])
wrong_count = len(flagged_ids) - correct_count
almost_correct_count = sum(1 for text_id in flagged_ids if names_almost_correct[text_id])
not_almost_correct_count = len(flagged_ids) - almost_correct_count
print(f"correct: {correct_count}; wrong: {wrong_count}; almost correct: {almost_correct_count}; not almost correct: {not_almost_correct_count}")

Replacing unknown names with known names does not have a big effect. Only a few names can be corrected because most names have many alternatives. This makes choosing the correct alternative difficult.

In [None]:
def find_most_frequent(name_types, name_alternatives):
    """Pick the alternative with the highest value in ``name_types``.

    Alternatives are ranked by their value in ``name_types``; among equal
    values the longer name wins and among equal lengths the earlier one.

    NOTE(review): despite the parameter name, this ranking only expresses
    "most frequent" when the mapping holds frequencies -- confirm what the
    caller passes.
    """
    best_name = name_alternatives[0]
    best_rank = (name_types[best_name], len(best_name))
    for candidate in name_alternatives[1:]:
        candidate_rank = (name_types[candidate], len(candidate))
        if candidate_rank > best_rank:
            best_name = candidate
            best_rank = candidate_rank
    return best_name

In [None]:
# Try to repair wrong guessed names: every part that is not a known name is
# replaced by the closest known name (Levenshtein distance of at most 3).
# The result is reported with its distance to the first gold name, followed
# by the distance of the original guess; perfect repairs are coloured.
for text_id in sorted(texts.keys()):
    guessed_name = cleanup_name(chatgpt_deceased_names[text_id]).lower()
    name_correct = False
    for gold_name in gold_names[text_id]:
        gold_name = cleanup_name(gold_name).lower()
        if not name_correct:
            name_correct = (guessed_name == gold_name)
    # Nothing to repair, or no gold name to compare the repair with.
    if name_correct or not gold_names[text_id]:
        continue
    name_parts_corrected = []
    corrected = True
    for name_part in guessed_name.split():
        if name_part in name_types:
            name_parts_corrected.append(name_part)
            continue
        name_alternatives, name_distance = get_closest_name(name_part)
        if name_distance > 3:
            # Too far from every known name: keep the part, give up on the name.
            name_parts_corrected.append(name_part)
            corrected = False
        elif len(name_alternatives) == 1:
            name_parts_corrected.append(name_alternatives[0])
        else:
            if len(name_alternatives) == 0:
                print_with_color("cannot happen")
            # Rank the alternatives by how often they occur (name_counts),
            # not by their type label in name_types.
            name_parts_corrected.append(find_most_frequent(name_counts, name_alternatives))
    name_corrected = " ".join(name_parts_corrected)
    if name_corrected != guessed_name and corrected:
        first_gold_name = cleanup_name(gold_names[text_id][0]).lower()
        corrected_distance = distance(name_corrected, first_gold_name)
        report = f"{corrected_distance} {distance(guessed_name, first_gold_name)} {name_corrected} # {guessed_name} {gold_names[text_id]}"
        if corrected_distance > 0:
            print(report)
        else:
            print_with_color(f"{report}\n")

## 6. Free word order evaluation

In [None]:
# A guessed name counts as correct here when its parts, in any order, form
# one of the gold names. Comparing the sorted parts gives the same answer
# as trying every permutation, without the factorial cost for long guesses.
names_correct_unordered = { text_id: False for text_id in texts }
for text_id in sorted(texts.keys()):
    guessed_name = cleanup_name(chatgpt_deceased_names[text_id]).lower()
    guessed_parts = sorted(guessed_name.split())
    for gold_name in gold_names[text_id]:
        gold_name = cleanup_name(gold_name).lower()
        # Split on single spaces only: a permutation joined with " " can
        # never contain any other whitespace.
        gold_parts = gold_name.split(" ") if gold_name else []
        if sorted(gold_parts) == guessed_parts:
            names_correct_unordered[text_id] = True
            if gold_name != guessed_name:
                # Show the matching order next to the original guess.
                print(tuple(gold_parts), guessed_name)
            break

In [None]:
# Number of texts whose name is correct when word order is ignored.
sum(1 for text_id in names_correct if names_correct_unordered[text_id])

## 7. Evaluate with name of mother

In [None]:
# Map the first column of the header-less CSV file to its second column
# (used below as: text id -> name of the mother).
mother_records = pd.read_csv("mothers.csv", header=None).to_dict(orient="records")
mother_names = {}
for mother_record in mother_records:
    mother_names[mother_record[0]] = mother_record[1]

In [None]:
# A guessed name counts as correct when it equals a gold name, either as it
# is or after appending the last word of the mother's name.
names_correct_with_mother = dict.fromkeys(texts, False)
for text_id in sorted(texts):
    guessed_name = cleanup_name(chatgpt_deceased_names[text_id]).lower()
    clean_gold_names = [cleanup_name(gold_name).lower() for gold_name in gold_names[text_id]]
    names_correct_with_mother[text_id] = guessed_name in clean_gold_names
    if isinstance(mother_names[text_id], str) and not names_correct_with_mother[text_id]:
        mother_name = cleanup_name(mother_names[text_id]).lower().split()[-1]
        extended_name = guessed_name + " " + mother_name
        if extended_name in clean_gold_names:
            names_correct_with_mother[text_id] = True
            print(guessed_name, mother_name, extended_name, sep="#")

In [None]:
# Number of texts whose name is correct when the mother's name may be appended.
sum(1 for text_id in names_correct if names_correct_with_mother[text_id])