# Evaluate Hand-written Text Recognition

In [None]:
from Levenshtein import distance
import os
import matplotlib.pyplot as plt
import re
import sys
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files

In [None]:
def print_with_color(string, color_code=1):
    """Print ``string`` wrapped in an ANSI colour escape, without a newline.

    ``color_code`` is the final digit of the ``\\x1b[3<digit>m`` foreground
    escape sequence (the default 1 selects red on ANSI terminals). The
    attributes are reset with ``\\x1b[m`` directly after the text.
    """
    color_on = f"\x1b[3{color_code}m"
    color_off = "\x1b[m"
    print(color_on, string, color_off, sep="", end="")

## 1. Estimate HTR accuracy by examining printed text

In [None]:
# Directory handed to read_transkribus_files.read_files; the commented-out
# line below is an alternative data set that can be swapped in.
data_dir = "../../data/Overlijden/x-samples/three-columns-100/page"
#data_dir = "../../data/Overlijden/x-samples/Training_set_V2/page"

# read_files returns three values; later cells index `texts` by document key.
# NOTE(review): `metadata` and `textregions` are not used in the visible
# cells - confirm whether they are needed.
texts, metadata, textregions = read_transkribus_files.read_files(data_dir)

In [None]:
def cleanup_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped, because ``split()`` without
    arguments ignores it.
    """
    words = text.split()
    return " ".join(words)

In [None]:
# Number of prompts printed before an extra blank separator is emitted.
BATCHSIZE = 1

# Prompt asking for the name of the deceased; the document text is appended
# directly after it in the loop below.
PROMPT_DECEASED_NAME = """Here is a Dutch death certificate. 
Can you extract the name of the deceased person from this document? 
Only give the deceased name as result. 
Other information from the document is not necessary.
""" 

# Prompt asking for the death date in dd-mm-yyyy form.
# NOTE(review): this prompt is defined here but not used by the loop below,
# which only prints PROMPT_DECEASED_NAME.
PROMPT_DEATH_DATE = """Here is a Dutch death certificate. 
Can you extract the death date from this document? 
Please present the result in numeric form, with a zero prepending day and month numbers smaller than 10, so for example like: 01-01-1900.
Only give the death date as result. 
Other information from the document is not necessary.
""" 

# Print one prompt per document, in sorted key order, with the document text
# flattened onto a single line.
counter = 0
for key in sorted(texts.keys()):
    # Keys must consist of exactly three dash-separated parts, otherwise this
    # unpacking raises ValueError; the three values are not used in this loop.
    year, district, folio_nbr = key.split("-")
    print(PROMPT_DECEASED_NAME + re.sub("\n", " ", texts[key]))
    counter += 1
    # After every BATCHSIZE prompts, print a separator and restart the count.
    if counter >= BATCHSIZE:
        counter = 0
        print("\n")

In [None]:
def compare_strings(string1, text):
    """Return True when ``string1`` occurs in ``text`` after normalisation.

    Both arguments are lower-cased, "ç" is replaced by "c", and every
    character outside ``a-z0-9`` is removed, so differences in case,
    whitespace, punctuation and the cedilla are ignored.

    The search string is taken from the ``string1`` parameter. The previous
    body read a name ``string`` that is not a parameter, so it only worked
    when a global of that name happened to exist.
    """
    def normalize(value):
        # Same normalisation for needle and haystack.
        return re.sub("[^a-z0-9]", "", re.sub("ç", "c", value.lower()))

    return normalize(string1) in normalize(text)

In [None]:
def get_template_id(PRINTED_TEXT, year):
    """Return the largest key of ``PRINTED_TEXT`` that is <= ``int(year)``.

    Only keys greater than 0 are considered; 0 is returned when no key
    qualifies (including when ``PRINTED_TEXT`` is empty).
    """
    template_id = 0
    for start_year in PRINTED_TEXT:
        # Chained comparison keeps the original short-circuit order:
        # int(year) is only evaluated for keys above the current best.
        if template_id < start_year <= int(year):
            template_id = start_year
    return template_id

In [None]:
# Per document: the fraction (by character count) of the template's printed
# strings that compare_strings finds in the recognised text.
# NOTE(review): PRINTED_TEXT is not defined in the visible cells; it is
# indexed by template id and iterated as a collection of strings - confirm
# where it is defined.
scores = {}
for key in sorted(texts.keys()):
    # Keys are split into exactly three dash-separated parts; only the first
    # (year) is used here.
    year, district, folio_nbr = key.split("-")
    template_id = get_template_id(PRINTED_TEXT, year)
    correct_text = ""
    missed_text = ""
    for string in PRINTED_TEXT[template_id]:
        if compare_strings(string, texts[key]):
            correct_text += string
        else:
            missed_text += string
    # Raises ZeroDivisionError when the template has no printed characters.
    scores[key] = len(correct_text)/len(correct_text + missed_text)
# Cell output: the five highest-scoring documents as (score, key) pairs,
# in ascending score order.
[(key[1], key[0]) for key in sorted(scores.items() ,key=lambda item: item[1])][-5:]

## 2. Compare names with gold data

In [None]:
import pandas as pd

In [None]:
def fix_year(text_id, date_in):
    """Force the year of ``date_in`` to agree with the year in ``text_id``.

    The first four characters of ``text_id`` are the target year and the last
    four characters of ``date_in`` are the guessed year. The date is returned
    unchanged when the guessed year equals the target year or the year before
    it; otherwise its last four characters are replaced by the target year
    and a coloured notice is printed. Both arguments are always echoed first.
    """
    print(text_id, date_in)
    target_year = int(text_id[:4])
    guessed_year = int(date_in[-4:])
    # Guard clause: accept the target year and the year just before it.
    if guessed_year in (target_year, target_year - 1):
        return date_in
    print_with_color(f"changing year {guessed_year} to {target_year}!\n")
    return date_in[0:-4] + str(target_year)

In [None]:
# Path of the CSV file that is loaded as the gold data for section 2.
GOLD_DATA = "../../data/Overlijden/x-samples/three-columns-100.csv"

In [None]:
# Load the gold data into a DataFrame; later cells read its "scans",
# "first_names", "last_name" and "date_of_death" columns.
gold_data = pd.read_csv(GOLD_DATA)

In [None]:
# Build `names`: file id -> list of gold full names for that scan.
GOLD_DATA = "../../data/Overlijden/x-samples/three-columns-100.csv"
gold_data = pd.read_csv(GOLD_DATA)
names = {}
for key in gold_data.index:
    names_key = read_transkribus_files.make_file_id(gold_data["scans"][key])
    # Keep only the name columns that hold a string; empty CSV cells are read
    # by pandas as float NaN and are skipped.
    name_parts = [
        gold_data[column][key]
        for column in ("first_names", "last_name")
        if isinstance(gold_data[column][key], str)
    ]
    # Register the scan even when the row has no usable name, so lookups by
    # file id still succeed.
    names_for_scan = names.setdefault(names_key, [])
    # Previously a row without any name re-used the name of the preceding row
    # (or raised NameError on the first row); such rows now add nothing.
    if name_parts:
        names_for_scan.append(" ".join(name_parts))

In [None]:
# First column of the header-less CSV file, as a plain list.
# NOTE(review): presumably one model answer per document, in the same order
# as sorted(texts.keys()) - the comparison cell indexes it by position.
chatgpt_deceased_names = list(pd.read_csv("deceased_names_gpt35.csv", header=None)[0])

In [None]:
# Compare the extracted names with the gold names, document by document.
#
# Results:
#   names_correct  - number of documents whose extracted name equals one of
#                    the gold names (case-insensitive, ".", "," and repeated
#                    spaces ignored)
#   name_distances - histogram: smallest Levenshtein distance to any gold
#                    name -> number of documents (999999 = nothing to compare)
#   year_scores    - per year (first four characters of the key): counts of
#                    correct (True) and incorrect (False) documents
counter = 0
names_correct = 0
name_distances = {}
year_scores = {}
for key in sorted(texts.keys()):
    year = key[:4]
    name_correct = False
    levenshtein_minimum = 999999

    # Normalise the extracted name once per document instead of once per gold
    # name. It is also stripped now, exactly like the gold name, so stray
    # leading/trailing whitespace can no longer prevent an exact match.
    try:
        guessed_name = re.sub("[.,]", "", chatgpt_deceased_names[counter])
        guessed_name = re.sub("  *", " ", guessed_name).strip()
        chatgpt_deceased_names[counter] = guessed_name
    except (IndexError, TypeError):
        # IndexError: fewer answers than documents.
        # TypeError: the answer is not a string (e.g. NaN from an empty cell).
        guessed_name = None
        print_with_color(f"missing data for key: {key}\n")

    if guessed_name is not None:
        for name in names[key]:
            name = re.sub("[.,]", "", name)
            name = re.sub("  *", " ", name).strip()
            name_correct = name_correct or guessed_name.lower() == name.lower()
            levenshtein_minimum = min(
                levenshtein_minimum,
                distance(guessed_name.lower(), name.lower()),
            )
        # Show every mismatch for manual inspection.
        if not name_correct:
            print(f"{guessed_name} {names[key]} {key}")

    year_counts = year_scores.setdefault(year, {True: 0, False: 0})
    year_counts[name_correct] += 1
    counter += 1
    if name_correct:
        names_correct += 1
    name_distances[levenshtein_minimum] = name_distances.get(levenshtein_minimum, 0) + 1
names_correct, name_distances

In [None]:
# Percentage of correctly extracted names per year, in the insertion order of
# year_scores (the same order as its keys).
percentage_correct = [
    100*counts[True]/(counts[True] + counts[False])
    for counts in year_scores.values()
]
plt.plot(year_scores.keys(), percentage_correct)
plt.xticks(range(0, 30, 5))
plt.title("precentage correct names per year")
plt.show()

In [None]:
# Build date_of_death_gold: file id -> list of gold death dates for that scan.
# Rows whose "date_of_death" value is not a string are skipped.
date_of_death_gold = {}
for key in gold_data.index:
    date_of_death_gold_key = read_transkribus_files.make_file_id(gold_data["scans"][key])
    if not isinstance(gold_data["date_of_death"][key], str):
        continue
    date_of_death_gold_value = gold_data["date_of_death"][key]
    date_of_death_gold.setdefault(date_of_death_gold_key, []).append(date_of_death_gold_value)

In [None]:
# Count how many extracted death dates equal the first gold death date of the
# corresponding document. Extracted dates are first passed through fix_year
# and the corrected value is written back into chatgpt_death_dates.
counter = 0
correct_death_date_found_count = 0
for text_id in sorted(texts.keys()):
    try:
        chatgpt_death_dates[counter] = fix_year(text_id, chatgpt_death_dates[counter])
        gold_date = date_of_death_gold[text_id][0]
    except (IndexError, KeyError, TypeError, ValueError):
        # IndexError: no extracted date at this position.
        # KeyError: no gold date for this document.
        # TypeError/ValueError: the extracted date is not a string ending in
        # a four-digit year, so fix_year cannot parse it.
        # Other exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        print_with_color(f"error for text id {text_id}!\n")
    else:
        date_correct = chatgpt_death_dates[counter] == gold_date
        print(counter, chatgpt_death_dates[counter], gold_date, date_correct)
        if date_correct:
            correct_death_date_found_count += 1
    counter += 1
    # Stop once every extracted date has been checked.
    if counter >= len(chatgpt_death_dates):
        break
correct_death_date_found_count

to do: check for missing gold data