# Explore manually annotated Curaçao files

## 1. Read files

In [None]:
import ast
import json
from Levenshtein import distance
import math
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import sys
import xml.etree.ElementTree as ET
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files
from scripts import get_death_date
from scripts import get_deceased_name
from scripts import utils

In [None]:
# Directory handed to read_transkribus_files.read_files; the commented-out
# assignments are alternative data sets that can be swapped in.
#data_dir = "../../data/Training_set_V2/"
data_dir = "../../data/Sample_regex/Sample_regex/page/"
#data_dir = "../../data/Overlijden/x-samples/first-38/page"
#data_dir = "../../data/Overlijden/x-samples/three-columns-100/page"

# Later cells look up `texts` and `metadata` with the same text ids.
texts, metadata, textregions = read_transkribus_files.read_files(data_dir)

In [None]:
# Collect the gold-standard names of the deceased from the annotation CSV,
# grouped per file id (one id can occur on several rows).
GOLD_DATA = "../../data/Overlijden/x-samples/three-columns-100.csv"
gold_data = pd.read_csv(GOLD_DATA)
gold_names = {}
for key in gold_data.index:
    gold_names_key = read_transkribus_files.make_file_id(gold_data["scans"][key])
    # Only string cells contribute to the name; pandas reads empty cells as NaN.
    name_parts = [gold_data[column][key]
                  for column in ("first_names", "last_name")
                  if isinstance(gold_data[column][key], str)]
    if not name_parts:
        # Without this guard `name` was undefined on the first row or silently
        # carried over the previous row's name.
        utils.print_with_color(f"cannot store name from document {gold_names_key}: no first or last name\n")
        continue
    name = " ".join(name_parts)
    gold_names.setdefault(gold_names_key, []).append(name)

## 2. Visualize entities

For the list of entity tags of the model `wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner`, see [OntoNotes](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf), page 21.

In [None]:
from spacy import displacy

In [None]:
def render_text(text, entities):
    """Display `text` with `entities` highlighted through displaCy's manual mode.

    Newlines in `text` are replaced by spaces before rendering. `entities` is
    passed on unchanged as the "ents" list; the callers in this notebook supply
    dicts with "start", "end" and "label". The labels PERSON, first_names and
    last_name are coloured orange.
    """
    flattened_text = re.sub("\\n", " ", text)
    orange_labels = ("PERSON", "first_names", "last_name")
    colors = {label: "orange" for label in orange_labels}
    displacy.render({"text": flattened_text, "ents": entities},
                    options={"colors": colors},
                    style="ent",
                    manual=True)

In [None]:
def convert_guessed_entities(entities_in):
    """Merge token-level predictions into entity spans.

    Each input item is a dict with "entity" (a tag whose first character is the
    position marker and whose text from index 2 on is the label, e.g. "B-PER"),
    "start" and "end". A tag starting with "B", or the very first item, opens a
    new span; any other item stretches the end of the most recent span.

    Returns a list of {"start", "end", "label"} dicts.
    """
    merged = []
    for token in entities_in:
        tag = token["entity"]
        prefix, label = tag[0], tag[2:]
        if merged and prefix != "B":
            # Continuation: only the end of the current span moves.
            merged[-1]["end"] = token["end"]
            continue
        merged.append({"start": token["start"], "end": token["end"], "label": label})
    return merged

In [None]:
def continued_entity(entity, last_entity):
    """Tell whether `entity` continues `last_entity`.

    Both dicts must contain the key "continued", and the offset of `entity`
    must be exactly one position past the end (offset + length) of
    `last_entity`. Offsets and lengths are converted with int().
    """
    if "continued" not in entity.keys():
        return False
    if "continued" not in last_entity.keys():
        return False
    current_offset = int(entity["offset"])
    expected_offset = int(last_entity["offset"]) + int(last_entity["length"]) + 1
    return current_offset == expected_offset

In [None]:
def convert_gold_entities(entities_in):
    """Turn gold annotations into a flat list of entity spans.

    `entities_in` maps a label to a list of annotation dicts. Labels whose
    first annotation has no "offset" are skipped. For the others every
    annotation becomes a {"start", "end", "label"} dict, except that an
    annotation recognised by continued_entity() as continuing the previous one
    only extends the end of the previous span.
    """
    spans = []
    for label, annotations in entities_in.items():
        if "offset" not in annotations[0]:
            continue
        # Reset per label so the first annotation never counts as a continuation.
        previous = {}
        for annotation in annotations:
            if continued_entity(annotation, previous):
                spans[-1]["end"] = int(annotation["offset"]) + int(annotation["length"])
            else:
                begin = int(annotation["offset"])
                spans.append({"start": begin,
                              "end": begin + int(annotation["length"]),
                              "label": label})
            previous = annotation.copy()
    return spans

In [None]:
def ignore_entities(entities_in, labels_to_omit):
    """Return the entities whose "label" is not in `labels_to_omit`, in input order."""
    return [candidate for candidate in entities_in
            if candidate["label"] not in labels_to_omit]

In [None]:
# Show the gold annotations of the first document (in sorted key order),
# leaving out the "textStyle" and "unclear" labels.
test_key = sorted(texts.keys())[0]
render_text(
    texts[test_key],
    ignore_entities(
        convert_gold_entities(metadata[test_key]),
        ["textStyle", "unclear"],
    ),
)

## 3. Get name of deceased

In [None]:
# Extract the name of the deceased from every text and tally the outcome of
# get_deceased_name.evaluate_deceased_names against the annotated names.
nbr_of_names_found = 0
nbr_of_stillborns_found = 0
name_correct_count = 0
# NOTE(review): nothing in this cell fills name_distances, so the distance
# plots further down stay empty unless it is populated elsewhere.
name_distances = {}

deceased_names = {}
for text_id in sorted(texts.keys()):
    text = texts[text_id]
    deceased_names[text_id] = get_deceased_name.get_name_of_deceased_from_text(text)
    nbr_of_names_found, nbr_of_stillborns_found, name_is_correct = get_deceased_name.evaluate_deceased_names(
        (deceased_names[text_id], get_deceased_name.stillborn_count(text)),
        nbr_of_names_found,
        nbr_of_stillborns_found,
        get_deceased_name.get_metadata(metadata[text_id], text, ["first_names", "last_name"]))
    # The verdict used to be discarded, so the summary always reported 0
    # correct names. Assumes name_is_correct is truthy for a correct name.
    if name_is_correct:
        name_correct_count += 1
print(f"Records: {len(texts)}; Names found: {nbr_of_names_found}; Correct names: {name_correct_count};", end=" ")
print(f"Stillborns: {nbr_of_stillborns_found}; Missing: {len(texts)-nbr_of_names_found-nbr_of_stillborns_found}")

In [None]:
# Display the extracted names per text id.
deceased_names

In [None]:
# Running total of the name_distances values, accumulated in sorted key order.
name_distances_cumulative = {}
last_value = 0
for key in sorted(name_distances):
    last_value = name_distances[key] + last_value
    name_distances_cumulative[key] = last_value

In [None]:
# Bars: value per distance key (sorted); line: the cumulative totals.
plt.bar(
    sorted(name_distances),
    [name_distances[key] for key in sorted(name_distances)],
)
plt.plot(name_distances_cumulative.keys(), name_distances_cumulative.values())
plt.title("Levenshtein distances")
plt.show()

In [None]:
# Display the cumulative totals per distance key.
name_distances_cumulative

## 4. Get date of death

In [None]:
# Gold-standard dates of death per file id, as lists of "dd-mm-yyyy" strings.
# missing_data holds hand-entered dates; the CSV rows are added on top of it.
missing_data = { "1848-1e-049": ["19-03-1848"],
                 "1850-3e-010": ["04-11-1850"],
                 "1851-1e-048": ["22-03-1851"],
                 "1851-1e-096": ["19-06-1851"],
                 "1867-1e-057": ["18-03-1867"]
               }
# Copy the dict and its lists: a plain assignment aliased missing_data, so every
# CSV row was also written into the hand-entered table.
date_of_death_gold = {file_id: list(dates) for file_id, dates in missing_data.items()}
for key in gold_data.index:
    date_of_death_gold_key = read_transkribus_files.make_file_id(gold_data["scans"][key])
    date_of_death_gold_value = gold_data["date_of_death"][key]
    if isinstance(date_of_death_gold_value, str):
        date_of_death_gold.setdefault(date_of_death_gold_key, []).append(date_of_death_gold_value)
    else:
        # Non-string cells (e.g. NaN for an empty cell) cannot be used as a date.
        utils.print_with_color(f"cannot store data from document {date_of_death_gold_key}: {date_of_death_gold_value}\n")

In [None]:
# Extract the death date(s) from every text and count the texts whose first
# extracted date has an empty component.
nbr_of_incomplete_dates = 0
correct_death_date_found_count = 0
for text_id in sorted(texts):
    dates = get_death_date.fix_years(text_id, get_death_date.get_death_date(texts[text_id]))
    print(text_id, dates)
    if len(dates) > 0:
        # Positions 0 and 2 are compared with 0, position 1 with "".
        if dates[0][0] == 0 or dates[0][1] == "" or dates[0][2] == 0:
            nbr_of_incomplete_dates += 1
    # Evaluation against the gold dates is disabled; while it stays disabled
    # correct_death_date_found_count remains 0 in the summary below.
    #correct_death_date_found = get_death_date.print_dates(text_id, dates, date_of_death_gold)
    #if correct_death_date_found:
    #    correct_death_date_found_count += 1
    #for date in dates:
    #    if date[0] == 0 or date[1] == "" or date[2] == 0:
    #        nbr_of_incomplete_dates += 1
print(f"number of incomplete dates: {nbr_of_incomplete_dates}; number of correct death dates found: {correct_death_date_found_count}")

**Error Analysis Training set V2:**

- 3: misspelled day
- 8: misspelled year

**Error Analysis Sample regex:**

- 21: missing month
- 25: misspelled day
- 26: misspelled month
- 28: missing day
- 32: misspelled day
- 37: misspelled month
- 38: misspelled month
- 42: misspelled month
- 44: misspelled month and year
- 45: misspelled month
- 46: misspelled month
- ...

## 5. Evaluate external data

In [None]:
# Externally produced death dates (the file name suggests GPT-4 output - TODO confirm).
# The file has no header row; the evaluation cell reads the dates from column 0.
data = pd.read_csv("death_dates_gpt4.csv", header=None)

In [None]:
def fix_year_in_date(date, text_id):
    """Align the year of `date` with the year prefix of `text_id`.

    `date` has the form "day-month-year" and `text_id` starts with a year
    followed by "-". The date is returned unchanged when its year equals the
    year of `text_id` or is one year earlier; otherwise the year is replaced
    by the year of `text_id`.
    """
    day, month, year = date.split("-")
    file_year = text_id.split("-")[0]
    year_is_plausible = year == file_year or int(year) + 1 == int(file_year)
    return date if year_is_plausible else "-".join((day, month, file_year))

In [None]:
# Compare the externally guessed dates with the gold dates of death.
# Guessed dates are matched to documents by position: row i of `data` belongs
# to the i-th text id in sorted order.
UNKNOWN_DATE = "00-00-0000"

data_counter = 0
nbr_of_correct_dates = 0
nbr_of_correct_fixed_dates = 0
nbr_of_unknown_dates = 0
for text_id in sorted(texts.keys()):
    try:
        guessed_date = data[0][data_counter]
        gold_date = date_of_death_gold[text_id][0]
        if guessed_date == gold_date:
            nbr_of_correct_dates += 1
        elif fix_year_in_date(guessed_date, text_id) == gold_date:
            nbr_of_correct_fixed_dates += 1
        elif guessed_date == UNKNOWN_DATE:
            nbr_of_unknown_dates += 1
    # Only lookup and date-parsing failures are expected here; a bare `except:`
    # also swallowed KeyboardInterrupt and hid every other error.
    except (KeyError, IndexError, ValueError, AttributeError) as error:
        if text_id not in date_of_death_gold:
            utils.print_with_color(f"missing gold data for document {text_id}!\n")
        elif data_counter >= len(data):
            utils.print_with_color(f"missing guessed data for document {text_id}!\n")
        else:
            # Previously silent: e.g. a guessed date that is not "day-month-year".
            utils.print_with_color(f"cannot compare guessed date for document {text_id}: {error!r}\n")
    data_counter += 1
print(f"number of correct dates: {nbr_of_correct_dates}; number of correct fixed dates: {nbr_of_correct_fixed_dates}; number of unknown dates: {nbr_of_unknown_dates};")

## 99. Tests

In [None]:
import unittest

In [None]:
class TestNotebook(unittest.TestCase):
    """Unit tests run from inside the notebook."""

    def test_process_custom_attrib(self):
        """process_custom_attrib turns a custom attribute string into nested dicts of strings."""
        custom_attrib = "readingOrder {index:1;} certificate_date {offset:10; length:25; continued:true;}"
        expected = {
            "readingOrder": {"index": "1"},
            "certificate_date": {"offset": "10", "length": "25", "continued": "true"},
        }
        self.assertEqual(read_transkribus_files.process_custom_attrib(custom_attrib), expected)

In [None]:
# Run the tests inside the notebook: argv=[''] keeps unittest from parsing the
# kernel's own command-line arguments, exit=False keeps it from calling sys.exit().
unittest.main(argv=[''], verbosity=2, exit=False)