# Explore manually annotated Curaçao files

## 1. Read files

In [None]:
import ast
import json
from Levenshtein import distance
import math
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import sys
import xml.etree.ElementTree as ET
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files
from scripts import get_death_date
from scripts import utils

In [None]:
# Alternative input directories; keep exactly one data_dir assignment active.
#data_dir = "../../data/Training_set_V2/"
#data_dir = "../../data/Sample_regex/Sample_regex/page/"
#data_dir = "../../data/Overlijden/x-samples/first-38/page"
data_dir = "../../data/Overlijden/x-samples/three-columns-100/page"

# read_files returns three values; later cells index `texts` and `metadata` by the same text id.
texts, metadata, textregions = read_transkribus_files.read_files(data_dir)

In [None]:
GOLD_DATA = "../../data/Overlijden/x-samples/three-columns-100.csv"
gold_data = pd.read_csv(GOLD_DATA)

# Map each file id to the list of gold-standard full names found for that file.
names = {}
for key in gold_data.index:
    names_key = read_transkribus_files.make_file_id(gold_data["scans"][key])
    # Only string cells contribute to the name; non-string cells (e.g. NaN for an
    # empty CSV field) are skipped. The name is rebuilt from scratch for every row
    # so that a row without any name yields "" instead of the previous row's name.
    name_parts = [gold_data[column][key]
                  for column in ("first_names", "last_name")
                  if isinstance(gold_data[column][key], str)]
    name = " ".join(name_parts)
    names.setdefault(names_key, []).append(name)

## 2. Visualize entities

For the list of entity tags used by the model `wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner`, see the [OntoNotes](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf) documentation, page 21.

In [None]:
from spacy import displacy

In [None]:
def render_text(text, entities):
    """Render `text` with highlighted entity spans using displacy in manual mode.

    Newlines in the text are replaced by spaces before rendering. `entities`
    is passed through as the "ents" list; the labels PERSON, first_names and
    last_name are all drawn in orange.
    """
    document = {"text": re.sub("\\n", " ", text), "ents": entities}
    label_colors = dict.fromkeys(("PERSON", "first_names", "last_name"), "orange")
    displacy.render(document,
                    options={"colors": label_colors},
                    style="ent",
                    manual=True)

In [None]:
def convert_guessed_entities(entities_in):
    """Convert tagged tokens into a list of {"start", "end", "label"} spans.

    Each input item carries an "entity" tag such as "B-PERSON" plus "start"
    and "end" offsets. A tag starting with "B" (or the very first item) opens
    a new span; any other item extends the end of the most recent span.
    """
    spans = []
    for token in entities_in:
        tag = token["entity"]
        prefix, label = tag[0], tag[2:]
        if spans and prefix != "B":
            # Continuation of the previous span: only its end offset moves.
            spans[-1]["end"] = token["end"]
            continue
        spans.append({"start": token["start"], "end": token["end"], "label": label})
    return spans

In [None]:
def continued_entity(entity, last_entity):
    """Tell whether `entity` continues `last_entity`.

    Both annotations must contain the key "continued", and the offset of
    `entity` must equal offset + length + 1 of `last_entity`.
    """
    if "continued" not in entity or "continued" not in last_entity:
        return False
    current_offset = int(entity["offset"])
    expected_offset = int(last_entity["offset"]) + int(last_entity["length"]) + 1
    return current_offset == expected_offset

In [None]:
def convert_gold_entities(entities_in):
    """Convert gold annotations into a list of {"start", "end", "label"} spans.

    `entities_in` maps a label to a list of annotations. Labels whose first
    annotation has no "offset" are skipped. An annotation that continues the
    previous one (see continued_entity) extends the end of the last span
    instead of creating a new one.
    """
    spans = []
    for label, annotations in entities_in.items():
        if "offset" not in annotations[0]:
            continue
        previous = {}
        for annotation in annotations:
            is_continuation = continued_entity(annotation, previous)
            start = int(annotation["offset"])
            end = start + int(annotation["length"])
            if is_continuation:
                spans[-1]["end"] = end
            else:
                spans.append({"start": start, "end": end, "label": label})
            previous = annotation.copy()
    return spans

In [None]:
def ignore_entities(entities_in, labels_to_omit):
    """Return the entities whose "label" is not listed in `labels_to_omit`.

    The input order is kept and the entity dicts themselves are not copied.
    """
    return [candidate for candidate in entities_in
            if candidate["label"] not in labels_to_omit]

In [None]:
# Render the gold annotations of the first text (in sorted key order),
# leaving out the "textStyle" and "unclear" labels.
test_key = sorted(texts.keys())[0]
gold_entities = convert_gold_entities(metadata[test_key])
render_text(texts[test_key], ignore_entities(gold_entities, ["textStyle", "unclear"]))

## 3. Find entities in texts

In [None]:
import transformers

# Restrict transformers log output to error-level messages.
transformers.utils.logging.set_verbosity_error()

In [None]:
def show_names(entities):
    """Print the PERSON/GPE names found in a list of tagged tokens, one per line.

    Words whose "entity" tag ends in GPE or PERSON are joined with spaces.
    A tag starting with "B" closes (prints) the name collected so far.
    """
    current = ""
    for token in entities:
        tag = token["entity"]
        if re.search("^B", tag) and current != "":
            print(current)
            current = ""
        if not re.search("(GPE|PERSON)$", tag):
            continue
        separator = " " if current != "" else ""
        current = current + separator + token["word"]
    if current != "":
        print(current)

Tested models (initial number indicates monthly downloads):
* (345) wietsedv/bert-base-dutch-cased-finetuned-conll2002-ner (several false positives)
* (74) Matthijsvanhof/bert-base-dutch-cased-finetuned-NER (not useful, tags everything)
* (16) wietsedv/bert-base-dutch-cased-finetuned-sonar-ner (some false positives)
* (13) proycon/bert-ner-cased-conll2002-nld (did not find any entities)
* (10) proycon/bert-ner-cased-sonar1-nld (found only one entity)
* (10) Matthijsvanhof/bert-base-dutch-cased-finetuned-NER8 (not useful, tags everything)
* (4) [wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner](https://huggingface.co/wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner) (few false positives) **SELECTED**

In [None]:
# Build the NER pipeline for the selected model. Later cells call it on a text
# and read the "entity", "word", "start" and "end" fields of each result.
run_bert_pipeline = transformers.pipeline(task='ner', model='wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner')

In [None]:
# Run the NER pipeline on the first text (in sorted key order) as a smoke test.
test_key = sorted(texts.keys())[0]
entities = run_bert_pipeline(texts[test_key])

In [None]:
# Print the PERSON/GPE names recognized in the test text.
show_names(entities)

## 4. Post-process entities

Expand entities which end in the middle of a word

In [None]:
def expand_entities(entities_in, text):
    """Extend entities that end in the middle of a word to the end of that word.

    Each entity is shallow-copied; the input dicts are not modified. While the
    character at the entity's "end" offset is a word character or one of
    '.', ',' and '-', that character is appended to "word" and "end" moves
    one position to the right.

    Args:
        entities_in: iterable of dicts with at least the keys "word" and "end".
        text: the text the "end" offsets refer to.

    Returns:
        A new list with the expanded copies, in input order.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    continuation_char = re.compile(r"[\w.,-]")
    entities_out = []
    for entity_in in entities_in:
        entity_out = entity_in.copy()
        end = entity_out["end"]
        while end < len(text) and continuation_char.match(text[end]):
            end += 1
        entity_out["word"] += text[entity_out["end"]:end]
        entity_out["end"] = end
        entities_out.append(entity_out)
    return entities_out

Combine successive entities where the second one has a label starting with `I-`, or where it starts directly after the previous entity and has the same label

In [None]:
def expand_last_entity(entities, entity):
    """Merge `entity` into the last item of `entities`, in place.

    The word of `entity` is appended after a space and the last item's "end"
    is set to the "end" of `entity`.
    """
    last = entities[-1]
    last["word"] = last["word"] + " " + entity["word"]
    last["end"] = entity["end"]

In [None]:
def combine_entities(entities_in):
    """Merge successive tagged tokens that belong to the same entity.

    Works on shallow copies of the input entities. The first entity is kept
    as is. An entity tagged "I-..." is merged into the previous one. Any other
    entity gets its B-/I-/E- prefix normalized to "B-" and is merged when it
    starts no later than one position after the previous entity's end and has
    the same tag; otherwise it starts a new entity. An entity that starts
    before the previous one is reported and dropped.
    """
    merged = []
    for original in entities_in:
        candidate = original.copy()
        if not merged:
            merged.append(candidate)
            continue
        if re.search("^I-", candidate["entity"]):
            expand_last_entity(merged, candidate)
            continue
        candidate["entity"] = re.sub("^[BIE]-", "B-", candidate["entity"])
        previous = merged[-1]
        if candidate["start"] < previous["start"]:
            print("error: entities are not sorted by position!")
        elif (candidate["start"] <= previous["end"] + 1
              and candidate["entity"] == previous["entity"]):
            expand_last_entity(merged, candidate)
        else:
            merged.append(candidate)
    return merged

In [None]:
def process_and_render_texts(texts):
    """Run NER on every text in `texts` (id -> text) and render the result.

    For each text the pipeline output is expanded to word boundaries,
    successive entities are combined, the text id is printed and the
    entities are rendered.
    """
    for text_id, text in texts.items():
        raw_entities = run_bert_pipeline(text)
        expanded = expand_entities(raw_entities, text)
        combined = combine_entities(expanded)
        print(f"Text {text_id}")
        render_text(text, convert_guessed_entities(combined))

In [None]:
# Render the post-processed entities of the test text only.
process_and_render_texts({ test_key:texts[test_key] })

## 5. Get name of deceased

In [None]:
def cleanup(text_in):
    """Normalize a name for comparison.

    Collapses every run of whitespace into a single space, removes each
    hyphen followed by a space (joining a word split as "Jan- sen"),
    lowercases the text and drops commas and periods.

    Args:
        text_in: the string to normalize.

    Returns:
        The normalized string.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    text_out = re.sub(r"\s+", " ", text_in)
    text_out = text_out.replace("- ", "")
    return re.sub(r"[,.]", "", text_out.lower())

In [None]:
def find_text_patterns(query, text):
    """Find all matches of the regular expression `query` in the lowercased text.

    Returns a list of {"start", "end"} dicts with the match offsets, in
    order of occurrence.
    """
    matcher = re.compile(query)
    lowered = text.lower()
    return [{"start": hit.start(), "end": hit.end()}
            for hit in matcher.finditer(lowered)]

In [None]:
def get_name_of_deceased(text, entities):
    """Look up the entity that directly follows each death phrase in `text`.

    Death phrases are matches of "overleden is" and "is overleden" (optionally
    followed by ':' and ','). For every match, the word of the last entity
    whose "start" equals the match end plus one is recorded, or "" when there
    is no such entity.

    Returns a tuple of (list of names, number of "levens?loos" matches).
    """
    death_phrases = (find_text_patterns("overleden is:?,?", text)
                     + find_text_patterns("is overleden:?,?", text))
    deceased = []
    for phrase in death_phrases:
        expected_start = phrase["end"] + 1
        candidates = [entity["word"] for entity in entities
                      if entity["start"] == expected_start]
        deceased.append(candidates[-1] if candidates else "")
    stillborn_mentions = find_text_patterns("levens?loos", text)
    return deceased, len(stillborn_mentions)

In [None]:
def compare_names(results, metadata):
    """Compare the first guessed name with the annotated name.

    Args:
        results: sequence whose first item is the list of guessed names.
        metadata: dict that may hold the strings "first_names" and "last_name".

    Returns:
        True when no name was guessed (empty list or empty first name), False
        when the metadata lacks "first_names" or "last_name", and otherwise
        whether the cleaned-up guessed and annotated names are equal. A
        guessed name containing a comma between two characters is reordered
        from "last, first" to "first last" before the comparison.
    """
    guessed_names = results[0]
    if len(guessed_names) == 0 or guessed_names[0] == "":
        return True
    if "first_names" not in metadata or "last_name" not in metadata:
        return False
    guessed_name = guessed_names[0]
    if re.search(".,.", guessed_name):
        # Raw strings: "\S" in a normal string literal is an invalid escape sequence.
        first_names = re.sub(r"^[^,]+, *(\S.*)$", r"\1", guessed_name)
        last_name = re.sub(r"^([^,]+),.*$", r"\1", guessed_name)
        guessed_name = first_names + " " + last_name
    annotated_name = " ".join([metadata["first_names"], metadata["last_name"]])
    return cleanup(guessed_name) == cleanup(annotated_name)

In [None]:
def evaluate_deceased_names(results, nbr_of_names_found, nbr_of_stillborns_found, metadata):
    """Update the running counters for one text and check the guessed name.

    Args:
        results: (list of guessed names, number of stillborn mentions).
        nbr_of_names_found: running count of texts with a guessed name.
        nbr_of_stillborns_found: running count of texts mentioning a stillborn.
        metadata: annotated name parts, passed on to compare_names.

    Returns:
        The two updated counters and the result of compare_names.
    """
    guessed_names, stillborn_mentions = results[0], results[1]
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    if len(guessed_names) != 0 and re.search(r"\w", guessed_names[0]):
        nbr_of_names_found += 1
    if stillborn_mentions > 0:
        nbr_of_stillborns_found += 1
    return nbr_of_names_found, nbr_of_stillborns_found, compare_names(results, metadata)

In [None]:
def get_metadata(metadata, text, keys):
    """Collect the annotated text fragments for the requested keys.

    For every key in `keys` that occurs in `metadata`, each annotation's
    "offset" and "length" select a fragment of `text`; the fragments of one
    key are joined with single spaces. Keys without annotations are left out
    of the result.
    """
    collected = {}
    for field in keys:
        if field not in metadata:
            continue
        for item in metadata[field]:
            begin = int(item["offset"])
            fragment = text[begin:int(item["offset"]) + int(item["length"])]
            if field in collected:
                collected[field] += " " + fragment
            else:
                collected[field] = fragment
    return collected

In [None]:
def print_name_correct(name_is_correct):
    """Print a colored "wrong name" marker when the name check failed."""
    if name_is_correct:
        return
    utils.print_with_color("wrong name")

In [None]:
# Evaluate the guessed name of the deceased for every text: count found names
# and stillborn mentions, and record the Levenshtein distance between the
# first guessed name and the first gold name of the text.
nbr_of_names_found = 0
nbr_of_stillborns_found = 0
name_correct_count = 0
name_distances = {}

for text_id in sorted(texts.keys()):
    text = texts[text_id]
    entities = run_bert_pipeline(text)
    entities = combine_entities(expand_entities(entities, text))
    #print(f"Text {text_id}:", end=" ")
    results = get_name_of_deceased(text, entities)
    nbr_of_names_found, nbr_of_stillborns_found, name_is_correct = evaluate_deceased_names(
        results,
        nbr_of_names_found,
        nbr_of_stillborns_found,
        get_metadata(metadata[text_id], text, ["first_names", "last_name"]))
    #print(results, end=" ")
    #print_name_correct(name_is_correct)
    #print()
    # get_name_of_deceased returns an empty list when the text contains no
    # death phrase; treat that as an empty guess instead of failing on [0].
    guessed_name = results[0][0] if results[0] else ""
    gold_name = names[text_id][0]
    name_distance = distance(guessed_name.lower(), gold_name.lower())
    name_correct = name_distance == 0
    name_distances[name_distance] = name_distances.get(name_distance, 0) + 1
    print(name_correct, name_distance, text_id, guessed_name, gold_name)
    if name_correct:
        name_correct_count += 1
print(f"Records: {len(texts)}; Names found: {nbr_of_names_found}; Correct names: {name_correct_count};", end=" ")
print(f"Stillborns: {nbr_of_stillborns_found}; Missing: {len(texts)-nbr_of_names_found-nbr_of_stillborns_found}")

In [None]:
# Running total of the distance counts, in order of increasing distance.
name_distances_cumulative = {}
last_value = 0
for key in sorted(name_distances):
    last_value += name_distances[key]
    name_distances_cumulative[key] = last_value

In [None]:
# Bars: number of names per Levenshtein distance; line: cumulative count.
distance_values = sorted(name_distances.keys())
distance_counts = [name_distances[value] for value in distance_values]
plt.bar(distance_values, distance_counts)
plt.plot(name_distances_cumulative.keys(), name_distances_cumulative.values())
plt.title("Levenshtein distances")
plt.show()

In [None]:
# Show the cumulative number of names per Levenshtein distance.
name_distances_cumulative

## 6. Get date of death

In [None]:
# Render the post-processed entities of the test text again as a reference for the date extraction.
process_and_render_texts({ test_key:texts[test_key] })

In [None]:
# Map each file id to the list of gold dates of death found for that file;
# rows whose date_of_death cell is not a string are skipped.
date_of_death_gold = {}
for key in gold_data.index:
    file_id = read_transkribus_files.make_file_id(gold_data["scans"][key])
    gold_date = gold_data["date_of_death"][key]
    if not isinstance(gold_date, str):
        continue
    date_of_death_gold.setdefault(file_id, []).append(gold_date)

In [None]:
# Extract the death dates of every text, count the texts for which print_dates
# reports a correct date, and count the dates with a missing part
# (0 in position 0 or 2, or "" in position 1).
nbr_of_incomplete_dates = 0
correct_death_date_found_count = 0
for text_id in sorted(texts):
    dates = get_death_date.get_death_date(texts[text_id])
    #dates = get_death_date.fix_years(text_id, dates)
    #print(text_id, dates)
    if get_death_date.print_dates(text_id, dates, date_of_death_gold):
        correct_death_date_found_count += 1
    nbr_of_incomplete_dates += sum(
        1 for date in dates
        if date[0] == 0 or date[1] == "" or date[2] == 0)
print(f"number of incomplete dates: {nbr_of_incomplete_dates}; number of correct death dates found: {correct_death_date_found_count}")

**Notes Training set V2:**

- 3: misspelled day
- 8: misspelled year

**Notes Sample regex:**

- 21: missing month
- 25: misspelled day
- 26: misspelled month
- 28: missing day
- 32: misspelled day
- 37: misspelled month
- 38: misspelled month
- 42: misspelled month
- 44: misspelled month and year
- 45: misspelled month
- 46: misspelled month
- ...

## 7. Evaluate external data

In [None]:
# Load externally produced death dates; the file has no header row, so the
# first column is addressed as data[0].
data = pd.read_csv("death_dates_gpt4.csv", header=None)

In [None]:
# Compare the external death dates (column 0 of `data`) with the first gold
# date of each text.
# NOTE(review): rows are matched to texts by position in sorted(texts.keys());
# confirm that death_dates_gpt4.csv was written in that order.
nbr_of_correct_dates = 0
for counter, text_id in enumerate(sorted(texts.keys())):
    gold_dates = date_of_death_gold.get(text_id)
    if not gold_dates:
        utils.print_with_color(f"missing gold data for document {text_id}!\n")
        continue
    if counter >= len(data):
        # Report a short CSV explicitly instead of blaming the gold data.
        utils.print_with_color(f"missing external date for document {text_id}!\n")
        continue
    if data[0][counter] == gold_dates[0]:
        nbr_of_correct_dates += 1
print(f"number of correct dates: {nbr_of_correct_dates}")

## 99. Tests

In [None]:
import unittest

In [None]:
class TestNotebook(unittest.TestCase):
    """Unit tests for helper functions used by this notebook."""

    def test_process_custom_attrib(self):
        """process_custom_attrib turns a custom attribute string into nested dicts of strings."""
        custom_attrib = "readingOrder {index:1;} certificate_date {offset:10; length:25; continued:true;}"
        expected = {
            'readingOrder': {'index': '1'},
            'certificate_date': {'offset': '10', 'length': '25', 'continued': 'true'},
        }
        self.assertEqual(read_transkribus_files.process_custom_attrib(custom_attrib), expected)

In [None]:
# Run the tests inside the notebook: a dummy argv is passed so unittest does not
# parse sys.argv, and exit=False keeps it from calling sys.exit() afterwards.
unittest.main(argv=[''], verbosity=2, exit=False)