# Explore manually annotated Curaçao files

## 1. Read files

In [None]:
import ast
import json
import math
import os
import pandas as pd
import re
import sys
import xml.etree.ElementTree as ET
sys.path.append(os.getcwd() + '/..')
from scripts import read_transkribus_files

In [None]:
def print_with_color(string, color_code=1):
    """Print *string* wrapped in ANSI color escape codes, without a trailing newline.

    *color_code* is inserted after the digit 3 in the escape sequence
    (``ESC[3<color_code>m``); the default 1 yields ``ESC[31m``. The output
    is always followed by the reset sequence ``ESC[m``.
    """
    color_on = f"\x1b[3{color_code}m"
    color_off = "\x1b[m"
    print(f"{color_on}{string}{color_off}", end="")

In [None]:
# Directory with the files to explore. The commented-out assignments are
# alternative data sets; enable exactly one data_dir.
#data_dir = "../../data/Training_set_V2/"
#data_dir = "../../data/Sample_regex/Sample_regex/page/"
#data_dir = "../../data/Overlijden/x-samples/first-38/page"
data_dir = "../../data/Overlijden/x-samples/three-columns-100/page"

# read_files returns three values; later cells index texts and metadata
# with the same text id.
texts, metadata, textregions = read_transkribus_files.read_files(data_dir)

## 2. Visualize entities

For the list of entity tags used by the model `wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner`, see [OntoNotes](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf), page 21.

In [None]:
from spacy import displacy

In [None]:
def render_text(text, entities):
    """Render *text* with its *entities* highlighted through displaCy.

    Newlines in the text are replaced by spaces before rendering. The
    labels PERSON, first_names and last_name are all drawn in orange.
    *entities* is passed to displaCy unchanged as the "ents" list.
    """
    name_color = "orange"
    label_colors = {label: name_color
                    for label in ("PERSON", "first_names", "last_name")}
    document = {"text": re.sub("\\n", " ", text), "ents": entities}
    displacy.render(document,
                    style="ent",
                    manual=True,
                    options={"colors": label_colors})

In [None]:
def convert_guessed_entities(entities_in):
    """Convert tagged token entities to a list of {"start", "end", "label"} spans.

    Each input item needs the keys "entity" (a tag such as "B-PERSON"),
    "start" and "end". A tag whose first character is "B" opens a new span,
    as does the very first item; any other item only moves the end of the
    most recent span. The label is the tag without its first two characters.
    """
    spans = []
    for token in entities_in:
        tag = token["entity"]
        prefix, label = tag[0], tag[2:]
        if spans and prefix != "B":
            # continuation: stretch the span that is currently open
            spans[-1]["end"] = token["end"]
            continue
        spans.append({"start": token["start"], "end": token["end"], "label": label})
    return spans

In [None]:
def continued_entity(entity, last_entity):
    """Tell whether *entity* is the continuation of *last_entity*.

    Both dictionaries must contain the key "continued", and the offset of
    *entity* must equal the offset plus the length of *last_entity* plus
    one. Offsets and lengths are converted with int() before comparing.
    """
    if "continued" not in entity or "continued" not in last_entity:
        return False
    offset = int(entity["offset"])
    expected_offset = int(last_entity["offset"]) + int(last_entity["length"]) + 1
    return offset == expected_offset

In [None]:
def convert_gold_entities(entities_in):
    """Convert annotated metadata to a list of {"start", "end", "label"} spans.

    *entities_in* maps a label to a list of annotation dictionaries. Labels
    whose first annotation has no "offset" key are skipped. An annotation
    that continues the previous one of the same label (see continued_entity)
    extends the previous span instead of creating a new one.
    """
    spans = []
    for label, annotations in entities_in.items():
        if "offset" not in annotations[0]:
            continue
        previous = {}
        for annotation in annotations:
            if continued_entity(annotation, previous):
                spans[-1]["end"] = int(annotation["offset"]) + int(annotation["length"])
            else:
                begin = int(annotation["offset"])
                spans.append({"start": begin,
                              "end": begin + int(annotation["length"]),
                              "label": label})
            previous = annotation.copy()
    return spans

In [None]:
def ignore_entities(entities_in, labels_to_omit):
    """Return the entities whose "label" is not in *labels_to_omit*, in input order."""
    return [entity for entity in entities_in if entity["label"] not in labels_to_omit]

In [None]:
# Show the gold annotations of one text, leaving out the textStyle and
# unclear annotations.
# NOTE(review): assumes test_key is a key of both texts and metadata for the
# data_dir selected above - confirm when switching data sets.
test_key = 1831001
render_text(texts[test_key], ignore_entities(convert_gold_entities(metadata[test_key]), ["textStyle", "unclear"]))

## 3. Find entities in texts

In [None]:
import transformers

# Only let the transformers library log messages of level error.
transformers.utils.logging.set_verbosity_error()

In [None]:
def show_names(entities):
    """Print the GPE and PERSON names found in *entities*, one name per line.

    Words of items whose tag ends in GPE or PERSON are joined with spaces.
    An item whose tag starts with "B" first flushes (prints) the name
    collected so far; the last collected name is printed after the loop.
    """
    pending = ""
    for part in entities:
        tag = part["entity"]
        if re.search("^B", tag) and pending != "":
            print(pending)
            pending = ""
        if re.search("(GPE|PERSON)$", tag):
            pending = (pending + " " if pending != "" else "") + part["word"]
    if pending != "":
        print(pending)

Tested models (initial number indicates monthly downloads):
* (345) wietsedv/bert-base-dutch-cased-finetuned-conll2002-ner (several false positives)
* (74) Matthijsvanhof/bert-base-dutch-cased-finetuned-NER (not useful, tags everything)
* (16) wietsedv/bert-base-dutch-cased-finetuned-sonar-ner (some false positives)
* (13) proycon/bert-ner-cased-conll2002-nld (did not find any entities)
* (10) proycon/bert-ner-cased-sonar1-nld (found only one entity)
* (10) Matthijsvanhof/bert-base-dutch-cased-finetuned-NER8 (not useful, tags everything)
* (4) [wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner](https://huggingface.co/wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner) (few false positives) **SELECTED**

In [None]:
# NER pipeline using the model marked SELECTED in the list of tested models.
run_bert_pipeline = transformers.pipeline(task='ner', model='wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner')

In [None]:
# Try the pipeline on a single text.
# NOTE(review): assumes 2 is a key of texts; the keys depend on the selected
# data_dir (another cell uses 1831001) - confirm.
entities = run_bert_pipeline(texts[2])

In [None]:
# Print the GPE/PERSON names among the entities found for the sample text.
show_names(entities)

## 4. Post-process entities

Expand entities which end in the middle of a word

In [None]:
def expand_entities(entities_in, text):
    """Extend entities that stop in the middle of a word.

    For every entity, the "end" position is moved forward while the
    character at that position in *text* is a word character, a period, a
    comma or a hyphen; the consumed characters are appended to "word".

    Args:
        entities_in: iterable of dictionaries with at least "end" and "word".
        text: the text the entity positions refer to.

    Returns:
        A new list with shallow copies of the entities; the input
        dictionaries are not modified.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    continuation_char = re.compile(r"[\w.,-]")
    text_length = len(text)
    entities_out = []
    for entity_in in entities_in:
        entity_out = entity_in.copy()
        end = entity_out["end"]
        while end < text_length and continuation_char.match(text[end]):
            end += 1
        if end > entity_out["end"]:
            entity_out["word"] += text[entity_out["end"]:end]
            entity_out["end"] = end
        entities_out.append(entity_out)
    return entities_out

Combine successive entities when the second one has a label starting with `I-`, or when it has the same label as the previous entity and starts no more than one character after the previous entity ends.

In [None]:
def expand_last_entity(entities, entity):
    """Merge *entity* into the last item of *entities*, in place.

    The word of *entity* is appended to the last word after a space and
    the last item's "end" is set to the end of *entity*. Returns None.
    """
    last = entities[-1]
    last["word"] = last["word"] + " " + entity["word"]
    last["end"] = entity["end"]

In [None]:
def combine_entities(entities_in):
    """Merge successive entities that belong together; returns a new list.

    The first entity is copied as is. An entity whose tag starts with "I-"
    is merged into the previous one. For any other entity the tag prefix
    (B-, I- or E-) is replaced by "B-"; it is merged into the previous
    entity when it has the same tag and starts at most one position after
    the previous end. An entity that starts before the previous entity is
    reported with an error message and dropped.
    """
    merged = []
    for original in entities_in:
        current = original.copy()
        if not merged:
            merged.append(current)
            continue
        if re.search("^I-", current["entity"]):
            expand_last_entity(merged, current)
            continue
        current["entity"] = re.sub("^[BIE]-", "B-", current["entity"])
        previous = merged[-1]
        if current["start"] < previous["start"]:
            print("error: entities are not sorted by position!")
            continue
        touches_previous = current["start"] <= previous["end"] + 1
        if touches_previous and current["entity"] == previous["entity"]:
            expand_last_entity(merged, current)
        else:
            merged.append(current)
    return merged

In [None]:
def process_and_render_texts(texts):
    """Run the NER pipeline on every text in *texts* and render the result.

    *texts* maps a text id to a text. For each text the entities found by
    run_bert_pipeline are expanded and combined, the text id is printed and
    the text is rendered with the converted entities.
    """
    for text_id, text in texts.items():
        raw_entities = run_bert_pipeline(text)
        merged_entities = combine_entities(expand_entities(raw_entities, text))
        print(f"Text {text_id}")
        render_text(text, convert_guessed_entities(merged_entities))

In [None]:
# Render only the texts with an id below 3.
# NOTE(review): selects nothing when all text ids are larger (e.g. 1831001) - confirm for the chosen data_dir.
process_and_render_texts({ text_id:texts[text_id] for text_id in texts if text_id < 3})

## 5. Get name of deceased

In [None]:
def cleanup(text_in):
    """Normalize a text fragment for comparison and dictionary lookup.

    Whitespace runs are collapsed to a single space, every "- " (hyphen
    followed by a space, e.g. a word broken over two lines) is removed, the
    text is lowercased and commas and periods are deleted.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    single_spaced = re.sub(r"\s+", " ", text_in)
    dehyphenated = single_spaced.replace("- ", "")
    return re.sub(r"[,.]", "", dehyphenated.lower())

In [None]:
def find_text_patterns(query, text):
    """Find all matches of the regular expression *query* in lowercased *text*.

    Returns a list of {"start", "end"} dictionaries with the match
    positions, in order of appearance. Only the text is lowercased, not
    the query.
    """
    lowered = text.lower()
    return [{"start": match.start(), "end": match.end()}
            for match in re.finditer(query, lowered)]

In [None]:
def get_name_of_deceased(text, entities):
    """Look up the entity that directly follows a death phrase in *text*.

    For every match of "overleden is" or "is overleden" (optionally followed
    by a colon and/or comma) the word of the entity starting exactly one
    position after the match is collected; the empty string is collected
    when there is no such entity. When several entities start there, the
    last one in *entities* is used.

    Returns a tuple (list of names, number of matches of "levens?loos").
    """
    deceased = []
    for phrase in ("overleden is:?,?", "is overleden:?,?"):
        for hit in find_text_patterns(phrase, text):
            name_start = hit["end"] + 1
            following = [entity["word"] for entity in entities
                         if entity["start"] == name_start]
            deceased.append(following[-1] if following else "")
    stillborn_hits = find_text_patterns("levens?loos", text)
    return deceased, len(stillborn_hits)

In [None]:
def compare_names(results, metadata):
    """Compare the first guessed name with the annotated name.

    Args:
        results: tuple as returned by get_name_of_deceased; only the list
            of names in results[0] is used.
        metadata: dictionary that may hold "first_names" and "last_name".

    Returns:
        True when no name was guessed (empty list or empty first name) and
        when the cleaned-up guessed name equals the cleaned-up annotated
        "first_names last_name"; False when a name was guessed but one of
        the two metadata keys is missing, or when the names differ.
    """
    guesses = results[0]
    if not guesses or guesses[0] == "":
        return True
    if "first_names" not in metadata or "last_name" not in metadata:
        return False
    guessed_name = guesses[0]
    if re.search(r".,.", guessed_name):
        # "last, first" -> "first last"; raw strings avoid the invalid
        # escape sequence "\S" of the non-raw patterns
        first_part = re.sub(r"^[^,]+, *(\S.*)$", r"\1", guessed_name)
        last_part = re.sub(r"^([^,]+),.*$", r"\1", guessed_name)
        guessed_name = first_part + " " + last_part
    annotated_name = " ".join([metadata["first_names"], metadata["last_name"]])
    return cleanup(guessed_name) == cleanup(annotated_name)

In [None]:
def evaluate_deceased_names(results, nbr_of_names_found, nbr_of_stillborns_found, metadata):
    """Update the running counts for one text and check the guessed name.

    Args:
        results: tuple (names, number of stillborn matches) as returned by
            get_name_of_deceased.
        nbr_of_names_found: count so far of texts with a guessed name.
        nbr_of_stillborns_found: count so far of texts with a stillborn match.
        metadata: annotated names, passed on to compare_names.

    Returns:
        Tuple (updated name count, updated stillborn count, result of
        compare_names). The name count is raised when the first guessed
        name contains a word character, the stillborn count when at least
        one stillborn match was found.
    """
    guessed_names, nbr_of_stillborn_matches = results[0], results[1]
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    if guessed_names and re.search(r"\w", guessed_names[0]):
        nbr_of_names_found += 1
    if nbr_of_stillborn_matches > 0:
        nbr_of_stillborns_found += 1
    return nbr_of_names_found, nbr_of_stillborns_found, compare_names(results, metadata)

In [None]:
def get_metadata(metadata, text, keys):
    """Extract the text fragments annotated in *metadata* for the given *keys*.

    For every key present in *metadata*, each annotation's fragment
    text[offset:offset + length] is looked up; multiple fragments of one
    key are joined with single spaces. Keys without annotations are left
    out of the returned dictionary.
    """
    data = {}
    for key in keys:
        if key not in metadata:
            continue
        for annotation in metadata[key]:
            begin = int(annotation["offset"])
            fragment = text[begin: begin + int(annotation["length"])]
            data[key] = data[key] + " " + fragment if key in data else fragment
    return data

In [None]:
def print_name_correct(name_is_correct):
    """Print a colored "wrong name" marker when *name_is_correct* is false."""
    if name_is_correct:
        return
    print_with_color("wrong name")

In [None]:
# Read the gold-standard names from the CSV file and group them by file id:
# names maps a file id (made from the "scans" column) to the list of names
# of the rows with that id.
GOLD_DATA = "../../data/Overlijden/x-samples/three-columns-100.csv"
gold_data = pd.read_csv(GOLD_DATA)
names = {}
for key in gold_data.index:
    names_key = read_transkribus_files.make_file_id(gold_data["scans"][key])
    # Keep only the name parts that are strings (an empty CSV cell is not
    # a string). The name is rebuilt for every row, so a row without any
    # name yields "" instead of silently reusing the previous row's name
    # (or raising NameError on the first row).
    name_parts = [gold_data[column][key] for column in ("first_names", "last_name")]
    name = " ".join(part for part in name_parts if isinstance(part, str))
    names.setdefault(names_key, []).append(name)

In [None]:
# Guess the name of the deceased for every text, compare it with the gold
# name from the CSV file and print summary counts.
nbr_of_names_found = 0
nbr_of_stillborns_found = 0

for text_id in sorted(texts.keys()):
    text = texts[text_id]
    entities = run_bert_pipeline(text)
    entities = combine_entities(expand_entities(entities, text))
    results = get_name_of_deceased(text, entities)
    gold_metadata = get_metadata(metadata[text_id], text, ["first_names", "last_name"])
    # name_is_correct can be shown with print_name_correct(name_is_correct)
    nbr_of_names_found, nbr_of_stillborns_found, name_is_correct = evaluate_deceased_names(
        results, nbr_of_names_found, nbr_of_stillborns_found, gold_metadata)
    # results[0] is empty when no death phrase was found and the CSV file
    # may lack a text id; fall back to "" instead of raising
    # IndexError/KeyError halfway through the evaluation.
    guessed_name = results[0][0] if results[0] else ""
    gold_name = names[text_id][0] if names.get(text_id) else ""
    print(guessed_name.lower() == gold_name.lower(), text_id, guessed_name, gold_name)
print(f"Records: {len(texts)}; Names found: {nbr_of_names_found};", end=" ")
print(f"Stillborns: {nbr_of_stillborns_found}; Missing: {len(texts)-nbr_of_names_found-nbr_of_stillborns_found}")

## 6. Get date of death

In [None]:
import sys

In [None]:
# Dutch ordinal words (ending in -n) mapped to their value, 1 up to 31.
# The compound numbers are also listed spelled as three separate words
# ("een en twintigsten").
ordinals = { "eersten": 1, "tweeden": 2, "derden": 3, "vierden": 4, "vijfden": 5,
             "zesden": 6, "zevenden": 7, "achtsten": 8, "negenden": 9, "tienden": 10,
             "elfden": 11, "twaalfden": 12, "dertienden": 13, "veertienden": 14, "vijftienden": 15,
             "zestienden": 16, "zeventienden": 17, "achttienden": 18, "negentienden": 19, "twintigsten": 20,
             "eenentwintigsten": 21, "tweeentwintigsten": 22, "drieentwintigsten": 23, "vierentwintigsten": 24, "vijfentwintigsten": 25,
             "zesentwintigsten": 26, "zevenentwintigsten": 27, "achtentwintigsten": 28, "negenentwintigsten": 29, "dertigsten": 30,
             "eenendertigsten": 31,
             "een en twintigsten": 21, "twee en twintigsten": 22, "drie en twintigsten": 23, "vier en twintigsten": 24, "vijf en twintigsten": 25,
             "zes en twintigsten": 26, "zeven en twintigsten": 27, "acht en twintigsten": 28, "negen en twintigsten": 29, 
             "een en dertigsten": 31,
            }

In [None]:
# Dutch cardinal words mapped to their value: 1-19 and the tens 20-90.
# Compound numbers are put together by number_parser.
cardinals = {             "een": 1,  "twee": 2,     "drie": 3,     "vier": 4,      "vijf": 5,      "zes": 6,      "zeven": 7,      "acht": 8,     "negen": 9,
              "tien": 10, "elf": 11, "twaalf": 12,  "dertien": 13, "veertien": 14, "vijftien": 15, "zestien": 16, "zeventien": 17, "achttien":18, "negentien": 19,
                                     "twintig": 20, "dertig": 30,  "veertig": 40,  "vijftig": 50,  "zestig": 60,  "zeventig": 70,  "tachtig": 80, "negentig": 90, } 

In [None]:
# Remaining number words: the connector "en" and the multipliers.
# number_parser handles these words by name; longest_number_match only uses the keys.
others = { "en": 0, "honderd": 100, "duizend": 1000, }

In [None]:
# Month names mapped to their number, including the spelling variants
# "july" and "october". The empty string maps to 0, so a missing month can
# be looked up as well.
date_months = { '': 0, "januari": 1, "februari": 2, "maart": 3, "april": 4, "mei": 5, "juni": 6,
                "juli": 7, "augustus": 8, "september": 9, "oktober": 10, "november": 11, "december": 12,
                "july": 7, "october": 10, }

In [None]:
def get_next_token(position, text):
    """Return the next whitespace-delimited token of *text* from *position*.

    Leading whitespace is skipped first.

    Returns:
        Tuple (token, position just after the token). The token is the
        empty string when only whitespace, or nothing, is left.
    """
    text_length = len(text)
    # The bound is text_length rather than text_length - 1, otherwise the
    # last character of the text can never be part of a token.
    while position < text_length and text[position].isspace():
        position += 1
    token_start = position
    while position < text_length and not text[position].isspace():
        position += 1
    return text[token_start:position], position

In [None]:
def get_date_day(position, text):
    """Parse the day number that starts at *position* in *text*.

    Delegates to number_parser. The token-based implementation that used
    to follow the return statement could never be reached and has been
    removed.

    Returns:
        Tuple (day as a number, number of characters consumed), or (0, 0)
        when no number was recognized.
    """
    return number_parser(text[position:])

In [None]:
def get_date_month(position, text):
    """Read a month name from *text*, starting at *position*.

    The next token is accepted when its cleaned-up form is a key of
    date_months. A token ending in a hyphen is glued to the following token
    (without the hyphen) and the combination is tried instead.

    Returns:
        Tuple (month text as found, number of characters from *position*
        to the end of the month), or ("", 0) when no month was found.
    """
    token, token_end = get_next_token(position, text)
    month, month_end = "", token_end
    if cleanup(token) in date_months:
        month = token
    elif re.search("-$", token):
        continuation, continuation_end = get_next_token(token_end, text)
        glued = re.sub("-$", "", token) + continuation
        if cleanup(glued) in date_months:
            month, month_end = glued, continuation_end
    if not month:
        return month, 0
    return month, month_end - position

In [None]:
def get_date_year(position, text):
    """Parse the year that starts at *position* in *text*.

    When the first token is "des", that token and the token after it are
    skipped before the number is parsed.
    NOTE(review): the skipped second token is presumably "jaars"; it is
    not checked - confirm.

    Returns:
        Tuple (year as a number, number of characters from *position* to
        the end of the year), or (0, 0) when no number was recognized.
    """
    next_token, next_position = get_next_token(position, text)
    if next_token.lower() != "des":
        return number_parser(text[position:])
    next_token, next_position = get_next_token(next_position, text)
    year, year_length = number_parser(text[next_position:])
    if year_length:
        # Report the length relative to position, like the other branch,
        # by including the two skipped tokens.
        year_length += next_position - position
    return year, year_length

In [None]:
def longest_number_match(text):
    """Find the longest number word at the start of *text*.

    Leading whitespace is skipped. Prefixes of up to 24 characters are
    cleaned up and looked up in cardinals, ordinals and others; the longest
    prefix that is a known number word wins.

    Returns:
        Tuple (cleaned-up number word, index in *text* just after the
        matched prefix), or ("", 0) when nothing matched.
    """
    text_index = 0
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    while text_index < len(text) and re.search(r"\s", text[text_index]):
        text_index += 1
    longest_match = ""
    longest_match_length = 0
    # Prefix ends beyond the end of the text only repeat the last prefix.
    last_end = min(text_index + 25, len(text) + 1)
    for prefix_end in range(text_index, last_end):
        phrase = cleanup(text[text_index: prefix_end])
        if phrase == longest_match:
            # same phrase as before (e.g. only a comma was added): keep the
            # shorter length
            continue
        if phrase in cardinals or phrase in ordinals or phrase in others:
            longest_match = phrase
            longest_match_length = prefix_end
    return longest_match, longest_match_length

In [None]:
def split_off_hundreds_thousands(tokens):
    """Split a first token such as "achttienhonderd" into two tokens, in place.

    When the lowercased first token contains "honderd" or "duizend"
    preceded by at least one character, it is replaced by the part before
    that word and a second token starting at the last "honderd" or
    "duizend". Other token lists are left untouched. Returns None.
    """
    head = tokens[0].lower()
    if not re.search(".(honderd|duizend)", head):
        return
    prefix = re.sub("(honderd|duizend).*", "", head)
    remainder = re.sub(".*(honderd|duizend)", "\\1", head)
    tokens[:1] = [prefix, remainder]


def number_parser(text):
    """Parse a number written out in Dutch words at the start of *text*.

    The number words are read one by one with longest_number_match and the
    rest of the text is parsed recursively: "en" is skipped, a cardinal
    followed by "honderd" or "duizend" is multiplied by 100 or 1000, and
    all values found are added up.

    Returns:
        Tuple (value, number of characters consumed); (0, 0) for an empty
        text or when the text does not start with a number word.
    """
    if not text:
        return 0, 0
    first_word, first_end = longest_number_match(text)
    second_word, second_end = longest_number_match(text[first_end:])
    first = cleanup(first_word)
    second = cleanup(second_word)

    if first == "en":
        value, consumed = number_parser(text[first_end:])
        return value, consumed + first_end

    if first in cardinals:
        multipliers = {"honderd": 100, "duizend": 1000}
        if second in multipliers:
            rest_start = first_end + second_end
            value, consumed = number_parser(text[rest_start:])
            return multipliers[second] * cardinals[first] + value, rest_start + consumed
        value, consumed = number_parser(text[first_end:])
        return cardinals[first] + value, first_end + consumed

    if first in ordinals:
        value, consumed = number_parser(text[first_end:])
        return ordinals[first] + value, first_end + consumed

    return 0, 0

In [None]:
def get_dates(texts, text_id, pattern):
    """Collect the dates that follow each match of *pattern* in a text.

    Right after every match of the regular expression *pattern* a day,
    a month and a year are parsed, in that order.

    Args:
        texts: dictionary mapping text ids to texts.
        text_id: key of the text to search.
        pattern: regular expression that introduces a date, e.g. "op den".

    Returns:
        The (day, month, year) tuples, filtered by summarize_dates.
    """
    # The NER pipeline used to be run here, but its entities were never
    # used; the call is left out because it only cost time.
    text = texts[text_id]
    dates = []
    for position in find_text_patterns(pattern, text):
        date_start = position["end"]
        day, token_length_day = get_date_day(date_start, text)
        month, token_length_month = get_date_month(date_start + token_length_day, text)
        year, _ = get_date_year(date_start + token_length_day + token_length_month, text)
        dates.append((day, month, year))
    return summarize_dates(dates)


def complete_date(date):
    """Tell whether *date* has a day (not 0), a month (not "") and a year (not 0)."""
    return not (date[0] == 0 or date[1] == "" or date[2] == 0)


def contains_complete_date(dates):
    """Tell whether at least one of *dates* is complete (see complete_date).

    An empty (or otherwise false) *dates* yields False.
    """
    if not dates:
        return False
    return any(complete_date(date) for date in dates)


def summarize_dates(dates_in):
    """Filter a list of (day, month, year) tuples.

    When the list holds at least one complete date, only the complete dates
    are kept. Otherwise every date with at least one known part is kept;
    dates without any known part (0, "", 0) are always dropped.
    """
    only_complete = contains_complete_date(dates_in)
    dates_out = []
    for candidate in dates_in:
        day, month, year = candidate
        nothing_known = day == 0 and month == "" and year == 0
        if complete_date(candidate) or not (only_complete or nothing_known):
            dates_out.append(candidate)
    return dates_out

def print_dates(texts, text_id, dates, note=""):
    """Print the summarized *dates* of one text, one line per date.

    When the day is a word from ordinals or cardinals, the date is also
    shown in numeric form. Other complete dates are printed as they are;
    incomplete dates, and the message for texts without dates, are printed
    in color.

    Args:
        texts: unused; kept so that existing calls stay valid.
        text_id: id shown at the start of each line.
        dates: list of (day, month, year) tuples.
        note: text added to the end of each line.
    """
    summarized_dates = summarize_dates(dates)
    if not summarized_dates:
        print_with_color(f"Text {text_id}: (no dates found)\n")
    for day, month, year in summarized_dates:
        numeric_date = None
        for day_values in (ordinals, cardinals):
            try:
                numeric_date = f"{day_values[cleanup(day)]}-{date_months[cleanup(month)]}-{year}"
            except (KeyError, TypeError):
                # KeyError: unknown day or month word; TypeError: the day is
                # already a number, which cleanup cannot handle. A bare
                # except would also swallow KeyboardInterrupt.
                continue
            break
        if numeric_date is not None:
            print(f"Text {text_id}: {day} {month} {year} ({numeric_date}) {note}")
        elif day != 0 and month != "" and year != 0:
            print(f"Text {text_id}: {day} {month} {year} {note}")
        else:
            print_with_color(f"Text {text_id}: {day} {month} {year} {note}\n")

def get_death_date(texts, text_id):
    """Find, print and return the date(s) of death mentioned in a text.

    Dates after "op den" are preferred. When there are none, the dates
    after "op" are used: a date with a year is kept as is; a date without
    a year takes the year of the document date, or the year before when
    the document month is earlier than the month of the date. Without a
    document year the date is kept as found.
    """
    dates = get_dates(texts, text_id, "op den")
    if not dates:
        candidates = get_dates(texts, text_id, "op")
        document_dates = get_document_date(texts, text_id)
        document_year_known = bool(document_dates) and document_dates[0][2] != 0
        for date in candidates:
            day, month, year = date[0], date[1], date[2]
            if year != 0:
                dates.append(date)
            elif document_year_known:
                if day != 0 or month != "":
                    document_month, document_year = document_dates[0][1], document_dates[0][2]
                    if date_months[cleanup(document_month)] < date_months[cleanup(month)]:
                        # the month lies after the document month, so the
                        # previous year is used
                        document_year = document_year - 1
                    dates.append((day, month, document_year))
            elif day != 0 or month != "" or year != 0:
                dates.append(date)
    print_dates(texts, text_id, dates)
    return dates
    
    
def get_document_date(texts, text_id):
    """Return the dates found after "heden", or else those after "heden den"."""
    return get_dates(texts, text_id, "heden") or get_dates(texts, text_id, "heden den")

In [None]:
# Render the text with id 21 only.
# NOTE(review): selects nothing when 21 is not a key of texts - confirm for the chosen data_dir.
process_and_render_texts({ text_id:texts[text_id] for text_id in texts if text_id == 21})

In [None]:
# Determine the date of death of every text (get_death_date prints the
# dates as well) and count the dates that lack a day, a month or a year.
nbr_of_incomplete_dates = 0
for text_id in sorted(texts.keys()):
    dates = get_death_date(texts, text_id)
    for date in dates:
        # same condition as the negation of complete_date(date)
        if date[0] == 0 or date[1] == "" or date[2] == 0:
            nbr_of_incomplete_dates += 1
print(f"number of incomplete dates: {nbr_of_incomplete_dates}")

**Notes Training set V2:**

- 3: spelling error: twinttigsten
- 8: spelling error: teen duizend
- 18: spelling error: twintigste
- 27: spelling error: decemder
- 61: spelling error: achtiende

**Notes Sample regex:**

- 1: extra space
- 2: extra space
- 5: extra space
- 10: spelling error
- 12: extra space
- 15: spelling error
- 19: month as number
- 21: no month
- ...

## 99. Tests

In [None]:
import unittest

In [None]:
class TestNotebook(unittest.TestCase):    
    # Checks that process_custom_attrib turns a custom attribute string
    # into a dictionary of dictionaries with string values.
    # NOTE(review): process_custom_attrib is not defined in any visible cell
    # of this notebook, so this test fails with NameError unless the name
    # is defined elsewhere; it presumably lives in
    # scripts.read_transkribus_files now - confirm and qualify the call.
    def test_process_custom_attrib(self):
        self.assertEqual(process_custom_attrib("readingOrder {index:1;} certificate_date {offset:10; length:25; continued:true;}"),
                         { 'readingOrder': { 'index': '1' },
                           'certificate_date': { 'offset': '10', 'length': '25', 'continued': 'true' } } )

In [None]:
# Run the tests inside the notebook: argv=[''] keeps unittest from reading
# the command line of the kernel, exit=False keeps it from calling sys.exit().
unittest.main(argv=[''], verbosity=2, exit=False)