# Explore manually annotated Curaçao files

## 1. Read files

In [None]:
import ast
import json
import re
import xml.etree.ElementTree as ET

In [None]:
# Directory with the annotated XML files; read_files() looks for pNNN.xml files in it.
data_dir = "../../data/Training_set_V2"

In [None]:
def get_text_from_file(file_name):
    """Parse the XML file *file_name* and return the text of its text lines.

    The document root is handed to get_text_from_xml(), which does the
    actual extraction. Parsing errors raised by ElementTree propagate to
    the caller.
    """
    document_root = ET.parse(file_name).getroot()
    return get_text_from_xml(document_root)

In [None]:
def get_text_from_xml(root):
    """Return the text of all TextLine elements below *root*, one per line.

    For every ``TextLine`` element (in any namespace) the ``custom``
    attribute is parsed with make_custom_dict() and the text of each
    ``TextEquiv/Unicode`` child is passed through remove_strikethroughs().
    Every extracted line is terminated with a newline character.

    An empty ``Unicode`` element has ``.text`` equal to ``None``; it is
    emitted as an empty line instead of raising a TypeError.
    """
    lines = []
    for textline in root.findall(".//{*}TextLine"):
        custom_dict = make_custom_dict(textline.attrib)
        for unicode_element in textline.findall("./{*}TextEquiv/{*}Unicode"):
            raw_line = unicode_element.text
            # ElementTree reports the text of an empty element as None.
            cleaned_line = remove_strikethroughs(raw_line, custom_dict) if raw_line else ""
            lines.append(cleaned_line + "\n")
    # Join once instead of growing a string inside the loop.
    return "".join(lines)

In [None]:
def make_custom_dict(text_line_attributes):
    """Parse the ``custom`` attribute of a text line into a dictionary.

    The attribute value is split on whitespace and read as a sequence of
    ``key {...}`` pairs; a value continues over following tokens until a
    token ends with ``}``. Each value is quoted by json_string_add_quotes()
    and evaluated with ast.literal_eval().

    Returns a dict mapping each key to the list of its parsed values (a key
    may occur more than once), or an empty dict when there is no ``custom``
    attribute. A key without a following value raises IndexError.
    """
    if "custom" not in text_line_attributes:
        return {}
    tokens = text_line_attributes["custom"].split()
    parsed = {}
    cursor = 0
    while cursor < len(tokens):
        key = tokens[cursor]
        value = tokens[cursor + 1]
        cursor += 2
        # A value may contain spaces: keep appending tokens until the closing brace.
        while cursor < len(tokens) and not re.search("}$", value):
            value += " " + tokens[cursor]
            cursor += 1
        entry = ast.literal_eval(json_string_add_quotes(value))
        parsed.setdefault(key, []).append(entry)
    return parsed

In [None]:
def remove_strikethroughs(text_line, custom_dict):
    """Replace struck-through characters of *text_line* by spaces.

    *custom_dict* is the dictionary produced by make_custom_dict(). Every
    entry of its ``textStyle`` list that has a ``strikethrough`` key marks
    the range ``offset`` .. ``offset + length`` for blanking. Characters are
    replaced one-for-one, so the length of the line does not change.

    Ranges are clipped to the line: a range extending past the end of the
    line no longer raises IndexError and a negative offset no longer blanks
    characters counted from the end. ``None`` (the text of an empty XML
    element) is returned as an empty string.
    """
    if text_line is None:
        return ""
    if "textStyle" not in custom_dict:
        return text_line
    chars = list(text_line)
    for style in custom_dict["textStyle"]:
        if "strikethrough" not in style:
            continue
        offset = int(style["offset"])
        # Clip the range to the characters that actually exist.
        start = max(0, offset)
        stop = min(len(chars), offset + int(style["length"]))
        if stop > start:
            chars[start:stop] = " " * (stop - start)
    return "".join(chars)

In [None]:
def json_string_add_quotes(string):
    """Add quotes to a ``{key:value; key:value;}`` string.

    The result is a Python dict literal with single-quoted keys and values,
    e.g. ``{index:0;}`` becomes ``{ 'index': '0' }``. The substitutions are
    applied in the order listed; later ones rely on the output of earlier
    ones.
    """
    substitutions = (
        ("; *}", "' }"),    # final separator before the closing brace
        ("} *'", "} "),     # drop a quote directly after a closing brace
        ("; *", "', '"),    # separators between key/value pairs
        (": *", "': '"),    # separator between a key and its value
        ("{ *", "{ '"),     # opening brace plus the quote of the first key
    )
    quoted = string
    for pattern, replacement in substitutions:
        quoted = re.sub(pattern, replacement, quoted)
    return quoted

In [None]:
def make_file_name(file_id):
    """Return the XML file name for *file_id*: ``p`` plus the id zero-padded to 3 characters."""
    padded_id = str(file_id).zfill(3)
    return "p{}.xml".format(padded_id)

In [None]:
def read_files():
    """Read the files p001.xml .. p099.xml from ``data_dir``.

    Returns a dict mapping the numeric file id to the text extracted by
    get_text_from_file(). Ids without a file are skipped silently. Any other
    failure (for example malformed XML) still skips the file, as before,
    but is now reported instead of being hidden. KeyboardInterrupt is no
    longer swallowed.
    """
    texts = {}
    for file_id in range(1, 100):
        file_path = data_dir + "/" + make_file_name(file_id)
        try:
            texts[file_id] = get_text_from_file(file_path)
        except FileNotFoundError:
            # Not every id in the range needs to have a file.
            continue
        except Exception as error:
            print(f"warning: skipping {file_path}: {error!r}")
    return texts

In [None]:
# File id -> extracted text, for every file that could be read.
texts = read_files()

## 2. Find entities in texts

In [None]:
import transformers

# Restrict the log output of transformers to error messages.
transformers.utils.logging.set_verbosity_error()

In [None]:
def show_names(entities):
    """Print the GPE and PERSON names found in *entities*, one per line.

    *entities* is a list of dicts with an ``entity`` tag and a ``word``.
    Words whose tag ends in ``GPE`` or ``PERSON`` are collected, separated
    by spaces; a tag starting with ``B`` prints the name collected so far
    and starts a new one.
    """
    pending = ""
    for token in entities:
        tag = token["entity"]
        if pending != "" and re.search("^B", tag):
            print(pending)
            pending = ""
        if not re.search("(GPE|PERSON)$", tag):
            continue
        if pending == "":
            pending = pending + token["word"]
        else:
            pending = pending + " " + token["word"]
    # Flush the name that was still being collected.
    if pending != "":
        print(pending)

Tested models (initial number indicates monthly downloads):
* (345) wietsedv/bert-base-dutch-cased-finetuned-conll2002-ner (several false positives)
* (74) Matthijsvanhof/bert-base-dutch-cased-finetuned-NER (not useful, tags everything)
* (16) wietsedv/bert-base-dutch-cased-finetuned-sonar-ner (some false positives)
* (13) proycon/bert-ner-cased-conll2002-nld (did not find any entities)
* (10) proycon/bert-ner-cased-sonar1-nld (found only one entity)
* (10) Matthijsvanhof/bert-base-dutch-cased-finetuned-NER8 (not useful, tags everything)
* (4) [wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner](https://huggingface.co/wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner) (few false positives) **SELECTED**

In [None]:
# NER pipeline for the model marked as selected in the list above.
run_bert_pipeline = transformers.pipeline(task='ner', model='wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner')

In [None]:
# Tag the text with id 2 as an example.
entities = run_bert_pipeline(texts[2])

In [None]:
# Print the PERSON and GPE names found in the example text.
show_names(entities)

## 3. Visualize entities

For the list of entity tags used by the model `wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner`, see [OntoNotes](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf), page 21.

In [None]:
from spacy import displacy

In [None]:
def convert_entities(entities_in):
    """Convert tagged tokens into the span dicts passed to displacy.

    Each input dict has an ``entity`` tag such as ``B-PERSON`` plus
    ``start`` and ``end`` offsets. A tag starting with ``B`` (or the very
    first token) opens a new span labelled with the tag minus its
    two-character prefix; any other token extends the end of the last span.

    Returns a list of ``{"start", "end", "label"}`` dicts.
    """
    spans = []
    for token in entities_in:
        tag = token["entity"]
        prefix, label = tag[0], tag[2:]
        if spans and prefix != "B":
            # Continuation token: stretch the current span.
            spans[-1]["end"] = token["end"]
            continue
        spans.append(dict(start=token["start"], end=token["end"], label=label))
    return spans

In [None]:
def render_text(text, entities):
    """Render *text* with its *entities* highlighted by displacy.

    Every newline in the text is replaced by a single space, so character
    offsets stay the same. The entities are converted with
    convert_entities() and PERSON spans are coloured orange.
    """
    document = {
        "text": re.sub("\\n", " ", text),
        "ents": convert_entities(entities),
    }
    display_options = {"colors": {"PERSON": "orange"}}
    displacy.render(document, style="ent", manual=True, options=display_options)

In [None]:
# Visualize the unprocessed pipeline output for the example text.
render_text(texts[2], entities)

## 4. Post-process entities

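Expand entities that end in the middle of a word: an entity is extended up to the end of the word, including any directly following `.`, `,` or `-` characters.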

In [None]:
def expand_entities(entities_in, text):
    """Extend every entity to the end of the word it stops in.

    While the character at an entity's ``end`` offset in *text* is a word
    character or one of ``.``, ``,`` and ``-``, that character is appended
    to the entity's ``word`` and ``end`` is moved one position to the right.

    Returns a new list of shallow copies; the dicts in *entities_in* are
    left untouched.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    continuation_char = re.compile(r"[\w.,-]")
    entities_out = []
    for entity_in in entities_in:
        entity_out = entity_in.copy()
        new_end = entity_out["end"]
        while new_end < len(text) and continuation_char.match(text[new_end]):
            new_end += 1
        if new_end > entity_out["end"]:
            entity_out["word"] += text[entity_out["end"]:new_end]
            entity_out["end"] = new_end
        entities_out.append(entity_out)
    return entities_out

Combine successive entities where the second one either has a label starting with `I-`, or has the same label as the previous entity and starts at most one character after the end of that entity.

In [None]:
def combine_entities(entities_in):
    """Merge entities that belong together into single entities.

    The first entity is always kept. A later entity is merged into the
    previous output entity (via expand_last_entity()) when its tag starts
    with ``I-``, or when, after rewriting its ``B-``/``I-``/``E-`` prefix to
    ``B-``, it has the same tag as the previous entity and starts no later
    than one position after that entity's end. An entity that starts before
    the previous one triggers an error message and is dropped.

    Works on copies of the input dicts and returns the merged list.
    """
    merged = []
    for source in entities_in:
        current = source.copy()
        if not merged:
            merged.append(current)
            continue
        if re.search("^I-", current["entity"]):
            expand_last_entity(merged, current)
            continue
        current["entity"] = re.sub("^[BIE]-", "B-", current["entity"])
        previous = merged[-1]
        if current["start"] < previous["start"]:
            print("error: entities are not sorted by position!")
            continue
        adjacent = current["start"] <= previous["end"] + 1
        if adjacent and current["entity"] == previous["entity"]:
            expand_last_entity(merged, current)
        else:
            merged.append(current)
    return merged

In [None]:
def expand_last_entity(entities, entity):
    """Merge *entity* into the last element of *entities*, in place.

    The word of *entity* is appended after a space and the ``end`` offset of
    the last element is set to that of *entity*.
    """
    last = entities[-1]
    last["word"] = last["word"] + " " + entity["word"]
    last["end"] = entity["end"]

In [None]:
# Tag the example text again so that post-processing starts from the raw pipeline output.
entities = run_bert_pipeline(texts[2])

In [None]:
# First expand the entities, then merge the ones that belong together.
entities = combine_entities(expand_entities(entities, texts[2]))

In [None]:
# Visualize the post-processed entities of the example text.
render_text(texts[2], entities)

In [None]:
def process_and_render_texts(texts):
    """Tag, post-process and render every text in *texts*.

    *texts* maps a text id to its text. For each entry the NER pipeline is
    run, the entities are expanded and combined, a ``Text <id>`` header is
    printed and the result is rendered with render_text().
    """
    for text_id, text in texts.items():
        raw_entities = run_bert_pipeline(text)
        expanded_entities = expand_entities(raw_entities, text)
        final_entities = combine_entities(expanded_entities)
        print(f"Text {text_id}")
        render_text(text, final_entities)

In [None]:
# Process and render only the texts with an id below 3.
process_and_render_texts({ text_id:texts[text_id] for text_id in texts if text_id < 3})

## 5. Get name of deceased

In [None]:
def find_text_patterns(query, text):
    """Return the positions of all matches of the regular expression *query* in *text*.

    Each position is a dict with the ``start`` and ``end`` offsets of one
    non-overlapping match, in order of occurrence.
    """
    return [
        {"start": match.start(), "end": match.end()}
        for match in re.finditer(query, text)
    ]

In [None]:
def get_name_of_deceased(text, entities):
    """Look up the names that follow the phrase "overleden is" in *text*.

    For every match of ``overleden is:?,?`` the word of the entity starting
    exactly one character after the match is taken as the name; when several
    entities start there the last one wins, and when none does the name is
    an empty string.

    Returns a tuple ``(names, count)`` where *names* has one entry per match
    and *count* is the number of matches of ``levens?loos`` in the text.
    """
    deceased = []
    for phrase in find_text_patterns("overleden is:?,?", text):
        expected_start = phrase["end"] + 1
        found_name = ""
        for candidate in entities:
            if candidate["start"] == expected_start:
                found_name = candidate["word"]
        deceased.append(found_name)
    lifeless_mentions = find_text_patterns("levens?loos", text)
    return deceased, len(lifeless_mentions)

In [None]:
# For every text: tag it, post-process the entities and print the result of
# get_name_of_deceased() (the list of names and the "levens?loos" match count).
for text_id in texts:
    text = texts[text_id]
    entities = run_bert_pipeline(text)
    entities = combine_entities(expand_entities(entities, text))
    # Keep the label and the result on the same output line.
    print(f"Text {text_id}:", end=" ")
    print(get_name_of_deceased(text, entities))

## 6. Get date of death

In [None]:
# Lookup table from lower-cased Dutch ordinal words to the day of the month.
# Three spellings are listed: ending in -n ("eersten"), ending in -e ("eerste")
# and the spaced compounds ("een en twintigsten", "een en twintigste").
# The empty string maps to 0; get_date_day() returns "" when it finds no day.
date_days = { '': 0, "eersten": 1, "tweeden": 2, "derden": 3, "vierden": 4, "vijfden": 5,
              "zesden": 6, "zevenden": 7, "achtsten": 8, "negenden": 9, "tienden": 10,
              "elfden": 11, "twaalfden": 12, "dertienden": 13, "veertienden": 14, "vijftienden": 15,
              "zestienden": 16, "zeventienden": 17, "achttienden": 18, "negentienden": 19, "twintigsten": 20,
              "eenentwintigsten": 21, "tweeentwintigsten": 22, "drieentwintigsten": 23, "vierentwintigsten": 24, "vijfentwintigsten": 25,
              "zesentwintigsten": 26, "zevenentwintigsten": 27, "achtentwintigsten": 28, "negenentwintigsten": 29, "dertigsten": 30,
              "eenendertigsten": 31,
              "eerste": 1, "tweede": 2, "derde": 3, "vierde": 4, "vijfde": 5,
              "zesde": 6, "zevende": 7, "achtste": 8, "negende": 9, "tiende": 10,
              "elfde": 11, "twaalfde": 12, "dertiende": 13, "veertiende": 14, "vijftiende": 15,
              "zestiende": 16, "zeventiende": 17, "achttiende": 18, "negentiende": 19, "twintigste": 20,
              "eenentwintigste": 21, "tweeentwintigste": 22, "drieentwintigste": 23, "vierentwintigste": 24, "vijfentwintigste": 25,
              "zesentwintigste": 26, "zevenentwintigste": 27, "achtentwintigste": 28, "negenentwintigste": 29, "dertigste": 30,
              "eenendertigste": 31,
              "een en twintigsten": 21, "twee en twintigsten": 22, "drie en twintigsten": 23, "vier en twintigsten": 24, "vijf en twintigsten": 25,
              "zes en twintigsten": 26, "zeven en twintigsten": 27, "acht en twintigsten": 28, "negen en twintigsten": 29, 
              "een en dertigsten": 31,
              "een en twintigste": 21, "twee en twintigste": 22, "drie en twintigste": 23, "vier en twintigste": 24, "vijf en twintigste": 25,
              "zes en twintigste": 26, "zeven en twintigste": 27, "acht en twintigste": 28, "negen en twintigste": 29, 
              "een en dertigste": 31,
            }

In [None]:
# Lookup table from lower-cased month names to month numbers. "july" and
# "october" are listed next to "juli" and "oktober" as alternative spellings.
# The empty string maps to 0; get_date_month() returns "" when it finds no month.
date_months = { '': 0, "januari": 1, "februari": 2, "maart": 3, "april": 4, "mei": 5, "juni": 6,
                "juli": 7, "augustus": 8, "september": 9, "oktober": 10, "november": 11, "december": 12,
                "july": 7, "october": 10, }

In [None]:
def get_next_token(position, text):
    """Return the next whitespace-delimited token of *text* at or after *position*.

    Leading whitespace is skipped first. The result is a tuple
    ``(token, end_position)`` where *end_position* is the index just after
    the token. At the end of the text the token is an empty string.

    The scan now runs up to ``len(text)``; the former ``len(text) - 1`` bound
    dropped the final character of a text that does not end in whitespace.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    whitespace = re.compile(r"\s")
    text_length = len(text)
    while position < text_length and whitespace.match(text[position]):
        position += 1
    token_start = position
    while position < text_length and not whitespace.match(text[position]):
        position += 1
    return text[token_start:position], position

In [None]:
def cleanup(token):
    """Return *token* in lower case without its final non-word character.

    At most one trailing character is removed, so ``"Maart,"`` becomes
    ``"maart"`` while ``"maart,."`` becomes ``"maart,"``.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    return re.sub(r"\W?$", "", token.lower())

In [None]:
def get_date_day(position, text):
    """Read the day of the month that starts at *position* in *text*.

    The next token is looked up in ``date_days`` after cleanup(); when it is
    not found, the phrase is extended with the following token, up to three
    tokens in total (e.g. "een en twintigsten").

    Returns ``(day, length)`` where *day* is the matched phrase as it occurs
    in the text, with tokens joined by single spaces, and *length* is the
    number of characters between *position* and the end of the phrase.
    Without a match the result is ``("", 0)``.

    Each following token is read from the end position of the previous one.
    The previous offset arithmetic (``position + len(token) + 1``) was only
    correct when tokens were separated by exactly one whitespace character.
    """
    day = ""
    phrase = ""
    end_position = position
    # The longest entries of date_days consist of three words.
    for _ in range(3):
        token, end_position = get_next_token(end_position, text)
        phrase = phrase + " " + token if phrase else token
        if cleanup(phrase) in date_days:
            day = phrase
            break
    if day:
        return day, end_position - position
    return day, 0

In [None]:
def get_date_month(position, text):
    """Read the month name that starts at *position* in *text*.

    The next token is looked up in ``date_months`` after cleanup(). A token
    ending in a hyphen is joined with the token after it (hyphen removed)
    and looked up again, which covers month names split over two tokens.

    Returns ``(month, length)`` where *length* is the number of characters
    between *position* and the end of the month, or ``("", 0)`` when no
    month is found.
    """
    token, token_end = get_next_token(position, text)
    if cleanup(token) in date_months:
        found, found_end = token, token_end
    elif re.search("-$", token):
        continuation, continuation_end = get_next_token(token_end, text)
        found = re.sub("-$", "", token) + continuation
        found_end = continuation_end
        if cleanup(found) not in date_months:
            return "", 0
    else:
        return "", 0
    if not found:
        return found, 0
    return found, found_end - position

In [None]:
def get_date_year(position, text):
    """Read a year written out in words, starting at *position* in *text*.

    The first two tokens after *position* are skipped. If the third token is
    "een" and the fourth is "duizend", further tokens are collected until a
    token equal to "te" (not included) or a token ending in a comma
    (included) is reached.

    Returns ``(year, next_position)``. *year* is the collected phrase; it is
    an empty string when the third token is not "een" and just that token
    when "duizend" does not follow. Unlike get_date_day() and
    get_date_month(), the second element is a position in *text*, not a
    length.

    The collecting loop now also stops at the end of the text. Before, it
    never ended when the text ran out first, because get_next_token() keeps
    returning an empty token there.
    """
    year = ""
    next_position = position
    # Skip two tokens; the third one is the candidate start of the year.
    for _ in range(3):
        next_token, next_position = get_next_token(next_position, text)
    if cleanup(next_token) == "een":
        year = next_token
        next_token, next_position = get_next_token(next_position, text)
        if cleanup(next_token) == "duizend":
            year += " " + next_token
            while True:
                next_token, next_position = get_next_token(next_position, text)
                if not next_token:
                    # End of text reached without a terminator.
                    break
                if next_token == "te":
                    break
                year += " " + next_token
                if re.search(",$", next_token):
                    break
            # Move one token beyond the terminator, as before.
            next_token, next_position = get_next_token(next_position, text)
    return year, next_position

In [None]:
# Dutch number words used by number_parser(): units 1-9 and multiples of ten 10-90.
# NOTE(review): the words for 11-19 ("elf" .. "negentien") are in neither table,
# so number_parser() stops at them - confirm whether such years occur.
digits = { "een": 1, "twee": 2, "drie": 3, "vier": 4, "vijf": 5, "zes": 6, "zeven": 7, "acht": 8, "negen": 9 }
decades = { "tien": 10, "twintig": 20, "dertig": 30, "veertig": 40, "vijftig": 50, "zestig": 60, "zeventig": 70, "tachtig": 80, "negentig": 90 } 

In [None]:
def number_parser(text, number):
    """Convert a number written in Dutch words to an integer.

    The whitespace-separated tokens of *text* are cleaned with cleanup() and
    read from left to right:

    * "een duizend" adds 1000;
    * "en" is skipped;
    * a word from ``digits`` followed by "honderd" adds 100 times its value,
      otherwise its value;
    * a word from ``decades`` adds its value;
    * any other token stops the parsing.

    *number* is not used; it is kept so that existing calls keep working.

    Returns the sum, or 0 for empty input. Whitespace-only input now also
    gives 0 instead of raising IndexError.
    """
    if not text:
        return 0
    tokens = [cleanup(token) for token in text.split()]
    total = 0
    index = 0
    # Single pass instead of recursion with repeated joining and splitting.
    while index < len(tokens):
        word = tokens[index]
        following = tokens[index + 1] if index + 1 < len(tokens) else None
        if word == "een" and following == "duizend":
            total += 1000
            index += 2
        elif word == "en":
            index += 1
        elif word in digits:
            if following == "honderd":
                total += 100 * digits[word]
                index += 2
            else:
                total += digits[word]
                index += 1
        elif word in decades:
            total += decades[word]
            index += 1
        else:
            # Unknown word: keep what has been parsed so far.
            break
    return total

In [None]:
# For every text, print each date that follows the phrase "op den": the day,
# month and year as written, followed by their numeric values when available.
for text_id in texts:
    text = texts[text_id]
    # NOTE(review): the entities are not used in this cell; the two lines below
    # are kept because they assign the notebook-level name `entities`.
    entities = run_bert_pipeline(text)
    entities = combine_entities(expand_entities(entities, text))
    day = ""
    for position in find_text_patterns("op den", text):
        day, token_length_day = get_date_day(position["end"], text)
        month, token_length_month = get_date_month(position["end"] + token_length_day, text)
        # The second value returned by get_date_year() is a position, not a length.
        year, token_length_year = get_date_year(position["end"] + token_length_day + token_length_month, text)
        # Put a space before "honderd" so that number_parser() sees it as a separate token.
        year = re.sub(" *honderd", " honderd", year)
        try:
            print(f"Text {text_id}: {day} {month} {year} ({date_days[cleanup(day)]}-{date_months[cleanup(month)]}-{number_parser(year, 0)})")
        except (KeyError, IndexError):
            # Lookup failed: print the words without the numeric values. The bare
            # except used before also swallowed KeyboardInterrupt.
            print(f"Text {text_id}: {day} {month} {year}")