# Explore manually annotated Curaçao files

## 1. Read files

In [None]:
import xml.etree.ElementTree as ET

In [None]:
# Directory (relative path) containing the XML files that read_files() loads.
data_dir = "../data/Training_set_V2"

In [None]:
def get_text_from_file(file_name):
    """Parse the XML file *file_name* and return the text extracted from it.

    The file is parsed with ElementTree and its root element is handed to
    get_text_from_xml(), whose return value is passed through unchanged.
    """
    return get_text_from_xml(ET.parse(file_name).getroot())

In [None]:
def get_text_from_xml(root):
    """Return the text of all text lines found below *root*.

    Every ``TextLine/TextEquiv/Unicode`` element (in any XML namespace, or
    none) contributes one line to the result, in document order. Each line
    is terminated by a newline character.

    Parameters:
        root: an ElementTree element to search in.

    Returns:
        A single string with one line per matched element; the empty string
        when nothing matches.
    """
    lines = [
        # An empty element (<Unicode/>) has text None; keep it as an empty
        # line so the output still has one row per text line.
        tag.text or ""
        for tag in root.findall(".//{*}TextLine/{*}TextEquiv/{*}Unicode")
    ]
    return "".join(line + "\n" for line in lines)

In [None]:
def make_file_name(file_id):
    """Build the file name for *file_id*: "p", the zero-padded id, ".xml".

    The id is converted to a string and left-padded with zeros to at least
    three characters, e.g. 7 becomes "p007.xml".
    """
    padded_id = str(file_id).zfill(3)
    return "".join(("p", padded_id, ".xml"))

In [None]:
def read_files():
    """Read the texts of files 1 to 11, leaving out file 10.

    File names are built with make_file_name() and looked up in data_dir.

    Returns:
        A list with one text string per file, in ascending file id order.
    """
    # NOTE(review): id 10 is skipped without a stated reason; confirm why.
    wanted_ids = [number for number in range(1, 12) if number != 10]
    return [
        get_text_from_file(data_dir + "/" + make_file_name(number))
        for number in wanted_ids
    ]

In [None]:
# Load all selected files; each element of texts is the text of one file.
texts = read_files()

## 2. Find names in texts

In [None]:
import re
import transformers

# Limit transformers log output to error messages only.
transformers.utils.logging.set_verbosity_error()

In [None]:
def show_names(results):
    """Print the entities found in token-level NER output, one per line.

    Parameters:
        results: iterable of dicts with at least the keys "entity" (a tag
            such as "B-PERSON" or "I-PERSON") and "word" (the token text).

    Tokens are collected into a name until the next tag starting with "B"
    is seen, at which point the collected name is printed. Tokens whose tag
    ends in CARDINAL, DATE, ORDINAL or WORK_OF_ART are left out of the
    output. WordPiece continuation tokens (prefixed with "##") are attached
    to the preceding token without a space, so "Jan", "##sen" is printed as
    "Jansen" rather than "Jan ##sen".
    """
    skipped_types = ("CARDINAL", "DATE", "ORDINAL", "WORK_OF_ART")
    name = ""
    for part in results:
        entity = part["entity"]
        # A "B" tag opens a new entity, so flush the one collected so far.
        if entity.startswith("B") and name:
            print(name)
            name = ""
        if entity.endswith(skipped_types):
            continue
        word = part["word"]
        if word.startswith("##"):
            # Sub-word piece: glue it onto the previous token.
            name += word[2:]
        else:
            if name:
                name += " "
            name += word
    if name:
        print(name)

Tested models (initial number indicates monthly downloads):
* (345) wietsedv/bert-base-dutch-cased-finetuned-conll2002-ner (several false positives)
* (74) Matthijsvanhof/bert-base-dutch-cased-finetuned-NER (not useful, tags everything)
* (16) wietsedv/bert-base-dutch-cased-finetuned-sonar-ner (some false positives)
* (13) proycon/bert-ner-cased-conll2002-nld (did not find any entities)
* (10) proycon/bert-ner-cased-sonar1-nld (found only one entity)
* (10) Matthijsvanhof/bert-base-dutch-cased-finetuned-NER8 (not useful, tags everything)
* (4) wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner (few false positives)

In [None]:
# Build a NER pipeline with the udlassy model from the list of tested models.
run_bert_pipeline = transformers.pipeline(task='ner', model='wietsedv/bert-base-dutch-cased-finetuned-udlassy-ner')

In [None]:
# Run the NER pipeline on the first text only.
results = run_bert_pipeline(texts[0])

In [None]:
# Print the entities the pipeline found in the first text.
show_names(results)

## 3. Visualize named entities

In [None]:
from spacy import displacy

In [None]:
def convert_entities(results):
    """Merge token-level NER results into entity spans.

    Parameters:
        results: iterable of dicts with the keys "entity" (a tag such as
            "B-PERSON" or "I-PERSON"), "start" and "end" (character
            offsets of the token).

    Returns:
        A list of dicts with the keys "start", "end" and "label", one per
        entity. A token tagged "B" opens a new entity; any other token
        extends the end of the most recent entity.

    A non-"B" token that arrives before any entity has been opened starts
    a new entity itself instead of raising IndexError.
    """
    entities = []
    for result in results:
        start_tag = result["entity"][0]
        # Strip the two-character tag prefix ("B-" / "I-") to get the label.
        label = result["entity"][2:]
        if start_tag == "B" or not entities:
            entities.append({"start": result["start"], "end": result["end"], "label": label})
        else:
            entities[-1]["end"] = result["end"]
    return entities

In [None]:
# Show the first text with its entities highlighted. Each newline is replaced
# by a single space, so the character offsets in the entities still line up
# with the text. manual=True is used because the input is a plain dict of
# text and entity spans (presumably instead of a spaCy Doc; see displacy docs).
displacy.render({ "text": re.sub("\\n", " ", texts[0]), 
                  "ents": convert_entities(results) }, 
                options = { "colors": { "PERSON": "orange" } }, style = "ent", manual = True)