# Info fields via regular expressions

Extract persons from the info fields StartEntryInfo and EndEntryInfo of the [slave registers of Suriname](https://datasets.iisg.amsterdam/dataset.xhtml?persistentId=hdl:10622/CSPBHO) via regular expressions

In [None]:
import os
import pandas as pd
import regex
import sys
from IPython.display import clear_output
sys.path.append(os.getcwd() + '/..')
from scripts import get_deceased_name, utils

In [None]:
def squeal(text=None):
    """ clear the output of the current cell and print text, unless text is None """
    clear_output(wait=True)
    if text is None:
        return
    print(text)

## 1. Read data

In [None]:
# csv file with the registers, given relative to the working directory of the notebook
DATA_FILE = "../../data/suriname/Dataset Suriname Slave and Emancipation Registers Version 1.1.csv"

# NOTE(review): low_memory=False presumably avoids chunk-wise type inference (mixed dtypes) — confirm
data = pd.read_csv(DATA_FILE, low_memory=False)

## 2. Extract entities from info field

In [None]:
def select_entities_by_type(entities):
    """ keep the entities whose type ("entity") is one of the types that may contain a person name """
    selected = []
    for candidate in entities:
        if regex.search("(PERSON|ORG|GPE|FAC|NORP|WORK_OF_ART|EVENT|LOC)", candidate["entity"]):
            selected.append(candidate)
    return selected

In [None]:
def get_entities_from_data(data):
    """ apply machine learning model to the DATA_FIELD text of each row and keep the entities
        selected by select_entities_by_type

        Stops early when CUT_OFF > 0 and at least CUT_OFF entities have been found.
        Returns a dictionary (row index -> list of entities) and the index of the last
        processed row (-1 when data has no rows). """
    entities = {}
    nbr_of_entities = 0
    # predefine index so the final report and the return value also work for empty data
    index = -1
    for index, row in data.iterrows():
        text = row[DATA_FIELD]
        # rows without a string value (for example missing values) are skipped
        if isinstance(text, str):
            entities[index] = select_entities_by_type(get_deceased_name.get_entities_from_text(text))
            nbr_of_entities += len(entities[index])
        if index % 100 == 0:
            squeal(f"total lines: {len(data)}; processed lines: {index}; found entities: {nbr_of_entities}")
        if CUT_OFF > 0 and nbr_of_entities >= CUT_OFF:
            break
    squeal(f"total lines: {len(data)}; processed lines: {index}; found entities: {nbr_of_entities}")
    return entities, index

In [None]:
# column of data that holds the info text to process
DATA_FIELD = "EndEntryInfo"
# NOTE(review): INDEX_FIELD is not referenced by the code visible in this notebook
INDEX_FIELD = "Id_source"
# get_entities_from_data stops after finding this many entities; a value <= 0 disables the limit
CUT_OFF = 2000

# entities, last_index = get_entities_from_data(data)

## 3. Inspect entities

In [None]:
def add_entity_labels(entities):
    """ copy the entity type to the field "label" of each entity, required for rendering """
    for item in entities:
        item.update(label=item["entity"])
    return entities

In [None]:
def inspect_entities(data, entities, first_index=0, last_index=-1):
    """ render the info texts together with the entities identified in them

        Rows with an index up to and including first_index are skipped when first_index
        is not negative; rendering stops after row last_index when last_index is not negative. """
    for index, row in data.iterrows():
        if first_index >= 0 and index <= first_index:
            continue
        text = row[DATA_FIELD]
        if index in entities:
            print(index)
            utils.render_text(text, entities[index])
        elif isinstance(text, str):
            utils.render_text(text,[])
        if 0 <= last_index <= index:
            break
    return

In [None]:
# render all processed texts; add_entity_labels adds the "label" field used for rendering
# NOTE(review): entities and last_index are only assigned in a later cell (get_entities_from_data call)
inspect_entities(data, { index: add_entity_labels(entities[index]) for index in entities }, last_index=last_index)

## 4. Access context of entities

In [None]:
def is_white_space(character):
    """ return a match object if the string is a single white space character, else None """
    white_space_match = regex.search(r"^\s$", character)
    return white_space_match

In [None]:
def get_token_before_entity(text, entity):
    """ return the white space delimited token in front of the entity and its start position;
        without such a token the result is the empty string and position -1 """
    token_end = entity["start"]
    # skip the white space directly in front of the entity
    while token_end > 0 and is_white_space(text[token_end - 1]):
        token_end -= 1
    token_start = token_end - 1
    # move back to the first character of the token
    while token_start > 0 and not is_white_space(text[token_start - 1]):
        token_start -= 1
    if token_start < 0:
        return "", token_start
    return text[token_start: token_end], token_start

In [None]:
def get_token_after_entity(text, entity):
    """ return the white space delimited token behind the entity and its end position;
        without such a token the result is the empty string and a position beyond the text """
    text_length = len(text)
    token_start = entity["end"]
    # skip the white space directly behind the entity
    while token_start < text_length and is_white_space(text[token_start]):
        token_start += 1
    token_end = token_start + 1
    # move forward to the first position behind the token
    while token_end < text_length and not is_white_space(text[token_end]):
        token_end += 1
    if token_start >= text_length:
        return "", token_end
    return text[token_start: token_end], token_end

In [None]:
def get_first_token_of_entity(text, entity):
    """ return the first white space delimited token inside the entity and its end position;
        the token is empty when the entity contains no non white space characters """
    upper_bound = entity["end"]
    token_start = entity["start"]
    # skip white space at the start of the entity
    while token_start < upper_bound and is_white_space(text[token_start]):
        token_start += 1
    token_end = token_start + 1
    # the token never extends beyond the end of the entity
    while token_end < upper_bound and not is_white_space(text[token_end]):
        token_end += 1
    if token_start >= upper_bound:
        return "", token_end
    return text[token_start: token_end], token_end

In [None]:
def get_last_token_of_entity(text, entity):
    """ return the last white space delimited token inside the entity and its start position;
        the token is empty when the entity contains no non white space characters """
    lower_bound = entity["start"]
    token_end = entity["end"]
    # skip white space at the end of the entity
    while token_end > lower_bound and is_white_space(text[token_end - 1]):
        token_end -= 1
    token_start = token_end - 1
    # the token never starts before the start of the entity
    while token_start > lower_bound and not is_white_space(text[token_start - 1]):
        token_start -= 1
    if token_start < lower_bound:
        return "", token_start
    return text[token_start: token_end], token_start

In [None]:
def remove_first_token_of_entity(text, entity):
    """ move the start of the entity behind its first token and the white space that follows it;
        the entity is changed in place ("start" and "word") and returned """
    _, position = get_first_token_of_entity(text , entity)
    entity_end = entity["end"]
    while position < entity_end and is_white_space(text[position]):
        position += 1
    entity["start"] = position
    entity["word"] = text[position: entity_end]
    return entity

In [None]:
def remove_last_token_of_entity(text, entity):
    """ move the end of the entity in front of its last token and the white space preceding it;
        the entity is changed in place ("end" and "word") and returned """
    _, position = get_last_token_of_entity(text , entity)
    entity_start = entity["start"]
    while position > entity_start and is_white_space(text[position - 1]):
        position -= 1
    entity["end"] = position
    entity["word"] = text[entity_start: position]
    return entity

## 5. Expand and shrink entities

In [None]:
# lower case words that are stripped from the start and the end of an entity
# (see remove_initial_non_entity_words and remove_final_non_entity_words);
# note that "executie" is listed twice
non_name_words = [    "aan", "aandeel", "aangifte", "aankomende", "afgeschreven", "akte", "als",
                      "amsterdam", "augs", "augustus", "beheerders", "besmet", "bevolking", "bij",
                      "blijkens", "broeder", "cod", "college", "collegie", "comm", "commissariaat",
                      "commissaris", "commissarissen", "conditie", "curator", "curators", "custodi",
                      "dd", "ddo", "de", "dec", "decbr", "decemb", "december", "decemr", "decr",
                      "den", "deszelfs", "dezer", "dispositie","dood", "door", "erf", "erfenis",
                      "erfgenaam", "erfgenamen", "erven", "etablisement", "evangelische", "exc",
                      "executie", "executie", "expl", "exploicteur", "exploieteur", "extract", "fo",
                      "fort", "gekocht", "gemanumitteerd", "gemanumitteerde", "gemeente", "genomen",
                      "geref", "geregd", "geregistreerd", "geregtshof", "geresolutie",
                      "genl.staten", "gergd", "gr", "gouv", "gouverment", "gouverments", "gouvern",
                      "gouvernts", "gouvernement", "gouvr", "hervormde", "het", "hijpotheek", "hoc",
                      "hoge", "hoofdgelden", "ik", "in", "ingevolge", "inlandsche", "innocente",
                      "janij", "januarij", "julij", "journaal", "kolonie", "kolonien", "kommandant",
                      "kurators", "lande", "landen", "landsbelastingen", "maart", "meerderjarigen",
                      "mei", "minderj", "minderjarige", "minderjarigen", "minderje", "minderjn",
                      "nieuw", "no", "notarieele", "novemb", "novr", "onder", "opgever",
                      "overleden", "paramaribo", "per", "pub", "publieke", "raad", "resol",
                      "resolutie", "resolutie9", "respect", "septbr", "septemr", "slaaf", "slaven",
                      "slavin", "testament", "testamentaire", "uit", "van", "vendu", "vonnis",
                      "vendumeester", "veiling", "verbonden","verklaard", "verkocht", "verpand",
                      "vmr", "volg", "voor", "voorden", "vrij", "vrijdom", "vrijgeworden",
                      "weduwe", "weesmeesteren", "weesmeesters", "zie", "zijn", "zijne" ]
# lower case tokens in front of an entity that are added to it (see add_initial_entity_words)
prefix_name_words = [ "boedel", "erven", "gebrs", "weduwe", ]
# lower case tokens behind an entity that are added to it (see add_final_entity_words)
suffix_name_words = [ "'anavia", "(…)ing", "bol", "de", "den", "en", "gaander", "geb", "geboren", 
                      "gebn", "green", "heilbron", "helb", "lande", "laurence", "meijers", "mers",
                      "n", "nepveu", "nom", "osse", "petram", "pret", "qq", "salomons", "sanches",
                      "u", "ux", "van", "vlier", "wolff", ]

In [None]:
def format_list_of_words(text):
    """ print the tokens of text on lines that start with 21 spaces; a new line is started
        when the next token would make the current line longer than 99 characters """
    current = ""
    for word in text.split():
        if len(current) + len(word) > 99:
            print(current)
            current = ""
        if not current:
            current = " " * 21
        current = f"{current} {word}"
    if current:
        print(current)

In [None]:
def remove_initial_non_entity_words(text, entity):
    """ strip tokens from the start of the entity as long as the first token is a non-name word
        (also without a final period or comma) or contains no letters at all """
    while True:
        first_token, _ = get_first_token_of_entity(text, entity)
        is_non_name = (first_token.lower() in non_name_words or
                       regex.sub(r"[.,]$", "", first_token).lower() in non_name_words or
                       regex.search(r"^[^a-zA-Z]+$", first_token))
        if not is_non_name:
            return entity
        entity = remove_first_token_of_entity(text, entity)

In [None]:
def remove_final_non_entity_words(text, entity):
    """ strip tokens from the end of the entity as long as the last token is a non-name word
        (also without a final period or comma) or contains no letters at all """
    while True:
        last_token, _ = get_last_token_of_entity(text, entity)
        is_non_name = (last_token.lower() in non_name_words or
                       regex.sub(r"[.,]$", "", last_token).lower() in non_name_words or
                       regex.search(r"^[^a-zA-Z]+$", last_token))
        if not is_non_name:
            return entity
        entity = remove_last_token_of_entity(text, entity)

In [None]:
def add_initial_entity_words(text, entity, last_entity_end):
    """ extend the start of the entity with preceding tokens that belong to the name

        A preceding token is added when it is a prefix name word (also without final periods
        or commas), a single capital with an optional period, or two capitals each followed
        by a period. Tokens that start before last_entity_end (the end of the previous
        entity) are never added. The entity is changed in place ("start") and returned. """
    while True:
        previous_token, start = get_token_before_entity(text, entity)
        belongs_to_name = (previous_token.lower() in prefix_name_words or
                           regex.sub(r"[.,]+$", "", previous_token).lower() in prefix_name_words or
                           regex.search(r"^[A-Z]\.?$", previous_token) or
                           regex.search(r"^[A-Z]\.[A-Z]\.$", previous_token))
        # entity ends are exclusive, so a token starting exactly at last_entity_end does not
        # overlap the previous entity; this also admits a prefix at the very start of the text
        if not belongs_to_name or start < last_entity_end:
            return entity
        entity["start"] = start

In [None]:
def add_final_entity_words(text, entity, next_entity_start):
    """ extend the end of the entity with following tokens that belong to the name: suffix name
        words (also without a final period or comma) and single capitals with an optional period;
        tokens that end behind next_entity_start are never added """
    while True:
        next_token, end = get_token_after_entity(text, entity)
        belongs_to_name = (next_token.lower() in suffix_name_words or
                           regex.sub(r"[.,]$", "", next_token).lower() in suffix_name_words or
                           regex.search(r"^[A-Z]\.?$", next_token))
        if not belongs_to_name or end > next_entity_start:
            return entity
        entity["end"] = end

In [None]:
def get_next_entity_start(entities, text, i):
    """ return the start of the entity that follows entity i, or the text length if there is none """
    following = i + 1
    return entities[following]["start"] if following < len(entities) else len(text)

In [None]:
def shrink_entities(entities):
    """ remove non-name words from the start and the end of all entities, in place """
    for index, index_entities in entities.items():
        text = data[DATA_FIELD][index]
        for entity in index_entities:
            remove_final_non_entity_words(text, remove_initial_non_entity_words(text, entity))
    return entities

In [None]:
def expand_entities(entities):
    """ extend all non-empty entities with the name words directly before and after them

        An entity is never extended into the preceding or following entity of the same text.
        The entities are changed in place ("start", "end" and "word") and returned. """
    for index in entities:
        text = data[DATA_FIELD][index]
        last_entity_end = 0
        for i, entity in enumerate(entities[index]):
            # empty entities (for example emptied by shrink_entities) are not expanded
            if entity["start"] >= entity["end"]:
                continue
            # determine the boundary for every entity separately, so that a skipped empty
            # entity cannot leave a stale boundary behind for the entities after it
            next_entity_start = get_next_entity_start(entities[index], text, i)
            entity = add_initial_entity_words(text, entity, last_entity_end)
            entity = add_final_entity_words(text, entity, next_entity_start)
            last_entity_end = entity["end"]
            entity["word"] = text[entity["start"]: entity["end"]]
    return entities

In [None]:
def find_overlapping_entities(entities):
    """ resolve the first overlap found in a list of entities

        When one entity lies completely inside another, the inner entity is deleted from the
        list. A partial overlap is only reported. At most one overlap is handled per call.
        The (possibly modified) list is always returned, also when nothing overlaps. """
    for i in range(0, len(entities)):
        for j in range(0, len(entities)):
            if i == j:
                continue
            if entities[i]["start"] <= entities[j]["start"] and entities[i]["end"] >= entities[j]["end"]:
                print(f"deleting entity {j}\n")
                entities.pop(j)
                return entities
            if entities[i]["start"] >= entities[j]["start"] and entities[i]["end"] <= entities[j]["end"]:
                print(f"deleting entity {i}\n")
                entities.pop(i)
                return entities
            # partial overlaps are symmetric: report each pair only once
            if i < j and entities[i]["start"] < entities[j]["end"] and entities[i]["end"] > entities[j]["start"]:
                print("A", entities[i], "\nB", entities[j], "\n")
                return entities
    # without this return callers that store the result would replace their list by None
    return entities

In [None]:
def check_for_overlapping_entities(entities):
    """ report texts in which two entities cover the same character and let
        find_overlapping_entities resolve one overlap for such a text """
    for index in entities:
        # one flag per character of the text: True when an entity already covers it
        entity_characters = [ False ] * len(data[DATA_FIELD][index])
        for entity in entities[index]:
            for i in range(entity["start"], entity["end"]):
                if entity_characters[i]:
                    print(f"overlapping entities for index {index}! {entities[index]}")
                    # NOTE(review): find_overlapping_entities may delete an element from the list
                    # that the enclosing loop is iterating over, and may return None when it
                    # finds no overlap — confirm this is handled
                    entities[index] = find_overlapping_entities(entities[index])
                    # only leaves the character loop; the remaining entities are still checked
                    break
                entity_characters[i] = True

In [None]:
# NOTE(review): entities is only assigned in a later cell (get_entities_from_data call)
check_for_overlapping_entities(entities)

## 6. Combine entities

In [None]:
def add_birth_names(entities):
    """ merge an entity with the entity before it when the two are adjacent (at most one
        character apart) and the preceding entity ends with a birth word; the "end" of the
        preceding entity is updated and the merged entity is removed from the list """
    for index in entities:
        birth_words = [ "geboren", "geb", "gebn", "en", ]
        index_entities = entities[index]
        entities_to_combine = []
        for i in range(1, len(index_entities)):
            previous, current = index_entities[i-1], index_entities[i]
            last_token, _ = get_last_token_of_entity(data[DATA_FIELD][index], previous)
            gap = current["start"] - previous["end"]
            ends_with_birth_word = (last_token.lower() in birth_words or
                                    regex.sub(r"[.,]$", "", last_token).lower() in birth_words)
            if gap in (0, 1) and ends_with_birth_word:
                entities_to_combine.append(i)
        # merge from the back so the positions of the entities still to merge stay valid
        for i in reversed(entities_to_combine):
            index_entities[i-1]["end"] = index_entities[i]["end"]
            index_entities.pop(i)
    return entities

In [None]:
# extract the entities from the DATA_FIELD texts; last_index is the last processed row
entities, last_index = get_entities_from_data(data)

In [None]:
# post-process the entities: strip non-name words, add name words, merge birth names
entities = shrink_entities(entities)
entities = expand_entities(entities)
entities = add_birth_names(entities)

In [None]:
def show_entities(entities):
    """ print the row index and the text of every non-empty entity """
    for index, index_entities in entities.items():
        for entity in index_entities:
            text = data[DATA_FIELD][index][entity["start"]: entity["end"]]
            if text == "":
                continue
            print (index, text)

In [None]:
# list the entity texts after post-processing
show_entities(entities)

## 7. Combine entities (old code)

Combination words:
* en
* geboren, geb, gebn

Unknown abbreviation: vmr?

In [None]:
def patch_born_as(text, entities):
    """ merge two consecutive entities when the text between them starts with a birth word
        (geb, gebn, geboren); the merged-in entity is emptied and left out of the result """
    for current, following in zip(entities, entities[1:]):
        between = text[current["end"]: following["start"]]
        if regex.search(r"^\s*(geb|gebn|geboren).*\s*$", between):
            current["end"] = following["end"]
            following["start"] =  following["end"]
    return [entity for entity in entities if entity["start"] != entity["end"]]

In [None]:
# lower case tokens that process_info_text adds to the start of the entity that follows them
PREFIX_WORDS = [ "boedel", "erven", "geb", "gebn", "geboren", "weduwe", ]

In [None]:
def add_labels(entities):
    """ set the label of each entity: "entity" for the types PERSON, GPE and FAC, else "other" """
    for item in entities:
        is_name_type = regex.search(r"(PERSON|GPE|FAC)", item["entity"])
        item["label"] = "entity" if is_name_type else "other"
    return entities

In [None]:
def process_info_text(text, index=-1):
    """ extract the name entities (types PERSON, GPE and FAC) from an info text and assign roles

        A prefix word (PREFIX_WORDS) directly in front of an entity is added to it. An entity
        that starts with "geb" and follows the previous output entity at a distance of one or
        two characters is merged into that entity. The role is derived from the text and from
        the token in front of the entity; the default role is "eigenaar".
        Returns a list of dictionaries with the keys index, start, end, role, previous_token,
        label and word. """
    entities_out = []
    last_end = -99
    mother_seen = False
    entities_in = get_deceased_name.get_entities_from_text(text)
    for entity_in in entities_in:
        if not regex.search("(PERSON|GPE|FAC)", entity_in["entity"]):
            continue
        previous_token, start = get_token_before_entity(text, entity_in)
        if previous_token.lower() in PREFIX_WORDS:
            entity_in["start"] = start
        entity_in_string = text[entity_in["start"]: entity_in["end"]]
        if (entities_out and regex.search("^geb", entity_in_string) and
            entity_in["start"] in (last_end + 1, last_end + 2)):
            # birth name: extend the previous output entity instead of adding a new one
            entities_out[-1]["end"] = entity_in["end"]
            entities_out[-1]["word"] = text[entities_out[-1]["start"]: entity_in["end"]]
            last_end = entity_in["end"]
            continue
        role = "eigenaar"
        # the start of the entity may have changed, so look up the preceding token again
        previous_token, start = get_token_before_entity(text, entity_in)
        if regex.search("geboren (uit|van)", text, regex.IGNORECASE) and not mother_seen:
            role = "moeder"
            mother_seen = True
        elif regex.search("(gemanumitteerd|vrij *geworden)", text, regex.IGNORECASE):
            role = "vrijgemaakte"
        elif regex.search("genaamd", previous_token, regex.IGNORECASE):
            role = "vrijgemaakte"
        elif regex.search("(slaaf|slavin|slaven)", previous_token, regex.IGNORECASE):
            role = "slaafgemaakte"
        elif regex.search(r"(plant\b|plantage|plantaadje|ple\b|houtvelling|divisie|district)",
                          previous_token, regex.IGNORECASE):
            role = "location"
        elif regex.search("(curator|deurwaarder|klerk|landschrijver|vendumeester)",
                          previous_token, regex.IGNORECASE):
            role = "ambtenaar"
        entities_out.append({"index": index,
                             "start": entity_in["start"],
                             "end": entity_in["end"],
                             "role": role,
                             "previous_token": previous_token,
                             "label": "name",
                             "word": text[entity_in["start"]: entity_in["end"]]})
        last_end = entity_in["end"]
    return entities_out

In [None]:
def patch_add_initials(text, entities):
    """ extend the start of each entity with the initials directly in front of it: a single
        capital followed by any number of periods, or a series of capitals each followed by a
        period; the entities are changed in place ("start") and returned """
    for entity in entities:
        previous_token, start = get_token_before_entity(text, entity)
        while regex.search(r"^[A-Z]\.*$", previous_token) or regex.search(r"^([A-Z]\.)+$", previous_token):
            entity["start"] = start
            previous_token, start = get_token_before_entity(text, entity)
    return entities

In [None]:
    #entities = patch_born_as(text, entities)
    #entities = patch_add_initials(text, entities)
    #entities = add_labels(entities)

In [None]:
# group the entities by the row they were found in
# NOTE(review): this expects entities to be a list of dictionaries with an "index" key
# (as produced by process_info_text), not the dictionary made by get_entities_from_data — confirm
entities_per_index = {}
for entity in entities:
    entities_per_index.setdefault(entity["index"], []).append(entity)

# render the texts of the rows up to and including index CUT_OFF with their entities
for index, row in data.iterrows():
    if not (isinstance(row[DATA_FIELD], str) and index <= CUT_OFF):
        continue
    print(index, end=" ")
    utils.render_text(row[DATA_FIELD], entities_per_index.get(index, []))

## 8. Cleanup names (old code)

In [None]:
def cleanup_name(name_string):
    """ replace each period or comma plus the white space behind it by a single space
        and strip white space from both ends """
    without_punctuation = regex.sub(r"[.,]\s*", " ", name_string)
    return without_punctuation.strip()

In [None]:
# words used by get_names_from_string to split a name string or to strip its start
prefixes_to_delete = [ "en",  "kurators", "notarieele", "slavin", ]
# words and phrases used by get_names_from_string to split a name string or to strip its end
suffixes_to_delete = [ "/", "bij executie", "dd", "per executie", "febrij", "kolonien",
                       "landsbelastingen", "resolutie", "voor den vrijdom", "vrijdom", "den",
                       "voor", ]
# strings that are not names; get_names_from_string matches them case-insensitively while
# split_name_default compares them with lower-cased tokens
non_names = [ "", ",", "akte", "aug", "augs", "august", "augusts", "augustus", "boedel",
              "boven", "custodi", "dec", "decbr", "decemb", "december", "decemr","den lande",
              "dezer", "erfenis", "erfgenaam", "erfgename", "erfgenamen", "executie", "febr",
              "febrij", "fo", "folio", "gemanumitteerd", "Gemmanumitteerd", "genomen",
              "geregistreerd", "geregd", "geresolutie", "gergd", "gouv", "gouv resol", "gouvern",
              "gouvernement", "gouvernts", "hoofdgelden", "janij", "januarij", "julij", "junij",
              "kolonien", "kurators", "lande", "lot no", "maart", "no", "notarieele", "novemb",
              "october", "overleden", "overschrijving", "plant", "resolutie", "ruiling", "slaaf",
              "suriname", "vendu", "vendumeester", "vrijdom", "zn", "zijne", ]
# location names; get_names_from_string treats them like non_names
locations = [ "batavia", "nickerie", "vreeland", "spieringshoek", "kroonenburg", "caledonia",
              "molhoop", "libanon", "saltzhalen", "waterloo", "fairfield", "amsterdam",
              "paramaribo", "paradize", "felix", "dordrecht", "tourtonne", "lochaber",
              "leliendaal", "bremen", "lugtenburg", "saramacca", "zeezigt", "munnikkendam",
              "zwarigheid", "katwijk", "hooijland", "poelwijk", "alkmaar", "waijamoe",
              "petersburg", "johannesburg", "toledo", "ornamibo", "sardam", "coronie", "saksen",
              "thorarica", "curaçao", "cottica", "andresa", "curacao", "suriname", ]


def get_names_from_string(name_string):
    """ split a string into the names it contains

        Returns an empty list when the string is a non-name or a location, optionally followed
        by non-letters. Otherwise the string is split at numbers and at the prefix and suffix
        words, the words themselves are dropped and the remaining parts are processed
        recursively. A string without any of these is returned as a single name.
        All matching ignores case; the list words are escaped so they are matched literally. """
    names = []
    for non_name in non_names + locations:
        if regex.search(f"^{regex.escape(non_name)}[^a-zA-Z]*$", name_string, regex.IGNORECASE):
            return []
    match = regex.search("^(.*)\\s*[0-9]+\\s*(.*)$", name_string, regex.IGNORECASE)
    if match:
        names.extend(get_names_from_string(match.group(1)))
        names.extend(get_names_from_string(match.group(2)))
        return names
    for prefix in prefixes_to_delete:
        prefix_pattern = regex.escape(prefix)
        # prefix word in the middle: names on both sides
        match = regex.search(f"^(.*)\\s+{prefix_pattern}\\s+(.*)$", name_string, regex.IGNORECASE)
        if match:
            names.extend(get_names_from_string(match.group(1)))
            names.extend(get_names_from_string(match.group(2)))
            return names
        # prefix word at the start: continue with the rest
        match = regex.search(f"^{prefix_pattern}\\s+(.*)$", name_string, regex.IGNORECASE)
        if match:
            return get_names_from_string(match.group(1))
    for suffix in suffixes_to_delete:
        suffix_pattern = regex.escape(suffix)
        # suffix word in the middle: names on both sides
        match = regex.search(f"^(.*)\\s+{suffix_pattern}\\s+(.*)$", name_string, regex.IGNORECASE)
        if match:
            names.extend(get_names_from_string(match.group(1)))
            names.extend(get_names_from_string(match.group(2)))
            return names
        # suffix word at the end: continue with the part in front of it
        match = regex.search(f"^(.*)\\s+{suffix_pattern}$", name_string, regex.IGNORECASE)
        if match:
            return get_names_from_string(match.group(1))
    names.append(name_string)
    return names

In [None]:
def get_names_from_string_with_punctuation(entity, text):
    """ split an entity into parts at the words "qq" and "en" and behind tokens ending in a comma

        The positions of the parts are positions in text and are computed from the actual
        token positions in entity["word"], which is assumed to start at entity["start"].
        A token that ends with a comma stays in the part that it closes.
        NOTE(review): the words "qq" and "en" are left out of the parts and empty parts are
        not returned — confirm that this is the intended output.
        Returns a list of dictionaries with the keys start, end and role. """
    entity_parts = []
    current_part = None
    for token_match in regex.finditer(r"\S+", entity["word"]):
        token = token_match.group()
        if regex.search("^(qq|en)$", token):
            # separator word: close the current part and skip the word itself
            current_part = None
            continue
        token_end = entity["start"] + token_match.end()
        if current_part is None:
            current_part = {"start": entity["start"] + token_match.start(),
                            "end": token_end,
                            "role": entity["role"]}
            entity_parts.append(current_part)
        else:
            current_part["end"] = token_end
        if token.endswith(","):
            current_part = None
    return entity_parts

In [None]:
def split_name_default(entity, text):
    """ split the name covered by the entity in first name tokens and last name tokens

        The name is the part of text between entity["start"] and entity["end"]. The last
        token is the last name, all tokens in front of it are the first name. A single
        first name token or last name token that is a non-name is dropped, after which the
        remaining tokens are split again. Returns two lists of tokens. """
    if entity["start"] >= entity["end"]:
        return [], []
    return _split_name_tokens(text[entity["start"]: entity["end"]].split())


def _split_name_tokens(name_tokens):
    """ split a list of name tokens in first name tokens and last name tokens """
    first_name_tokens = name_tokens[:-1]
    last_name_tokens = name_tokens[-1:]
    if len(first_name_tokens) == 1 and first_name_tokens[0].lower() in non_names:
        return _split_name_tokens(last_name_tokens)
    if len(last_name_tokens) == 1 and last_name_tokens[0].lower() in non_names:
        return _split_name_tokens(first_name_tokens)
    return first_name_tokens, last_name_tokens

In [None]:
# lower case words that mark the start of a last name consisting of several tokens
last_name_words = [ "bo", "d", "da", "de", "den", "der", "du", "geb", "gebn", "geboren", "nom",
                    "prive", "privé", "v", "van", "ux", ]

def expand_multi_token_last_name(first_name_tokens, last_name_tokens):
    """ move the first last name word among the first name tokens, and all tokens behind it,
        to the front of the last name tokens; single-letter words only count in lower case """
    for position, token in enumerate(first_name_tokens):
        if token.lower() not in last_name_words:
            continue
        if len(token) == 1 and token.lower() != token:
            continue
        last_name_tokens = first_name_tokens[position:] + last_name_tokens
        del first_name_tokens[position:]
        break
    return first_name_tokens, last_name_tokens

In [None]:
def include_last_name_before_born_as(first_name_tokens, last_name_tokens):
    """ move the final first name token to the front of the last name tokens when the last
        name starts with one of the words geb, gebn, geboren, beh, jr or sr """
    if not last_name_tokens or not first_name_tokens:
        return first_name_tokens, last_name_tokens
    if regex.search(r"^(geb|gebn|geboren|beh|jr|sr)\b", last_name_tokens[0], regex.IGNORECASE):
        moved_token = first_name_tokens.pop()
        last_name_tokens = [moved_token, *last_name_tokens]
    return first_name_tokens, last_name_tokens

In [None]:
def include_last_name_before_nom_ux(first_name_tokens, last_name_tokens):
    """ move the final two first name tokens to the front of the last name tokens when the
        last name starts with ux and the first name ends with n or nom """
    if not last_name_tokens or len(first_name_tokens) <= 1:
        return first_name_tokens, last_name_tokens
    starts_with_ux = regex.search("^ux\\b", last_name_tokens[0], regex.IGNORECASE)
    if starts_with_ux and regex.search("^(n|nom)\\b", first_name_tokens[-1], regex.IGNORECASE):
        moved_tokens = first_name_tokens[-2:]
        del first_name_tokens[-2:]
        last_name_tokens = moved_tokens + last_name_tokens
    return first_name_tokens, last_name_tokens

In [None]:
def make_entity(string, label):
    """ make an entity that covers the complete string and has the given label """
    return dict(word=string, start=0, end=len(string), label=label)

In [None]:
def split_name(entity, text):
    """ split the name covered by the entity in a first name entity and a last name entity

        The role is read from entity["role"]. For the role "eigenaar" or when there are first
        name tokens, the first name tokens are labelled "voornaam" and the last name tokens
        "achternaam". Otherwise the only available tokens (the last name tokens) are used as
        "voornaam" and the "achternaam" is empty. """
    first_name_tokens, last_name_tokens = split_name_default(entity, text)
    # NOTE: the refinements expand_multi_token_last_name, include_last_name_before_born_as
    # and include_last_name_before_nom_ux are deliberately not applied here
    if entity["role"] == "eigenaar" or len(first_name_tokens) != 0:
        return make_entity(" ".join(first_name_tokens), "voornaam"), make_entity(" ".join(last_name_tokens), "achternaam")
    else:
        return make_entity(" ".join(last_name_tokens), "voornaam"), make_entity(" ".join(first_name_tokens), "achternaam")

In [None]:
# lower case tokens that are split off from the start of a first name
PREFIX_TOKENS = [ "bl", "boedel", "erven", "mr", "we", "weduwe", ]

def get_prefix_tokens(first_name_tokens):
    """ remove the leading prefix tokens from the list of first name tokens (in place);
        return the removed tokens and the remaining list """
    cut = 0
    while cut < len(first_name_tokens) and first_name_tokens[cut].lower() in PREFIX_TOKENS:
        cut += 1
    prefix_tokens = first_name_tokens[:cut]
    del first_name_tokens[:cut]
    return prefix_tokens, first_name_tokens

In [None]:
# lower case tokens that are split off from the start of a last name
INFIX_TOKENS = [ "d'", "da", "de", "del", "den", "der", "des", "du", "d'", "het", "la", "du",
                 "l'", "la", "le", "'t", "ter", "v", "van", "von" ]

def get_infix_tokens(last_name_tokens):
    """ remove the leading infix tokens from the list of last name tokens (in place);
        return the removed tokens and the remaining list """
    cut = 0
    while cut < len(last_name_tokens) and last_name_tokens[cut].lower() in INFIX_TOKENS:
        cut += 1
    infix_tokens = last_name_tokens[:cut]
    del last_name_tokens[:cut]
    return infix_tokens, last_name_tokens

In [None]:
# lower case tokens that are split off from the end of a last name
SUFFIX_TOKENS = [ "beh", "cs", "jr", "nom", "n", "prive", "qq", "sr", "ux" ]

def get_suffix_tokens(last_name_tokens):
    """ remove the trailing suffix tokens from the list of last name tokens (in place);
        return the remaining list and the removed tokens """
    cut = len(last_name_tokens)
    while cut > 0 and last_name_tokens[cut - 1].lower() in SUFFIX_TOKENS:
        cut -= 1
    suffix_tokens = last_name_tokens[cut:]
    del last_name_tokens[cut:]
    return last_name_tokens, suffix_tokens

In [None]:
def split_name_in_five(first_name, last_name):
    """ split a first name string and a last name string in five entities:
        prefix, voornaam, infix, achternaam and suffix """
    prefix_tokens, first_name_tokens = get_prefix_tokens(first_name.split())
    infix_tokens, last_name_tokens = get_infix_tokens(last_name.split())
    last_name_tokens, suffix_tokens = get_suffix_tokens(last_name_tokens)
    labelled_tokens = ((prefix_tokens, "prefix"),
                       (first_name_tokens, "voornaam"),
                       (infix_tokens, "infix"),
                       (last_name_tokens, "achternaam"),
                       (suffix_tokens, "suffix"))
    return tuple(make_entity(" ".join(tokens), label) for tokens, label in labelled_tokens)

In [None]:
def save_names(results):
    """ write all columns of results, without the index, to the csv file named after DATA_FIELD """
    output_file = DATA_FIELD + ".csv"
    results.to_csv(output_file, index=False, columns=results.columns)

In [None]:
def parse_names(entities):
    """ convert the entities that are not locations to a table of name parts: each entity is
        split in parts, each part in five name fields; parts without a first name and without
        a last name are left out """
    table_out = []
    named_entities = [entity for entity in entities if entity["role"] != "location"]
    for entity in named_entities:
        text = data[DATA_FIELD][entity["index"]]
        for entity_part in get_names_from_string_with_punctuation(entity, text):
            first_name, last_name = split_name(entity_part, text)
            prefix, first_name, infix, last_name, suffix = split_name_in_five(first_name["word"],
                                                                              last_name["word"])
            if first_name["word"] == "" and last_name["word"] == "":
                continue
            table_out.append([entity["index"], prefix["word"], first_name["word"], infix["word"],
                              last_name["word"], suffix["word"], entity["role"], text])
    column_names = ["id", "prefix", "voornaam", "infix", "achternaam", "suffix", "rol", "tekstbron" ]
    return pd.DataFrame(table_out, columns=column_names)

In [None]:
# NOTE(review): names is not assigned anywhere in the visible notebook code
results = parse_names(names)

In [None]:
# store the name table in a csv file
save_names(results)

## 9. Check names (old code)

In [None]:
def count_preceding_tokens(names):
    """ count how often each value occurs in column 3 of the rows in names """
    names_table = pd.DataFrame(names)
    return names_table[3].value_counts()

In [None]:
def count_names_without_first_name(results):
    """ count the last names of the rows that have an empty first name

        The names are read from the columns "voornaam" and "achternaam" (as made by
        parse_names); tables with the columns "first_name" and "last_name" are still accepted. """
    first_name_field = "voornaam" if "voornaam" in results.columns else "first_name"
    last_name_field = "achternaam" if "achternaam" in results.columns else "last_name"
    empty_first_name = [row[last_name_field]
                        for _, row in results.iterrows() if row[first_name_field] == ""]
    return pd.DataFrame(empty_first_name).value_counts()

In [None]:
def count_names_without_last_name(results):
    """ count the first names of the rows that have an empty last name

        The names are read from the columns "voornaam" and "achternaam" (as made by
        parse_names); tables with the columns "first_name" and "last_name" are still accepted. """
    first_name_field = "voornaam" if "voornaam" in results.columns else "first_name"
    last_name_field = "achternaam" if "achternaam" in results.columns else "last_name"
    empty_last_name = [row[first_name_field]
                       for _, row in results.iterrows() if row[last_name_field] == ""]
    return pd.DataFrame(empty_last_name).value_counts()

In [None]:
def count_first_names(results):
    """ count how often each first name occurs; the names are read from the column "voornaam"
        (as made by parse_names) or, when that is absent, from the column "first_name" """
    first_name_field = "voornaam" if "voornaam" in results.columns else "first_name"
    return results[first_name_field].value_counts()

In [None]:
def count_last_names(results):
    """ count how often each last name occurs; the names are read from the column "achternaam"
        (as made by parse_names) or, when that is absent, from the column "last_name" """
    last_name_field = "achternaam" if "achternaam" in results.columns else "last_name"
    return results[last_name_field].value_counts()

In [None]:
def find_string_in_results(results, string, field="source"):
    """ return the rows of results whose field matches the regular expression, ignoring case """
    selected = [row for _, row in results.iterrows()
                if regex.search(string, row[field], regex.IGNORECASE)]
    return pd.DataFrame(selected)

In [None]:
# NOTE(review): names is not assigned anywhere in the visible notebook code
count_preceding_tokens(names).to_dict()

In [None]:
# NOTE(review): parse_names names this column "achternaam", not "last_name" — confirm the field
find_string_in_results(results, "zn", "last_name")

In [None]:
# show the info text of the row with index 189873
data[DATA_FIELD][189873]

In [None]:
# show the counts of the last names of rows without a first name
count_names_without_first_name(results).to_dict()

In [None]:
# try the entity extraction on a single example text
process_info_text("Vrij geworden en thans genaamd François Jacobus Hendrik Roosdijk.")

In [None]:
# show the entities found by the model in the info text of the row with index 3310
get_deceased_name.get_entities_from_text(data[DATA_FIELD][3310])

In [None]:
# print all info texts that contain the string "district" (case-sensitive)
for text in data[DATA_FIELD]:
    if isinstance(text, str) and regex.search("district", text):
        print(text)

## 99. Tests

In [None]:
import unittest

In [None]:
class TestNotebook(unittest.TestCase):
    """ unit tests for the name splitting functions and the token functions of this notebook """

    def test_split_name(self):
        # NOTE(review): this test still uses an older interface of split_name (name string and
        # role, returning two strings); the current function takes (entity, text) and returns
        # two entities — update the test when the intended interface is confirmed
        self.assertEqual(split_name("boedel weduwe Adriana Augusta van Dam qq", "eigenaar"),
                         ('boedel weduwe Adriana Augusta', 'van Dam qq') )

    def test_split_name_in_five(self):
        # split_name_in_five returns entities; compare the text ("word") of each of them
        parts = split_name_in_five("boedel weduwe Adriana Augusta", "van Dam qq")
        self.assertEqual(tuple(part["word"] for part in parts),
                         ('boedel weduwe', 'Adriana Augusta', 'van',  'Dam',  'qq') )

    def test_patch_born_as(self):
        self.assertEqual(patch_born_as("Jansen geboren De Vries", [{ "start": 0, "end": 6 }, { "start": 15, "end": 23 }]),
                         [{"start": 0, "end": 23}] )

    def test_get_token_before_entity(self):
        self.assertEqual(get_token_before_entity("one two three", { "start": 4, "end": 7 }),
                         ( "one", 0 ))
        self.assertEqual(get_token_before_entity("one two three", { "start": 0, "end": 3 }),
                         ( "", -1 ))
        self.assertEqual(get_token_before_entity(" one two three", { "start": 1, "end": 4 }),
                         ( "", -1 ))
        self.assertEqual(get_token_before_entity("one two three", { "start": 1, "end": 3 }),
                         ( "o", 0 ))

    def test_get_token_after_entity(self):
        self.assertEqual(get_token_after_entity("one two three", { "start": 4, "end": 7 }),
                         ( "three", 13 ))
        self.assertEqual(get_token_after_entity("one two three", { "start": 8, "end": 13 }),
                         ( "", 14 ))
        self.assertEqual(get_token_after_entity("one two three ", { "start": 8, "end": 13 }),
                         ( "", 15 ))
        self.assertEqual(get_token_after_entity("one two three", { "start": 8, "end": 12 }),
                         ( "e", 13 ))

    def test_get_first_token_of_entity(self):
        self.assertEqual(get_first_token_of_entity("one two three", { "start": 4, "end": 13 }),
                         ( "two", 7 ))
        self.assertEqual(get_first_token_of_entity("one two three", { "start": 5, "end": 13 }),
                         ( "wo", 7 ))
        self.assertEqual(get_first_token_of_entity("one two three ", { "start": 3, "end": 13 }),
                         ( "two", 7 ))
        self.assertEqual(get_first_token_of_entity("one two three ", { "start": 3, "end": 7 }),
                         ( "two", 7 ))

    def test_get_last_token_of_entity(self):
        self.assertEqual(get_last_token_of_entity("one two three", { "start": 0, "end": 7 }),
                         ( "two", 4 ))
        self.assertEqual(get_last_token_of_entity("one two three", { "start": 0, "end": 6 }),
                         ( "tw", 4 ))
        self.assertEqual(get_last_token_of_entity("one two three ", { "start": 0, "end": 8 }),
                         ( "two", 4 ))
        self.assertEqual(get_last_token_of_entity("one two three ", { "start": 4, "end": 7 }),
                         ( "two", 4 ))

    def test_remove_first_token_of_entity(self):
        self.assertEqual(remove_first_token_of_entity("one two three", { "start": 4, "end": 13 }),
                         { "start": 8, "end": 13, "word": "three" })
        self.assertEqual(remove_first_token_of_entity("one two three", { "start": 5, "end": 13 }),
                         { "start": 8, "end": 13, "word": "three" })
        self.assertEqual(remove_first_token_of_entity("one two three ", { "start": 3, "end": 13 }),
                         { "start": 8, "end": 13, "word": "three" })
        self.assertEqual(remove_first_token_of_entity("one two three ", { "start": 4, "end": 7 }),
                         { "start": 7, "end": 7, "word": "" })

    def test_remove_last_token_of_entity(self):
        self.assertEqual(remove_last_token_of_entity("one two three", { "start": 0, "end": 7 }),
                         { "start": 0, "end": 3, "word": "one" })
        self.assertEqual(remove_last_token_of_entity("one two three", { "start": 0, "end": 6 }),
                         { "start": 0, "end": 3, "word": "one" })
        self.assertEqual(remove_last_token_of_entity("one two three ", { "start": 0, "end": 8 }),
                         { "start": 0, "end": 3, "word": "one" })
        self.assertEqual(remove_last_token_of_entity("one two three ", { "start": 4, "end": 7 }),
                         { "start": 4, "end": 4, "word": "" })

In [None]:
# run the tests inside the notebook: argv=[''] keeps unittest from reading the arguments of
# the notebook kernel and exit=False keeps it from stopping the kernel afterwards
unittest.main(argv=[''], verbosity=2, exit=False)