# Info fields via regular expressions

Extract the names of persons from the info fields `StartEntryInfo` and `EndEntryInfo` of the [slave registers of Suriname](https://datasets.iisg.amsterdam/dataset.xhtml?persistentId=hdl:10622/CSPBHO) via regular expressions.

In [None]:
import os
import pandas as pd
import regex
import sys
from IPython.display import clear_output
sys.path.append(os.getcwd() + '/..')
from scripts import get_deceased_name

In [None]:
def squeal(text=None):
    """Clear the output of the current notebook cell and optionally print a message.

    Used to show a single, continuously refreshed progress line.

    Args:
        text: Message to print after clearing; nothing is printed when None.
    """
    # wait=True postpones the clearing until new output is available
    clear_output(wait=True)
    if text is not None:
        print(text)

## 1. Read data

In [None]:
# Path of the CSV export of the registers, relative to the notebook's working directory
DATA_FILE = "../../data/suriname/Dataset Suriname Slave and Emancipation Registers Version 1.1.csv"

# low_memory=False: let pandas parse the file in one pass instead of in chunks
data = pd.read_csv(DATA_FILE, low_memory=False)

## 2. Extract names from info field

In [None]:
def get_previous_token(text, entity):
    """Return the whitespace-delimited token that precedes an entity in a text.

    Args:
        text: The string in which the entity was found.
        entity: Mapping whose "start" value is the character offset of the
            entity in text.

    Returns:
        A (token, start) tuple. token is the preceding token, or "" when
        there is none. start is the offset at which the leftward scan
        stopped; it is the offset of the token when one was found and can
        be negative when no token precedes the entity.
    """
    previous_token = ""
    # The character directly in front of the entity is treated as a separator
    end = entity["start"] - 1
    # Skip any additional whitespace between the previous token and the entity
    while end > 0 and regex.search(r"\s", text[end-1]):
        end -= 1
    start = end - 1
    # Walk left until the first character of the previous token is reached
    while start > 0 and not regex.search(r"\s", text[start-1]):
        start -= 1
    if end >= 0:
        previous_token = text[start: end]
    return previous_token, start

In [None]:
def process_info_text(text, index=-1):
    """Extract the names mentioned in one info text and guess the role of each.

    Entities come from get_deceased_name.get_entities_from_text; only those
    whose "entity" label contains PERSON or GPE are considered. A directly
    preceding word from PREFIX_WORDS is made part of the name, and an entity
    starting with "geb" that immediately follows the previously stored name
    is appended to that name instead of being stored separately.

    Args:
        text: The info text to analyse.
        index: Value stored as first element of every returned tuple
            (get_names_from_data passes the row index of the text).

    Returns:
        A list of (index, name, role, previous_token) tuples. Entities that
        are assigned the role "location" are left out.
    """
    names = []
    last_end = -99  # sentinel: no name stored yet, so no entity can be adjacent to it
    mother_seen = False
    entities = get_deceased_name.get_entities_from_text(text)
    for entity in entities:
        if not regex.search("(PERSON|GPE)", entity["entity"]):
            continue
        previous_token, start = get_previous_token(text, entity)
        if previous_token.lower() in PREFIX_WORDS:
            # Extend the entity to the left so that it includes the prefix word
            entity["start"] = start
        name_string = text[entity["start"]: entity["end"]]
        follows_last_name = entity["start"] == last_end + 1 or entity["start"] == last_end + 2
        if regex.search("^geb", name_string) and follows_last_name:
            # Keep all four fields; dropping previous_token here would break
            # consumers that read element 3 of the tuple
            names[-1] = (names[-1][0], names[-1][1] + " " + name_string, names[-1][2], names[-1][3])
            continue
        role = "eigenaar"
        # Recompute: when the entity was extended, the relevant context is the
        # token in front of the prefix word
        previous_token, start = get_previous_token(text, entity)
        if regex.search("geboren (uit|van)", text, regex.IGNORECASE) and not mother_seen:
            # Only the first name of such a text is labelled as the mother
            role = "moeder"
            mother_seen = True
        elif regex.search("(gemanumitteerd|vrij *geworden)", text, regex.IGNORECASE):
            role = "vrijgemaakte"
        elif regex.search("genaamd", previous_token, regex.IGNORECASE):
            role = "vrijgemaakte"
        elif regex.search("(slaaf|slavin|slaven)", previous_token, regex.IGNORECASE):
            role = "slaafgemaakte"
        elif regex.search("(plant\\b|plantage|plantaadje|houtvelling|divisie|district)", previous_token, regex.IGNORECASE):
            role = "location"
        elif regex.search("(curator|deurwaarder|klerk|landschrijver|vendumeester)", previous_token, regex.IGNORECASE):
            role = "ambtenaar"
        if role != "location":
            names.append((index, text[entity["start"]: entity["end"]], role, previous_token))
            last_end = entity["end"]
    return names

In [None]:
# Lower-case words that become part of a name when they directly precede an entity (used by process_info_text)
PREFIX_WORDS = [ "boedel", "erven", "geb", "gebn", "geboren", "weduwe", ]

def get_names_from_data(data):
    """Extract names from the DATA_FIELD column of every row of data.

    Progress is reported through squeal for every row whose index is a
    multiple of 100 and once more at the end. Extraction stops early when
    CUT_OFF is positive and at least CUT_OFF names were collected.

    Args:
        data: DataFrame with a DATA_FIELD column; cells that are not strings
            are skipped. The index is expected to be numeric.

    Returns:
        The list of name tuples produced by process_info_text for all
        processed rows.
    """
    names = []
    index = 0  # keeps the final report valid when data has no rows
    # Only one column is needed, so iterate over it directly instead of
    # materialising every row with iterrows()
    for index, text in data[DATA_FIELD].items():
        if isinstance(text, str):
            names.extend(process_info_text(text, index))
        if index % 100 == 0:
            squeal(f"total lines: {len(data)}; processes lines: {index}; found names: {len(names)}")
        if CUT_OFF > 0 and len(names) >= CUT_OFF:
            break
    squeal(f"total lines: {len(data)}; processes lines: {index}; found names: {len(names)}")
    return names

In [None]:
# Column of the registers whose free text is searched for names
DATA_FIELD = "EndEntryInfo"
# Column whose value is copied to the "id" column of the results (see parse_names)
INDEX_FIELD = "Id_source"
# Stop extracting once at least this many names were found; a value <= 0 disables the limit
CUT_OFF = 1000

names = get_names_from_data(data)

## 3. Cleanup names

In [None]:
def cleanup_name(name_string):
    """Replace periods and commas (plus following whitespace) by one space and trim.

    Args:
        name_string: Raw name text.

    Returns:
        The text without periods and commas and without leading or trailing
        whitespace.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence
    name_string = regex.sub(r"[.,]\s*", " ", name_string)
    return name_string.strip()

In [None]:
# Words used by get_names_from_string to split a name string (word in the middle)
# or that are stripped when they start the string
prefixes_to_delete = [ "en",  "kurators", "notarieele", "slavin", ]
# Words used by get_names_from_string to split a name string (word in the middle)
# or that are stripped when they end the string
suffixes_to_delete = [ "/", "bij executie", "dd", "per executie", "febrij", "kolonien", "landsbelastingen", "resolutie",
                       "voor den vrijdom", "vrijdom", "den", "voor", ]
# Strings that are not names: a candidate equal to one of these (ignoring case and
# trailing non-letters) is discarded by get_names_from_string
non_names = [ "", ",", "akte", "aug", "augs", "august", "augusts", "augustus", "boedel", "custodi", "de hoofdgelden", "dec", 
              "decbr", "decemb", "december", "decemr", "den lande", "dezer", "erfenis", "erfgenaam", "erfgename", "erfgenamen", "executie", 
              "executie genomen", "febr", "febrij", "fo", "folio", "gemanumitteerd", "genomen", "geregistreerd", 
              "geregd", "gergd", "gouv", "gouv resol", "gouvern", "gouvernement", "janij", "januarij", "julij", 
              "junij", "kolonien", "kurators", "lande", "lot no", "maart", "no", "notarieele", "novemb", "october", "overleden", 
              "overschrijving", "plant", "qq", "resolutie", "ruiling", "slaaf", "t", "vendu", "vendumeester", "vrijdom", "zn", "zijne", ]
# Place names; get_names_from_string discards them in the same way as non_names
locations = [ "batavia", "nickerie", "vreeland", "spieringshoek", "kroonenburg", "caledonia", "molhoop", "libanon",
              "saltzhalen", "waterloo", "fairfield", "amsterdam", "paramaribo", "paradize", "felix", "dordrecht", 
              "tourtonne", "lochaber", "leliendaal", "bremen", "lugtenburg", "saramacca", "zeezigt", "munnikkendam", 
              "zwarigheid", "katwijk", "hooijland", "poelwijk", "alkmaar", "waijamoe", "petersburg", "johannesburg", 
              "toledo", "ornamibo", "sardam", "coronie", "saksen", "thorarica", "curaçao", "cottica", "andresa", 
              "curacao", ]


def get_names_from_string(name_string):
    """Split a cleaned name string into the names it contains.

    The string is discarded when it equals an entry of non_names or
    locations (ignoring case and trailing non-letters). Otherwise it is
    split recursively at digit sequences and at the words listed in
    prefixes_to_delete and suffixes_to_delete; such a word at the start
    (prefix) or end (suffix) of the string is removed.

    Args:
        name_string: Candidate name text.

    Returns:
        A list of name strings; empty when nothing name-like remains.
    """
    # NOTE: the list entries are inserted into the patterns unescaped, so they
    # must not contain regular expression metacharacters
    names = []
    for non_name in non_names + locations:
        match = regex.search(f"^{non_name}[^a-zA-Z]*$", name_string, regex.IGNORECASE)
        if match:
            return []
    # A number separates two name candidates
    match = regex.search(r"^(.*)\s*[0-9]+\s*(.*)$", name_string, regex.IGNORECASE)
    if match:
        names.extend(get_names_from_string(match.group(1)))
        names.extend(get_names_from_string(match.group(2)))
        return names
    for prefix in prefixes_to_delete:
        # Word in the middle: process the parts on both sides separately
        match = regex.search(rf"^(.*)\s+{prefix}\s+(.*)$", name_string, regex.IGNORECASE)
        if match:
            names.extend(get_names_from_string(match.group(1)))
            names.extend(get_names_from_string(match.group(2)))
            return names
        # Word at the start: drop it and process the remainder
        match = regex.search(rf"^{prefix}\s+(.*)$", name_string, regex.IGNORECASE)
        if match:
            return get_names_from_string(match.group(1))
    for suffix in suffixes_to_delete:
        # Word in the middle: process the parts on both sides separately
        match = regex.search(rf"^(.*)\s+{suffix}\s+(.*)$", name_string, regex.IGNORECASE)
        if match:
            names.extend(get_names_from_string(match.group(1)))
            names.extend(get_names_from_string(match.group(2)))
            return names
        # Word at the end: drop it and process the remainder
        match = regex.search(rf"^(.*)\s+{suffix}$", name_string, regex.IGNORECASE)
        if match:
            return get_names_from_string(match.group(1))
    names.append(name_string)
    return names

In [None]:
def get_names_from_string_with_punctuation(name_string):
    """Split a raw name string at commas and "qq" and return the names found.

    Every part is cleaned with cleanup_name and expanded with
    get_names_from_string. A resulting name that starts with geb, gebn or
    geboren is glued to the name before it.

    Args:
        name_string: Raw name text, possibly containing several names.

    Returns:
        The list of names.
    """
    candidates = [
        candidate
        for part in regex.split("(,|qq)", name_string)
        for candidate in get_names_from_string(cleanup_name(part))
    ]
    merged = []
    for candidate in candidates:
        is_born_as = regex.search("^(geb\\b|gebn\\b|geboren)", candidate, regex.IGNORECASE)
        if merged and is_born_as:
            # A maiden-name part belongs to the preceding name
            merged[-1] += " " + candidate
        else:
            merged.append(candidate)
    return merged

In [None]:
def split_name_default(name_string):
    """Split a name into tokens, taking the final token as the last name.

    Args:
        name_string: Name text with whitespace-separated tokens.

    Returns:
        A (first_name_tokens, last_name_tokens) tuple of lists; both lists
        are empty when the name contains no tokens.
    """
    tokens = name_string.split() if name_string else []
    return tokens[:-1], tokens[-1:]

In [None]:
last_name_words = [ "bo", "d", "da", "de", "den", "der", "du", "geb", "gebn", "geboren", "nom", "prive", "privé", "v", "van", "ux", ]

def expand_multi_token_last_name(first_name_tokens, last_name_tokens):
    """Move a trailing run of first-name tokens that starts a last name.

    The first token found in last_name_words, and every token after it, is
    moved to the front of the last name. first_name_tokens is shortened in
    place.

    Args:
        first_name_tokens: Tokens currently regarded as first names.
        last_name_tokens: Tokens currently regarded as the last name.

    Returns:
        The updated (first_name_tokens, last_name_tokens) tuple.
    """
    for position, token in enumerate(first_name_tokens):
        lowered = token.lower()
        if lowered not in last_name_words:
            continue
        # One-letter words only count when they are written in lower case
        if len(token) == 1 and lowered != token:
            continue
        last_name_tokens = first_name_tokens[position:] + last_name_tokens
        del first_name_tokens[position:]
        break
    return first_name_tokens, last_name_tokens

In [None]:
def include_last_name_before_born_as(first_name_tokens, last_name_tokens):
    """Pull the final first-name token into a last name that starts with a marker.

    When the last name starts with geb, gebn, geboren, beh, jr or sr, the
    token in front of it is moved to the last name. first_name_tokens is
    shortened in place.

    Args:
        first_name_tokens: Tokens currently regarded as first names.
        last_name_tokens: Tokens currently regarded as the last name.

    Returns:
        The updated (first_name_tokens, last_name_tokens) tuple.
    """
    if not first_name_tokens or not last_name_tokens:
        return first_name_tokens, last_name_tokens
    if regex.search("^(geb|gebn|geboren|beh|jr|sr)\\b", last_name_tokens[0], regex.IGNORECASE):
        moved_token = first_name_tokens.pop()
        last_name_tokens = [moved_token, *last_name_tokens]
    return first_name_tokens, last_name_tokens

In [None]:
def include_last_name_before_nom_ux(first_name_tokens, last_name_tokens):
    """Handle "<name> n|nom ux": move the name and the n/nom token to the last name.

    Applies when the last name starts with "ux", the final first-name token
    is "n" or "nom", and there are at least two first-name tokens. The two
    final first-name tokens are then moved to the front of the last name;
    first_name_tokens is shortened in place.

    Args:
        first_name_tokens: Tokens currently regarded as first names.
        last_name_tokens: Tokens currently regarded as the last name.

    Returns:
        The updated (first_name_tokens, last_name_tokens) tuple.
    """
    if len(first_name_tokens) < 2 or not last_name_tokens:
        return first_name_tokens, last_name_tokens
    starts_with_ux = regex.search("^ux\\b", last_name_tokens[0], regex.IGNORECASE)
    ends_with_nom = regex.search("^(n|nom)\\b", first_name_tokens[-1], regex.IGNORECASE)
    if starts_with_ux and ends_with_nom:
        last_name_tokens = first_name_tokens[-2:] + last_name_tokens
        del first_name_tokens[-2:]
    return first_name_tokens, last_name_tokens

In [None]:
def split_name(name_string, role):
    """Split a name into a first-name string and a last-name string.

    The default split (final token is the last name) is refined by
    expand_multi_token_last_name, include_last_name_before_born_as and
    include_last_name_before_nom_ux, in that order.

    Args:
        name_string: The name to split.
        role: Role assigned to the person; when no first-name tokens remain,
            the name is returned as last name for role "eigenaar" and as
            first name for every other role.

    Returns:
        A (first_name, last_name) tuple of strings.
    """
    first_name_tokens, last_name_tokens = split_name_default(name_string)
    refinements = (
        expand_multi_token_last_name,
        include_last_name_before_born_as,
        include_last_name_before_nom_ux,
    )
    for refine in refinements:
        first_name_tokens, last_name_tokens = refine(first_name_tokens, last_name_tokens)
    first_name = " ".join(first_name_tokens)
    last_name = " ".join(last_name_tokens)
    if role != "eigenaar" and len(first_name_tokens) == 0:
        # A single-part name of a non-owner is regarded as a first name
        return last_name, first_name
    return first_name, last_name

In [None]:
PREFIX_TOKENS = [ "bl", "boedel", "erven", "mr", "we", "weduwe", ]

def get_prefix_tokens(first_name_tokens):
    """Separate leading prefix words from a list of first-name tokens.

    Leading tokens found in PREFIX_TOKENS (compared in lower case) are
    removed from first_name_tokens in place.

    Args:
        first_name_tokens: Tokens currently regarded as first names.

    Returns:
        A (prefix_tokens, first_name_tokens) tuple of lists.
    """
    prefix_count = 0
    for token in first_name_tokens:
        if token.lower() not in PREFIX_TOKENS:
            break
        prefix_count += 1
    prefix_tokens = first_name_tokens[:prefix_count]
    del first_name_tokens[:prefix_count]
    return prefix_tokens, first_name_tokens

In [None]:
INFIX_TOKENS = [ "d'", "da", "de", "del", "den", "der", "des", "du", "d'", "het", "la", "du", "l'", "la", "le", "'t", "ter", "v", "van", "von" ]

def get_infix_tokens(last_name_tokens):
    """Separate leading infix words from a list of last-name tokens.

    Leading tokens found in INFIX_TOKENS (compared in lower case) are
    removed from last_name_tokens in place.

    Args:
        last_name_tokens: Tokens currently regarded as the last name.

    Returns:
        An (infix_tokens, last_name_tokens) tuple of lists.
    """
    infix_count = 0
    for token in last_name_tokens:
        if token.lower() not in INFIX_TOKENS:
            break
        infix_count += 1
    infix_tokens = last_name_tokens[:infix_count]
    del last_name_tokens[:infix_count]
    return infix_tokens, last_name_tokens

In [None]:
SUFFIX_TOKENS = [ "beh", "cs", "jr", "nom", "n", "prive", "qq", "sr", "ux" ]

def get_suffix_tokens(last_name_tokens):
    """Separate trailing suffix words from a list of last-name tokens.

    Trailing tokens found in SUFFIX_TOKENS (compared in lower case) are
    removed from last_name_tokens in place; their order is kept.

    Args:
        last_name_tokens: Tokens currently regarded as the last name.

    Returns:
        A (last_name_tokens, suffix_tokens) tuple of lists.
    """
    cut = len(last_name_tokens)
    while cut > 0 and last_name_tokens[cut - 1].lower() in SUFFIX_TOKENS:
        cut -= 1
    suffix_tokens = last_name_tokens[cut:]
    del last_name_tokens[cut:]
    return last_name_tokens, suffix_tokens

In [None]:
def split_name_in_five(first_name, last_name):
    """Split a first and last name into prefix, first name, infix, last name and suffix.

    Args:
        first_name: First-name string; leading prefix words are split off.
        last_name: Last-name string; leading infix words and trailing suffix
            words are split off.

    Returns:
        A 5-tuple of strings: (prefix, first name, infix, last name, suffix).
    """
    prefix_tokens, given_tokens = get_prefix_tokens(first_name.split())
    infix_tokens, remaining_tokens = get_infix_tokens(last_name.split())
    family_tokens, suffix_tokens = get_suffix_tokens(remaining_tokens)
    parts = (prefix_tokens, given_tokens, infix_tokens, family_tokens, suffix_tokens)
    return tuple(" ".join(tokens) for tokens in parts)

In [None]:
def save_names(results):
    """Write the results table to "<DATA_FIELD>.csv" in the working directory.

    Args:
        results: DataFrame to save; all columns are written, the index is not.
    """
    output_file = DATA_FIELD + ".csv"
    results.to_csv(output_file, columns=results.columns, index=False)

In [None]:
def fix_uit(name_tuple, first_name, last_name):
    """Relabel a lone name as a mother's first name when the source reads "uit <name>".

    Applies only when first_name is empty and the complete source text
    (data[DATA_FIELD] at the index stored in name_tuple) consists of "uit",
    the last name and optional trailing non-whitespace characters.

    Args:
        name_tuple: Name tuple as produced by process_info_text; element 0
            is the index of the source text in data.
        first_name: First name found so far.
        last_name: Last name found so far.

    Returns:
        The possibly updated (name_tuple, first_name, last_name); on a match
        the last name becomes the first name and the role becomes "moeder".
    """
    if first_name != "":
        return name_tuple, first_name, last_name
    source_text = data[DATA_FIELD][name_tuple[0]]
    # The name comes from the data, so it must be matched literally
    pattern = rf"^uit\s+{regex.escape(last_name)}\S*$"
    if regex.search(pattern, source_text, regex.IGNORECASE):
        first_name = last_name
        last_name = ""
        # Replace only the role and keep whatever follows it in the tuple
        name_tuple = tuple(name_tuple[:2]) + ("moeder",) + tuple(name_tuple[3:])
    return name_tuple, first_name, last_name

In [None]:
def parse_names(names):
    """Turn extracted name tuples into a table with one row per name.

    Args:
        names: Name tuples as produced by process_info_text; element 0 is
            the index of the source row in data, element 1 the name text and
            element 2 the role.

    Returns:
        A DataFrame with the columns id, prefix, voornaam, infix,
        achternaam, suffix, rol and tekstbron.
    """
    output_columns = ["id", "prefix", "voornaam", "infix", "achternaam", "suffix", "rol", "tekstbron" ]
    rows = []
    for current_tuple in names:
        name_strings = get_names_from_string_with_punctuation(current_tuple[1])
        for name_string in name_strings:
            first_name, last_name = split_name(name_string, current_tuple[2])
            # fix_uit may change the role; the change also applies to the
            # remaining names of this tuple
            current_tuple, first_name, last_name = fix_uit(current_tuple, first_name, last_name)
            prefix, first_name, infix, last_name, suffix = split_name_in_five(first_name, last_name)
            row_index = current_tuple[0]
            rows.append([
                data[INDEX_FIELD][row_index],
                prefix,
                first_name,
                infix,
                last_name,
                suffix,
                current_tuple[2],
                data[DATA_FIELD][row_index],
            ])
    return pd.DataFrame(rows, columns=output_columns)

In [None]:
# Build the results table from a shallow copy of the extracted name tuples
results = parse_names([x for x in names])

In [None]:
# Spot check: show the source text of the result row with index 98
results["tekstbron"][98]

In [None]:
# Spot check: show the result rows at positions 240 up to (not including) 300
results[240:300]

In [None]:
# Write the results table to a CSV file named after DATA_FIELD
save_names(results)

## 4. Check names

In [None]:
def count_preceding_tokens(names):
    """Count how often each preceding token occurs in the extracted names.

    Args:
        names: Name tuples as produced by process_info_text; element 3 is
            the token that preceded the name in the source text.

    Returns:
        A Series mapping each preceding token to its frequency.
    """
    name_table = pd.DataFrame(names)
    preceding_tokens = name_table[3]
    return preceding_tokens.value_counts()

In [None]:
def count_names_without_first_name(results):
    """Count the last names of the rows that have an empty first name.

    Args:
        results: Table with name columns. The column names written by
            parse_names ("voornaam", "achternaam") are used when present;
            otherwise the English names "first_name" and "last_name" are
            expected.

    Returns:
        The value counts of the last names in rows with an empty first name.
    """
    # parse_names produces Dutch column names; keep accepting the English ones
    first_column = "voornaam" if "voornaam" in results.columns else "first_name"
    last_column = "achternaam" if "achternaam" in results.columns else "last_name"
    empty_first_name = results.loc[results[first_column] == "", last_column].tolist()
    return pd.DataFrame(empty_first_name).value_counts()

In [None]:
def count_names_without_last_name(results):
    """Count the first names of the rows that have an empty last name.

    Args:
        results: Table with name columns. The column names written by
            parse_names ("voornaam", "achternaam") are used when present;
            otherwise the English names "first_name" and "last_name" are
            expected.

    Returns:
        The value counts of the first names in rows with an empty last name.
    """
    # parse_names produces Dutch column names; keep accepting the English ones
    first_column = "voornaam" if "voornaam" in results.columns else "first_name"
    last_column = "achternaam" if "achternaam" in results.columns else "last_name"
    empty_last_name = results.loc[results[last_column] == "", first_column].tolist()
    return pd.DataFrame(empty_last_name).value_counts()

In [None]:
def count_first_names(results):
    """Count how often each first name occurs in the results.

    Args:
        results: Table with a first-name column: "voornaam" (as written by
            parse_names) when present, otherwise "first_name".

    Returns:
        A Series mapping each first name to its frequency.
    """
    # parse_names produces Dutch column names; keep accepting the English one
    first_column = "voornaam" if "voornaam" in results.columns else "first_name"
    return results[first_column].value_counts()

In [None]:
def count_last_names(results):
    """Count how often each last name occurs in the results.

    Args:
        results: Table with a last-name column: "achternaam" (as written by
            parse_names) when present, otherwise "last_name".

    Returns:
        A Series mapping each last name to its frequency.
    """
    # parse_names produces Dutch column names; keep accepting the English one
    last_column = "achternaam" if "achternaam" in results.columns else "last_name"
    return results[last_column].value_counts()

In [None]:
def find_string_in_results(results, string, field="source"):
    """Select the result rows whose field matches a regular expression.

    Args:
        results: Table to search.
        string: Regular expression, matched case-insensitively.
        field: Column to search. The English names "source", "first_name"
            and "last_name" are mapped to "tekstbron", "voornaam" and
            "achternaam" when the table only has the Dutch column names
            written by parse_names.

    Returns:
        A DataFrame with the matching rows; cells that are not strings never
        match.
    """
    dutch_names = {"source": "tekstbron", "first_name": "voornaam", "last_name": "achternaam"}
    if field not in results.columns and dutch_names.get(field) in results.columns:
        field = dutch_names[field]
    selected = []
    for _, row in results.iterrows():
        value = row[field]
        # Empty cells are NaN after a CSV round trip and cannot be searched
        if isinstance(value, str) and regex.search(string, value, regex.IGNORECASE):
            selected.append(row)
    return pd.DataFrame(selected)

In [None]:
# Show the frequency of every token that preceded an extracted name
count_preceding_tokens(names).to_dict()

In [None]:
# Show the result rows whose last name contains "zn"; the last-name column
# of the table built by parse_names is called "achternaam"
find_string_in_results(results, "zn", "achternaam")

In [None]:
# Spot check: show the info text of the data row with index 189873
data[DATA_FIELD][189873]

In [None]:
# Show the frequency of the last names that have no first name
count_names_without_first_name(results).to_dict()

In [None]:
# Spot check: run the name extraction on a single example sentence
process_info_text("Vrij geworden en thans genaamd François Jacobus Hendrik Roosdijk.")

In [None]:
# Spot check: show the raw entities found in the info text of the data row with index 3310
get_deceased_name.get_entities_from_text(data[DATA_FIELD][3310])

In [None]:
# Print every info text that contains "district" (case-sensitive) for manual inspection
for text in data[DATA_FIELD]:
    # Cells that are not strings are skipped
    if isinstance(text, str) and regex.search("district", text):
        print(text)