# Info fields

Extract person names from the info fields `StartEntryInfo` and `EndEntryInfo` of the [slave registers of Suriname](https://datasets.iisg.amsterdam/dataset.xhtml?persistentId=hdl:10622/CSPBHO).

In [None]:
import os
import pandas as pd
import regex
import sys
from IPython.display import clear_output
sys.path.append(os.getcwd() + '/..')
from scripts import get_deceased_name

In [None]:
def squeal(text=None):
    """Clear the current cell output and, when *text* is given, print it.

    Used as an in-place progress indicator: each call replaces what the
    previous call displayed.
    """
    # wait=True postpones the clearing until new output is available.
    clear_output(wait=True)
    if text is None:
        return
    print(text)

In [None]:
# Location of the registers CSV file, relative to the notebook's working directory.
DATA_FILE = "../../data/suriname/Dataset Suriname Slave and Emancipation Registers Version 1.1.csv"

# low_memory=False lets pandas infer each column type from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
data = pd.read_csv(DATA_FILE, low_memory=False)

In [None]:
# Show how often each distinct value occurs in the StartEntryEventDetailed column.
data["StartEntryEventDetailed"].value_counts()

In [None]:
def get_previous_token(text, entity):
    """Return the whitespace-delimited token in front of *entity* in *text*.

    Args:
        text: the string the entity was found in.
        entity: mapping with at least a "start" key holding the character
            offset of the entity in *text*.

    Returns:
        A tuple (previous_token, start) where start is the offset at which
        the token begins. When the entity starts at offset 0 or 1 there is
        no room for a token: previous_token is "" and start is negative, so
        callers should only use start when previous_token is non-empty.
    """
    previous_token = ""
    # The character directly in front of the entity is taken to be a
    # separator; any additional whitespace before it is skipped as well.
    end = entity["start"] - 1
    while end > 0 and regex.search(r"\s", text[end-1]):
        end -= 1
    # Walk back from the last token character to the start of the token.
    start = end - 1
    while start > 0 and not regex.search(r"\s", text[start-1]):
        start -= 1
    if end >= 0:
        previous_token = text[start: end]
    return previous_token, start

In [None]:
# Hand-picked info text that mentions several people, used to try out the entity pipeline.
text = "Van H M Beekman voor 1/4. Van M M Beekman voor 1/4. Van den boedel I E van Wijck geboren Beekman voor 1/4. En van den boedel G F C Beekman voor het overige 1/4."
# NOTE(review): run_bert_pipeline presumably returns the entities tagged in the text — confirm in scripts/get_deceased_name.
entities = get_deceased_name.run_bert_pipeline(text)
# The value of this last expression is what the notebook cell displays.
get_deceased_name.combine_entities(get_deceased_name.expand_entities(entities, text))

In [None]:
# Words that belong to a name but may be left just outside the tagged entity;
# when one of them directly precedes an entity, the name is extended to include it.
MISSING_WORDS = [ "erven", "geb", "geboren", "weduwe", ]

def get_names_from_data(data, max_names=1000):
    """Collect person-name strings from the StartEntryInfo column of *data*.

    Every string cell is run through the get_deceased_name entity pipeline.
    For each entity whose label contains "PERSON" the corresponding text is
    stored, extended with the preceding token when that token is one of
    MISSING_WORDS. A name starting with "geb" that begins one character
    after the previous name of the same row is merged into that previous
    name instead of being stored as a separate person.

    Args:
        data: DataFrame with a "StartEntryInfo" column.
        max_names: stop after the row that brings the number of collected
            names to this value or above; None processes all rows.

    Returns:
        A list of (row index, name string) tuples.
    """
    names = []
    last_index = -1
    last_end = -1
    for index, row in data.iterrows():
        text = row["StartEntryInfo"]
        # Empty cells are read by pandas as NaN (a float); only strings can be tagged.
        if isinstance(text, str):
            try:
                entities = get_deceased_name.run_bert_pipeline(text)
                entities = get_deceased_name.combine_entities(
                    get_deceased_name.expand_entities(entities, text))
                for entity in entities:
                    if not regex.search("PERSON", entity["entity"]):
                        continue
                    start = entity["start"]
                    previous_token, token_start = get_previous_token(text, entity)
                    if previous_token.lower() in MISSING_WORDS:
                        start = token_start
                    name_string = text[start: entity["end"]]
                    if (name_string.startswith("geb") and index == last_index
                            and start == last_end + 1):
                        # Maiden-name continuation of the previous person in this row.
                        names[-1] = (names[-1][0], names[-1][1] + " " + name_string)
                    else:
                        names.append((index, name_string))
                    last_end = entity["end"]
                    last_index = index
            except Exception:
                # Best effort: a row the pipeline cannot handle is skipped.
                # KeyboardInterrupt is not caught, so a long run can be stopped.
                pass
        if index % 100 == 0:
            squeal(index)
        if max_names is not None and len(names) >= max_names:
            break
    return names

In [None]:
# Run the extraction over the loaded registers; the result is a list of (row index, name string) tuples.
names = get_names_from_data(data)

In [None]:
# Dump the raw extraction result for visual inspection.
print(names)

In [None]:
# Words that introduce or separate names and are not part of a name themselves.
prefixes_to_delete = [ "boedel", "en", ]
# Phrases that follow or separate names and are not part of a name themselves.
suffixes_to_delete = [ "beh.", "bij executie", "dd", "per executie", "qq", "voor den vrijdom", ]
# Strings that are tagged as a person but are not person names.
non_names = [ "den lande", "janij", "t", ]

def get_names_from_string(name_string):
    """Split a tagged name string into the individual names it contains.

    All matching is case-insensitive. A string that is empty, consists of
    whitespace only, or equals one of non_names yields no names. Otherwise
    the string is split recursively around the words in prefixes_to_delete
    and the phrases in suffixes_to_delete, and a leading prefix or trailing
    suffix is removed. A string in which none of these occur is returned as
    a single name.

    The words and phrases are inserted into the patterns unescaped, so the
    "." in "beh." matches any character.

    Args:
        name_string: text of one tagged person entity.

    Returns:
        A list of name strings, possibly empty.
    """
    # Splitting around a word at the very start or end leaves an empty
    # fragment; such fragments are not names.
    if not name_string.strip():
        return []
    names = []
    for non_name in non_names:
        if regex.search(rf"^{non_name}$", name_string, regex.IGNORECASE):
            return []
    for prefix in prefixes_to_delete:
        # Word in the middle: both sides may hold names.
        match = regex.search(rf"^(.*)\s+{prefix}\s+(.*)$", name_string, regex.IGNORECASE)
        if match:
            names.extend(get_names_from_string(match.group(1)))
            names.extend(get_names_from_string(match.group(2)))
            return names
        # Word at the start: continue with the remainder.
        match = regex.search(rf"^{prefix}\s+(.*)$", name_string, regex.IGNORECASE)
        if match:
            return get_names_from_string(match.group(1))
    for suffix in suffixes_to_delete:
        # Phrase in the middle: both sides may hold names.
        match = regex.search(rf"^(.*)\s+{suffix}\s+(.*)$", name_string, regex.IGNORECASE)
        if match:
            names.extend(get_names_from_string(match.group(1)))
            names.extend(get_names_from_string(match.group(2)))
            return names
        # Phrase at the end: continue with the part in front of it.
        match = regex.search(rf"^(.*)\s+{suffix}$", name_string, regex.IGNORECASE)
        if match:
            return get_names_from_string(match.group(1))
    names.append(name_string)
    return names

In [None]:
def cleanup_name(name_string):
    """Replace each period or comma, with any whitespace after it, by one space.

    A period or comma at the end of the string therefore leaves a trailing
    space in the result.
    """
    return regex.sub(r"[.,]\s*", " ", name_string)

In [None]:
# Look at the info text of the row labelled 557.
data["StartEntryInfo"][557]

In [None]:
# Lower-case words that mark the start of the last-name part of a name.
last_name_words = [ "d", "da", "de", "den", "der", "geb", "geboren", "v", "van" ]

def split_name(name_string):
    """Split a name into a (first names, last name) pair of strings.

    The name is split on whitespace. The last name starts at the first
    token, other than the final one, that is written in lower case and
    occurs in last_name_words; without such a token only the final token is
    the last name. An empty string gives ("", "").
    """
    tokens = name_string.split()
    # Default: everything except the final token belongs to the first names.
    boundary = max(len(tokens) - 1, 0)
    for position, token in enumerate(tokens[:-1]):
        # NOTE(review): the original condition also accepted empty tokens
        # ("not len(token)"), which str.split() never produces; possibly
        # "len(token) > 1" was meant, so that "Van" counts as well — confirm.
        if token == token.lower() and token.lower() in last_name_words:
            boundary = position
            break
    return " ".join(tokens[:boundary]), " ".join(tokens[boundary:])

In [None]:
# For the rows labelled 0-99 that have an info text: print the text, followed
# by the (first names, last name) pairs parsed from it.
for i in range(0, 100):
    info_text = data["StartEntryInfo"][i]
    if isinstance(info_text, str):
        print(info_text)
        # Only the first 100 extracted names are considered; slicing keeps
        # this safe when fewer than 100 names were extracted.
        for row_index, raw_name in names[:100]:
            if row_index == i:
                for name_string in get_names_from_string(cleanup_name(raw_name)):
                    print("   ", i, split_name(name_string))

In [None]:
# Print, for every extracted name, the row index and the (first names, last name) split.
for row_index, raw_name in names:
    cleaned_name = cleanup_name(raw_name)
    for single_name in get_names_from_string(cleaned_name):
        print(row_index, split_name(single_name))