# Entity linking

Code for entity linking experiments based on the file `Overlijdensmerged.csv`.

In [None]:
from ast import literal_eval
from math import nan
import os
import pandas as pd
import regex
import sys

## 1. Link entities based on name and death year

In [None]:
# Column headers of the certificate table, used verbatim as keys into each row.
# The person-role headers (DECEASED ... WITNESS9) also serve as role labels in
# the tuples built by get_person_data_from_certificates.
BIRTH_DATE = "Geb.datum"
DECEASED = "Overledene"
FATHER = "Vader"
FIRST_NAMES = "Voornamen"
INFORMANT = "Aangever"
MOTHER = "Moeder"
SPOUSE = "Echtgeno(o)t(e)"
SURNAME = "Achternaam"
WITNESS = "Getuige"
WITNESS9 = "Getuige9"
YEAR = "Jaar"

In [None]:
# Maps each role to the column from which that role's age is read.
age_keys = { DECEASED: "Leeftijd",
             FATHER: "Leeftijd/ovl",
             MOTHER: "Leeftijd/ovl4",
             INFORMANT: "Leeftijd6",
             WITNESS: "Leeftijd8",
             WITNESS9: "Leeftijd11",
             # NOTE(review): this is the same column as profession_keys[SPOUSE];
             # confirm whether the table has a separate age column for the spouse.
             SPOUSE: "Beroep12" }

# Maps each role to the column from which that role's profession is read.
profession_keys = { DECEASED: "Beroep",
                    FATHER: "Beroep2",
                    MOTHER: "Beroep3",
                    INFORMANT: "Beroep5",
                    WITNESS: "Beroep7",
                    WITNESS9: "Beroep10",
                    SPOUSE: "Beroep12" }

In [None]:
def get_person_data_from_certificates(known_certificates, person_data=None):
    """ extract person data from available certificates

    known_certificates: table whose iterrows() yields (index, row) pairs; each
        row is indexed by the column-name constants of this file.
    person_data: optional dict to extend; a fresh dict is created when omitted
        (a shared mutable default would leak entries from one call to the next).

    Returns a dict mapping a lower-cased name to a list of tuples
    (role, year, age, profession, birth date, name of the deceased).
    For the deceased the last field is ""; for other roles the birth date is "".
    Names containing a digit are skipped.
    """
    if person_data is None:
        person_data = {}
    for _, row in known_certificates.iterrows():
        deceased_name = ""
        if isinstance(row[SURNAME], str) and isinstance(row[FIRST_NAMES], str):
            deceased_name = " ".join([row[FIRST_NAMES], row[SURNAME]]).lower()
            if regex.search("[0-9]", deceased_name):
                # a digit in the name: do not store it and do not
                # attach it to the other persons on this certificate
                deceased_name = ""
            else:
                person_data.setdefault(deceased_name, []).append(
                    (DECEASED,
                     row[YEAR],
                     row[age_keys[DECEASED]],
                     row[profession_keys[DECEASED]],
                     row[BIRTH_DATE],
                     ""))
        for key in [FATHER, MOTHER, INFORMANT, WITNESS, WITNESS9, SPOUSE]:
            other_name = row[key]
            if isinstance(other_name, str) and other_name != "" and not regex.search("[0-9]", other_name):
                person_data.setdefault(other_name.lower(), []).append(
                    (key,
                     row[YEAR],
                     row[age_keys[key]],
                     row[profession_keys[key]],
                     "",
                     deceased_name))
    return person_data

In [None]:
def other_role(role_list):
    """ check whether at least one entry of a person data list has a role other than deceased/overledene """
    return any(role[0] != DECEASED for role in role_list)

In [None]:
def compute_birth_years(role_list):
    """ derive candidate birth years for every entry of a person data list

    The result has one item per entry: a list of candidate years, or NaN when
    nothing could be derived. A birth date string ending in four digits gives a
    single candidate. Otherwise a four-digit year string combined with an age
    string ("NN", "NN j..." or "NN m.../NN d...") gives two candidates: the
    computed year and the year before it.
    """
    candidates_per_role = []
    for role in role_list:
        birth_date = role[4]
        certificate_year = role[1]
        age = role[2]
        candidates = float("nan")
        if isinstance(birth_date, str) and regex.search("[0-9][0-9][0-9][0-9]$", birth_date):
            candidates = [int(birth_date[-4:])]
        elif (isinstance(certificate_year, str)
              and regex.search("^[0-9][0-9][0-9][0-9]$", certificate_year)
              and isinstance(age, str)):
            latest_year = None
            if regex.search("^[0-9][0-9]$", age):
                latest_year = int(certificate_year) - int(age)
            elif regex.search("^[0-9][0-9] j", age):
                # age in years followed by extra text: only the two leading digits count
                latest_year = int(certificate_year) - int(age[:2])
            elif regex.search("^[0-9][0-9] [md]", age):
                # age starting with "m" or "d" units: treated as less than one year
                latest_year = int(certificate_year)
            if latest_year is not None:
                candidates = [latest_year, latest_year - 1]
        candidates_per_role.append(candidates)
    return candidates_per_role

In [None]:
def ages_known(role_list):
    """ test if candidate birth years could be computed for at least two entries of a person data list """
    dated_entries = sum(1 for years in compute_birth_years(role_list) if isinstance(years, list))
    return dated_entries >= 2

In [None]:
def get_linkable_persons(person_data):
    """ select (name, person data list) pairs accepted by other_role/1 and ages_known/1

    Names are visited from most to fewest entries; names containing
    "levenloos" are skipped.
    """
    names_by_entry_count = sorted(person_data,
                                  key=lambda name: len(person_data[name]),
                                  reverse=True)
    linkable = []
    for name in names_by_entry_count:
        if regex.search("levenloos", name):
            continue
        roles = person_data[name]
        if other_role(roles) and ages_known(roles):
            linkable.append((name, roles))
    return linkable

In [None]:
def get_age_from_string(string, reported_errors={}):
    """ extract age in years from a string which could include extra data

    string: the raw age text.
    reported_errors: dict of strings that were already reported; the shared
        default is deliberate so that each unknown format is printed only once
        across calls.

    Returns the age as a string of digits, "0" for ages expressed in months,
    weeks, days or hours, and None when no age could be extracted. Unknown
    formats that contain a digit are reported once on standard output.
    """
    if regex.search("^[0-9]+$", string):
        return string
    years_pattern = "^([~±<> ;`'\"]*|ruim )([0-9]+)( *)(jaar|jaren|½| en | of | a ).*$"
    if regex.search(years_pattern, string, regex.IGNORECASE):
        # flags must be passed by keyword: the fourth positional argument of sub() is count
        return regex.sub(years_pattern, "\\2", string, flags=regex.IGNORECASE)
    if regex.search("^[~±<> ;`'\"]*[0-9]+ *(maanden|mnd|maand|weken|week|dagen|dgn|uren|uur).*$", string, regex.IGNORECASE):
        return "0"
    if regex.search("^[~±<> ;`'\"][0-9]+$", string):
        return regex.sub("^[~±<> ;`'\"]", "", string)
    if regex.search("[0-9]+ *j$", string, regex.IGNORECASE):
        return regex.sub(" *j$", "", string, flags=regex.IGNORECASE)
    if regex.search("[0-9]", string) and string not in reported_errors:
        print(f"unknown age string format in {string}\n")
        reported_errors[string] = True
    return None

In [None]:
def make_person_links(linkable_persons):
    """ group the entries of each name that share a candidate birth year

    linkable_persons: dict mapping a name to its person data list.

    Returns a dict mapping each name to a list of groups, one per entry with
    known birth years. A group starts with that entry, followed by every other
    entry of the same name that has at least one candidate birth year in common.
    """
    linked_person_data = {}
    for name, roles in linkable_persons.items():
        # keep only the entries for which birth years could be computed
        dated = [(role, years)
                 for role, years in zip(roles, compute_birth_years(roles))
                 if isinstance(years, list)]
        groups = [[role] for role, _ in dated]
        for first, (role_a, years_a) in enumerate(dated):
            for second in range(first + 1, len(dated)):
                role_b, years_b = dated[second]
                # one shared candidate year is enough; link the pair only once
                if any(year in years_b for year in years_a):
                    groups[first].append(role_b)
                    groups[second].append(role_a)
        linked_person_data[name] = groups
    return linked_person_data

In [None]:
def remove_duplicates(linkable_persons):
    """ drop repeated entries from every person data list, keeping the first occurrence

    linkable_persons: iterable of (name, person data list) pairs; the entries
        of each list must be hashable.

    Returns a new list of (name, deduplicated list) pairs. Entries keep the
    order in which they first appeared, so the result does not depend on hash
    ordering and is the same from run to run.
    """
    return [(data_in[0], list(dict.fromkeys(data_in[1]))) for data_in in linkable_persons]

In [None]:
def summarize(name, data, missing_deaths=0, debug=False):
    """ invert data into a value -> list of keys mapping and update a missing-death counter

    name: label printed in debug mode.
    data: mapping whose values are hashable; keys and values are searched for
        the text "overledene".
    missing_deaths: counter value to start from.
    debug: when True, print the name, every group and its members.

    Returns (summary, missing_deaths). The counter is raised by one when the
    last group processed contains "overledene" neither in its value nor in any
    of its keys; earlier groups do not affect the counter.
    """
    summary = {}
    for key in data:
        summary.setdefault(data[key], []).append(key)
    if not summary:
        return summary, missing_deaths
    if debug:
        print(f"\nNAME: {name}")
    missing_death = False
    for group, members in summary.items():
        if debug:
            print(group)
        missing_death = not regex.search("overledene", group)
        for member in members:
            if debug and member != group:
                print(f"   {member}")
            if missing_death:
                missing_death = not regex.search("overledene", member)
    if missing_death:
        missing_deaths += 1
    return summary, missing_deaths

In [None]:
def remove_duplicate_sets(linked_person_data):
    """ remove, per name, every group that holds the same entries as an earlier group

    linked_person_data: dict mapping a name to a list of groups (lists of
        hashable entries). Two groups are duplicates when they are equal as
        sets, so order and repetition inside a group are ignored.

    The lists are modified in place; the first group of each set of duplicates
    is kept. Returns None.
    """
    for name in linked_person_data:
        groups = linked_person_data[name]
        seen = set()
        unique_groups = []
        for group in groups:
            members = frozenset(group)
            if members not in seen:
                seen.add(members)
                unique_groups.append(group)
        # slice assignment keeps the original list object, as callers hold references to it
        groups[:] = unique_groups

In [None]:
def get_column_names(df):
    """ list the column names obtained by iterating df, skipping names that start with "Unnamed:" """
    column_names = []
    for column_name in df:
        if regex.search("^Unnamed:", column_name):
            continue
        column_names.append(column_name)
    return column_names

In [None]:
# initial tests were performed with data file Overlijden 1831-1950 JESSYv2-1831-1929.csv

# load the merged certificate table from a path relative to the working directory
# and report the number of rows
known_certificates = pd.read_csv("../../data/Overlijden/x-misc/Overlijdensmerged.csv", low_memory=False)
print(f"there are {len(known_certificates)} certificates")

In [None]:
# collect, per lower-cased name, every role in which that name occurs on a certificate
person_data = get_person_data_from_certificates(known_certificates)
print(f"data for {len(person_data)} persons were found in the certificates")

In [None]:
# spot check: show the entries collected for one name (raises KeyError when the name is absent)
person_data['esther curiel']

In [None]:
# keep the names accepted by get_linkable_persons and drop repeated entries per name
linkable_persons = remove_duplicates(get_linkable_persons(person_data))
print(f"there are {len(linkable_persons)} linkable persons")

In [None]:
# link the entries of every linkable name and show the number of names;
# counting the dict directly avoids indexing into each name string
linked_person_data = make_person_links(dict(linkable_persons))
len(linked_person_data)

In [None]:
# drop, in place, the groups that hold the same entries as an earlier group of the same name
remove_duplicate_sets(linked_person_data)

In [None]:
# number of names in linked_person_data (counted directly, without indexing into each name string)
len(linked_person_data)

In [None]:
# collect every (name, group) pair whose group holds at least one entry
# in the informant role or in one of the two witness roles
summary = []
for name, role_lists in linked_person_data.items():
    for role_list in role_lists:
        if any(role[0] in (INFORMANT, WITNESS, WITNESS9) for role in role_list):
            summary.append((name, role_list))

In [None]:
# number of distinct names in summary
len({entry[0] for entry in summary})

In [None]:
# count the distinct names that have a group with exactly one entry in the FATHER role
# NOTE(review): the variable names refer to the deceased role while the test uses FATHER;
# confirm which role is intended
deceased_role_found_names = []
for data in summary:
    deceased_role_counter = sum(1 for role in data[1] if role[0] == FATHER)
    if deceased_role_counter == 1:
        deceased_role_found_names.append(data[0])
print(len(set(deceased_role_found_names)))

In [None]:
# show the second-to-last summary entry (the slice yields a list with at most one element)
summary[-2:-1]

In [None]:
# groups of exactly three entries: the first two in the deceased role, the third in another role
[
    (name, roles)
    for name, roles in summary
    if len(roles) == 3 and [role[0] == DECEASED for role in roles] == [True, True, False]
]

## 2. Look for related entities including father-child relation

In [None]:
# look for father entries whose related person is also present in linked_person_data
# NOTE(review): literal_eval expects a string, and the role is compared with "vader" while
# roles are stored with the constant FATHER; confirm whether this cell still matches the
# current structure of linked_person_data.
for name in linked_person_data:
    for key in linked_person_data[name]:
        try:
            data = literal_eval(key)
            if data[0] == "vader" and data[4] in linked_person_data:
                summary, missing_deaths = summarize(name, linked_person_data[name], debug=True)
                summary, missing_deaths = summarize(data[4], linked_person_data[data[4]], debug=True)
        except (ValueError, TypeError, SyntaxError, IndexError, KeyError, MemoryError, RecursionError):
            # best effort: skip items that cannot be parsed or do not have the expected shape,
            # but let KeyboardInterrupt and SystemExit through so the cell can be interrupted
            continue

## 3. Look for links including mother-child relation

To link persons via the mother's information we need the birth or death year of the mother, but these do not appear to be in the data. The certificates for the years 1900-1905 seem to contain many column errors; please check.

In [None]:
def loop_test():
    """ print every mother entry in person_data whose age field is a string other than "ovl." or "ovl" """
    for name in person_data:
        for data in person_data[name]:
            # roles are stored with the MOTHER constant; a lower-case literal never matches it
            if data[0] == MOTHER and isinstance(data[2], str) and data[2].lower() not in "ovl. ovl".split():
                print(name, data)
loop_test()

## 99. Unexpected link

In [None]:
# print the linkable entries for two spelling variants of the same name
for person in linkable_persons:
    if person[0] in ("gerardus martinus eustatia", "gerardus martinus eustacia"):
        print(person)