# Entity linking

In [None]:
from ast import literal_eval
from math import nan
import os
import pandas as pd
import regex
import sys
sys.path.append(os.getcwd() + '/..')
from scripts import utils

## 1. Link entities based on name and death year

In [None]:
# Certificate column that holds the age of the person in each role.
# NOTE(review): the entry for "Echtgeno(o)t(e)" points at "Beroep12", the same
# column used for that role in profession_keys below - confirm whether the
# certificates contain a separate age column for the spouse.
age_keys = {
    "overledene": "Leeftijd",
    "Vader": "Leeftijd/ovl",
    "Moeder": "Leeftijd/ovl4",
    "Aangever": "Leeftijd6",
    "Getuige": "Leeftijd8",
    "Getuige9": "Leeftijd11",
    "Echtgeno(o)t(e)": "Beroep12",
}

# Certificate column that holds the profession of the person in each role.
profession_keys = {
    "overledene": "Beroep",
    "Vader": "Beroep2",
    "Moeder": "Beroep3",
    "Aangever": "Beroep5",
    "Getuige": "Beroep7",
    "Getuige9": "Beroep10",
    "Echtgeno(o)t(e)": "Beroep12",
}

In [None]:
def other_role(role_list):
    """ tell whether a person data list holds at least one item whose role (first field) is not "overledene" (deceased) """
    return any(role[0] != "overledene" for role in role_list)

In [None]:
def ages_known(role_list):
    """ tell whether a person data list holds at least two dated items, of which at most one counts for the deceased role

    An item is dated when its third field, or its fifth field when present,
    is a string that contains a digit. All dated "overledene" items together
    count as one, every other dated item counts separately.
    """
    def has_digit(value):
        # non-string values (for example missing cells) never count as dated
        return isinstance(value, str) and regex.search("[0-9]", value) is not None

    dated_other_roles = 0
    dated_deceased = False
    for role in role_list:
        if not (has_digit(role[2]) or (len(role) > 4 and has_digit(role[4]))):
            continue
        if role[0] == "overledene":
            dated_deceased = True
        else:
            dated_other_roles += 1
    return dated_other_roles + int(dated_deceased) > 1

In [None]:
def get_linkable_persons(person_data):
    """ select the (name, data list) pairs that qualify for linking, longest data lists first

    A name qualifies when it does not contain "levenloos" and its data list
    passes both other_role/1 and ages_known/1.
    """
    names_by_size = list(person_data.keys())
    names_by_size.sort(key=lambda name: len(person_data[name]), reverse=True)
    linkable = []
    for name in names_by_size:
        if regex.search("levenloos", name):
            continue
        records = person_data[name]
        if other_role(records) and ages_known(records):
            linkable.append((name, records))
    return linkable

In [None]:
def get_person_data_from_certificates(known_certificates):
    """ collect, per lower-cased person name, the data items found in the certificates

    Every data item is a tuple (role, year, age, profession); items for the
    deceased additionally carry the birth date and items for the father
    carry the name of the deceased when that name is available. Names which
    are not strings or which contain a digit are skipped.
    """
    related_roles = ["Vader", "Moeder", "Aangever", "Getuige", "Getuige9", "Echtgeno(o)t(e)"]
    person_data = {}
    for _, row in known_certificates.iterrows():
        deceased_name = ""
        if isinstance(row["Achternaam"], str) and isinstance(row["Voornamen"], str):
            deceased_name = (row["Voornamen"] + " " + row["Achternaam"]).lower()
            if not regex.search("[0-9]", deceased_name):
                deceased = ("overledene",
                            row["Jaar"],
                            row[age_keys["overledene"]],
                            row[profession_keys["overledene"]],
                            row["Geb.datum"])
                person_data.setdefault(deceased_name, []).append(deceased)

        for role in related_roles:
            person = row[role]
            if not isinstance(person, str) or regex.search("[0-9]", person):
                continue
            data = (role.lower(), row["Jaar"], row[age_keys[role]], row[profession_keys[role]])
            if role == "Vader" and deceased_name != "":
                # remember the child so that father-child relations can be followed later
                data = data + (deceased_name,)
            person_data.setdefault(person.lower(), []).append(data)
    return person_data

In [None]:
def get_age_from_string(string, reported_errors={}):
    """ extract the age in years from a string which could include extra data

    Returns the age as a string of digits (possibly with surrounding
    whitespace when the input is digits only), "0" for ages expressed in
    months, weeks, days or hours, and None when no age can be extracted
    (including input that is not a string, such as a missing value).

    reported_errors keeps track of the unknown formats which have already
    been reported, so that each is printed only once. NOTE: the default
    dictionary is created once and shared by all calls that do not pass
    their own dictionary.
    """
    if not isinstance(string, str):
        return None
    # the flags must be passed by keyword to sub(): its fourth positional
    # argument is the replacement count, not the flags
    years_pattern = "^([~±<> ;`'\"]*|ruim )([0-9]+)( *)(jaar|jaren|½| en | of | a ).*$"
    if regex.search("^[0-9]+$", string):
        return string
    elif regex.search(years_pattern, string, flags=regex.IGNORECASE):
        return regex.sub(years_pattern, "\\2", string, flags=regex.IGNORECASE)
    elif regex.search("^[~±<> ;`'\"]*[0-9]+ *(maanden|mnd|maand|weken|week|dagen|dgn|uren|uur).*$", string, flags=regex.IGNORECASE):
        # younger than one year
        return "0"
    elif regex.search("^[~±<> ;`'\"][0-9]+$", string):
        return regex.sub("^[~±<> ;`'\"]", "", string)
    elif regex.search("[0-9]+ *j$", string, flags=regex.IGNORECASE):
        return regex.sub(" *j$", "", string, flags=regex.IGNORECASE)
    else:
        if regex.search("[0-9]", string) and string not in reported_errors:
            utils.print_with_color(f"unknown age string format in {string}\n")
            reported_errors[string] = True
        return None

In [None]:
def make_link(person_links, data1, data2):
    """ create a link between two data items which refer to the same person

    person_links maps the string form of every linked data item to the
    string form of the representative item of its group; it is updated in
    place. When both items already belong to different groups, the complete
    group of data2 is merged into the group of data1.
    """
    key1 = str(data1)
    key2 = str(data2)
    group1 = person_links.get(key1)
    group2 = person_links.get(key2)
    if group1 is None and group2 is None:
        # first link for both items: data1 becomes the representative
        person_links[key1] = key1
        person_links[key2] = key1
    elif group1 is None:
        person_links[key1] = group2
    elif group2 is None:
        person_links[key2] = group1
    elif group1 != group2:
        # relabel every member of the group of data2, not only data2 itself;
        # only values of existing keys change, so iterating is safe
        for key, group in person_links.items():
            if group == group2:
                person_links[key] = group1

In [None]:
def _estimate_birth_year(data, reported_errors):
    """ estimate the birth year of a data item as its year minus its age; None when either is unusable """
    try:
        return int(data[1]) - int(get_age_from_string(data[2], reported_errors))
    except (TypeError, ValueError, OverflowError, IndexError):
        # missing values, unparsable ages and incomplete items cannot be linked
        return None


def make_person_links(linkable_persons):
    """ check which items in a person data list can be merged based on birth year and name

    linkable_persons is a list of (name, data list) pairs. Two data items of
    the same name are linked when their estimated birth years differ by at
    most one year. Returns a dictionary which maps every name to the links
    created by make_link/3 and prints the number of linked pairs.
    """
    nbr_of_person_links = 0
    reported_errors = {}
    linked_person_data = {}
    for linkable_person in linkable_persons:
        person_links = {}
        # parse every data item once instead of once per pair
        records = [(str(data), data, _estimate_birth_year(data, reported_errors))
                   for data in linkable_person[1]]
        for key1, data1, birth_year1 in records:
            if birth_year1 is None:
                continue
            for key2, data2, birth_year2 in records:
                # the string order makes sure that every pair is handled once
                if birth_year2 is None or not key1 < key2:
                    continue
                if abs(birth_year1 - birth_year2) <= 1:
                    make_link(person_links, data1, data2)
                    nbr_of_person_links += 1
        linked_person_data[linkable_person[0]] = person_links
    print(f"created {nbr_of_person_links} person links")
    return linked_person_data

In [None]:
def remove_duplicates(linkable_persons):
    """ drop repeated data items from the data list of every (name, data list) pair; the order of the remaining items is not preserved """
    deduplicated = []
    for data_in in linkable_persons:
        unique_items = set(data_in[1])
        deduplicated.append((data_in[0], list(unique_items)))
    return deduplicated

In [None]:
def summarize(name, data, missing_deaths=0, debug=False):
    """ group the linked data items of one name and count the groups without a death record

    data maps the string form of a data item to the string form of the
    representative item of its group. Returns a tuple (summary,
    missing_deaths): summary maps every representative to the list of items
    in its group and missing_deaths is the incoming count increased by one
    for every group in which no item mentions "overledene". With debug=True
    the name and the groups are printed.
    """
    summary = {}
    for key, group in data.items():
        summary.setdefault(group, []).append(key)
    if summary and debug:
        print(f"\nNAME: {name}")
    for key, members in summary.items():
        if debug:
            print(key)
            for value in members:
                if value != key:
                    print(f"   {value}")
        # count per group: every group stands for one person
        if "overledene" not in key and not any("overledene" in value for value in members):
            missing_deaths += 1
    return summary, missing_deaths

In [None]:
# Load the death certificates table; low_memory=False makes pandas read the
# file in one pass instead of in chunks.
known_certificates = pd.read_csv("../../data/Overlijden/x-misc/Overlijden 1831-1950 JESSYv2-1831-1929.csv", low_memory=False)
print(f"there are {len(known_certificates)} certificates")

In [None]:
# Collect, per lower-cased name, all data items found for that name in the certificates.
person_data = get_person_data_from_certificates(known_certificates)
print(f"data for {len(person_data)} persons were found in the certificates")

In [None]:
# Keep only the names that qualify for linking and drop repeated data items per name.
linkable_persons = remove_duplicates(get_linkable_persons(person_data))
print(f"there are {len(linkable_persons)} linkable persons")

In [None]:
# Link the data items of every linkable name; the result maps each name to its links.
linked_person_data = make_person_links(linkable_persons)
# bare expression: presumably kept so that the notebook displays the number of names
len(linked_person_data)

In [None]:
# Count the linked groups over all names and how many of them lack a death record.
missing_deaths = 0
nbr_of_records = 0
for name in linked_person_data:
    summary, missing_deaths = summarize(name, linked_person_data[name], missing_deaths)
    nbr_of_records += len(summary)
# report 0% instead of dividing by zero when no groups were found
print(f"number of person data: {nbr_of_records}; number of missing deaths: {missing_deaths} ({int(100*missing_deaths/nbr_of_records) if nbr_of_records else 0}%)")

## 2. Look for related entities including father-child relation

In [None]:
# Show, for every linked father item that names a child with links of its own,
# the groups of the father followed by the groups of the child.
for name in linked_person_data:
    for key in linked_person_data[name]:
        try:
            data = literal_eval(key)
        except (ValueError, TypeError, SyntaxError):
            # keys holding values without a literal form (for example nan) cannot be parsed back
            continue
        # only father items which carry a child name have a fifth field
        if isinstance(data, tuple) and len(data) > 4 and data[0] == "vader" and data[4] in linked_person_data:
            # results are not assigned: the calls are only made for their debug
            # output and must not overwrite the missing_deaths count computed above
            summarize(name, linked_person_data[name], debug=True)
            summarize(data[4], linked_person_data[data[4]], debug=True)

## 3. Look for links including mother-child relation

To link persons via mother information we need the mother's birth or death year, but these do not appear to be in the data. The years 1900-1905 seem to contain many column errors; please check.

In [None]:
def loop_test():
    """ print every mother data item whose age field is a string other than "ovl." or "ovl" (compared in lower case) """
    ignored_values = "ovl. ovl".split()
    for name, records in person_data.items():
        for data in records:
            if data[0] != "moeder" or not isinstance(data[2], str):
                continue
            if data[2].lower() not in ignored_values:
                print(name, data)
loop_test()

## 99. Unexpected link

In [None]:
# Print the linkable data of the two spelling variants of one name for manual inspection.
for person in linkable_persons:
    if person[0] in ("gerardus martinus eustatia", "gerardus martinus eustacia"):
        print(person)