# Entity linking

In [None]:
import os
import pandas as pd
import regex
import sys
sys.path.append(os.getcwd() + '/..')
from scripts import utils

In [None]:
# Role name -> name of the certificate column that holds that person's age.
# NOTE(review): the spouse entry points at "Beroep12", which is also the
# spouse's profession column in profession_keys below -- confirm whether the
# data simply has no age column for spouses or this is a copy-paste slip.
age_keys = {
    "overledene": "Leeftijd",
    "Vader": "Leeftijd/ovl",
    "Moeder": "Leeftijd/ovl4",
    "Aangever": "Leeftijd6",
    "Getuige": "Leeftijd8",
    "Getuige9": "Leeftijd11",
    "Echtgeno(o)t(e)": "Beroep12",
}

# Role name -> name of the certificate column that holds that person's profession.
profession_keys = {
    "overledene": "Beroep",
    "Vader": "Beroep2",
    "Moeder": "Beroep3",
    "Aangever": "Beroep5",
    "Getuige": "Beroep7",
    "Getuige9": "Beroep10",
    "Echtgeno(o)t(e)": "Beroep12",
}

In [None]:
def other_role(role_list):
    """ test if a person data list contains a role unequal to deceased/overledene

    Each item of role_list is a record whose first element is the role name.
    Returns True as soon as one record has a role other than "overledene",
    False otherwise (including for an empty list).
    """
    return any(record[0] != "overledene" for record in role_list)

In [None]:
def ages_known(role_list):
    """ test if a person data list holds at least two records with a known age, counting the deceased role at most once

    A record counts as having a known age when its age field (index 2) is a
    string containing a digit, or when it has a fifth field (index 4, the
    birth date stored for deceased records) that is a string containing a
    digit. Every such non-deceased record counts; all such "overledene"
    records together count as one.
    """
    def has_digit(value):
        # non-string values (e.g. missing cells) never count as known
        return isinstance(value, str) and regex.search("[0-9]", value)

    others_with_age = 0
    deceased_with_age = False
    for record in role_list:
        if not (has_digit(record[2]) or (len(record) > 4 and has_digit(record[4]))):
            continue
        if record[0] == "overledene":
            deceased_with_age = True
        else:
            others_with_age += 1
    return others_with_age + (1 if deceased_with_age else 0) > 1

In [None]:
def get_linkable_persons(person_data):
    """ select the persons whose records are worth linking, as (name, records) pairs

    person_data maps a name to its list of records. Names are visited from
    most to fewest records. A person is kept when the name does not contain
    "levenloos" and the records pass both other_role and ages_known.
    """
    names_by_record_count = sorted(person_data.keys(),
                                   key=lambda name: len(person_data[name]),
                                   reverse=True)
    linkable = []
    for name in names_by_record_count:
        if regex.search("levenloos", name):
            continue
        records = person_data[name]
        if other_role(records) and ages_known(records):
            linkable.append((name, records))
    return linkable

In [None]:
def get_person_data_from_certificates(known_certificates, age_columns=None, profession_columns=None):
    """ extract person data from available certificates

    known_certificates: pandas DataFrame with one certificate per row. It must
        have the columns "Achternaam", "Voornamen", "Jaar", "Geb.datum", one
        column per non-deceased role name, and the age/profession columns
        named by the two mappings.
    age_columns: optional mapping role name -> age column name; defaults to
        the module-level age_keys.
    profession_columns: optional mapping role name -> profession column name;
        defaults to the module-level profession_keys.

    Returns a dict mapping the lower-cased person name to a list of records.
    Deceased records are ("overledene", year, age, profession, birth date);
    all other records are (lower-cased role, year, age, profession).
    Names that are not strings or that contain a digit are skipped.
    """
    # resolve the defaults lazily so the module-level tables are only needed when no override is given
    if age_columns is None:
        age_columns = age_keys
    if profession_columns is None:
        profession_columns = profession_keys
    other_roles = ("Vader", "Moeder", "Aangever", "Getuige", "Getuige9", "Echtgeno(o)t(e)")

    person_data = {}
    for _, row in known_certificates.iterrows():
        if isinstance(row["Achternaam"], str) and isinstance(row["Voornamen"], str):
            name = " ".join([row["Voornamen"], row["Achternaam"]]).lower()
            if not regex.search("[0-9]", name):
                person_data.setdefault(name, []).append(("overledene",
                                                         row["Jaar"],
                                                         row[age_columns["overledene"]],
                                                         row[profession_columns["overledene"]],
                                                         row["Geb.datum"]))

        for key in other_roles:
            person = row[key]
            if isinstance(person, str) and not regex.search("[0-9]", person):
                # Look up and store under the same lower-cased key. Testing the
                # raw-case name against the lower-cased keys never matched for
                # capitalised names, so earlier records were overwritten.
                person_data.setdefault(person.lower(), []).append((key.lower(),
                                                                   row["Jaar"],
                                                                   row[age_columns[key]],
                                                                   row[profession_columns[key]]))
    return person_data

In [None]:
def _estimated_birth_year(record):
    """ estimate a record's birth year as certificate year (index 1) minus age (digits of index 2); None if unparsable """
    try:
        return int(record[1]) - int(regex.sub("[^0-9]", "", record[2]))
    except (TypeError, ValueError, OverflowError):
        # missing cells (None/NaN), empty ages and non-numeric years end up here
        return None


def make_person_links(linkable_persons):
    """ check which items in a person data list can be merged based on birth year and name

    linkable_persons: iterable of (name, records) pairs. Two records of the
    same name are linked when their estimated birth years differ by at most
    one year. Each unordered pair is considered once, with the record whose
    string form sorts first acting as the first record. Records whose year or
    age cannot be parsed are skipped.

    Prints one line per created link, a coloured "extra link!" notice whenever
    a record gains a second or later link, and a final total. Returns None.
    """
    nbr_of_person_links = 0
    # NOTE(review): links are keyed by str(record), which does not include the
    # person's name, so identical records of different persons share one entry
    # -- confirm this is intended.
    person_links = {}
    for linkable_person in linkable_persons:
        name, records = linkable_person[0], linkable_person[1]
        # stringify and parse every record once instead of once per pair
        keyed = [(str(record), record, _estimated_birth_year(record)) for record in records]
        for key1, data1, birth_year1 in keyed:
            if birth_year1 is None:
                continue
            for key2, data2, birth_year2 in keyed:
                if birth_year2 is None or not key1 < key2:
                    continue
                if abs(birth_year1 - birth_year2) > 1:
                    continue
                # Index the dict here. Calling it raised a TypeError that was
                # silently swallowed, so a record could never get a second link.
                if key2 in person_links.get(key1, ()):
                    continue
                print(f"linking {name}: {data1} with {data2}")
                nbr_of_person_links += 1
                for source, target in ((key1, key2), (key2, key1)):
                    if source in person_links:
                        person_links[source].append(target)
                        utils.print_with_color(f"extra link! ({len(person_links[source])})\n")
                    else:
                        person_links[source] = [target]
    print(f"created {nbr_of_person_links} person links")

In [None]:
# Load the certificate table from CSV (presumably death certificates, going by
# the "Overlijden" path -- confirm). low_memory=False makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed dtype inference.
known_certificates = pd.read_csv("../../data/Overlijden/x-misc/Overlijden 1831-1950 JESSYv2-1831-1929.csv", low_memory=False)
print(f"there are {len(known_certificates)} certificates")

In [None]:
# Build the dictionary name -> list of role records from the certificate rows.
person_data = get_person_data_from_certificates(known_certificates)
print(f"data for {len(person_data)} persons were found in the certificates")

In [None]:
# Keep only the persons that pass the other_role and ages_known filters and
# whose name does not contain "levenloos"; persons with most records come first.
linkable_persons = get_linkable_persons(person_data)
print(f"there are {len(linkable_persons)} linkable persons")

In [None]:
# Print the candidate links and their total; the function has no return
# statement, so the printed output is the only result of this cell.
make_person_links(linkable_persons)

In [None]:
# Spot check: print the records collected under two near-identical spellings
# (eustatia / eustacia) -- presumably the same person; verify by eye.
for person in linkable_persons:
    if person[0] in ("gerardus martinus eustatia", "gerardus martinus eustacia"):
        print(person)