# Code Validation / Refactoring
## Non-functionalized versions for easier troubleshooting

In [None]:
#no_test
#dependencies

#nlp packages
import spacy
from spacy.util import minibatch, compounding

#manipulation of tables/arrays
import pandas as pd
import numpy as np
import copy
import json

import difflib
import matplotlib.pyplot as plt
import matplotlib as mpl

#internal imports
from ssda_nlp.collate import *
from ssda_nlp.split_data import *
from ssda_nlp.modeling import *
from ssda_nlp.model_performance_utils import *
from ssda_nlp.xml_parser import *
from ssda_nlp.unstructured2markup import *
from ssda_nlp.utility import *
from ssda_nlp.relationships import *
from ssda_nlp.full_volume import *

## Making better similar_names logic

In [None]:
import unicodedata

def strip_accents(text):
    '''
    removes diacritics from a string and discards any character with no ASCII equivalent
        text: the string to clean

        returns: the ASCII-only version of text
    '''
    try:
        # Python 2 only: promote a byte string to unicode before normalizing
        text = unicode(text, 'utf-8')
    except NameError:
        # Python 3 has no unicode builtin; the text is used as given
        pass

    # NFD splits each accented letter into a base letter plus combining marks
    decomposed = unicodedata.normalize('NFD', text)
    # encoding to ASCII with "ignore" drops the combining marks and every other non-ASCII character
    ascii_bytes = decomposed.encode('ascii', 'ignore')

    return str(ascii_bytes.decode("utf-8"))

# Print each sample before and after accent stripping
for my_string in ('àéêöhello', "á, é, í, ó, ú, ü, ñ, ¿, ¡, ?, #,  , !, $, @, ."):
    print(my_string)
    s = strip_accents(my_string)
    print(s)

In [None]:
#no_test

# Troubleshooting version of the similar-names check: for every person in the entry,
# add their name to name_list and run three heuristics over all names collected so far.
for person_idx, entry_person in enumerate(entry_people):
    relationships = entry_person.get('relationships')

    #are there any people with very similar names that still appear separately
    nameTemp = entry_person.get('name')
    name_list.append(nameTemp)

    #1: Short name that could be a first name
    #Split the names into those found in all_first_names and everything else
    possibleFirstNames = [name for name in name_list if name in all_first_names]
    fullNames = [name for name in name_list if name not in all_first_names]
    #Check to see if names appear within each other (i.e. is a person double counted)
    doubleCountedNames = []
    for first_name in possibleFirstNames:
        doubleCountedNames.extend(name for name in fullNames if first_name in name)
    if doubleCountedNames:
        print("Possible double count (first name appears in a second instance (full name))")
        print(name_list)
        similarNames = 1

    #2: Two similarly-sized names, that could be variations (i.e. missing hyphens or have #'s for unknown letters)
    # This is a bad check that needs to be replaced
    # Doesn't even check to see if composition is similar, only length...
    # Each unordered pair is compared exactly once; the previous extra outer loop
    # repeated the whole pairwise pass and only duplicated the printed output.
    for idx in range(len(name_list)):
        for idx2 in range(idx + 1, len(name_list)):
            if check_lengths(name_list, idx, idx2):
                print("Similar Names: similar size")
                print(name_list)
                similarNames = 1

    #3: Strip spaces, ".", "#", convert accented characters to unaccented
    for name in name_list:
        # Strip any accents
        utf_name = strip_accents(name)
        # Remove bad characters (raw string: the backslash is literal, "\?" is not a valid escape)
        condensed_name = utf_name.translate({ord(c): None for c in r'./\? !@#$'})
        # ^ How to check for similarity when # appears in one name but the actual letter does in the other?
        

In [14]:
#á, é, í, ó, ú, ü, ñ

# Sample names grouped by the kind of variation they exercise
name_list = [
    # partial names / subsets of a longer name
    "Kai Malcolm",
    "Kai",
    "Kai Devon Malcolm",
    "Devon Malcolm",
    # spelling variants and '#' placeholders
    "Dan Jenkins",
    "Daniel Jenkins",
    "Daniel Genkins",
    "Da##iel Genkin#",
    "D#l Ge#ins",
    # accented vs. unaccented
    "Maria Andoval",
    "María Andoval",
    "Máríá ##o#al",
    # shared given names or surnames
    "María Dolores",
    "María Dolores Sanchez",
    "Maria Sanchez",
]

In [None]:
#3: Strip spaces, ".", "#", convert accented characters to unaccented
for name in name_list:
    # Strip any accents
    utf_name = strip_accents(name)
    # Remove bad characters (raw string: the backslash is literal, "\?" is not a valid escape)
    condensed_name = utf_name.translate({ord(c): None for c in r'./\? !@#$'})
    # ^ How to check for similarity when # appears in one name but the actual letter does in the other?

In [13]:
from difflib import SequenceMatcher
# Similarity ratio of two names that differ only in their last letter
s_1, s_2 = 'Mohan Mehta', 'Mohan Mehte'
print(SequenceMatcher(None, s_1, s_2).ratio())

0.9090909090909091


In [19]:
from difflib import SequenceMatcher
# Similarity ratio of two different people who share both surnames
s_1, s_2 = 'Maria Dolores Sanchez', 'Juan Dolores Sanchez'
print(SequenceMatcher(None, s_1, s_2).ratio())

0.8292682926829268


In [18]:
# Compare every name with the one that follows it in name_list.
# Pairing with zip stops at the last name, so no exception has to be swallowed
# and genuine errors (e.g. a non-string name) are no longer hidden.
for name1, name2 in zip(name_list, name_list[1:]):
    # Strip any accents, Remove bad characters
    # (raw string: the backslash is literal, "\?" is not a valid escape)
    n1 = strip_accents(name1).translate({ord(c): None for c in r'./\? !@#$'})
    n2 = strip_accents(name2).translate({ord(c): None for c in r'./\? !@#$'})

    print(f"{name1}, {name2}, {SequenceMatcher(a=n1,b=n2).ratio():0.3f}")
    print("---------------------")

Kai Malcolm, Kai, 0.462
---------------------
Kai, Kai Devon Malcolm, 0.333
---------------------
Kai Devon Malcolm, Devon Malcolm, 0.889
---------------------
Devon Malcolm, Dan Jenkins, 0.273
---------------------
Dan Jenkins, Daniel Jenkins, 0.870
---------------------
Daniel Jenkins, Daniel Genkins, 0.923
---------------------
Daniel Genkins, Da##iel Genkin#, 0.917
---------------------
Da##iel Genkin#, D#l Ge#ins, 0.667
---------------------
D#l Ge#ins, Maria Andoval, 0.105
---------------------
Maria Andoval, María Andoval, 1.000
---------------------
María Andoval, Máríá ##o#al, 0.800
---------------------
Máríá ##o#al, María Dolores, 0.700
---------------------
María Dolores, María Dolores Sanchez, 0.774
---------------------
María Dolores Sanchez, Maria Sanchez, 0.774
---------------------


In [None]:
# Names of the people found in three sample entries
entry_ex1 = [
    'María Dolores Sanchez',
    'Don Miguel o’Reilly',
    'Juan',
    'Deliby',
    'Don Francisco Sanchez',
    'Maria Dolores',
    'José Rivas',
    'María de la Luz Blanco',
]
entry_ex2 = [
    'Antonio Guillo',
    'Don Miguel o’Reilly',
    'Isaac Guillo',
    'Sara Ca mel',
    'Juan Antonio',
    'Antonio Pellice',
    'Susana Pellicer',
]
entry_ex3 = [
    'María Juana Francisca Fish',
    'Don Miguel o’Reilly',
    'Don Jeremías Fish',
    'Eva Fish',
    'Maria Juana Francisca',
    'Lorenzo Capó',
    'Juana Joaquina Gonzalez',
]
#'Teniente de Cura Beneficiado'

## NB 71

In [None]:
#export
#this is currently configured specifically for baptisms/burials

def assign_characteristics(entry_text, entities_df, characteristics_df, unique_individuals, volume_metadata):
    '''
    matches all labeled characteristics to the correct individual(s) and builds triples
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities_df: entities of all kinds extracted from the entry by an NER model (its index is reset in place)
        characteristics_df: entities given the label "CHAR" from a single entry by an NER model
        (modified in place: index reset, "category" and "assignment" columns added, and the text of
        "origin" characteristics replaced by the place name that follows them)
        unique_individuals: as determined by id_unique_individuals and/or meta-function of disambig pipeline
        (read through its "pred_entity", "pred_start" and "unique_id" columns; its index is reset in place)
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata

        returns: structured representation (a list of dictionaries, one per row of unique_individuals)
        and the categorized characteristics dataframe
    '''
    people = []
    vocabs = retrieve_controlled_vocabularies()
    ethnicities = vocabs["ethnicities"]

    def principal_ids():
        '''unique ids of every individual whose name equals the entry principal'''
        principal = determine_principals(entry_text, unique_individuals, 1)
        if principal is not None:
            principal = principal[0]
        else:
            principal = "Unknown principal"
        princ_loc = unique_individuals.index[unique_individuals["pred_entity"] == principal].tolist()
        return [unique_individuals["unique_id"][loc] for loc in princ_loc]

    def add_assignment(position, unique_id):
        '''adds unique_id to the ';'-separated assignment stored at position'''
        if assignments[position] is None:
            assignments[position] = unique_id
        else:
            assignments[position] += ';' + unique_id

    #CATEGORIZE_CHARACTERISTICS#############################################################################
    #inlined copy of categorize_characteristics(entities_df, characteristics_df):
    #determines which category each labeled characteristic belongs to
    categories = []
    uncategorized_positions = []
    entities_df.reset_index(inplace = True, drop = True)

    for index, characteristic in entities_df.iterrows():
        if characteristic["pred_label"] != "CHAR":
            continue
        category = None
        if characteristic["pred_entity"] in ["Natural", "nral", "Nat.l", "N.l", "nat.l", "natural"]:
            #"natural" directly followed by a LOC entity is an origin, otherwise a legitimacy term
            if ((index + 1) < len(entities_df)) and (entities_df["pred_label"][index + 1] == "LOC"):
                category = "origin"
            else:
                category = "legitimacy"
        for cat in vocabs:
            if (characteristic['pred_entity'] == 'h') or (characteristic['pred_entity'] == "h."):
                category = "relationships"
            elif characteristic["pred_entity"] == "propiedad":
                category = "status"
            if category is not None:
                break
            for term in vocabs[cat]:
                if term in characteristic['pred_entity'].lower():
                    category = cat
                    break

        if category is None:
            uncategorized_positions.append(index)

        categories.append(category)

    characteristics_df["category"] = categories
    #kept for parity with categorize_characteristics (not returned); always defined, even when
    #every characteristic was categorized, and built without the removed DataFrame.append
    uncat_char = entities_df.iloc[uncategorized_positions].copy()
    uncat_char["category"] = None
    categorized_characteristics = characteristics_df
    #CATEGORIZE_CHARACTERISTICS#############################################################################
    assignments = [None] * len(characteristics_df.index)
    categorized_characteristics.reset_index(inplace=True)
    unique_individuals.reset_index(inplace=True)

    for index in range(len(categorized_characteristics)):
        category = categorized_characteristics["category"][index]
        if (category in ("age", "legitimacy")) and (volume_metadata["type"] == "baptism"):
            #in a baptism these always describe the principal
            for unique_id in principal_ids():
                add_assignment(index, unique_id)
        elif (category in ("occupation", "phenotype", "ethnicities")) or ((category == "status") and not categorized_characteristics["pred_entity"][index].lower().endswith('s')):
            #singular characteristic: closest person starting less than 50 characters before it
            char_start = categorized_characteristics["pred_start"][index]
            lowest_diff = 50
            assign = None
            for i, person in unique_individuals.iterrows():
                diff = char_start - person["pred_start"]
                if (diff > 0) and (diff < lowest_diff):
                    lowest_diff = diff
                    assign = i
            if assign is not None:
                assignments[index] = unique_individuals["unique_id"][assign]
        elif category == "status":
            #plural status: the two closest preceding people (within 30 / 50 characters)
            char_start = categorized_characteristics["pred_start"][index]
            lowest_diff = 30
            second_lowest_diff = 50
            assign = [None, None]
            for i, person in unique_individuals.iterrows():
                diff = char_start - person["pred_start"]
                if (diff > 0) and (diff < lowest_diff):
                    if assign[0] is not None:
                        #demote the previous closest person together with their own distance
                        assign[1] = assign[0]
                        second_lowest_diff = lowest_diff
                    lowest_diff = diff
                    assign[0] = i
                elif (diff > 0) and (diff < second_lowest_diff) and (assign[0] is not None):
                    second_lowest_diff = diff
                    assign[1] = i
            ids = [unique_individuals["unique_id"][a] for a in assign if a is not None]
            if ids:
                assignments[index] = ';'.join(ids)
        elif category == "origin":
            #locate the characteristic among all entities to inspect its neighbours
            signal_entity_index = None
            for i, entity in entities_df.iterrows():
                if entity["pred_start"] == categorized_characteristics["pred_start"][index]:
                    signal_entity_index = i
                    break
            if signal_entity_index is None:
                #no matching entity: skip rather than reuse a position from an earlier characteristic
                continue
            followed_by_loc = (len(entities_df["pred_label"]) > (signal_entity_index + 1)) and (entities_df["pred_label"][signal_entity_index + 1] == "LOC")
            if followed_by_loc and (signal_entity_index != 0) and (entities_df["pred_label"][signal_entity_index - 1] == "PER") and (entities_df["pred_start"][signal_entity_index + 1] - entities_df["pred_end"][signal_entity_index - 1] <= 20):
                #PER <origin term> LOC: the place belongs to the person named just before
                place = entities_df["pred_entity"][signal_entity_index + 1]
                multiple = categorized_characteristics["pred_entity"][index] == "naturales"
                categorized_characteristics.at[index, "pred_entity"] = place
                for i, person in unique_individuals.iterrows():
                    if person["pred_start"] == entities_df["pred_start"][signal_entity_index - 1]:
                        assignments[index] = person["unique_id"]
                        break
                #plural form: also assign to the person named immediately before that one
                if multiple and (signal_entity_index >= 2) and (entities_df["pred_label"][signal_entity_index - 2] == "PER") and (entities_df["pred_start"][signal_entity_index - 1] - entities_df["pred_end"][signal_entity_index - 2] <= 10):
                    for i, person in unique_individuals.iterrows():
                        if person["pred_start"] == entities_df["pred_start"][signal_entity_index - 2]:
                            add_assignment(index, person["unique_id"])
                            break
            elif followed_by_loc:
                #no person directly before the origin term: fall back to the principal
                place = entities_df["pred_entity"][signal_entity_index + 1]
                categorized_characteristics.at[index, "pred_entity"] = place
                for unique_id in principal_ids():
                    add_assignment(index, unique_id)

    categorized_characteristics["assignment"] = assignments

    for i in range(len(unique_individuals.index)):

        characteristics = {"origin": None, "ethnicities":[], "age":None, "legitimacy":None,"occupation":[], "phenotype":[], "status":None, "titles":None, "ranks":None, "relationships":None}

        #ethnicity terms that appear inside the name itself
        for eth in ethnicities:
            if eth in unique_individuals["pred_entity"][i].lower():
                characteristics["ethnicities"].append(eth[0].upper() + eth[1:])

        for j in range(len(categorized_characteristics.index)):
            assignment = categorized_characteristics["assignment"][j]
            if assignment is None:
                continue
            #compare whole ids: a substring test would match "...-P1" inside "...-P10"
            if unique_individuals["unique_id"][i] in assignment.split(';'):
                category = categorized_characteristics["category"][j]
                if category in ("origin", "age", "legitimacy", "status"):
                    characteristics[category] = categorized_characteristics["pred_entity"][j]
                else:
                    characteristics[category].append(categorized_characteristics["pred_entity"][j])

        person_record = {"id": unique_individuals["unique_id"][i], "name": unique_individuals["pred_entity"][i]}

        for key in characteristics:
            if (key in ("ethnicities", "occupation", "phenotype")) and (len(characteristics[key]) > 0):
                #multi-valued characteristics are flattened into one ';'-separated string
                person_record[key] = ';'.join(characteristics[key])
            elif (characteristics[key] is not None) and (characteristics[key] != []):
                person_record[key] = characteristics[key]
            else:
                person_record[key] = None

        people.append(person_record)

    return people, categorized_characteristics

In [None]:
#export

def id_unique_individuals(entry_text, entities, volume_metadata):
    '''
    collects the distinct person names mentioned in an entry and gives each one an id
        entry_text: the full text of a single entry (not used by this function)
        entities: entities of all kinds extracted from that entry by an NER model
        volume_metadata: metadata for the volume that the entry comes from; only "id" is read

        returns: a two-row array, row 0 holding the distinct "PER" names in order of first appearance
        and row 1 holding the matching ids "<volume id>-<entry_no>-P<n>", with n starting at 1
    '''
    first_entity = entities.iloc[0]
    event_id = volume_metadata["id"] + '-' + first_entity['entry_no']

    is_person = entities['pred_label'] == 'PER'
    names = entities.loc[is_person, 'pred_entity'].unique()

    # second row starts empty and is filled with one id per distinct name
    roster = np.vstack([names, [None] * len(names)])
    for slot in range(len(roster[0])):
        roster[1][slot] = event_id + '-P' + str(slot + 1)

    return roster

In [None]:
#export

def build_event(entry_text, entities, event_type, principals, volume_metadata, n_event_within_entry, unique_individuals):
    '''
    assembles the record describing a single baptism or birth event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: "baptism" or "birth"; any other value prints a message and yields None
        principals: the principal(s) of the event (only the first is used), or None
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        n_event_within_entry: event number within entry, used as the "-E<n>" suffix of the event id
        unique_individuals: people with "pred_entity" and "unique_id" columns, used to turn names into ids

        returns: dictionary with the keys id, type, principal, date, location and cleric;
        principal and cleric hold unique ids, or None when no matching individual exists
        (cleric is always None for a birth)
    '''
    event_id = volume_metadata["id"] + '-' + entities.iloc[0]['entry_no'] + '-E' + str(n_event_within_entry)

    if event_type not in ("baptism", "birth"):
        print("That event type can't be built yet.")
        return

    is_baptism = event_type == "baptism"

    principal = None if principals is None else principals[0]
    date = determine_event_date(entry_text, entities, event_type, volume_metadata)
    location = determine_event_location(entry_text, entities, event_type, volume_metadata)
    # only baptisms record the officiating cleric
    cleric = identify_cleric(entry_text, entities) if is_baptism else None

    principal_resolved = False
    cleric_resolved = False
    for _, individual in unique_individuals.iterrows():
        if individual['pred_entity'] == principal:
            principal = individual['unique_id']
            principal_resolved = True
            if not is_baptism:
                # a birth only needs the principal, so the search ends here
                break
        elif is_baptism and (individual['pred_entity'] == cleric):
            cleric = individual['unique_id']
            cleric_resolved = True

    # a name that could not be mapped to an id is reported as missing
    if not principal_resolved:
        principal = None
    if not cleric_resolved:
        cleric = None

    return {"id": event_id, "type": event_type, "principal": principal, "date": date, "location": location, "cleric": cleric}

In [None]:
#export

def drop_obvious_duplicates(people, principals, cleric):
    '''
    first-pass disambiguation that drops multiple mentions of cleric and principal(s)
        people: df containing all entities labeled as people in the entry (modified in place)
        principals: as indicated by determine_principals; duplicates are only dropped
        when it holds exactly one name
        cleric: name of the cleric, or None

        returns: people df with obvious duplicates dropped and its index reset
        (the previous index is kept as a column)
    '''
    duplicate_rows = []

    if (principals is not None) and (len(principals) == 1):
        tracked_names = {principals[0]}
        if cleric is not None:
            tracked_names.add(cleric)

        # the first mention of each tracked name is kept, every later one is a duplicate;
        # each row is recorded once, so a row matching both principal and cleric
        # is no longer dropped twice
        seen_names = set()
        for index, person in people.iterrows():
            name = person['pred_entity']
            if name not in tracked_names:
                continue
            if name in seen_names:
                duplicate_rows.append(index)
            else:
                seen_names.add(name)

    people.drop(duplicate_rows, inplace=True)
    people.reset_index(inplace=True)

    return people

## NB 72

In [None]:
#no_test

# Inputs that process_volume would normally receive as parameters
path_to_transcription = "transcriptions\\15834.xml"
path_to_model = "models/15834"

#def process_volume(path_to_transcription, path_to_model):
#retrieve volume metadata and controlled vocabularies
volume_metadata = retrieve_volume_metadata(path_to_transcription)
images = xml_v2_to_json(path_to_transcription)
vocabularies = retrieve_controlled_vocabularies()

# Volumes whose country is Brazil use the Portuguese settings, all others the Spanish ones
lang, language = ("pt", "portuguese") if volume_metadata["country"] == "Brazil" else ("es", "spanish")

#load and apply trained model
trained_model = load_model(path_to_model, language=lang, verbose='True')
entry_df = parse_xml_v2(path_to_transcription)
ent_preds_df, metrics_df, per_ent_metrics = test_model(trained_model, entry_df, "entry_no", "text", score_model=False)
print("Entities extracted.")

#development
#pd.set_option("display.max_rows", 101)
#display(ent_preds_df.head(100))

#iterate through each entry and build relationships
people, places, events = [], [], []

# Running accumulators that start out empty
entitiesRunning = pd.DataFrame()
noCategoryRunning = pd.DataFrame()
validation_dict_ALL = []

#file path could be passed as parameter, as could language (eventually)
with open("names.json", encoding="utf-8") as infile:
    name_file = json.load(infile)

# Collect the "name" field of every record listed under "names"
names = name_file["names"]
all_first_names = []
for name in names:
    all_first_names.append(name["name"])

for i in range(len(entry_df.index)):

    entry_no = entry_df['entry_no'][i]
    entry_text = entry_df['text'][i]    

    entities = copy.deepcopy(ent_preds_df[ent_preds_df['entry_no'] == entry_no])

    entities["assigned"] = True
    
    #BUILD ENTRY METADATA#################################################################################################################
    #entry_people, entry_places, entry_events, entities, characteristics_df, categorized_characteristics, uncategorized_characteristics = build_entry_metadata(entry_text, entities, path_to_transcription, entry_no)             
    #def build_entry_metadata(entry_text, entities, path_to_volume_xml, entry_number=None):
    '''
    applies rules-based engine for relationship linking to the transcription of a single entry
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity        
        entities: entities of all kinds extracted from that entry by an NER model
        path_to_volume_xml: path to xml file containing full volume transcription and volume-level metadata
        entry_number: entry number, also from spaCy

        returns: three lists containing structured data about the people, places, and events that appear in the entry
    '''

    people = []
    places = []
    events = []

    volume_metadata = retrieve_volume_metadata(path_to_volume_xml)
    people_df = copy.deepcopy(entities.loc[entities['pred_label'] == 'PER'])
    people_df.reset_index(inplace=True)
    people_df, next_id = assign_unique_ids(people_df, volume_metadata, entry_number)
    characteristics_df = copy.deepcopy(entities.loc[entities['pred_label'] == 'CHAR'])
    characteristics_df.reset_index(inplace=True)
    dates_df = copy.deepcopy(entities.loc[entities['pred_label'] == 'DATE'])
    dates_df.reset_index(inplace=True)

    if volume_metadata["type"] == "baptism":
        principal = determine_principals(entry_text, entities, 1)

        if principal == None:            
            people_df, next_id = build_new_person(people_df, next_id, "principal")
            principal = ["Unknown principal"]            

        cleric = identify_cleric(entry_text, entities)                 

        events.append(build_event(entry_text, entities, "baptism", principal, volume_metadata, 1, people_df))
    
        print("Kai disabled the birth build events.  ctrl+f this message to change this")
        #if (len(dates_df.index) > 1):
        #    events.append(build_event(entry_text, entities, "birth", principal, volume_metadata, 2, people_df))

        characteristics_df, uncategorized_characteristics = categorize_characteristics(entities, characteristics_df)
        people, categorized_characteristics = assign_characteristics(entry_text, entities, characteristics_df, people_df, volume_metadata)       
        
        #ALT_ASSIGN_RELATIONSHIPS#################################################################################################################
        #def alt_assign_relationships(entry_text, entities, people_df, people, volume_metadata):
        '''
        matches all labeled relationships to the correct individuals and builds triples
            entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
            entities: df containing all entities extracted from that entry by an NER model
            people_df: entities given the label "PER" from a single entry by an NER model with unique ids
            people: list of dictionaries, each of which represents one mention of a person in the entry
            (as produced by assign_characteristics)
            volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata        

            returns: updated version of people with interpersonal relationships added
        '''

        rel_types = retrieve_controlled_vocabularies()["relationships"]
        relationships = copy.deepcopy(entities.loc[entities['pred_label'] == 'REL'])
        relationships.reset_index(inplace=True)
        characteristics = copy.deepcopy(entities.loc[entities['pred_label'] == 'CHAR'])
        characteristics.reset_index(inplace=True)    
        cat_char, uncat_char = categorize_characteristics(entities, characteristics)    
        entities.reset_index(inplace=True)  

        #if determine_principals(entry_text, entities, 1) != None:
        if not (determine_principals(entry_text, entities, 1) == None):
            principal = determine_principals(entry_text, entities, 1)[0]
            for i in range(len(people)):
                if people[i]["name"] == principal:
                    principal_id = people[i]['id']                
                    break
        else:
            principal = "Unknown principal"
            for i in range(len(people)):
                if people[i]["name"] == principal:
                    principal_id = people[i]['id']                
                    break

        # Per-entry state for the relationship pass: each found_* flag records
        # whether that kind of relationship has already been linked for this entry.
        found_parents = False
        found_godparents = False
        godparents = []
        found_paternal_grandparents = False
        found_maternal_grandparents = False
        found_enslaver = False
        enslaver_id = None

        #build godparent/godchild relationships
        #future improvement: add logic to look for spousal relationship between godparents
        if (len(entities) != 0) and (len(relationships) != 0):
            for index in range(len(entities)):
                if entities['pred_label'][index] == "REL":
                    rel_text = entities['pred_entity'][index].lower()

                    # Work out which following entities (as offsets from `index`)
                    # are godparent candidates. None means this REL entity is not
                    # a godparent signal and the later branches should be tried.
                    godparent_offsets = None
                    if found_godparents == False:
                        if rel_text in ("madrina", "padrino", "padryno"):
                            # singular signal: one candidate right after it
                            godparent_offsets = (1,)
                        elif rel_text in ("padrinos", "p.p."):
                            # plural signal: up to two candidates after it
                            godparent_offsets = (1, 2)
                        elif "p." in rel_text:
                            if len(entities) <= (index + 1):
                                godparent_offsets = ()
                            elif "p." not in entities['pred_entity'][index + 1].lower():
                                godparent_offsets = (1,)
                            else:
                                # the next entity is itself a "p." token, so the
                                # candidates start one position later
                                godparent_offsets = (2, 3)

                    if godparent_offsets is not None:
                        for offset in godparent_offsets:
                            candidate = index + offset
                            if (len(entities) > candidate) and (entities['pred_label'][candidate] == "PER"):
                                # Find the people_df row for this mention. The row is
                                # remembered explicitly: reusing the loop variable after
                                # the loop always pointed at the LAST row of people_df,
                                # so the wrong name/id was recorded in `godparents`.
                                # The last matching row wins, as before.
                                godparent_row = None
                                for j in range(len(people_df)):
                                    if people_df['pred_start'][j] == entities['pred_start'][candidate]:
                                        godparent_row = j
                                if godparent_row is None:
                                    # no person record for this mention: do not link a
                                    # stale id left over from an earlier lookup
                                    continue
                                from_person = people_df['unique_id'][godparent_row]
                                people = build_reciprocal_relationship(people, from_person, principal_id, "godparent")
                                found_godparents = True
                                godparent = {"name": people_df["pred_entity"][godparent_row], "id": from_person}
                                godparents.append(godparent)
                    #build grandparents
                    # NOTE(review): in the grandparent lookups below, if no people_df row
                    # matches the mention, grandparent_id keeps whatever value it had
                    # before and j is the last row -- confirm a match is guaranteed.
                    elif ("abuelos" in rel_text):
                        if ("paternos" in rel_text) and (found_paternal_grandparents == False):
                            paternal_grandfather = ''
                            paternal_grandmother = ''
                            # bounds checks added: the signal may be the last entity
                            if (len(entities) > (index + 1)) and (entities["pred_label"][index + 1] == "PER"):
                                for j in range(len(people_df)):
                                    if people_df['pred_start'][j] == entities['pred_start'][index + 1]:
                                        grandparent_id = people_df['unique_id'][j]
                                        break
                                if determine_sex(entities["pred_entity"][index + 1].split(' ')[0], name_list="names.json") == "male":
                                    paternal_grandfather = grandparent_id
                                    paternal_grandfather_index = j
                                elif determine_sex(entities["pred_entity"][index + 1].split(' ')[0], name_list="names.json") == "female":
                                    paternal_grandmother = grandparent_id
                                    paternal_grandmother_index = j
                                else:
                                    # sex undetermined: the first mention defaults to grandmother
                                    paternal_grandmother = grandparent_id
                                    paternal_grandmother_index = j
                                if (len(entities) > (index + 2)) and (entities["pred_label"][index + 2] == "PER"):
                                    for j in range(len(people_df)):
                                        if people_df['pred_start'][j] == entities['pred_start'][index + 2]:
                                            grandparent_id = people_df['unique_id'][j]
                                            break
                                    # the second mention fills whichever slot is still empty
                                    if (determine_sex(entities["pred_entity"][index + 2].split(' ')[0], name_list="names.json") == "male") and (paternal_grandfather == ''):
                                        paternal_grandfather = grandparent_id
                                        paternal_grandfather_index = j
                                    elif (determine_sex(entities["pred_entity"][index + 2].split(' ')[0], name_list="names.json") == "female") and (paternal_grandmother == ''):
                                        paternal_grandmother = grandparent_id
                                        paternal_grandmother_index = j
                                    elif paternal_grandmother == '':
                                        paternal_grandmother = grandparent_id
                                        paternal_grandmother_index = j
                                    else:
                                        paternal_grandfather = grandparent_id
                                        paternal_grandfather_index = j
                            if paternal_grandfather != '':
                                found_paternal_grandparents = True
                                people = build_reciprocal_relationship(people, paternal_grandfather, principal_id, "grandparent")
                            if paternal_grandmother != '':
                                found_paternal_grandparents = True
                                people = build_reciprocal_relationship(people, paternal_grandmother, principal_id, "grandparent")
                            if (paternal_grandfather != '') and (paternal_grandmother != ''):
                                people = build_reciprocal_relationship(people, paternal_grandfather, paternal_grandmother, "spouse")
                        elif ("matern" in rel_text) and (found_maternal_grandparents == False):
                            maternal_grandfather = ''
                            maternal_grandmother = ''
                            if (len(entities) > (index + 1)) and (entities["pred_label"][index + 1] == "PER"):
                                for j in range(len(people_df)):
                                    if people_df['pred_start'][j] == entities['pred_start'][index + 1]:
                                        grandparent_id = people_df['unique_id'][j]
                                        break
                                if determine_sex(entities["pred_entity"][index + 1].split(' ')[0], name_list="names.json") == "male":
                                    maternal_grandfather = grandparent_id
                                    maternal_grandfather_index = j
                                elif determine_sex(entities["pred_entity"][index + 1].split(' ')[0], name_list="names.json") == "female":
                                    maternal_grandmother = grandparent_id
                                    maternal_grandmother_index = j
                                else:
                                    maternal_grandmother = grandparent_id
                                    maternal_grandmother_index = j
                                if (len(entities) > (index + 2)) and (entities["pred_label"][index + 2] == "PER"):
                                    for j in range(len(people_df)):
                                        if people_df['pred_start'][j] == entities['pred_start'][index + 2]:
                                            grandparent_id = people_df['unique_id'][j]
                                            break
                                    if (determine_sex(entities["pred_entity"][index + 2].split(' ')[0], name_list="names.json") == "male") and (maternal_grandfather == ''):
                                        maternal_grandfather = grandparent_id
                                        maternal_grandfather_index = j
                                    elif (determine_sex(entities["pred_entity"][index + 2].split(' ')[0], name_list="names.json") == "female") and (maternal_grandmother == ''):
                                        maternal_grandmother = grandparent_id
                                        maternal_grandmother_index = j
                                    elif maternal_grandmother == '':
                                        maternal_grandmother = grandparent_id
                                        maternal_grandmother_index = j
                                    else:
                                        maternal_grandfather = grandparent_id
                                        maternal_grandfather_index = j
                            if maternal_grandfather != '':
                                found_maternal_grandparents = True
                                people = build_reciprocal_relationship(people, maternal_grandfather, principal_id, "grandparent")
                            if maternal_grandmother != '':
                                found_maternal_grandparents = True
                                people = build_reciprocal_relationship(people, maternal_grandmother, principal_id, "grandparent")
                            if (maternal_grandfather != '') and (maternal_grandmother != ''):
                                people = build_reciprocal_relationship(people, maternal_grandfather, maternal_grandmother, "spouse")
                    # A "matern..." signal without "abuelos" is only accepted once the
                    # paternal grandparents have already been found.
                    elif ("matern" in rel_text) and found_paternal_grandparents and (found_maternal_grandparents == False):
                        maternal_grandfather = ''
                        maternal_grandmother = ''
                        if (len(entities) > (index + 1)) and (entities["pred_label"][index + 1] == "PER"):
                            for j in range(len(people_df)):
                                if people_df['pred_start'][j] == entities['pred_start'][index + 1]:
                                    grandparent_id = people_df['unique_id'][j]
                                    break
                            if determine_sex(entities["pred_entity"][index + 1].split(' ')[0], name_list="names.json") == "male":
                                maternal_grandfather = grandparent_id
                                maternal_grandfather_index = j
                            elif determine_sex(entities["pred_entity"][index + 1].split(' ')[0], name_list="names.json") == "female":
                                maternal_grandmother = grandparent_id
                                maternal_grandmother_index = j
                            else:
                                maternal_grandmother = grandparent_id
                                maternal_grandmother_index = j
                            if (len(entities) > (index + 2)) and (entities["pred_label"][index + 2] == "PER"):
                                for j in range(len(people_df)):
                                    if people_df['pred_start'][j] == entities['pred_start'][index + 2]:
                                        grandparent_id = people_df['unique_id'][j]
                                        break
                                if (determine_sex(entities["pred_entity"][index + 2].split(' ')[0], name_list="names.json") == "male") and (maternal_grandfather == ''):
                                    maternal_grandfather = grandparent_id
                                    maternal_grandfather_index = j
                                elif (determine_sex(entities["pred_entity"][index + 2].split(' ')[0], name_list="names.json") == "female") and (maternal_grandmother == ''):
                                    maternal_grandmother = grandparent_id
                                    maternal_grandmother_index = j
                                elif maternal_grandmother == '':
                                    maternal_grandmother = grandparent_id
                                    maternal_grandmother_index = j
                                else:
                                    maternal_grandfather = grandparent_id
                                    maternal_grandfather_index = j
                        if maternal_grandfather != '':
                            found_maternal_grandparents = True
                            people = build_reciprocal_relationship(people, maternal_grandfather, principal_id, "grandparent")
                        if maternal_grandmother != '':
                            found_maternal_grandparents = True
                            people = build_reciprocal_relationship(people, maternal_grandmother, principal_id, "grandparent")
                        if (maternal_grandfather != '') and (maternal_grandmother != ''):
                            people = build_reciprocal_relationship(people, maternal_grandfather, maternal_grandmother, "spouse")

                    elif ((found_parents == False) and (found_godparents == False) and (found_paternal_grandparents == False) and (found_maternal_grandparents == False) and (found_enslaver == False)):
                        #ie if after all these checks, there are still no relationships found, then we have a case where we have a relationship but no assignment
                        #Note that this relies on setting ALL to FOUND by default, so I don't have to add to the code above each time
                        #Thus, we only flip it in the case that no relationships are found

                        entities.loc[index, "assigned"] = False
                        #print("Failed to find a category for relationship: " + entities["pred_entity"][index])
                        #print("Failed to find a category for relationship: " + entities["pred_entity"][index])



        # When exactly two godparents were recorded, look up the sex associated
        # with each one's first name (first whitespace-separated token).
        if len(godparents) == 2:
            first_godparent_sex, second_godparent_sex = (
                determine_sex(recorded["name"].split(' ')[0], name_list="names.json")
                for recorded in godparents
            )
            # The two values are not used yet. The disabled follow-up reported a
            # "possible godparent couple" (printing both names) when the sexes
            # differ or are both "unknown".

        # Walk every categorized characteristic and link people to each other based on
        # how close their mentions are (character offsets, pred_start/pred_end) to the
        # characteristic. Two kinds of links are built here: enslaver links (from
        # "status" characteristics) and parent links (from child/legitimacy markers).
        for i in range(len(cat_char)):
            #build enslaver/enslaved person relationships
            if cat_char["category"][i] == "status":            
                #skip if associated with first mention of principal
                char_start = cat_char['pred_start'][i]
                if char_start <= 25:
                    continue            

                #match enslaved couple to owner
                # Plural status (text ends in 's'): look for up to two people mentioned
                # before the status and one person mentioned after it.
                # NOTE(review): an empty pred_entity would raise IndexError here.
                if (cat_char["pred_entity"][i].lower()[len(cat_char["pred_entity"][i]) - 1] == 's'):
                    close_ep = -1
                    far_ep = -1
                    ens = -1
                    for j in range(len(people_df)):
                        pers_start = people_df["pred_start"][j]
                        # poss_diff > 0: person starts before the status text;
                        # poss_diff < 0: person starts after it.
                        poss_diff = char_start - pers_start
                        # ens: nearest person starting after the status, within 25 characters
                        if (ens == -1) and (poss_diff < 0) and (abs(poss_diff) < 25):
                            ens = j
                        elif (ens != -1) and (poss_diff < 0) and (abs(poss_diff) < abs(char_start - people_df["pred_start"][ens])):
                            ens = j
                        # close_ep / far_ep: the two nearest people starting before the
                        # status (initially within 50 characters), close_ep being nearer
                        elif (close_ep == -1) and (poss_diff > 0) and (poss_diff < 50):
                            close_ep = j
                        elif (close_ep != -1) and (far_ep == -1) and (poss_diff > 0) and (poss_diff < char_start - people_df["pred_start"][close_ep]):
                            far_ep = close_ep
                            close_ep = j
                        elif (close_ep != -1) and (far_ep == -1) and (poss_diff > 0) and (poss_diff < 50):
                            far_ep = j
                        elif (close_ep != -1) and (far_ep != -1) and (poss_diff > 0) and (poss_diff < char_start - people_df["pred_start"][close_ep]):
                            far_ep = close_ep
                            close_ep = j
                        elif (close_ep != -1) and (far_ep != -1) and (poss_diff > 0) and (poss_diff < char_start - people_df["pred_start"][far_ep]):
                            far_ep = j
                    # Link the person after the status as "enslaver" of the one or two
                    # people found before it; nothing is linked unless both sides exist.
                    if (ens != -1) and (close_ep != -1) and (far_ep != -1):
                        people = build_reciprocal_relationship(people, people_df["unique_id"][ens], people_df["unique_id"][close_ep], "enslaver")
                        people = build_reciprocal_relationship(people, people_df["unique_id"][ens], people_df["unique_id"][far_ep], "enslaver")
                    elif (ens != -1) and (close_ep != -1):
                        people = build_reciprocal_relationship(people, people_df["unique_id"][ens], people_df["unique_id"][close_ep], "enslaver")
                #match enslaved person to owner        
                elif "propiedad" in cat_char["pred_entity"][i].lower():
                    # Locate this characteristic in `entities` by its start offset.
                    # NOTE(review): if no entity matches, signal_entity_index is left
                    # unset (or stale from an earlier iteration) -- confirm a match is
                    # guaranteed. The +/-1 and -2 lookups below are also unguarded at
                    # the edges of `entities`.
                    for j in range(len(entities)):
                        if entities["pred_start"][j] == cat_char["pred_start"][i]:
                            signal_entity_index = j
                            break                
                    # "misma" within the 25 characters before the signal, with an enslaver
                    # already known: link the PER entity just before the signal (and the
                    # PER before that, if adjacent) to that same enslaver.
                    if found_enslaver and (entry_text.rfind("misma", cat_char["pred_start"][i] - 25, cat_char["pred_start"][i]) != -1):
                        if (entities["pred_label"][signal_entity_index - 1] == "PER") and (cat_char["pred_start"][i] - entities["pred_end"][signal_entity_index - 1] <= 20):
                            people = build_reciprocal_relationship(people, enslaver_id, entities["unique_id"][signal_entity_index - 1], "enslaver")
                            if (entities["pred_label"][signal_entity_index - 2] == "PER") and (entities["pred_end"][signal_entity_index - 2] - entities["pred_start"][signal_entity_index - 1] <= 5):
                                people = build_reciprocal_relationship(people, enslaver_id, entities["unique_id"][signal_entity_index - 2], "enslaver")
                    # Otherwise, a PER entity starting within 25 characters after the signal
                    # is recorded as the enslaver and linked to the principal.
                    elif (entities["pred_label"][signal_entity_index + 1] == "PER") and ((entities["pred_start"][signal_entity_index + 1] - cat_char["pred_start"][i]) <= 25):
                        for j in range(len(people_df)):
                            if people_df['pred_start'][j] == entities['pred_start'][signal_entity_index + 1]:
                                found_enslaver = True
                                enslaver_id = people_df["unique_id"][j]
                                people = build_reciprocal_relationship(people, enslaver_id, principal_id, "enslaver")
                                break               
                else:
                    # Any other status: pair the nearest person before the status (ep,
                    # initially within 50 characters) with the nearest person after it
                    # (ens, initially within 25 characters).
                    ep = -1
                    ens = -1
                    for k in range(len(people_df)):
                        pers_start = people_df["pred_start"][k]
                        poss_diff = char_start - pers_start
                        if (ep == -1) and (poss_diff > 0) and (poss_diff < 50):
                            ep = k                        
                        elif (ens == -1) and (poss_diff < 0) and (abs(poss_diff) < 25):
                            ens = k                        
                        elif (ep != -1) and (poss_diff > 0) and (poss_diff < char_start - people_df["pred_start"][ep]):
                            ep = k
                        elif (ens != -1) and (poss_diff < 0) and (abs(poss_diff) < abs(char_start - people_df["pred_start"][ens])):
                            ens = k
                    if (ep != -1) and (ens != -1):
                        people = build_reciprocal_relationship(people, people_df["unique_id"][ens], people_df["unique_id"][ep], "enslaver")
            #build parent/child relationships
            # Triggered by a child marker ("hijo", "hija", "h", "h.") or any legitimacy
            # characteristic, and only until parents have been found once for this entry.
            elif (((cat_char["category"][i] == "relationships") and ((cat_char["pred_entity"][i] == "hijo") or (cat_char["pred_entity"][i] == "hija") or (cat_char["pred_entity"][i] == "h") or (cat_char["pred_entity"][i] == "h."))) or (cat_char["category"][i] == "legitimacy")) and (found_parents == False):
                rel_start = cat_char["pred_start"][i]
                close_parent = -1
                far_parent = -1
                for l in range(len(people_df)):
                    pers_start = people_df["pred_start"][l]
                    poss_diff = rel_start - pers_start
                    # first candidate: a person starting after the marker, within 25 characters
                    if (close_parent == -1) and (poss_diff < 0) and (abs(poss_diff) < 25):
                        close_parent = l
                    elif (close_parent != -1) and (far_parent == -1) and (poss_diff < 0) and (abs(poss_diff) < abs(rel_start - people_df["pred_start"][close_parent])):
                        far_parent = close_parent
                        close_parent = l
                    # second candidate: a person starting fewer than 10 characters after
                    # the end of the first candidate's mention
                    elif (close_parent != -1) and (far_parent == -1) and (poss_diff < 0) and ((pers_start - people_df["pred_end"][close_parent]) < 10):
                        far_parent = l
                    # NOTE(review): the two branches below test poss_diff > 0 (person
                    # before the marker) while every branch above tests poss_diff < 0 --
                    # confirm this is intended.
                    elif (close_parent != -1) and (far_parent != -1) and (poss_diff > 0) and (abs(poss_diff) < abs(rel_start - people_df["pred_start"][close_parent])):
                        far_parent = close_parent
                        close_parent = l
                    elif (close_parent != -1) and (far_parent != -1) and (poss_diff > 0) and (abs(poss_diff) < abs(rel_start - people_df["pred_start"][far_parent])):
                        far_parent = l
                if (close_parent != -1) and (far_parent != -1):
                    people = build_reciprocal_relationship(people, people_df["unique_id"][close_parent], principal_id, "parent")
                    people = build_reciprocal_relationship(people, people_df["unique_id"][far_parent], principal_id, "parent")
                    # The parents are also linked as spouses when the legitimacy text
                    # contains no 'r', or when an "h"/"h." marker is followed by 'l' at
                    # pred_end or pred_end + 1 (presumably "h. l.", a legitimate child --
                    # TODO confirm).
                    # NOTE(review): entry_text[pred_end + 1] is not bounds-checked.
                    if ((cat_char["category"][i] == "legitimacy") and ('r' not in cat_char["pred_entity"][i])) or ((cat_char["category"][i] == "relationships") and ((cat_char['pred_entity'][i] == 'h') or (cat_char['pred_entity'][i] == 'h.')) and ((entry_text[cat_char["pred_end"][i]] == 'l') or (entry_text[cat_char["pred_end"][i] + 1] == 'l'))):
                        people = build_reciprocal_relationship(people, people_df["unique_id"][close_parent], people_df["unique_id"][far_parent], "spouse")
                    #future improvement (after normalization) if both parents enslaved and child not free, make sure child's status is enslaved
                    #future improvement (after normalization) if child is enslaved, make sure reciprocal enslaver/enslaved person relationship exists with mother's enslaver                
                    found_parents = True
                elif (close_parent != -1):
                    # only one parent candidate was found
                    people = build_reciprocal_relationship(people, people_df["unique_id"][close_parent], principal_id, "parent")
                    #future improvement (after normalization) if single parent is mother and she is enslaved and child not free, make sure child's status is enslaved
                    #future improvement (after normalization) if child is enslaved, make sure reciprocal enslaver/enslaved person relationship exists with mother's enslaver
                    found_parents = True

        #build parent-child relationships between parents and grandparents

        # Paternal side: the child of the paternal grandparents is the parent whose
        # first name reads as male, trying the far parent before the close one.
        if found_parents and found_paternal_grandparents:
            father_row = -1
            if (far_parent != -1) and (determine_sex(people_df["pred_entity"][far_parent].split(' ')[0]) == "male"):
                father_row = far_parent
            elif (close_parent != -1) and (determine_sex(people_df["pred_entity"][close_parent].split(' ')[0]) == "male"):
                father_row = close_parent

            if father_row != -1:
                if paternal_grandmother != '':
                    people = build_reciprocal_relationship(people, people_df["unique_id"][paternal_grandmother_index], people_df["unique_id"][father_row], "parent")
                # The grandfather is linked when he was identified, and is also the
                # fallback when no grandmother was.
                if (paternal_grandfather != '') or (paternal_grandmother == ''):
                    people = build_reciprocal_relationship(people, people_df["unique_id"][paternal_grandfather_index], people_df["unique_id"][father_row], "parent")

        # Maternal side: prefer the close parent if her first name reads as female,
        # then the far parent; with a single parent on record, use that parent
        # regardless of the name.
        if found_parents and found_maternal_grandparents:
            mother_row = -1
            if (close_parent != -1) and (determine_sex(people_df["pred_entity"][close_parent].split(' ')[0]) == "female"):
                mother_row = close_parent
            elif (far_parent != -1) and (determine_sex(people_df["pred_entity"][far_parent].split(' ')[0]) == "female"):
                mother_row = far_parent
            elif (close_parent != -1) and (far_parent == -1):
                mother_row = close_parent

            if mother_row != -1:
                if maternal_grandmother != '':
                    people = build_reciprocal_relationship(people, people_df["unique_id"][maternal_grandmother_index], people_df["unique_id"][mother_row], "parent")
                if (maternal_grandfather != '') or (maternal_grandmother == ''):
                    people = build_reciprocal_relationship(people, people_df["unique_id"][maternal_grandfather_index], people_df["unique_id"][mother_row], "parent")

        #return people, entities
        #ALT_ASSIGN_RELATIONSHIPS#################################################################################################################
        
        ############## THIS IS WHERE THE OBVIOUS DUPLICATES MERGE HAPPENS
        # Identify duplicate people for this entry, then fold them together.
        obvious_duplicates = id_obvious_duplicates(people_df, principal, cleric)
        people = merge_duplicates(people, obvious_duplicates)
        #############

        #perform more sophisticated disambiguation

        # Record every event location that has not been seen yet.
        for event in events:
            event_location = event["location"]
            if (event_location is None) or (event_location in places):
                continue
            places.append(event_location)

    elif volume_metadata["type"] == "marriage":        
        #process marriage record
        print("That record type is not supported yet.")
        return None
    elif volume_metadata["type"] == "burial":
        #process burial record
        print("That record type is not supported yet.")
        return None
    else:
        print("That record type is not supported yet.")
        return None    
    #return people, places, events, entities, characteristics_df, categorized_characteristics, uncategorized_characteristics
    #BUILD ENTRY METADATA#################################################################################################################
    
    # Add this entry's uncategorized characteristics to the running frame.
    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0; it keeps the original row labels, as append did.
    if uncategorized_characteristics.shape[0] > 0:
        noCategoryRunning = pd.concat([noCategoryRunning, uncategorized_characteristics])

    #FIND ENTITIES THAT ARE UNASSIGNED OR UNCATEGORIZED
    # NOTE(review): entity_index is the row's position and is used as a label in
    # entities.at[...], which assumes `entities` has a default 0..n-1 index.
    for entity_index, ent_data in enumerate(entities.itertuples()):
        for char_data in characteristics_df.itertuples():
            same_mention = (ent_data.pred_start == char_data.pred_start) and (ent_data.pred_entity == char_data.pred_entity)
            if not same_mention:
                continue
            #characteristic is not categorized:
            if char_data.category is None:
                continue #Already dealt with
            #characteristic is categorized but not assigned
            if (ent_data.pred_label == char_data.pred_label) and (char_data.assignment is None):
                entities.at[entity_index, "assigned"] = False

    entitiesRunning = pd.concat([entitiesRunning, entities])

    # Validate the finished entry and keep its report alongside the others.
    verbosity = 0
    entry_validation_dict = validate_entry(
        entities,
        entry_people,
        entry_places,
        entry_events,
        uncategorized_characteristics,
        all_first_names,
        isVerbose=verbosity,
    )
    validation_dict_ALL.append(entry_validation_dict)

    # Fold this entry's records into the volume-wide lists (extended in place).
    people.extend(entry_people)
    places.extend(entry_places)
    events.extend(entry_events)

# Renumber the accumulated uncategorized characteristics in place (drop=True
# discards the old index instead of keeping it as a column), then flag every
# row in that frame as not assigned.
noCategoryRunning.reset_index(drop = True, inplace = True)
noCategoryRunning["assigned"] = False
print("Relationships linked.")

#disambiguate locations and assign unique ids

#collect distinct locations in first-seen order: the places gathered from the
#entries first, then every person's origin; None entries are ignored
unique_places = []
for candidate in list(places) + [person["origin"] for person in people]:
    if candidate is None or candidate in unique_places:
        continue
    unique_places.append(candidate)

#rebuild `places` as id/location records, numbered from 1 as "<volume id>-L<n>"
places = [
    {"id": volume_metadata["id"] + '-L' + str(place_number), "location": location}
    for place_number, location in enumerate(unique_places, start=1)
]

#incorporate location ids into event metadata and person records

#events: swap the raw location for the id of the matching place record; an
#event with no location, or with a location that matches no record, ends up
#with None (the latter case is also reported)
for event in events:
    location = event["location"]
    matched_id = None
    if location is not None:
        for place in places:
            if place["location"] == location:
                matched_id = place["id"]
        if matched_id is None:
            print("Failed to find location ID for " + location)
    event["location"] = matched_id

#people: swap a known origin for the id of the first matching place record;
#an origin that matches nothing is left as it is
for person in people:
    origin = person["origin"]
    if origin is None:
        continue
    for place in places:
        if place["location"] != origin:
            continue
        person["origin"] = place["id"]
        break

#bracket missing or incomplete event dates
#
#Walks `events` in list order. Each event's "date" is sliced as [:4] / [5:7] /
#[8:], i.e. a "YYYY-MM-DD" layout in which unknown parts are written with '?'
#("????" / "??"). last_year/last_month/last_day hold the most recent complete
#date accepted by the elif branches below. Indices of events whose date is
#still incomplete are queued in incomplete_dates; when the next complete date
#is accepted they are rewritten through complete_date (defined elsewhere),
#which receives the previous accepted date (or None) and the newly accepted
#date (or None).

incomplete_dates = []
last_year = None
last_month = None
last_day = None

for e in range(len(events)):
    curr_year = events[e]["date"][:4]
    curr_month = events[e]["date"][5:7]
    curr_day = events[e]["date"][8:]

    #fix incompletely extracted years
    #a year more than one away from the last accepted year is treated as an
    #extraction error and replaced: last_year + 1 for a December -> January
    #rollover, otherwise last_year (the first and last branches assign the same
    #value)
    #NOTE(review): int(curr_year) would raise on a partially unknown year such
    #as "17??" - confirm upstream only ever emits full years or "????"
    if (curr_year != "????") and (last_year != None) and (abs(int(curr_year) - int(last_year)) > 1):
        if (curr_year[3] == last_year[3]):
            curr_year = last_year                
        elif (curr_month == "01") and (last_month == "12"):
            curr_year = str(int(last_year) + 1)                
        else:
            curr_year = last_year
        events[e]["date"] = curr_year + '-' + curr_month + '-' + curr_day

    if (curr_year == "????") or (curr_month == "??") or (curr_day == "??"):
        #logic to assign dates for birth events based on associated baptism
        if events[e]["type"] == "birth":
            #only applies when the previous list element shares the id prefix
            #before 'E', is a baptism, and has a date without any '?'
            #NOTE(review): for e == 0, events[e - 1] is the LAST event in the list
            #NOTE(review): the last_* values are presumably that baptism's date,
            #but they are only updated when a date is accepted further down -
            #confirm they cannot be stale or None here
            if (events[e]["id"][:events[e]["id"].find('E')] == events[e - 1]["id"][:events[e - 1]["id"].find('E')]) and (events[e - 1]["type"] == "baptism") and ('?' not in events[e - 1]["date"]):
                    if (curr_month != "??") and (curr_day != "??"):
                        #month and day known, year missing: previous year for a
                        #December birth against a January reference, else the
                        #reference year when the gap (30-day months) is under 21
                        #days; otherwise the year stays unknown
                        if (curr_month == "12") and (last_month == "01"):
                            curr_year = str(int(last_year) - 1)                                
                        elif (30 * int(last_month) + int(last_day) - 30 * int(curr_month) - int(curr_day)) < 21:
                            curr_year = last_year
                        events[e]["date"] = curr_year + '-' + events[e]["date"][5:7] + '-' + events[e]["date"][8:]
                    elif curr_month != "??":
                        #only the month known: write a "start/end" bracket that
                        #begins on the first of that month; months other than
                        #December, the reference month or the month before it
                        #are left untouched
                        if (curr_month == "12"):
                            curr_day = "01"
                            curr_year = str(int(last_year) - 1)
                            events[e]["date"] = curr_year + '-' + curr_month + '-' + curr_day + '/' + last_year + '-01-01'
                        elif (curr_month == last_month):
                            curr_day = "01"
                            curr_year = last_year
                            events[e]["date"] = curr_year + '-' + curr_month + '-' + curr_day + '/' + last_year + '-' + last_month + '-' + last_day
                        elif int(curr_month) == (int(last_month) - 1):
                            curr_day = "01"
                            curr_year = last_year
                            events[e]["date"] = curr_year + '-' + curr_month + '-' + curr_day + '/' + last_year + '-' + last_month + '-01'                            
                    elif curr_day != "??":
                        #only the day known: same month as the reference when the
                        #day is not later than the reference day, otherwise the
                        #month before it
                        #NOTE: the days are compared as strings, which matches
                        #numeric order only for zero-padded two-digit values
                        if curr_day <= last_day:
                            curr_year = last_year
                            curr_month = last_month                                
                        else:
                            if last_month == "01":
                                curr_month = "12"
                                curr_year = str(int(last_year) - 1)
                            else:
                                curr_month = str(int(last_month) - 1)                                    
                                if len(curr_month) < 2:
                                    curr_month = '0' + curr_month
                                curr_year = last_year
                        events[e]["date"] = curr_year + '-' + curr_month + '-' + curr_day
                    else:
                        #nothing known: bracket from a computed start date up to
                        #the reference date; the start is reference day - 20 in
                        #the same month, or reference day + 9 in the previous
                        #month when the reference day is below 21
                        if (last_month == '01') and (int(last_day) < 21):
                            curr_year = str(int(last_year) - 1)
                            curr_month = "12"
                            curr_day = str(int(last_day) + 9)                               
                        elif int(last_day) < 21:
                            curr_year = last_year
                            curr_month = str(int(last_month) - 1)
                            if len(curr_month) < 2:
                                curr_month = '0' + curr_month
                            curr_day = str(int(last_day) + 9)
                        else:
                            curr_year = last_year
                            curr_month = last_month
                            curr_day = str(int(last_day) - 20)
                            if len(curr_day) < 2:
                                curr_day = '0' + curr_day
                        events[e]["date"] = curr_year + '-' + curr_month + '-' + curr_day + '/' + last_year + '-' + last_month + '-' + last_day

        #still incomplete after the birth logic: queue for bracketing
        if (curr_year == "????") or (curr_month == "??") or (curr_day == "??"):
            incomplete_dates.append(e)
    elif last_year == None:
        #first complete date seen: queued events only get an upper bound
        for date in incomplete_dates:
            events[date]["date"] = complete_date(events[date]["date"], None, curr_year + '-' + curr_month + '-' + curr_day)

        incomplete_dates = []
        last_year = curr_year
        last_month = curr_month
        last_day = curr_day
    #a complete date is accepted only when compare_dates (defined elsewhere)
    #returns '>' or '=' against the last accepted date; any other result leaves
    #both last_* and the queue untouched
    elif (compare_dates(int(curr_year), int(curr_month), int(curr_day), int(last_year), int(last_month), int(last_day)) == '>') or (compare_dates(int(curr_year), int(curr_month), int(curr_day), int(last_year), int(last_month), int(last_day)) == '='):
        for date in incomplete_dates:
            events[date]["date"] = complete_date(events[date]["date"], last_year + '-' + last_month + '-' + last_day, curr_year + '-' + curr_month + '-' + curr_day)

        incomplete_dates = []
        last_year = curr_year
        last_month = curr_month
        last_day = curr_day                    

#events still queued after the loop only get a lower bound (the last accepted
#date); when no complete date was ever accepted they are left unchanged
if last_year != None:
    for date in incomplete_dates:
        events[date]["date"] = complete_date(events[date]["date"], last_year + '-' + last_month + '-' + last_day, None)

#merging any date brackets with equal endpoints
#a bracket is written as "start/end"; when both endpoints are the same date
#the bracket is collapsed to that single date
for event in events:
    interval = event["date"].split('/')
    if (len(interval) == 2) and (interval[0] == interval[1]):
        #assignment, not comparison: `==` here evaluated to a discarded boolean
        #and left the redundant bracket in place
        event["date"] = interval[0]

print("Events configured.")

#strip titles and/or ranks from names
#Leading words of a name that appear (lower-cased) in vocabularies["titles"] or
#vocabularies["ranks"] are moved, in their original casing, onto the person's
#';'-separated "titles" / "ranks" field. Two-word prefixes are peeled first,
#then single words. A name made up only of titles/ranks becomes None.
for person in people:
    if person["name"] is None:
        continue
    name_parts = person["name"].split(' ')

    #pass 1: two-word titles/ranks
    while len(name_parts) >= 2:
        prefix = name_parts[0] + ' ' + name_parts[1]
        prefix_key = name_parts[0].lower() + ' ' + name_parts[1].lower()
        is_title = prefix_key in vocabularies["titles"]
        if not is_title and prefix_key not in vocabularies["ranks"]:
            break

        remainder = name_parts[2:]
        person["name"] = ' '.join(remainder) if remainder else None

        #titles take precedence when the prefix is in both vocabularies
        target = "titles" if is_title else "ranks"
        if person[target] is None:
            person[target] = prefix
        else:
            person[target] += ';' + prefix

        if person["name"] is None:
            break
        name_parts = person["name"].split(' ')

    #pass 2: one-word titles/ranks (skipped when pass 1 consumed the whole name)
    if person["name"] is not None:
        while True:
            head = name_parts[0]
            is_title = head.lower() in vocabularies["titles"]
            if not is_title and head.lower() not in vocabularies["ranks"]:
                break

            remainder = name_parts[1:]
            person["name"] = ' '.join(remainder) if remainder else None

            target = "titles" if is_title else "ranks"
            if person[target] is None:
                person[target] = head
            else:
                person[target] += ';' + head

            if person["name"] is None:
                break
            name_parts = person["name"].split(' ')

#normalize names and all characteristics
#
#For every person record this pass (1) normalizes the name and pulls ethnonyms
#and phenotypes found inside it into the "ethnicities" / "phenotype" fields,
#(2) normalizes and de-duplicates "ethnicities", (3) normalizes and translates
#every other field except "id" and "relationships", and (4) tallies how often
#each name occurs in the parallel lists `names` / `name_counts`.
#normalize_text, translate_characteristic and retrieve_json_vocab are defined
#elsewhere, as is `language`.
names = []
name_counts = []
ethnonym_vocab = retrieve_json_vocab("synonyms.json", "ethnonyms")
phenotype_vocab = retrieve_json_vocab("synonyms.json", "phenotypes", language="spanish")

for person in people:
    #normalize characteristics and translate to English
    #only values are rewritten while iterating the keys; the fields are visited
    #in the dict's own key order, so values added to "ethnicities"/"phenotype"
    #by the "name" branch are re-normalized only if those keys come after "name"
    for key in person:
        if person[key] == None:
            continue
        if key == "name":
            person[key] = normalize_text(person[key], "synonyms.json", context="name")
            #check extracted name for ethnonyms and/or attributed phenotypes        
            #case 1: ethnonym normalization changes the name -> record each
            #changed token's normalized form (unless it is already a substring
            #of "ethnicities") and keep the ethnonym-normalized name
            #NOTE(review): if normalize_text can return None, the else branch
            #and the phenotype loop below would fail on it - confirm it cannot
            if (person["name"] != None) and (person["name"] != normalize_text(person["name"], "synonyms.json", context="ethnonym")):
                for token in person["name"].split(' '):
                    eth_norm = normalize_text(token, "synonyms.json", context="ethnonym")
                    if token != eth_norm:
                        if (person["ethnicities"] == None) or (not (eth_norm in person["ethnicities"])):
                            if person["ethnicities"] == None:
                                person["ethnicities"] = eth_norm
                            else:
                                person["ethnicities"] = person["ethnicities"] + ';' + eth_norm
                person["name"] = normalize_text(person["name"], "synonyms.json", context="ethnonym")
            else:
                #case 2: name unchanged -> record every vocabulary ethnonym that
                #occurs as a substring of the name (no duplicate check here)
                for ethnonym in ethnonym_vocab:
                    if ethnonym in person["name"]:
                        if person["ethnicities"] == None:
                            person["ethnicities"] = ethnonym
                        else:
                            person["ethnicities"] = person["ethnicities"] + ';' + ethnonym
            #record every vocabulary phenotype that occurs as a substring of the
            #characteristic-normalized name
            for phenotype in phenotype_vocab:
                if phenotype in normalize_text(person[key], "synonyms.json", context="characteristic"):                    
                    if person["phenotype"] == None:
                        person["phenotype"] = phenotype
                    else:
                        person["phenotype"] = person["phenotype"] + ';' + phenotype
                    #for phenotypes ending in 's', also remove the matching
                    #token from the name; only occurrences preceded by a space
                    #are removed, so a leading token stays
                    if phenotype[-1] == 's':
                        for token in person["name"].split(' '):
                            if normalize_text(token, "synonyms.json", context="characteristic") == phenotype:
                                person["name"] = person["name"].replace(' ' + token, '')
        elif key == "ethnicities":                
            if person[key].find(';') == -1:
                person[key] = normalize_text(person[key], "synonyms.json", context="ethnonym")                    
            else:
                char_comp = person[key].split(';')
                person[key] = ""
                #strip out duplicate characteristics
                #the duplicate test is a substring check against the rebuilt
                #string, so a value contained in an earlier one is dropped too
                for char in char_comp:
                    char = normalize_text(char, "synonyms.json", context="ethnonym")                       

                    if not (char in person[key]):
                        if person[key] == "":
                            person[key] = char
                        else:
                            person[key] = person[key] + ';' + char
        #every remaining field except "id" and "relationships" is handled as a
        #';'-separated string of characteristics
        elif (key != "id") and (key != "relationships"):
            if person[key].find(';') == -1:
                person[key] = normalize_text(person[key], "synonyms.json", context="characteristic")
                person[key] = translate_characteristic(person[key], "synonyms.json", language)
            else:
                char_comp = person[key].split(';')
                person[key] = ""
                #strip out duplicate characteristics
                #(same substring-based duplicate test as for ethnicities)
                for char in char_comp:
                    char = normalize_text(char, "synonyms.json", context="characteristic")                        
                    char = translate_characteristic(char, "synonyms.json", language)                        
                    if not (char in person[key]):
                        if person[key] == "":
                            person[key] = char
                        else:
                            person[key] = person[key] + ';' + char           

    #future improvement: find additional references for plural characteristics

    #count name frequency
    #names[i] is a distinct normalized name and name_counts[i] its number of
    #occurrences; people without a name are not counted
    if person["name"] != None:
        if person["name"] in names:
            name_counts[names.index(person['name'])] += 1
        else:
            names.append(person["name"])
            name_counts.append(1)   

#disambiguate and merge people across the volume
#A multi-word name (other than "Unknown principal") that occurs more often than
#10% of the number of images is treated as a single recurring individual: all
#records carrying that name are handed to merge_records and replaced by the
#merged record it returns.
redundant_records = []
merged_records = []
for name, count in zip(names, name_counts):
    if not ((count > .1 * len(images)) and (len(name.split(' ')) > 1) and (name != "Unknown principal")):
        continue
    same_name_records = [person for person in people if person["name"] == name]
    redundant_records.extend(same_name_records)
    merged_records.append(merge_records(same_name_records))

#drop every record that equals one of the merged-away records, then append the
#merged replacements at the end
people = [person for person in people if person not in redundant_records]
people.extend(merged_records)

print("People records enhanced and disambiguated.")

#reduce compound person IDs to single ID, add references field
people, events = compact_references(people, events)

print("Single ID generated for each individual.")

#convert dictionaries into JSON
#The output paths used to hard-code the Windows separator ("volume_records\\"),
#which on other platforms creates a file whose name contains a backslash in the
#working directory. The paths are now built with os.path.join and the output
#directories are created when missing. The bytes written are unchanged: each
#array holds one JSON document per line, separated by ",\n".
import os  #local import: only needed for the output paths below

os.makedirs("volume_records", exist_ok=True)
with open(os.path.join("volume_records", volume_metadata["id"] + ".json"), "w") as outfile:
    outfile.write('{\n"volume": \n')
    json.dump(volume_metadata, outfile)
    outfile.write(',')
    outfile.write('\n"images": [\n')
    outfile.write(",\n".join(json.dumps(image) for image in images))
    outfile.write("\n],\n")
    outfile.write('\n"people": [\n')
    outfile.write(",\n".join(json.dumps(person) for person in people))
    outfile.write("\n],\n")
    outfile.write('"places": [\n')
    outfile.write(",\n".join(json.dumps(place) for place in places))
    outfile.write("\n],\n")
    outfile.write('"events": [\n')
    outfile.write(",\n".join(json.dumps(event) for event in events))
    outfile.write("\n]\n")
    outfile.write('}')

#dump validation dictionaries
os.makedirs("validation", exist_ok=True)
with open(os.path.join("validation", volume_metadata["id"] + ".json"), "w") as outfile:
    outfile.write('{\n"entries": [\n')
    outfile.write(",\n".join(json.dumps(entry) for entry in validation_dict_ALL))
    outfile.write("\n]\n")
    outfile.write('}')

print("JSON built, processing completed.")

#expose the results under their final names; people, places and events already
#carry theirs, so only the remaining four are bound here
#NOTE(review): json_path is "<volume id>_ppe.json", which is not the file name
#opened for the volume records earlier in this script - confirm which is intended
json_path = volume_metadata["id"] + "_ppe.json"
entities = entitiesRunning
noCategory = noCategoryRunning
validation_list = validation_dict_ALL

In [None]:
#no_test

#Export step: imports notebook2script from nbdev.export and calls it with no
#arguments.
#NOTE(review): presumably this regenerates the library modules from the
#notebooks - confirm against the nbdev version in use
from nbdev.export import notebook2script
notebook2script()