In [None]:
#default_exp relationships

In [None]:
#export
#dependencies

#nlp packages
import spacy
from spacy.util import minibatch, compounding

#manipulation of tables/arrays
import pandas as pd
import numpy as np
import copy

#internal imports
from ssda_nlp.collate import *
from ssda_nlp.split_data import *
from ssda_nlp.modeling import *
from ssda_nlp.model_performance_utils import *
from ssda_nlp.xml_parser import *
from ssda_nlp.unstructured2markup import *
from ssda_nlp.utility import *

In [None]:
#export
#ideally this function will eventually query the database live, but for now we'll use static snapshots
#of the vocabularies as they currently exist
#should also eventually be enhanced to return equivalents and a requested language

def retrieve_controlled_vocabularies():
    '''
    builds the static snapshot of the controlled vocabularies used to categorize characteristics

        returns: dictionary mapping each category name to a freshly built list of its known terms
        (the key order is significant to callers that take the first matching category)
    '''
    return {
        "legitimacy": ["lexma", "lexmo", "legitima", "legitimo", "h l", "natural", "nral", "lexitima", "lexitimo", "nat.l"],
        "age": ["parvulo", "parvula", "parv", "adulto", "adulta", "adul", "niño", "niña", "nino", "nina", "dulto", "dulta"],
        "occupation": ["religioso", "ingen.o", "sacristan", "sac.", "sachristan", "cura", "vicario", "eclesiastico", "clerigo", "estudiante"],
        "phenotype": ["negro", "negra", "preto", "moreno", "morena", "indio", "india", "pardo", "parda", "mestizo", "mestiza", "mulato", "mulata", "blanco", "blanca", "criollo", "criolla", "branco", "branca"],
        "titles": ["doctor", "d.r", "dor", "dr", "d.or", "br", "ber", "don", "doña", "da", "padre", "pe", "predicador", "fray", "d.n", "d.", "d,n", "d;n", "p.e", "p", "dn", "fr.", "fr", "f", "regidor", "rex.or", "alg.l m.or", "ldo", "licenciado", "d", "alg.l", "alcalde"],
        "ranks": ["capitan", "capitam", "cap.n", "capn", "sarg.to may.r", "sarg.to", "sargento", "sarjento mayor", "sarjento", "sargto mayor", "theniente", "teniente", "thente"],
        "ethnicities": ["ganga", "español", "espanol", "caravali", "ingles", "yngles", "angola", "carabalí", "carabali", "carabaly", "congo", "conga", "mandinga", "mina", "temo", "malagas", "arara", "manga"],
        "status": ["clavo", "clava", "escl", "clabo", "claba", "esc.va", "esc.ba", "esc.vo", "escvo", "esclava", "escva", "esc.bo", "esc.a", "esc.o", "libre", "esc.s", "esco", "esca"],
        "relationships": ["hermano", "hijo", "hija", "esposo", "esposa", "viudo", "viuda", "padrinos", "padrino", "padryno", "soltera", "soltero", "madrina", "padre", "p.p.", "p. ", "p."],
    }

In [None]:
#export

def build_reciprocal_relationship(people, from_person, to_person, relationship_type):
    '''
    helper function that adds a reciprocal relationship of a specified type to the records of two people
        people: list of dictionaries, each of which represents one mention of a person in the entry
        from_person: unique_id of person that relationship "comes from" (i.e. the parent in a "parent"-type relationship)
        to_person: unique_id of person that relationship "goes to" (i.e. the child in a "parent"-type relationship)
        relationship_type: accepts "parent", "godparent", "enslaver", and "spouse"; any other value
        leaves people untouched

        a side whose id is None, or whose id does not appear in people, is skipped; the other side
        is still recorded (its entry then points at the missing/None id)

        returns: updated version of people with interpersonal relationship added (people is modified in place)
    '''

    #relationship_type -> (label stored on from_person's record, label stored on to_person's record)
    reciprocal_labels = {
        "godparent": ("godchild", "godparent"),
        "parent": ("child", "parent"),
        "enslaver": ("slave", "enslaver"),
        "spouse": ("spouse", "spouse"),
    }

    if relationship_type not in reciprocal_labels:
        return people

    from_label, to_label = reciprocal_labels[relationship_type]

    def _record(owner_id, other_id, label):
        #appends [other_id, label] to the relationships of the record whose id is owner_id
        if owner_id is None:
            return
        position = None
        for i, person in enumerate(people):
            if person['id'] == owner_id:
                #no break: if an id appears more than once, the last matching record is updated
                position = i
        if position is None:
            #id not present in people, so there is no record to attach the relationship to
            return
        if people[position].get('relationships') is None:
            people[position]['relationships'] = [[other_id, label]]
        else:
            people[position]['relationships'].append([other_id, label])

    _record(from_person, to_person, from_label)
    _record(to_person, from_person, to_label)

    return people

In [None]:
#export

def _person_id_at(people_df, start):
    '''
    helper: looks up the unique_id of the person mention whose pred_start equals start
        returns None when no row of people_df starts at that offset
    '''
    matches = people_df.loc[people_df['pred_start'] == start, 'unique_id']
    if len(matches) == 0:
        return None
    #if several rows share the offset, the last one is used
    return matches.iloc[-1]

def alt_assign_relationships(entry_text, entities, people_df, people, volume_metadata):
    '''
    matches labeled godparent relationships to the correct individuals and records them on people
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: df containing all entities extracted from that entry by an NER model
        (NOTE: its index is reset in place, which also inserts the old index as a column)
        people_df: entities given the label "PER" from a single entry by an NER model with unique ids
        people: list of dictionaries, each of which represents one mention of a person in the entry
        (as produced by assign_characteristics)
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        (not used by the current logic)

        only godparent relationships are built at present; processing stops after the first REL entity
        for which at least one godparent could be resolved to a unique id

        returns: updated version of people with interpersonal relationships added
    '''

    #NOTE(review): kept as an in-place reset because callers may rely on the re-indexed frame -- confirm
    entities.reset_index(inplace=True)

    #the principal is the person every godparent is linked to; stays None if it cannot be resolved
    principal_id = None
    principals = determine_principals(entry_text, entities, 1)
    if principals is not None:
        for person in people:
            if person["name"] == principals[0]:
                principal_id = person['id']
                break

    labels = entities['pred_label'].tolist()
    if "REL" not in labels:
        return people

    texts = entities['pred_entity'].tolist()
    starts = entities['pred_start'].tolist()
    n_entities = len(labels)

    #build godparent relationships
    #future improvement: add logic to look for spousal relationship between godparents
    for index in range(n_entities):
        if labels[index] != "REL":
            continue

        term = texts[index].lower()

        #positions (relative to the REL entity) where the godparent names are expected
        if term in ("madrina", "padrino", "padryno"):
            candidates = [index + 1]
        elif term in ("padrinos", "p.p."):
            candidates = [index + 1, index + 2]
        elif "p." in term:
            if (index + 1) >= n_entities:
                candidates = []
            elif "p." not in texts[index + 1].lower():
                candidates = [index + 1]
            else:
                #back-to-back "p." entities: the names follow the second abbreviation
                candidates = [index + 2, index + 3]
        else:
            continue

        found_godparents = False
        for position in candidates:
            if (position >= n_entities) or (labels[position] != "PER"):
                continue
            godparent_id = _person_id_at(people_df, starts[position])
            if godparent_id is None:
                #no person mention starts at this offset, so there is nobody to link
                continue
            people = build_reciprocal_relationship(people, godparent_id, principal_id, "godparent")
            found_godparents = True

        if found_godparents:
            break

    return people

In [None]:
#export

def categorize_characteristics(characteristics_df):
    '''
    labels every characteristic with the controlled-vocabulary category it belongs to
        characteristics_df: entities given the label "CHAR" from a single entry by an NER model

        returns: the same dataframe (modified in place) with an added "category" column; the value is
        None when no vocabulary term occurs in the characteristic text
    '''

    vocabs = retrieve_controlled_vocabularies()

    def _category_of(raw_text):
        #the bare abbreviations "h" / "h." are always treated as relationships
        if raw_text in ('h', 'h.'):
            return "relationships"
        lowered = raw_text.lower()
        #first category (in vocabulary order) owning a term that occurs inside the text wins
        for cat, terms in vocabs.items():
            if any(term in lowered for term in terms):
                return cat
        return None

    characteristics_df["category"] = [_category_of(text) for text in characteristics_df['pred_entity']]

    return characteristics_df

In [None]:
#export
#this is currently configured specifically for baptisms/burials

def assign_characteristics(entry_text, characteristics_df, unique_individuals, volume_metadata):
    '''
    matches all labeled characteristics to the correct individual(s) and builds one record per individual
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        characteristics_df: entities given the label "CHAR" from a single entry by an NER model
        unique_individuals: as determined by id_unique_individuals and/or meta-function of disambig pipeline
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata

        NOTE: both dataframes are re-indexed in place, and characteristics_df gains "category" and
        "assignment" columns

        assignment rules:
            age / legitimacy (baptisms only): assigned to every row naming the principal
            occupation / phenotype / ethnicities / singular status: assigned to the closest person
            that starts less than 50 characters before the characteristic
            status ending in "s": assigned to up to two preceding people (closest within 30
            characters, runner-up within 50)

        returns: structured representation (a list of dictionaries)
    '''
    people = []
    ethnicities = retrieve_controlled_vocabularies()["ethnicities"]
    categorized_characteristics = categorize_characteristics(characteristics_df)
    assignments = [None] * len(characteristics_df.index)
    categorized_characteristics.reset_index(inplace=True)
    unique_individuals.reset_index(inplace=True)

    for index in range(len(categorized_characteristics)):
        category = categorized_characteristics["category"][index]

        if (category in ("age", "legitimacy")) and (volume_metadata["type"] == "baptism"):
            principals = determine_principals(entry_text, unique_individuals, 1)
            if principals is not None:
                princ_loc = unique_individuals.index[unique_individuals["pred_entity"] == principals[0]].tolist()
                principal_ids = [unique_individuals["unique_id"][loc] for loc in princ_loc]
                if len(principal_ids) > 0:
                    assignments[index] = ';'.join(principal_ids)

        elif (category in ("occupation", "phenotype", "ethnicities")) or ((category == "status") and not categorized_characteristics["pred_entity"][index].lower().endswith('s')):
            char_start = categorized_characteristics["pred_start"][index]
            lowest_diff = 50
            assign = None
            for i, person in unique_individuals.iterrows():
                diff = char_start - person["pred_start"]
                #only people mentioned before the characteristic are candidates
                if (diff > 0) and (diff < lowest_diff):
                    lowest_diff = diff
                    assign = i
            if assign is not None:
                assignments[index] = unique_individuals["unique_id"][assign]

        elif category == "status":
            #plural status: shared by up to two preceding people
            char_start = categorized_characteristics["pred_start"][index]
            lowest_diff = 30
            second_lowest_diff = 50
            assign = [None, None]
            for i, person in unique_individuals.iterrows():
                diff = char_start - person["pred_start"]
                if (diff > 0) and (diff < lowest_diff):
                    if assign[0] is not None:
                        #demote the previous closest person; remember ITS distance as the runner-up
                        #distance before lowest_diff is overwritten
                        assign[1] = assign[0]
                        second_lowest_diff = lowest_diff
                    lowest_diff = diff
                    assign[0] = i
                elif (diff > 0) and (diff < second_lowest_diff) and (assign[0] is not None):
                    second_lowest_diff = diff
                    assign[1] = i
            ids = [unique_individuals["unique_id"][a] for a in assign if a is not None]
            if len(ids) > 0:
                assignments[index] = ';'.join(ids)

    categorized_characteristics["assignment"] = assignments

    #display(categorized_characteristics)

    for i in range(len(unique_individuals.index)):

        person_id = unique_individuals["unique_id"][i]
        person_name = unique_individuals["pred_entity"][i]

        characteristics = {"ethnicities":[], "age":None, "legitimacy":None,"occupation":[], "phenotype":[], "status":None, "titles":None, "ranks":None, "relationships":None}

        #ethnicities embedded in the name itself
        for eth in ethnicities:
            if eth in person_name.lower():
                characteristics["ethnicities"].append(eth[0].upper() + eth[1:])

        for j in range(len(categorized_characteristics.index)):
            if assignments[j] is None:
                continue
            #compare whole ids: a substring test would let "...-P1" match "...-P10"
            if person_id in assignments[j].split(';'):
                category = categorized_characteristics["category"][j]
                if category in ("age", "legitimacy", "status"):
                    characteristics[category] = categorized_characteristics["pred_entity"][j]
                else:
                    characteristics[category].append(categorized_characteristics["pred_entity"][j])

        person_record = {"id": person_id, "name": person_name}

        for key in characteristics:
            if (key in ("ethnicities", "occupation", "phenotype")) and (len(characteristics[key]) > 0):
                #multi-valued characteristics are flattened to a ';'-separated string
                person_record[key] = ';'.join(characteristics[key])
            elif (characteristics[key] is not None) and (characteristics[key] != []):
                person_record[key] = characteristics[key]
            else:
                person_record[key] = None

        people.append(person_record)

    return people

In [None]:
#export

def id_unique_individuals(entry_text, entities, volume_metadata):
    '''
    collects the distinct person names found in an entry and gives each one an id
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        (not consulted by the current logic)
        entities: entities of all kinds extracted from that entry by an NER model
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata

        returns: a 2-row array; row 0 holds the distinct "PER" names in order of first appearance and
        row 1 holds the matching ids, formed as <volume id>-<entry_no>-P<n> with n counted from 1
    '''
    #the entry number is read from the first entity row
    event_id = volume_metadata["id"] + '-' + entities.iloc[0]['entry_no']

    is_person = entities['pred_label'] == 'PER'
    names = entities.loc[is_person, 'pred_entity'].unique()

    ids = np.array([event_id + '-P' + str(number) for number in range(1, len(names) + 1)], dtype=object)

    return np.vstack([names, ids])

We want the first row of unique_individuals to contain the "best" (i.e. most complete/most accurate) name for each disambiguated individual. Once we have the ability to drop non-identical string references, we'll need to add a third row to unique_individuals in which each element is a list/array containing any/all disambiguated name strings since we'll need these to correctly attach characteristic/relationship references. 

### find_sus

In [None]:
#export

def find_sus(entry_text, entities, sus_df, index, demo_df=None, entry_no=None):
    '''
    identifies corner cases: all entries where there are multiple entities that 1) have the same first name appearing
        multiple times, 2) have compound names and then a segment of that name appearing, and 3) have a full name with
        the first name by itself appearing
    Note that this should not be used in tandem with id_unique_individuals, as that function just drops the duplicate names

    params:
        entry_text: actual text for comparison (not consulted by the current logic)
        entities: df of entities identified
        sus_df: either the empty df body or the df from previous loop iterations
        index: current row that the loop is on in demo_df
        demo_df: df whose first four columns are read as vol_titl, vol_id, fol_id and text; when omitted,
            a module-level variable named demo_df is used
        entry_no: entry number stored with a flagged entry; when omitted, a module-level variable
            named entry_no is used

    returns: sus_df itself when the entry is not flagged; otherwise a new df with one extra row whose
        "suspect" value is 10 (a single name is contained in a longer name), 0.01 (duplicate names)
        or 11 (both)

    raises: NameError if the entry is flagged and demo_df / entry_no are neither passed nor defined
        at module level
    '''
    #Set up
    people_df = entities.loc[entities['pred_label'] == 'PER'].reset_index(drop=True)

    #Separate people based on whether it is a single name or a full/compound name
    #  NOTE(review): names are read from the second column by position -- presumably pred_entity; confirm
    names = list(people_df.iloc[:, 1])
    check_against = [name for name in names if (" " in name) or ("-" in name)]
    first_names = [name for name in names if not ((" " in name) or ("-" in name))]

    #Check to see whether single names are subsets of full/compound names
    sus = any(first in longer for first in first_names for longer in check_against)
    #Generally check to see if there are any duplicate entities (same name) in the entry
    dups = bool(people_df['pred_entity'].duplicated().any())

    #Set the status column
    if sus and dups:
        status = 11 #ie both sus and dups are true
    elif sus:
        status = 10 #ie sus true, dups false
    elif dups:
        status = 0.01 #ie sus false, dups true
    else:
        status = 0

    #ie if the entry is suspect or has duplicates, then add it to sus_df
    if status > 0:
        #fall back to notebook-level variables so existing four-argument calls keep working
        if demo_df is None:
            demo_df = globals().get('demo_df')
        if entry_no is None:
            entry_no = globals().get('entry_no')
        if demo_df is None:
            raise NameError("name 'demo_df' is not defined; pass demo_df explicitly")
        if entry_no is None:
            raise NameError("name 'entry_no' is not defined; pass entry_no explicitly")

        row = {'vol_titl':demo_df.iloc[index,0], 'vol_id':demo_df.iloc[index,1], 'fol_id':demo_df.iloc[index,2],
               'text':demo_df.iloc[index,3],'entry_no':entry_no,'suspect':status}
        if len(sus_df.index) < 1:
            sus_df = pd.DataFrame([row])
        else:
            #DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
            sus_df = pd.concat([sus_df, pd.DataFrame([row])], ignore_index=True)
    return sus_df

### split_name_col

In [None]:
#export

def split_name_col(people_df):
    '''
    classifies each name in people_df as a single, full or compound name and splits the frame accordingly
        people_df: df of people entities; it gains a "name_status" column (modified in place)
        NOTE(review): names are read from the second column by position -- presumably pred_entity; confirm

        name_status values: 0 = single name (no space or hyphen), 1 = full name (contains a space),
        2 = compound name (contains a hyphen; takes precedence over a space)

        returns: (first_n, full_n, cmpd_n), the rows with name_status 0, 1 and 2 respectively;
        the head of each is also displayed

    ### Functionality is not fully realized yet, could probably be generalized further, but this entire task may not be necessary
    '''
    #Classify every name; the column is written by name so the result does not depend on how many
    #columns people_df happens to have
    names = people_df.iloc[:, 1]
    people_df['name_status'] = [2 if "-" in name else (1 if " " in name else 0) for name in names]

    #Separate into three frames based on single, full and compound name status
    first_n = people_df[people_df.name_status == 0]
    full_n = people_df[people_df.name_status == 1]
    cmpd_n = people_df[people_df.name_status == 2]

    print("DF of first names")
    display(first_n.head())
    print("DF of full names")
    display(full_n.head())
    print("DF of compound names")
    display(cmpd_n.head())
    print("---------------------")

    return first_n, full_n, cmpd_n

### disambiguate
**Known limitation:** `disambiguate` does not yet do anything with the entities once they have been separated.

In [None]:
#export

def disambiguate(entities=None):
    '''
    filters the entities down to people and applies split_name_col to separate them into single,
        full and compound names
        entities: df of entities of all kinds; when omitted, a module-level variable named
        entities is used

        returns: nothing yet -- the separated frames are not processed any further

        raises: NameError if entities is neither passed nor defined at module level
    '''
    if entities is None:
        #fall back to a notebook-level variable so existing zero-argument calls keep working
        entities = globals().get('entities')
        if entities is None:
            raise NameError("name 'entities' is not defined; pass entities explicitly")

    people_df = entities.loc[entities['pred_label'] == 'PER'].reset_index(drop=True)

    first_n, full_n, cmpd_n = split_name_col(people_df)
    

In [None]:
#export

def determine_principals(entry_text, entities, n_principals):
    '''
    picks out the principal of a single-principal event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        n_principals: expected number of principals (only 1 is handled; other values print a message)

        strategies are tried in order until one yields a result: a "PER" entity starting at offset 20
        or earlier, then one starting within 10 characters of 'oleos', then one starting within 10
        characters of 'nombre'; when several entities qualify for a strategy, the last one is kept

        returns: a one-element list holding the principal's name, or None if no principal can be identified
    '''

    lowered_text = entry_text.lower()

    if n_principals == 2:
        print("That number of principals is not supported yet.")
        #process marriage principals
        return None
    if n_principals != 1:
        print("Invalid number of principals.")
        return None

    #(start offset, name) of every person mention, in row order
    person_mentions = [(row['pred_start'], row['pred_entity'])
                       for _, row in entities.iterrows()
                       if row['pred_label'] == 'PER']

    def _last_match(accepts):
        #name of the last mention whose start offset satisfies accepts, wrapped in a list
        chosen = None
        for start, name in person_mentions:
            if accepts(start):
                chosen = [name]
        return chosen

    principals = _last_match(lambda start: start <= 20)

    for keyword in ('oleos', 'nombre'):
        if principals is not None:
            break
        anchor = lowered_text.find(keyword)
        if anchor != -1:
            principals = _last_match(lambda start: abs(start - anchor) <= 10)

    return principals

In [None]:
#export

def _lookup_unique_id(unique_individuals, name):
    '''
    helper: returns the unique_id of the single row of unique_individuals whose pred_entity equals name
        raises ValueError (from .item()) when the name matches no row or more than one row
    '''
    return unique_individuals.loc[unique_individuals.pred_entity==name,"unique_id"].item()

def assign_relationships(entry_text, entities, unique_individuals):
    '''
    Relationship types:
        parent/child --> P. and P.s are parents
        godparents/godchildren --> P.P and p.s are godparents
        slaveholders/enslaved
        spouses
        grandparents
    Note that this function also calls the determine_principals function to help with rel assignment
    Note that entities is read by column position: column 1 is used as the entity text and column 2 as its label
    Prints the entry text and the triples found (when any REL entity is present) followed by a separator line
    Returns: the relationship in forms of triples (subject, REL, relation) i.e. (me, godmother, my_godmother);
        an empty list when no principal can be identified
    '''
    principals = determine_principals(entry_text, unique_individuals, 1)
    if principals is None:
        #without a principal there is no subject for any triple
        print("No principal could be identified, so no relationships were assigned.")
        return []

    principal = principals[0]
    principal_ID = _lookup_unique_id(unique_individuals, principal)
    status = retrieve_controlled_vocabularies()["status"]

    rel = 0 #Variable telling us later whether or not this entry has any identified relationships
    previous = 0 #Variable telling us whether or not the previous REL combined two entities
    #(i.e. P. and P. into P.P. and thus can skip the second P. entity)
    my_relations = []
    m = len(entities)
    for i in range(m):
        if (entities.iloc[i,2]=='REL'):
            rel = 1 #Relationship present
            #We must check to make sure the first entity isn't a REL or it breaks the func due to positional index error
            if i==0 or i==(m-1):
                print("First/last entity is a REL, this functionality is not yet supported.")
            elif entities.iloc[i,1]=='P.P.':
                try:
                    #This gathers the first name, probably the padrino
                    my_relations.append((principal_ID,'Padrino',_lookup_unique_id(unique_individuals, entities.iloc[i+1,1])))
                    #This should be the second name, probably the madrina
                    my_relations.append((principal_ID,'Madrina',_lookup_unique_id(unique_individuals, entities.iloc[i+2,1])))
                except Exception:
                    print("Exception: had last entity in DF as a REL and thus out of bounds in current form of function")
            #Checking if we have back-to-back entities in the form of 'P.' followed by 'P.'
            #(i+2 is bounds-checked so a trailing pair of RELs falls through instead of raising)
            elif ( (entities.iloc[i+1,2]=='REL') and ('P' in entities.iloc[i+1,1]) and ((i+2) < m) and (entities.iloc[i+2,2]=='PER') ):
                previous = 1
                try:
                    my_relations.append((principal_ID,'Padrino',_lookup_unique_id(unique_individuals, entities.iloc[i+2,1])))
                    my_relations.append((principal_ID,'Madrina',_lookup_unique_id(unique_individuals, entities.iloc[i+3,1])))
                except Exception:
                    print("Exception: could not resolve both godparents following back-to-back REL entities")
            #Skipping the second entity from the above case in the next iteration
            elif previous:
                previous = 0
            elif (entities.iloc[i+1,2]=='PER'):
                try:
                    my_relations.append((principal_ID,(entities.iloc[i,1]),_lookup_unique_id(unique_individuals, entities.iloc[i+1,1])))
                except Exception:
                    print("Exception: had last entity in DF as a REL and thus out of bounds in current form of function")
            #NOTE(review): this branch cannot be reached -- a REL followed by a PER is always taken by the
            #branch above, so 'P.' is recorded with its literal text rather than "Padre"; confirm intent
            elif ((entities.iloc[i,1].strip()=='P.') or (entities.iloc[i,1].strip()=='P')) and (entities.iloc[i+1,2]=='PER'):
                try:
                    my_relations.append((principal_ID,"Padre",_lookup_unique_id(unique_individuals, entities.iloc[i+1,1])))
                except Exception:
                    print("Exception: had last entity in DF as a REL and thus out of bounds in current form of function")
            else:
                print("Relationship found, but not between adjacent people")
        #the last entity has no successor, so it can never introduce a slave owner
        elif ((i+1) < m) and (entities.iloc[i,1] in status) and (entities.iloc[i+1,2]=='PER'): #Identify the slave owner
            my_relations.append((principal_ID,"Esclavista",_lookup_unique_id(unique_individuals, entities.iloc[i+1,1])))
    if rel:
        print(entry_text) #Uncomment this for verification
        print()
        print(my_relations)
    print("------------------------------------------")
    print()
    return my_relations

In [None]:
#export

def determine_event_date(entry_text, entities, event_type, volume_metadata, event_ref_pos=None):
    '''
    selects the DATE entity that most plausibly belongs to a given event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: either the record type of the volume (primary event) or a secondary event like a birth
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        event_ref_pos: optional character position of the reference to a secondary event; when given,
        the candidate date that starts closest to it is preferred
        
        returns: the text of the chosen DATE entity, None if no suitable date was found, or an
        explanatory message string when the event is primary and the record type is not handled yet
    '''
    record_type = volume_metadata["type"]
    
    if event_type != record_type:
        #secondary event: the primary event's date is worked out first so that it can be excluded
        primary_date = determine_event_date(entry_text, entities, record_type, volume_metadata)
        chosen = None
        chosen_start = None
        
        for _, row in entities.iterrows():
            if (row['pred_label'] != 'DATE') or (row['pred_entity'] == primary_date):
                continue
            if chosen is None:
                #first candidate is always accepted and becomes the baseline for proximity checks
                chosen = row['pred_entity']
                chosen_start = row['pred_start']
            elif event_ref_pos is None:
                #without a reference position the last candidate encountered wins
                chosen = row['pred_entity']
            elif abs(event_ref_pos - row['pred_start']) < abs(event_ref_pos - chosen_start):
                chosen = row['pred_entity']
                chosen_start = row['pred_start']
        
        return chosen
    
    if record_type == "baptism":
        #the baptism date is taken to be the last DATE starting within the first third of the entry
        cutoff = len(entry_text) / 3
        selected = None
        
        for _, row in entities.iterrows():
            if (row['pred_label'] == 'DATE') and (row['pred_start'] <= cutoff):
                selected = row['pred_entity']
        
        return selected
    
    return "That event type is not supported yet."

In [None]:
#export

def determine_event_location(entry_text, entities, event_type, volume_metadata, event_ref_pos=None):
    '''
    determines where a specific event took place
        entry_text: the full text of a single entry (not consulted by the current implementation)
        entities: entities of all kinds extracted from that entry by an NER model (not consulted)
        event_type: this could be either a valid record_type OR a secondary event like a birth
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        event_ref_pos: optional index for reference to secondary event (not consulted)
        
        returns: the volume's institution when the event is the volume's primary event type,
        otherwise None
    '''
    #only the primary event can be placed for now: it happened at the institution that kept the volume
    is_primary_event = (event_type == volume_metadata["type"])
    return volume_metadata["institution"] if is_primary_event else None

In [None]:
#export

def identify_cleric(entry_text, entities):
    '''
    identifies the cleric associated with a sacramental entry using three successive heuristics
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model        
        
        returns: the text of the entity chosen as cleric, or None if no cleric can be identified
    '''
    cleric = None
    text_length = len(entry_text)
    
    #heuristic 1: a person whose mention ends at (or very near) the end of the entry, i.e. a signature
    for _, row in entities.iterrows():
        is_person = (row['pred_label'] == 'PER')
        chars_to_end = text_length - row['pred_end']
        
        near_end_of_long_entry = is_person and (chars_to_end <= 10) and (text_length > 100)
        #going to keep this condition for now, but it can create false positives when long, incorrect entities are extracted
        #from short and/or garbled entries
        at_very_end = (row['pred_entity'] is not None) and (chars_to_end <= 2) and is_person
        
        if near_end_of_long_entry or at_very_end:
            cleric = row['pred_entity']
    
    #heuristic 2: a person mentioned no more than 15 characters after the preceding DATE entity
    if cleric is None:
        prior_label = None
        prior_end = None
        for _, row in entities.iterrows():
            if (row['pred_label'] == 'PER') and (prior_label == 'DATE') and ((row['pred_start'] - prior_end) <= 15):
                cleric = row['pred_entity']
            prior_label = row['pred_label']
            prior_end = row['pred_end']
    
    #heuristic 3: a person followed within 15 characters by the word "cura" (case-insensitive)
    if cleric is None:
        lowered_text = entry_text.lower()
        for _, row in entities.iterrows():
            if row['pred_label'] != 'PER':
                continue
            cura_at = lowered_text.find("cura", row['pred_start'] + len(row['pred_entity']))
            if (cura_at != -1) and ((cura_at - row['pred_end']) <= 15):
                cleric = row['pred_entity']
    
    return cleric

In [None]:
#export

def build_event(entry_text, entities, event_type, principals, volume_metadata, n_event_within_entry, unique_individuals):
    '''
    builds out relationships related to a baptism or birth event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model; the first row's
        entry_no is used to build the event id, so at least one row is required
        event_type: this could be either a valid record_type OR a secondary event like a birth
        principals: the principal(s) of the event, as indicated by determine_principals; may be None
        or empty, in which case the event is built without a principal
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        n_event_within_entry: event number within entry
        unique_individuals: as determined by id_unique_individuals and/or meta-function of disambig pipeline;
        must provide pred_entity and unique_id for each row
        
        returns: dictionary with keys id, type, principal, date, location and cleric, where principal
        and cleric are unique ids (or None when the individual could not be matched to an id);
        None (after printing a message) when the event type is neither "baptism" nor "birth"
    '''   
    event_id = volume_metadata["id"] + '-' + entities.iloc[0]['entry_no'] + '-E' + str(n_event_within_entry)    
    #it's possible that this function should also be returning an event iterator,
    #but for now I'm planning to do that in build_relationships
    
    if event_type not in ("baptism", "birth"):
        print("That event type can't be built yet.")
        return
    
    #an empty principals collection is treated like a missing one instead of raising IndexError
    if (principals is not None) and (len(principals) > 0):
        principal_name = principals[0]
    else:
        principal_name = None
    
    date = determine_event_date(entry_text, entities, event_type, volume_metadata)
    location = determine_event_location(entry_text, entities, event_type, volume_metadata)
    
    #a cleric is only looked up for the baptism itself; a birth event is built without one
    if event_type == "baptism":
        cleric_name = identify_cleric(entry_text, entities)
    else:
        cleric_name = None
    
    #translate names into unique ids; the first matching mention wins for each role, and a single
    #row is never used for both roles (the principal check takes precedence)
    principal_id = None
    cleric_id = None
    for _, individual in unique_individuals.iterrows():
        name = individual['pred_entity']
        if (principal_id is None) and (principal_name is not None) and (name == principal_name):
            principal_id = individual['unique_id']
        elif (cleric_id is None) and (cleric_name is not None) and (name == cleric_name):
            cleric_id = individual['unique_id']
    
    #names that could not be matched to an id are reported as None rather than as raw text
    event_relationships = {"id": event_id, "type": event_type, "principal": principal_id, "date": date, "location": location, "cleric": cleric_id}
        
    return event_relationships

In [None]:
#export

def drop_obvious_duplicates(people, principals, cleric):
    '''
    first-pass disambiguation that drops multiple mentions of cleric and principal(s)
        people: df containing all entities labeled as people in the entry; it is modified in place
        (rows dropped, index reset) and the same object is returned
        principals: as indicated by determine_principals; duplicates are only dropped when exactly
        one principal was determined (None or any other length leaves every row in place)
        cleric: as identified by identify_cleric, or None if no cleric was identified
        
        returns: people df with obvious duplicates dropped; the previous index is kept as an
        "index" column by the final reset_index
    '''   
    #same guard as id_obvious_duplicates, so a missing principals value no longer raises TypeError
    if (principals is not None) and (len(principals) == 1):
        principal = principals[0]
        seen_principal = False
        seen_cleric = False
        repeat_labels = []
        
        for label, person in people.iterrows():
            name = person['pred_entity']
            is_repeat = False
            
            if name == principal:
                #the first mention is kept, every later one is a repeat
                is_repeat = seen_principal
                seen_principal = True
            
            if (cleric is not None) and (name == cleric):
                is_repeat = is_repeat or seen_cleric
                seen_cleric = True
            
            if is_repeat:
                repeat_labels.append(label)
        
        #rows are dropped once, after the scan, so a mention that repeats both the principal and
        #the cleric is not dropped twice (which raised KeyError)
        if repeat_labels:
            people.drop(repeat_labels, inplace=True)
   
    people.reset_index(inplace=True)
    
    return people

In [None]:
#export

def id_obvious_duplicates(people, principals, cleric):
    '''
    first-pass disambiguation that collects the ids of every mention of the principal and of the cleric
        people: df containing all entities labeled as people in the entry with unique ids
        principals: as indicated by determine_principals
        cleric: as identified by identify_cleric
        
        returns: dictionary with keys "principal" and "cleric", each holding the list of unique ids
        whose mention text equals that individual's name; both lists stay empty unless exactly one
        principal was determined
    '''   
    matches = {"principal": [], "cleric": []}
    
    #nothing is collected (not even cleric mentions) without a single, unambiguous principal
    if (principals is None) or (len(principals) != 1):
        return matches
    
    sole_principal = principals[0]
    
    for _, mention in people.iterrows():
        name = mention['pred_entity']
        
        if name == sole_principal:
            matches["principal"].append(mention["unique_id"])
        
        if name == cleric:
            matches["cleric"].append(mention["unique_id"])
    
    return matches

In [None]:
#export

def assign_unique_ids(people, volume_metadata):
    '''
    gives every person mention in an entry an id of the form <volume id>-<entry_no>-P<n>
        people: df containing all entities labeled as people in the entry that has received first-pass disambiguation
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        
        returns: the same people df with a unique_id column added (numbering starts at 1 and follows
        row order); an empty df is returned untouched
    '''
    n_people = len(people.index)
    
    if n_people == 0:
        return people
    
    #every row is assumed to belong to the same entry, so the first row's entry_no is used for all
    id_prefix = volume_metadata["id"] + '-' + people.iloc[0]['entry_no'] + '-P'
    people['unique_id'] = [id_prefix + str(position) for position in range(1, n_people + 1)]
    
    return people

In [None]:
#export

def merge_records(records_to_merge):
    '''
    merge two or more dictionaries with some (but possibly not all) shared keys
        records_to_merge: list containing the dictionaries to merge; apart from the 'relationships'
        key (a list) and None placeholders, values are expected to be ';'-delimited strings
        
        returns: single, merged dictionary built as a new object, so none of the input records
        (nor their relationship lists) is modified; an empty input list yields an empty dictionary
    '''
    if not records_to_merge:
        return {}
    
    def _own_copy(value):
        #lists are copied so that later appends never leak back into an input record
        return list(value) if isinstance(value, list) else value
    
    merged_record = {key: _own_copy(value) for key, value in records_to_merge[0].items()}
    
    for record in records_to_merge[1:]:
        for key, incoming in record.items():
            if incoming is None:
                continue
            
            #.get covers keys that the records merged so far did not have at all
            if merged_record.get(key) is None:
                merged_record[key] = _own_copy(incoming)
            elif key == 'relationships':
                for rel in incoming:
                    if rel not in merged_record[key]:
                        merged_record[key].append(rel)
            else:
                for value in incoming.split(';'):
                    #case-insensitive substring test against the whole merged string, so a value
                    #contained in a longer existing value is also treated as already present
                    if value.lower() not in merged_record[key].lower():
                        merged_record[key] += ';' + value
                
    return merged_record

In [None]:
#export

def _merge_mentions(people, mention_ids):
    '''
    replaces, in place, every record of people whose 'id' is in mention_ids with one merged record
    appended at the end; people is left untouched when none of the ids is present
    '''
    mentions = [person for person in people if person['id'] in mention_ids]
    
    if not mentions:
        return
    
    #the list is rebuilt in one step instead of deleting entries while iterating over it, which
    #skipped the element following each deletion and so missed adjacent duplicates
    people[:] = [person for person in people if person['id'] not in mention_ids]
    people.append(merge_records(mentions))

def merge_duplicates(people, duplicates):
    '''
    merges multiple mentions of the principal and of the cleric into a single record each
        people: list of person records (dictionaries with an 'id' key); it is modified in place
        duplicates: dictionary containing keys "principal" and "cleric";
        the value of each key is a list containing unique ids for each
        mention of the appropriate individual
        
        returns: the same people list, in which the mentions of each individual type have been
        removed and replaced by one merged record appended at the end; a type with fewer than
        two ids is left as it is
    '''
    for role in ("principal", "cleric"):
        if len(duplicates[role]) > 1:
            _merge_mentions(people, duplicates[role])
    
    return people

In [None]:
#export

def build_entry_metadata(entry_text, entities, path_to_volume_xml):
    '''
    runs the rules-based relationship-linking engine over the transcription of a single entry
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity        
        entities: entities of all kinds extracted from that entry by an NER model
        path_to_volume_xml: path to xml file containing full volume transcription and volume-level metadata
            
        returns: three lists containing structured data about the people, places, and events that
        appear in the entry; None (after printing a message) for any record type other than baptism
    '''
    def entities_labeled(label):
        #independent, re-indexed copy of the rows carrying the requested label
        subset = copy.deepcopy(entities.loc[entities['pred_label'] == label])
        subset.reset_index(inplace=True)
        return subset
    
    volume_metadata = retrieve_volume_metadata(path_to_volume_xml)
    people_df = assign_unique_ids(entities_labeled('PER'), volume_metadata)
    characteristics_df = entities_labeled('CHAR')
    dates_df = entities_labeled('DATE')
    
    if volume_metadata["type"] != "baptism":
        #marriage, burial and every other record type still need their own processing
        print("That record type is not supported yet.")
        return None
    
    principal = determine_principals(entry_text, entities, 1)
    cleric = identify_cleric(entry_text, entities)
    
    #the baptism is always event 1; a second date in the entry triggers a birth event as event 2
    events = [build_event(entry_text, entities, "baptism", principal, volume_metadata, 1, people_df)]
    if len(dates_df.index) > 1:
        events.append(build_event(entry_text, entities, "birth", principal, volume_metadata, 2, people_df))
    
    characteristics_df = categorize_characteristics(characteristics_df)
    people = assign_characteristics(entry_text, characteristics_df, people_df, volume_metadata)
    people = alt_assign_relationships(entry_text, entities, people_df, people, volume_metadata)
    people = merge_duplicates(people, id_obvious_duplicates(people_df, principal, cleric))
    
    #perform more sophisticated disambiguation
    
    #collect each distinct event location once, in order of first appearance
    places = []
    for event in events:
        site = event["location"]
        if (site is not None) and (site not in places):
            places.append(site)
    
    return people, places, events

## Unit testing

### Load trained model and entry data

In [None]:
#no_test

#load the previously trained model used by all of the demo cells below
#NOTE(review): verbose is passed as the string 'True' rather than the boolean True - confirm load_model expects that
trained_model = load_model('models/mat_baut_1', language="es", verbose='True')

Loaded model 'models/mat_baut_1'


In [None]:
#no_test

#parse the volume transcription into a df with one row per entry (text plus volume/folio/entry identifiers)
#NOTE(review): the backslash in this relative path is a Windows-style separator - confirm it resolves on other platforms
path_to_transcription = "transcriptions\\15834.xml"
demo_df = parse_xml_v2(path_to_transcription)
demo_df.head()

Unnamed: 0,vol_titl,vol_id,fol_id,text,entry_no
0,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Juana. Esc.va Domingo veinte y dos d...,1033-1
1,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Paula. Esc.a Juebes veinte y tres de...,1033-2
2,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Maria Esc.a Miercoles prim.o de feb....,1033-3
3,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Bernardo Esc.vo Domingo nueve de Abr...,1033-4
4,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: Fran.co Esc.vo [roto] Abril de mil s...,1034-1


In [None]:
#no_test

#volume-level metadata (record type, place, institution, id, title) read from the same transcription file;
#the demo cells below pass it to the functions that build unique ids
volume_metadata = retrieve_volume_metadata(path_to_transcription)
print(volume_metadata)

{'type': 'baptism', 'country': 'Cuba', 'state': 'Matanzas', 'city': 'Matanzas', 'institution': 'Catedral de San Carlos Borromeo', 'id': '15834', 'title': 'Libro 1 de Bautismos de Pardos y Morenos, 1719 - 1752, Parroquia de San Carlos de Matanzas'}


### Apply model to entry data

In [None]:
#no_test

#apply the model to every entry; ent_preds_df holds one row per predicted entity
#(entry_no, pred_entity, pred_label, pred_start, pred_end) and is what the cells below filter per entry
ent_preds_df, metrics_df, per_ent_metrics = test_model(trained_model, demo_df, "entry_no", "text", score_model=False)
ent_preds_df.head(20)

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
0,1033-1,Juana,PER,10,15
1,1033-1,Esc.va,CHAR,17,23
2,1033-1,Domingo veinte y dos de [roto] y nueve,DATE,24,62
3,1033-1,Thomas de Orvera,PER,66,82
4,1033-1,Juana de nacion,PER,121,136
5,1033-1,Mina,CHAR,137,141
6,1033-1,esclava,CHAR,142,149
7,1033-1,Juan Joseph de Justis,PER,159,180
8,1033-1,P.P.,REL,192,196
9,1033-1,Joseph Salcedo,PER,197,211


### drop_obvious_duplicates

In [None]:
#no_test

#show the PER entities of the first few entries before and after drop_obvious_duplicates
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    
    #deep copies, because drop_obvious_duplicates drops rows from the df it is given in place
    entities = copy.deepcopy(ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no])    
    people = copy.deepcopy(entities.loc[entities['pred_label'] == 'PER'])
    
    display(people)
    no_dup = drop_obvious_duplicates(people, determine_principals(entry_text, entities, 1), identify_cleric(entry_text, entities))
    display(no_dup)
    
    #the check comes after the processing, so the row with index 6 is still shown before stopping
    if index > 5:
        break

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
0,1033-1,Juana,PER,10,15
3,1033-1,Thomas de Orvera,PER,66,82
4,1033-1,Juana de nacion,PER,121,136
7,1033-1,Juan Joseph de Justis,PER,159,180
9,1033-1,Joseph Salcedo,PER,197,211
10,1033-1,Ana de Santiago,PER,214,229
11,1033-1,Thomas de Orvera,PER,263,279


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,0,1033-1,Juana,PER,10,15
1,3,1033-1,Thomas de Orvera,PER,66,82
2,4,1033-1,Juana de nacion,PER,121,136
3,7,1033-1,Juan Joseph de Justis,PER,159,180
4,9,1033-1,Joseph Salcedo,PER,197,211
5,10,1033-1,Ana de Santiago,PER,214,229


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
12,1033-2,Paula,PER,10,15
15,1033-2,Thomas de Orvera,PER,90,106
16,1033-2,Paula,PER,145,150
18,1033-2,Juan Joseph,PER,162,173
19,1033-2,Maria Josepha,PER,177,190
21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235
23,1033-2,Bartholome Rixo,PER,251,266
24,1033-2,Thomas de Orvera,PER,290,306


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,12,1033-2,Paula,PER,10,15
1,15,1033-2,Thomas de Orvera,PER,90,106
2,18,1033-2,Juan Joseph,PER,162,173
3,19,1033-2,Maria Josepha,PER,177,190
4,21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235
5,23,1033-2,Bartholome Rixo,PER,251,266


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
25,1033-3,Maria,PER,10,15
28,1033-3,Thomas de Orvera,PER,83,99
29,1033-3,Maria,PER,136,141
31,1033-3,"Juan,",PER,151,156
32,1033-3,Josepha,PER,159,166
34,1033-3,Capitan Antonio Benites,PER,177,200
36,1033-3,Ysabel Mendez,PER,216,229
37,1033-3,Thomas de Orvera,PER,253,269


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,25,1033-3,Maria,PER,10,15
1,28,1033-3,Thomas de Orvera,PER,83,99
2,31,1033-3,"Juan,",PER,151,156
3,32,1033-3,Josepha,PER,159,166
4,34,1033-3,Capitan Antonio Benites,PER,177,200
5,36,1033-3,Ysabel Mendez,PER,216,229


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
38,1033-4,Bernardo,PER,10,18
41,1033-4,Thomas de Orvera,PER,84,100
42,1033-4,Bernardo,PER,136,144
47,1033-4,D. Juan Joseph de Justis,PER,186,210
49,1033-4,Andres de Morales,PER,221,238
50,1033-4,Thomas de Orvera,PER,262,278


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,38,1033-4,Bernardo,PER,10,18
1,41,1033-4,Thomas de Orvera,PER,84,100
2,47,1033-4,D. Juan Joseph de Justis,PER,186,210
3,49,1033-4,Andres de Morales,PER,221,238


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
51,1034-1,Fran.co,PER,10,17
54,1034-1,Th[roto]mas de [roto]vera bap[roto]izé,PER,80,118
55,1034-1,Fran.co,PER,151,158
60,1034-1,D. Ju[roto] Joseph de Justis,PER,196,224
62,1034-1,Pedro Suares,PER,235,247
63,1034-1,Thomas de Orvera,PER,271,287


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,51,1034-1,Fran.co,PER,10,17
1,54,1034-1,Th[roto]mas de [roto]vera bap[roto]izé,PER,80,118
2,60,1034-1,D. Ju[roto] Joseph de Justis,PER,196,224
3,62,1034-1,Pedro Suares,PER,235,247
4,63,1034-1,Thomas de Orvera,PER,271,287


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
64,1034-2,Ant.o,PER,10,15
67,1034-2,Thomas de Orvera,PER,82,98
68,1034-2,Ant.o,PER,134,139
73,1034-2,D. Juan Joseph de Justis,PER,181,205
75,1034-2,Joseph de Soto,PER,216,230
76,1034-2,Thomas de Orvera,PER,254,270


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,64,1034-2,Ant.o,PER,10,15
1,67,1034-2,Thomas de Orvera,PER,82,98
2,73,1034-2,D. Juan Joseph de Justis,PER,181,205
3,75,1034-2,Joseph de Soto,PER,216,230


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
77,1034-3,Antonia,PER,10,17
80,1034-3,Thomas de Orvera,PER,85,101
81,1034-3,Antonia,PER,137,144
86,1034-3,D. Ju.o Joseph de Justis,PER,183,207
88,1034-3,Joseph Salcedo,PER,218,232
89,1034-3,Thomas de Orvera,PER,256,272


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,77,1034-3,Antonia,PER,10,17
1,80,1034-3,Thomas de Orvera,PER,85,101
2,86,1034-3,D. Ju.o Joseph de Justis,PER,183,207
3,88,1034-3,Joseph Salcedo,PER,218,232


### assign_unique_ids

In [None]:
#no_test

#for the first few entries: print the text, then show the de-duplicated people before and after ids are assigned
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    
    print(entry_text)
    
    #deep copies, because drop_obvious_duplicates drops rows from the df it is given in place
    entities = copy.deepcopy(ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no])    
    people = copy.deepcopy(entities.loc[entities['pred_label'] == 'PER'])    
    
    no_dup = drop_obvious_duplicates(people, determine_principals(entry_text, entities, 1), identify_cleric(entry_text, entities))
    display(no_dup)
    #assign_unique_ids adds the unique_id column to no_dup itself and returns it
    display(assign_unique_ids(no_dup, volume_metadata))
    
    #the check comes after the processing, so the row with index 11 is still shown before stopping
    if index > 10:
        break

[margin]: Juana. Esc.va Domingo veinte y dos de [roto] y nueve yo Thomas de Orvera baptize, y pusse [roto] s.tos oleos a Juana de nacion Mina esclava de[roto] Juan Joseph de Justis fueron sus P.P. Joseph Salcedo y Ana de Santiago su mugger, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,0,1033-1,Juana,PER,10,15
1,3,1033-1,Thomas de Orvera,PER,66,82
2,4,1033-1,Juana de nacion,PER,121,136
3,7,1033-1,Juan Joseph de Justis,PER,159,180
4,9,1033-1,Joseph Salcedo,PER,197,211
5,10,1033-1,Ana de Santiago,PER,214,229


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,0,1033-1,Juana,PER,10,15,15834-1033-1-P1
1,3,1033-1,Thomas de Orvera,PER,66,82,15834-1033-1-P2
2,4,1033-1,Juana de nacion,PER,121,136,15834-1033-1-P3
3,7,1033-1,Juan Joseph de Justis,PER,159,180,15834-1033-1-P4
4,9,1033-1,Joseph Salcedo,PER,197,211,15834-1033-1-P5
5,10,1033-1,Ana de Santiago,PER,214,229,15834-1033-1-P6


[margin]: Paula. Esc.a Juebes veinte y tres de feb.o de mil sietec.tos. y diez y nueve Yo Thomas de Orvera baptizé, y pusse los santos15 oleos á Paula h. l.16 de Juan Joseph, y Maria Josepha esc.s del Capitan D. Luis Hurtado de Mendoza fue su Padrino Bartholome Rixo, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,12,1033-2,Paula,PER,10,15
1,15,1033-2,Thomas de Orvera,PER,90,106
2,18,1033-2,Juan Joseph,PER,162,173
3,19,1033-2,Maria Josepha,PER,177,190
4,21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235
5,23,1033-2,Bartholome Rixo,PER,251,266


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,12,1033-2,Paula,PER,10,15,15834-1033-2-P1
1,15,1033-2,Thomas de Orvera,PER,90,106,15834-1033-2-P2
2,18,1033-2,Juan Joseph,PER,162,173,15834-1033-2-P3
3,19,1033-2,Maria Josepha,PER,177,190,15834-1033-2-P4
4,21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235,15834-1033-2-P5
5,23,1033-2,Bartholome Rixo,PER,251,266,15834-1033-2-P6


[margin]: Maria Esc.a Miercoles prim.o de feb.o de mil siete.tos y diez y nueve Yo Thomas de Orvera baptizé, y pusse los santos oleos á Maria h. l. de Juan, y Josepha esc.s del Capitan Antonio Benites fue su Madrina Ysabel Mendez, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,25,1033-3,Maria,PER,10,15
1,28,1033-3,Thomas de Orvera,PER,83,99
2,31,1033-3,"Juan,",PER,151,156
3,32,1033-3,Josepha,PER,159,166
4,34,1033-3,Capitan Antonio Benites,PER,177,200
5,36,1033-3,Ysabel Mendez,PER,216,229


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,25,1033-3,Maria,PER,10,15,15834-1033-3-P1
1,28,1033-3,Thomas de Orvera,PER,83,99,15834-1033-3-P2
2,31,1033-3,"Juan,",PER,151,156,15834-1033-3-P3
3,32,1033-3,Josepha,PER,159,166,15834-1033-3-P4
4,34,1033-3,Capitan Antonio Benites,PER,177,200,15834-1033-3-P5
5,36,1033-3,Ysabel Mendez,PER,216,229,15834-1033-3-P6


[margin]: Bernardo Esc.vo Domingo nueve de Abril de mil sietectos y diez y nueve Yo Thomas de Orvera baptize, y pusse los s.tos oleos á Bernardo negro adulto de nacion carabali esc.o de D. Juan Joseph de Justis fue su P. Andres de Morales, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,38,1033-4,Bernardo,PER,10,18
1,41,1033-4,Thomas de Orvera,PER,84,100
2,47,1033-4,D. Juan Joseph de Justis,PER,186,210
3,49,1033-4,Andres de Morales,PER,221,238


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,38,1033-4,Bernardo,PER,10,18,15834-1033-4-P1
1,41,1033-4,Thomas de Orvera,PER,84,100,15834-1033-4-P2
2,47,1033-4,D. Juan Joseph de Justis,PER,186,210,15834-1033-4-P3
3,49,1033-4,Andres de Morales,PER,221,238,15834-1033-4-P4


[margin]: Fran.co Esc.vo [roto] Abril de mil sietec.tos y diez, y nueve [roto]o Th[roto]mas de [roto]vera bap[roto]izé, y pusse los s.tos oleo[roto] a Fran.co negro adulto de nacion temo esc.o de D. Ju[roto] Joseph de Justis fue su P. Pedro Suares, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,51,1034-1,Fran.co,PER,10,17
1,54,1034-1,Th[roto]mas de [roto]vera bap[roto]izé,PER,80,118
2,60,1034-1,D. Ju[roto] Joseph de Justis,PER,196,224
3,62,1034-1,Pedro Suares,PER,235,247
4,63,1034-1,Thomas de Orvera,PER,271,287


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,51,1034-1,Fran.co,PER,10,17,15834-1034-1-P1
1,54,1034-1,Th[roto]mas de [roto]vera bap[roto]izé,PER,80,118,15834-1034-1-P2
2,60,1034-1,D. Ju[roto] Joseph de Justis,PER,196,224,15834-1034-1-P3
3,62,1034-1,Pedro Suares,PER,235,247,15834-1034-1-P4
4,63,1034-1,Thomas de Orvera,PER,271,287,15834-1034-1-P5


[margin]: Ant.o Esc.vo Domingo nueve de Abril de mil sietec.tos y diez y nueve Yo Thomas de Orvera baptize, y pusse los s.tos oleos á Ant.o negro adulto de nacion carabali esc.o de D. Juan Joseph de Justis fue su P. Joseph de Soto, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,64,1034-2,Ant.o,PER,10,15
1,67,1034-2,Thomas de Orvera,PER,82,98
2,73,1034-2,D. Juan Joseph de Justis,PER,181,205
3,75,1034-2,Joseph de Soto,PER,216,230


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,64,1034-2,Ant.o,PER,10,15,15834-1034-2-P1
1,67,1034-2,Thomas de Orvera,PER,82,98,15834-1034-2-P2
2,73,1034-2,D. Juan Joseph de Justis,PER,181,205,15834-1034-2-P3
3,75,1034-2,Joseph de Soto,PER,216,230,15834-1034-2-P4


[margin]: Antonia Esc.va Domingo nueve de Abril de mil sietec.tos y diez, y nueve Yo Thomas de Orvera baptize, y pusse los s.tos oleos á Antonia negra adulta de nacion mina esc.va de D. Ju.o Joseph de Justis fue su P. Joseph Salcedo, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,77,1034-3,Antonia,PER,10,17
1,80,1034-3,Thomas de Orvera,PER,85,101
2,86,1034-3,D. Ju.o Joseph de Justis,PER,183,207
3,88,1034-3,Joseph Salcedo,PER,218,232


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,77,1034-3,Antonia,PER,10,17,15834-1034-3-P1
1,80,1034-3,Thomas de Orvera,PER,85,101,15834-1034-3-P2
2,86,1034-3,D. Ju.o Joseph de Justis,PER,183,207,15834-1034-3-P3
3,88,1034-3,Joseph Salcedo,PER,218,232,15834-1034-3-P4


[margin]: M.a Luisa esc.va Domingo nueve de Abril de mil sietec.tos y diez, y nueve Yo Thomas de Orvera baptize, y puse los s.tos oleos á Maria Luisa de nacion lucumi negra adulta esc.a de D. Ju.o Joseph de Justis fue su P. Jacinto de Castro, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,90,1034-4,M.a Luisa,PER,10,19
1,93,1034-4,Thomas de Orvera,PER,87,103
2,94,1034-4,Maria Luisa de nacion,PER,138,159
3,98,1034-4,D. Ju.o Joseph de Justis,PER,189,213
4,100,1034-4,Jacinto de Castro,PER,224,241


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,90,1034-4,M.a Luisa,PER,10,19,15834-1034-4-P1
1,93,1034-4,Thomas de Orvera,PER,87,103,15834-1034-4-P2
2,94,1034-4,Maria Luisa de nacion,PER,138,159,15834-1034-4-P3
3,98,1034-4,D. Ju.o Joseph de Justis,PER,189,213,15834-1034-4-P4
4,100,1034-4,Jacinto de Castro,PER,224,241,15834-1034-4-P5


[margin]: Ana esc.va17 Domingo nueve de Abril de mil sietec.tos y diez, y


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,102,1034-5,Ana,PER,10,13


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,102,1034-5,Ana,PER,10,13,15834-1034-5-P1


Nueve yo [roto]omas de O[roto]ver[. . .] oleos á Ana negra adulta [. . .] D. Ju.o Joseph de Justis fue su madrina Ysabe[roto] Delgado, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,105,1035-1,Ana,PER,49,52
1,108,1035-1,D. Ju.o Joseph de Justis,PER,74,98
2,110,1035-1,Ysabe[roto] Delgado,PER,114,133
3,111,1035-1,Thomas de Orvera,PER,157,173


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,105,1035-1,Ana,PER,49,52,15834-1035-1-P1
1,108,1035-1,D. Ju.o Joseph de Justis,PER,74,98,15834-1035-1-P2
2,110,1035-1,Ysabe[roto] Delgado,PER,114,133,15834-1035-1-P3
3,111,1035-1,Thomas de Orvera,PER,157,173,15834-1035-1-P4


[margin]: Theresa esc.va Domingo nueve de Abril de mil sietec.tos y diez, y nueve yo Thomas de Orvera baptize, y pusse los s.tos oleos a Theresa negra adulta de nacion mina esc.va de D. Juan Joseph de Justis fueron sus P. P. Joseph Salcede Soto, y Ana de Santiago, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,112,1035-2,Theresa,PER,10,17
1,115,1035-2,Thomas de Orvera,PER,85,101
2,121,1035-2,D. Juan Joseph de Justis,PER,183,207
3,124,1035-2,Joseph Salcede Soto,PER,225,244
4,125,1035-2,Ana de Santiago,PER,248,263


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,112,1035-2,Theresa,PER,10,17,15834-1035-2-P1
1,115,1035-2,Thomas de Orvera,PER,85,101,15834-1035-2-P2
2,121,1035-2,D. Juan Joseph de Justis,PER,183,207,15834-1035-2-P3
3,124,1035-2,Joseph Salcede Soto,PER,225,244,15834-1035-2-P4
4,125,1035-2,Ana de Santiago,PER,248,263,15834-1035-2-P5


[margin]: Antonio. esc.vo Martes onze de Abril de mil sietec.tos y diez y nueve Yo Thomas de Orvera baptizé y pusse los s.tos oleos á Ant.o negro adulto de nacion carabali esc.vo de D. Ju.n. Joseph de Justis fue su P. Pedro Montes de Oca, y lo firmé. [signed]: Thomas de Orvera


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,127,1035-3,Antonio,PER,10,17
1,130,1035-3,Thomas de Orvera,PER,83,99
2,131,1035-3,Ant.o,PER,134,139
3,136,1035-3,D. Ju.n. Joseph de Justis,PER,182,207
4,138,1035-3,Pedro Montes de Oca,PER,218,237


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,127,1035-3,Antonio,PER,10,17,15834-1035-3-P1
1,130,1035-3,Thomas de Orvera,PER,83,99,15834-1035-3-P2
2,131,1035-3,Ant.o,PER,134,139,15834-1035-3-P3
3,136,1035-3,D. Ju.n. Joseph de Justis,PER,182,207,15834-1035-3-P4
4,138,1035-3,Pedro Montes de Oca,PER,218,237,15834-1035-3-P5


### id_unique_individuals

In [None]:
#no_test

# Demo: for each of the first demo entries, print what id_unique_individuals
# returns for that entry's predicted entities.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']
    # restrict the entity predictions to the current entry
    entry_mask = ent_preds_df['entry_no'] == entry_no
    entities = ent_preds_df.loc[entry_mask]
    individuals = id_unique_individuals(entry_text, entities, volume_metadata)
    print(individuals)
    # stop once the row label exceeds 15 (the row carrying that label is still processed)
    if index > 15:
        break

[['Juana' 'Thomas de Orvera' 'Juana de nacion' 'Juan Joseph de Justis'
  'Joseph Salcedo' 'Ana de Santiago']
 ['15834-1033-1-P1' '15834-1033-1-P2' '15834-1033-1-P3' '15834-1033-1-P4'
  '15834-1033-1-P5' '15834-1033-1-P6']]
[['Paula' 'Thomas de Orvera' 'Juan Joseph' 'Maria Josepha'
  'Capitan D. Luis Hurtado de Mendoza' 'Bartholome Rixo']
 ['15834-1033-2-P1' '15834-1033-2-P2' '15834-1033-2-P3' '15834-1033-2-P4'
  '15834-1033-2-P5' '15834-1033-2-P6']]
[['Maria' 'Thomas de Orvera' 'Juan,' 'Josepha' 'Capitan Antonio Benites'
  'Ysabel Mendez']
 ['15834-1033-3-P1' '15834-1033-3-P2' '15834-1033-3-P3' '15834-1033-3-P4'
  '15834-1033-3-P5' '15834-1033-3-P6']]
[['Bernardo' 'Thomas de Orvera' 'D. Juan Joseph de Justis'
  'Andres de Morales']
 ['15834-1033-4-P1' '15834-1033-4-P2' '15834-1033-4-P3' '15834-1033-4-P4']]
[['Fran.co' 'Th[roto]mas de [roto]vera bap[roto]izé'
  'D. Ju[roto] Joseph de Justis' 'Pedro Suares' 'Thomas de Orvera']
 ['15834-1034-1-P1' '15834-1034-1-P2' '15834-1034-1-P3' '1583

### find_sus

In [None]:
#no_test

# Suspect codes written to the 'suspect' column, for reference:
#   0.01  -> there are duplicates, but no first names that appear alone
#   10.00 -> there are first names that are subsets of full names, but no duplicates
#   11    -> there are both first names that are subsets and duplicates
sus_columns = ['vol_titl', 'vol_id', 'fol_id', 'text','entry_no','suspect']
sus_df = pd.DataFrame(columns=sus_columns)

# Feed each demo entry through find_sus, which hands back the (possibly extended) frame.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']
    entry_mask = ent_preds_df['entry_no'] == entry_no
    entities = ent_preds_df.loc[entry_mask]
    sus_df = find_sus(entry_text, entities, sus_df, index)
    if index > 150: #Has 481 rows
        break

n_suspect = len(sus_df)
print("Here is the df of sus entries, with a length of ", n_suspect)
display(sus_df.head(20))

Here is the df of sus entries, with a length of  129


Unnamed: 0,vol_titl,vol_id,fol_id,text,entry_no,suspect
0,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Juana. Esc.va Domingo veinte y dos d...,1033-1,11.0
1,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Paula. Esc.a Juebes veinte y tres de...,1033-2,0.01
2,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Maria Esc.a Miercoles prim.o de feb....,1033-3,0.01
3,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Bernardo Esc.vo Domingo nueve de Abr...,1033-4,0.01
4,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: Fran.co Esc.vo [roto] Abril de mil s...,1034-1,0.01
5,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: Ant.o Esc.vo Domingo nueve de Abril ...,1034-2,0.01
6,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: Antonia Esc.va Domingo nueve de Abri...,1034-3,0.01
7,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: M.a Luisa esc.va Domingo nueve de Ab...,1034-4,0.01
8,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1035,[margin]: Theresa esc.va Domingo nueve de Abri...,1035-2,0.01
9,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1035,[margin]: Antonio. esc.vo Martes onze de Abril...,1035-3,0.01


### disambiguate

In [None]:
#no_test

# Demo: walk the suspect entries collected in sus_df and call disambiguate() for
# those whose suspect code is above 9 (codes 10 and 11 in the find_sus demo legend,
# i.e. entries with first names that are subsets of full names).
for index, row in sus_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    status = row['suspect']
    # entity predictions belonging to this entry only
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    if status == 0.01:
        pass #There are only duplicates, so might want to deal with this separately
    elif status > 9: #ie sus is true, disambiguation needed
        # NOTE(review): disambiguate() is called with no arguments, so entry_text and
        # entities built above are not passed to it explicitly -- confirm this is intended.
        disambiguate()
    # NOTE(review): the "481 rows" remark looks copied from the demo_df loops; this loop
    # iterates sus_df -- confirm. The row whose label first exceeds 10 is still processed.
    if index > 10: #Has 481 rows
        break

DF of first names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
0,1033-1,Juana,PER,10,15,0


DF of full names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
1,1033-1,Thomas de Orvera,PER,66,82,1
2,1033-1,Juana de nacion,PER,121,136,1
3,1033-1,Juan Joseph de Justis,PER,159,180,1
4,1033-1,Joseph Salcedo,PER,197,211,1
5,1033-1,Ana de Santiago,PER,214,229,1


DF of compound names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


---------------------


## assign_characteristics

In [None]:
#no_test

# Demo: assign IDs to every PER mention of an entry, then attach the entry's
# CHAR entities to those people via assign_characteristics.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']

    # deep copies, so nothing done below can touch ent_preds_df
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no].copy(deep=True)
    people = entities.loc[entities['pred_label'] == 'PER'].copy(deep=True)
    characteristics = entities.loc[entities['pred_label'] == 'CHAR'].copy(deep=True)

    # drop_obvious_duplicates is not applied in this demo, so every PER mention is passed on as-is
    unique_individuals = assign_unique_ids(people, volume_metadata)

    display(unique_individuals)

    people_with_characteristics = assign_characteristics(entry_text, characteristics, unique_individuals, volume_metadata)
    print(people_with_characteristics)

    if index > 5:
        break

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,1033-1,Juana,PER,10,15,15834-1033-1-P1
3,1033-1,Thomas de Orvera,PER,66,82,15834-1033-1-P2
4,1033-1,Juana de nacion,PER,121,136,15834-1033-1-P3
7,1033-1,Juan Joseph de Justis,PER,159,180,15834-1033-1-P4
9,1033-1,Joseph Salcedo,PER,197,211,15834-1033-1-P5
10,1033-1,Ana de Santiago,PER,214,229,15834-1033-1-P6
11,1033-1,Thomas de Orvera,PER,263,279,15834-1033-1-P7


[{'id': '15834-1033-1-P1', 'name': 'Juana', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.va', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-1-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-1-P3', 'name': 'Juana de nacion', 'ethnicities': 'Mina', 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'esclava', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-1-P4', 'name': 'Juan Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-1-P5', 'name': 'Joseph Salcedo', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': Non

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
12,1033-2,Paula,PER,10,15,15834-1033-2-P1
15,1033-2,Thomas de Orvera,PER,90,106,15834-1033-2-P2
16,1033-2,Paula,PER,145,150,15834-1033-2-P3
18,1033-2,Juan Joseph,PER,162,173,15834-1033-2-P4
19,1033-2,Maria Josepha,PER,177,190,15834-1033-2-P5
21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235,15834-1033-2-P6
23,1033-2,Bartholome Rixo,PER,251,266,15834-1033-2-P7
24,1033-2,Thomas de Orvera,PER,290,306,15834-1033-2-P8


[{'id': '15834-1033-2-P1', 'name': 'Paula', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.a', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-2-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-2-P3', 'name': 'Paula', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-2-P4', 'name': 'Juan Joseph', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'esc.s', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-2-P5', 'name': 'Maria Josepha', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'sta

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
25,1033-3,Maria,PER,10,15,15834-1033-3-P1
28,1033-3,Thomas de Orvera,PER,83,99,15834-1033-3-P2
29,1033-3,Maria,PER,136,141,15834-1033-3-P3
31,1033-3,"Juan,",PER,151,156,15834-1033-3-P4
32,1033-3,Josepha,PER,159,166,15834-1033-3-P5
34,1033-3,Capitan Antonio Benites,PER,177,200,15834-1033-3-P6
36,1033-3,Ysabel Mendez,PER,216,229,15834-1033-3-P7
37,1033-3,Thomas de Orvera,PER,253,269,15834-1033-3-P8


[{'id': '15834-1033-3-P1', 'name': 'Maria', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.a', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-3-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-3-P3', 'name': 'Maria', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-3-P4', 'name': 'Juan,', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'esc.s', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-3-P5', 'name': 'Josepha', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'esc.s

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
38,1033-4,Bernardo,PER,10,18,15834-1033-4-P1
41,1033-4,Thomas de Orvera,PER,84,100,15834-1033-4-P2
42,1033-4,Bernardo,PER,136,144,15834-1033-4-P3
47,1033-4,D. Juan Joseph de Justis,PER,186,210,15834-1033-4-P4
49,1033-4,Andres de Morales,PER,221,238,15834-1033-4-P5
50,1033-4,Thomas de Orvera,PER,262,278,15834-1033-4-P6


[{'id': '15834-1033-4-P1', 'name': 'Bernardo', 'ethnicities': None, 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.vo', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-4-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-4-P3', 'name': 'Bernardo', 'ethnicities': 'carabali', 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': 'negro', 'status': 'esc.o', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-4-P4', 'name': 'D. Juan Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-4-P5', 'name': 'Andres de Morales', 'ethnicities': None, 'age': None, 'legitimacy': None, 'o

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
51,1034-1,Fran.co,PER,10,17,15834-1034-1-P1
54,1034-1,Th[roto]mas de [roto]vera bap[roto]izé,PER,80,118,15834-1034-1-P2
55,1034-1,Fran.co,PER,151,158,15834-1034-1-P3
60,1034-1,D. Ju[roto] Joseph de Justis,PER,196,224,15834-1034-1-P4
62,1034-1,Pedro Suares,PER,235,247,15834-1034-1-P5
63,1034-1,Thomas de Orvera,PER,271,287,15834-1034-1-P6


[{'id': '15834-1034-1-P1', 'name': 'Fran.co', 'ethnicities': None, 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.vo', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-1-P2', 'name': 'Th[roto]mas de [roto]vera bap[roto]izé', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-1-P3', 'name': 'Fran.co', 'ethnicities': 'temo', 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': 'negro', 'status': 'esc.o', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-1-P4', 'name': 'D. Ju[roto] Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-1-P5', 'name': 'Pedro Suares', 'ethnicities': None, 'age': None, 'legiti

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
64,1034-2,Ant.o,PER,10,15,15834-1034-2-P1
67,1034-2,Thomas de Orvera,PER,82,98,15834-1034-2-P2
68,1034-2,Ant.o,PER,134,139,15834-1034-2-P3
73,1034-2,D. Juan Joseph de Justis,PER,181,205,15834-1034-2-P4
75,1034-2,Joseph de Soto,PER,216,230,15834-1034-2-P5
76,1034-2,Thomas de Orvera,PER,254,270,15834-1034-2-P6


[{'id': '15834-1034-2-P1', 'name': 'Ant.o', 'ethnicities': None, 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.vo', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-2-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-2-P3', 'name': 'Ant.o', 'ethnicities': 'carabali', 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': 'negro', 'status': 'esc.o', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-2-P4', 'name': 'D. Juan Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-2-P5', 'name': 'Joseph de Soto', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
77,1034-3,Antonia,PER,10,17,15834-1034-3-P1
80,1034-3,Thomas de Orvera,PER,85,101,15834-1034-3-P2
81,1034-3,Antonia,PER,137,144,15834-1034-3-P3
86,1034-3,D. Ju.o Joseph de Justis,PER,183,207,15834-1034-3-P4
88,1034-3,Joseph Salcedo,PER,218,232,15834-1034-3-P5
89,1034-3,Thomas de Orvera,PER,256,272,15834-1034-3-P6


[{'id': '15834-1034-3-P1', 'name': 'Antonia', 'ethnicities': None, 'age': 'adulta', 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.va', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-3-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-3-P3', 'name': 'Antonia', 'ethnicities': 'mina', 'age': 'adulta', 'legitimacy': None, 'occupation': None, 'phenotype': 'negra', 'status': 'esc.va', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-3-P4', 'name': 'D. Ju.o Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-3-P5', 'name': 'Joseph Salcedo', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupatio

## alt_assign_relationships

In [None]:
#no_test

# Demo: build people records for each entry and print the result of
# alt_assign_relationships for them.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']

    # deep copies, so nothing done below can touch ent_preds_df
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no].copy(deep=True)
    people = entities.loc[entities['pred_label'] == 'PER'].copy(deep=True)
    characteristics = entities.loc[entities['pred_label'] == 'CHAR'].copy(deep=True)

    display(entities)

    unique_individuals = assign_unique_ids(people, volume_metadata)

    # from here on `people` is the output of assign_characteristics, not the PER rows
    people = assign_characteristics(entry_text, characteristics, unique_individuals, volume_metadata)

    relationships = alt_assign_relationships(entry_text, entities, unique_individuals, people, volume_metadata)
    print(relationships)
    print('\n')

    if index > 5:
        break

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
0,1033-1,Juana,PER,10,15
1,1033-1,Esc.va,CHAR,17,23
2,1033-1,Domingo veinte y dos de [roto] y nueve,DATE,24,62
3,1033-1,Thomas de Orvera,PER,66,82
4,1033-1,Juana de nacion,PER,121,136
5,1033-1,Mina,CHAR,137,141
6,1033-1,esclava,CHAR,142,149
7,1033-1,Juan Joseph de Justis,PER,159,180
8,1033-1,P.P.,REL,192,196
9,1033-1,Joseph Salcedo,PER,197,211


[{'id': '15834-1033-1-P1', 'name': 'Juana', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.va', 'titles': None, 'ranks': None, 'relationships': [['15834-1033-1-P5', 'godparent'], ['15834-1033-1-P6', 'godparent']]}, {'id': '15834-1033-1-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-1-P3', 'name': 'Juana de nacion', 'ethnicities': 'Mina', 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'esclava', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-1-P4', 'name': 'Juan Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-1-P5', 'name': 'Joseph Salcedo', 'ethnic

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
12,1033-2,Paula,PER,10,15
13,1033-2,Esc.a,CHAR,17,22
14,1033-2,Juebes veinte y tres de feb.o de mil sietec.to...,DATE,23,86
15,1033-2,Thomas de Orvera,PER,90,106
16,1033-2,Paula,PER,145,150
17,1033-2,h.,CHAR,151,153
18,1033-2,Juan Joseph,PER,162,173
19,1033-2,Maria Josepha,PER,177,190
20,1033-2,esc.s,CHAR,191,196
21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235


[{'id': '15834-1033-2-P1', 'name': 'Paula', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.a', 'titles': None, 'ranks': None, 'relationships': [['15834-1033-2-P7', 'godparent']]}, {'id': '15834-1033-2-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-2-P3', 'name': 'Paula', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-2-P4', 'name': 'Juan Joseph', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'esc.s', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-2-P5', 'name': 'Maria Josepha', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation':

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
25,1033-3,Maria,PER,10,15
26,1033-3,Esc.a,CHAR,16,21
27,1033-3,Miercoles prim.o de feb.o de mil siete.tos y d...,DATE,22,79
28,1033-3,Thomas de Orvera,PER,83,99
29,1033-3,Maria,PER,136,141
30,1033-3,h.,CHAR,142,144
31,1033-3,"Juan,",PER,151,156
32,1033-3,Josepha,PER,159,166
33,1033-3,esc.s,CHAR,167,172
34,1033-3,Capitan Antonio Benites,PER,177,200


[{'id': '15834-1033-3-P1', 'name': 'Maria', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.a', 'titles': None, 'ranks': None, 'relationships': [['15834-1033-3-P7', 'godparent']]}, {'id': '15834-1033-3-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-3-P3', 'name': 'Maria', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-3-P4', 'name': 'Juan,', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'esc.s', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-3-P5', 'name': 'Josepha', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phen

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
38,1033-4,Bernardo,PER,10,18
39,1033-4,Esc.vo,CHAR,19,25
40,1033-4,Domingo nueve de Abril de mil sietectos y diez...,DATE,26,80
41,1033-4,Thomas de Orvera,PER,84,100
42,1033-4,Bernardo,PER,136,144
43,1033-4,negro,CHAR,145,150
44,1033-4,adulto,CHAR,151,157
45,1033-4,carabali,CHAR,168,176
46,1033-4,esc.o,CHAR,177,182
47,1033-4,D. Juan Joseph de Justis,PER,186,210


[{'id': '15834-1033-4-P1', 'name': 'Bernardo', 'ethnicities': None, 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.vo', 'titles': None, 'ranks': None, 'relationships': [['15834-1033-4-P5', 'godparent']]}, {'id': '15834-1033-4-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-4-P3', 'name': 'Bernardo', 'ethnicities': 'carabali', 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': 'negro', 'status': 'esc.o', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-4-P4', 'name': 'D. Juan Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-4-P5', 'name': 'Andres de Morales', 'ethnicities': None, 'age'

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
51,1034-1,Fran.co,PER,10,17
52,1034-1,Esc.vo,CHAR,18,24
53,1034-1,"[roto] Abril de mil sietec.tos y diez, y nueve [",DATE,25,73
54,1034-1,Th[roto]mas de [roto]vera bap[roto]izé,PER,80,118
55,1034-1,Fran.co,PER,151,158
56,1034-1,negro,CHAR,159,164
57,1034-1,adulto,CHAR,165,171
58,1034-1,temo,CHAR,182,186
59,1034-1,esc.o,CHAR,187,192
60,1034-1,D. Ju[roto] Joseph de Justis,PER,196,224


[{'id': '15834-1034-1-P1', 'name': 'Fran.co', 'ethnicities': None, 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.vo', 'titles': None, 'ranks': None, 'relationships': [['15834-1034-1-P5', 'godparent']]}, {'id': '15834-1034-1-P2', 'name': 'Th[roto]mas de [roto]vera bap[roto]izé', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-1-P3', 'name': 'Fran.co', 'ethnicities': 'temo', 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': 'negro', 'status': 'esc.o', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-1-P4', 'name': 'D. Ju[roto] Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-1-P5', 'name': 'Pedro Suares', 'ethnicitie

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
64,1034-2,Ant.o,PER,10,15
65,1034-2,Esc.vo,CHAR,16,22
66,1034-2,Domingo nueve de Abril de mil sietec.tos y die...,DATE,23,78
67,1034-2,Thomas de Orvera,PER,82,98
68,1034-2,Ant.o,PER,134,139
69,1034-2,negro,CHAR,140,145
70,1034-2,adulto,CHAR,146,152
71,1034-2,carabali,CHAR,163,171
72,1034-2,esc.o,CHAR,172,177
73,1034-2,D. Juan Joseph de Justis,PER,181,205


[{'id': '15834-1034-2-P1', 'name': 'Ant.o', 'ethnicities': None, 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.vo', 'titles': None, 'ranks': None, 'relationships': [['15834-1034-2-P5', 'godparent']]}, {'id': '15834-1034-2-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-2-P3', 'name': 'Ant.o', 'ethnicities': 'carabali', 'age': 'adulto', 'legitimacy': None, 'occupation': None, 'phenotype': 'negro', 'status': 'esc.o', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-2-P4', 'name': 'D. Juan Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-2-P5', 'name': 'Joseph de Soto', 'ethnicities': None, 'age': None, '

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
77,1034-3,Antonia,PER,10,17
78,1034-3,Esc.va,CHAR,18,24
79,1034-3,Domingo nueve de Abril de mil sietec.tos y die...,DATE,25,81
80,1034-3,Thomas de Orvera,PER,85,101
81,1034-3,Antonia,PER,137,144
82,1034-3,negra,CHAR,145,150
83,1034-3,adulta,CHAR,151,157
84,1034-3,mina,CHAR,168,172
85,1034-3,esc.va,CHAR,173,179
86,1034-3,D. Ju.o Joseph de Justis,PER,183,207


[{'id': '15834-1034-3-P1', 'name': 'Antonia', 'ethnicities': None, 'age': 'adulta', 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.va', 'titles': None, 'ranks': None, 'relationships': [['15834-1034-3-P5', 'godparent']]}, {'id': '15834-1034-3-P2', 'name': 'Thomas de Orvera', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-3-P3', 'name': 'Antonia', 'ethnicities': 'mina', 'age': 'adulta', 'legitimacy': None, 'occupation': None, 'phenotype': 'negra', 'status': 'esc.va', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-3-P4', 'name': 'D. Ju.o Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-3-P5', 'name': 'Joseph Salcedo', 'ethnicities': None, 'age': None, 

### assign_relationships

In [None]:
#no_test

# Demo: de-duplicate the PER mentions of each entry, give them unique IDs and
# run assign_relationships on the result.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]

    # deep copy of the PER rows of this entry
    people = entities.loc[entities['pred_label'] == 'PER'].copy(deep=True)

    # principals first, then the cleric: same evaluation order as a single nested call
    entry_principals = determine_principals(entry_text, entities, 1)
    entry_cleric = identify_cleric(entry_text, entities)
    people = drop_obvious_duplicates(people, entry_principals, entry_cleric)

    unique_individuals = assign_unique_ids(people, volume_metadata)
    # remove the "index" column in place before displaying
    unique_individuals.drop(columns="index", inplace=True)

    display(entities.head(20))
    display(unique_individuals.head(20))
    # return value is not used here
    assign_relationships(entry_text, entities, unique_individuals)

    if index > 2:
        break

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
0,1033-1,Juana,PER,10,15
1,1033-1,Esc.va,CHAR,17,23
2,1033-1,Domingo veinte y dos de [roto] y nueve,DATE,24,62
3,1033-1,Thomas de Orvera,PER,66,82
4,1033-1,Juana de nacion,PER,121,136
5,1033-1,Mina,CHAR,137,141
6,1033-1,esclava,CHAR,142,149
7,1033-1,Juan Joseph de Justis,PER,159,180
8,1033-1,P.P.,REL,192,196
9,1033-1,Joseph Salcedo,PER,197,211


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,1033-1,Juana,PER,10,15,15834-1033-1-P1
1,1033-1,Thomas de Orvera,PER,66,82,15834-1033-1-P2
2,1033-1,Juana de nacion,PER,121,136,15834-1033-1-P3
3,1033-1,Juan Joseph de Justis,PER,159,180,15834-1033-1-P4
4,1033-1,Joseph Salcedo,PER,197,211,15834-1033-1-P5
5,1033-1,Ana de Santiago,PER,214,229,15834-1033-1-P6


[margin]: Juana. Esc.va Domingo veinte y dos de [roto] y nueve yo Thomas de Orvera baptize, y pusse [roto] s.tos oleos a Juana de nacion Mina esclava de[roto] Juan Joseph de Justis fueron sus P.P. Joseph Salcedo y Ana de Santiago su mugger, y lo firmé. [signed]: Thomas de Orvera

[('15834-1033-1-P1', 'Esclavista', '15834-1033-1-P4'), ('15834-1033-1-P1', 'Padrino', '15834-1033-1-P5'), ('15834-1033-1-P1', 'Madrina', '15834-1033-1-P6')]
------------------------------------------



Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
12,1033-2,Paula,PER,10,15
13,1033-2,Esc.a,CHAR,17,22
14,1033-2,Juebes veinte y tres de feb.o de mil sietec.to...,DATE,23,86
15,1033-2,Thomas de Orvera,PER,90,106
16,1033-2,Paula,PER,145,150
17,1033-2,h.,CHAR,151,153
18,1033-2,Juan Joseph,PER,162,173
19,1033-2,Maria Josepha,PER,177,190
20,1033-2,esc.s,CHAR,191,196
21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,1033-2,Paula,PER,10,15,15834-1033-2-P1
1,1033-2,Thomas de Orvera,PER,90,106,15834-1033-2-P2
2,1033-2,Juan Joseph,PER,162,173,15834-1033-2-P3
3,1033-2,Maria Josepha,PER,177,190,15834-1033-2-P4
4,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235,15834-1033-2-P5
5,1033-2,Bartholome Rixo,PER,251,266,15834-1033-2-P6


[margin]: Paula. Esc.a Juebes veinte y tres de feb.o de mil sietec.tos. y diez y nueve Yo Thomas de Orvera baptizé, y pusse los santos15 oleos á Paula h. l.16 de Juan Joseph, y Maria Josepha esc.s del Capitan D. Luis Hurtado de Mendoza fue su Padrino Bartholome Rixo, y lo firmé. [signed]: Thomas de Orvera

[('15834-1033-2-P1', 'Esclavista', '15834-1033-2-P5'), ('15834-1033-2-P1', 'Padrino', '15834-1033-2-P6')]
------------------------------------------



Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
25,1033-3,Maria,PER,10,15
26,1033-3,Esc.a,CHAR,16,21
27,1033-3,Miercoles prim.o de feb.o de mil siete.tos y d...,DATE,22,79
28,1033-3,Thomas de Orvera,PER,83,99
29,1033-3,Maria,PER,136,141
30,1033-3,h.,CHAR,142,144
31,1033-3,"Juan,",PER,151,156
32,1033-3,Josepha,PER,159,166
33,1033-3,esc.s,CHAR,167,172
34,1033-3,Capitan Antonio Benites,PER,177,200


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,1033-3,Maria,PER,10,15,15834-1033-3-P1
1,1033-3,Thomas de Orvera,PER,83,99,15834-1033-3-P2
2,1033-3,"Juan,",PER,151,156,15834-1033-3-P3
3,1033-3,Josepha,PER,159,166,15834-1033-3-P4
4,1033-3,Capitan Antonio Benites,PER,177,200,15834-1033-3-P5
5,1033-3,Ysabel Mendez,PER,216,229,15834-1033-3-P6


[margin]: Maria Esc.a Miercoles prim.o de feb.o de mil siete.tos y diez y nueve Yo Thomas de Orvera baptizé, y pusse los santos oleos á Maria h. l. de Juan, y Josepha esc.s del Capitan Antonio Benites fue su Madrina Ysabel Mendez, y lo firmé. [signed]: Thomas de Orvera

[('15834-1033-3-P1', 'Esclavista', '15834-1033-3-P5'), ('15834-1033-3-P1', 'Madrina', '15834-1033-3-P6')]
------------------------------------------



Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
38,1033-4,Bernardo,PER,10,18
39,1033-4,Esc.vo,CHAR,19,25
40,1033-4,Domingo nueve de Abril de mil sietectos y diez...,DATE,26,80
41,1033-4,Thomas de Orvera,PER,84,100
42,1033-4,Bernardo,PER,136,144
43,1033-4,negro,CHAR,145,150
44,1033-4,adulto,CHAR,151,157
45,1033-4,carabali,CHAR,168,176
46,1033-4,esc.o,CHAR,177,182
47,1033-4,D. Juan Joseph de Justis,PER,186,210


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,1033-4,Bernardo,PER,10,18,15834-1033-4-P1
1,1033-4,Thomas de Orvera,PER,84,100,15834-1033-4-P2
2,1033-4,D. Juan Joseph de Justis,PER,186,210,15834-1033-4-P3
3,1033-4,Andres de Morales,PER,221,238,15834-1033-4-P4


[margin]: Bernardo Esc.vo Domingo nueve de Abril de mil sietectos y diez y nueve Yo Thomas de Orvera baptize, y pusse los s.tos oleos á Bernardo negro adulto de nacion carabali esc.o de D. Juan Joseph de Justis fue su P. Andres de Morales, y lo firmé. [signed]: Thomas de Orvera

[('15834-1033-4-P1', 'Esclavista', '15834-1033-4-P3'), ('15834-1033-4-P1', 'P.', '15834-1033-4-P4')]
------------------------------------------



### determine_principals

In [None]:
#no_test

# Demo: print the principal name(s) determine_principals returns for the first demo entries.
for idx, record in demo_df.iterrows():
    # entity predictions that belong to this entry only
    entry_entities = ent_preds_df[ent_preds_df['entry_no'] == record['entry_no']]
    print(determine_principals(record['text'], entry_entities, 1))
    # stop once the row label has passed 25 (that row is still printed)
    if idx > 25:
        break

['Juana']
['Paula']
['Maria']
['Bernardo']
['Fran.co']
['Ant.o']
['Antonia']
['M.a Luisa']
['Ana']
['Ana']
['Theresa']
['Antonio']
['Franc.co de Paula']
['Juan']
['Vicente']
['Joseph']
['Ysabel']
['Vicente']
['Joseph']
['Maria']
['Antonia']
['Juan']
['Alexandro']
['Elena Maria']
['Juan Joseph']
['Juan']
['Geronima']


### determine_event_date

In [None]:
#no_test

# Demo: print the event date determine_event_date returns for the first demo entries.
for idx, record in demo_df.iterrows():
    # entity predictions that belong to this entry only
    entry_entities = ent_preds_df[ent_preds_df['entry_no'] == record['entry_no']]
    event_date = determine_event_date(record['text'], entry_entities, volume_metadata["type"], volume_metadata)
    print(event_date)
    # stop once the row label has passed 25 (that row is still printed)
    if idx > 25:
        break

Domingo veinte y dos de [roto] y nueve
Juebes veinte y tres de feb.o de mil sietec.tos. y diez y nueve
Miercoles prim.o de feb.o de mil siete.tos y diez y nueve
Domingo nueve de Abril de mil sietectos y diez y nueve
[roto] Abril de mil sietec.tos y diez, y nueve [
Domingo nueve de Abril de mil sietec.tos y diez y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y
None
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Martes onze de Abril de mil sietec.tos y diez y nueve
Domingo quatro de Junio de mil sietecientos i dies i nuebe yo
Sabado veinte y quatro de Junio de mil sietectos, y diez, y nueve
Domingo dos de Julio de mil sietec.tos y diez y nueve yo
Domingo dos de Julio de mil sietec.tos y diez, y nueve yo
Domingo treinta de Julio de mil sietec.tos y diez y nueve
Domingo dos de Julio de [. . .] fr
Domingo dos de Julio de mil sietec.tos y diez y nueve yo el [r

The function currently fails to find a date for a substantial proportion (~20%) of entries because the dates are not being extracted accurately from the original entry text. If this problem persists as the model improves and more entry data is incorporated into the sample, we will need a post-processing step that brackets a missing event date using the dates of the entries on either side of it. Regardless, we will also need a post-processing step that converts these textual dates into numerical ones.

### determine_event_location

In [None]:
#no_test

# Demo: print the location determine_event_location returns for the first demo entries.
for idx, record in demo_df.iterrows():
    # entity predictions that belong to this entry only
    entry_entities = ent_preds_df[ent_preds_df['entry_no'] == record['entry_no']]
    event_location = determine_event_location(record['text'], entry_entities, volume_metadata["type"], volume_metadata)
    print(event_location)
    # stop once the row label has passed 25 (that row is still printed)
    if idx > 25:
        break

Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo


### identify_cleric

In [None]:
#no_test

# Demo: print the cleric identify_cleric returns for the first demo entries.
for idx, record in demo_df.iterrows():
    # entity predictions that belong to this entry only
    entry_entities = ent_preds_df[ent_preds_df['entry_no'] == record['entry_no']]
    print(identify_cleric(record['text'], entry_entities))
    # stop once the row label has passed 25 (that row is still printed)
    if idx > 25:
        break

Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
None
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Joseph Lopez de Cuella
Joseph Hern.z
Joseph Hern.z
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Thomas de Orvera
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Joseph Lopez de Cuella


## build_event

In [None]:
#no_test

# Demo: build the event record for the first demo entries and report each metadata field.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']

    # Work on independent copies of the filtered frames (DataFrame.copy() is deep by default),
    # so the helpers below never operate on a slice of ent_preds_df.
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no].copy()
    people = entities.loc[entities['pred_label'] == 'PER'].copy()

    # NOTE(review): determine_principals is deliberately evaluated once per consumer; it is not
    # visible from this cell whether drop_obvious_duplicates mutates the list it is handed.
    no_dup = drop_obvious_duplicates(people, determine_principals(entry_text, entities, 1), identify_cleric(entry_text, entities))
    unique_people = assign_unique_ids(no_dup, volume_metadata)

    baptism_event_metadata = build_event(entry_text, entities, volume_metadata["type"], determine_principals(entry_text, entities, 1), volume_metadata, 1, unique_people)

    event_id = baptism_event_metadata["id"]
    print("Event has ID " + event_id)
    print(event_id + " has type " + baptism_event_metadata["type"])

    # Report the remaining fields in a fixed order. Missing values are detected with an
    # identity check ("is None") and reported instead of being concatenated, which would
    # raise TypeError; this now covers the location as well.
    for field in ("principal", "date", "location", "cleric"):
        value = baptism_event_metadata[field]
        if value is None:
            print("Could not identify " + field + " for " + event_id)
        else:
            print(event_id + " has " + field + " " + value)

    # stop once the row label has passed 5 (that row is still reported)
    if index > 5:
        break

Event has ID 15834-1033-1-E1
15834-1033-1-E1 has type baptism
15834-1033-1-E1 has principal 15834-1033-1-P1
15834-1033-1-E1 has date Domingo veinte y dos de [roto] y nueve
15834-1033-1-E1 has location Catedral de San Carlos Borromeo
15834-1033-1-E1 has cleric 15834-1033-1-P2
Event has ID 15834-1033-2-E1
15834-1033-2-E1 has type baptism
15834-1033-2-E1 has principal 15834-1033-2-P1
15834-1033-2-E1 has date Juebes veinte y tres de feb.o de mil sietec.tos. y diez y nueve
15834-1033-2-E1 has location Catedral de San Carlos Borromeo
15834-1033-2-E1 has cleric 15834-1033-2-P2
Event has ID 15834-1033-3-E1
15834-1033-3-E1 has type baptism
15834-1033-3-E1 has principal 15834-1033-3-P1
15834-1033-3-E1 has date Miercoles prim.o de feb.o de mil siete.tos y diez y nueve
15834-1033-3-E1 has location Catedral de San Carlos Borromeo
15834-1033-3-E1 has cleric 15834-1033-3-P2
Event has ID 15834-1033-4-E1
15834-1033-4-E1 has type baptism
15834-1033-4-E1 has principal 15834-1033-4-P1
15834-1033-4-E1 has 

### categorize_characteristics

In [None]:
#no_test

# Demo: display the categorized characteristic (CHAR) entities for the first demo entries.
for index, row in demo_df.iterrows():
    # Select this entry's CHAR predictions with one combined mask and take a single
    # independent copy (DataFrame.copy() is deep by default), so categorize_characteristics
    # receives its own frame rather than a slice of ent_preds_df.
    belongs_to_entry = ent_preds_df['entry_no'] == row['entry_no']
    is_characteristic = ent_preds_df['pred_label'] == 'CHAR'
    characteristics = ent_preds_df.loc[belongs_to_entry & is_characteristic].copy()

    display(categorize_characteristics(characteristics))

    # stop once the row label has passed 5 (that row is still displayed)
    if index > 5:
        break

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,category
1,1033-1,Esc.va,CHAR,17,23,status
5,1033-1,Mina,CHAR,137,141,ethnicities
6,1033-1,esclava,CHAR,142,149,status


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,category
13,1033-2,Esc.a,CHAR,17,22,status
17,1033-2,h.,CHAR,151,153,relationships
20,1033-2,esc.s,CHAR,191,196,status


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,category
26,1033-3,Esc.a,CHAR,16,21,status
30,1033-3,h.,CHAR,142,144,relationships
33,1033-3,esc.s,CHAR,167,172,status


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,category
39,1033-4,Esc.vo,CHAR,19,25,status
43,1033-4,negro,CHAR,145,150,phenotype
44,1033-4,adulto,CHAR,151,157,age
45,1033-4,carabali,CHAR,168,176,ethnicities
46,1033-4,esc.o,CHAR,177,182,status


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,category
52,1034-1,Esc.vo,CHAR,18,24,status
56,1034-1,negro,CHAR,159,164,phenotype
57,1034-1,adulto,CHAR,165,171,age
58,1034-1,temo,CHAR,182,186,ethnicities
59,1034-1,esc.o,CHAR,187,192,status


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,category
65,1034-2,Esc.vo,CHAR,16,22,status
69,1034-2,negro,CHAR,140,145,phenotype
70,1034-2,adulto,CHAR,146,152,age
71,1034-2,carabali,CHAR,163,171,ethnicities
72,1034-2,esc.o,CHAR,172,177,status


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,category
78,1034-3,Esc.va,CHAR,18,24,status
82,1034-3,negra,CHAR,145,150,phenotype
83,1034-3,adulta,CHAR,151,157,age
84,1034-3,mina,CHAR,168,172,ethnicities
85,1034-3,esc.va,CHAR,173,179,status


# build_entry_metadata

In [None]:
#no_test
# Demo: run build_entry_metadata on the first seven entries and print the people and events it returns.
for entry_idx in range(7):
    entry_no, entry_text = demo_df['entry_no'][entry_idx], demo_df['text'][entry_idx]
    print('\n' + entry_text + '\n')

    # entity predictions that belong to this entry only
    entities = ent_preds_df[ent_preds_df['entry_no'] == entry_no]
    people, places, events = build_entry_metadata(entry_text, entities, path_to_transcription)

    # sep='\n' puts every argument on its own line, matching one print call per item
    print("People: ", people, '\n', sep='\n')
    print("Events: ", events, sep='\n')


[margin]: Juana. Esc.va Domingo veinte y dos de [roto] y nueve yo Thomas de Orvera baptize, y pusse [roto] s.tos oleos a Juana de nacion Mina esclava de[roto] Juan Joseph de Justis fueron sus P.P. Joseph Salcedo y Ana de Santiago su mugger, y lo firmé. [signed]: Thomas de Orvera

People: 
[{'id': '15834-1033-1-P1', 'name': 'Juana', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'Esc.va', 'titles': None, 'ranks': None, 'relationships': [['15834-1033-1-P5', 'godparent'], ['15834-1033-1-P6', 'godparent']]}, {'id': '15834-1033-1-P3', 'name': 'Juana de nacion', 'ethnicities': 'Mina', 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': 'esclava', 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1033-1-P4', 'name': 'Juan Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relations

[{'id': '15834-1034-2-E1', 'type': 'baptism', 'principal': '15834-1034-2-P1', 'date': 'Domingo nueve de Abril de mil sietec.tos y diez y nueve', 'location': 'Catedral de San Carlos Borromeo', 'cleric': '15834-1034-2-P2'}]

[margin]: Antonia Esc.va Domingo nueve de Abril de mil sietec.tos y diez, y nueve Yo Thomas de Orvera baptize, y pusse los s.tos oleos á Antonia negra adulta de nacion mina esc.va de D. Ju.o Joseph de Justis fue su P. Joseph Salcedo, y lo firmé. [signed]: Thomas de Orvera

People: 
[{'id': '15834-1034-3-P4', 'name': 'D. Ju.o Joseph de Justis', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': None}, {'id': '15834-1034-3-P5', 'name': 'Joseph Salcedo', 'ethnicities': None, 'age': None, 'legitimacy': None, 'occupation': None, 'phenotype': None, 'status': None, 'titles': None, 'ranks': None, 'relationships': [['15834-1034-3-P1', 'godchild']]}, {'id': '15834-1034-3-P

In [None]:
#no_test

# Tally the person and event records build_entry_metadata produces across every demo entry.
num_ppl = 0
num_events = 0

entry_count = len(demo_df.index)
for entry_idx in range(entry_count):
    entry_no, entry_text = demo_df['entry_no'][entry_idx], demo_df['text'][entry_idx]
    # entity predictions that belong to this entry only
    entities = ent_preds_df[ent_preds_df['entry_no'] == entry_no]
    people, places, events = build_entry_metadata(entry_text, entities, path_to_transcription)
    num_ppl += len(people)
    num_events += len(events)

print("People: " + str(num_ppl))
print("Events: " + str(num_events))

People: 6108
Events: 1337


In [None]:
#no_test

# Export step: call nbdev's notebook2script; the cell output lists each notebook it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 12-ssda-xml-parser.ipynb.
Converted 31-collate-xml-entities-spans.ipynb.
Converted 33-split-data.ipynb.
Converted 41-generic-framework-for-spacy-training.ipynb.
Converted 42-initial-model.ipynb.
Converted 51-data-preprocessing.ipynb.
Converted 52-unstructured-to-markup.ipynb.
Converted 53-markup-to-spatial-historian.ipynb.
Converted 54-utility-functions.ipynb.
Converted 61-prodigy-output-training-demo.ipynb.
Converted 62-full-model-application-demo.ipynb.
Converted 63-pt-model-training.ipynb.
Converted 64-es-model-training.ipynb.
Converted 65-all-annotations-model-training.ipynb.
Converted 66-es-guatemala-model-training.ipynb.
Converted 67-death-and-birth-records-together.ipynb.
Converted 71-relationship-builder.ipynb.
Converted 72-full-volume-processor.ipynb.
