In [None]:
#default_exp relationships

In [None]:
#export
#dependencies

#nlp packages
import spacy
from spacy.util import minibatch, compounding

#manipulation of tables/arrays
import pandas as pd
import numpy as np
import copy

#internal imports
from ssda_nlp.collate import *
from ssda_nlp.split_data import *
from ssda_nlp.modeling import *
from ssda_nlp.model_performance_utils import *
from ssda_nlp.xml_parser import *
from ssda_nlp.unstructured2markup import *

In [None]:
#export

def assign_characteristics(entry_text, entities, unique_individuals):
    '''
    pairs each unique individual with their id (no characteristics are attached by the current implementation)
        entry_text: the full text of a single entry (not read by the current implementation)
        entities: entities of all kinds extracted from that entry by an NER model (not read by the current implementation)
        unique_individuals: two-row structure as returned by id_unique_individuals; row 0 holds names, row 1 the matching ids

        returns: structured representation (a list of {"id": ..., "name": ...} dictionaries, in input order)
    '''
    names = unique_individuals[0]

    return [
        {"id": unique_individuals[1][position], "name": name}
        for position, name in enumerate(names)
    ]

In [None]:
#export

def id_unique_individuals(entry_text, entities, volume_metadata):
    '''
    identifies all unique individuals that appear in an entry (i.e. removing all multiple mentions of the same person)
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
            (not read by the current implementation)
        entities: entities of all kinds extracted from that entry by an NER model; must provide the
            'pred_label', 'pred_entity' and 'entry_no' columns
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata;
            only its "id" value is used

        returns: a 2 x n object array; row 0 holds the distinct name strings in order of first mention and
        row 1 the (temporary?) unique ID of each, formatted <volume id>-<entry_no>-P<k>. An entry without any
        person entity yields an empty 2 x 0 array.
    '''
    is_person = entities['pred_label'] == 'PER'
    #only the name column is needed, so the entities frame is neither re-indexed nor modified
    #unique() keeps first-appearance order, so ids follow the order of first mention
    names = np.asarray(entities.loc[is_person, 'pred_entity'].unique(), dtype=object)

    if len(names) == 0:
        #nobody to identify; also avoids indexing into an entry that has no entities at all
        return np.empty((2, 0), dtype=object)

    event_id = volume_metadata["id"] + '-' + entities.iloc[0]['entry_no']
    ids = np.array([event_id + '-P' + str(i + 1) for i in range(len(names))], dtype=object)

    return np.vstack([names, ids])

We want the first row of unique_individuals to contain the "best" (i.e. most complete/most accurate) name for each disambiguated individual. Once we have the ability to drop non-identical string references, we'll need to add a third row to unique_individuals in which each element is a list/array containing any/all disambiguated name strings since we'll need these to correctly attach characteristic/relationship references. 

### find_sus is broken

In [None]:
#export

def find_sus(entry_text, entities, sus_df, index, source_df=None):
    '''
    identifies corner cases: all entries where there are multiple entities that 1) have the same first name appearing
        multiple times, 2) have compound names and then a segment of that name appearing, and 3) have a full name with
        the first name by itself appearing
    Note that this should not be used in tandem with id_unique_individuals, as that function just drops the duplicate names

    params:
        entry_text: actual text for comparison (not read by the current implementation)
        entities: df of entities identified for this entry ('pred_label', 'pred_entity', 'entry_no' columns are used)
        sus_df: either the empty df body or the df from previous loop iterations
        index: position of the current entry's row in the source dataframe
        source_df: dataframe of entries whose first four columns are vol_titl, vol_id, fol_id and text.
            When omitted, a module-level `demo_df` is looked up, which is how the notebook originally called this.

    returns: df of all the entries that may be corner cases, in the same form as demo_df plus a 'suspect' column:
        10   -> a single name is contained in a full/compound name, no exact duplicates
        0.01 -> exact duplicate names only
        11   -> both of the above
        If the entry is not suspect, sus_df is returned unchanged.

    raises: NameError if the entry is suspect and neither source_df nor a module-level demo_df is available
    '''
    people = entities.loc[entities['pred_label'] == 'PER']
    names = people['pred_entity'].tolist()

    #Separate people based on whether it is a first name or a full/compound name
    multi_part = [name for name in names if (" " in name) or ("-" in name)]
    single_part = [name for name in names if not ((" " in name) or ("-" in name))]

    #Substring test: a lone name that also appears inside a longer name marks the whole entry as sus
    sus = any(first in full for first in single_part for full in multi_part)
    #Generally check to see if there are any duplicate entities (same name) in the entry
    dups = bool(people['pred_entity'].duplicated().any())

    if sus and dups:
        status = 11 #ie both sus and dups are true
    elif sus:
        status = 10 #ie sus true, dups false
    elif dups:
        status = 0.01 #ie sus false, dups true
    else:
        return sus_df

    if source_df is None:
        #legacy call style: fall back to the dataframe the notebook keeps in its global namespace
        source_df = globals().get('demo_df')
        if source_df is None:
            raise NameError("find_sus needs source_df (or a module-level demo_df) to describe the entry")

    #a non-zero status implies at least one person row, so the first row's entry_no always exists
    record = {'vol_titl': source_df.iloc[index, 0], 'vol_id': source_df.iloc[index, 1],
              'fol_id': source_df.iloc[index, 2], 'text': source_df.iloc[index, 3],
              'entry_no': people['entry_no'].iloc[0], 'suspect': status}
    new_row = pd.DataFrame([record])

    if len(sus_df.index) < 1:
        return new_row
    #DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
    return pd.concat([sus_df, new_row], ignore_index=True)

### split_name_col is not completed yet

In [None]:
#export

def _name_status(name):
    '''classifies one name string: 2 = compound (contains "-"), 1 = full (contains a space), 0 = single name'''
    if "-" in name:
        return 2
    if " " in name:
        return 1
    return 0


def split_name_col(people_df):
    '''
    separates a dataframe of people based on whether each name is a single (first) name, a full name or a compound name
        people_df: df that only contains person entities, with the name strings in the 'pred_entity' column;
            it is left unmodified (the status column is added to a copy)

        returns: three dataframes (first_n, full_n, cmpd_n), each carrying an added 'name_status' column where
            0 = single name (no space or hyphen), 1 = full name (contains a space), 2 = compound name (contains a hyphen)

    Also shows the head of each dataframe (via IPython's display when it is available, otherwise via print).
    '''
    import builtins

    #IPython injects display into builtins; fall back to print so the exported module also works outside a notebook
    show = getattr(builtins, 'display', print)

    #Work on a copy and address the status column by name, so the width/order of the incoming columns is irrelevant
    labelled = people_df.copy()
    labelled['name_status'] = [_name_status(name) for name in people_df['pred_entity']]

    #Separate based on single/full/compound name status
    first_n = labelled[labelled.name_status == 0]
    full_n = labelled[labelled.name_status == 1]
    cmpd_n = labelled[labelled.name_status == 2]

    print("DF of first names")
    show(first_n.head())
    print("DF of full names")
    show(full_n.head())
    print("DF of compound names")
    show(cmpd_n.head())
    print("---------------------")

    return first_n, full_n, cmpd_n

### disambiguate
1. Doesn't do anything once entities are separated

In [None]:
#export

def disambiguate(entities=None):
    '''
    takes the entities of a problem case previously identified (see find_sus) and applies split_name_col to break
        the people down into single names, full names and compound names
    No actual disambiguation is performed yet once the names are separated.
        entities: entities of all kinds extracted from one entry by an NER model. When omitted, a module-level
            `entities` variable is looked up, which is how the notebook originally called this function.

        returns: the (first_n, full_n, cmpd_n) dataframes produced by split_name_col

        raises: NameError if no entities are passed and no module-level `entities` exists
    '''
    if entities is None:
        #legacy call style: the notebook loop leaves the current entry's entities in a global variable
        entities = globals().get('entities')
        if entities is None:
            raise NameError("disambiguate needs an entities dataframe (argument or module-level 'entities')")

    #fresh 0..n-1 index on a new frame; nothing is modified in place on the caller's data
    people_df = entities.loc[entities['pred_label'] == 'PER'].reset_index(drop=True)

    return split_name_col(people_df)
    

In [None]:
#export

def determine_principals(entry_text, entities, n_principals):
    '''
    determines the principal of a single-principal event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        n_principals: expected number of principals (only 1 is handled)

        returns: a list with the name(s) of the candidate principal(s), which is empty when none can be identified;
        None (after printing a message) when n_principals is anything other than 1
    '''
    lowered_text = entry_text.lower()

    if not (n_principals == 1):
        if n_principals == 2:
            #marriage principals would be processed here
            print("That number of principals is not supported yet.")
        else:
            print("Invalid number of principals.")
        return None

    #first choice: every person mentioned within the opening characters of the entry
    found = [
        row['pred_entity']
        for _, row in entities.iterrows()
        if row['pred_label'] == 'PER' and row['pred_start'] <= 20
    ]

    #fallbacks, tried in this order and only while nothing has been found:
    #people mentioned close to a cue word in the lower-cased text
    for cue in ('oleos', 'nombre'):
        if len(found) != 0:
            break
        cue_at = lowered_text.find(cue)
        if cue_at == -1:
            continue
        found = [
            row['pred_entity']
            for _, row in entities.iterrows()
            if row['pred_label'] == 'PER' and (abs(row['pred_start'] - cue_at) <= 10)
        ]

    return found

In [None]:
#export

def determine_event_date(entry_text, entities, event_type, volume_metadata):
    '''
    determines the date of a specific event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: this could be either a valid record_type OR a secondary event like a birth
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata

        returns: the date of the event in question, or None if no date can be identified; the string
        "That event type is not supported yet." when the volume's own record type is not handled
    '''
    date = None
    primary_type = volume_metadata["type"]

    if event_type != primary_type:
        #secondary event: its date is a DATE entity that differs from the primary event's date.
        #The lookup must ask for the volume's own record type; asking for event_type again would recurse forever.
        primary_event_date = determine_event_date(entry_text, entities, primary_type, volume_metadata)
        for index, entity in entities.iterrows():
            if (entity['pred_label'] == 'DATE') and (entity['pred_entity'] != primary_event_date):
                date = entity['pred_entity']

    elif primary_type == "baptism":
        #the baptism date is taken from DATE entities starting within the first third of the entry;
        #when several qualify, the last one wins
        entry_length = len(entry_text)

        for index, entity in entities.iterrows():
            if (entity['pred_label'] == 'DATE') and (entity['pred_start'] <= (entry_length / 3)):
                date = entity['pred_entity']

    else:
        date = "That event type is not supported yet."

    return date

In [None]:
#export

def determine_event_location(entry_text, entities, event_type, volume_metadata):
    '''
    determines the location of a specific event
        entry_text: the full text of a single entry (not read by the current implementation)
        entities: entities of all kinds extracted from that entry by an NER model (not read by the current implementation)
        event_type: this could be either a valid record_type OR a secondary event like a birth
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata

        returns: the volume's institution when event_type is the volume's own record type, otherwise the
        string "That event type is not supported yet."
    '''
    #only the volume's primary event can be located so far: it is assumed to take place at the volume's institution
    if event_type != volume_metadata["type"]:
        return "That event type is not supported yet."

    return volume_metadata["institution"]

In [None]:
#export

def identify_cleric(entry_text, entities):
    '''
    identifies the cleric associated with a sacramental entry
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model

        returns: the name of the associated cleric, or None if no cleric can be identified

    Three heuristics are tried in order, each only if the previous one found nobody; within a heuristic the
    last matching person wins.
    '''
    cleric = None
    text_length = len(entry_text)

    #heuristic 1: a person mentioned at (or very near) the end of the entry
    for _, row in entities.iterrows():
        if row['pred_label'] != 'PER':
            continue
        chars_after = text_length - row['pred_end']
        closes_long_entry = (chars_after <= 10) and (text_length > 100)
        #going to keep this condition for now, but it can create false positives when long, incorrect entities are extracted
        #from short and/or garbled entries
        closes_any_entry = (row['pred_entity'] is not None) and (chars_after <= 2)
        if closes_long_entry or closes_any_entry:
            cleric = row['pred_entity']

    #heuristic 2: a person that starts within 15 characters of the end of the DATE entity directly before it
    if cleric is None:
        rows = [row for _, row in entities.iterrows()]
        for earlier, later in zip(rows, rows[1:]):
            follows_date = (later['pred_label'] == 'PER') and (earlier['pred_label'] == 'DATE')
            if follows_date and (later['pred_start'] - earlier['pred_end']) <= 15:
                cleric = later['pred_entity']

    #heuristic 3: a person followed by the word "cura" within 15 characters (case-insensitive)
    if cleric is None:
        lowered_text = entry_text.lower()
        for _, row in entities.iterrows():
            if row['pred_label'] != 'PER':
                continue
            cura_at = lowered_text.find("cura", row['pred_start'] + len(row['pred_entity']))
            if cura_at != -1 and (cura_at - row['pred_end']) <= 15:
                cleric = row['pred_entity']

    return cleric

In [None]:
#export

def build_event(entry_text, entities, event_type, principals, volume_metadata, n_event_within_entry):
    '''
    builds out relationships related to a baptism event (other event types are not built yet)
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: this could be either a valid record_type OR a secondary event like a birth
        principals: the principal(s) of the event, as indicated by determine_principals
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        n_event_within_entry: event number within entry

        returns: structured representation of these relationships as a dictionary with the keys
        "id", "type", "principal", "date", "location" and "cleric"; None (after printing a message)
        when the event type is anything other than "baptism"
    '''
    #id format: <volume id>-<entry_no of the first entity>-<event number>
    event_id = '-'.join([volume_metadata["id"], entities.iloc[0]['entry_no'], str(n_event_within_entry)])
    #it's possible that this function should also be returning an event iterator,
    #but for now I'm planning to do that in build_relationships

    if event_type != "baptism":
        print("That event type can't be built yet.")
        return

    return {
        "id": event_id,
        "type": event_type,
        #only the first listed principal is recorded
        "principal": principals[0] if len(principals) != 0 else None,
        "date": determine_event_date(entry_text, entities, event_type, volume_metadata),
        "location": determine_event_location(entry_text, entities, event_type, volume_metadata),
        "cleric": identify_cleric(entry_text, entities),
    }

In [None]:
#export

def drop_obvious_duplicates(people, principals):
    '''
    first-pass disambiguation that drops multiple mentions of the principal
        people: df containing all entities labeled as people in the entry; it is modified in place
        principals: as indicated by determine_principals; duplicates are only dropped when it holds exactly one name

        returns: the same people df with repeat mentions of the principal removed and its index reset
        (the previous index is kept as a new leading column)
    '''
    if len(principals) == 1:
        is_principal = people['pred_entity'] == principals[0]
        #the running count passes 1 from the second mention onwards, so the first mention is kept
        is_repeat = is_principal & (is_principal.cumsum() > 1)
        people.drop(people.index[is_repeat.to_numpy()], inplace=True)

    people.reset_index(inplace=True)

    return people

In [None]:
#export

def assign_unique_ids(people, volume_metadata):
    '''
    assigns unique ids to each person in an entry
        people: df containing all entities labeled as people in the entry that has received first-pass disambiguation;
            the id column is added to this df in place
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata;
            only its "id" value is used

        returns: people df with a 'unique_id' column appended, formatted <volume id>-<entry_no>-P<k> with k
        counting rows from 1; an empty people df gets an empty 'unique_id' column
    '''
    size = len(people.index)

    if size == 0:
        #no first row to read the entry number from, and no ids are needed anyway
        people['unique_id'] = np.array([], dtype=object)
        return people

    entry_id = volume_metadata["id"] + '-' + people.iloc[0]['entry_no']
    people['unique_id'] = [entry_id + '-P' + str(i + 1) for i in range(size)]

    return people

In [None]:
#export

def build_entry_metadata(entry_text, entities, path_to_volume_xml):
    '''
    Master function that will combine all helper functions built above
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        path_to_volume_xml: path to the volume's XML file, passed to retrieve_volume_metadata

        returns: for baptism volumes, a dictionary whose "people" value is the df of people in the entry
        (repeat mentions of the principal dropped, unique ids assigned); None (after printing a message)
        for every other record type

    TODO: the intended final output is paths to three JSON files containing, respectively, metadata re
    people, places, and events that appear in the entry; events, interpersonal relationships and
    characteristics are not built yet.
    '''
    #same metadata helper as used elsewhere in this notebook
    volume_metadata = retrieve_volume_metadata(path_to_volume_xml)

    if volume_metadata["type"] != "baptism":
        #marriage and burial records (and any other type) still need their own processing
        print("That record type is not supported yet.")
        return None

    #private copy: the helpers below modify the people df in place.
    #The index is left alone here because drop_obvious_duplicates already resets it.
    people_df = entities.loc[entities['pred_label'] == 'PER'].copy()

    principals = determine_principals(entry_text, entities, 1)
    people_df = assign_unique_ids(drop_obvious_duplicates(people_df, principals), volume_metadata)
    #event_relationships = build_event(entry_text, entities, "baptism", principals)
    #interpersonal_relationships = process_interpersonal(entry_text, entities)
    #characteristics = process_characteristics(entry_text, entities, interpersonal_relationships)

    #code that turns pieces defined above into well-formed relationships
    relationships = {"people": people_df}

    return relationships

## Unit testing

### Load trained model and entry data

In [None]:
#no_test

# Load the previously trained model from the local models directory; it is applied to the entry data below.
# NOTE(review): verbose is passed as the string 'True' rather than the boolean True — confirm what load_model expects.
trained_model = load_model('models/mat_baut_1', language="es", verbose='True')

Loaded model 'models/mat_baut_1'


In [None]:
#no_test

# Parse one transcribed volume into a dataframe with one row per entry
# (columns shown in the output: vol_titl, vol_id, fol_id, text, entry_no).
# NOTE(review): the path uses a Windows-style "\\" separator and is relative to the working directory,
# so this cell will presumably not find the file on other platforms — consider os.path.join / pathlib.
path_to_transcription = "transcriptions\\15834.xml"
demo_df = parse_xml_v2(path_to_transcription)
demo_df.head()

Unnamed: 0,vol_titl,vol_id,fol_id,text,entry_no
0,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Juana. Esc.va Domingo veinte y dos d...,1033-1
1,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Paula. Esc.a Juebes veinte y tres de...,1033-2
2,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Maria Esc.a Miercoles prim.o de feb....,1033-3
3,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Bernardo Esc.vo Domingo nueve de Abr...,1033-4
4,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: Fran.co Esc.vo [roto] Abril de mil s...,1034-1


In [None]:
#no_test

# Volume-level metadata for the same transcription file; the output below shows the keys
# type, country, state, city, institution, id and title.
volume_metadata = retrieve_volume_metadata(path_to_transcription)
print(volume_metadata)

{'type': 'baptism', 'country': 'Cuba', 'state': 'Matanzas', 'city': 'Matanzas', 'institution': 'Catedral de San Carlos Borromeo', 'id': '15834', 'title': 'Libro 1 de Bautismos de Pardos y Morenos, 1719 - 1752, Parroquia de San Carlos de Matanzas'}


### Apply model to entry data

In [None]:
#no_test

# Apply the trained model to every entry; ent_preds_df holds one row per predicted entity
# (entry_no, pred_entity, pred_label, pred_start, pred_end) and is the input to all demos below.
# score_model=False: presumably no scoring is performed, so metrics_df / per_ent_metrics are not used here — confirm in test_model.
ent_preds_df, metrics_df, per_ent_metrics = test_model(trained_model, demo_df, "entry_no", "text", score_model=False)
ent_preds_df.head(20)

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
0,1033-1,Juana,PER,10,15
1,1033-1,Esc.va,CHAR,17,23
2,1033-1,Domingo veinte y dos de [roto] y nueve,DATE,24,62
3,1033-1,Thomas de Orvera,PER,66,82
4,1033-1,Juana de nacion,PER,121,136
5,1033-1,Mina,CHAR,137,141
6,1033-1,esclava,CHAR,142,149
7,1033-1,Juan Joseph de Justis,PER,159,180
8,1033-1,P.P.,REL,192,196
9,1033-1,Joseph Salcedo,PER,197,211


### drop_obvious_duplicates

In [None]:
#no_test

# Show the people of each entry before and after repeat mentions of the principal are dropped.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    
    # deep copies: drop_obvious_duplicates modifies the frame it is given in place,
    # so it must not be handed a view of ent_preds_df
    entities = copy.deepcopy(ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no])    
    people = copy.deepcopy(entities.loc[entities['pred_label'] == 'PER'])
    
    display(people)
    no_dup = drop_obvious_duplicates(people, determine_principals(entry_text, entities, 1))
    display(no_dup)
    
    # demo only: stop after the first row whose index label exceeds 5
    if index > 5:
        break

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
0,1033-1,Juana,PER,10,15
3,1033-1,Thomas de Orvera,PER,66,82
4,1033-1,Juana de nacion,PER,121,136
7,1033-1,Juan Joseph de Justis,PER,159,180
9,1033-1,Joseph Salcedo,PER,197,211
10,1033-1,Ana de Santiago,PER,214,229
11,1033-1,Thomas de Orvera,PER,263,279


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,0,1033-1,Juana,PER,10,15
1,3,1033-1,Thomas de Orvera,PER,66,82
2,4,1033-1,Juana de nacion,PER,121,136
3,7,1033-1,Juan Joseph de Justis,PER,159,180
4,9,1033-1,Joseph Salcedo,PER,197,211
5,10,1033-1,Ana de Santiago,PER,214,229
6,11,1033-1,Thomas de Orvera,PER,263,279


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
12,1033-2,Paula,PER,10,15
15,1033-2,Thomas de Orvera,PER,90,106
16,1033-2,Paula,PER,145,150
18,1033-2,Juan Joseph,PER,162,173
19,1033-2,Maria Josepha,PER,177,190
21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235
23,1033-2,Bartholome Rixo,PER,251,266
24,1033-2,Thomas de Orvera,PER,290,306


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,12,1033-2,Paula,PER,10,15
1,15,1033-2,Thomas de Orvera,PER,90,106
2,18,1033-2,Juan Joseph,PER,162,173
3,19,1033-2,Maria Josepha,PER,177,190
4,21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235
5,23,1033-2,Bartholome Rixo,PER,251,266
6,24,1033-2,Thomas de Orvera,PER,290,306


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
25,1033-3,Maria,PER,10,15
28,1033-3,Thomas de Orvera,PER,83,99
29,1033-3,Maria,PER,136,141
31,1033-3,"Juan,",PER,151,156
32,1033-3,Josepha,PER,159,166
34,1033-3,Capitan Antonio Benites,PER,177,200
36,1033-3,Ysabel Mendez,PER,216,229
37,1033-3,Thomas de Orvera,PER,253,269


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,25,1033-3,Maria,PER,10,15
1,28,1033-3,Thomas de Orvera,PER,83,99
2,31,1033-3,"Juan,",PER,151,156
3,32,1033-3,Josepha,PER,159,166
4,34,1033-3,Capitan Antonio Benites,PER,177,200
5,36,1033-3,Ysabel Mendez,PER,216,229
6,37,1033-3,Thomas de Orvera,PER,253,269


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
38,1033-4,Bernardo,PER,10,18
41,1033-4,Thomas de Orvera,PER,84,100
42,1033-4,Bernardo,PER,136,144
47,1033-4,D. Juan Joseph de Justis,PER,186,210
49,1033-4,Andres de Morales,PER,221,238
50,1033-4,Thomas de Orvera,PER,262,278


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,38,1033-4,Bernardo,PER,10,18
1,41,1033-4,Thomas de Orvera,PER,84,100
2,47,1033-4,D. Juan Joseph de Justis,PER,186,210
3,49,1033-4,Andres de Morales,PER,221,238
4,50,1033-4,Thomas de Orvera,PER,262,278


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
51,1034-1,Fran.co,PER,10,17
54,1034-1,Th[roto]mas de [roto]vera bap[roto]izé,PER,80,118
55,1034-1,Fran.co,PER,151,158
60,1034-1,D. Ju[roto] Joseph de Justis,PER,196,224
62,1034-1,Pedro Suares,PER,235,247
63,1034-1,Thomas de Orvera,PER,271,287


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,51,1034-1,Fran.co,PER,10,17
1,54,1034-1,Th[roto]mas de [roto]vera bap[roto]izé,PER,80,118
2,60,1034-1,D. Ju[roto] Joseph de Justis,PER,196,224
3,62,1034-1,Pedro Suares,PER,235,247
4,63,1034-1,Thomas de Orvera,PER,271,287


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
64,1034-2,Ant.o,PER,10,15
67,1034-2,Thomas de Orvera,PER,82,98
68,1034-2,Ant.o,PER,134,139
73,1034-2,D. Juan Joseph de Justis,PER,181,205
75,1034-2,Joseph de Soto,PER,216,230
76,1034-2,Thomas de Orvera,PER,254,270


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,64,1034-2,Ant.o,PER,10,15
1,67,1034-2,Thomas de Orvera,PER,82,98
2,73,1034-2,D. Juan Joseph de Justis,PER,181,205
3,75,1034-2,Joseph de Soto,PER,216,230
4,76,1034-2,Thomas de Orvera,PER,254,270


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
77,1034-3,Antonia,PER,10,17
80,1034-3,Thomas de Orvera,PER,85,101
81,1034-3,Antonia,PER,137,144
86,1034-3,D. Ju.o Joseph de Justis,PER,183,207
88,1034-3,Joseph Salcedo,PER,218,232
89,1034-3,Thomas de Orvera,PER,256,272


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end
0,77,1034-3,Antonia,PER,10,17
1,80,1034-3,Thomas de Orvera,PER,85,101
2,86,1034-3,D. Ju.o Joseph de Justis,PER,183,207
3,88,1034-3,Joseph Salcedo,PER,218,232
4,89,1034-3,Thomas de Orvera,PER,256,272


### assign_unique_ids

In [None]:
#no_test

# Same pipeline as the previous demo, followed by id assignment: the displayed frames gain a unique_id column.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    
    # deep copies: both helpers below modify the people frame in place
    entities = copy.deepcopy(ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no])    
    people = copy.deepcopy(entities.loc[entities['pred_label'] == 'PER'])    
    
    no_dup = drop_obvious_duplicates(people, determine_principals(entry_text, entities, 1))
    display(assign_unique_ids(no_dup, volume_metadata))
    
    # demo only: stop after the first row whose index label exceeds 5
    if index > 5:
        break

Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,0,1033-1,Juana,PER,10,15,15834-1033-1-P1
1,3,1033-1,Thomas de Orvera,PER,66,82,15834-1033-1-P2
2,4,1033-1,Juana de nacion,PER,121,136,15834-1033-1-P3
3,7,1033-1,Juan Joseph de Justis,PER,159,180,15834-1033-1-P4
4,9,1033-1,Joseph Salcedo,PER,197,211,15834-1033-1-P5
5,10,1033-1,Ana de Santiago,PER,214,229,15834-1033-1-P6
6,11,1033-1,Thomas de Orvera,PER,263,279,15834-1033-1-P7


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,12,1033-2,Paula,PER,10,15,15834-1033-2-P1
1,15,1033-2,Thomas de Orvera,PER,90,106,15834-1033-2-P2
2,18,1033-2,Juan Joseph,PER,162,173,15834-1033-2-P3
3,19,1033-2,Maria Josepha,PER,177,190,15834-1033-2-P4
4,21,1033-2,Capitan D. Luis Hurtado de Mendoza,PER,201,235,15834-1033-2-P5
5,23,1033-2,Bartholome Rixo,PER,251,266,15834-1033-2-P6
6,24,1033-2,Thomas de Orvera,PER,290,306,15834-1033-2-P7


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,25,1033-3,Maria,PER,10,15,15834-1033-3-P1
1,28,1033-3,Thomas de Orvera,PER,83,99,15834-1033-3-P2
2,31,1033-3,"Juan,",PER,151,156,15834-1033-3-P3
3,32,1033-3,Josepha,PER,159,166,15834-1033-3-P4
4,34,1033-3,Capitan Antonio Benites,PER,177,200,15834-1033-3-P5
5,36,1033-3,Ysabel Mendez,PER,216,229,15834-1033-3-P6
6,37,1033-3,Thomas de Orvera,PER,253,269,15834-1033-3-P7


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,38,1033-4,Bernardo,PER,10,18,15834-1033-4-P1
1,41,1033-4,Thomas de Orvera,PER,84,100,15834-1033-4-P2
2,47,1033-4,D. Juan Joseph de Justis,PER,186,210,15834-1033-4-P3
3,49,1033-4,Andres de Morales,PER,221,238,15834-1033-4-P4
4,50,1033-4,Thomas de Orvera,PER,262,278,15834-1033-4-P5


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,51,1034-1,Fran.co,PER,10,17,15834-1034-1-P1
1,54,1034-1,Th[roto]mas de [roto]vera bap[roto]izé,PER,80,118,15834-1034-1-P2
2,60,1034-1,D. Ju[roto] Joseph de Justis,PER,196,224,15834-1034-1-P3
3,62,1034-1,Pedro Suares,PER,235,247,15834-1034-1-P4
4,63,1034-1,Thomas de Orvera,PER,271,287,15834-1034-1-P5


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,64,1034-2,Ant.o,PER,10,15,15834-1034-2-P1
1,67,1034-2,Thomas de Orvera,PER,82,98,15834-1034-2-P2
2,73,1034-2,D. Juan Joseph de Justis,PER,181,205,15834-1034-2-P3
3,75,1034-2,Joseph de Soto,PER,216,230,15834-1034-2-P4
4,76,1034-2,Thomas de Orvera,PER,254,270,15834-1034-2-P5


Unnamed: 0,index,entry_no,pred_entity,pred_label,pred_start,pred_end,unique_id
0,77,1034-3,Antonia,PER,10,17,15834-1034-3-P1
1,80,1034-3,Thomas de Orvera,PER,85,101,15834-1034-3-P2
2,86,1034-3,D. Ju.o Joseph de Justis,PER,183,207,15834-1034-3-P3
3,88,1034-3,Joseph Salcedo,PER,218,232,15834-1034-3-P4
4,89,1034-3,Thomas de Orvera,PER,256,272,15834-1034-3-P5


### id_unique_individuals

In [None]:
#no_test

# Print, per entry, the two-row array of distinct names and their ids.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    # no copy here: entities is a filtered selection of ent_preds_df
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]    
    print(id_unique_individuals(entry_text, entities, volume_metadata))    
    # demo only: stop after the first row whose index label exceeds 15
    if index > 15:
        break

[['Juana' 'Thomas de Orvera' 'Juana de nacion' 'Juan Joseph de Justis'
  'Joseph Salcedo' 'Ana de Santiago']
 ['15834-1033-1-P1' '15834-1033-1-P2' '15834-1033-1-P3' '15834-1033-1-P4'
  '15834-1033-1-P5' '15834-1033-1-P6']]
[['Paula' 'Thomas de Orvera' 'Juan Joseph' 'Maria Josepha'
  'Capitan D. Luis Hurtado de Mendoza' 'Bartholome Rixo']
 ['15834-1033-2-P1' '15834-1033-2-P2' '15834-1033-2-P3' '15834-1033-2-P4'
  '15834-1033-2-P5' '15834-1033-2-P6']]
[['Maria' 'Thomas de Orvera' 'Juan,' 'Josepha' 'Capitan Antonio Benites'
  'Ysabel Mendez']
 ['15834-1033-3-P1' '15834-1033-3-P2' '15834-1033-3-P3' '15834-1033-3-P4'
  '15834-1033-3-P5' '15834-1033-3-P6']]
[['Bernardo' 'Thomas de Orvera' 'D. Juan Joseph de Justis'
  'Andres de Morales']
 ['15834-1033-4-P1' '15834-1033-4-P2' '15834-1033-4-P3' '15834-1033-4-P4']]
[['Fran.co' 'Th[roto]mas de [roto]vera bap[roto]izé'
  'D. Ju[roto] Joseph de Justis' 'Pedro Suares' 'Thomas de Orvera']
 ['15834-1034-1-P1' '15834-1034-1-P2' '15834-1034-1-P3' '1583

### find_sus

In [None]:
#no_test

# Start from an empty frame with the expected columns; find_sus returns the (possibly extended) frame each time.
sus_df = pd.DataFrame(columns=['vol_titl', 'vol_id', 'fol_id', 'text','entry_no','suspect'])

#For reference: suspect codes:
#0.01 means there are duplicates but there aren't first names that appear alone
#10.00 means there are first names that are subsets of full names, but no duplicates
#11 means there are both first names that are subsets and duplicates

# NOTE(review): keep the names demo_df and entry_no exactly as they are — functions defined above
# may look them up in the notebook namespace instead of receiving them as arguments.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    sus_df = find_sus(entry_text, entities, sus_df, index)
    # demo only: stop after the first row whose index label exceeds 150
    if index > 150: #Has 481 rows
        break

# the f-string has no placeholders; the length is passed to print as a separate argument
print(f"Here is the df of sus entries, with a length of ", len(sus_df.index))
display(sus_df.head(20))

Here is the df of sus entries, with a length of  129


Unnamed: 0,vol_titl,vol_id,fol_id,text,entry_no,suspect
0,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Juana. Esc.va Domingo veinte y dos d...,1033-1,11.0
1,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Paula. Esc.a Juebes veinte y tres de...,1033-2,0.01
2,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Maria Esc.a Miercoles prim.o de feb....,1033-3,0.01
3,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Bernardo Esc.vo Domingo nueve de Abr...,1033-4,0.01
4,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: Fran.co Esc.vo [roto] Abril de mil s...,1034-1,0.01
5,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: Ant.o Esc.vo Domingo nueve de Abril ...,1034-2,0.01
6,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: Antonia Esc.va Domingo nueve de Abri...,1034-3,0.01
7,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: M.a Luisa esc.va Domingo nueve de Ab...,1034-4,0.01
8,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1035,[margin]: Theresa esc.va Domingo nueve de Abri...,1035-2,0.01
9,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1035,[margin]: Antonio. esc.vo Martes onze de Abril...,1035-3,0.01


### disambiguate

In [None]:
#no_test

# Walk the suspect entries and trigger disambiguation for those that need it.
# 'suspect' status codes, per the legend above the find_sus demo loop:
#   0.01  -> duplicates only, no first names appearing alone
#   10.00 -> first names that are subsets of full names, no duplicates
#   11    -> both of the above
for index, row in sus_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    status = row['suspect']
    # entity predictions that belong to this entry only
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    if status == 0.01:
        pass #There are only duplicates, so might want to deal with this separately
    elif status > 9: #ie sus is true, disambiguation needed
        # NOTE(review): called with no arguments -- presumably it picks up entry_text /
        # entities from the notebook's global namespace; confirm against its definition.
        disambiguate()
    # Demo limit; checked after the entry has been handled, so the first row whose
    # index label exceeds 10 is still processed.
    if index > 10:
        break

DF of first names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
0,1033-1,Juana,PER,10,15,0


DF of full names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
1,1033-1,Thomas de Orvera,PER,66,82,1
2,1033-1,Juana de nacion,PER,121,136,1
3,1033-1,Juan Joseph de Justis,PER,159,180,1
4,1033-1,Joseph Salcedo,PER,197,211,1
5,1033-1,Ana de Santiago,PER,214,229,1


DF of compound names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


---------------------


## assign_characteristics

In [None]:
#no_test

# Demo: print the structured list of people built for each of the first entries.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']
    # keep only the entity predictions that belong to this entry
    entities = ent_preds_df[ent_preds_df['entry_no'] == entry_no]
    demo_individuals = id_unique_individuals(entry_text, entities, volume_metadata)
    print(assign_characteristics(entry_text, entities, demo_individuals))
    # limit is tested after printing, so the first row labelled above 10 is still shown
    if index > 10:
        break

[{'id': '15834-1033-1-P1', 'name': 'Juana'}, {'id': '15834-1033-1-P2', 'name': 'Thomas de Orvera'}, {'id': '15834-1033-1-P3', 'name': 'Juana de nacion'}, {'id': '15834-1033-1-P4', 'name': 'Juan Joseph de Justis'}, {'id': '15834-1033-1-P5', 'name': 'Joseph Salcedo'}, {'id': '15834-1033-1-P6', 'name': 'Ana de Santiago'}]
[{'id': '15834-1033-2-P1', 'name': 'Paula'}, {'id': '15834-1033-2-P2', 'name': 'Thomas de Orvera'}, {'id': '15834-1033-2-P3', 'name': 'Juan Joseph'}, {'id': '15834-1033-2-P4', 'name': 'Maria Josepha'}, {'id': '15834-1033-2-P5', 'name': 'Capitan D. Luis Hurtado de Mendoza'}, {'id': '15834-1033-2-P6', 'name': 'Bartholome Rixo'}]
[{'id': '15834-1033-3-P1', 'name': 'Maria'}, {'id': '15834-1033-3-P2', 'name': 'Thomas de Orvera'}, {'id': '15834-1033-3-P3', 'name': 'Juan,'}, {'id': '15834-1033-3-P4', 'name': 'Josepha'}, {'id': '15834-1033-3-P5', 'name': 'Capitan Antonio Benites'}, {'id': '15834-1033-3-P6', 'name': 'Ysabel Mendez'}]
[{'id': '15834-1033-4-P1', 'name': 'Bernardo'}

### determine_principals

In [None]:
#no_test

# Demo: print what determine_principals returns for each of the first entries.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']
    # keep only the entity predictions that belong to this entry
    entities = ent_preds_df[ent_preds_df['entry_no'] == entry_no]
    print(determine_principals(entry_text, entities, 1))
    # limit is tested after printing, so the first row labelled above 25 is still shown
    if index > 25:
        break

['Juana']
['Paula']
['Maria']
['Bernardo']
['Fran.co']
['Ant.o']
['Antonia']
['M.a Luisa']
['Ana']
['Ana']
['Theresa']
['Antonio']
['Franc.co de Paula']
['Juan']
['Vicente']
['Joseph']
['Ysabel']
['Vicente']
['Joseph']
['Maria']
['Antonia']
['Juan']
['Alexandro']
['Elena Maria']
['Juan Joseph']
['Juan']
['Geronima']


### determine_event_date

In [None]:
#no_test

# Demo: print the event date found for each of the first entries.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']
    # keep only the entity predictions that belong to this entry
    entities = ent_preds_df[ent_preds_df['entry_no'] == entry_no]
    event_type = volume_metadata["type"]
    print(determine_event_date(entry_text, entities, event_type, volume_metadata))
    # limit is tested after printing, so the first row labelled above 25 is still shown
    if index > 25:
        break

Domingo veinte y dos de [roto] y nueve
Juebes veinte y tres de feb.o de mil sietec.tos. y diez y nueve
Miercoles prim.o de feb.o de mil siete.tos y diez y nueve
Domingo nueve de Abril de mil sietectos y diez y nueve
[roto] Abril de mil sietec.tos y diez, y nueve [
Domingo nueve de Abril de mil sietec.tos y diez y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y
None
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Martes onze de Abril de mil sietec.tos y diez y nueve
Domingo quatro de Junio de mil sietecientos i dies i nuebe yo
Sabado veinte y quatro de Junio de mil sietectos, y diez, y nueve
Domingo dos de Julio de mil sietec.tos y diez y nueve yo
Domingo dos de Julio de mil sietec.tos y diez, y nueve yo
Domingo treinta de Julio de mil sietec.tos y diez y nueve
Domingo dos de Julio de [. . .] fr
Domingo dos de Julio de mil sietec.tos y diez y nueve yo el [r

The function currently fails to find a date for a substantial proportion (~20%) of entries because dates aren't being accurately extracted from the original text. If this problem persists as the model improves and more entry data is incorporated into the sample, we'll need to add post-processing that brackets a missing event date using the dates of the entries on either side of it. Regardless, we will also need to add post-processing that converts these textual dates into numerical ones.

### determine_event_location

In [None]:
#no_test

# Demo: print the event location resolved for each of the first entries.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']
    # keep only the entity predictions that belong to this entry
    entities = ent_preds_df[ent_preds_df['entry_no'] == entry_no]
    event_type = volume_metadata["type"]
    print(determine_event_location(entry_text, entities, event_type, volume_metadata))
    # limit is tested after printing, so the first row labelled above 25 is still shown
    if index > 25:
        break

Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo


### identify_cleric

In [None]:
#no_test

# Demo: print the cleric identified for each of the first entries.
for index, row in demo_df.iterrows():
    entry_no, entry_text = row['entry_no'], row['text']
    # keep only the entity predictions that belong to this entry
    entities = ent_preds_df[ent_preds_df['entry_no'] == entry_no]
    print(identify_cleric(entry_text, entities))
    # limit is tested after printing, so the first row labelled above 25 is still shown
    if index > 25:
        break

Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
None
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Joseph Lopez de Cuella
Joseph Hern.z
Joseph Hern.z
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Thomas de Orvera
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Joseph Lopez de Cuella


## build_event

In [None]:
#no_test

# Demo: build the event record for each of the first entries and print its fields.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    # entity predictions that belong to this entry only
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    # NOTE(review): not used anywhere in this cell; kept because it is a notebook
    # global that other cells may read -- confirm and remove if unused.
    cpd_identifier = volume_metadata["id"] + '-' + row["entry_no"]
    baptism_event_metadata = build_event(entry_text, entities, volume_metadata["type"], determine_principals(entry_text, entities, 1), volume_metadata, 1)
    event_id = baptism_event_metadata["id"]

    print("Event has ID " + event_id)
    print(event_id + " has type " + baptism_event_metadata["type"])
    # Report every optional field the same way. None is tested with `is` (identity),
    # and location is guarded as well so that a missing location is reported instead
    # of raising TypeError on str + None.
    for field in ("principal", "date", "location", "cleric"):
        value = baptism_event_metadata[field]
        if value is None:
            print("Could not identify " + field + " for " + event_id)
        else:
            print(event_id + " has " + field + " " + value)

    # Demo limit; checked after the entry has been reported, so the first row whose
    # index label exceeds 5 is still shown.
    if index > 5:
        break

Event has ID 15834-1033-1-1
15834-1033-1-1 has type baptism
15834-1033-1-1 has principal Juana
15834-1033-1-1 has date Domingo veinte y dos de [roto] y nueve
15834-1033-1-1 has location Catedral de San Carlos Borromeo
15834-1033-1-1 has cleric Thomas de Orvera
Event has ID 15834-1033-2-1
15834-1033-2-1 has type baptism
15834-1033-2-1 has principal Paula
15834-1033-2-1 has date Juebes veinte y tres de feb.o de mil sietec.tos. y diez y nueve
15834-1033-2-1 has location Catedral de San Carlos Borromeo
15834-1033-2-1 has cleric Thomas de Orvera
Event has ID 15834-1033-3-1
15834-1033-3-1 has type baptism
15834-1033-3-1 has principal Maria
15834-1033-3-1 has date Miercoles prim.o de feb.o de mil siete.tos y diez y nueve
15834-1033-3-1 has location Catedral de San Carlos Borromeo
15834-1033-3-1 has cleric Thomas de Orvera
Event has ID 15834-1033-4-1
15834-1033-4-1 has type baptism
15834-1033-4-1 has principal Bernardo
15834-1033-4-1 has date Domingo nueve de Abril de mil sietectos y diez y nu

In [None]:
#no_test

# nbdev export step: convert the project's notebooks to library modules.
# Prints one "Converted <notebook>" line per notebook processed.
from nbdev.export import notebook2script
notebook2script()

Converted 12-ssda-xml-parser.ipynb.
Converted 31-collate-xml-entities-spans.ipynb.
Converted 33-split-data.ipynb.
Converted 41-generic-framework-for-spacy-training.ipynb.
Converted 42-initial-model.ipynb.
Converted 51-data-preprocessing.ipynb.
Converted 52-unstructured-to-markup.ipynb.
Converted 53-markup-to-spatial-historian.ipynb.
Converted 61-prodigy-output-training-demo.ipynb.
Converted 62-full-model-application-demo.ipynb.
Converted 63-pt-model-training.ipynb.
Converted 64-es-model-training.ipynb.
Converted 71-relationship-builder.ipynb.
