In [None]:
#default_exp relationships

In [None]:
#no_test
#dependencies

#nlp packages
import spacy
from spacy.util import minibatch, compounding

#manipulation of tables/arrays
import pandas as pd
import numpy as np

#internal imports
from ssda_nlp.collate import *
from ssda_nlp.split_data import *
from ssda_nlp.modeling import *
from ssda_nlp.model_performance_utils import *
from ssda_nlp.xml_parser import *
from ssda_nlp.unstructured2markup import *

In [None]:
#export

def id_unique_individuals(entry_text, entities, record_type):
    '''
    identifies all unique individuals that appear in an entry (i.e. removing all multiple mentions of the same person)
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        (not read by the current implementation)
        entities: entities of all kinds extracted from that entry by an NER model; must provide the
        'pred_label' and 'pred_entity' columns
        record_type: simple flag indicating whether records are baptisms, marriages, burials, etc. (this can also be determined
        programmatically and may be deprecated later; not read by the current implementation)
        
        returns: an array of the distinct 'pred_entity' values labelled 'PER', in order of first appearance
    '''
    
    #select the name column of the PER rows directly: no index reset is needed to compute unique values, which avoids
    #mutating a filtered slice in place and avoids failing when the incoming index carries a name
    is_person = entities['pred_label'] == 'PER'
    unique_individuals = entities.loc[is_person, 'pred_entity'].unique()
    
    return unique_individuals

### find_sus is broken
1. Every time it is run, the `sus_df` it returns gets longer for some reason.
2. On re-testing, it finds only 6 entries with duplicates, whereas before it was finding 50+. Its validity needs to be checked. The count also depends on the index at which the loop is stopped.

In [None]:
#export

def find_sus(entry_text, entities, sus_df, i, source_df=None):
    '''
    identifies corner cases: all entries where there are multiple entities that 1) have the same first name appearing 
        multiple times, 2) have compound names and then a segment of that name appearing, and 3) have a full name with 
        the first name by itself appearing
    The current check only flags entries in which the exact same PER entity string occurs more than once.
    Note that this should not be used in tandem with id_unique_individuals, as that function just drops the duplicate names
    
    params:
        entry_text: actual text for comparison (not read by the current implementation)
        entities: df of entities identified; must provide 'pred_label' and 'pred_entity' columns
        sus_df: either the empty df body or the df from previous loop iterations
        i: position of the current row in the entry table (used with .iloc)
        source_df: entry table the flagged row is copied from; its first four columns are read positionally as
            vol_titl, vol_id, fol_id, text and it must have an 'entry_no' column. When omitted, the function falls
            back to a module/notebook-level variable named demo_df (and raises NameError if none exists)
    
    returns: df of all the entries that may be corner cases, in the same form as demo_df; sus_df itself is returned
        unchanged when the entry has no duplicated person names
    '''
    
    if source_df is None:
        #backward-compatible fallback for callers that rely on the notebook-level entry table
        source_df = demo_df
    
    person_names = entities.loc[entities['pred_label'] == 'PER', 'pred_entity']
    
    if not person_names.duplicated().any(): #ie no duplicate names, nothing to flag
        return sus_df
    
    #entry_no is taken from the same row as the other fields rather than from a loop variable leaked by the caller
    flagged = {'vol_titl':source_df.iloc[i,0], 'vol_id':source_df.iloc[i,1], 'fol_id':source_df.iloc[i,2],
               'text':source_df.iloc[i,3], 'entry_no':source_df['entry_no'].iloc[i]}
    flagged_row = pd.DataFrame([flagged])
    
    if len(sus_df.index)<1:
        return flagged_row
    
    #DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    return pd.concat([sus_df, flagged_row], ignore_index=True)

### split_name_col is not completed yet

In [None]:
#export

def split_name_col(people_df):
    '''
    classifies each name in people_df as a single name, a full name or a compound name, then splits the df accordingly
        people_df: df of person entities; must provide a 'pred_entity' column holding the name strings.
        A 'name_status' column is added to (or overwritten on) this df as a side effect
        
        name_status codes: 0 = single name (no space, no hyphen), 1 = full name (contains a space),
        2 = compound name (contains a hyphen; checked first, so it wins over a space)
        
        returns: tuple (first_n, full_n, cmpd_n) of the rows with status 0, 1 and 2 respectively
    '''
    #display is only a builtin inside IPython/Jupyter; fall back to print so the exported module also works
    try:
        from IPython.display import display as show
    except ImportError:
        show = print
    
    def name_status(name):
        if "-" in name:
            return 2 #2 therefore represents compound name
        if " " in name:
            return 1 #1 therefore represents a full name
        return 0 #0 therefore represents a single name
    
    #columns are addressed by name: positional access breaks or corrupts data on any frame that
    #does not have exactly the five-column entity layout
    people_df['name_status'] = [name_status(name) for name in people_df['pred_entity']]
    
    first_n = people_df[people_df.name_status == 0]
    full_n = people_df[people_df.name_status == 1]
    cmpd_n = people_df[people_df.name_status == 2]
    
    print("DF of first names")
    show(first_n.head())
    print("DF of ful names")
    show(full_n.head())
    print("DF of compound names")
    show(cmpd_n.head())
    
    return first_n, full_n, cmpd_n

### disambiguate is not complete yet

In [None]:
#export

def disambiguate(sus_df):
    '''
    restricts the current entry's entities to people (PER) and hands them to split_name_col, which classifies
        each name as single, full or compound
        sus_df: df of problem cases previously identified (not read by the current implementation)
        
        returns: None; the result of split_name_col is not captured
    NOTE: the entities are read from a variable named `entities` in the enclosing module/notebook scope,
        not from a parameter, so that variable must be set before calling
    '''
    
    is_person = entities['pred_label'] == 'PER'
    persons_only = entities.loc[is_person].reset_index().drop('index', axis=1)

    split_name_col(persons_only)

In [None]:
#export

def determine_principals(entry_text, entities, n_principals):
    '''
    determines the principal of a single-principal event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        n_principals: expected number of principals (only 1 is handled; 2 and anything else print a message)
        
        returns: the principal(s) of the event in question, or None if no principal can be identified
        
    Heuristics for a single principal, tried in order until one yields a name:
        1. a PER entity starting within the first 20 characters of the entry
        2. a PER entity starting within 10 characters of the first occurrence of "oleos"
        3. a PER entity starting within 10 characters of the first occurrence of "nombre"
    Within each heuristic the last matching entity wins.
    '''
    
    lowered_text = entry_text.lower()
    
    if n_principals == 1:
        person_rows = [row for _, row in entities.iterrows() if row['pred_label'] == 'PER']
        
        found = None
        for person in person_rows:
            if person['pred_start'] <= 20:
                found = person['pred_entity']
        
        #keyword fallbacks, only consulted while no principal has been found
        for keyword in ('oleos', 'nombre'):
            if found is not None:
                break
            anchor = lowered_text.find(keyword)
            if anchor == -1:
                continue
            for person in person_rows:
                if abs(person['pred_start'] - anchor) <= 10:
                    found = person['pred_entity']
        
        return found
    
    if n_principals == 2:
        #process marriage principals
        print("That number of principals is not supported yet.")
        return None
    
    print("Invalid number of principals.")
    return None

In [None]:
#export

def determine_event_date(entry_text, entities, event_type, volume_metadata):
    '''
    determines the date of a specific event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: this could be either a valid record_type OR a secondary event like a birth
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        (only its "type" key is read here)
        
        returns: the date of the event in question, or None if no date can be identified. For a primary event
        whose record type is not "baptism", the string "That event type is not supported yet." is returned instead
    '''
    date = None
    primary_type = volume_metadata["type"]
    
    if event_type != primary_type:
        #secondary event: first resolve the date of the volume's primary event, then take a DATE entity that
        #differs from it (last one wins). The recursive call must ask for the primary type; asking again for
        #event_type would re-enter this branch forever
        primary_event_date = determine_event_date(entry_text, entities, primary_type, volume_metadata)
        for index, entity in entities.iterrows():
            if entity['pred_label'] == 'DATE' and entity['pred_entity'] != primary_event_date:
                date = entity['pred_entity']
    
    elif primary_type == "baptism":
        #the baptism date is taken to be a DATE entity starting in the first third of the entry (last one wins)
        first_third = len(entry_text) / 3
        
        for index, entity in entities.iterrows():
            if entity['pred_label'] == 'DATE' and entity['pred_start'] <= first_third:
                date = entity['pred_entity']
                
    else:
        date = "That event type is not supported yet."
        
    return date

In [None]:
#export

def determine_event_location(entry_text, entities, event_type, volume_metadata):
    '''
    determines the location of a specific event
        entry_text: the full text of a single entry (not read by the current implementation)
        entities: entities of all kinds extracted from that entry by an NER model (not read by the current implementation)
        event_type: this could be either a valid record_type OR a secondary event like a birth
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        (the "type" and "institution" keys are read)
        
        returns: the volume's institution when event_type matches the volume's record type; otherwise the
        string "That event type is not supported yet."
    '''
    #only the volume's primary event can be located so far: it is assumed to take place at the holding institution
    if event_type != volume_metadata["type"]:
        return "That event type is not supported yet."
    
    return volume_metadata["institution"]

In [None]:
#export

def identify_cleric(entry_text, entities):
    '''
    identifies the cleric(s) associated with a sacramental entry
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model        
        
        returns: the associated cleric(s), or None if no cleric can be identified
        
    Three heuristics are tried in order; later ones only run while no cleric has been found, and within each
    heuristic the last matching PER entity wins:
        1. a PER entity ending within 10 characters of the end of an entry longer than 100 characters, or
           within 2 characters of the end of an entry of any length
        2. a PER entity that directly follows a DATE entity and starts at most 15 characters after it ends
        3. a PER entity followed by the word "cura" beginning at most 15 characters after the entity ends
    '''
    text_length = len(entry_text)
    cleric = None
    
    #heuristic 1: signature-like name at the very end of the entry
    for _, ent in entities.iterrows():
        if ent['pred_label'] != 'PER':
            continue
        chars_after = text_length - ent['pred_end']
        closes_long_entry = chars_after <= 10 and text_length > 100
        #going to keep this condition for now, but it can create false positives when long, incorrect entities are extracted
        #from short and/or garbled entries
        closes_any_entry = ent['pred_entity'] is not None and chars_after <= 2
        if closes_long_entry or closes_any_entry:
            cleric = ent['pred_entity']
    
    #heuristic 2: name immediately after the date
    if cleric is None:
        prev_label, prev_end = None, None
        for _, ent in entities.iterrows():
            if ent['pred_label'] == 'PER' and prev_label == 'DATE' and (ent['pred_start'] - prev_end) <= 15:
                cleric = ent['pred_entity']
            prev_label, prev_end = ent['pred_label'], ent['pred_end']
    
    #heuristic 3: name followed closely by the title "cura"
    if cleric is None:
        lowered_text = entry_text.lower()
        for _, ent in entities.iterrows():
            if ent['pred_label'] != 'PER':
                continue
            cura_at = lowered_text.find("cura", ent['pred_start'] + len(ent['pred_entity']))
            if cura_at != -1 and (cura_at - ent['pred_end']) <= 15:
                cleric = ent['pred_entity']
    
    return cleric

In [None]:
#export

def build_event(entry_text, entities, event_type, principals, volume_metadata=None):
    '''
    builds out relationships related to a baptism or burial event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: this could be either a valid record_type OR a secondary event like a birth
        principals: the principal(s) of the event, as returned by determine_principals
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata.
        determine_event_date and determine_event_location both require it; when it is omitted the date and
        location are left as None
        
        returns: dict with the keys 'event_type', 'principals', 'date', 'location' and 'cleric'
        (provisional format; the final structure has not been specified yet)
    '''   
    
    date = None
    location = None
    if volume_metadata is not None:
        date = determine_event_date(entry_text, entities, event_type, volume_metadata)
        location = determine_event_location(entry_text, entities, event_type, volume_metadata)
    cleric = identify_cleric(entry_text, entities)
    
    event_relationships = {
        'event_type': event_type,
        'principals': principals,
        'date': date,
        'location': location,
        'cleric': cleric,
    }
    
    return event_relationships

In [None]:
#export

def build_relationships(entry_text, entities, path_to_volume_xml):
    '''
    Master function that will combine all helper functions built above
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        path_to_volume_xml: path to the volume's XML transcription, from which the volume metadata
        (including the record type) is retrieved
            
        returns: for baptism volumes, a dict with the keys 'record_type' and 'principals' (provisional format,
        to be extended as the remaining helpers are wired in); None, after printing a message, for any other record type
    '''
    #unique_individuals = id_unique_indviduals(entry_text, entities, record_type)
    
    #retrieve_volume_metadata is the metadata builder used elsewhere in this notebook
    volume_metadata = retrieve_volume_metadata(path_to_volume_xml)
    record_type = volume_metadata["type"]
    
    if record_type != "baptism":
        #marriage and burial records (and any other type) are not processed yet
        print("That record type is not supported yet.")
        return None
    
    principals = determine_principals(entry_text, entities, 1)
    #event_relationships = build_event(entry_text, entities, "baptism", principals)               
    #interpersonal_relationships = process_interpersonal(entry_text, entities)
    #characteristics = process_characteristics(entry_text, entities, interpersonal_relationships)        
    
    #code that turns pieces defined above into well-formed relationships
    relationships = {
        'record_type': record_type,
        'principals': principals,
    }
    
    return relationships

## Unit testing

### Load trained model and entry data

In [None]:
#no_test

# Load the previously trained NER model from the local models directory.
# NOTE(review): verbose is passed as the string 'True' rather than the boolean True - confirm what load_model expects.
trained_model = load_model('models/st_aug_bapt_2', language="es", verbose='True')

Loaded model 'models/st_aug_bapt_2'


In [None]:
#no_test

# Parse one volume's XML transcription into a table of entries (one row per entry).
# NOTE(review): the path uses a Windows-style backslash separator and will presumably not resolve on other platforms - confirm.
path_to_transcription = "transcriptions\\239746.xml"
demo_df = parse_xml_v2(path_to_transcription)
demo_df.head()

Unnamed: 0,vol_titl,vol_id,fol_id,text,entry_no
0,Baptisms - 1793-1807,239746,1013,"1. María Dolores Sanchez Lunes, día veinte y ...",1013-1
1,Baptisms - 1793-1807,239746,1013,"2. Antonio Guillo Miercoles, día veinte de No...",1013-2
2,Baptisms - 1793-1807,239746,1014,"3. María Juana Francisca Fish Domingo, día ve...",1014-1
3,Baptisms - 1793-1807,239746,1014,"4. Maria Teresa Camel Domingo, día veinte y q...",1014-2
4,Baptisms - 1793-1807,239746,1015,"Maria Josefa Andrea de la Puente Miércoles, d...",1015-1


In [None]:
#no_test

# Volume-level metadata for the same transcription; its "type" and "institution" keys are read by the helpers above.
volume_metadata = retrieve_volume_metadata(path_to_transcription)
print(volume_metadata)

{'type': 'baptism', 'country': 'United States', 'state': 'Florida', 'city': 'St. Augustine', 'institution': 'Archives of the Diocese of St. Augustine', 'id': 'St. Augustine', 'title': 'Baptisms - 1793-1807'}


### Apply model to entry data

In [None]:
#no_test

# Run the trained model over every entry; scoring is switched off, so only ent_preds_df is used in the cells below.
ent_preds_df, metrics_df, per_ent_metrics = test_model(trained_model, demo_df, "entry_no", "text", score_model=False)
ent_preds_df.head(20)

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
0,1013-1,María Dolores Sanchez,PER,3,24
1,1013-1,Lunes,CHAR,26,31
2,1013-1,Don Miguel o’Reilly,PER,99,118
3,1013-1,Teniente de Cura Beneficiado,CHAR,119,147
4,1013-1,Vicario,CHAR,149,156
5,1013-1,Juez Eclesiástico Auxiliar,CHAR,160,186
6,1013-1,Igle sia parroquial,LOC,195,214
7,1013-1,provincia de San Agustín,LOC,218,242
8,1013-1,Florida Oriental,LOC,249,265
9,1013-1,niña,CHAR,305,309


### id_unique_individuals

In [None]:
#no_test

# Print the unique individuals of each entry, stopping once the row index exceeds 15.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    # entities predicted for this entry only
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(id_unique_individuals(entry_text, entities, volume_metadata["type"]))
    if index > 15:
        break

['María Dolores Sanchez' 'Don Miguel o’Reilly' 'Juan' 'Deliby Naturales'
 'Don Francisco Sanchez' 'Maria Dolores' 'José Rivas'
 'María de la Luz Blanco']
['Antonio Guillo' 'Don Miguel o’Reilly' 'Isaac Guillo' 'Sara Ca'
 'Juan Antonio' 'Antonio Pellice' 'Susana Pellicer']
['María Juana Francisca Fish' 'Don Miguel o’Reilly' 'Don Jeremías Fish'
 'Eva Fish' 'Maria Juana Francisca' 'Lorenzo Capó'
 'Juana Joaquina Gonzalez']
['Maria Teresa Camel' 'Don Miguel o’Reilly' 'Andres Camel' 'Eva Fish'
 'Maria Tere sa' 'Francisco Huiten' 'Andrea Ortigas']
['Maria Josefa Andrea de la Puente' 'Don Miguel o’Reilly'
 'Maria Prudencia Zayas' 'Don Armando de la Puente' 'Maria Josefa Andrea'
 'Domingo' 'Doña Corolinda Josefa Leonardy']
['Juana Teresa de Jesus Gernon' 'Don Miguel o’Reilly'
 'Don Guil lermo Gernon' 'Juana Teresa de Jesus' 'Doña Teresa Rodríguez']
['Maria Josepha de los' 'Dolores Guadara ma' 'Don Miguel o’Reilly'
 'Don Mateo Guadarama' 'Maria Josefa de los'
 'Doña Catalina Gerónima Porras']
['

### find_sus

In [None]:
# Start from an empty table with the same columns as demo_df; find_sus returns the table with flagged entries added.
sus_df = pd.DataFrame(columns=['vol_titl', 'vol_id', 'fol_id', 'text','entry_no'])
display(sus_df.head())

# entry_no, entry_text and entities stay defined at notebook scope after the loop.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    sus_df = find_sus(entry_text, entities, sus_df, index)
    if index > 50: #Has 481 rows
        break

# number of flagged entries among the rows scanned above
print(len(sus_df.index))
sus_df.head(20)

Unnamed: 0,vol_titl,vol_id,fol_id,text,entry_no


6


Unnamed: 0,vol_titl,vol_id,fol_id,text,entry_no
0,Baptisms - 1793-1807,239746,1019,"Antonio Joseph Dimas Pater Adulto Domingo, dí...",1019-1
1,Baptisms - 1793-1807,239746,1019,"Maria Dolores Judith Smith Adulta Domingo, dí...",1019-2
2,Baptisms - 1793-1807,239746,1022,"19. Juan Baptista Isnardy Martes, día veinte ...",1022-1
3,Baptisms - 1793-1807,239746,1022,"Roberto Macqueen Miércoles Martes, día veinte...",1022-2
4,Baptisms - 1793-1807,239746,1024,"Benjamin Macqueen Miércoles, día veinte y uno...",1024-2
5,Baptisms - 1793-1807,239746,1030,"Juan Macqueen Miércoles, día veinte y uno de ...",1030-2


### split_name_col

### disambiguate

In [None]:
# Run disambiguate once per flagged entry. `entities` is assigned at notebook scope on each iteration;
# disambiguate takes only sus_df as an argument and returns None, which is what print shows after each set of tables.
for index, row in sus_df.iterrows(): #Note that demo_df was changed to sus_df, they have the same form
    entry_no = row['entry_no']
    entry_text = row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(disambiguate(sus_df))
    if index > 15:
        break

DF of first names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


DF of ful names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
0,1019-1,Antonio Joseph Dimas Pater,PER,0,26,1
1,1019-1,Don Miguel o’Reilly,PER,110,129,1
2,1019-1,Antonio Joseph Dimas Pater,PER,516,542,1
3,1019-1,Dimas Ponz,PER,563,573,1
4,1019-1,Margarita Fray,PER,576,590,1


DF of compound names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


None
DF of first names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


DF of ful names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
0,1019-2,Maria Dolores Judith Smith,PER,0,26,1
1,1019-2,Don Miguel o’Reilly,PER,110,129,1
2,1019-2,Maria Dolores Judith Smith,PER,559,585,1
3,1019-2,Doña Rosa de la Luz,PER,601,620,1


DF of compound names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


None
DF of first names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


DF of ful names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
0,1022-1,Juan Baptista Isnardy,PER,4,25,1
1,1022-1,Don Miguel O’Reilly,PER,94,113,1
2,1022-1,Don Miguel Lorenzo Isnardy,PER,410,436,1
3,1022-1,Juan Baptistafue,PER,538,554,1
4,1022-1,Don Miguel Lorenzo Isnardy,PER,566,592,1


DF of compound names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


None
DF of first names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
2,1022-2,Roberto,PER,366,373,0
5,1022-2,Roberto,PER,556,563,0


DF of ful names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
0,1022-2,Roberto Macqueen,PER,0,16,1
1,1022-2,Don Thomas Hafsett,PER,101,119,1
3,1022-2,Eleonora na,PER,376,387,1
4,1022-2,Don Juan Macqueen,PER,437,454,1
6,1022-2,Doctor Don Thomas Sterling,PER,584,610,1


DF of compound names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


None
DF of first names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
2,1024-2,Benjamin,PER,367,375,0
3,1024-2,Isabel,PER,378,384,0
5,1024-2,Benjamin,PER,561,569,0


DF of ful names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
0,1024-2,Benjamin Macqueen,PER,0,17,1
1,1024-2,Don Thomas Hafsett,PER,95,113,1
4,1024-2,Don Juan Macqueen,PER,442,459,1
6,1024-2,Doctor Don Thomas Sterling,PER,589,615,1


DF of compound names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


None
DF of first names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
2,1030-2,Smart,PER,361,366,0
4,1030-2,Don,PER,440,443,0
6,1030-2,Juan,PER,561,565,0


DF of ful names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status
0,1030-2,Juan Macqueen,PER,0,13,1
1,1030-2,Don Thomas Hafsett,PER,91,109,1
3,1030-2,Rachael Macqueen,PER,369,385,1
5,1030-2,Juan Macqueen,PER,445,458,1
7,1030-2,Doctor Don Thomas Sterling,PER,584,610,1


DF of compound names


Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end,name_status


None


### determine_principals

In [None]:
#no_test

# Print the principal identified for each entry, stopping once the row index exceeds 25.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    # n_principals is a count, not a record type: passing volume_metadata["type"] ("baptism") made every call
    # fall through to "Invalid number of principals.". Baptism entries have a single principal.
    print(determine_principals(entry_text, entities, 1))
    if index > 25:
        break

Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None
Invalid number of principals.
None


### determine_event_date

In [None]:
#no_test

# Print the primary event date found for each entry (event_type equals the volume's record type),
# stopping once the row index exceeds 25.
for index, row in demo_df.iterrows():    
    entry_no = row['entry_no']
    entry_text = row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(determine_event_date(entry_text, entities, volume_metadata["type"], volume_metadata))
    if index > 25:
        break       

None
None
None
None
Miércoles, día quatro de Diciembre de Mil Setecien tos Noventa y Tres. Yo
None
None
Jueves, día veinte y seis de Diciembre de Mil Setecientos Noventa y Tres. Yo
None
Lunes, día treze de Enero de Mil Setecientos Noventa y Quatro. Yo
Lunes, día tres de Febrero de Mil Setecientos Noventa y Quatro. Yo
Martes, día cinco de Marzo de Mil Setecientos Noventa y Quatro. Yo
Domingo, día diez y seis de Marzo de Mil Setecien tos Noventa y Quatro. Yo
None
None
Lunes, día quatorze de Abril de Mil Setecientos Noventa y Quatro. Yo
Sabado, día tres de Mayo de Mil Setecientos Noventa y Quatro. Yo
Sabado, día tres de Mayo de Mil Setecientos Noventa y Quatro. Yo
None
Miércoles  
Miércoles, día veinte y uno de Mayo de Mil Setecientos Noventa y Quatro. Yo
Miércoles, día veinte y uno de Mayo de Mil Setecientos Noventa y Quatro. Yo
Miércoles, día veinte y uno de Mayo de Mil Setecientos Noventa y Quatro. Yo
Miércoles, día veinte y uno de Mayo de Mil Setecientos Noventa y Quatro. Yo
Miércoles

The function currently fails to find a date for a substantial proportion (~20%) of entries because dates aren't being accurately extracted from the original. If this problem continues as the model improves and more entry data is incorporated into the sample, we'll need to add the post-processing capacity to bracket missing event dates by looking at entries on either side of the entry missing a date. Regardless, we will also need to add the post-processing capacity to convert these textual dates into numerical ones.

### determine_event_location

In [None]:
#no_test

# Print the location of the primary event for each entry, stopping once the row index exceeds 25.
# Because event_type equals the volume's record type, every call returns the volume's institution.
for index, row in demo_df.iterrows():    
    entry_no = row['entry_no']
    entry_text = row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(determine_event_location(entry_text, entities, volume_metadata["type"], volume_metadata))
    if index > 25:
        break

Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the Diocese of St. Augustine
Archives of the 

### identify_cleric

In [None]:
#no_test

# Print the cleric identified for each entry, stopping once the row index exceeds 25.
for index, row in demo_df.iterrows():    
    entry_no = row['entry_no']
    entry_text = row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(identify_cleric(entry_text, entities))
    if index > 25:
        break

Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Miguel o’Reilly
Don Mi guel o’Reilly
Don Thomas Hafsett
Don Miguel O’Reilly
Don Thomas Hafsett
Don Miguel O’Reilly
Don Thomas Hafsett
Don Thomas Hafsett
Don Thomas Hafsett
Don Thomas Hafsett
Don Thomas Hafsett
Don Thomas Hafsett
Don Thomas Haf sett
Don Thomas Hafsett


In [None]:
#no_test

# Export the cells flagged for export in the project's notebooks to Python modules (see the "Converted ..." log below).
from nbdev.export import notebook2script
notebook2script()

Converted 12-ssda-xml-parser.ipynb.
Converted 31-collate-xml-entities-spans.ipynb.
Converted 33-split-data.ipynb.
Converted 41-generic-framework-for-spacy-training.ipynb.
Converted 42-initial-model.ipynb.
Converted 51-data-preprocessing.ipynb.
Converted 52-unstructured-to-markup.ipynb.
Converted 53-markup-to-spatial-historian.ipynb.
Converted 61-prodigy-output-training-demo.ipynb.
Converted 62-full-model-application-demo.ipynb.
Converted 63-pt-model-training.ipynb.
Converted 64-es-model-training.ipynb.
Converted 71-relationship-builder.ipynb.
