In [None]:
#default_exp relationships

In [None]:
#no_test
#dependencies

#standard library
import os

#nlp packages
import spacy
from spacy.util import minibatch, compounding

#manipulation of tables/arrays
import pandas as pd
import numpy as np

#internal imports
from ssda_nlp.collate import *
from ssda_nlp.split_data import *
from ssda_nlp.modeling import *
from ssda_nlp.model_performance_utils import *
from ssda_nlp.xml_parser import *
from ssda_nlp.unstructured2markup import *

In [None]:
#export

def id_unique_individuals(entry_text, entities, record_type):
    '''
    identifies all unique individuals that appear in an entry (i.e. removing all multiple mentions of the same person)
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity (currently unused)
        entities: entities of all kinds extracted from that entry by an NER model; must provide the columns
        pred_label and pred_entity
        record_type: simple flag indicating whether records are baptisms, marriages, burials, etc. (currently unused; this
        can also be determined programmatically and may be deprecated later)
        
        returns: an array of the distinct pred_entity strings labelled PER, in order of first appearance; mentions are
        only merged when their text is exactly identical
    '''
    #select the name column directly: deduplicating does not need a fresh index, and skipping the reset_index/drop
    #round-trip avoids mutating a filtered slice in place as well as the KeyError that drop('index') raised whenever
    #the incoming index carried a name
    is_person = entities['pred_label'] == 'PER'
    
    return entities.loc[is_person, 'pred_entity'].unique()

In [None]:
#export

def _last_person_starting_between(entities, lowest_start, highest_start):
    '''
    finds the last PER entity whose start offset falls inside a character window
        entities: entities of all kinds extracted from an entry by an NER model
        lowest_start: smallest acceptable pred_start (inclusive), or None for no lower bound
        highest_start: largest acceptable pred_start (inclusive)
        
        returns: the pred_entity of the last matching row, or None if no row matches
    '''
    if entities.empty:
        return None
    
    in_window = (entities['pred_label'] == 'PER') & (entities['pred_start'] <= highest_start)
    if lowest_start is not None:
        in_window = in_window & (entities['pred_start'] >= lowest_start)
    
    matches = entities.loc[in_window, 'pred_entity']
    
    #when several people qualify, the one appearing last in the table wins
    return None if matches.empty else matches.iloc[-1]

def determine_principals(entry_text, entities, n_principals):
    '''
    determines the principal of a single-principal event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        n_principals: expected number of principals; only 1 is currently handled, 2 is recognized but not implemented
        yet, and any other value is reported as invalid
        
        returns: the principal(s) of the event in question, or None if no principal can be identified (also None,
        after printing a message, when n_principals is anything other than 1)
    '''
    
    #keyword offsets are located in the lowercased text and compared against offsets from the original text
    entry_text = entry_text.lower()
    
    if n_principals == 2:
        #process marriage principals
        print("That number of principals is not supported yet.")
        return None
    if n_principals != 1:
        print("Invalid number of principals.")
        return None
    
    opening_window = 20     #a person starting within the first 20 characters is taken as the principal
    keyword_radius = 10     #otherwise a person must start within 10 characters of a cue word
    
    principals = _last_person_starting_between(entities, None, opening_window)
    
    #fall back to people mentioned next to a cue word, trying 'oleos' before 'nombre'; only the first occurrence of
    #each cue word is considered
    for keyword in ('oleos', 'nombre'):
        if principals is not None:
            break
        prox = entry_text.find(keyword)
        if prox != -1:
            principals = _last_person_starting_between(entities, prox - keyword_radius, prox + keyword_radius)
    
    return principals

In [None]:
#export

def determine_event_date(entry_text, entities, event_type, volume_metadata):
    '''
    determines the date of a specific event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: this could be either a valid record_type OR a secondary event like a birth
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata; its
        "type" key names the primary event recorded in the volume
        
        returns: the date of the event in question, or None if no date can be identified; for a primary event type
        other than baptism the string "That event type is not supported yet." is returned instead
    '''
    date = None
    primary_type = volume_metadata["type"]
    
    if event_type != primary_type:
        #secondary event: first resolve the date of the volume's primary event, then take the last DATE entity that
        #differs from it. The lookup must be made with the primary type; recursing with event_type itself re-enters
        #this same branch and never terminates
        primary_event_date = determine_event_date(entry_text, entities, primary_type, volume_metadata)
        for index, entity in entities.iterrows():
            if entity['pred_label'] == 'DATE' and entity['pred_entity'] != primary_event_date:
                date = entity['pred_entity']
    
    elif primary_type == "baptism":
        #only DATE entities starting in the first third of the entry are considered; the last such entity wins
        entry_length = len(entry_text)
        
        for index, entity in entities.iterrows():
            if entity['pred_label'] == 'DATE' and entity['pred_start'] <= (entry_length / 3):
                date = entity['pred_entity']
                
    else:
        date = "That event type is not supported yet."
        
    return date

In [None]:
#export

def determine_event_location(entry_text, entities, event_type, volume_metadata):
    '''
    looks up where a specific event took place
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity (not consulted)
        entities: entities of all kinds extracted from that entry by an NER model (not consulted)
        event_type: this could be either a valid record_type OR a secondary event like a birth
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata
        
        returns: the volume's "institution" when event_type is the volume's own record type; otherwise the string
        "That event type is not supported yet."
    '''
    #secondary events (anything other than the volume's record type) cannot be located yet
    if event_type != volume_metadata["type"]:
        return "That event type is not supported yet."
    
    #the volume's primary event is taken to have happened at the institution that holds the volume
    return volume_metadata["institution"]

In [None]:
#export

def identify_cleric(entry_text, entities):
    '''
    picks out the cleric named in a sacramental entry using three positional heuristics, tried in order
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        
        returns: the person selected by the first heuristic that finds anyone (the last qualifying person within that
        heuristic), or None if no heuristic matches
    '''
    text_length = len(entry_text)
    cleric = None
    
    #heuristic 1: a person whose mention ends at, or very near, the end of the entry
    for _, row in entities.iterrows():
        if row['pred_label'] != 'PER':
            continue
        chars_after = text_length - row['pred_end']
        closes_long_entry = chars_after <= 10 and text_length > 100
        #going to keep this condition for now, but it can create false positives when long, incorrect entities are
        #extracted from short and/or garbled entries
        hugs_the_end = row['pred_entity'] is not None and chars_after <= 2
        if closes_long_entry or hugs_the_end:
            cleric = row['pred_entity']
    
    #heuristic 2: a person starting no more than 15 characters after the end of the DATE entity listed just before it
    if cleric is None:
        previous_label, previous_end = None, None
        for _, row in entities.iterrows():
            if row['pred_label'] == 'PER' and previous_label == 'DATE' and (row['pred_start'] - previous_end) <= 15:
                cleric = row['pred_entity']
            previous_label, previous_end = row['pred_label'], row['pred_end']
    
    #heuristic 3: a person followed by the word "cura" (matched case-insensitively) beginning no more than 15
    #characters after the person's end offset
    if cleric is None:
        lowered_text = entry_text.lower()
        for _, row in entities.iterrows():
            if row['pred_label'] != 'PER':
                continue
            cura_at = lowered_text.find("cura", row['pred_start'] + len(row['pred_entity']))
            if cura_at != -1 and (cura_at - row['pred_end']) <= 15:
                cleric = row['pred_entity']
    
    return cleric

In [None]:
#export

def build_event(entry_text, entities, event_type, principals, volume_metadata=None):
    '''
    builds out relationships related to a baptism or burial event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: this could be either a valid record_type OR a secondary event like a birth
        principals: the principal(s) of the event, as returned by determine_principals
        volume_metadata: metadata for the volume that the entry comes from, built by retrieve_volume_metadata; when
        omitted, event_type is treated as the volume's record type and the location is reported as None
        
        returns: a dictionary with the keys "type", "principals", "date", "location" and "cleric" describing the event
    '''
    
    if volume_metadata is None:
        #the date and location helpers both require volume metadata; without any, assume the event is the volume's
        #primary event held at an unknown institution
        volume_metadata = {"type": event_type, "institution": None}
    
    date = determine_event_date(entry_text, entities, event_type, volume_metadata)
    location = determine_event_location(entry_text, entities, event_type, volume_metadata)
    cleric = identify_cleric(entry_text, entities)
    
    event_relationships = {
        "type": event_type,
        "principals": principals,
        "date": date,
        "location": location,
        "cleric": cleric,
    }
    
    return event_relationships

In [None]:
#export

def build_relationships(entry_text, entities, path_to_volume_xml):
    '''
    Master function that will combine all helper functions built above
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        path_to_volume_xml: path to the XML file of the volume that the entry comes from; the volume's metadata,
        including its record type, is read from this file
            
        returns: a dictionary of the relationships extracted so far (currently only the key "principals"), or None,
        after printing a message, if the volume's record type is not supported yet; the final format is still to be
        settled
    '''
    #TODO: unique_individuals = id_unique_individuals(entry_text, entities, volume_metadata["type"])
    
    #retrieve_volume_metadata is the metadata reader used throughout the rest of this notebook
    volume_metadata = retrieve_volume_metadata(path_to_volume_xml)
    
    if volume_metadata["type"] != "baptism":
        #marriage and burial records, like any unrecognized type, still need their own processing
        print("That record type is not supported yet.")
        return None
    
    principals = determine_principals(entry_text, entities, 1)
    #TODO: event_relationships = build_event(entry_text, entities, "baptism", principals)
    #TODO: interpersonal_relationships = process_interpersonal(entry_text, entities)
    #TODO: characteristics = process_characteristics(entry_text, entities, interpersonal_relationships)
    
    #code that turns pieces defined above into well-formed relationships
    relationships = {"principals": principals}
    
    return relationships

## Unit testing

### Load trained model and entry data

In [None]:
#no_test

#load the trained model saved under models/st_aug_bapt_2 with the language set to "es"; load_model is not defined in
#this notebook (presumably it comes from one of the ssda_nlp star imports above)
#NOTE(review): verbose is given the string 'True' rather than the boolean True - confirm which one load_model expects
trained_model = load_model('models/st_aug_bapt_2', language="es", verbose='True')

Loaded model 'models/st_aug_bapt_2'


In [None]:
#no_test

#build the path with os.path.join so that it resolves on every operating system; the literal backslash used
#previously only names a file inside the transcriptions folder on Windows
path_to_transcription = os.path.join("transcriptions", "239746.xml")

#parse the volume's transcription into one row per entry and preview the first rows
demo_df = parse_xml_v2(path_to_transcription)
demo_df.head()

Unnamed: 0,vol_titl,vol_id,fol_id,text,entry_no
0,Baptisms - 1793-1807,239746,1013,"1. María Dolores Sanchez Lunes, día veinte y ...",1013-1
1,Baptisms - 1793-1807,239746,1013,"2. Antonio Guillo Miercoles, día veinte de No...",1013-2
2,Baptisms - 1793-1807,239746,1014,"3. María Juana Francisca Fish Domingo, día ve...",1014-1
3,Baptisms - 1793-1807,239746,1014,"4. Maria Teresa Camel Domingo, día veinte y q...",1014-2
4,Baptisms - 1793-1807,239746,1015,"Maria Josefa Andrea de la Puente Miércoles, d...",1015-1


In [None]:
#no_test

#read the volume-level metadata from the same transcription file; the cells below rely on its "type" entry, and
#determine_event_location additionally reads its "institution" entry
volume_metadata = retrieve_volume_metadata(path_to_transcription)
print(volume_metadata)

{'type': 'baptism', 'country': 'United States', 'state': 'Florida', 'city': 'St. Augustine', 'institution': 'Archives of the Diocese of St. Augustine', 'id': 'St. Augustine', 'title': 'Baptisms - 1793-1807'}


### Apply model to entry data

In [None]:
#no_test

#apply the trained model to the "text" column of every entry, keyed by "entry_no"; scoring is switched off, and
#only ent_preds_df (one row per predicted entity) is used by the cells below
#NOTE(review): what metrics_df and per_ent_metrics hold when score_model=False is not visible here - confirm
ent_preds_df, metrics_df, per_ent_metrics = test_model(trained_model, demo_df, "entry_no", "text", score_model=False)
ent_preds_df.head(20)

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
0,1013-1,María Dolores Sanchez,PER,3,24
1,1013-1,Lunes,CHAR,26,31
2,1013-1,Don Miguel o’Reilly,PER,99,118
3,1013-1,Teniente de Cura Beneficiado,CHAR,119,147
4,1013-1,Vicario,CHAR,149,156
5,1013-1,Juez Eclesiástico Auxiliar,CHAR,160,186
6,1013-1,Igle sia parroquial,LOC,195,214
7,1013-1,provincia de San Agustín,LOC,218,242
8,1013-1,Florida Oriental,LOC,249,265
9,1013-1,niña,CHAR,305,309


### id_unique_individuals

In [None]:
#no_test

#print the unique individuals found in each entry
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    #keep only the entities that were predicted for this entry
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(id_unique_individuals(entry_text, entities, volume_metadata["type"]))
    #stop once the row labelled 26 has been printed (27 entries, assuming demo_df keeps a default 0-based index)
    if index > 25:
        break

['María Dolores Sanchez' 'Don Miguel o’Reilly' 'Juan' 'Deliby Naturales'
 'Don Francisco Sanchez' 'Maria Dolores' 'José Rivas'
 'María de la Luz Blanco']
['Antonio Guillo' 'Don Miguel o’Reilly' 'Isaac Guillo' 'Sara Ca'
 'Juan Antonio' 'Antonio Pellice' 'Susana Pellicer']
['María Juana Francisca Fish' 'Don Miguel o’Reilly' 'Don Jeremías Fish'
 'Eva Fish' 'Maria Juana Francisca' 'Lorenzo Capó'
 'Juana Joaquina Gonzalez']
['Maria Teresa Camel' 'Don Miguel o’Reilly' 'Andres Camel' 'Eva Fish'
 'Maria Tere sa' 'Francisco Huiten' 'Andrea Ortigas']
['Maria Josefa Andrea de la Puente' 'Don Miguel o’Reilly'
 'Maria Prudencia Zayas' 'Don Armando de la Puente' 'Maria Josefa Andrea'
 'Domingo' 'Doña Corolinda Josefa Leonardy']
['Juana Teresa de Jesus Gernon' 'Don Miguel o’Reilly'
 'Don Guil lermo Gernon' 'Juana Teresa de Jesus' 'Doña Teresa Rodríguez']
['Maria Josepha de los' 'Dolores Guadara ma' 'Don Miguel o’Reilly'
 'Don Mateo Guadarama' 'Maria Josefa de los'
 'Doña Catalina Gerónima Porras']
['

### determine_principals

In [None]:
#no_test

#print the principal identified in each entry
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    #keep only the entities that were predicted for this entry
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    #the third argument is the expected number of principals (1 for a baptism), not the record type; passing the
    #record type made every call print "Invalid number of principals." and return None
    print(determine_principals(entry_text, entities, 1))
    #stop once the row labelled 26 has been printed (27 entries, assuming demo_df keeps a default 0-based index)
    if index > 25:
        break

Juana
Paula
Maria
Bernardo
Fran.co
Ant.o
Antonia
M.a Luisa
Ana
Ana
Theresa
Antonio
Franc.co de Paula
Juan
Vicente
Joseph
Ysabel
Vicente
Joseph
Maria
Antonia
Juan
Alexandro
Elena Maria
Juan Joseph
Juan
Geronima


### determine_event_date

In [None]:
#no_test

#print the date found for each entry's primary event (the event type passed in is the volume's own record type)
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    #keep only the entities that were predicted for this entry
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(determine_event_date(entry_text, entities, volume_metadata["type"], volume_metadata))
    #stop once the row labelled 26 has been printed (27 entries, assuming demo_df keeps a default 0-based index)
    if index > 25:
        break

Domingo veinte y dos de [roto] y nueve
Juebes veinte y tres de feb.o de mil sietec.tos. y diez y nueve
Miercoles prim.o de feb.o de mil siete.tos y diez y nueve
Domingo nueve de Abril de mil sietectos y diez y nueve
[roto] Abril de mil sietec.tos y diez, y nueve [
Domingo nueve de Abril de mil sietec.tos y diez y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y
None
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Martes onze de Abril de mil sietec.tos y diez y nueve
Domingo quatro de Junio de mil sietecientos i dies i nuebe yo
Sabado veinte y quatro de Junio de mil sietectos, y diez, y nueve
Domingo dos de Julio de mil sietec.tos y diez y nueve yo
Domingo dos de Julio de mil sietec.tos y diez, y nueve yo
Domingo treinta de Julio de mil sietec.tos y diez y nueve
Domingo dos de Julio de [. . .] fr
Domingo dos de Julio de mil sietec.tos y diez y nueve yo el [r

The function currently fails to find a date for a substantial proportion (~20%) of entries because dates aren't being accurately extracted from the original. If this problem continues as the model improves and more entry data is incorporated into the sample, we'll need to add the post-processing capacity to bracket missing event dates by looking at entries on either side of the entry missing a date. Regardless, we will also need to add the post-processing capacity to convert these textual dates into numerical ones.

### determine_event_location

In [None]:
#no_test

#print the location found for each entry's primary event; because the event type passed in is the volume's own
#record type, every entry reports the institution stored in volume_metadata
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    #keep only the entities that were predicted for this entry
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(determine_event_location(entry_text, entities, volume_metadata["type"], volume_metadata))
    #stop once the row labelled 26 has been printed (27 entries, assuming demo_df keeps a default 0-based index)
    if index > 25:
        break

Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo
Catedral de San Carlos Borromeo


### identify_cleric

In [None]:
#no_test

#print the cleric identified in each entry (None when no heuristic in identify_cleric matches)
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    #keep only the entities that were predicted for this entry
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(identify_cleric(entry_text, entities))
    #stop once the row labelled 26 has been printed (27 entries, assuming demo_df keeps a default 0-based index)
    if index > 25:
        break

Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
None
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Joseph Lopez de Cuella
Joseph Hern.z
Joseph Hern.z
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Thomas de Orvera
Joseph Lopez de Cuella
Joseph Lopez de Cuella
Joseph Lopez de Cuella


In [None]:
#no_test

#run nbdev's notebook2script, which prints one "Converted ..." line per notebook it processes; the import is kept
#inside this #no_test cell rather than with the dependencies at the top of the notebook
from nbdev.export import notebook2script
notebook2script()

Converted 12-ssda-xml-parser.ipynb.
Converted 31-collate-xml-entities-spans.ipynb.
Converted 33-split-data.ipynb.
Converted 41-generic-framework-for-spacy-training.ipynb.
Converted 42-initial-model.ipynb.
Converted 51-data-preprocessing.ipynb.
Converted 52-unstructured-to-markup.ipynb.
Converted 53-markup-to-spatial-historian.ipynb.
Converted 61-prodigy-output-training-demo.ipynb.
Converted 62-full-model-application-demo.ipynb.
Converted 63-pt-model-training.ipynb.
Converted 64-es-model-training.ipynb.
Converted 71-relationship-builder.ipynb.
