In [None]:
#default_exp relationships

In [None]:
#no_test
#dependencies

#nlp packages
import spacy
from spacy.util import minibatch, compounding

#manipulation of tables/arrays
import pandas as pd
import numpy as np

#internal imports
from ssda_nlp.collate import *
from ssda_nlp.split_data import *
from ssda_nlp.modeling import *
from ssda_nlp.model_performance_utils import *
from ssda_nlp.xml_parser import *
from ssda_nlp.relationships import *
from ssda_nlp.unstructured2markup import *

In [None]:
#export

def id_unique_individuals(entry_text, entities, record_type):
    '''
    identifies all unique individuals that appear in an entry (i.e. removing all multiple mentions of the same person)
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        record_type: simple flag indicating whether records are baptisms, marriages, burials, etc. (this can also be determined
        programmatically and may be deprecated later)
        
        returns: a list of the unique individuals who appear in an entry (once implemented)
        raises: NotImplementedError on every call, because the de-duplication logic has not been written yet
    '''
    #fail with an explicit, descriptive error rather than a NameError on a result variable that was never assigned
    raise NotImplementedError("id_unique_individuals is not implemented yet.")

In [None]:
#export

def determine_principals(entry_text, entities, n_principals):
    '''
    determines the principal of a single-principal event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model; rows are read through .iterrows()
        using the 'pred_label', 'pred_start' and 'pred_entity' fields
        n_principals: expected number of principals (only 1 is currently supported)
        
        returns: the principal(s) of the event in question, or None if no principal can be identified (None is also
        returned, after printing a message, when n_principals is anything other than 1)
        
    The search runs in three stages and stops at the first stage that yields a person:
        1. a PER entity starting within the first 20 characters of the entry
        2. a PER entity starting within 10 characters of the first occurrence of "oleos" (case-insensitive)
        3. a PER entity starting within 10 characters of the first occurrence of "nombre" (case-insensitive)
    When several entities qualify within one stage, the last one in iteration order is kept.
    '''
    if n_principals == 2:
        #process marriage principals
        print("That number of principals is not supported yet.")
        return None
    if n_principals != 1:
        print("Invalid number of principals.")
        return None
    
    opening_window = 20    #a principal is expected to start within this many characters of the entry's beginning
    keyword_window = 10    #maximum distance (in characters) between a cue word and the start of a person entity
    
    #collect the start offset and text of every person entity once, rather than re-scanning the table per stage
    people = [(entity['pred_start'], entity['pred_entity'])
              for _, entity in entities.iterrows()
              if entity['pred_label'] == 'PER']
    
    def last_person_between(low, high):
        #returns the last person whose start offset lies in [low, high], or None if there is none
        match = None
        for start, name in people:
            if low <= start <= high:
                match = name
        return match
    
    principals = last_person_between(float('-inf'), opening_window)
    if principals is not None:
        return principals
    
    #the cue words are searched in lower case so that capitalized spellings are matched as well
    lowered_text = entry_text.lower()
    for keyword in ('oleos', 'nombre'):
        prox = lowered_text.find(keyword)
        if prox == -1:
            continue
        principals = last_person_between(prox - keyword_window, prox + keyword_window)
        if principals is not None:
            return principals
    
    return None

In [None]:
#export

def determine_event_date(entry_text, entities, primary_event_type, secondary_event_type = None):
    '''
    determines the date of a specific event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model; rows are read through .iterrows()
        using the 'pred_label', 'pred_start' and 'pred_entity' fields
        primary_event_type: a valid record_type (i.e. "baptism", "marriage", or "burial"); only "baptism" is
        currently supported
        secondary_event_type: use if attempting to identify a secondary event (i.e. "birth") in a primary record
        
        returns: the date of the event in question, or None if no date can be identified (None is also returned,
        after printing a message, when the primary event type is not supported)
    '''
    date = None
    
    if secondary_event_type is not None:
        #a secondary event's date is taken to be the last DATE entity that differs from the primary event's date
        primary_event_date = determine_event_date(entry_text, entities, primary_event_type)
        for _, entity in entities.iterrows():
            if entity['pred_label'] == 'DATE' and entity['pred_entity'] != primary_event_date:
                date = entity['pred_entity']
    
    elif primary_event_type == "baptism":
        #only DATE entities that start within the first third of the entry are considered; the last such one wins
        cutoff = len(entry_text) / 3
        
        for _, entity in entities.iterrows():
            if entity['pred_label'] == 'DATE' and entity['pred_start'] <= cutoff:
                date = entity['pred_entity']
                
    else:
        #report the problem and return None rather than handing the message back as though it were a date
        print("That event type is not supported yet.")
        
    return date

In [None]:
#export

def determine_event_location(entry_text, entities, event_type):
    '''
    determines the location of a specific event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: this could be either a valid record_type OR a secondary event like a birth
        
        returns: the location of the event in question, or None if no location can be identified (once implemented)
        raises: NotImplementedError on every call, because the location logic has not been written yet
    '''
    #fail with an explicit, descriptive error rather than a NameError on a result variable that was never assigned
    raise NotImplementedError("determine_event_location is not implemented yet.")

In [None]:
#export

def identify_cleric(entry_text, entities):
    '''
    identifies the cleric(s) associated with a sacramental entry
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model        
        
        returns: the associated cleric(s), or None if no cleric can be identified (once implemented)
        raises: NotImplementedError on every call, because the cleric logic has not been written yet
    '''
    #fail with an explicit, descriptive error rather than a NameError on a result variable that was never assigned
    raise NotImplementedError("identify_cleric is not implemented yet.")

In [None]:
#export

def build_event(entry_text, entities, event_type, principals):
    '''
    builds out relationships related to a baptism or burial event
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        event_type: the type of event being built; passed through to determine_event_date and
        determine_event_location
        principals: the principal(s) of the event, as already determined by the caller
        
        returns: structured representation of these relationships, as a dict with the keys 'event_type',
        'principals', 'date', 'location' and 'cleric'
    '''   
    
    date = determine_event_date(entry_text, entities, event_type)
    location = determine_event_location(entry_text, entities, event_type)
    cleric = identify_cleric(entry_text, entities)
    
    #gather the pieces into a single structure so that a result is actually returned to the caller
    event_relationships = {
        'event_type': event_type,
        'principals': principals,
        'date': date,
        'location': location,
        'cleric': cleric,
    }
    
    return event_relationships

In [None]:
#export

def build_relationships(entry_text, entities, record_type):
    '''
    Master function that will combine all helper functions built above
        entry_text: the full text of a single entry, ported directly from spaCy to ensure congruity
        entities: entities of all kinds extracted from that entry by an NER model
        record_type: simple flag indicating whether records are baptisms, marriages, burials, etc. (this can also be determined
        programmatically and may be deprecated later)
            
        returns: structured data (specific format to be named later) containing all relationships extracted; None is
        returned, after printing a message, for every record type other than "baptism"
        raises: NotImplementedError for baptism records, because the code that assembles the final relationships
        has not been written yet
    '''
    #unique_individuals = id_unique_individuals(entry_text, entities, record_type)
    
    if record_type != "baptism":
        #marriage and burial records (and any unrecognized type) are not processed yet
        print("That record type is not supported yet.")
        return None
    
    principals = determine_principals(entry_text, entities, 1)
    #event_relationships = build_event(entry_text, entities, "baptism", principals)               
    #interpersonal_relationships = process_interpersonal(entry_text, entities)
    #characteristics = process_characteristics(entry_text, entities, interpersonal_relationships)        
    
    #code that turns pieces defined above into well-formed relationships
    
    #nothing builds the final result yet, so fail explicitly rather than with a NameError on an unassigned variable
    raise NotImplementedError("Assembling relationships for baptism records is not implemented yet.")

## Unit testing

### Load trained model and entry data

In [None]:
#no_test

# Load the trained model stored under models/mat_baut_1, with language set to "es".
# NOTE(review): verbose is passed as the string 'True' rather than the boolean True — confirm what load_model expects.
trained_model = load_model('models/mat_baut_1', language="es", verbose='True')

Loaded model 'models/mat_baut_1'


In [None]:
#no_test

# Parse one transcription XML file into demo_df and preview its first rows.
# NOTE(review): the backslash-separated path presumably only resolves on Windows — confirm.
demo_df = parse_xml_v2("transcriptions\\15834.xml")
demo_df.head()

Unnamed: 0,vol_titl,vol_id,fol_id,text,entry_no
0,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Juana. Esc.va Domingo veinte y dos d...,1033-1
1,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Paula. Esc.a Juebes veinte y tres de...,1033-2
2,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Maria Esc.a Miercoles prim.o de feb....,1033-3
3,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1033,[margin]: Bernardo Esc.vo Domingo nueve de Abr...,1033-4
4,"Libro 1 de Bautismos de Pardos y Morenos, 1719...",15834,1034,[margin]: Fran.co Esc.vo [roto] Abril de mil s...,1034-1


### Apply model to entry data

In [None]:
#no_test

# Run the trained model over the "text" column of demo_df, keyed by "entry_no", with scoring turned off,
# then preview the first 20 rows of the predicted entities.
ent_preds_df, metrics_df, per_ent_metrics = test_model(trained_model, demo_df, "entry_no", "text", score_model=False)
ent_preds_df.head(20)

Unnamed: 0,entry_no,pred_entity,pred_label,pred_start,pred_end
0,1033-1,Juana,PER,10,15
1,1033-1,Esc.va,CHAR,17,23
2,1033-1,Domingo veinte y dos de [roto] y nueve,DATE,24,62
3,1033-1,Thomas de Orvera,PER,66,82
4,1033-1,Juana de nacion,PER,121,136
5,1033-1,Mina,CHAR,137,141
6,1033-1,esclava,CHAR,142,149
7,1033-1,Juan Joseph de Justis,PER,159,180
8,1033-1,P.P.,REL,192,196
9,1033-1,Joseph Salcedo,PER,197,211


### determine_principals

In [None]:
#no_test

# Spot-check determine_principals: for each entry, select that entry's predicted entities by entry_no
# and print the principal the function picks.
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(determine_principals(entry_text, entities, 1))
    # stop once the row label exceeds 25 (that row has already been printed before the break)
    if index > 25:
        break

Juana
Paula
Maria
Bernardo
Fran.co
Ant.o
Antonia
M.a Luisa
Ana
Ana
Theresa
Antonio
Franc.co de Paula
Juan
Vicente
Joseph
Ysabel
Vicente
Joseph
Maria
Antonia
Juan
Alexandro
Elena Maria
Juan Joseph
Juan
Geronima


### determine_event_date

In [None]:
#no_test

# Spot-check determine_event_date: for each entry, select that entry's predicted entities by entry_no
# and print the baptism date the function picks (None when no date is found).
for index, row in demo_df.iterrows():
    entry_no = row['entry_no']
    entry_text = row['text']
    entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]
    print(determine_event_date(entry_text, entities, "baptism"))
    # stop once the row label exceeds 25 (that row has already been printed before the break)
    if index > 25:
        break

Domingo veinte y dos de [roto] y nueve
Juebes veinte y tres de feb.o de mil sietec.tos. y diez y nueve
Miercoles prim.o de feb.o de mil siete.tos y diez y nueve
Domingo nueve de Abril de mil sietectos y diez y nueve
[roto] Abril de mil sietec.tos y diez, y nueve [
Domingo nueve de Abril de mil sietec.tos y diez y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Domingo nueve de Abril de mil sietec.tos y diez, y
None
Domingo nueve de Abril de mil sietec.tos y diez, y nueve
Martes onze de Abril de mil sietec.tos y diez y nueve
Domingo quatro de Junio de mil sietecientos i dies i nuebe yo
Sabado veinte y quatro de Junio de mil sietectos, y diez, y nueve
Domingo dos de Julio de mil sietec.tos y diez y nueve yo
Domingo dos de Julio de mil sietec.tos y diez, y nueve yo
Domingo treinta de Julio de mil sietec.tos y diez y nueve
Domingo dos de Julio de [. . .] fr
Domingo dos de Julio de mil sietec.tos y diez y nueve yo el [r

The function currently fails to find a date for a substantial proportion (~20%) of entries because dates aren't being accurately extracted from the original. If this problem continues as the model improves and more entry data is incorporated into the sample, we'll need to add the post-processing capacity to bracket missing event dates by looking at entries on either side of the entry missing a date. Regardless, we will also need to add the post-processing capacity to convert these textual dates into numerical ones.

In [None]:
#no_test

# Run nbdev's notebook2script export; the recorded output below lists each notebook it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 12-ssda-xml-parser.ipynb.
Converted 31-collate-xml-entities-spans.ipynb.
Converted 33-split-data.ipynb.
Converted 41-generic-framework-for-spacy-training.ipynb.
Converted 42-initial-model.ipynb.
Converted 51-data-preprocessing.ipynb.
Converted 52-unstructured-to-markup.ipynb.
Converted 53-markup-to-spatial-historian.ipynb.
Converted 61-prodigy-output-training-demo.ipynb.
Converted 62-full-model-application-demo.ipynb.
Converted 71-relationship-builder.ipynb.
