In [None]:
#default_exp full_volume

In [None]:
#export
#dependencies

#nlp packages
import spacy
from spacy.util import minibatch, compounding

#manipulation of tables/arrays
import pandas as pd
import numpy as np
import copy
import json

#internal imports
from ssda_nlp.collate import *
from ssda_nlp.split_data import *
from ssda_nlp.modeling import *
from ssda_nlp.model_performance_utils import *
from ssda_nlp.xml_parser import *
from ssda_nlp.unstructured2markup import *
from ssda_nlp.utility import *
from ssda_nlp.relationships import *

In [None]:
#export

def _append_label(person, field, label):
    '''
    appends label to the semicolon-delimited string stored under person[field];
    starts a new string when the field is missing or None
        person: person record (dict), modified in place
        field: key to update ("titles" or "ranks")
        label: text to append, in its original capitalization
    '''
    existing = person.get(field)
    person[field] = label if existing is None else existing + ';' + label


def _strip_titles_and_ranks(person, vocabularies):
    '''
    moves leading titles/ranks out of person["name"] and into person["titles"] / person["ranks"]
        person: person record (dict) with a "name" entry, modified in place
        vocabularies: controlled vocabularies; lowercased name prefixes are looked up
        in its "titles" and "ranks" entries

    two-word prefixes are stripped first, then single-word prefixes (two-word prefixes are
    not re-checked after a single word has been removed); a name made up entirely of
    titles/ranks is replaced by None
    '''
    if person["name"] is None:
        return

    titles = vocabularies["titles"]
    ranks = vocabularies["ranks"]
    name_parts = person["name"].split(' ')

    for prefix_len in (2, 1):
        while len(name_parts) >= prefix_len:
            prefix_parts = name_parts[:prefix_len]
            lookup_key = ' '.join(part.lower() for part in prefix_parts)

            #titles take precedence when a prefix appears in both vocabularies
            if lookup_key in titles:
                field = "titles"
            elif lookup_key in ranks:
                field = "ranks"
            else:
                break

            _append_label(person, field, ' '.join(prefix_parts))

            name_parts = name_parts[prefix_len:]
            if not name_parts:
                #nothing but titles/ranks was recorded for this person
                person["name"] = None
                return
            person["name"] = ' '.join(name_parts)


def _build_place_records(places, volume_id):
    '''
    deduplicates the extracted locations (first occurrence wins, None entries dropped) and
    assigns each a unique id of the form <volume_id>-L<n>, numbered from 1
        returns: list of {"id": ..., "location": ...} records
    '''
    unique_places = []
    for place in places:
        #equality-based membership keeps first-seen order and does not require hashable values
        if (place is not None) and (place not in unique_places):
            unique_places.append(place)

    return [{"id": volume_id + '-L' + str(number), "location": unique_place}
            for number, unique_place in enumerate(unique_places, start=1)]


def _link_event_locations(events, places):
    '''
    replaces each event's "location" with the id of the matching place record (in place)
        events without a location get "unknown"
        events whose location has no matching place record get None (and a message is printed)
    '''
    for event in events:
        location = event["location"]
        if location is None:
            event["location"] = "unknown"
            continue

        loc_id = next((place["id"] for place in places if place["location"] == location), None)
        if loc_id is None:
            print("Failed to find location ID for " + location)
        event["location"] = loc_id


def _write_volume_json(json_name, volume_metadata, people, places, events):
    '''
    writes volume metadata plus people, place, and event records to volume_records/<json_name>,
    one record per line, creating the volume_records directory if it does not exist yet
    '''
    import os  #local import so this block does not depend on edits to the import cell

    output_dir = "volume_records"
    os.makedirs(output_dir, exist_ok=True)

    sections = (("people", people), ("places", places), ("events", events))
    #os.path.join instead of a hard-coded "\\" so the file lands inside the directory on every OS
    with open(os.path.join(output_dir, json_name), "w", encoding="utf-8") as outfile:
        outfile.write('{\n"volume": \n')
        json.dump(volume_metadata, outfile)
        outfile.write(',\n')
        for position, (key, records) in enumerate(sections):
            outfile.write('"' + key + '": [\n')
            outfile.write(",\n".join(json.dumps(record) for record in records))
            #every array except the last is followed by a comma
            outfile.write("\n],\n" if position < len(sections) - 1 else "\n]\n")
        outfile.write('}')


def process_volume(path_to_transcription, path_to_model):
    '''
    runs the transcription of a single volume (formatted according to SSDA markup 2.0 specs) through the ML entity extraction
    and rules-based relationship linking pipelines, then formats resulting data for export into SQL
        path_to_transcription: path to an XML file containing the transcription of a single volume
        path_to_model: path to a spaCy model trained to extract entities from the proper type of volume

        returns: lists of final people, place, and event records, followed by the name of the JSON file
        containing volume metadata as well as people, place, and event records
        (the file itself is written inside the volume_records directory; only the file name is returned)
    '''

    #retrieve volume metadata and controlled vocabularies

    volume_metadata = retrieve_volume_metadata(path_to_transcription)
    vocabularies = retrieve_controlled_vocabularies()

    #Brazilian volumes use the Portuguese pipeline, everything else the Spanish one
    lang = "pt" if volume_metadata["country"] == "Brazil" else "es"

    #load and apply trained model

    #NOTE(review): verbose is passed as the string 'True', not a boolean; left unchanged
    #because load_model's handling of the argument is not visible from this notebook
    trained_model = load_model(path_to_model, language=lang, verbose='True')

    entry_df = parse_xml_v2(path_to_transcription)

    #scoring is disabled, so the two metrics return values are not used
    ent_preds_df, _metrics_df, _per_ent_metrics = test_model(trained_model, entry_df, "entry_no", "text", score_model=False)
    print("Entities extracted.")

    #iterate through each entry and build relationships

    people = []
    places = []
    events = []

    #iterate over column values directly so the loop does not depend on entry_df's index labels
    for entry_no, entry_text in zip(entry_df['entry_no'], entry_df['text']):
        entities = ent_preds_df.loc[ent_preds_df['entry_no'] == entry_no]

        entry_people, entry_places, entry_events = build_entry_metadata(entry_text, entities, path_to_transcription)

        people += entry_people
        places += entry_places
        events += entry_events

    print("Relationships linked.")

    #disambiguate locations and assign unique ids

    places = _build_place_records(places, volume_metadata["id"])

    #incorporate location ids into event metadata

    _link_event_locations(events, places)

    #strip titles and/or ranks from names

    for person in people:
        _strip_titles_and_ranks(person, vocabularies)

    #TODO: expand abbreviations in remaining parts of name

    #TODO: disambiguate and merge people across the volume

    print("People records enhanced and disambiguated.")

    #convert dictionaries into JSON

    json_name = volume_metadata["id"] + "_ppe.json"
    _write_volume_json(json_name, volume_metadata, people, places, events)

    print("JSON built, processing completed.")

    return people, places, events, json_name

In [None]:
#no_test

#demo run: process one transcribed volume with a previously trained model
transcription_path = "transcriptions\\15834.xml"
model_path = "models/mat_baut_1"
people, places, events, json_path = process_volume(transcription_path, model_path)

Loaded model 'models/mat_baut_1'
Entities extracted.
Relationships linked.
People records enhanced and disambiguated.
JSON built, processing completed.


In [None]:
#no_test

#print every name that still has more than two space-separated parts,
#then the number of people left without any name
unnamed_count = 0
for record in people:
    full_name = record["name"]
    if full_name is None:
        unnamed_count += 1
    elif len(full_name.split(' ')) > 2:
        print(full_name)

print(str(unnamed_count))

Juana de nacion
Juan Joseph de Justis
Ana de Santiago
Thomas de Orvera
Luis Hurtado de Mendoza
Thomas de Orvera
Thomas de Orvera
Juan Joseph de Justis
Andres de Morales
Thomas de Orvera
Th[roto]mas de [roto]vera bap[roto]izé
Ju[roto] Joseph de Justis
Thomas de Orvera
Juan Joseph de Justis
Joseph de Soto
Thomas de Orvera
Ju.o Joseph de Justis
Thomas de Orvera
Maria Luisa de nacion
Ju.o Joseph de Justis
Jacinto de Castro
Thomas de Orvera
Ju.o Joseph de Justis
Thomas de Orvera
Juan Joseph de Justis
Joseph Salcede Soto
Ana de Santiago
Thomas de Orvera
Ju.n. Joseph de Justis
Pedro Montes de Oca
Thomas de Orvera
Franc.co de Paula
Fran.co de Paula
Juo J.e de Justis
Thomas de Orvera
Joseph Lopez de Cuella
Thomas de Orvera
Ygnacio Joseph de Loyola
Thomas de Orvera
Thomas de Orvera
Thomas de Orvera
Ju.n Ygnacio Miraval
Thomas de Orvera
Joseph Lopez de Cuella
Juan Joseph de Justis
Juan Ygnacio Miraval
Thomas de Orvera
Joseph Lopez de Cuella
Ju.o Joseph de Justis
Thomas de Orvera
Joseph Lopez de C

Clemen te Molina
Lorenzo Noriega y Marroquin
Lorenzo Noriega y Marroquin
Juana [roto] ya
Juo de Justis
Lorenzo Noriega y Marroquin
Nicolasa Morenos Carabalies
Juo de Justis
Lorenzo Noriega y Marroquin
Lorenzo Noriega y Marroquin
Ciudd de Matansas
Rosa Cara bali Morenos
Lorenzo Noriega y Marroquin
Lorenzo Noriega y Marr
Juo Roque Hernandes
Juana de la
Lorenzo Noriega y Marroquin
Juo de Dios
Lorenzo Noriega y Marroquin
Lorenzo Noriega y [roto] quin
Joseph de [desvaído]
Maria del Monte
Jachin115 de Galves
Maria Silbestra [desvaído] Pardos
Lorenzo Noriega y Marroquin
Lorenzo Noriega y Marroquin
Sebastian [roto] y
Maria de Jesus Criolla Morenos
Juan [roto] Justis
Lorenzo Noriega y Marroquin
Maria Antonia Figueroa
Lorenzo Noriega y Marroquin
Lorenzo Noriega y Marroquin
Ciud.d de Matansas
Lorenzo Noriega y Marroquin
Lorenzo Noriega y Marroquin
Lorenzo Con treras
Lorenzo Noriega y Marroquin
[roto] zo Contreras
Maria Antonia Figueroa
Lorenzo Noriega y Marroquin
Maria de los
Lorenzo Noriega y Ma

In [None]:
#no_test

#run nbdev's notebook2script over the project's notebooks (one "Converted ..." line is
#printed per notebook); presumably this is what writes the cells marked #export in this
#notebook to the module named by #default_exp -- confirm against the nbdev version in use
from nbdev.export import notebook2script
notebook2script()

Converted 12-ssda-xml-parser.ipynb.
Converted 31-collate-xml-entities-spans.ipynb.
Converted 33-split-data.ipynb.
Converted 41-generic-framework-for-spacy-training.ipynb.
Converted 42-initial-model.ipynb.
Converted 51-data-preprocessing.ipynb.
Converted 52-unstructured-to-markup.ipynb.
Converted 53-markup-to-spatial-historian.ipynb.
Converted 54-utility-functions.ipynb.
Converted 61-prodigy-output-training-demo.ipynb.
Converted 62-full-model-application-demo.ipynb.
Converted 63-pt-model-training.ipynb.
Converted 64-es-model-training.ipynb.
Converted 65-all-annotations-model-training.ipynb.
Converted 66-es-guatemala-model-training.ipynb.
Converted 67-death-and-birth-records-together.ipynb.
Converted 71-relationship-builder.ipynb.
Converted 72-full-volume-processor.ipynb.
