# Sloane Lab XML Parser

This notebook parses the TEI XML files from the Enlightenment Architectures project, extracts the data that we want to import into the Sloane Lab knowledge base, and checks if there are any errors or inconsistencies.

#### Import libraries

In [1]:
# -*- coding: future_fstrings -*-
import os
import re
import sys
import urllib
import pandas as pd

import rdflib
from rdflib import Namespace, URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS, OWL, XSD

from viapy import api
from bs4 import BeautifulSoup as Soup
from mapboxgl.utils import *
from mapboxgl.viz import *

from matplotlib import *
import matplotlib.pyplot as plt
%matplotlib inline

#### XML File path

In [2]:
# Path to the TEI XML file parsed by this notebook (relative path, so it is
# resolved against the directory the notebook is run from)
XML_PATH = 'data/miscellanea.xml'

#### Function to handle catalogue numbers

In [3]:
def cat_to_int(catnum):
    """Return the first run of digits in a catalogue number as an integer.

    Parameters
    ----------
    catnum : str or None
        Catalogue number as transcribed, e.g. ``'No. 12a'``.

    Returns
    -------
    int or None
        The first sequence of decimal digits converted to ``int``, or ``None``
        when ``catnum`` is ``None``, empty, or contains no digits.
    """
    # Nothing to search in: report "no number" instead of raising TypeError
    if not catnum:
        return None

    # Raw string, because '\d' in a plain literal is an invalid escape sequence
    match = re.search(r'\d+', catnum)
    return int(match.group(0)) if match else None

#### Load all catalogues and their entries

In [4]:
# Read the XML file. The encoding is given explicitly so that decoding does not
# depend on the platform's default locale, and the file is closed as soon as
# its contents have been read.
# NOTE(review): UTF-8 is assumed because it is the XML default -- confirm
# against the file's XML declaration.
with open(XML_PATH, encoding='utf-8') as xml_file:
    xml = xml_file.read()

# Parse the file. Tag names are looked up in lower case below ('listperson',
# 'persname', 'placename', ...).
parsed_xml = Soup(xml, 'lxml')

# Create empty list of people
people_list = []

# Create empty list of places
place_list = []

# For each person in the XML list...
for person in parsed_xml.find('listperson').find_all('person'):

    # Create dictionary with person's internal ID
    person_dict = {'id': person['xml:id']}

    # Find the person's external IDs
    person_ids = person.find_all('idno')

    # For each external ID...
    for person_id in person_ids:

        # .get() rather than ['type'], so that an <idno> without a type
        # attribute is reported below instead of raising KeyError
        if person_id.get('type') == 'viaf':

            # Add VIAF ID to person dictionary
            person_dict['viaf'] = person_id.text

        # Otherwise, print the person so the unexpected ID can be inspected
        else:
            print(person)

    # All name variants recorded for the person
    person_dict['names'] = [x.text for x in person.find_all('persname')]

    # Add the dictionary to the list
    people_list.append(person_dict)

# For each place in the XML list...
for place in parsed_xml.find('listplace').find_all('place'):

    # Create dictionary with place's internal ID
    place_dict = {'id': place['xml:id']}

    # Find the place's external IDs
    place_ids = place.find_all('idno')

    # For each external ID...
    for place_id in place_ids:

        # See the person loop: a missing type attribute falls through to the
        # final branch and is printed
        id_type = place_id.get('type')

        # If the type of ID is VIAF...
        if id_type == 'viaf':

            # Add VIAF ID to place dictionary
            place_dict['viaf'] = place_id.text

        # If the type of ID is GEO...
        elif id_type == 'geo':

            try:
                # Expect exactly "latitude,longitude"; both the unpacking and
                # float() raise ValueError on malformed input
                lat, lon = place_id.text.split(',')
                lat, lon = float(lat.strip()), float(lon.strip())
            except ValueError:
                # Malformed coordinates: print the place for manual checking
                print(place)
            else:
                # Store both values together, so a place never ends up with a
                # latitude but no longitude
                place_dict['lat'] = lat
                place_dict['lon'] = lon

        # Otherwise, print place
        else:
            print(place)

    # All name variants recorded for the place
    place_dict['names'] = [x.text for x in place.find_all('placename')]

    # Add the dictionary to the list
    place_list.append(place_dict)

# Open the JSON file
#with open("out/people.json", "w") as people_file:
#    json.dump(people_list, people_file)

# Open the JSON file
#with open("out/places.json", "w") as places_file:
#    json.dump(place_list, places_file)

<place xml:id="Eichstadt">
<placename>Eichstadt</placename>
<idno type="viaf">124967002</idno>
<idno type="geo"> 124967002</idno>
</place>
<place xml:id="Hispaniola">
<placename>Hispaniola</placename>
<placename>wreck on the coast of Espanola</placename>
<idno type="viaf">242111207</idno>
<idno type="geo"> failed</idno>
</place>
<place xml:id="Katzenelnbogen">
<placename>Country of Catzellenbogen &amp; belonging to the Landgrave of Hesse cassell,
              northward of Mayence</placename>
<idno type="viaf">129154192</idno>
<idno type="geo"> 50.26745,†7.97322</idno>
</place>
<place xml:id="Lapland">
<placename>Lapland</placename>
<idno type="viaf">140666509 </idno>
<idno type="geo"> 67.616229 26.761877</idno>
</place>
<place xml:id="Manila">
<placename>Maniglia in the East Indies </placename>
<idno type="viaf">128902416</idno>
<idno type="geo"> 14.6042,†120.9822</idno>
</place>
<place xml:id="Melo_Island">
<placename>Melo Island</placename>
<idno type="geo"> 11.03333,†-15.21667</idn

In [5]:
def _mentions(tags):
    """Return ``[{'id': ..., 'name': ...}, ...]`` for the given tags, or ``None`` if there are none.

    ``id`` is the tag's ``ref`` attribute with ``#`` characters stripped from
    both ends (``None`` when the attribute is absent); ``name`` is the tag's
    text with runs of whitespace collapsed to single spaces.
    """
    if not tags:
        return None
    return [
        {
            'id': tag['ref'].strip('#') if 'ref' in tag.attrs else None,
            'name': ' '.join(tag.text.split()),
        }
        for tag in tags
    ]


# List of catalogues
cats = []

# Find all catalogues in the manuscript. This works on `parsed_xml`, which
# must already exist, so the XML file is not opened again here.
catalogues = parsed_xml.find_all('div2')

print(len(catalogues))

# For each catalogue...
for catalogue in catalogues:

    # A catalogue without a <label> gets a title of None instead of raising
    label = catalogue.find('label')

    # Create catalogue dictionary with an empty entry list and the title
    cat_dict = {
        'entries': [],
        'title': label.text if label is not None else None,
    }

    # For each entry in the catalogue...
    for entry in catalogue.find_all('div3'):

        # Get number of entry (looked up once; None when the entry has none)
        catnum_tag = entry.find('ea:catnum')
        catnum = catnum_tag.get_text() if catnum_tag is not None else None

        # Create entry dictionary
        entry_dict = {
            'num_str': ' '.join(catnum.split()) if catnum else None,
            'num_int': cat_to_int(catnum) if catnum else None,
            'text': ' '.join(entry.text.split()),
            'people': _mentions(entry.find_all('persname')),
            'places': _mentions(entry.find_all('placename')),
        }

        # Append entry to entry list
        cat_dict['entries'].append(entry_dict)

    # Append catalogue to catalogue list
    cats.append(cat_dict)

9


#### How many people in the list?

In [6]:
# Number of person records collected in people_list
len(people_list)

2799

#### How many places in the list?

In [7]:
# Number of place records collected in place_list
len(place_list)

555

#### How many catalogues?

#### Declare prefixes and create graph

In [8]:
# Create namespaces
slb = Namespace("http://sloanelab.org/")
ecrm = Namespace("http://erlangen-crm.org/current/")
crmdig = Namespace("http://www.ics.forth.gr/isl/CRMdig/")
# NOTE(review): VIAF's own linked data appears to use http:// IRIs -- confirm
# that the https:// form is the one intended for owl:sameAs links.
viaf = Namespace("https://viaf.org/viaf/")

# Create graph
graph = rdflib.Graph()

# Bind prefixes. `viaf` is bound as well, so that serialisations use a readable
# prefix for VIAF links instead of an automatically generated one.
graph.bind("slb", slb)
graph.bind("ecrm", ecrm)
graph.bind("crmdig", crmdig)
graph.bind("viaf", viaf)

#### Add all people to the graph

In [9]:
# Register every entry of the main people list in the graph
for person in people_list:

    # Internal ID, trimmed and with spaces turned into underscores
    person_key = person["id"].strip().replace(" ", "_")

    # IRI of the person inside the Sloane Lab namespace
    personIRI = slb[f'E21/{person_key}']

    # Label is the first recorded name, or an empty string when there is none
    known_names = person["names"]
    person_label = known_names[0] if known_names else ''

    # Type and label triples
    graph.add((personIRI, RDF.type, ecrm.E21_Person))
    graph.add((personIRI, RDFS.label, Literal(person_label)))

    # Link to VIAF only when a non-empty VIAF ID was recorded
    viaf_id = person.get("viaf")
    if viaf_id:
        graph.add((personIRI, OWL.sameAs, viaf[viaf_id.strip()]))

#### Add all places to the graph

In [11]:
# For each place in the main place list...
for place in place_list:

    # Create place IRI. Surrounding whitespace is removed from the ID before
    # spaces are replaced, so stray padding in the source cannot end up in the
    # IRI as leading or trailing underscores.
    placeIRI = slb[f'E53/{place["id"].strip().replace(" ", "_")}']

    # Add place to graph
    graph.add((placeIRI, RDF.type, ecrm.E53_Place))

    # Add label to place (first recorded name, or an empty string if none)
    graph.add((placeIRI, RDFS.label, Literal(place["names"][0] if place["names"] else '')))

    # Add VIAF link when a non-empty VIAF ID was recorded
    if 'viaf' in place and place["viaf"]:
        graph.add((placeIRI, OWL.sameAs, viaf[place["viaf"].strip()]))

In [12]:
# Size of the graph (number of triples) after the people and places were added
print(len(graph))

8908


In [13]:
# Write the graph as Turtle to a project-relative location instead of a
# user-specific absolute path, so the notebook can be re-run on any machine.
OUTPUT_PATH = os.path.join('out', 'sloane-people-places.ttl')

# Make sure the output directory exists before writing
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# The trailing semicolon keeps the call's return value out of the cell output
graph.serialize(destination=OUTPUT_PATH, format="turtle");