# Sloane Lab XML Parser

This notebook loads a sample Natural History Museum (NHM) record from a JSON file, builds the RDF triples that we want to import into the Sloane Lab knowledge base, and serializes the resulting graph as Turtle.

#### Import libraries

In [1]:
# -*- coding: future_fstrings -*-
import json
import os
import re
import sys
import urllib

import titlecase
import pandas as pd

import rdflib
from rdflib import Namespace, URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS, OWL, XSD

from viapy import api
from bs4 import BeautifulSoup as Soup
from mapboxgl.utils import *
from mapboxgl.viz import *

from matplotlib import *
import matplotlib.pyplot as plt
%matplotlib inline

#### JSON file path

In [2]:
# Path to the JSON file holding the sample NHM record
JSON_PATH = 'data/nhm_sample.json'

#### Load JSON

In [3]:
# Load the sample record from the JSON file.
# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's default encoding.
with open(JSON_PATH, encoding='utf-8') as json_file:
    data = json.load(json_file)

# Display the parsed record
data

{'_id': 2683119,
 'associatedMedia': [{'_id': 113912,
   'assetID': 'e4321b2a-ac54-4fa1-94b7-448524093cea',
   'category': 'Specimen',
   'created': 1361890143000,
   'creator': 'Consuelo Sendino',
   'identifier': 'https://data.nhm.ac.uk/media/e4321b2a-ac54-4fa1-94b7-448524093cea',
   'license': 'http://creativecommons.org/licenses/by/4.0/',
   'mime': 'tiff',
   'modified': 1455021799000,
   'rightsHolder': 'The Trustees of the Natural History Museum, London',
   'title': 'P 73123. Carcharodon megalodon.',
   'type': 'StillImage'}],
 'associatedMediaCount': 1,
 'basisOfRecord': 'FossilSpecimen',
 'catalogNumber': 'PV P 73123',
 'catalogueDescription': 'tooth',
 'chronostratigraphy': 'Neogene, Miocene',
 'class': 'Elasmobranchii',
 'collectionCode': 'PAL',
 'continent': 'Europe',
 'country': 'Malta',
 'created': 1361276348000,
 'determinationNames': 'Carcharodon megalodon',
 'donorName': 'Sir Hans Sloane',
 'earliestEonOrLowestEonothem': 'Phanerozoic',
 'earliestEpochOrLowestSeries': 

#### Declare prefixes and create graph

In [4]:
# RDF graph that collects the triples built in the cells below
graph = rdflib.Graph()

# Sloane Lab namespace, bound to the "slb" prefix
slb = Namespace("http://sloanelab.org/")
graph.bind("slb", slb)

# Erlangen CRM namespace, bound to the "ecrm" prefix
ecrm = Namespace("http://erlangen-crm.org/current/")
graph.bind("ecrm", ecrm)

# CRMdig namespace, bound to the "crmdig" prefix
crmdig = Namespace("http://www.ics.forth.gr/isl/CRMdig/")
graph.bind("crmdig", crmdig)

#### Add catalogue

In [6]:
# Take the record-specific values from the loaded NHM record instead of
# typing them in by hand, so the IRIs and labels always match the data.
catalogue_number = data['catalogNumber']           # 'PV P 73123' in the sample
record_slug = catalogue_number.replace(' ', '_')   # 'PV_P_73123' in the sample
country = data['country']                          # 'Malta' in the sample
# NOTE(review): `country` is placed in the IRI unchanged; a value containing
# spaces or other reserved characters would need escaping — confirm the
# intended IRI scheme for E53 places.

# Dataset
jsonIRI = slb['D1/nhm_dataset']
graph.add((jsonIRI, RDF.type, crmdig.D1_Digital_Object))
graph.add((jsonIRI, ecrm.P2_has_type, slb['E55/museum_dataset']))
graph.add((jsonIRI, RDFS.label, Literal('NHM Dataset')))

# Record
recordIRI = slb[f'E73/nhm_dataset/{record_slug}']
graph.add((recordIRI, RDF.type, ecrm.E73_Information_Object))
graph.add((recordIRI, ecrm.P2_has_type, slb['E55/museum_record']))
graph.add((recordIRI, RDFS.label, Literal(f'NHM {catalogue_number}')))

# Place the record refers to
graph.add((recordIRI, ecrm.P67_refers_to, slb[f'E53/{country}']))

# Physical object the record is about
objIRI = slb[f'E19/nhm_dataset/{record_slug}']
graph.add((objIRI, RDF.type, ecrm.E19_Physical_Object))
graph.add((recordIRI, ecrm.P129_is_about, objIRI))

# NOTE(review): the sameAs target is still hard-coded; nothing in the visible
# part of the record identifies 'fishes/172' — confirm where this mapping
# comes from.
graph.add((objIRI, OWL.sameAs, slb['E19/fishes/172']))

<Graph identifier=N32ce111042214f02bf6c56079d4b43a8 (<class 'rdflib.graph.Graph'>)>

In [7]:
# Sanity check: number of triples currently held in the graph
print(len(graph))

10


In [9]:
# Write the graph as Turtle to a path relative to the working directory, so
# the cell runs on any machine rather than only where one particular user's
# home directory exists.
TTL_PATH = os.path.join('output', 'nhm-shark-tooth.ttl')
os.makedirs(os.path.dirname(TTL_PATH), exist_ok=True)

# The trailing semicolon suppresses the cell's output
graph.serialize(destination=TTL_PATH, format="turtle");