# Sloane Lab NHM CSV Parser

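This notebook parses the occurrence CSV files (and the accompanying EML metadata) from the Natural History Museum (NHM) datasets, extracts the data that we want to import into the Sloane Lab knowledge base, and serialises it as an RDF graph in Turtle format.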

#### Import libraries

In [4]:
# -*- coding: future_fstrings -*-
import os
import re
import sys
import csv
import urllib
import titlecase
import pandas as pd
import xml.etree.ElementTree as ET

import rdflib
from rdflib import Namespace, URIRef, BNode, Literal
from rdflib.namespace import RDF, RDFS, OWL, XSD

#from viapy import api
from bs4 import BeautifulSoup as Soup
#from mapboxgl.utils import *
#from mapboxgl.viz import *

#from matplotlib import *
#import matplotlib.pyplot as plt
#%matplotlib inline

#### Data path

In [5]:
# Root directory of the NHM export: it holds one sub-directory per dataset,
# each of which is read for its `eml.xml` metadata and `occurrence.csv` data.
DATA_PATH = 'data/nhm/'

#### Load datasets

In [6]:
# Maps dataset identifier (taken from the EML citation) ->
# {'desc': description derived from the directory name, 'data': occurrence DataFrame}
datasets = {}

# Iterate in sorted order: os.listdir() returns entries in arbitrary order, and
# the insertion order of `datasets` determines the dataset numbering used later.
for dataset_dir in sorted(os.listdir(DATA_PATH)):

    dataset_path = os.path.join(DATA_PATH, dataset_dir)

    # Skip anything that is not a dataset directory (e.g. .DS_Store).
    # The entry must be resolved against DATA_PATH: testing the bare name would
    # look it up in the current working directory and never filter anything.
    if not os.path.isdir(dataset_path):
        continue

    # Read metadata file. Passing the path lets the XML parser pick the
    # encoding from the XML declaration instead of the locale default.
    eml_path = os.path.join(dataset_path, 'eml.xml')
    tree = ET.parse(eml_path)

    # Find dataset ID
    citation = tree.find(".//citation[@identifier]")
    if citation is None:
        raise ValueError(f'No <citation> element with an identifier attribute in {eml_path}')
    did = citation.attrib['identifier']

    # Find description: the part of the directory name before the ' – ' separator
    desc = dataset_dir.split(' – ')[0]

    print(did)

    # Load the occurrence records for this dataset
    datasets[did] = {'desc': desc, 'data': pd.read_csv(os.path.join(dataset_path, 'occurrence.csv'))}

https://doi.org/10.5519/qd.mouqh9zl
https://doi.org/10.5519/qd.d9md1ytn
https://doi.org/10.5519/qd.05w0l071
https://doi.org/10.5519/qd.fz6oz0nu
https://doi.org/10.5519/qd.vu8anhqj
https://doi.org/10.5519/qd.1xr9dft4
https://doi.org/10.5519/qd.x2y28c14
https://doi.org/10.5519/qd.2an3xk7y
https://doi.org/10.5519/qd.lvo9ftjb
https://doi.org/10.5519/qd.msszpk1u


#### Declare prefixes and create graph

In [26]:
# Namespaces used for the IRIs minted in this notebook:
#   slb    - Sloane Lab resources
#   ecrm   - Erlangen CRM classes and properties
#   crmdig - CRMdig classes
slb = Namespace("http://sloanelab.org/")
ecrm = Namespace("http://erlangen-crm.org/current/")
crmdig = Namespace("http://www.ics.forth.gr/isl/CRMdig/")

# Empty graph that the following cells populate
graph = rdflib.Graph()

# Register the prefixes on the graph
graph.bind(prefix="slb", namespace=slb)
graph.bind(prefix="ecrm", namespace=ecrm)
graph.bind(prefix="crmdig", namespace=crmdig)

#### Add catalogue

In [27]:
def _type_slug(name):
    """Turn a scientific name into the local part of a type/material IRI.

    Lower-cases the name, replaces spaces with underscores and removes both the
    opening and closing HTML italics tags (`<i>`, `</i>`), since `<` and `>`
    are not allowed in IRIs.
    """
    return re.sub(r'</?i>', '', str(name).lower().replace(' ', '_'))


def _has_value(value):
    """Return True if a CSV cell holds a real value (not NaN/None/blank)."""
    return pd.notna(value) and str(value).strip() != ''


for i, did in enumerate(datasets):

    # Dataset: identified by its 1-based position; the original identifier
    # (`did`) is attached through owl:sameAs.
    dataset_no = i + 1
    datasetIRI = slb[f'D1/nhm_dataset_{dataset_no}']
    graph.add((datasetIRI, OWL.sameAs, URIRef(did)))
    graph.add((datasetIRI, RDF.type, crmdig.D1_Digital_Object))
    graph.add((datasetIRI, ecrm.P2_has_type, slb['E55/museum_dataset']))
    graph.add((datasetIRI, RDFS.label, Literal(f'NHM Dataset {dataset_no}')))
    graph.add((datasetIRI, RDFS.comment, Literal(datasets[did]['desc'])))

    for _, record in datasets[did]['data'].iterrows():

        # Record. The IRI uses the dataset number, consistently with datasetIRI
        # and the labels; embedding `did` (a full URL) would nest a URL inside
        # the IRI path.
        rid = record["_id"]
        recordIRI = slb[f'E73/nhm_dataset_{dataset_no}/{rid}']
        graph.add((recordIRI, RDF.type, ecrm.E73_Information_Object))
        graph.add((recordIRI, ecrm.P2_has_type, slb['E55/museum_record']))
        graph.add((recordIRI, RDFS.label, Literal(f'NHM Dataset {dataset_no} Record {rid}')))
        graph.add((datasetIRI, ecrm.P148_has_component, recordIRI))

        # Object described by the record
        objIRI = slb[f'E20/nhm_dataset_{dataset_no}/{rid}']
        graph.add((recordIRI, ecrm.P129_is_about, objIRI))

        # Missing CSV cells are float NaN in pandas, which is truthy and never
        # equal to the string 'NaN', so they must be detected explicitly.
        scientific_name = record["scientificName"]
        has_name = _has_value(scientific_name)

        if record["collectionCode"] == 'MIN':
            # 'MIN' records are typed as physical objects made of a material
            graph.add((objIRI, RDF.type, ecrm.E19_Physical_Object))
            desc_values = [scientific_name]

            if has_name:
                material = _type_slug(scientific_name)
                materialIRI = slb[f'E57/{material}']
                # Link the object to the same E57 node that is described below
                graph.add((objIRI, ecrm.P45_consists_of, materialIRI))
                graph.add((materialIRI, RDF.type, ecrm.E57_Material))
                graph.add((materialIRI, RDFS.label, Literal(scientific_name)))
                graph.add((materialIRI, ecrm.P2_has_type, slb['E55/material']))
                graph.add((slb['E55/material'], RDFS.label, Literal('Material')))
        else:
            # Every other collection code is typed as a biological object with a species type
            graph.add((objIRI, RDF.type, ecrm.E20_Biological_Object))
            desc_values = [scientific_name, record["higherClassification"]]

            if has_name:
                species = _type_slug(scientific_name)
                speciesIRI = slb[f'E55/{species}']
                graph.add((objIRI, ecrm.P2_has_type, speciesIRI))
                # The rdf:type goes on the same E55 node as the label and P2 triples
                graph.add((speciesIRI, RDF.type, ecrm.E55_Type))
                graph.add((speciesIRI, RDFS.label, Literal(scientific_name)))
                graph.add((speciesIRI, ecrm.P2_has_type, slb['E55/species']))
                graph.add((slb['E55/species'], RDFS.label, Literal('Species')))

        # Free-text description: one line per available value, skipping missing
        # cells so that the literal never contains the text "nan".
        desc = '\n'.join([f'Record {rid}'] + [str(v) for v in desc_values if _has_value(v)])
        graph.add((recordIRI, RDFS.comment, Literal(desc)))

In [28]:
# Sanity check: size of the graph built above (for an rdflib Graph, len() is its triple count)
print(len(graph))

29058


In [29]:
# Write the graph as Turtle to a location relative to the notebook rather than
# to an absolute path inside one user's home directory, so the cell runs on any machine.
OUTPUT_PATH = os.path.join('output', 'nhm-datasets.ttl')
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# Trailing semicolon suppresses the cell's return-value output
graph.serialize(destination=OUTPUT_PATH, format="turtle");