In [1]:
import csv # csv is only needed for this demo, writing to a csv file instead of a database

import pandas as pd
import rdflib

In [2]:
# Destination of the demo output: a CSV file stands in for the database table.
output_path = 'test_output.csv'

# Header row of the CSV, in the order the columns are written.
output_columns = [
    'tag_uri',
    'tag_uri_short',
    'tag_prefLabel',
    'vocab',
    'vocab_level',
    'caomMatch_uri',
    'ivoaMatch_uri',
    'uatMatch_uri',
]

In [3]:
# Load the vocabulary: fetch the RDF/XML document at vocab_url and parse it
# into an rdflib graph that the following cells query.

vocab_url = "https://raw.githubusercontent.com/spacetelescope/mast_expand-vocab/main/vocabs/data-product-type.rdf"

g = rdflib.Graph().parse(
    source=vocab_url,
    format='xml',  # the .rdf file is RDF/XML
)

In [4]:
# Set vocab and vocab_level from the file name that appears in vocab_url.
# TODO: work on a better way to do this.

# (file-name fragment, vocab, vocab_level), checked in this order.
known_vocabs = (
    ('data-product-type.rdf', 'DataProductType', 'product'),
    ('basis.rdf', 'Basis', 'product'),
)

for file_name, known_vocab, known_level in known_vocabs:
    if file_name in vocab_url:
        vocab = known_vocab
        vocab_level = known_level
        break
else:
    # Stop here instead of just printing: continuing would leave vocab and
    # vocab_level undefined (NameError in the next cell) or, worse, silently
    # reuse stale values left in the kernel by an earlier run.
    raise ValueError('Vocabulary not recognized: {!r}'.format(vocab_url))

In [5]:
# Write one CSV row per skos:Concept found in the graph.
#
# The "with open" is just for csv file writing. The encoding is pinned to UTF-8
# so labels containing non-ASCII characters are written identically on every
# platform (the platform default is not UTF-8 everywhere).
with open(output_path, 'w', newline='', encoding='utf-8') as output_file:
    csv_writer = csv.DictWriter(output_file, fieldnames=output_columns) # more csv stuff
    csv_writer.writeheader() # more csv stuff

    # Substring identifying each external vocabulary -> the output column it fills.
    # Checked in this order; the first marker found in a match URI wins.
    match_columns = (
        ('ivoa.net/rdf/', 'ivoaMatch_uri'),
        ('astrothesaurus.org/uat/', 'uatMatch_uri'),
        ('opencadc.org/caom', 'caomMatch_uri'),
    )

    # Loop through all triples of type skos:Concept. None is a wildcard.
    for tag_uri, _, _ in g.triples((None, rdflib.RDF.type, rdflib.SKOS.Concept)):

        # tag_uri_short is everything after the first '#'. A URI with no '#'
        # falls back to its last path segment instead of raising IndexError.
        _, hash_found, fragment = tag_uri.partition('#')
        if hash_found:
            tag_uri_short = fragment
        else:
            tag_uri_short = tag_uri.rstrip('/').rsplit('/', 1)[-1]

        # Start every match column at None (written to the CSV as an empty field).
        row = {
            "tag_uri": tag_uri,
            "tag_uri_short": tag_uri_short,
            "tag_prefLabel": g.value(tag_uri, rdflib.SKOS.prefLabel, None), # prefLabel
            "vocab": vocab,
            "vocab_level": vocab_level,
            "caomMatch_uri": None,
            "uatMatch_uri": None,
            "ivoaMatch_uri": None,
        }

        # For this concept, get the match URIs for external vocabs.
        # Graph.objects takes only (subject, predicate): there is no object
        # wildcard to pass, and a third positional argument would be read as
        # the `unique` flag, so it is omitted here.
        match_uris = g.objects(tag_uri,
                               rdflib.SKOS.exactMatch | rdflib.SKOS.broadMatch)

        # Populate match URI values. If a concept has several matches into the
        # same vocabulary, the last one iterated is the one that is kept.
        for uri in match_uris:
            for marker, column in match_columns:
                if marker in uri:
                    row[column] = uri
                    break
            # Matches to any other vocabulary are skipped.
            # TODO: decide how these should be reported.

        # Ultimately, we'll want to assign a tag ID, check if the tag is already in the database,
        # and edit column values without overwriting the tag ID if anything has changed.
        # But for now:
        csv_writer.writerow(row)

Keep in mind that, in our current draft schema, this table will eventually include concepts from multiple vocabularies. We'll also need to handle tag URIs that are already in the table differently, manage unique tag IDs, and so on. But for now, let's look at what we made:

In [6]:
# Read the CSV back in to inspect what was written.
# The frame is bound to tags_df rather than `display`: that name would shadow
# IPython's built-in display() for the rest of the kernel session.
tags_df = pd.read_csv(output_path)
tags_df

Unnamed: 0,tag_uri,tag_uri_short,tag_prefLabel,vocab,vocab_level,caomMatch_uri,ivoaMatch_uri,uatMatch_uri
0,https://archive.stsci.edu/rdf/data-product-typ...,Abundance_catalogs,Abundance catalogs,DataProductType,product,,,
1,https://archive.stsci.edu/rdf/data-product-typ...,Astrometric_catalogs,Astrometric catalogs,DataProductType,product,,,
2,https://archive.stsci.edu/rdf/data-product-typ...,Catalogs,Catalogs,DataProductType,product,,,
3,https://archive.stsci.edu/rdf/data-product-typ...,Co-trending_basis_vectors,Co-trending basis vectors,DataProductType,product,,,
4,https://archive.stsci.edu/rdf/data-product-typ...,Code,Code,DataProductType,product,,,
5,https://archive.stsci.edu/rdf/data-product-typ...,Coronagraphic_images,Coronagraphic images,DataProductType,product,,,
6,https://archive.stsci.edu/rdf/data-product-typ...,Cylindrically_projected_images,Cylindrically projected images,DataProductType,product,,,
7,https://archive.stsci.edu/rdf/data-product-typ...,Dithered_images,Dithered images,DataProductType,product,,,
8,https://archive.stsci.edu/rdf/data-product-typ...,Doppler_images,Doppler images,DataProductType,product,,,
9,https://archive.stsci.edu/rdf/data-product-typ...,Drizzled_images,Drizzled images,DataProductType,product,,,
