In [1]:
import rdflib

import csv # csv is only needed for this demo, writing to a csv file instead of a database

In [5]:
# --- Vocabulary being ingested ---
# Where the vocabulary file is fetched from.
vocab_url = (
    "https://raw.githubusercontent.com/spacetelescope/mast_expand-vocab/main/vocabs/data-product-type.rdf"
)
# Constant values stamped onto every exported row for this vocabulary.
vocab_title = "DataProductType"
vocab_level = "product"

# --- Demo output ---
# CSV file that stands in for the database table in this demo.
output_path = "test_output.csv"
# Column order of the CSV; these are also the keys of each row dict.
output_columns = [
    "tag_uri",
    "tag_prefLabel",
    "vocab_title",
    "vocab_level",
    "match_uri_ivoa",
]

In [6]:
# Read the vocabulary at vocab_url into an rdflib graph, parsing it as RDF/XML.
# Kept as a single assignment so the cell itself produces no output.
g = rdflib.Graph().parse(
    source=vocab_url,
    format="xml",
)

In [7]:
# Export one CSV row per skos:Concept in the vocabulary graph `g`.
#
# The match predicate is the same for every concept, so build it once up front.
# `|` between rdflib URIRefs forms an alternative property path: a value reached
# via skos:exactMatch, skos:closeMatch, or skos:broadMatch all qualify.
ivoa_match_path = rdflib.SKOS.exactMatch | rdflib.SKOS.closeMatch | rdflib.SKOS.broadMatch

# newline='' is what the csv module asks for on files it writes. The explicit
# UTF-8 encoding avoids depending on the platform's default encoding, so labels
# with non-ASCII characters are written the same way everywhere and can be read
# back by tools that default to UTF-8.
with open(output_path, 'w', newline='', encoding='utf-8') as output_file:
    csv_writer = csv.DictWriter(output_file, fieldnames=output_columns)
    csv_writer.writeheader()

    # Every subject typed as skos:Concept is a tag URI.
    for tag_uri in g.subjects(rdflib.RDF.type, rdflib.SKOS.Concept):

        # Look up the remaining per-concept values. g.value() yields a single
        # value, or None when the concept has none (written as an empty cell).
        tag_prefLabel = g.value(tag_uri, rdflib.SKOS.prefLabel, None)

        # Assumption carried over from the original demo: each concept has at
        # most one such match and it is always an IVOA match; adjust later if
        # that stops being true.
        match_uri_ivoa = g.value(tag_uri, ivoa_match_path, None)

        # Ultimately, we'll want to assign a tag ID, check if the tag is already
        # in the database, etc., but for now just write the row:
        csv_writer.writerow({
            "tag_uri": tag_uri,
            "tag_prefLabel": tag_prefLabel,
            "vocab_title": vocab_title,
            "vocab_level": vocab_level,
            "match_uri_ivoa": match_uri_ivoa,
        })

Keep in mind that, in our current draft schema, this table will eventually include concepts from multiple vocabularies. We'll also need to handle tag URIs that are already in the table, manage unique tag IDs, and so on. But for now, let's look at what we made:

In [9]:
import pandas as pd

# Read the CSV written above back in so the result can be inspected as a table.
# The frame gets its own name rather than `display`: binding that name would
# shadow IPython's display() helper for the rest of the kernel session.
tags_df = pd.read_csv(output_path)
tags_df

Unnamed: 0,tag_uri,tag_prefLabel,vocab_title,vocab_level,match_uri_ivoa
0,https://archive.stsci.edu/rdf/data-product-typ...,Abundance catalogs,DataProductType,product,
1,https://archive.stsci.edu/rdf/data-product-typ...,Astrometric catalogs,DataProductType,product,
2,https://archive.stsci.edu/rdf/data-product-typ...,Catalogs,DataProductType,product,
3,https://archive.stsci.edu/rdf/data-product-typ...,Co-trending basis vectors,DataProductType,product,
4,https://archive.stsci.edu/rdf/data-product-typ...,Code,DataProductType,product,
5,https://archive.stsci.edu/rdf/data-product-typ...,Coronagraphic images,DataProductType,product,
6,https://archive.stsci.edu/rdf/data-product-typ...,Cylindrically projected images,DataProductType,product,
7,https://archive.stsci.edu/rdf/data-product-typ...,Dithered images,DataProductType,product,
8,https://archive.stsci.edu/rdf/data-product-typ...,Doppler images,DataProductType,product,
9,https://archive.stsci.edu/rdf/data-product-typ...,Drizzled images,DataProductType,product,
