In [1]:
import re

import pandas as pd
from rdflib import Graph, Namespace, URIRef, RDFS, DCTERMS, Literal, SKOS, RDF

import numpy as np
import pandas as pd
import os
import math

import urllib.request
import urllib.parse
import re

import matplotlib.pyplot as plt

In [2]:
def curie_terms(terms, graph=None):
    return [t.n3(graph.namespace_manager) for t in terms]
    
def curie_results(query, graph=None):
    return [curie_terms(row, graph) for row in graph.query(query)]

# Select UAT Terms

In [3]:
g_uat = Graph()
g_uat.parse(
    source="https://github.com/astrothesaurus/UAT/raw/v.4.2.0/UAT.rdf",
    format="xml"
)
len(g_uat)

22947

In [5]:
_ = g_uat.serialize(destination="uat.ttl", format="turtle")

Note: Using https://github.com/edmcouncil/rdf-toolkit (v1.12.0)
to normalize text/turtle serialization for helpful git diffs.

In [6]:
bindings = (
    ("skos", SKOS),
    ("uat", Namespace("http://astrothesaurus.org/uat/")),
)
for b in bindings:
    g_uat.namespace_manager.bind(*b)

In [7]:
df_ads = pd.read_csv("https://raw.githubusercontent.com/rmcgranaghan/Helio-KNOW/main/ADS_enrichment/data/UAT_Solar-related-concepts.csv")



In [9]:
paths = []
for row in df_ads.itertuples():
    paths.append([elt for elt in row if isinstance(elt, str)])

In [8]:
uat_helio = []
for path in paths:
    q = f"""
        SELECT ?c WHERE {{
            ?c skos:prefLabel "{path[-1]}"@en .
        }}
    """
    results = curie_results(q, g_uat)
    assert len(results) == 1
    uat_helio.append({"curie": results[0][0],"label_path": path})

In [9]:
len(uat_helio)

444

In [10]:
g_uat_helio = Graph()
g_uat_helio.namespace_manager.bind("uat", Namespace("http://astrothesaurus.org/uat/"))
for term_info in uat_helio:
    curie = term_info["curie"]
    predicate_objects = g_uat.query(f"""
        SELECT ?p ?o WHERE {{
            {curie} ?p ?o .
        }}
    """)
    
    for p, o in predicate_objects:
        g_uat_helio.add((URIRef(curie), p, o))
    subject_predicates = g_uat.query(f"""
        SELECT ?s ?p {{
            ?s ?p {curie} .
        }}
    """)
    
    for s, p in subject_predicates:
        g_uat_helio.add((s, p, URIRef(curie)))        

In [11]:
len(g_uat), len(g_uat_helio)

(22947, 4358)

In [12]:
g_uat_helio.serialize("uat_helio.ttl")

<Graph identifier=N2bdb6fcf290c4e11bdbeec34fccdab22 (<class 'rdflib.graph.Graph'>)>

# Triplify AGU Terms

In [13]:
df_agu = pd.read_excel("https://github.com/rmcgranaghan/Helio-KNOW/raw/main/ADS_enrichment/data/agu-index-terms.xlsx")

In [14]:
URI_BASE = "https://n2t.net/ark:57802/dw0/agu"
vocab = URIRef(URI_BASE)

description_with_related_terms = re.compile(
    r"(?P<description>[^\(]+)\s+\((?P<related>[\d\,\s]+)\)$"
)

g_agu = Graph()

g_agu.add((vocab, RDF.type, SKOS.ConceptScheme))
g_agu.add((vocab, DCTERMS.title, Literal("AGU Index Terms")))
g_agu.add((
    vocab,
    DCTERMS.publisher,
    URIRef("https://www.agu.org/Publish-with-AGU/Publish/Author-Resources/Index-terms")
))

for i in range(len(df_agu)):
    term_info = df_agu.iloc[i]
    code, description = "{:04}".format(term_info['Code']), term_info['Description']
    term = URIRef(f"{URI_BASE}/{code}")
    
    g_agu.add((term, RDF.type, SKOS.Concept))
    g_agu.add((term, SKOS.prefLabel, Literal(code)))
    g_agu.add((term, RDFS.isDefinedBy, vocab))
    g_agu.add((term, RDFS.comment, Literal(f"CODE: {code}")))
    
    m = re.fullmatch(description_with_related_terms, description)
    if m:
        related_terms = []
        for _code in re.split(r"[,\s]+", m.group('related')):
            related_terms.append(URIRef(f"{URI_BASE}/{_code}"))
        for rt in related_terms:
            g_agu.add((term, SKOS.related, rt))
        description = m.group('description')
    g_agu.add((term, SKOS.definition, Literal(description)))
    
    if code.endswith("00"):
        g_agu.add((term, SKOS.topConceptOf, vocab))
        g_agu.add((vocab, SKOS.hasTopConcept, term))
    else:
        top_concept = URIRef(f"{URI_BASE}/{code[:2]}00")
        g_agu.add((term, SKOS.broader, top_concept))
        g_agu.add((top_concept, SKOS.narrower, term))

In [15]:
g_agu.serialize("agu_index_terms.ttl")

<Graph identifier=N8fa2e86f4fd2442492713d4072b98e24 (<class 'rdflib.graph.Graph'>)>

Pushed to <https://n2t.net/ark:57802/dw0/agu>

# Select AGU Terms

In [16]:
g_agu = Graph()
g_agu.parse("https://n2t.net/ark:57802/dw0/agu")
g_agu.namespace_manager.bind("agu", Namespace("https://n2t.net/ark:57802/dw0/agu/"))

In [17]:
agu_selection = list(g_agu.query("""
    SELECT ?c WHERE {
        ?c a skos:Concept .
        ?c skos:prefLabel ?code .
        FILTER(substr(?code,1,2) = ?code_prefix)
        FILTER(substr(?code,3,2) != "99")
        VALUES ?code_prefix { "19" "21" "24" "27" "32" "43" "69" "75" "78" }
    }
"""))

In [18]:
len(agu_selection)

272

In [19]:
g_agu_helio = Graph()
g_agu_helio.namespace_manager.bind("agu", Namespace("https://n2t.net/ark:57802/dw0/agu/"))
for row in agu_selection:
    term = row[0]
    t = term.n3()
    predicate_objects = g_agu.query(f"""
        SELECT ?p ?o WHERE {{
            {t} ?p ?o .
        }}
    """)
    
    for p, o in predicate_objects:
        g_agu_helio.add((term, p, o))
    subject_predicates = g_agu.query(f"""
        SELECT ?s ?p {{
            ?s ?p {t} .
        }}
    """)
    
    for s, p in subject_predicates:
        g_agu_helio.add((s, p, term)) 

In [20]:
len(g_agu_helio)

2290

In [21]:
g_agu_helio.serialize("agu_helio.ttl")

<Graph identifier=N03c32a493d82405ba17be1b8d42546cd (<class 'rdflib.graph.Graph'>)>

# Make Candidate UAT incorporating AGU terms

In [22]:
q = """
    SELECT ?c ?descr WHERE {
        ?c skos:definition ?descr .
        ?c skos:topConceptOf ?v .
    }
"""
curie_results(q, g_agu_helio)

[['agu:2400', '"IONOSPHERE"'],
 ['agu:6900', '"RADIO SCIENCE"'],
 ['agu:4300', '"NATURAL HAZARDS"'],
 ['agu:7800', '"SPACE PLASMA PHYSICS"'],
 ['agu:1900', '"INFORMATICS"'],
 ['agu:2700', '"MAGNETOSPHERIC PHYSICS"'],
 ['agu:3200', '"MATHEMATICAL GEOPHYSICS"'],
 ['agu:2100', '"INTERPLANETARY PHYSICS"'],
 ['agu:7500', '"SOLAR PHYSICS, ASTROPHYSICS, AND ASTRONOMY"']]

In [23]:
q = """
    SELECT ?c ?descr WHERE {
        ?c skos:prefLabel ?descr .
        ?c skos:topConceptOf ?v .
    }
"""
curie_results(q, g_uat_helio)

[['<uat:1476>', '"Solar physics"@en'],
 ['<uat:1529>', '"Solar system astronomy"@en']]

In [24]:
q = """
    SELECT ?c ?descr WHERE {
        ?c skos:definition ?descr .
        ?c skos:broader agu:7500 .
    }
"""
curie_results(q, g_agu_helio)

[['agu:7546', '"Transition region"'],
 ['agu:7538', '"Solar irradiance"'],
 ['agu:7549', '"Ultraviolet emissions"'],
 ['agu:7519', '"Flares"'],
 ['agu:7594', '"Instruments and techniques"'],
 ['agu:7537', '"Solar and stellar variability"'],
 ['agu:7526', '"Magnetic reconnection"'],
 ['agu:7554', '"X-rays, gamma rays, and neutrinos"'],
 ['agu:7536', '"Solar activity cycle"'],
 ['agu:7524', '"Magnetic fields"'],
 ['agu:7534', '"Radio emissions"'],
 ['agu:7511', '"Coronal holes"'],
 ['agu:7507', '"Chromosphere"'],
 ['agu:7529', '"Photosphere"'],
 ['agu:7514', '"Energetic particles"'],
 ['agu:7509', '"Corona"'],
 ['agu:7531', '"Prominence eruptions"'],
 ['agu:7522', '"Helioseismology"'],
 ['agu:7544', '"Stellar interiors and dynamo theory"'],
 ['agu:7539', '"Stellar astronomy"'],
 ['agu:7513', '"Coronal mass ejections"'],
 ['agu:7504', '"Celestial mechanics"']]

approach to try for winnowing:

- index each node of AUT helio as a document with OpenSearch. Perhaps include all hierarchy (above and below node) and appropriately tune query logic to boost/discount as appropriate.
- for each leaf node of AGU helio:
    - form a query as the concatenation of all labels from hierarchy
    - find and rank relevant matches in the index
    - note any (AGU helio, AUT helio) node pairs that have high scores.
- {(AGU helio, AUT helio)} nodes with high scores are candidates for SSSOM SKOS mapping statements