Note: This is a work in progress in which I am shifting to the Geoscience Ontology as the source of geologic time information in the GeoKB, along with a number of other reference sets.

The process of getting geologic time intervals organized into the GeoKB ended up being a little messy as I worked through exactly how to represent the entities and where to source them. I did some original work on the GeoSciML SKOS representation of the International Chronostratigraphic Chart and then wound up incorporating most of the Geoscience Ontology's approach to generalized time intervals and sequencing, along with identifying the named time interval entities from the USGS source. With the items instantiated and "same as" claims pointing to the original source IRIs from the Geoscience Ontology, the other work in this notebook can proceed. The history record in the GeoKB kept track of the messy process, much of which is also accounted for in older commits of this notebook.

In [1]:
import requests
from wbmaker import WikibaseConnection
import rdflib
import re
import pandas as pd

geokb = WikibaseConnection("GEOKB_CLOUD")

In [2]:
# Item-valued "knowledge source" statements; gso_ref is attached as the
# reference on the time sequence claims written further down.
# NOTE(review): Q161225 / Q26294 are presumably the GeoKB items for the
# Geoscience Ontology and the ICS chart respectively - confirm.
gso_ref = geokb.datatypes.Item(value='Q161225', prop_nr=geokb.prop_lookup['knowledge source'])
ics_ref = geokb.datatypes.Item(value='Q26294', prop_nr=geokb.prop_lookup['knowledge source'])

# Load the GSO geologic time module (Turtle) into an in-memory graph
gso_gst = rdflib.Graph()
gso_gst.parse(
    "https://raw.githubusercontent.com/Loop3D/GKM/master/Loop3D-GSO/Modules/GSO-Geologic_Time.ttl",
    format="ttl",
)

<Graph identifier=Ndc102daf1a3349fc99fd49f46160ca42 (<class 'rdflib.graph.Graph'>)>

In [3]:
def aggregator(x):
    """Collapse one grouped column into a list of its distinct non-null values.

    Intended as the callback passed to ``groupby(...).agg(...)``.

    Args:
        x: pandas Series holding the values of one column for one group.

    Returns:
        list: the unique non-null values of ``x`` in first-seen order
        (empty when every value is null).
    """
    # dict.fromkeys de-duplicates while keeping insertion order. A set's
    # iteration order for strings can differ between interpreter runs, which
    # made "take element 0" on the aggregated lists unstable.
    return list(dict.fromkeys(x.dropna()))

def df_from_graph(query_results):
    """Convert SPARQL SELECT results into a DataFrame of strings.

    Args:
        query_results: iterable of result rows that also exposes ``vars``,
            the projected variables in row order (e.g. what
            ``rdflib.Graph.query`` returns).

    Returns:
        pandas.DataFrame: one column per variable (named ``str(var)``) and one
        row per result. Every cell is ``str(value)``, so a ``None`` value is
        stored as the string ``'None'``.
    """
    # Resolve the column names once instead of once per row
    columns = [str(var) for var in (query_results.vars or [])]

    rows = [
        {column: str(row[position]) for position, column in enumerate(columns)}
        for row in query_results
    ]

    # Passing the columns explicitly keeps the expected headers even when the
    # query matched nothing, so later merges/groupbys on a column still work.
    return pd.DataFrame(rows, columns=columns)


# Query for information from the GSO geologic time module

In [4]:
# SPARQL: every labelled entity typed as one of the listed gst rank classes,
# plus its optional sequencing (next/previous/contains) and annotation
# (comment/source/description) properties
query_age = """
    SELECT ?iri ?type ?label
    ?nextTimeInterval ?previousTimeInterval ?timeContains
    ?comment ?source ?description
    WHERE {
        ?iri rdf:type ?type ;
            rdfs:label ?label .
        VALUES ?type { gst:Supereon gst:Eon gst:Era gst:Period gst:Subperiod gst:Epoch gst:Age } .
        OPTIONAL {
            ?iri gsoc:nextTimeInterval ?nextTimeInterval ;
        }
        OPTIONAL {
            ?iri gsoc:previousTimeInterval ?previousTimeInterval ;
        }
        OPTIONAL {
            ?iri gsoc:timeContains ?timeContains ;
        }
        OPTIONAL {
            ?iri rdfs:comment ?comment .
        }
        OPTIONAL {
            ?iri dct:source ?source .
        }
        OPTIONAL {
            ?iri dct:description ?description .
        }
    }
"""

# SPARQL: subjects in the dbp: namespace that are declared owl:sameAs some IRI
query_dbp_links = """
SELECT ?dbp_link ?iri
WHERE {
    ?dbp_link owl:sameAs ?iri .
    FILTER (STRSTARTS(STR(?dbp_link), STR(dbp:)))
}
"""

# Run both queries against the parsed GSO geologic time graph
gstime_ages = gso_gst.query(query_age)
gstime_dbp_links = gso_gst.query(query_dbp_links)

# Tabulate the results as string-valued dataframes
dbp_links = df_from_graph(gstime_dbp_links)
df_gstime_ages = df_from_graph(gstime_ages)

# Left join so intervals without a DBpedia link are kept
df_gstime_ages = df_gstime_ages.merge(dbp_links, how='left', on='iri')
df_gstime_ages['label'] = df_gstime_ages['label'].str.strip()

# One row per IRI; every other column becomes a list of distinct values
gstime_ages_grouped = df_gstime_ages.groupby('iri', as_index=False).agg(aggregator)

# Keep a single label, and derive the item type from the last path segment
# of the first type IRI
gstime_ages_grouped['label'] = gstime_ages_grouped['label'].apply(
    lambda labels: labels[0]
)
gstime_ages_grouped['item_type'] = gstime_ages_grouped['type'].apply(
    lambda type_iris: type_iris[0].split('/')[-1]
)

# Query for GeoKB entities

In [11]:
# SPARQL against the GeoKB: items whose P84 value (bound to ?same_as) contains
# the GSO geologic time namespace, with their optional P161 (?next_interval)
# and P162 (?previous_interval) values
geokb_geotime_query = """
PREFIX wd: <https://geokb.wikibase.cloud/entity/>
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>

SELECT ?item ?itemLabel ?same_as ?next_interval ?previous_interval
WHERE {
  ?item wdt:P84 ?same_as .
  OPTIONAL {
    ?item wdt:P161 ?next_interval .
  }
  OPTIONAL {
    ?item wdt:P162 ?previous_interval .
  }
  FILTER CONTAINS(STR(?same_as), "w3id.org/gso/geologictime/")
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . } 
}
"""

geokb_geotime = geokb.sparql_query(geokb_geotime_query)

# Reduce entity URIs to their trailing identifier (e.g. a QID)
geokb_geotime['item'] = geokb_geotime['item'].apply(lambda x: x.split('/')[-1])

# The OPTIONAL columns can be unbound. Depending on how the results were
# tabulated that may surface as None, '' or NaN; NaN is truthy, so a plain
# `if x` check would try to call .split on a float. Test for a non-empty
# string explicitly and map everything else to None.
for interval_col in ('next_interval', 'previous_interval'):
    geokb_geotime[interval_col] = geokb_geotime[interval_col].apply(
        lambda x: x.split('/')[-1] if isinstance(x, str) and x else None
    )

# One row per (item, label, same_as) with the interval columns collapsed to lists
geokb_geotime_grouped = (
    geokb_geotime
    .groupby(['item', 'itemLabel', 'same_as'], as_index=False)
    .agg(aggregator)
    .reset_index(drop=True)
)

# IRI to GeoKB QID Map
geokb_gstime_lookup = geokb_geotime_grouped.set_index('same_as')['item'].to_dict()

# Manually added mapping.
# NOTE(review): presumably this IRI is not returned by the query above - confirm.
geokb_gstime_lookup.update({
    'https://w3id.org/gso/geologictime/CambrianStage9Age': 'Q161386'
})

# Time interval sequence claims

In [12]:
def _related_qid_lists(grouped, iri_col, qid_col, lookup):
    """Build, per subject QID, the list of QIDs for one list-valued IRI column.

    Args:
        grouped: DataFrame with an 'iri_qid' column and a list-valued
            ``iri_col`` column.
        iri_col: name of the column holding lists of related IRIs.
        qid_col: name to give the resulting column of QID lists.
        lookup: dict mapping IRI -> QID.

    Returns:
        pandas.DataFrame with columns ['iri_qid', qid_col], one row per
        subject QID that has at least one related IRI found in ``lookup``.
    """
    # One row per (subject, related IRI); rows with a missing subject QID or
    # an empty list are dropped here
    pairs = grouped[['iri_qid', iri_col]].explode(iri_col).dropna()

    # Missing values were stringified upstream, so they appear as 'None'
    pairs = pairs[pairs[iri_col] != 'None']

    # assign() returns a new frame, which avoids setting a column on a
    # filtered slice (the chained-assignment pattern pandas warns about)
    pairs = pairs.assign(
        **{qid_col: pairs[iri_col].apply(lambda iri: lookup.get(iri, None))}
    )

    # Related IRIs with no QID in the lookup are discarded
    pairs = pairs.dropna(subset=[qid_col])

    return (
        pairs[['iri_qid', qid_col]]
        .drop_duplicates()
        .groupby('iri_qid', as_index=False)
        .agg(list)
    )


# Assign QIDs to main subject items
gstime_ages_grouped['iri_qid'] = gstime_ages_grouped['iri'].apply(lambda x: geokb_gstime_lookup.get(x, None))

# Assign QIDs to time contains objects
time_contains = _related_qid_lists(
    gstime_ages_grouped, 'timeContains', 'time_contains_qid', geokb_gstime_lookup
)

# Assign QIDs to next time intervals
next_time_intervals = _related_qid_lists(
    gstime_ages_grouped, 'nextTimeInterval', 'next_time_qid', geokb_gstime_lookup
)

# Assign QIDs to previous time intervals
previous_time_intervals = _related_qid_lists(
    gstime_ages_grouped, 'previousTimeInterval', 'previous_time_qid', geokb_gstime_lookup
)

# Outer joins keep every subject that has at least one kind of claim; the
# columns it lacks are left as NaN
time_sequence_claims = pd.merge(
    left=next_time_intervals,
    right=previous_time_intervals,
    how='outer',
    on='iri_qid'
)

time_sequence_claims = pd.merge(
    left=time_sequence_claims,
    right=time_contains,
    how='outer',
    on='iri_qid'
)

time_sequence_claims.head()

Unnamed: 0,iri_qid,next_time_qid,previous_time_qid,time_contains_qid
0,Q161380,[Q26342],[Q161381],
1,Q161381,[Q161380],[Q26318],
2,Q161382,[Q161383],[Q26328],
3,Q161383,"[Q26403, Q26415]","[Q26330, Q161382]",
4,Q161384,[Q26382],"[Q26341, Q26351]",


In [None]:
# Write the sequence claims to the GeoKB, one item write per dataframe row.
# Each *_qid cell is a list of QIDs when values exist and NaN otherwise (from
# the outer merges), hence the isinstance checks.
for _, claim_row in time_sequence_claims.iterrows():
    item = geokb.wbi.item.get(claim_row['iri_qid'])

    next_qids = claim_row['next_time_qid']
    if isinstance(next_qids, list):
        next_time_interval_claims = [
            geokb.datatypes.Item(
                prop_nr=geokb.prop_lookup['next time interval'],
                value=qid,
                references=[gso_ref]
            )
            for qid in next_qids
        ]
        item.claims.add(
            next_time_interval_claims,
            action_if_exists=geokb.action_if_exists.REPLACE_ALL
        )

    previous_qids = claim_row['previous_time_qid']
    if isinstance(previous_qids, list):
        previous_time_interval_claims = [
            geokb.datatypes.Item(
                prop_nr=geokb.prop_lookup['previous time interval'],
                value=qid,
                references=[gso_ref]
            )
            for qid in previous_qids
        ]
        item.claims.add(
            previous_time_interval_claims,
            action_if_exists=geokb.action_if_exists.REPLACE_ALL
        )

    contained_qids = claim_row['time_contains_qid']
    if isinstance(contained_qids, list):
        # This property is addressed by its literal ID rather than via prop_lookup
        time_contains_claims = [
            geokb.datatypes.Item(
                prop_nr='P163',
                value=qid,
                references=[gso_ref]
            )
            for qid in contained_qids
        ]
        item.claims.add(
            time_contains_claims,
            action_if_exists=geokb.action_if_exists.REPLACE_ALL
        )

    response = item.write(summary="Adding time sequence claims")
    print(response.id)

# Same as links

In [13]:
# One row per GeoKB item holding the list of its distinct DBpedia links;
# rows lacking either a QID or a link are dropped after the explode
dbp_sameas_links = (
    gstime_ages_grouped[['iri_qid', 'dbp_link']]
    .explode('dbp_link')
    .dropna()
    .drop_duplicates()
    .groupby('iri_qid', as_index=False)
    .agg(list)
)
dbp_sameas_links.head()

Unnamed: 0,iri_qid,dbp_link
0,Q161382,"[http://dbpedia.org/resource/ChibanianAge, htt..."
1,Q161383,[http://dbpedia.org/resource/UpperPleistocene]
2,Q161384,[http://dbpedia.org/resource/CambrianStage5Age]
3,Q26295,[http://dbpedia.org/resource/Aalenian]
4,Q26296,[http://dbpedia.org/resource/Bajocian]


In [15]:
# Merge the DBpedia links into each item's existing "same as" URLs and write
# the combined set back, but only for items that carry a GSO geologic time IRI.
same_as_prop = geokb.prop_lookup['same as']  # loop-invariant lookup

for _, row in dbp_sameas_links.iterrows():
    item = geokb.wbi.item.get(row['iri_qid'])

    existing_links = [
        claim.mainsnak.datavalue['value']
        for claim in item.claims.get(same_as_prop)
    ]

    # Sorted union: a bare set would make the order of the written claims
    # differ from run to run
    same_as_links = sorted(set(existing_links) | set(row['dbp_link']))

    gso_links = [link for link in same_as_links if 'w3id.org/gso/geologictime/' in link]
    if not gso_links:
        print('No GSO links found for {}'.format(row['iri_qid']))
        continue

    # NOTE(review): the claims are rebuilt from bare URL values, so references
    # or qualifiers on the existing same-as claims are presumably not carried
    # over by REPLACE_ALL - confirm this is intended.
    same_as_claims = [
        geokb.datatypes.URL(
            prop_nr=same_as_prop,
            value=link
        )
        for link in same_as_links
    ]
    item.claims.add(same_as_claims, action_if_exists=geokb.action_if_exists.REPLACE_ALL)
    response = item.write(
        summary="Updated same as claims to include all DBPedia links"
    )
    print(response.id, gso_links)

Q161382 ['https://w3id.org/gso/geologictime/MiddlePleistoceneAge']
Q161383 ['https://w3id.org/gso/geologictime/UpperPleistoceneAge']
Q161384 ['https://w3id.org/gso/geologictime/WuliuanAge']
Q26295 ['https://w3id.org/gso/geologictime/AalenianAge']
Q26296 ['https://w3id.org/gso/geologictime/BajocianAge']
Q26297 ['https://w3id.org/gso/geologictime/BathonianAge']
Q26298 ['https://w3id.org/gso/geologictime/CallovianAge']
Q26299 ['https://w3id.org/gso/geologictime/AeronianAge']
Q26300 ['https://w3id.org/gso/geologictime/RhuddanianAge']
Q26301 ['https://w3id.org/gso/geologictime/TelychianAge']
Q26302 ['https://w3id.org/gso/geologictime/AlbianAge']
Q26303 ['https://w3id.org/gso/geologictime/AptianAge']
Q26304 ['https://w3id.org/gso/geologictime/BarremianAge']
Q26305 ['https://w3id.org/gso/geologictime/BerriasianAge']
Q26306 ['https://w3id.org/gso/geologictime/HauterivianAge']
Q26307 ['https://w3id.org/gso/geologictime/ValanginianAge']
Q26308 ['https://w3id.org/gso/geologictime/AnisianAge']
Q26