In [49]:
import rdflib
import pandas as pd
import re
from wbmaker import WikibaseConnection

# Shared GeoKB handle used throughout the notebook: SPARQL queries (`sparql_query`),
# item reads and writes (`wbi`), talk-page edits (`mw_site`), and the property-name
# lookup (`prop_lookup`).
# NOTE(review): 'GEOKB_CLOUD' presumably names a connection/credential profile that
# wbmaker resolves - confirm where it is configured.
geokb = WikibaseConnection('GEOKB_CLOUD')

# Current GeoKB Classes

In [2]:
# Every item at or below wd:Q41261 in the P2 (subClassOf) hierarchy, returned as one row
# per (item, direct parent, same-as URL) combination. P84 (same_as) is OPTIONAL, so that
# column is empty for items without such a claim.
geokb_rock_query = """
PREFIX wd: <https://geokb.wikibase.cloud/entity/>
PREFIX wdt: <https://geokb.wikibase.cloud/prop/direct/>

SELECT ?rock ?rockLabel ?rockDescription ?subClassOf ?subClassOfLabel ?same_as
WHERE {
  ?rock wdt:P2* wd:Q41261 ;
        wdt:P2 ?subClassOf .
  OPTIONAL {
    ?rock wdt:P84 ?same_as .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
"""


def _unique_sorted(values):
    """Return the distinct, non-null members of ``values`` as a sorted list.

    Nulls come from the OPTIONAL same-as pattern; leaving them in would make later
    cells treat a missing value as a URL. Sorting keeps re-runs reproducible, because
    set iteration order is not stable between kernels.
    """
    return sorted({value for value in values if pd.notna(value)})


# Raw query result plus the bare QIDs (last path segment of each entity URI).
geokb_rocks_raw = geokb.sparql_query(geokb_rock_query).assign(
    rock_qid=lambda d: d['rock'].str.rsplit('/', n=1).str[-1],
    subclass_qid=lambda d: d['subClassOf'].str.rsplit('/', n=1).str[-1],
)

# Collapse to one row per item.
# NOTE(review): groupby drops rows whose key is null, so an item without an English
# description would disappear here - confirm that is intended.
geokb_rocks = geokb_rocks_raw.groupby(
    ['rock_qid', 'rockLabel', 'rock', 'rockDescription'], as_index=False
)[['subclass_qid', 'subClassOfLabel', 'same_as']].agg(set)

# The three list columns are de-duplicated independently, so position i of
# `subclass_qid` does not necessarily correspond to position i of `subClassOfLabel`.
for list_column in ['subclass_qid', 'subClassOfLabel', 'same_as']:
    geokb_rocks[list_column] = geokb_rocks[list_column].apply(_unique_sorted)

# Hyphen-free label variant, used later to match GSRM labels.
geokb_rocks['altRockLabel'] = geokb_rocks['rockLabel'].str.replace('-', ' ', regex=False)

# Long form: one row per (item, same-as URL); items without any same-as URL drop out.
geokb_rocks_sameas = geokb_rocks[['rock_qid', 'same_as']].explode('same_as').dropna().reset_index(drop=True)

geokb_rocks.head(10)

Unnamed: 0,rock_qid,rockLabel,rock,rockDescription,subclass_qid,subClassOfLabel,same_as,altRockLabel
0,Q158195,composite genesis rock,https://geokb.wikibase.cloud/entity/Q158195,Rock formed by geological modification of pre-...,[Q41261],[rock material],[http://resource.geosciml.org/classifier/cgi/l...,composite genesis rock
1,Q158196,composite genesis material,https://geokb.wikibase.cloud/entity/Q158196,Material of unspecified consolidation state fo...,"[Q41261, Q158193]","[compound material, rock material]",[https://w3id.org/gso/rockmaterial/compositege...,composite genesis material
2,Q158197,igneous material,https://geokb.wikibase.cloud/entity/Q158197,Earth material formed as a result of igneous p...,"[Q41261, Q158193]","[compound material, rock material]",[https://w3id.org/gso/rockmaterial/igneousmate...,igneous material
3,Q158198,sedimentary material,https://geokb.wikibase.cloud/entity/Q158198,formed by accumulation of solid fragmental mat...,"[Q41261, Q158193]","[compound material, rock material]",[https://w3id.org/gso/rockmaterial/sedimentary...,sedimentary material
4,Q158199,unconsolidated material,https://geokb.wikibase.cloud/entity/Q158199,Compound material composed of an aggregation o...,"[Q41261, Q158193]","[compound material, rock material]",[https://w3id.org/gso/rockmaterial/unconsolida...,unconsolidated material
5,Q158200,anthropogenic material,https://geokb.wikibase.cloud/entity/Q158200,Material known to have artificial (human-relat...,"[Q41261, Q158193]","[compound material, rock material]",[https://w3id.org/gso/rockmaterial/anthropogen...,anthropogenic material
6,Q158201,phaneritic igneous rock,https://geokb.wikibase.cloud/entity/Q158201,Igneous rock in which the framework of the roc...,[Q41459],[igneous rock],[https://w3id.org/gso/rockmaterial/phaneritici...,phaneritic igneous rock
7,Q158202,fine grained igneous rock,https://geokb.wikibase.cloud/entity/Q158202,Igneous rock in which the framework of the roc...,[Q41459],[igneous rock],[http://resource.geosciml.org/classifier/cgi/l...,fine grained igneous rock
8,Q158203,high magnesium fine grained igneous rock,https://geokb.wikibase.cloud/entity/Q158203,fine-grained igneous rock that contains unusua...,[Q158202],[fine grained igneous rock],[http://resource.geosciml.org/classifier/cgi/l...,high magnesium fine grained igneous rock
9,Q161226,anthropogenic unconsolidated material,https://geokb.wikibase.cloud/entity/Q161226,unconsolidated material known to have artifici...,"[Q158200, Q158199]","[anthropogenic material, unconsolidated material]",[https://w3id.org/gso/rockmaterial/anthropogen...,anthropogenic unconsolidated material


# GSO Rock Materials

In [3]:
# Turtle source of the GSO "Geologic Rock Material" module. It is fetched from the
# `master` branch, so the content can change between runs.
GSRM_TTL_URL = "https://raw.githubusercontent.com/Loop3D/GKM/master/Loop3D-GSO/Modules/GSO-Geologic_Rock_Material.ttl"

gsrm = rdflib.Graph()
gsrm.parse(GSRM_TTL_URL, format="ttl")

# One result row per (class, label, comment, parent, source) combination, limited to
# classes whose IRI starts with the gsrm: namespace.
# NOTE(review): the owl:, rdfs:, dct: and gsrm: prefixes are not declared in the query;
# presumably they resolve from the namespace bindings of the parsed file - confirm.
query_rocks = """
    SELECT ?rock ?rock_label ?rock_comment ?subclass_of ?source
    WHERE {
        ?rock a owl:Class ;
            rdfs:label ?rock_label ;
            rdfs:comment ?rock_comment ;
            rdfs:subClassOf ?subclass_of ;
            dct:source ?source .
        FILTER (STRSTARTS(STR(?rock), STR(gsrm:)))
    }
"""

gsrm_rocks = gsrm.query(query_rocks)

def _normalize_gsrm_iri(iri):
    """Return ``iri`` lower-cased with every '_' and '-' removed.

    Class IRIs and parent-class IRIs must go through the same normalization,
    otherwise a parent whose IRI contains a hyphen can never be joined back to the
    row that describes it.
    """
    return str(iri).replace('_', '').replace('-', '').lower()


# One dict per SPARQL result row, in SELECT order.
rows = [
    {
        'rock': rock,
        'rock_label': rock_label,
        'rock_comment': rock_comment,
        'subclass_of': subclass_of,
        'source': source,
    }
    for rock, rock_label, rock_comment, subclass_of, source in gsrm_rocks
]
df_gsrm_raw = pd.DataFrame(rows)

# Unwrap the RDF terms into plain Python values.
df_gsrm_raw['rockLabel'] = df_gsrm_raw['rock_label'].apply(lambda x: x.value)
df_gsrm_raw['rock_labels_lang'] = df_gsrm_raw['rock_label'].apply(lambda x: x.language)
df_gsrm_raw['rockDescription'] = df_gsrm_raw['rock_comment'].apply(lambda x: x.value)
df_gsrm_raw['reference'] = df_gsrm_raw['source'].apply(lambda x: x.value)

# Same normalization on both sides of the class -> parent relationship.
df_gsrm_raw['iri'] = df_gsrm_raw['rock'].apply(_normalize_gsrm_iri)
df_gsrm_raw['subClassOf'] = df_gsrm_raw['subclass_of'].apply(_normalize_gsrm_iri)

# Keep rows with an English label only, then collapse to one row per class; the
# remaining columns become sets of the distinct values seen for that class.
df_gsrm = (
    df_gsrm_raw.loc[
        df_gsrm_raw['rock_labels_lang'] == 'en',
        ['iri', 'rockLabel', 'rockDescription', 'reference', 'subClassOf'],
    ]
    .reset_index(drop=True)
    .convert_dtypes()
    .groupby(['iri', 'rockLabel'], as_index=False)
    .agg(set)
    .reset_index(drop=True)
)

df_gsrm.head(10)

Unnamed: 0,iri,rockLabel,rockDescription,reference,subClassOf
0,https://w3id.org/gso/rockmaterial/acidicigneou...,acidic igneous material,{Igneous material with more than 63 percent Si...,{after LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/igneousmate...
1,https://w3id.org/gso/rockmaterial/acidicigneou...,acidic igneous rock,{Igneous rock with more than 63 percent SiO2.},{after LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/acidicigneo...
2,https://w3id.org/gso/rockmaterial/advancedargi...,advanced argillic altered rock,{Advanced argillic alteration occurs under low...,"{Antonio Arribas, Jeffrey Hedenquist, 2019, En...",{https://w3id.org/gso/rockmaterial/alteredrock}
3,https://w3id.org/gso/rockmaterial/albiticalter...,albitic altered rock,{definition missing},{CGI alterationtype SKOS vocabulary 2012-11-24},{https://w3id.org/gso/rockmaterial/alteredrock}
4,https://w3id.org/gso/rockmaterial/alkalifeldsp...,alkali feldspar granite,{Granitic rock that has a plagioclase to total...,{LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/granitoid}
5,https://w3id.org/gso/rockmaterial/alkalifeldsp...,alkali feldspar rhyolite,{Rhyolitoid in which the ratio of plagioclase ...,{LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/rhyolitoid}
6,https://w3id.org/gso/rockmaterial/alkalifeldsp...,alkali feldspar syenite,{Alkali feldspar syenitic rock that contains 0...,{LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/alkalifelds...
7,https://w3id.org/gso/rockmaterial/alkalifeldsp...,alkali feldspar syenitic rock,{Syenitoid with a plagioclase to total feldspa...,{LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/syenitoid}
8,https://w3id.org/gso/rockmaterial/alkalifeldsp...,alkali feldspar trachyte,{Trachytoid that has a plagioclase to total fe...,{LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/alkalifelds...
9,https://w3id.org/gso/rockmaterial/alkalifeldsp...,alkali feldspar trachytic rock,{Trachytoid that has a plagioclase to total fe...,{LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/trachytoid}


## GSRM Transformations

In [4]:
def split_desc(x, max_len=250):
    """Split a collection of description strings into a short and a long description.

    Parameters
    ----------
    x : iterable of str, str, or None
        The descriptions recorded for one class. A set (or frozenset) is sorted first
        so the result does not depend on set iteration order; any other iterable keeps
        its own order. A bare string is treated as a single description. Members that
        are not strings are ignored.
    max_len : int, default 250
        Maximum length of the short description.

    Returns
    -------
    tuple
        ``(short_desc, long_desc)``.

        * ``short_desc`` is the first description when it fits in ``max_len``;
          otherwise the text before its first '.', when that fits; otherwise the first
          ``max_len`` characters.
        * ``long_desc`` is all descriptions joined with newlines, or ``None`` when
          there is exactly one description and it already fits in ``max_len``.
        * ``(None, None)`` when there is no description at all.

        A short description of exactly 'definition missing' is replaced by a generic
        placeholder sentence.
    """
    if x is None:
        return None, None
    if isinstance(x, str):
        x = [x]

    descs = [desc for desc in x if isinstance(desc, str)]
    if isinstance(x, (set, frozenset)):
        # Sets have no stable order; sort so "first" means the same thing on every run.
        descs.sort()
    if not descs:
        return None, None

    first_desc = descs[0]

    if len(first_desc) <= max_len:
        short_desc = first_desc
    else:
        first_sentence = first_desc.split('.')[0]
        if len(first_sentence) <= max_len:
            short_desc = first_sentence
        else:
            short_desc = first_desc[:max_len]

    # The long form only adds information when something was cut or merged.
    if len(descs) == 1 and len(first_desc) <= max_len:
        long_desc = None
    else:
        long_desc = '\n'.join(descs)

    if short_desc == 'definition missing':
        short_desc = 'a rock material class from the Geoscience Ontology missing a definition'

    return short_desc, long_desc
        
# split_desc returns a (short, long) pair per class; spread each pair over two columns.
_desc_pairs = df_gsrm['rockDescription'].apply(split_desc)
df_gsrm[['description', 'long_description']] = _desc_pairs.apply(pd.Series)

In [5]:
def reference_items(x):
    """Return the GeoKB item ids whose keyword occurs in the citation text ``x``.

    Matching is a case-sensitive substring test. The result follows the order of the
    table below, not the order of appearance in ``x``; it is an empty list when no
    keyword matches.
    """
    keyword_qids = (
        ('LeMaitre', 'Q161230'),
        ('Neuendorf', 'Q161227'),
        ('SLTT', 'Q70308'),
        ('consolidationdegree', 'Q161232'),
        ('alterationtype', 'Q161231'),
    )
    return [qid for keyword, qid in keyword_qids if keyword in x]

_REFERENCE_URL_PATTERN = re.compile(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[/\w.-]*')


def _split_reference(raw_reference):
    """Break a ';'-separated citation string into stripped fragments."""
    return [fragment.strip() for fragment in raw_reference.split(';')]


def _reference_string(ref_row):
    """Return the citation text only when nothing more specific was extracted from it.

    The result is None for the literal 'this vocabulary' (any casing) and for
    fragments that already produced reference items or URLs.
    """
    if ref_row['reference'].lower() == 'this vocabulary':
        return None
    if ref_row['reference_items'] or ref_row['reference_urls']:
        return None
    return ref_row['reference']


# One row per (class IRI, citation string) ...
gsrm_references = (
    df_gsrm[['iri', 'reference']]
    .explode('reference')
    .dropna()
    .reset_index(drop=True)
)
# ... then one row per (class IRI, citation fragment).
gsrm_references['reference'] = gsrm_references['reference'].apply(_split_reference)
gsrm_references = gsrm_references.explode('reference').reset_index(drop=True)

gsrm_references['reference_items'] = gsrm_references['reference'].apply(reference_items)
gsrm_references['reference_urls'] = gsrm_references['reference'].apply(_REFERENCE_URL_PATTERN.findall)
gsrm_references['reference_string'] = gsrm_references.apply(_reference_string, axis=1)


# Initial Pass
In this initial pass, I'll update the GeoKB items that match a GSO-GSRM class purely by label (comparing the GSO label with the GeoKB label after its hyphens are replaced by spaces). For each matched item:
* Add same as claim pointing to corrected IRI for GSO entity
* Change label to the GSO variation (without hyphens), adding the Mindat variation as an alias
* Change descriptions to GSO variation
* Add the Mindat description and longer GSO-GSRM description (where applicable) to the item discussion page

In [6]:
# Left join: every GSRM class, with the GeoKB item (if any) whose hyphen-free label is
# identical to the GSRM label. The GeoKB label is kept as `current_label`.
gsrm_in_geokb = df_gsrm[['iri', 'rockLabel', 'description', 'long_description']].merge(
    geokb_rocks[['rock_qid', 'altRockLabel', 'rockLabel', 'same_as', 'rockDescription']]
    .rename(columns={'rockLabel': 'current_label'}),
    how='left',
    left_on='rockLabel',
    right_on='altRockLabel',
)

# Label matches whose GSRM IRI is not yet recorded as a same-as URL on any GeoKB item.
gsrm_in_geokb.loc[
    lambda d: d['rock_qid'].notna() & ~d['iri'].isin(geokb_rocks_sameas['same_as'])
].head()

Unnamed: 0,iri,rockLabel,description,long_description,rock_qid,altRockLabel,current_label,same_as,rockDescription


In [None]:
# QIDs already written by an earlier run of this cell are skipped. On a fresh kernel the
# name does not exist yet, so start with an empty set instead of raising NameError.
processed_qids = set(globals().get('processed_qids', ()))

# Label matches whose GSRM IRI is not yet a same-as URL on any GeoKB item.
label_matches = gsrm_in_geokb[
    gsrm_in_geokb['rock_qid'].notna()
    &
    ~gsrm_in_geokb['iri'].isin(geokb_rocks_sameas['same_as'])
    &
    ~gsrm_in_geokb['rock_qid'].isin(processed_qids)
]

for _, row in label_matches.iterrows():
    item = geokb.wbi.item.get(row['rock_qid'])

    # Adopt the GSRM label and keep the previous label as an alias.
    if row['rockLabel'] != row['current_label']:
        item.labels.set('en', row['rockLabel'])
        item.aliases.set('en', row['current_label'], action_if_exists=geokb.action_if_exists.APPEND_OR_REPLACE)

    item.descriptions.set('en', row['description'])

    # GSRM IRI first, then the URLs already on the item. Null entries (left over from
    # the OPTIONAL same-as pattern in the query) and repeats are skipped.
    existing_urls = row['same_as'] if isinstance(row['same_as'], (list, set, tuple)) else []
    same_as_urls = [row['iri']]
    for url in existing_urls:
        if pd.notna(url) and url not in same_as_urls:
            same_as_urls.append(url)

    same_as_claims = [
        geokb.datatypes.URL(
            prop_nr=geokb.prop_lookup['same as'],
            value=url
        )
        for url in same_as_urls
    ]

    item.claims.add(
        same_as_claims,
        action_if_exists=geokb.action_if_exists.REPLACE_ALL
    )

    try:
        response = item.write(
            summary='Updated rock class item with Geoscience Ontology GSRM data'
        )
    except Exception as e:
        print(row['rock_qid'], str(e))
        continue

    print("UPDATED ITEM:", response.id)
    processed_qids.add(row['rock_qid'])

    # Preserve the replaced description and the longer GSRM text on the talk page.
    # A missing description is left out rather than written as the text "nan".
    talk_sections = []
    if isinstance(row['rockDescription'], str):
        talk_sections.append(f"= Mindat Description =\n{row['rockDescription']}")
    if isinstance(row['long_description'], str):
        talk_sections.append(f"= Geoscience Ontology Description =\n{row['long_description']}")

    if not talk_sections:
        continue

    # Reported separately so a talk-page failure is not mistaken for a failed item write.
    try:
        item_page = geokb.mw_site.pages[f'Item_talk:{row["rock_qid"]}']
        item_page.save('\n'.join(talk_sections), summary='Updated rock class item discussion page with longer source data descriptions')
        print("UPDATED ITEM TALK PAGE:", row['rock_qid'])
    except Exception as e:
        print("TALK PAGE EXCEPTION:", row['rock_qid'], str(e))


# Missing GSO-GSRM Items

In [7]:
# GSRM classes whose IRI is not recorded as a same-as URL on any GeoKB item.
df_gsrm.loc[lambda d: ~d['iri'].isin(geokb_rocks_sameas['same_as'])].head()

Unnamed: 0,iri,rockLabel,rockDescription,reference,subClassOf,description,long_description
0,https://w3id.org/gso/rockmaterial/acidicigneou...,acidic igneous material,{Igneous material with more than 63 percent Si...,{after LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/igneousmate...,Igneous material with more than 63 percent SiO2.,
1,https://w3id.org/gso/rockmaterial/acidicigneou...,acidic igneous rock,{Igneous rock with more than 63 percent SiO2.},{after LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/acidicigneo...,Igneous rock with more than 63 percent SiO2.,
2,https://w3id.org/gso/rockmaterial/advancedargi...,advanced argillic altered rock,{Advanced argillic alteration occurs under low...,"{Antonio Arribas, Jeffrey Hedenquist, 2019, En...",{https://w3id.org/gso/rockmaterial/alteredrock},Advanced argillic alteration occurs under lowe...,Advanced argillic alteration occurs under lowe...
3,https://w3id.org/gso/rockmaterial/albiticalter...,albitic altered rock,{definition missing},{CGI alterationtype SKOS vocabulary 2012-11-24},{https://w3id.org/gso/rockmaterial/alteredrock},a rock material class from the Geoscience Onto...,
7,https://w3id.org/gso/rockmaterial/alkalifeldsp...,alkali feldspar syenitic rock,{Syenitoid with a plagioclase to total feldspa...,{LeMaitre et al. 2002},{https://w3id.org/gso/rockmaterial/syenitoid},Syenitoid with a plagioclase to total feldspar...,


In [9]:
# Accumulators read by later cells: IRI -> QID for every created item, and one record
# per failure.
# NOTE(review): both are reset on every run and the selection below is not refreshed, so
# re-running this cell attempts to create the same items again - confirm before re-running.
new_items = []
exceptions = []

# GSRM classes whose IRI is not yet a same-as URL on any GeoKB item.
missing_gsrm = df_gsrm[~df_gsrm['iri'].isin(geokb_rocks_sameas['same_as'])]

for _, row in missing_gsrm.iterrows():
    item = geokb.wbi.item.new()
    item.labels.set('en', row['rockLabel'])
    item.descriptions.set('en', row['description'])

    item.claims.add(
        geokb.datatypes.URL(
            prop_nr=geokb.prop_lookup['same as'],
            value=row['iri']
        )
    )

    try:
        response = item.write(
            summary="Added new rock class item from Geoscience Ontology GSRM data"
        )
    except Exception as e:
        print("EXCEPTION:", row['rockLabel'], str(e))
        exceptions.append({
            'iri': row['iri'],
            'exception': str(e)
        })
        continue

    print("CREATED ITEM:", response.id)
    # Record the mapping as soon as the item exists. If this waited until after the
    # talk-page save, a talk-page failure would lose the QID of an item that was
    # already created.
    new_items.append({
        'iri': row['iri'],
        'qid': response.id
    })

    if isinstance(row['long_description'], str):
        try:
            item_page = geokb.mw_site.pages[f'Item_talk:{response.id}']
            item_page.save(f"= Geoscience Ontology Description =\n{row['long_description']}", summary='Added rock class item discussion page with source data description')
            print("CREATED ITEM TALK PAGE:", response.id)
        except Exception as e:
            print("EXCEPTION:", row['rockLabel'], str(e))
            exceptions.append({
                'iri': row['iri'],
                'qid': response.id,
                'exception': str(e)
            })

CREATED ITEM: Q161233
CREATED ITEM: Q161234
CREATED ITEM: Q161235
CREATED ITEM TALK PAGE: Q161235
CREATED ITEM: Q161236
CREATED ITEM: Q161237
CREATED ITEM: Q161238
CREATED ITEM: Q161239
CREATED ITEM: Q161240
CREATED ITEM TALK PAGE: Q161240
CREATED ITEM: Q161241
CREATED ITEM: Q161242
CREATED ITEM TALK PAGE: Q161242
CREATED ITEM: Q161243
CREATED ITEM TALK PAGE: Q161243
CREATED ITEM: Q161244
CREATED ITEM: Q161245
CREATED ITEM: Q161246
CREATED ITEM: Q161247
CREATED ITEM: Q161248
CREATED ITEM: Q161249
CREATED ITEM TALK PAGE: Q161249
CREATED ITEM: Q161250
CREATED ITEM TALK PAGE: Q161250
CREATED ITEM: Q161251
CREATED ITEM: Q161252
CREATED ITEM: Q161253
CREATED ITEM: Q161254
CREATED ITEM: Q161255
CREATED ITEM: Q161256
CREATED ITEM: Q161257
CREATED ITEM: Q161258
CREATED ITEM: Q161259
CREATED ITEM TALK PAGE: Q161259
CREATED ITEM: Q161260
CREATED ITEM: Q161261
CREATED ITEM: Q161262
CREATED ITEM TALK PAGE: Q161262
CREATED ITEM: Q161263
CREATED ITEM TALK PAGE: Q161263
CREATED ITEM: Q161264
CREATED 

# Classification and References

In [29]:
# Hand-maintained IRI -> QID pairs that neither the same-as lookup nor the creation run
# provides.
extra_items = [
    {
        'iri': 'https://w3id.org/gso/geology/rockmaterial',
        'qid': 'Q41261'
    },
    {
        'iri': 'https://w3id.org/gso/rockmaterial/alteredrock',
        'qid': 'Q161371'
    }
]

# IRI -> QID lookup from three sources: existing same-as URLs, items created earlier in
# this session, and the manual pairs above.
gsrm_geokb_mapping = pd.concat(
    [
        # regex=False: match the literal text 'w3id.org'; as a regex the '.' would match
        # any character.
        geokb_rocks_sameas[
            geokb_rocks_sameas['same_as'].str.contains('w3id.org', regex=False)
        ].rename(columns={'same_as': 'iri', 'rock_qid': 'qid'}),
        pd.DataFrame(new_items),
        pd.DataFrame(extra_items),
    ],
    # The three inputs each start their index at 0; renumber so labels are unique.
    ignore_index=True,
)

In [30]:
# Step 1: attach the GeoKB QID of each GSRM class (missing where the IRI is unmapped).
gsrm_subclasses = df_gsrm[['iri', 'subClassOf']].merge(
    gsrm_geokb_mapping,
    how='left',
    on='iri',
)

# Step 2: one row per (class, parent IRI), with the parent's QID looked up in the same
# mapping under renamed columns.
_parent_lookup = gsrm_geokb_mapping.rename(columns={'qid': 'subClassOf_qid', 'iri': 'subClassOf'})
gsrm_subclasses = gsrm_subclasses.explode('subClassOf').merge(
    _parent_lookup,
    how='left',
    on='subClassOf',
)

In [55]:
def _subclass_references(ref_rows):
    """Build the reference list attached to every subclass-of claim of one class.

    The list starts with the fixed knowledge-source reference for the GSO item
    (Q161225). For each row of ``ref_rows`` it then adds, in this order: one
    knowledge-source reference per matched item, one reference URL per extracted URL,
    and the free-text reference statement when one is present.
    """
    refs = [
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['knowledge source'],
            value='Q161225'
        )
    ]
    for _, ref_row in ref_rows.iterrows():
        refs.extend(
            geokb.datatypes.Item(
                prop_nr=geokb.prop_lookup['knowledge source'],
                value=item_qid
            )
            for item_qid in ref_row['reference_items']
        )
        refs.extend(
            geokb.datatypes.URL(
                prop_nr=geokb.prop_lookup['reference URL'],
                value=url
            )
            for url in ref_row['reference_urls']
        )
        if isinstance(ref_row['reference_string'], str):
            refs.append(
                geokb.datatypes.String(
                    prop_nr=geokb.prop_lookup['reference statement'],
                    value=ref_row['reference_string']
                )
            )
    return refs


# One row per class that has a QID and at least one parent with a QID; the parent QIDs
# are collected into a list.
_classified = (
    gsrm_subclasses[gsrm_subclasses['subClassOf_qid'].notna()]
    .groupby(['qid', 'iri'], as_index=False)['subClassOf_qid']
    .agg(list)
)

for _, row in _classified.iterrows():
    subclass_refs = _subclass_references(
        gsrm_references[gsrm_references['iri'] == row['iri']]
    )

    item = geokb.wbi.item.get(row['qid'])

    # Every parent claim for this class carries the same reference list.
    new_claims = [
        geokb.datatypes.Item(
            prop_nr=geokb.prop_lookup['subclass of'],
            value=parent_qid,
            references=subclass_refs
        )
        for parent_qid in row['subClassOf_qid']
    ]

    item.claims.add(new_claims, action_if_exists=geokb.action_if_exists.APPEND_OR_REPLACE)

    try:
        response = item.write(
            summary='Updated rock class item with Geoscience Ontology GSRM subclass data'
        )
        print(response.id)
    except Exception as e:
        print(row['qid'], str(e))

Q158195
Q158196
Q158197
Q158198
Q158199
Q158200
Q158201
Q158202
Q158203
Q161226
Q161228
Q161229
Q161233
Q161234
Q161235
Q161236
Q161237
Q161238
Q161239
Q161240
Q161242
Q161243
Q161244
Q161245
Q161246
Q161247
Q161248
Q161249
Q161250
Q161251
Q161252
Q161253
Q161254
Q161255
Q161256
Q161257
Q161258
Q161259
Q161260
Q161261
Q161262
Q161263
Q161264
Q161265
Q161266
Q161267
Q161268
Q161269
Q161270
Q161271
Q161274
Q161275
Q161276
Q161277
Q161278
Q161279
Q161280
Q161281
Q161282
Q161283
Q161284
Q161285
Q161286
Q161287
Q161288
Q161289
Q161290
Q161291
Q161292
Q161293
Q161294
Q161295
Q161296
Q161297
Q161298
Q161299
Q161300
Q161301
Q161302
Q161303
Q161304
Q161305
Q161306
Q161307
Q161308
Q161309
Q161310
Q161311
Q161312
Q161313
Q161314
Q161315
Q161316
Q161317
Q161318
Q161319
Q161320
Q161321
Q161322
Q161323
Q161324
Q161325
Q161326
Q161327
Q161328
Q161329
Q161330
Q161331
Q161332
Q161333
Q161334
Q161335
Q161336
Q161337
Q161338
Q161339
Q161340
Q161341
Q161342
Q161343
Q161344
Q161345
Q161346
Q161347
Q161348
