## Semantic Modelling with owlready2

In this script, I import a baseline ontology that I created in Protégé and add the corresponding instances to it. In addition, I query GeoNames to enrich the data with further information. To do the same for the Sachindex, I created another [script](enriching_database.ipynb) that can be run beforehand. The baseline ontology can be found in the ``owl/baseline`` folder of the data directory.

In [1]:
import re

import geocoder
import pandas as pd
from owlready2 import *



In [2]:
# Load the baseline ontology; every namespace below is attached to it.
onto = get_ontology("../data/owl/baseline/cds-ontology.owl").load()

# Namespaces are created as Namespace(ontology, base IRI, name=prefix).

# Authority files and identifier schemes used for individuals.
gndo = Namespace(onto, "https://d-nb.info/gnd/", name="gndo")
viaf = Namespace(onto, "https://viaf.org/viaf/", name="viaf")
wikidata = Namespace(onto, "https://www.wikidata.org/wiki/", name="wikidata")
gn = Namespace(onto, "https://www.geonames.org/", name="gn")

# Documents of the Constance de Salm archive.
cds_docs = Namespace(
    onto,
    "https://constance-de-salm.de/archiv/#/document/",
    name="cds_docs",
)

# External vocabularies.
gn_onto = Namespace(onto, "https://www.geonames.org/ontology#", name="gn_onto")
dbo = Namespace(onto, "http://dbpedia.org/ontology/", name="dbo")


In [3]:
"""Individuals are instances in ontologies. They are created as any other Python instances. The first parameter is the name (or identifier) of the Individual; it corresponds to the .name attribute in Owlready2. If not given, the name is automatically generated from the Class name and a number.
"""

'Individuals are instances in ontologies. They are created as any other Python instances. The first parameter is the name (or identifier) of the Individual; it corresponds to the .name attribute in Owlready2. If not given, the name is automatically generated from the Class name and a number.\n'

In [4]:
# Read the letter metadata that the individuals are created from.
# Empty cells are replaced by 0, and the GeoNames IDs of the places of
# exposition are converted to integers.
df = (
    pd.read_csv('../data/retrieved/cds_cleanup.csv')
    .fillna(0)
    .astype({'Geonames (Ausstellungsort)': int})
)

In [5]:
# Read the enriched Sachindex (semicolon-separated), drop the unnamed
# column left over from an earlier export and replace empty cells by 0.
sachindex = (
    pd.read_csv('../data/retrieved/sachindex_additional_data_completed-v2.csv', sep=";")
    .drop(columns='Unnamed: 0')
    .fillna(0)
)

In [6]:
def get_geodata(
    dataframe: pd.DataFrame,
    feature_column: str,
    username: str = "sarahondraszek",
) -> None:
    """Create a GeoNames feature individual for every distinct ID in a column.

    Every distinct non-zero value of ``dataframe[feature_column]`` is looked
    up with ``geocoder.geonames(..., method='details')``. For each value a
    ``gn_onto.Feature`` individual named after the ID is created in the ``gn``
    namespace, and the returned latitude/longitude are appended to its
    ``lat`` and ``long`` properties.

    Args:
        dataframe: Table holding the GeoNames IDs.
        feature_column: Name of the column with the GeoNames IDs; the value
            0 is skipped.
        username: GeoNames account name used for the web-service request.
    """
    for feature in dataframe[feature_column].unique():
        if feature == 0:
            continue

        feature_id = str(feature)
        result = geocoder.geonames(
            feature_id,
            method='details',
            geoNamesUsername=username,
            key=username,
        )
        place = gn_onto.Feature(feature_id, namespace=gn)

        # The individual is always created so letters can still reference
        # the place, but a lookup without coordinates must not store None.
        if result.lat is not None:
            place.lat.append(result.lat)
        if result.lng is not None:
            place.long.append(result.lng)

In [7]:
# Create one GeoNames feature individual per place-of-exposition ID found in the letters table.
get_geodata(dataframe=df, feature_column='Geonames (Ausstellungsort)')

In [8]:
def create_topic(indicator: str, w: str, substitue: str = '', e: str ='') -> onto.Topic:
    """Create a ``Topic`` individual for a keyword and give it a German label.

    Args:
        indicator: ``"own"`` creates the topic in the project ontology, named
            ``t_<row label of w in sachindex>``; ``"wikidata"`` creates it in
            the ``wikidata`` namespace, named after ``e`` with ``substitue``
            removed.
        w: The German keyword; used as label and, for ``"own"``, to find the
            row in ``sachindex['Deutsch']``.
        substitue: Text removed from ``e`` (the caller passes the Wikidata
            base IRI). Only used for ``"wikidata"``.
        e: Wikidata entry of the keyword. Only used for ``"wikidata"``.

    Returns:
        The created topic individual.

    Raises:
        IndexError: If ``indicator`` is ``"own"`` and ``w`` is not found in
            ``sachindex['Deutsch']``.
        ValueError: If ``indicator`` is neither ``"own"`` nor ``"wikidata"``.
    """
    if indicator == "own":
        row_label = sachindex[sachindex['Deutsch'] == w].index[0]
        topic = onto.Topic("t_" + str(int(row_label)), namespace=onto)
    elif indicator == "wikidata":
        # Strip the prefix literally: an IRI contains regex metacharacters
        # (e.g. '.'), so it must not be used as a regex pattern.
        topic = onto.Topic(e.replace(substitue, ''), namespace=wikidata)
    else:
        raise ValueError(f"Unknown topic indicator: {indicator!r}")

    topic.label = [locstr(w, lang="de")]
    return topic

In [9]:
def create_letter(input_url: str, idx: int, dataframe: pd.DataFrame, ns: Namespace) -> onto.Letter:
    """Create a ``Letter`` individual from one row of the letters table.

    The individual is named after ``input_url`` with the ``cds_docs`` base IRI
    removed and is created in namespace ``ns``. The FuD key, year, decade and
    date are copied from row ``idx`` of ``dataframe``.

    Args:
        input_url: URL of the letter in the archive.
        idx: Row label of the letter in ``dataframe``.
        dataframe: Table with the columns ``FuD-Key``, ``year``, ``decade``
            and ``Datierung (JJJJ-MM-TT)``.
        ns: Namespace in which the individual is created.

    Returns:
        The created letter individual.
    """
    # Strip the prefix literally: the base IRI contains regex metacharacters
    # (e.g. '.'), so it must not be used as a regex pattern.
    letter_name = input_url.replace(cds_docs.base_iri, '')
    letter = onto.Letter(letter_name, namespace=ns)

    letter.fud_key = dataframe['FuD-Key'][idx]
    letter.has_year.append(int(dataframe['year'][idx]))
    letter.has_decade.append(int(dataframe['decade'][idx]))
    letter.has_date.append(str(dataframe['Datierung (JJJJ-MM-TT)'][idx]))

    return letter

In [10]:
def add_sender_properties(instnc: onto.Sender, dataframe: pd.DataFrame, idx: int, indicator: str) -> None:
    """Copy label, VIAF link, Wikidata identifier and aliases onto a sender.

    The values are read from row ``idx`` of ``dataframe`` using the columns
    ``<indicator>``, ``VIAF (<indicator>)``, ``Wikidata-Identifier
    (<indicator>)`` and ``Alternativer Name (<indicator>)``.

    Args:
        instnc: The individual to annotate.
        dataframe: Table holding the person metadata.
        idx: Row label of the letter in ``dataframe``.
        indicator: Role name used in the column names (e.g. ``"Verfasser"``).
    """
    instnc.label = [locstr(dataframe[indicator][idx], lang="en")]

    # A cell equal to 0 is treated as "no value" and skipped, so that no
    # ".../viaf/0" link or 0 identifier ends up in the ontology.
    viaf_id = dataframe[f'VIAF ({indicator})'][idx]
    if viaf_id != 0:
        instnc.has_viaf = viaf.base_iri + str(viaf_id)

    wikidata_id = dataframe[f'Wikidata-Identifier ({indicator})'][idx]
    if wikidata_id != 0:
        instnc.has_wikidata = wikidata_id

    # Aliases are a ';'-separated string; anything else cannot be split.
    alt_names = dataframe[f'Alternativer Name ({indicator})'][idx]
    if isinstance(alt_names, str):
        for alt_name in alt_names.split(';'):
            instnc.alias.append(alt_name.strip())

In [11]:
# Adding instances of the CdS letters with senders and addressees, topics and places of exposition.

# Patterns used for every row, compiled once.
NON_LETTERS = re.compile(r"[^a-zA-Z]+")
WHITESPACE_RUN = re.compile(r'\s+')
ADDRESSEE_REMARK = re.compile(
    r'(\([^)]*(CdS|Bruder|SRD|Neffe|Siehe|Tochter|Anwältin|Rechtsanwalt)[^)]*\)*)'
)

# Index the existing GeoNames features by IRI so that a letter's place of
# exposition is a dictionary lookup. (Comparing a feature's IRI string with
# the integer GeoNames ID never matches.)
features_by_iri = {feature.iri: feature for feature in gn_onto.Feature.instances()}

for index, url in enumerate(df['URL']):
    # Adding new letter to ontology.
    new_letter = create_letter(input_url=url, idx=index, dataframe=df, ns=cds_docs)

    # Adding sender to ontology, annotating it and appending it to the letter.
    new_sender = onto.Sender(df['GND (Verfasser)'][index], namespace=gndo)
    add_sender_properties(instnc=new_sender, dataframe=df, idx=index, indicator="Verfasser")
    new_letter.has_sender.append(new_sender)

    # Adding receiver to ontology. A cell equal to 0 means "no value".
    addressee_gnd = df['GND (Empfänger)'][index]
    addressee_viaf = df['VIAF (Empfänger)'][index]
    addressee_name = df['Empfänger'][index]
    has_name = isinstance(addressee_name, str)
    name_parts = addressee_name.split(',') if has_name else []

    # Identifier preference: GND, then VIAF, then an identifier derived from
    # the name, and finally the placeholder '0'.
    if addressee_gnd != 0:
        new_addressee = onto.Addressee(addressee_gnd, namespace=gndo)
        if addressee_viaf != 0:
            new_addressee.has_viaf = viaf.base_iri + str(addressee_viaf)
    elif addressee_viaf != 0:
        # NOTE(review): the VIAF cell is passed through unchanged as the
        # individual's name; confirm it is not read as a float by pandas.
        new_addressee = onto.Addressee(addressee_viaf, namespace=viaf)
    elif not has_name:
        new_addressee = onto.Addressee('0', namespace=onto)
    elif len(name_parts) > 1:
        # "Surname, Forename" -> "surname_forename", letters only.
        local_name = (
            NON_LETTERS.sub('', name_parts[0]) + '_' + NON_LETTERS.sub('', name_parts[1])
        ).lower()
        new_addressee = onto.Addressee(local_name, namespace=onto)
    else:
        # NOTE(review): the underscores inserted for spaces are removed again
        # by the letters-only filter; kept as is so identifiers do not change.
        local_name = NON_LETTERS.sub('', re.sub(' +', '_', addressee_name).lower())
        new_addressee = onto.Addressee(local_name, namespace=onto)

    # Label: "Forename Surname" without the bracketed editorial remarks.
    if has_name:
        if len(name_parts) > 1:
            reordered = name_parts[1] + ' ' + name_parts[0]
            label = WHITESPACE_RUN.sub(' ', ADDRESSEE_REMARK.sub('', reordered))
        else:
            label = addressee_name
        new_addressee.label = [locstr(label.strip(), lang="en")]
    else:
        new_addressee.label = 0

    # Add wikidata.
    addressee_wikidata = df['Wikidata-Identifier (Empfänger)'][index]
    if addressee_wikidata != 0:
        new_addressee.has_wikidata = addressee_wikidata

    # Add alias.
    addressee_aliases = df['Alternativer Name (Empfänger)'][index]
    if isinstance(addressee_aliases, str):
        for alias_name in addressee_aliases.split(';'):
            new_addressee.alias.append(alias_name.strip())

    # Appending receiver to letter.
    new_letter.has_addressee.append(new_addressee)

    # Adding topics to ontology and appending them to the letter.
    keywords = df['Schlagwörter'][index]
    if not isinstance(keywords, str):
        empty_topic = onto.Topic("t0", namespace=onto)
        empty_topic.label = "Empty Topic"
        new_letter.has_topic.append(empty_topic)
    else:
        # Keywords are separated by ';', alternatives within one by '/'.
        for keyword in keywords.split(";"):
            for word in (part.strip() for part in keyword.split('/')):
                try:
                    entry = sachindex.loc[sachindex['Deutsch'] == word]['Wikidata'].values[0]
                    if entry == 0:
                        new_topic = create_topic(indicator="own", w=word)
                    else:
                        new_topic = create_topic(
                            substitue=wikidata.base_iri, w=word, e=entry, indicator="wikidata"
                        )
                except (ValueError, IndexError):
                    # Keyword not present in the Sachindex: skip it.
                    continue
                new_letter.has_topic.append(new_topic)

    # Linking the letter to its place of exposition, if one is recorded.
    geonames_id = df['Geonames (Ausstellungsort)'][index]
    if geonames_id != 0:
        place = features_by_iri.get(gn.base_iri + str(geonames_id))
        if place is not None:
            new_letter.has_place_of_exposition.append(place)

In [12]:
# Write the ontology, including the individuals created above, to disk as RDF/XML.
onto.save(file="../data/owl/cds_onto_extended.owl", format="rdfxml")