From the list of objects and their information, extract the vocabulary of each property into one single CSV table.

### Initialization

In [None]:
import sys, os, requests, time
sys.path.append(os.path.abspath('../src'))
import pandas as pd
import lib
import yaml

# Parameters read from the YAML configuration file
with open("./00-config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)
spacy_model = config['catalog']['spacy_model']

# Global variables shared by the cells below
eta = lib.Eta()  # project helper, driven through begin/iter/print/end in the loops below
input_path = '../data/objects-all.csv'  # objects table read as input
output_path = "../data/vocabulary-all.csv"  # vocabulary table written by the last cell
vocabulary = []  # one dict per vocabulary entry, appended to by the counting cells

### Load information table

In [None]:
# Read the objects CSV into a DataFrame; its columns feed every vocabulary cell below
table = pd.read_csv(filepath_or_buffer=input_path)

### Get LOD mappings

In [None]:
# Re-use the mappings (authority file, identifier, label, ...) saved by a
# previous run, if any. `lod` maps "<type>-<name>" to the saved row (as a dict).
lod = {}
if os.path.exists(output_path):

    # Read every column as text: a numeric-looking identifier column would
    # otherwise be parsed as float (e.g. 300010357.0) and corrupt the URIs
    # built from it later on
    existing_vocab = pd.read_csv(output_path, dtype=str)

    # A file written by a first run (nothing mapped yet) has no 'identifier'
    # column: in that case there is simply nothing to re-use
    if 'identifier' in existing_vocab.columns:

        # Extract already mapped vocabulary (explicit copy, so that adding the
        # key column below does not write into a slice of the full table)
        existing_vocab = existing_vocab[pd.notna(existing_vocab['identifier'])].copy()

        # Create a key (index)
        existing_vocab['key'] = existing_vocab['type'] + '-' + existing_vocab['name']
        existing_vocab.set_index('key', inplace=True)

        # Make the existing mapped vocabulary usable
        lod = existing_vocab.to_dict(orient='index')

### Get object type vocabulary

In [None]:
# Non-empty cells of the 'object_type' column
raw_object_types = table['object_type'].dropna().tolist()

# A single cell may list several values separated by ', ': flatten them into single words
all_object_types = ', '.join(str(cell) for cell in raw_object_types).split(', ')

# Remove duplicates
all_object_types = list(set(all_object_types))

# For each word, count the cells that contain it (as decided by lib.has_element)
object_type_cells = table['object_type'].tolist()
eta.begin(len(all_object_types), 'Counting object types')
for object_type in all_object_types:
    count = sum(lib.has_element(cell, object_type) for cell in object_type_cells)
    vocabulary.append(dict(type="object_type", name=object_type, count=count))
    eta.iter()
eta.end()

### Get materials and techniques vocabulary

In [None]:
# Non-empty cells of the 'material_technique' column
raw_material_techniques = table['material_technique'].dropna().tolist()

# A single cell may list several values separated by ', ': flatten them into single words
all_material_techniques = ', '.join(str(cell) for cell in raw_material_techniques).split(', ')

# Remove duplicates
all_material_techniques = list(set(all_material_techniques))

# For each word, count the cells that contain it (as decided by lib.has_element).
# The word is stripped for the lookup only; it is stored as extracted.
material_technique_cells = table['material_technique'].tolist()
eta.begin(len(all_material_techniques), 'Counting materials and techniques')
for material_technique in all_material_techniques:
    searched = material_technique.strip()
    count = sum(lib.has_element(cell, searched) for cell in material_technique_cells)
    vocabulary.append(dict(type="material_technique", name=material_technique, count=count))
    eta.iter()
eta.end()

### Get origins vocabulary

In [None]:
# Non-empty cells of the 'origin' column
raw_origins = table['origin'].dropna().tolist()

# A single cell may list several values separated by ', ': flatten them into single words
all_origins = ', '.join(str(cell) for cell in raw_origins).split(', ')

# Remove duplicates
all_origins = list(set(all_origins))

# For each word, count the cells that contain it (as decided by lib.has_element).
# The word is stripped for the lookup only; it is stored as extracted.
origin_cells = table['origin'].tolist()
eta.begin(len(all_origins), 'Counting origins')
for origin in all_origins:
    searched = origin.strip()
    count = sum(lib.has_element(cell, searched) for cell in origin_cells)
    vocabulary.append(dict(type="origin", name=origin, count=count))
    eta.iter()
eta.end()

### Get author vocabulary

In [None]:
# Non-empty cells of the 'author' column
raw_authors = table['author'].dropna().tolist()

# A single cell may list several values separated by ', ': flatten them into single words
all_authors = ', '.join(str(cell) for cell in raw_authors).split(', ')

# Remove duplicates
all_authors = list(set(all_authors))

# For each word, count the cells that contain it (as decided by lib.has_element).
# The word is stripped for the lookup only; it is stored as extracted.
author_cells = table['author'].tolist()
eta.begin(len(all_authors), 'Counting authors')
for author in all_authors:
    searched = author.strip()
    count = sum(lib.has_element(cell, searched) for cell in author_cells)
    vocabulary.append(dict(type="author", name=author, count=count))
    eta.iter()
eta.end()

### Get Period vocabulary

In [None]:
# Non-empty cells of the 'period' column
raw_periods = table['period'].dropna().tolist()

# A single cell may list several values separated by ', ': flatten them into single words
all_periods = ', '.join(str(cell) for cell in raw_periods).split(', ')

# Remove duplicates
all_periods = list(set(all_periods))

# For each word, count the cells that contain it (as decided by lib.has_element).
# The word is stripped for the lookup only; it is stored as extracted.
period_cells = table['period'].tolist()
eta.begin(len(all_periods), 'Counting periods')
for period in all_periods:
    searched = period.strip()
    count = sum(lib.has_element(cell, searched) for cell in period_cells)
    vocabulary.append(dict(type="period", name=period, count=count))
    eta.iter()
eta.end()

### Map LODs

In [None]:
# Retrieve information
for i, vocab in enumerate(vocabulary):
    key = vocab['type'] + '-' + vocab['name']
    if key in lod:
        vocabulary[i]['authority_file'] = lod[key]['authority_file']
        vocabulary[i]['identifier'] = lod[key]['identifier']
        if lod[key]['label']: vocabulary[i]['label'] = lod[key]['label']
        if lod[key]['definition']: vocabulary[i]['definition'] = lod[key]['definition']
        if lod[key]['category']: vocabulary[i]['category'] = lod[key]['category']


### Fill LOD information

In [None]:
def get_wikidata_info(identifier, timeout=60):
    """Fetch the English label and description of a Wikidata entity.

    Args:
        identifier: Wikidata entity id, inserted after the ``wd:`` prefix of
            the query; surrounding whitespace is ignored.
        timeout: seconds to wait for the SPARQL endpoint before giving up
            (without it, a stalled connection would block forever).

    Returns:
        A dict with the keys "label" and "definition". Both are None when the
        query returns no row (the query requires an English label AND an
        English description).

    Raises:
        Exception: when the endpoint answers with a status other than 200.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    endpoint_url = "https://query.wikidata.org/sparql"
    # str(): tolerate identifiers that were not loaded as text
    entity = str(identifier).strip()
    query = f"""
    SELECT ?label ?definition WHERE {{
      wd:{entity} rdfs:label ?label ;
                     schema:description ?definition .
      FILTER (lang(?label) = "en" && lang(?definition) = "en")
    }}
    """
    headers = {"Accept": "application/sparql-results+json"}
    response = requests.get(
        endpoint_url,
        params={"query": query},
        headers=headers,
        timeout=timeout,
    )
    # Pause after every call so that successive lookups are spaced out
    time.sleep(1)

    if response.status_code != 200:
        print(query)
        raise Exception(f"SPARQL query failed with status {response.status_code}")

    results = response.json()['results']['bindings']
    if not results:
        return {"label": None, "definition": None}

    # Only the first row is used
    return {
        "label": results[0]['label']['value'],
        "definition": results[0]['definition']['value']
    }
    

def get_getty_info(identifier, timeout=60):
    """Fetch the English preferred label and scope note of a Getty AAT concept.

    Args:
        identifier: numeric part of the concept URI
            (``http://vocab.getty.edu/aat/<identifier>``); surrounding
            whitespace is ignored.
        timeout: seconds to wait for the SPARQL endpoint before giving up
            (without it, a stalled connection would block forever).

    Returns:
        A dict with the keys "label" and "definition". The query substitutes
        an empty string for a part it cannot find; both values are None when
        the endpoint returns no row at all.

    Raises:
        Exception: when the endpoint answers with a status other than 200.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    endpoint_url = "https://vocab.getty.edu/sparql"
    # str(): tolerate identifiers that were not loaded as text
    concept_uri = f"<http://vocab.getty.edu/aat/{str(identifier).strip()}>"

    query = f"""
        select 
            (COALESCE(?label_, "") as ?label)
            (COALESCE(?note_, "") as ?definition)
        where {{
            optional {{
                {concept_uri} xl:prefLabel ?label_obj .
                ?label_obj gvp:term ?label_ .
                FILTER(lang(?label_) = "en")
            }}
            optional {{
                {concept_uri} skos:scopeNote ?note_obj .
                ?note_obj rdf:value ?note_ .
                FILTER(lang(?note_) = "en")
            }}
        }}
    """
    headers = {"Accept": "application/sparql-results+json"}
    response = requests.get(
        endpoint_url,
        params={"query": query},
        headers=headers,
        timeout=timeout,
    )

    if response.status_code != 200:
        print(query)
        raise Exception(f"SPARQL query failed with status {response.status_code}")

    results = response.json()['results']['bindings']
    if not results:
        return {"label": None, "definition": None}

    # Only the first row is used
    return {
        "label": results[0]['label']['value'],
        "definition": results[0]['definition']['value']
    }
    

def get_getty_root_type(uri, timeout=60):
    """List the broader concepts of a Getty concept, with their English labels.

    Args:
        uri: concept URI already wrapped in angle brackets (``<http://...>``);
            it is inserted verbatim into the SPARQL query.
        timeout: seconds to wait for the SPARQL endpoint before giving up
            (without it, a stalled connection would block forever).

    Returns:
        A pandas DataFrame with one row per result binding and one column per
        bound variable, holding the raw binding values. When the query
        matches nothing the DataFrame is empty and has no columns.

    Raises:
        Exception: when the endpoint answers with a status other than 200.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    endpoint_url = "https://vocab.getty.edu/sparql"

    query = f"""
        select 
            *
            # ?broader ?label
        where {{
            {uri} <http://vocab.getty.edu/ontology#broader> ?broader .
            ?broader xl:prefLabel ?label_obj .
            ?label_obj gvp:term ?label .
            FILTER(lang(?label) = "en" || lang(?label) = "en-US" || lang(?label) = "en-us")
        }}
    """
    headers = {"Accept": "application/sparql-results+json"}
    response = requests.get(
        endpoint_url,
        params={"query": query},
        headers=headers,
        timeout=timeout,
    )

    if response.status_code != 200:
        print(query)
        raise Exception(f"SPARQL query failed with status {response.status_code}")

    bindings = response.json()['results']['bindings']
    # Keep only the plain value of every bound variable
    rows = [
        {variable: bound['value'] for variable, bound in binding.items()}
        for binding in bindings
    ]
    return pd.DataFrame(data=rows)
    

def get_getty_categ(uri):
    """Classify a Getty concept as "material", "technique" or "unknown".

    Walks up the hierarchy returned by ``get_getty_root_type``: at each level,
    if one of the two reference concepts is among the broader concepts the
    matching category is returned; otherwise the walk continues from the first
    broader concept listed.

    Args:
        uri: concept URI without angle brackets; surrounding whitespace is
            ignored.

    Returns:
        "material", "technique", or "unknown" when the top of the hierarchy is
        reached, when a concept is met twice, or when a lookup fails (this
        classification is best-effort and never raises on lookup errors).
    """
    material_uri = 'http://vocab.getty.edu/aat/300010357'
    technique_uri = 'http://vocab.getty.edu/aat/300053001'

    # Concepts already walked through: protects against an endless loop
    visited = set()
    try:
        current = uri.strip()
        while current not in visited:
            visited.add(current)
            result = get_getty_root_type(f"<{current}>")

            # No broader concept: top of the hierarchy reached
            if 'broader' not in result.columns or result.empty:
                return "unknown"

            broader = result['broader'].tolist()
            if material_uri in broader:
                return "material"
            if technique_uri in broader:
                return "technique"
            current = broader[0].strip()
    except Exception:
        # Best-effort: any lookup failure yields "unknown". KeyboardInterrupt
        # and SystemExit are deliberately NOT caught, so the run can be stopped.
        return "unknown"
    return "unknown"

In [None]:
def _is_blank(entry, field):
    """Tell whether `field` is absent from `entry`, NaN/None, or only whitespace."""
    if field not in entry or pd.isna(entry[field]):
        return True
    # str(): a non-text cell must not crash the check
    return str(entry[field]).strip() == ''


def _upper_first(text):
    """Upper-case the first character of `text` and leave the rest untouched.

    str.capitalize() is not used because it also lower-cases every other
    character, which mangles proper nouns ("City in France" -> "City in france").
    Returns '' for None or an empty string.
    """
    return text[:1].upper() + text[1:] if text else ''


# Complete the mapped vocabulary with the label, definition and category
# published by its authority file.
eta.begin(len(vocabulary), 'Finding LOD informations')
for vocab in vocabulary:
    label_missing = _is_blank(vocab, 'label')
    definition_missing = _is_blank(vocab, 'definition')
    categ_missing = _is_blank(vocab, 'category')

    # Label/definition are only fetched for mapped words lacking both of them
    if 'authority_file' in vocab and label_missing and definition_missing:
        eta.print(f'Fetching information for {vocab["type"]}/{vocab["name"]}')   
        if vocab['authority_file'] == 'wikidata':
            infos = get_wikidata_info(vocab['identifier'])
        elif vocab['authority_file'] == 'getty':
            infos = get_getty_info(vocab['identifier'])
        else:
            raise Exception(f'Unknown Authority file "{vocab["authority_file"]}" for word <{vocab["type"]}/{vocab["name"]}>')
        vocab['label'] = _upper_first(infos['label'])
        vocab['definition'] = _upper_first(infos['definition'])

    # Only Getty-mapped materials/techniques get a category
    if vocab['type'] == 'material_technique' and categ_missing and 'identifier' in vocab and vocab['authority_file'] == 'getty':
        uri = f"http://vocab.getty.edu/aat/{vocab['identifier']}"
        eta.print(f'Fetching category for {vocab["type"]}/{vocab["name"]}')   
        vocab['category'] = get_getty_categ(uri)

    eta.iter()
eta.end()

### Sort and save catalog vocabulary

In [None]:
# Sort and save
vocabulary = pd.DataFrame(data=vocabulary)
# astype() returns a new Series: assign it back, otherwise the cast is lost
vocabulary['count'] = vocabulary['count'].astype(pd.Int64Dtype())
vocabulary.sort_values('count', ascending=False, inplace=True)
vocabulary.to_csv(output_path, index=False)

# Also, filter a dataframe for each dedicated vocabulary
vocabulary_object_type = vocabulary[vocabulary['type'] == 'object_type']
vocabulary_material_technique = vocabulary[vocabulary['type'] == 'material_technique']
vocabulary_origin = vocabulary[vocabulary['type'] == 'origin']
vocabulary_author = vocabulary[vocabulary['type'] == 'author']
vocabulary_period = vocabulary[vocabulary['type'] == 'period']

# Save all: one file per vocabulary, named after the main output file.
# Only the file name is rewritten, so that an "all" appearing in a directory
# name is left alone.
output_dir, output_name = os.path.split(output_path)
dedicated_vocabularies = {
    'object-type': vocabulary_object_type,
    'material-technique': vocabulary_material_technique,
    'origin': vocabulary_origin,
    'author': vocabulary_author,
    'period': vocabulary_period,
}
for suffix, dedicated in dedicated_vocabularies.items():
    dedicated_path = os.path.join(output_dir, output_name.replace('all', suffix))
    dedicated.to_csv(dedicated_path, index=False)