In [1]:
# Begin time
from datetime import datetime
begin = datetime.now()
print("Started at:", begin.strftime("%H:%M:%S"))

Started at: 16:31:35


In [1]:
# Imports and global constants

import os, pandas as pd
import unicodedata
from utils import generate_id, Eta, to_literal
from utils2 import populate_entity
from dotenv import load_dotenv
from sparql import SPARQL
from sparql_graphdb import GraphDB
from prefix import prefixes, Prefix
from ontology import classes as c, properties as p, constants as k, Classes, Properties
load_dotenv()
eta = Eta()

root_uri = 'http://geovistory.org/resource/'
data_named_graph = ''
shacl_named_graph = 'http://geovistory.org/resource/shacl'

# Preparation
prefixes.append(Prefix('root', root_uri))

# Set up the SPARQL wrapper
sparql: SPARQL = GraphDB(os.getenv('URL'), os.getenv('USERNAME'), os.getenv('PASSWORD'))
sparql.prefixes = prefixes

In [5]:
def create_uri(identifier: str = None):
    if identifier is not None: 
        return f"root:i{identifier}"
    else:
        return f"root:i{generate_id()}"

---

# Initialization

In [6]:
# Read data (output of pipeline)

normalize = lambda x: unicodedata.normalize("NFC", x)

vocabulary = pd.read_csv('../data/vocabulary-all.csv')
catalogs = pd.read_csv('../data/catalogs.csv', sep=";", converters={'name': normalize})
lots = pd.read_csv('../data/objects-all.csv', converters={'catalog': normalize})
sales_san_donato = pd.read_csv('../data/sales-san-donato.csv')
sales_soltykoff = pd.read_csv('../data/sales-soltykoff.csv')

# Split the vocabulary

origins = vocabulary[vocabulary['type'] == 'origin'].copy()
origins.drop(columns=['type', 'category'], inplace=True)

material_techniques = vocabulary[vocabulary['type'] == 'material_technique'].copy()
material_techniques.drop(columns=['type'], inplace=True)

periods = pd.read_csv('../data/vocabulary-period.csv') # Here we read it directly from disk, because it has additional information about begin year and end year
periods.drop(columns=['type', 'category'], inplace=True)
periods['begin_year'] = periods['begin_year'].astype(pd.Int64Dtype())
periods['end_year'] = periods['end_year'].astype(pd.Int64Dtype())

object_types = vocabulary[vocabulary['type'] == 'object_type'].copy()
object_types.drop(columns=['type', 'category'], inplace=True)

authors = vocabulary[vocabulary['type'] == 'author'].copy()
authors.drop(columns=['type', 'category'], inplace=True)

# Join sale prices and buyers to lots

# Joining Soltykoff prices and buyers
selection = lots[lots['catalog'].str.contains('Soltykoff')]
eta.begin(len(selection), "Joining Soltykoff prices and buyers")
for i, lot in selection.iterrows():
    for _, sale in sales_soltykoff.iterrows():
        if lot['index'] == sale['index']:
            lots.at[i, 'sale_price'] = sale['price']
            lots.at[i, 'sale_currency_uri'] = k.currency_franc
            lots.at[i, 'sale_currency_label'] = "Francs"
        if pd.notna(sale['buyer']) and any(x in sale['index'].split(', ') for x in lot['index'].split(', ')):
            lots.at[i, 'sale_buyer'] = sale['buyer']
    eta.iter()
eta.end()

# Joining San Donato prices
selection = lots[lots['catalog'].str.contains('Donato')]
eta.begin(len(selection), "Joining San Donato prices")
for i, lot in selection.iterrows():
    for _, sale in sales_san_donato.iterrows():
        if lot['index'] == sale['index']:
            lots.at[i, 'sale_price'] = sale['price']
            lots.at[i, 'sale_currency_uri'] = k.currency_lires
            lots.at[i, 'sale_currency_label'] = "Lires"
    eta.iter()
eta.end()

[ETA] Joining Soltykoff prices and buyers - 1069 iterations in 00h00m10s (105.0 iter/sec)                      
[ETA] Joining San Donato prices - 1707 iterations in 00h00m38s (44.1 iter/sec)                                 


In [5]:
# Clear Endpoint
sparql.run('CLEAR ALL')

In [6]:
# Create constants

triples = []

# Language: French
label = 'French'
definition = 'French language'
triples += populate_entity(k.language_french, c.language, label, definition, "en")

# Entity Quality Type: Number
label = 'Number'
definition = 'Number of items in a set'
triples += populate_entity(k.entityQualityType_number, c.entity_quality_type, label, definition, "en")

# Francs
label = "Franc"
definition = "Unité monétaire unique de la France qu'entre le 7 avril 1795 et le 31 décembre 1998."
triples += populate_entity(k.currency_franc, c.currency, label, definition)

# Lire italiennes
label = "Lire"
definition = "Ancienne unité monétaire de l'Italie, émise du 17 mars 1861 au 28 février 2002."
triples += populate_entity(k.currency_lires, c.currency, label, definition)

# Lugt number
label = "Lugt number"
triples += populate_entity(k.identifierType_lugt, c.identifier_type, label)

# URL
label = "URL"
triples += populate_entity(k.identifierType_url, c.identifier_type, label)

# Residence
label = "Résidence"
triples += populate_entity(k.epistemicLocationType_residence, c.epistemic_location_type, label)

# Residence
label = "Lieu de distribution"
triples += populate_entity(k.epistemicLocationType_distributionPlace, c.epistemic_location_type, label)

# Page number
label = "Nombre de pages"
triples += populate_entity(k.entityQualityType_pageNumber, c.entity_quality_type, label)

# Lot numbers
label = "Nombre de lots"
triples += populate_entity(k.entityQualityType_lotNumber, c.entity_quality_type, label)

# Fees
label = "Frais de ventes"
triples += populate_entity(k.quantiQualSpati_fees, c.quantifiable_quality_of_a_spatio_temporal_phenomenon, label)

# Auctioneer
label = "Commissaire-priseur"
triples += populate_entity(k.actorSocialQuality_auctioneer, c.actor_social_quality, label)

# Expert
label = "Expert"
triples += populate_entity(k.actorSocialQuality_expert, c.actor_social_quality, label)

# Seller
label = "Vendeur"
triples += populate_entity(k.actorSocialQuality_seller, c.actor_social_quality, label)

# Auction
label = "Vente aux enchères"
triples += populate_entity(k.activityType_auction, c.activity_type, label)

# Auction
label = "Prix d'adjudication"
triples += populate_entity(k.quantiQualIntEventType_hammerPrice, c.quantifiable_quality_of_an_intentional_event_type, label)

# Buyer
label = "Acheteur"
triples += populate_entity(k.actorSocialQuality_buyer, c.actor_social_quality, label)


# Insert
sparql.insert(triples, data_named_graph)

# Vocabulary

In [7]:
# Origins (Geographical places)

triples = []
eta.begin(len(origins), 'Creating Geographical places')
for i, row in origins.iterrows():

    # Create Instance
    uri = create_uri()
    label = row['name']
    triples += populate_entity(uri, c.geographical_place, label.title())

    # Add URI
    if row['authority_file'] == 'wikidata': 
        same_as = "http://www.wikidata.org/entity/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))
    elif row['authority_file'] == 'getty': 
        same_as = "https://vocab.getty.edu/aat/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))

    # Save URI
    origins.at[i, 'uri'] = uri

    eta.iter()
eta.end()


# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

# To have them accessible with keys
origins_dict = origins.set_index('name').to_dict(orient='index')

[ETA] Creating Geographical places - 174 iterations in 00h00m00s (727.2 iter/sec)                              
[ETA] Inserting triples - 0.508 iterations in 00h00m00s (3.6 iter/sec)                                         


In [8]:
# Materials and techniques

triples = []
eta.begin(len(material_techniques), 'Creating Materials & Techniques')
for i, row in material_techniques.iterrows():

    # Create Instance
    uri = create_uri()
    label = row['name']
    triples += populate_entity(uri, c.general_technique, label.title())

    # Add URI
    if row['authority_file'] == 'wikidata' and row['identifier']: 
        same_as = "http://www.wikidata.org/entity/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))
    elif row['authority_file'] == 'getty' and row['identifier']:
        same_as = "https://vocab.getty.edu/aat/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))

    # Save URI
    material_techniques.at[i, 'uri'] = uri

    # If we know it, add "material" or "technique" as a comment
    if pd.notna(row['category']):
        triples.append((uri, p.has_note, to_literal(row['category'])))

    eta.iter()
eta.end()


# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

# To have them accessible with keys
material_techniques_dict = material_techniques.set_index('name').to_dict(orient='index')

[ETA] Creating Materials & Techniques - 2589 iterations in 00h00m03s (672.5 iter/sec)                          
[ETA] Inserting triples - 5.338 iterations in 00h00m01s (4.9 iter/sec)                                         


In [9]:
# Periods 

triples = []
eta.begin(len(periods), 'Creating Periods')
for i, row in periods.iterrows():

    # Build label
    label = row['name']
    if "siècle" in label:
        words = label.split(' ')
        label = words[0][:-1].upper() + 'e siècle'

    # Create Instance
    uri = create_uri()
    triples += populate_entity(uri, c.time_span, row['label'])

    # Add URI
    if row['authority_file'] == 'wikidata' and row['identifier']: 
        same_as = "http://www.wikidata.org/entity/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))
    elif row['authority_file'] == 'getty' and row['identifier']:
        same_as = "https://vocab.getty.edu/aat/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))

    # Save URI
    periods.at[i, 'uri'] = uri

    # Add begin and end year
    triples += [
        (uri, p.begin_of_the_begin, to_literal(str(row['begin_year']))),
        (uri, p.end_of_the_end, to_literal(str(row['end_year']))),
    ]

    # If approx
    if row['approx'] == "approx":
        triples += [(uri, p.has_note, to_literal("Approximatif", "fr"))]

    eta.iter()
eta.end()


# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

# To have them accessible with keys
periods_dict = periods.set_index('name').to_dict(orient='index')

[ETA] Creating Periods - 27 iterations in 00h00m00s (660.6 iter/sec)                                           
[ETA] Inserting triples - 0.092 iterations in 00h00m00s (0.9 iter/sec)                                         


In [10]:
# Create object types 

triples = []
eta.begin(len(object_types), 'Creating Object types')
for i, row in object_types.iterrows():

    # Create Instance
    uri = create_uri()
    label = row['name']
    triples += populate_entity(uri, c.physical_human_made_thing_type, label.title())
    
    # Add URI
    if row['authority_file'] == 'wikidata': 
        same_as = "http://www.wikidata.org/entity/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))
    elif row['authority_file'] == 'getty': 
        same_as = "https://vocab.getty.edu/aat/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))

    # Save URI
    object_types.at[i, 'uri'] = uri

    eta.iter()
eta.end()

# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

# To have them accessible with keys
object_types_dict = object_types.set_index('name').to_dict(orient='index')

[ETA] Creating Object types - 1925 iterations in 00h00m02s (672.5 iter/sec)                                    
[ETA] Inserting triples - 4.0 iterations in 00h00m00s (5.1 iter/sec)                                           


In [11]:
# Create authors

triples = []
eta.begin(len(authors), 'Creating Authors')
for i, row in authors.iterrows():

    # Create Instance
    uri = create_uri()
    label = row['name']
    triples += populate_entity(uri, c.actor, label.title())

    # Add URI
    if row['authority_file'] == 'wikidata': 
        same_as = "http://www.wikidata.org/entity/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))
    elif row['authority_file'] == 'getty': 
        same_as = "https://vocab.getty.edu/aat/" + row['identifier']
        triples.append((uri, p.same_as, to_literal(same_as)))

    # Save URI
    authors.at[i, 'uri'] = uri

    eta.iter()
eta.end()


# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

# To have them accessible with keys
authors_dict = authors.set_index('name').to_dict(orient='index')

[ETA] Creating Authors - 319 iterations in 00h00m00s (721.5 iter/sec)                                          
[ETA] Inserting triples - 0.638 iterations in 00h00m00s (4.2 iter/sec)                                         


# Core Data

In [8]:
# Import occupations
sale_places = catalogs[['sale_place']].dropna().drop_duplicates()

triples = []
saleplaces_uris = {}
eta.begin(len(sale_places), 'Creating Sale Places')
for i, row in sale_places.iterrows():
    for sale_place in row['sale_place'].split('\n'):
        saleplace_uri = create_uri()
        saleplace_label = sale_place
        triples += populate_entity(saleplace_uri, c.geographical_place, saleplace_label)

        saleplaces_uris[sale_place] = saleplace_uri

    eta.iter()
eta.end()

# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

[ETA] Creating Sale Places - 16 iterations in 00h00m00s (701.1 iter/sec)                                       
[ETA] Inserting triples - 0.034 iterations in 00h00m00s (0.1 iter/sec)                                         


In [10]:
# Prepare actors & addresses

auctioneers = []
experts = []
sellers = []
distributed_places = []
for i, row in catalogs.iterrows():

    # Auctioneer
    if pd.notna(row['auctioneer_names']):
        auctioneers_names = row['auctioneer_names'].split('\n')
        auctioneers_addresses = row['auctioneer_addresses'].split('\n') if pd.notna(row['auctioneer_addresses']) else [''] * len(auctioneers_names)
        auctioneers += [(name, address) for name, address in zip(auctioneers_names, auctioneers_addresses)]

    # Expert
    if pd.notna(row['experts_names']):
        experts_names = row['experts_names'].split('\n')
        experts_addresses = row['expert_addresses'].split('\n') if pd.notna(row['expert_addresses']) else [''] * len(experts_names)
        experts += [(name, address) for name, address in zip(experts_names, experts_addresses)]

    # Seller
    if pd.notna(row['seller_name']):
        sellers_names = row['seller_name'].split('\n')
        sellers_addresses = row['seller_address'].split('\n') if pd.notna(row['seller_address']) else [''] * len(sellers_names)
        sellers += [(name, address) for name, address in zip(sellers_names, sellers_addresses)]

    # Distributed places and persons
    if pd.notna(row['catalog_distributed_places']):
        for distributed_place in row['catalog_distributed_places'].split('\n'):
            separator = ' - '
            separator_index = distributed_place.rindex(separator)
            if separator_index:
                address = distributed_place[0:separator_index]
                actor = distributed_place[separator_index + len(separator):]
                distributed_places.append((actor, address))

# Buyers
buyers = []
for buyer in lots['sale_buyer'].unique().tolist():
    buyers.append((buyer, ''))


# Regroup addresses of persons
all_persons = {}
for person, address in auctioneers + experts + sellers + buyers:
    if person == '': continue
    if person not in all_persons: all_persons[person] = [address]
    else: all_persons[person].append(address)

# Remove actors if they are known persons, and add addresses to persons
actors_only = []
for actor, address in distributed_places:
    if actor == '': continue
    if actor not in all_persons: actors_only.append((actor, address))
    else: all_persons[actor].append(address)

# Regroup addresses of actors
all_actors = {}
for actor, address in actors_only:
    if actor == '': continue
    if actor not in all_actors: all_actors[actor] = [address]
    else: all_actors[actor].append(address)

# Make addresses unique and fetch all addresses
all_addresses = set()
for key, value in all_persons.items():
    all_persons[key] = list(set(value))
    for address in value:
        all_addresses.add(address)
for key, value in all_actors.items():
    all_actors[key] = list(set(value))
    for address in value:
        all_addresses.add(address)

In [14]:
# Import addresses

triples = []
eta.begin(len(all_addresses), 'Importing addresses')
addresses_uris = {}
for address in all_addresses:
    if address == '': continue

    address_uri = create_uri()
    address_label = address
    triples += populate_entity(address_uri, c.geographical_place, address_label)

    addresses_uris[address_label] = address_uri
    
# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

[ETA] Inserting triples - 0.216 iterations in 00h00m00s (2.0 iter/sec)                                         


In [15]:
# Import actors with addresses

triples = []
actors_uris = {}
eta.begin(len(all_actors), 'Importing actors')
for actor, addresses in all_actors.items():

    # Create actor
    actor_uri = create_uri()
    actor_label = actor
    triples += populate_entity(actor_uri, c.actor, actor_label)

    # Link to Geographical place
    for address in addresses:
        geoplace_uri = addresses_uris[address]

        epistemic_location_uri = create_uri()
        epistemic_location_label = f"{address} - {actor_label}"
        triples += populate_entity(epistemic_location_uri, c.epistemic_location_of_a_physical_thing, epistemic_location_label)
        triples += [
            (epistemic_location_uri, p.is_localized_at, geoplace_uri),
            (epistemic_location_uri, p.is_location_of, actor_uri),
        ]

    actors_uris[actor_label] = actor_uri
    
# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

[ETA] Inserting triples - 0.428 iterations in 00h00m00s (3.1 iter/sec)                                         


In [16]:
# Import persons with addresses

triples = []
persons_uris = {}
eta.begin(len(all_persons), 'Importing persons')
for person, addresses in all_persons.items():

    # Create person
    person_uri = create_uri()
    person_label = person
    triples += populate_entity(person_uri, c.person, person_label)
    
    # Link to all Geographical place
    for address in addresses:
        if address == '': continue
        geoplace_uri = addresses_uris[address]

        epistemic_location_uri = create_uri()
        epistemic_location_label = f"{address} - {person_label}"
        triples += populate_entity(epistemic_location_uri, c.epistemic_location_of_a_physical_thing, epistemic_location_label)
        triples += [
            (epistemic_location_uri, p.is_localized_at, geoplace_uri),
            (epistemic_location_uri, p.is_location_of, person_uri),
            (epistemic_location_uri, p.has_intentional_expression_identifing_type, k.epistemicLocationType_residence)
        ]

    persons_uris[person_label] = person_uri
    
# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

[ETA] Inserting triples - 0.492 iterations in 00h00m00s (2.6 iter/sec)                                         


In [17]:
# Import printers

# List all printers
printers = catalogs[['catalog_printer_name']].drop_duplicates()

triples = []
eta.begin(len(printers), 'Creating printers')
for i, printer in printers.iterrows():
    uri = create_uri()
    label = printer['catalog_printer_name']
    triples += populate_entity(uri, c.actor, label)

    printers.at[i, "printer_uri"] = uri

# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()


# Add to catalog table
catalogs = catalogs.merge(printers, on="catalog_printer_name", how="left").drop_duplicates()

[ETA] Inserting triples - 0.013 iterations in 00h00m00s (0.1 iter/sec)                                         


In [18]:
# Imports catalog owners

# List all printers
catalog_owners = catalogs[['catalog_owner']].dropna().drop_duplicates()

triples = []
eta.begin(len(catalog_owners), 'Creating catalog owners')
for i, owner in catalog_owners.iterrows():
    uri = create_uri()
    label = owner['catalog_owner']
    triples += populate_entity(uri, c.actor, label)

    catalog_owners.at[i, "owner_uri"] = uri

    eta.iter()
eta.end()

# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

# Add to catalog table
catalogs = catalogs.merge(catalog_owners, on="catalog_owner", how="left").drop_duplicates()


[ETA] Creating catalog owners - 5 iterations in 00h00m00s (624.8 iter/sec)                                     
[ETA] Inserting triples - 0.01 iterations in 00h00m00s (0.1 iter/sec)                                          


In [19]:
# Import occupations
occupations = catalogs[['seller_occupation']].dropna().drop_duplicates()

triples = []
occupations_uris = {}
eta.begin(len(occupations), 'Creating occupations')
for i, row in occupations.iterrows():
    occupation_uri = create_uri()
    occupation_label = row['seller_occupation']
    triples += populate_entity(occupation_uri, c.occupation_peit, occupation_label)

    occupations_uris[row['seller_occupation']] = occupation_uri

    eta.iter()
eta.end()

# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

[ETA] Creating occupations - 5 iterations in 00h00m00s (738.7 iter/sec)                                        
[ETA] Inserting triples - 0.01 iterations in 00h00m00s (0.1 iter/sec)                                          


In [None]:
# Import Catalogs

catalogs['lugt'] = catalogs['lugt'].astype(pd.Int64Dtype())

# Create triples
triples = []
eta.begin(len(lots), 'Creating triples')
for i, row in catalogs.iterrows():

    # Information needed for the full row
    date = row['name'][0:row['name'].index('_')]
    catalog_name = f"{row['name'][row['name'].index('_') + 1:].replace('-', ' ')} ({date})"

    # Physical Human made thing (the catalog)
    phy_human_made_thing_uri = create_uri()
    label = row['catalog_full_name']
    triples += populate_entity(phy_human_made_thing_uri, c.physical_human_made_thing, label)

    # Printer branch
    if pd.notna(row['catalog_printer_name']):
        production_uri = create_uri()
        label = f"Imprimé par {row['catalog_printer_name']}"
        triples += populate_entity(production_uri, c.production, label)

        triples += [
            (production_uri, p.has_produced, phy_human_made_thing_uri),
            (production_uri, p.carried_out_by, row['printer_uri'])
        ]

    # URL (note) branch
    if pd.notna(row['url']):
        identifier_uri = create_uri()
        identifier_label = "Lien Gallica"
        triples += populate_entity(identifier_uri, c.identifier, identifier_label)
        triples += [
            (phy_human_made_thing_uri, p.same_as_external_identifier, identifier_uri),
            (identifier_uri, p.has_identifier_type, k.identifierType_url),
            (identifier_uri, p.is_identified_by, row['url']),
        ]

    # Lugt branch
    if pd.notna(row['lugt']):
        # Identifier
        identifier_uri = create_uri()
        identifier_label = f"Lugt {row['lugt']}"
        triples += populate_entity(identifier_uri, c.identifier, identifier_label)

        # Triples
        triples += [
            (phy_human_made_thing_uri, p.same_as_external_identifier, identifier_uri),
            (identifier_uri, p.has_identifier_type, k.identifierType_lugt),
            (identifier_uri, p.is_identified_by, to_literal(str(row['lugt'])))
        ]

    # Belonging branch
    if pd.notna(row['catalog_owner']):
        holding_a_right_or_obligation_uri = create_uri()
        holding_a_right_or_obligation_label = f"Appartenance du catalogue à {row['catalog_owner']}"
        triples += populate_entity(holding_a_right_or_obligation_uri, c.holding_a_right_or_obligation, holding_a_right_or_obligation_label) 
        triples += [
            (holding_a_right_or_obligation_uri, p.is_subjection_of, phy_human_made_thing_uri),
            (holding_a_right_or_obligation_uri, p.is_right_of, row['owner_uri']),
        ]

    # Distribution places branch
    if pd.notna(row['catalog_distributed_places']):
        distribution_places = row['catalog_distributed_places'].split('\n')
        for distributed_place in distribution_places:
            address = distributed_place[:distributed_place.rindex(' - ')]
            geoplace_uri = addresses_uris[address]

            epistemic_location_uri = create_uri()
            epistemic_location_label = f"{catalog_name} distribué à {address}"
            triples += populate_entity(epistemic_location_uri, c.epistemic_location_of_a_physical_thing, epistemic_location_label)
            triples += [
                (epistemic_location_uri, p.is_location_of, phy_human_made_thing_uri),
                (epistemic_location_uri, p.is_localized_at, geoplace_uri),
                (epistemic_location_uri, p.has_location_type, k.epistemicLocationType_distributionPlace),
            ]

    # Expression branch
    expression_uri = create_uri()
    expression_label = f"Contenu du catalogue {catalog_name}"
    triples += populate_entity(expression_uri, c.expression, expression_label)
    triples += [(phy_human_made_thing_uri, p.carries, expression_uri)]

    # Page number Branch
    if pd.notna(row['catalog_page_number']):
        page_number = str(int(float(row['catalog_page_number'])))
        quantiQual_uri = create_uri()
        quantiQual_label = f"{page_number} pages"
        numDim_uri = create_uri()
        numDim_label = f"{page_number}"
        triples += populate_entity(quantiQual_uri, c.quantifiable_quality, quantiQual_label)
        triples += populate_entity(numDim_uri, c.numeric_dimension, numDim_label)
        triples += [
            (expression_uri, p.has_quantifiable_quality, quantiQual_uri),
            (quantiQual_uri, p.has_quality_type, k.entityQualityType_pageNumber),
            (quantiQual_uri, p.has_quality_dimension, numDim_uri),
            (numDim_uri, p.has_value, to_literal(page_number)),
        ]

    # Lot number Branch
    if pd.notna(row['sale_lot_number']):
        lot_number = str(int(float(row['sale_lot_number'])))
        quantiQual_uri = create_uri()
        quantiQual_label = f"{lot_number} lots"
        numDim_uri = create_uri()
        numDim_label = f"{lot_number}"
        triples += populate_entity(quantiQual_uri, c.quantifiable_quality, quantiQual_label)
        triples += populate_entity(numDim_uri, c.numeric_dimension, numDim_label)
        triples += [
            (expression_uri, p.has_quantifiable_quality, quantiQual_uri),
            (quantiQual_uri, p.has_quality_type, k.entityQualityType_lotNumber),
            (quantiQual_uri, p.has_quality_dimension, numDim_uri),
            (numDim_uri, p.has_value, to_literal(lot_number))
        ]

    # Auctions branch
    auction_uri = create_uri()
    auction_label = catalog_name
    triples += populate_entity(auction_uri, c.activity, auction_label)

    # Auction Mentioning
    mentioning_uri = create_uri()
    mentioning_label = "Mention de la vente aux enchères"
    triples += populate_entity(mentioning_uri, c.mentioning, mentioning_label)
    triples += [
        (mentioning_uri, p.mentions, auction_uri),
        (mentioning_uri, p.is_mentioned_in, expression_uri)
    ]

    # Auction fees
    if pd.notna(row['sale_fees']):
        fees = float(row['sale_fees']) * 100
        quantiQual_uri = create_uri()
        quantiQual_label = f"{fees}% de frais"
        triples += populate_entity(quantiQual_uri, c.quantifiable_quality_of_a_spatio_temporal_phenomenon, quantiQual_label)
        numDim_uri = create_uri()
        numDim_label = f"{fees}%"
        triples += populate_entity(numDim_uri, c.numeric_dimension, numDim_label)
        triples += [
            (auction_uri, p.has_quantifiable_quality, quantiQual_uri),
            (quantiQual_uri, p.has_quality_type, k.quantiQualSpati_fees),
            (quantiQual_uri, p.has_quality_dimension, numDim_uri),
            (numDim_uri, p.has_value, to_literal(str(fees)))
        ]

    # Auction Auctioneer
    if pd.notna(row['auctioneer_names']):
        for auctioneer in row['auctioneer_names'].split('\n'):
            person_uri = persons_uris[auctioneer]

            participation_uri = create_uri()
            participation_label = "Commissaire priseur de la vente"
            triples += populate_entity(participation_uri, c.participation, participation_label)
            triples+= [
                (participation_uri, p.is_participation_in, auction_uri),
                (participation_uri, p.is_participation_of, person_uri),
                (participation_uri, p.is_participation_in_the_quality_of, k.actorSocialQuality_auctioneer)
            ]

    # Auction Expert
    if pd.notna(row['experts_names']):
        for expert in row['experts_names'].split('\n'):
            if expert == '': continue
            person_uri = persons_uris[expert]

            participation_uri = create_uri()
            participation_label = "Expert de la vente"
            triples += populate_entity(participation_uri, c.participation, participation_label)
            triples+= [
                (participation_uri, p.is_participation_in, auction_uri),
                (participation_uri, p.is_participation_of, person_uri),
                (participation_uri, p.is_participation_in_the_quality_of, k.actorSocialQuality_expert)
            ]

    # Auction Seller
    if pd.notna(row['seller_name']):
        for seller in row['seller_name'].split('\n'):
            person_uri = persons_uris[seller]

            participation_uri = create_uri()
            participation_label = "Vendeur de la vente"
            triples += populate_entity(participation_uri, c.participation, participation_label)
            triples+= [
                (participation_uri, p.is_participation_in, auction_uri),
                (participation_uri, p.is_participation_of, person_uri),
                (participation_uri, p.is_participation_in_the_quality_of, k.actorSocialQuality_expert)
            ]

    # Auction Seller occupation
    if pd.notna(row['seller_occupation']):
        person_uri = persons_uris[row['seller_name']]
        occupation_uri = occupations_uris[row['seller_occupation']]

        occupation_teen_uri = create_uri()
        occupation_teen_label = f"{row['seller_name']} était {row['seller_occupation']}"
        triples += populate_entity(occupation_teen_uri, c.occupation_teen, occupation_teen_label)
        triples += [
            (occupation_teen_uri, p.is_occupation_of, person_uri),
            (occupation_teen_uri, p.is_about, occupation_uri)
        ]
    
    # Auction place
    if pd.notna(row['sale_place']):
        for sale_place in row['sale_place'].split('\n'):
            geoplace_uri = saleplaces_uris[sale_place]
            triples+= [(auction_uri, p.took_place_at, geoplace_uri)]

    # Activity Type 
    triples += [(auction_uri, p.has_activity_type, k.activityType_auction)]

    # Dates
    if pd.notna(row['sale_date_begin']):
        triples += [(auction_uri, p.begin_of_the_begin, to_literal(row['sale_date_begin']))]
    if pd.notna(row['sale_date_end']):
        triples += [(auction_uri, p.end_of_the_end, to_literal(row['sale_date_end']))]

    catalogs.at[i, 'catalog_uri'] = expression_uri
    catalogs.at[i, 'auction_uri'] = auction_uri
    eta.iter()
eta.end()

# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

# Save URIs for later access
expression_uris = {}
for i, row in catalogs[['name', 'catalog_uri']].iterrows():
    expression_uris[row['name']] = row['catalog_uri']
auctions_uris = {}
for i, row in catalogs[['name', 'auction_uri']].iterrows():
    auctions_uris[row['name']] = row['auction_uri']

[ETA] Creating triples - 12596 iterations in 00h00m00s (16627.2 iter/sec)                                      
[ETA] Inserting triples - 2.516 iterations in 00h00m00s (4.0 iter/sec)                                         


In [21]:
# Import Lots

# Create triples
triples = []
eta.begin(len(lots), 'Creating lots')
for i, row in lots.iterrows():

    # Information needed for the full row
    date = row['catalog'][0:row['catalog'].index('_')]
    catalog_name = f"{row['catalog'][row['catalog'].index('_') + 1:].replace('-', ' ')} ({date})"
    index = row['index']

    # Physical Set (the lot)
    physical_set_uri = create_uri()
    label = f"{catalog_name}, lot n°{index}"
    triples += populate_entity(physical_set_uri, c.physical_set, label)

    # Identification branch
    if pd.notna(row['index']):
        triples.append((physical_set_uri, p.is_identified_by, to_literal(str(row['index']))))

    # Description branch
    if pd.notna(row['description']):
        # Linguistic Object (lot description)
        linguistic_object_uri = create_uri()
        label = f"Description du lot n°{str(index).upper()}, vente {catalog_name}"
        triples += populate_entity(linguistic_object_uri, c.linguistic_object, label)

        triples += [
            (linguistic_object_uri, p.is_about, physical_set_uri),
            (linguistic_object_uri, p.has_language, k.language_french),
            (linguistic_object_uri, p.has_symbolic_content, to_literal(row['description']))
        ]

    # Quantity branch
    if pd.notna(row['number']):
        # Quantifiable Quality  (items number)
        quantifiable_quality_uri = create_uri()
        label = f"Nombre d'objet : {row['number']}"
        triples += populate_entity(quantifiable_quality_uri, c.quantifiable_quality, label)
        # Numeric dimension (items number)
        num_dimention_uri = create_uri()
        label = str(row['number'])
        triples += populate_entity(num_dimention_uri, c.numeric_dimension, label)

        # Graph
        triples += [
            (physical_set_uri, p.has_quantifiable_quality, quantifiable_quality_uri),
            (quantifiable_quality_uri, p.has_quality_type, k.entityQualityType_number),
            (quantifiable_quality_uri, p.has_quality_dimension, num_dimention_uri),
            (num_dimention_uri, p.has_value, to_literal(str(row['number'])))
        ]

    # Object type branch 
    if pd.notna(row['object_type']):
        types = row['object_type'].split(', ')
        triples += [(physical_set_uri, p.was_or_is_composed_of_object_of_type, object_types_dict[type]['uri']) for type in types]

    # Material & Technique branch
    if pd.notna(row['material_technique']):
        mat_techs = row['material_technique'].split(', ')
        triples += [(physical_set_uri, p.was_or_is_composed_of_objects_produced_with, material_techniques_dict[mat_tech]['uri']) for mat_tech in mat_techs]

    # Production branch
    if pd.notna(row['origin']) or pd.notna(row['author']) or pd.notna(row['period']):
        # Production (fabrication)
        production_uri = create_uri()
        label = f"Conception du lot n°{str(index).upper()} de {catalog_name}"
        triples += populate_entity(production_uri, c.production, label)
        triples += [(production_uri, p.has_produced, physical_set_uri)]
        if pd.notna(row['origin']):
            triples += [(production_uri, p.took_place_at, origins_dict[origin]['uri']) for origin in row['origin'].split(', ')]
        if pd.notna(row['author']):
            triples += [(production_uri, p.carried_out_by, authors_dict[author]['uri']) for author in row['author'].split(', ')]
        if pd.notna(row['period']):
            triples += [(production_uri, p.has_time_span, periods_dict[period]['uri']) for period in row['period'].split(', ')]  
    
    # Mentioning (expression) Branch
    mentioning_uri = create_uri()
    mentioning_label = f"Mention du lot n°{index}"
    triples += populate_entity(mentioning_uri, c.mentioning, mentioning_label)
    catalog_uri = expression_uris[row['catalog']]
    triples += [
        (mentioning_uri, p.mentions, physical_set_uri),
        (mentioning_uri, p.is_mentioned_in, catalog_uri)
    ]

    # Lot offered Branch
    intentExpr_uri = create_uri()
    intentExpr_label = f"Mise en vente du lot n°{index}"
    triples += populate_entity(intentExpr_uri, c.intentional_expression, intentExpr_label)
    # Offering of the lot
    triples += [(intentExpr_uri, p.occured_in_the_presence_of, physical_set_uri)]
    # Part of the auction
    auction_uri = auctions_uris[row['catalog']]
    triples += [(intentExpr_uri, p.is_part_of, auction_uri)]
    # Mentioned in the catalog expression
    expression_uri = expression_uris[row['catalog']]
    mentioning_uri = create_uri()
    mentioning_label = f"Mention de la mise en vente du lot n°{index}"
    triples += populate_entity(mentioning_uri, c.mentioning, mentioning_label)
    triples += [
        (mentioning_uri, p.mentions, intentExpr_uri),
        (mentioning_uri, p.is_mentioned_in, expression_uri)
    ]

    # Economic Transaction Branch
    if pd.notna(row['sale_price']) or pd.notna(row['sale_buyer']):
        econTrans_uri = create_uri()
        econTrans_label = f"Vente du lot n°{index}"
        triples += populate_entity(econTrans_uri, c.economic_transaction, econTrans_label)
        triples += [
            (econTrans_uri, p.occured_in_the_presence_of, physical_set_uri),
            (econTrans_uri, p.has_setting, intentExpr_uri),
        ]

        if pd.notna(row['sale_price']):
            sale_price = str(int(float(row['sale_price'])))
            quantiQual_uri = create_uri()
            quantiQual_label = f"Adjugé à {sale_price} {row['sale_currency_label']}"
            triples += populate_entity(quantiQual_uri, c.quantifiable_quality_of_an_intentional_event, quantiQual_label)
            monAmount_uri = create_uri()
            monAmount_label = f"{sale_price} {row['sale_currency_label']}"
            triples += populate_entity(monAmount_uri, c.monetary_amount, monAmount_label)
            triples += [
                (monAmount_uri, p.has_currency, row['sale_currency_uri']),
                (monAmount_uri, p.has_value, to_literal(sale_price)),
                (quantiQual_uri, p.has_quality_dimension, monAmount_uri),
                (quantiQual_uri, p.has_quantifiable_quality_of_and_intentional_event_type, k.quantiQualIntEventType_hammerPrice),
                (quantiQual_uri, p.is_quantifiable_quality_of, econTrans_uri)
            ]

        if pd.notna(row['sale_buyer']):
            person_uri = persons_uris[row['sale_buyer']]
            participation_uri = create_uri()
            participation_label = "Adjudicataire"
            triples += populate_entity(participation_uri, c.participation, participation_label)
            triples += [
                (participation_uri, p.is_participation_in, econTrans_uri),
                (participation_uri, p.is_participation_of, person_uri),
                (participation_uri, p.is_participation_in_the_quality_of, k.actorSocialQuality_buyer)
            ]

    lots.at[i, 'physical_set_uri'] = physical_set_uri
    eta.iter()
eta.end()

# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

[ETA] Creating lots - 12596 iterations in 00h02m18s (91.1 iter/sec)                                            
[ETA] Inserting triples - 442.546 iterations in 00h01m31s (4.9 iter/sec)                                       


# Model generation

In [22]:
# Ontology definition

ontology = {
    v: k.replace('_', ' ').title()
    for cls in (Classes, Properties)
    for k, v in cls.__dict__.items()
    if not k.startswith("__")
}

graph_begin = "graph " + data_named_graph + " {" if data_named_graph else ""
graph_end = "}" if data_named_graph else ""

In [23]:
# Add class and properties label for Geovistory web component

triples = []
for key, value in ontology.items():
    triples.append((key, 'rdfs:label', to_literal(value, 'en')))


# Insert triples
eta.begin(len(triples) / 1000, 'Inserting triples')
sparql.insert(triples, data_named_graph, eta_fct=eta.iter)
eta.end()

[ETA] Inserting triples - 0.083 iterations in 00h00m00s (0.7 iter/sec)                                         


In [24]:
# Get all used classes

classes = sparql.run(f"""
    select distinct ?cls
    where {{ 
        {graph_begin}
            ?s rdf:type ?cls .
        {graph_end}
    }}
""")
classes = list(map(lambda x: x['cls'], classes))

In [25]:
# Build SHACL

shacl = ""
for prefix in prefixes:
    shacl += prefix.to_turtle() + '\n'

shacl += '\n'

eta.begin(len(classes), "Building SHACL")
for cls in classes:
    if cls not in ontology: continue
    if ontology[cls] == "": continue

    cls_raw = cls
    cls = cls.replace('ontome', 'geov')

    # Shape Name
    shape_name = cls.replace(':', '_') + '_shape'

    # Properties (outgoing) - Instances
    outgoing_props_instances = sparql.run(f"""
        select distinct ?prop_outgoing ?range_cls where {{
            {graph_begin}
                ?cls rdf:type {cls_raw} .
                ?cls ?prop_outgoing ?range_instance .
                ?range_instance rdf:type ?range_cls .
            {graph_end}                    
        }}
    """)

    # Properties (outgoing) - Values
    outgoing_props_values = sparql.run(f"""
        select distinct ?prop_outgoing where {{
            {graph_begin}
                ?cls rdf:type {cls_raw} .
                ?cls ?prop_outgoing ?range_instance .
                filter(isLiteral(?range_instance)).
            {graph_end}       
        }}
    """)

    # Properties (incoming)
    incoming_props = sparql.run(f"""
        select distinct ?prop_incoming ?domain_cls where {{
            {graph_begin}
                ?cls rdf:type {cls_raw} .
                ?domain_instance ?prop_incoming ?cls .
                ?domain_instance rdf:type ?domain_cls .
            {graph_end}       
        }}
    """)

    shacl += '\n'
    shacl += f'sdh-shacl:{shape_name} a sh:NodeShape ;\n'
    shacl += f'    sh:targetClass {cls} ;\n'
    shacl += f'    sh:name "{ontology[cls]}" ;\n'

    for prop in outgoing_props_values:
        prop_uri = prop['prop_outgoing']

        if prop_uri not in ontology: continue
        if ontology[prop_uri] == "": continue

        shacl += '\n'
        shacl += f'    sh:property [\n'
        shacl += f'        sh:path {prop_uri} ;\n'
        shacl += f'        sh:name "{ontology[prop_uri]}" ;\n'
        shacl += f'        sh:datatype xsd:string ;\n'
        shacl += f'    ] ;\n'

    for prop in outgoing_props_instances:
        prop_uri = prop['prop_outgoing'].replace('ontome', 'geov')

        if prop_uri not in ontology: continue
        if ontology[prop_uri] == "": continue

        range_uri = prop['range_cls']
        shacl += '\n'
        shacl += f'    sh:property [\n'
        shacl += f'        sh:path {prop_uri} ;\n'
        shacl += f'        sh:name "{ontology[prop_uri]}" ;\n'
        shacl += f'        sh:class {range_uri} ;\n'
        shacl += f'    ] ;\n'

    for prop in incoming_props:
        prop_uri = prop['prop_incoming'].replace('ontome', 'geov')

        if prop_uri not in ontology: continue
        if ontology[prop_uri] == "": continue
        
        domain_uri = prop['domain_cls']
        shacl += '\n'
        shacl += f'    sh:property [\n'
        shacl += f'        sh:path [\n'
        shacl += f'            sh:inversePath {prop_uri} ;\n'
        shacl += f'        ] ;\n'
        shacl += f'        sh:name "{ontology[prop_uri]}" ;\n'
        shacl += f'    ] ;\n'


    shacl += f'\n    .\n'

    eta.iter()
eta.end()

file = open('./objective-shacl.ttl', 'w')
file.write(shacl)
file.close()

[ETA] Building SHACL - 41 iterations in 00h00m13s (3.0 iter/sec)                                               


In [26]:
# Upload SHACL content into the endpoint

sparql.upload_turtle(shacl, shacl_named_graph)

> Uploading 10000 (0 / 1116) Done


---

In [27]:
# Finished time and Elapsed calculation

end = datetime.now()
print('Finished at:', end.strftime("%H:%M:%S"))

elapsed = end - begin
seconds = int(elapsed.total_seconds())
hours, remainder = divmod(seconds, 3600)
minutes, seconds = divmod(remainder, 60)

print(f"Elapsed time: {hours}h {minutes}m {seconds}s")

Finished at: 10:02:15
Elapsed time: 0h 5m 9s
