In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

from getpass import getpass
from urllib.parse import quote
from copy import deepcopy

import pandas as pd
import numpy as np
from SPARQLWrapper import JSON, SPARQLWrapper
import time
from tqdm.notebook import tqdm
from typing import TypeVar, List, Callable
from helpers import s3, iiif
import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry
from rdflib.namespace import RDF
from rdflib import Graph, URIRef, Namespace
# RDF namespaces used further down when manifest triples are added to record graphs
# (e.g. RICO.hasInstantiation, SPAV.fvman).
SPAV = Namespace('http://vocab.performing-arts.ch/')
RICO = Namespace('https://www.ica.org/standards/RiC/ontology#')
# Prompt for each credential only when it is not already defined in the
# kernel, so re-running this cell keeps the values entered earlier.
if "S3_CLIENT_ID" not in globals():
    S3_CLIENT_ID = getpass("Please input s3 client id")

if "S3_CLIENT_SECRET" not in globals():
    S3_CLIENT_SECRET = getpass("Please input s3 client secret")

if "SPARQL_USERNAME" not in globals():
    SPARQL_USERNAME = getpass("Please input sparql username")

if "SPARQL_PASSWORD" not in globals():
    SPARQL_PASSWORD = getpass("Please input sparql password")

# Enable tqdm's pandas integration.
tqdm.pandas()

In [2]:
# Public SPARQL endpoint; results are requested as JSON.
sparql = SPARQLWrapper("https://www.performing-arts.ch/sparql")
sparql.setReturnFormat(JSON)

# Query template. Doubled braces are literal SPARQL braces; the two `{}`
# placeholders at the end receive LIMIT and OFFSET through str.format.
# Each row describes a resource that has an instantiation with carrier type
# spav:fvman (excluding instances whose URI contains "/collection"), one of
# its ancestors (?parentResource, reached through rico:isOrWasIncludedIn+) and
# the direct container of that ancestor (?grandParentResource).
# NOTE(review): there is no ORDER BY, so paging with LIMIT/OFFSET relies on the
# endpoint returning rows in a stable order between requests — confirm.
base_query = """
PREFIX olo: <http://purl.org/ontology/olo/core#>
PREFIX spao: <http://ontology.performing-arts.ch/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX spav: <http://vocab.performing-arts.ch/>
PREFIX rico: <https://www.ica.org/standards/RiC/ontology#>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>

SELECT ?iiifURI ?resource ?index ?descriptiveNote ?resourceName ?identifierValue ?resourceType ?parentResource ?parentIndex ?parentIdentifierValue ?parentName ?parentDescriptiveNote ?parentType ?grandParentResource ?grandParentIndex ?grandParentIdentifierValue ?grandParentName ?grandParentDescriptiveNote ?grandParentType WHERE {{
  ?resource rico:hasInstantiation ?instance.
  ?instance rico:hasCarrierType spav:fvman .
  OPTIONAL {{?resource rico:hasOrHadIdentifier ?identifier .
    ?identifier rdf:value ?identifierValue.
      {{ ?identifier crm:P2_has_type spav:iduni. }}
    	UNION
    {{ ?identifier crm:P2_has_type spav:idcon. }}
  }}
  OPTIONAL {{ ?resource olo:index ?index . }}
  OPTIONAL {{ ?resource rico:descriptiveNote ?descriptiveNote . }}
  OPTIONAL {{?resource rico:name ?resourceName . }}
  OPTIONAL {{ ?resource rico:hasRecordSetType ?resourceType . }}
  ?resource (rico:isOrWasIncludedIn+) ?parentResource.
  OPTIONAL {{
    ?parentResource rico:hasOrHadIdentifier ?parentIdentifier.
    {{ ?parentIdentifier crm:P2_has_type spav:iduni. }}
    UNION
    {{ ?parentIdentifier crm:P2_has_type spav:idcon. }}
    ?parentIdentifier rdf:value ?parentIdentifierValue.
  }}
   OPTIONAL {{ ?parentResource olo:index ?parentIndex . }}
  OPTIONAL {{ ?parentResource rico:descriptiveNote ?parentDescriptiveNote . }}
  OPTIONAL {{ ?parentResource rico:name ?parentName . }}
  OPTIONAL {{ ?parentResource rico:hasRecordSetType ?parentType . }}
  ?parentResource rico:isOrWasIncludedIn ?grandParentResource.
  OPTIONAL {{
    ?grandParentResource rico:hasOrHadIdentifier ?grandParentIdentifier.
    {{ ?grandParentIdentifier crm:P2_has_type spav:iduni. }}
    UNION
    {{ ?grandParentIdentifier crm:P2_has_type spav:idcon. }}
    ?grandParentIdentifier rdf:value ?grandParentIdentifierValue.
  }}
    OPTIONAL {{ ?grandParentResource olo:index ?grandParentIndex . }}

  OPTIONAL {{ ?grandParentResource rico:descriptiveNote ?grandParentDescriptiveNote . }}
  OPTIONAL {{ ?grandParentResource rico:name ?grandParentName . }}
  OPTIONAL {{ ?grandParentResource rico:hasRecordSetType ?grandParentType . }}
  FILTER (!regex(str(?instance), "/collection")) . 
  BIND(?instance AS ?iiifURI)
}} LIMIT {} OFFSET {}
"""
# Page through the endpoint until an empty page is returned.
records = []
# Variable names announced by the endpoint in the JSON "head". SPARQL JSON
# bindings omit unbound (OPTIONAL) variables, so a variable that is never bound
# would otherwise be missing as a column and break the column selections in
# the following cell.
columns = None
start = 0
limit = 10000
with tqdm() as bar:  # the context manager closes the bar once paging is done
    while True:
        sparql.setQuery(base_query.format(limit, start))
        response = sparql.query().convert()
        if columns is None:
            columns = response.get("head", {}).get("vars")
        results = response["results"]["bindings"]
        if len(results) == 0:
            break
        # Keep only the literal/URI value of every binding.
        records.extend({k: v["value"] for k, v in x.items()} for x in results)
        bar.update(len(results))
        start += limit

# One row per query solution; columns follow the SELECT clause when the
# endpoint reported them, otherwise they are inferred from the rows.
data = pd.DataFrame(records, columns=columns)

0it [00:00, ?it/s]

In [3]:
# Common column layout shared by resources, parents and grand parents.
info_columns = ["resourceURI", "identifier", "index", "descriptivenote", "type", "name", "manifestURI"]


def _extract_level_infos(frame, uri_column, value_columns):
    """Build one info table (columns = ``info_columns``) for a hierarchy level.

    Args:
        frame: query result table.
        uri_column: column holding the URI of the level's records.
        value_columns: columns holding identifier, index, descriptive note,
            type and name (in that order), optionally followed by "iiifURI".

    Returns:
        DataFrame with one row per distinct URI (first occurrence wins), the
        index zero-padded to 9 characters and missing values replaced by None.
    """
    level = frame.set_index(uri_column)[list(value_columns)]
    # .copy() so the column assignment below never targets a view of `frame`.
    level = level.loc[~level.index.duplicated(keep="first")].copy()
    if "iiifURI" not in level.columns:
        # Levels without their own manifest get an empty manifest column.
        level["iiifURI"] = None
    level = level.reset_index()
    level.columns = info_columns
    # Zero-pad so that the string comparison used for sorting matches numeric order.
    level["index"] = level["index"].apply(lambda x: x.zfill(9) if pd.notna(x) else x)
    return level.replace({np.nan: None})


def _infos_by_uri(level):
    """Map each record URI to its info dict (the URI is repeated under "uri")."""
    return level.assign(uri=level["resourceURI"]).set_index("resourceURI").to_dict(orient="index")


## Get resources
resources = _extract_level_infos(
    data,
    "resource",
    ["identifierValue", "index", "descriptiveNote", "resourceType", "resourceName", "iiifURI"],
)

## Get parents
parents = _extract_level_infos(
    data,
    "parentResource",
    ["parentIdentifierValue", "parentIndex", "parentDescriptiveNote", "parentType", "parentName"],
)

## Get grand parents
grand_parents = _extract_level_infos(
    data,
    "grandParentResource",
    ["grandParentIdentifierValue", "grandParentIndex", "grandParentDescriptiveNote", "grandParentType", "grandParentName"],
)

# Map URI to infos. On duplicate URIs the later level wins
# (grand parents over parents over resources).
resourceURI2infos = _infos_by_uri(resources) | _infos_by_uri(parents) | _infos_by_uri(grand_parents)

infos = pd.DataFrame.from_records(list(resourceURI2infos.values()))

# Root nodes: values of grandParentResource that never occur as a parentResource.
never_a_parent = ~data["grandParentResource"].isin(data["parentResource"])
root_nodes = (
    data.loc[never_a_parent, "grandParentResource"]
    .drop_duplicates(keep="first")
    .values.tolist()
)

# Parent nodes: values of parentResource that never occur as a grandParentResource.
never_a_grand_parent = ~data["parentResource"].isin(data["grandParentResource"])
parent_nodes = (
    data.loc[never_a_grand_parent, "parentResource"]
    .drop_duplicates(keep="first")
    .values.tolist()
)

# Children of a grand parent are its parentResource values ...
children_of_grand_parents = (
    data.groupby("grandParentResource")["parentResource"].apply(set).to_dict()
)
# ... and children of a parent node are the resources listed under it.
rows_below_parent_nodes = data.loc[data["parentResource"].isin(parent_nodes)]
children_of_parent_nodes = (
    rows_below_parent_nodes.groupby("parentResource")["resource"].apply(set).to_dict()
)
# Parent nodes are never grand parents, so the two key sets do not overlap.
resourceURI2children = children_of_grand_parents | children_of_parent_nodes

# NOTE(review): despite the name this is the view of info dicts of
# resourceURI2infos, not a manifest-to-resource mapping.
manifestURI2resourceURI = resourceURI2infos.values()

In [4]:
# HTTP session used for the manifest downloads. Requests to https:// URLs are
# retried up to 5 times (with backoff) on the listed status codes.
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
s = requests.Session()
s.mount("https://", HTTPAdapter(max_retries=retries))

def manifest_to_items(manifest_uri, timeout=10):
    """Download a manifest and return the list stored under its "items" key.

    Best effort: any failure (404, other HTTP error, timeout, invalid JSON,
    missing "items") is printed and an empty list is returned, so one broken
    manifest does not abort the whole download loop.

    Args:
        manifest_uri: URL of the manifest to fetch.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the loop indefinitely.

    Returns:
        The manifest's items, or [] when the manifest could not be read.
    """
    try:
        res = s.get(manifest_uri, timeout=timeout)
        if res.status_code == 404:
            print(f"Error with {manifest_uri}, it does not exists")
            return []
        res.raise_for_status()
        res_json = res.json()
        return res_json['items']
    except Exception as e:
        print(f"Error with {manifest_uri}: ", e)
        return []

# Download the items of every resource manifest once, keyed by manifest URI.
manifest_uris = resources['manifestURI'].tolist()
manifest_uri_to_items = dict(
    (uri, manifest_to_items(uri)) for uri in tqdm(manifest_uris)
)

  0%|          | 0/1800 [00:00<?, ?it/s]

In [5]:
from collections import defaultdict

def get_first_not_none(dict_obj, keys):
    """Return the value of the first key in ``keys`` whose value is not None.

    Keys missing from ``dict_obj`` count as None. Returns None when no key
    yields a value.
    """
    return next(
        (dict_obj.get(key_) for key_ in keys if dict_obj.get(key_) is not None),
        None,
    )

def expand_identifier(identifier: str):
    """Split an identifier on "-" and turn numeric parts into ints.

    "1553-12" becomes (1553, 12), so that sorting follows numeric order rather
    than string order. Non-numeric parts are kept as strings.

    ``str.isdecimal`` is used rather than ``str.isdigit``: the latter also
    accepts characters such as "²" that ``int()`` cannot convert.
    """
    return tuple(int(part) if part.isdecimal() else part for part in identifier.split("-"))

def resource_uri_sorting(resource_uri):
    """Sort key for a record URI, based on the record's infos.

    Preference order: the record's index, then its identifier, then its name,
    then the URI itself. The key is always a ``(rank, payload)`` tuple so keys
    of siblings described by different fields can still be compared; returning
    a bare str, tuple or None depending on the record makes ``sorted`` raise
    TypeError as soon as siblings are heterogeneous.

    Siblings that all use the same field keep the order they had with the
    bare value as key.
    """
    resource = resourceURI2infos.get(resource_uri)
    if resource is not None:
        if resource.get("index") is not None:
            return (0, resource.get("index"))
        if resource.get("identifier") is not None:
            # Tag every part so an int and a str at the same position compare
            # cleanly (ints first) instead of raising TypeError.
            parts = expand_identifier(resource.get("identifier"))
            return (1, tuple((0, part, "") if isinstance(part, int) else (1, 0, part) for part in parts))
        if resource.get("name") is not None:
            return (2, resource.get("name"))
    # Unknown record, or a record with neither index, identifier nor name.
    return (3, resource_uri)

def get_ordered_descendants_with_manifests(resource_uri, infos):
    """Collect, depth first and in sorted order, the infos of all descendants that have a manifest.

    Args:
        resource_uri: URI of the record whose descendants are collected.
        infos: list the matching info dicts are appended to (mutated in place).

    Returns:
        The same ``infos`` list.
    """
    ordered_children = sorted(resourceURI2children.get(resource_uri, []), key=resource_uri_sorting)
    for child_uri in ordered_children:
        child = resourceURI2infos.get(child_uri)
        if child is None:
            # Report the gap and carry on; subscripting None here would raise
            # TypeError and abort the whole traversal.
            print(f"Error finding infos for {child_uri}")
        elif child["manifestURI"] is not None:
            infos.append(child)
        get_ordered_descendants_with_manifests(child_uri, infos)
    return infos

In [6]:
def identifier_to_base_manifest_uri(identifier):
    """Return the collection manifest URI (without file extension) for an identifier.

    The identifier is percent-encoded with ``urllib.parse.quote`` defaults,
    i.e. "/" is left as is.
    """
    prefix = "https://media.performing-arts.ch/iiif/manifest/collection-"
    return prefix + quote(identifier)


def add_uri_rendering_to_item(iiif_item, uri):
    """Add a "rendering" entry linking the item to its record page, once.

    The entry is appended unless an entry with the same id already exists.
    An item without a "rendering" list gets one created; requiring the key to
    exist would silently skip such items.

    Args:
        iiif_item: item dict, modified in place.
        uri: URI of the record the item belongs to.

    Returns:
        The same ``iiif_item``.
    """
    renderings = iiif_item.setdefault("rendering", [])
    # .get: an existing entry without "id" must not abort the lookup.
    if not any(entry.get("id") == uri for entry in renderings):
        renderings.append(
            {
                "id": uri,
                "type": "Text",
                "label": {"en": ["Record on Swiss performing arts platform"]},
                "format": "text/html",
            }
        )
    return iiif_item


def add_label_to_item_if_needed(iiif_item, label):
    """Set the item's label when it has none or only a generated "Picture N" one.

    Nothing happens when ``label`` is None. An existing label is kept unless
    its first English value starts with "Picture ".

    Returns:
        The same ``iiif_item``.
    """
    if label is None:
        return iiif_item
    if "label" in iiif_item:
        first_english = iiif_item["label"].get("en", [""])[0]
        if not first_english.startswith("Picture "):
            return iiif_item
    iiif_item["label"] = iiif.create_multilingual(label)
    return iiif_item


def get_items(manifest_descendants):
    """Gather copies of the items of all given descendants into one list.

    Every copied item receives a rendering link to its record and, where
    needed, a label. The running position across all manifests is passed to
    ``build_item_label`` as the global index.

    Args:
        manifest_descendants: info dicts, each with "manifestURI" and "uri".

    Returns:
        List of item dicts in the order of the descendants.
    """
    collected = []
    for info in manifest_descendants:
        source_items = manifest_uri_to_items.get(info["manifestURI"], [])
        source_count = len(source_items)
        for position, source_item in enumerate(source_items):
            # Work on a copy so the cached manifest items stay untouched.
            entry = add_uri_rendering_to_item(deepcopy(source_item), info["uri"])
            # len(collected) equals the number of items emitted so far.
            label = build_item_label(info, position, source_count, len(collected))
            collected.append(add_label_to_item_if_needed(entry, label))
    return collected


def build_item_label(info, local_index, local_count, global_index):
    """Build the label of one item from the infos of its record.

    * identifier and name: "<identifier> <name>"
    * identifier only: "<identifier>", plus "-<n>" when the record has several items
    * name only: "<name>", plus " - <n>" when the record has several items
    * neither: a multilingual "Picture <global n>" placeholder dict

    Empty strings are treated like missing values, so an empty name no longer
    leaves a trailing space and an empty identifier no longer suppresses the
    item number.

    Args:
        info: info dict of the record ("identifier" and "name" are read).
        local_index: 0-based position of the item within its record.
        local_count: number of items of the record.
        global_index: 0-based position of the item in the whole collection.

    Returns:
        A string label, or a dict keyed by language for the placeholder.
    """
    identifier = info.get("identifier") or None
    name = info.get("name") or None
    numbered = local_count > 1

    if identifier is not None and name is not None:
        return f"{identifier} {name}"
    if identifier is not None:
        return identifier + (f"-{local_index + 1}" if numbered else "")
    if name is not None:
        return name + (f" - {local_index + 1}" if numbered else "")
    return {
        "de": f"Bild {global_index + 1}",
        "fr": f"Image {global_index + 1}",
        "en": f"Picture {global_index + 1}",
        "it": f"Immagine {global_index + 1}",
    }


# Multilingual rights notice handed to iiif.get_manifest_base_metadata.
# NOTE(review): the name shadows the `copyright` builtin that Python's `site`
# module installs in interactive sessions — consider renaming.
copyright = {
    "en": "Various copyrights apply, please check individual records on the SAPA platform.",
    "fr": "Différents droits d'auteur s'appliquent, veuillez vérifier les entrées individuelles sur la plateforme SAPA.",
    "de": "Es gelten verschiedene Copyrights, bitte beachten Sie die einzelnen Einträge auf der SAPA-Plattform.",
}
# Base metadata reused for every collection manifest (passed to iiif.create_manifest).
base_metadata = iiif.get_manifest_base_metadata(copyright, None)

In [12]:
def uri_to_manifest(uri, identifier):
    """Build the collection manifest of a record from the items of its descendants.

    Args:
        uri: URI of the record; its infos supply label, summary, identifier
            and description.
        identifier: identifier used to derive the manifest URI.

    Returns:
        Whatever ``iiif.create_manifest`` returns for the assembled data.
    """
    record = resourceURI2infos.get(uri)
    manifest_uri = identifier_to_base_manifest_uri(identifier)
    descendants = get_ordered_descendants_with_manifests(uri, [])
    return iiif.create_manifest(
        manifest_uri,
        get_items(descendants),
        base_metadata,
        label=record["name"],
        summary=record["name"],
        sapa_resource=uri,
        identifier=record["identifier"],
        description=record["descriptivenote"],
    )

In [13]:
# S3 client used below to upload the generated collection manifests as JSON.
client = s3.S3Client(S3_CLIENT_ID, S3_CLIENT_SECRET)

In [14]:
def create_and_upload_collection_manifest_for_self_and_children(uri: str, bar = None):
    """Create and upload the collection manifest of a record, then recurse into its children.

    Only children that are known and have no manifest of their own are
    processed recursively. When a progress bar is given, its total grows by
    the number of children and it advances by one for every record handled
    or skipped.

    Args:
        uri: URI of the record to process.
        bar: optional tqdm progress bar.
    """
    record = resourceURI2infos.get(uri)
    identifier = record.get("identifier")
    if identifier is None:
        # If we don't have an identifier, create one based on the URI
        identifier = uri.split("/")[-1]
    manifest = uri_to_manifest(uri, identifier)
    client.upload_as_json(manifest, "manifests/collection-" + identifier + ".json")

    children = resourceURI2children.get(uri, [])
    if bar is not None:
        bar.total += len(children)
        bar.update(1)
    for child_uri in children:
        child = resourceURI2infos.get(child_uri)
        needs_collection = child is not None and child.get("manifestURI") is None
        if needs_collection:
            create_and_upload_collection_manifest_for_self_and_children(child_uri, bar)
        elif bar is not None:
            bar.update(1)

In [15]:
# Root records processed by the cells below.
# NOTE(review): the variable names suggest identifiers 1553-1 and 1552-1 — confirm.
uri_1553_1 = "http://data.performing-arts.ch/r/9636965d-b8e7-4009-950a-8a1f39f89456"
uri_1552_1 = "http://data.performing-arts.ch/r/caead728-064b-4fed-8d81-a8ab2c328f97"
uris = [uri_1553_1, uri_1552_1]

In [16]:
# Create and upload the collection manifests of every root record and its descendants.
# The called function already advances the bar once for the record it is given,
# so the loop must not advance it again (that pushed the count past the total).
# The context manager closes the bar at the end.
with tqdm(total=len(uris)) as bar:
    for uri in uris:
        create_and_upload_collection_manifest_for_self_and_children(uri, bar)

  0%|          | 0/2 [00:00<?, ?it/s]

In [151]:
# Graph store endpoint; the percent-encoded graph URI is appended to it.
base_ldp_url = 'https://www.performing-arts.ch/rdf-graph-store?graph='

# Fresh session for the graph store requests, with the same retry policy as
# the session used for the manifest downloads (this rebinds `s`).
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[429, 500, 502, 503, 504],
)
s = requests.Session()
s.mount("https://", HTTPAdapter(max_retries=retries))

def get_graph(graph_uri):
    """Download a named graph from the graph store and parse it as RDF/XML.

    Raises:
        requests.HTTPError: when the graph store answers with an error status.

    Returns:
        An rdflib ``Graph`` holding the parsed triples.
    """
    response = s.get(
        base_ldp_url + quote(graph_uri, safe=''),
        auth=(SPARQL_USERNAME, SPARQL_PASSWORD),
        headers={'Accept': 'application/rdf+xml; charset=UTF-8'},
        timeout=10,
    )
    response.raise_for_status()
    parsed = Graph()
    # NOTE(review): response.text relies on the encoding requests derives from
    # the response — confirm the store always declares UTF-8.
    parsed.parse(data=response.text, format='xml')
    return parsed

def update_graph(graph, graph_uri):
    """Serialize ``graph`` as RDF/XML and PUT it to the graph store under ``graph_uri``.

    Raises:
        requests.HTTPError: when the graph store answers with an error status.
    """
    payload = graph.serialize(format='pretty-xml', encoding='utf-8')
    response = s.put(
        base_ldp_url + quote(graph_uri, safe=''),
        data=payload,
        auth=(SPARQL_USERNAME, SPARQL_PASSWORD),
        headers={'Content-Type': 'application/rdf+xml; charset=UTF-8'},
    )
    response.raise_for_status()

def graph_uris_of_descendants(parent_record_uri):
    """Map every record included (directly or transitively) in a record to a graph that types it.

    Returns:
        Dict of record URI to graph URI; empty when the query yields no rows.
    """
    endpoint = SPARQLWrapper("https://www.performing-arts.ch/sparql")

    endpoint.setQuery(
        f"""
        PREFIX rico: <https://www.ica.org/standards/RiC/ontology#>
        SELECT (SAMPLE(?g) as ?g) (SAMPLE(?record) as ?record)
        WHERE {{
            {{ ?record rico:isOrWasIncludedIn+ <{parent_record_uri}>.}}
            graph ?g {{ ?record a ?type }}
        }}
        GROUP BY ?record
    """
    )
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()['results']["bindings"]
    # One row per record (GROUP BY ?record); keep only the plain values.
    return {row["record"]["value"]: row["g"]["value"] for row in bindings}

def get_graph_by_uri(uri: URIRef):
    """Return the URI of the single graph in which ``uri`` is given a type.

    Raises:
        AssertionError: when no graph or more than one distinct graph matches.
    """
    endpoint = SPARQLWrapper("https://www.performing-arts.ch/sparql")
    endpoint.setQuery(
        f"""
SELECT ?graph
WHERE {{
  GRAPH ?graph {{ 
    <{uri}> a ?type .
  }}
}}
"""
    )
    endpoint.setReturnFormat(JSON)
    bindings = endpoint.query().convert()["results"]["bindings"]
    assert len(bindings) > 0, "Found no matching graph"
    graphs = [row["graph"]["value"] for row in bindings]
    distinct_graphs = set(graphs)
    assert len(distinct_graphs) == 1, f"Found more than one matching graph, found {len(distinct_graphs)}"
    return graphs[0]


In [152]:
def add_collection_to_record(record_uri, record_graph_uri):
    """Link a record to its collection manifest inside the record's graph.

    Records that are unknown or already have a manifest are skipped.
    Otherwise the graph is downloaded, every triple mentioning the manifest
    URI as subject or object is removed, the instantiation triples are added
    and the graph is written back, followed by a short pause.
    """
    record = resourceURI2infos.get(record_uri)
    if record is None or record.get("manifestURI") is not None:
        return
    identifier = record.get("identifier")
    if identifier is None:
        # If we don't have an identifier, create one based on the URI
        identifier = record_uri.split("/")[-1]

    record_graph = get_graph(record_graph_uri)
    manifest_node = URIRef(identifier_to_base_manifest_uri(identifier)+".json")

    # Drop whatever was stored about this manifest before.
    for stale_pattern in ((manifest_node, None, None), (None, None, manifest_node)):
        record_graph.remove(stale_pattern)

    fresh_triples = (
        (URIRef(record_uri), RICO.hasInstantiation, manifest_node),
        (manifest_node, RDF.type, RICO.Instantiation),
        (manifest_node, RICO.hasCarrierType, SPAV.fvman),
        (manifest_node, RICO.hasRepresentationType, SPAV.rtvs),
    )
    for triple in fresh_triples:
        record_graph.add(triple)

    update_graph(record_graph, record_graph_uri)
    # Pause between writes.
    time.sleep(0.2)

def add_collection_to_self_and_children(record_uri, graph_uri, record_uri_to_graph_uri, updated_uris, bar = None):
    """Add the collection manifest link to a record and, recursively, to its children.

    Args:
        record_uri: URI of the record to update.
        graph_uri: URI of the graph holding the record.
        record_uri_to_graph_uri: graph URI of every child record, keyed by record URI.
        updated_uris: set of record URIs already handled; extended in place so
            a record is not written twice.
        bar: optional tqdm progress bar; its total grows by the number of
            children and it advances by one per call.
    """
    already_done = record_uri in updated_uris
    if not already_done:
        add_collection_to_record(record_uri, graph_uri)
        updated_uris.add(record_uri)

    children_uris = resourceURI2children.get(record_uri, [])
    if bar is not None:
        bar.total += len(children_uris)
        bar.update(1)

    for child_record_uri in children_uris:
        child_graph_uri = record_uri_to_graph_uri[child_record_uri]
        add_collection_to_self_and_children(
            child_record_uri,
            child_graph_uri,
            record_uri_to_graph_uri,
            updated_uris,
            bar,
        )

In [153]:
# Record URIs already handled by add_collection_to_self_and_children; kept in
# its own cell so the update loop can be re-run without resetting it.
updated_uris = set()

In [154]:
# Write the collection manifest link into the graph of every root record and its descendants.
# The called function already advances the bar once for the record it is given,
# so the loop must not advance it again (that pushed the count past the total).
# The context manager closes the bar at the end.
with tqdm(total=len(uris)) as bar:
    for uri in uris:
        record_uri_to_graph_uri = graph_uris_of_descendants(uri)
        graph_uri = get_graph_by_uri(uri)
        add_collection_to_self_and_children(uri, graph_uri, record_uri_to_graph_uri, updated_uris, bar)

  0%|          | 0/2 [00:00<?, ?it/s]

In [155]:
# Number of records visited by the update loop.
len(updated_uris)

3102

In [71]:
# NOTE(review): this cell carries an older execution count (71) and repeats the
# check of the previous cell; it looks like a leftover — confirm and remove.
len(updated_uris)

666