In [12]:
import json
import os

import boto3
from getpass import getpass
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm.auto import tqdm

# Prompt for the S3 credentials only when they are not already present in the
# notebook namespace, so re-running this cell in a live kernel does not ask again.
if "S3_CLIENT_ID" not in globals():
    S3_CLIENT_ID = getpass("Please input s3 client id")

if "S3_CLIENT_SECRET" not in globals():
    S3_CLIENT_SECRET = getpass("Please input s3 client secret")


Query that recursively generates the list of items whose children and grandchildren have IIIF data.

In [21]:
sparql = SPARQLWrapper("https://www.performing-arts.ch/sparql")

# Each result row links a resource instantiation (?iiifURI) to one of its
# ancestors (?parentResource, reached through one or more isOrWasIncludedIn
# hops) and to that ancestor's direct container (?grandParentResource).
# NOTE(review): ?instance2 and ?identifierValue are bound but never selected;
# presumably ?instance2 only multiplies the result rows -- confirm before
# removing it from the query.
# NOTE(review): each OPTIONAL block requires descriptiveNote AND name together,
# so a resource that has a name but no descriptive note gets neither -- confirm
# this is intended.
sparql.setQuery(
    """
PREFIX spao: <http://ontology.performing-arts.ch/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX spav: <http://vocab.performing-arts.ch/>
PREFIX rico: <https://www.ica.org/standards/RiC/ontology#>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>

SELECT ?iiifURI ?resource ?name ?parentResource ?parentIdentifierValue ?parentName ?parentDescriptiveNote ?grandParentResource ?grandParentIdentifierValue ?grandParentName ?grandParentDescriptiveNote WHERE {
  ?resource rico:hasOrHadIdentifier ?identifier;
    rico:name ?name;
    rico:hasInstantiation ?instance, ?instance2.
  ?identifier rdf:value ?identifierValue.
  ?instance rico:hasCarrierType spav:fvman.
  ?resource (rico:isOrWasIncludedIn+) ?parentResource.
  ?parentResource rico:hasOrHadIdentifier ?parentIdentifier.
  { ?parentIdentifier crm:P2_has_type spav:iduni. }
  UNION
  { ?parentIdentifier crm:P2_has_type spav:idcon. }
  ?parentIdentifier rdf:value ?parentIdentifierValue.
  OPTIONAL {
    ?parentResource rico:descriptiveNote ?parentDescriptiveNote;
      rico:name ?parentName.
  }
  ?parentResource rico:isOrWasIncludedIn ?grandParentResource.
  ?grandParentResource rico:hasOrHadIdentifier ?grandParentIdentifier.
  { ?grandParentIdentifier crm:P2_has_type spav:iduni. }
  UNION
  { ?grandParentIdentifier crm:P2_has_type spav:idcon. }
  ?grandParentIdentifier rdf:value ?grandParentIdentifierValue.
  OPTIONAL {
    ?grandParentResource rico:descriptiveNote ?grandParentDescriptiveNote;
      rico:name ?grandParentName.
  }
  BIND(?instance AS ?iiifURI)
}
"""
)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Flatten every binding from {variable: {"value": ...}} to {variable: value}.
# The SELECT variable list (head.vars of the JSON result) is passed as
# `columns` so that every selected variable becomes a column even when an
# OPTIONAL variable is unbound in all rows or the result is empty; without it
# the column selections in the following cell would raise KeyError.
# If the endpoint omits head.vars, `columns` is None and pandas infers the
# columns from the records.
data = pd.DataFrame(
    [
        {variable: binding["value"] for variable, binding in row.items()}
        for row in results["results"]["bindings"]
    ],
    columns=results.get("head", {}).get("vars"),
)

Create hierarchy from the result of the previous query:

In [25]:
def identifier_to_collection_uri(identifier):
    """Return the URL of the IIIF collection JSON document for ``identifier``."""
    return "https://media.performing-arts.ch/iiif/collection/{}.json".format(
        identifier
    )


# Collect the descriptive fields of every collection-level resource

## Parents: one row per parent resource URI, first occurrence wins
parents = (
    data[
        [
            "parentResource",
            "parentIdentifierValue",
            "parentDescriptiveNote",
            "parentName",
        ]
    ]
    .drop_duplicates("parentResource", keep="first")
    .set_index("parentResource")
    .rename(
        columns={
            "parentIdentifierValue": "identifier",
            "parentDescriptiveNote": "descriptiveNote",
            "parentName": "name",
        }
    )
)

## Grand parents: same shape, keyed by grand parent resource URI
grand_parents = (
    data[
        [
            "grandParentResource",
            "grandParentIdentifierValue",
            "grandParentDescriptiveNote",
            "grandParentName",
        ]
    ]
    .drop_duplicates("grandParentResource", keep="first")
    .set_index("grandParentResource")
    .rename(
        columns={
            "grandParentIdentifierValue": "identifier",
            "grandParentDescriptiveNote": "descriptiveNote",
            "grandParentName": "name",
        }
    )
)

# Resource URI -> {"identifier", "descriptiveNote", "name"}; when a URI occurs
# in both frames the grand parent entry overrides the parent entry.
resourceURI2infos = {
    **parents.to_dict(orient="index"),
    **grand_parents.to_dict(orient="index"),
}

def map_collection(infos):
    """Build the entry that references a collection from inside another collection.

    ``infos`` is a mapping providing at least ``"identifier"`` and ``"name"``.
    The ``"label"`` key is omitted when the name is missing (``pd.isna``).
    """
    entry = {
        "id": identifier_to_collection_uri(infos["identifier"]),
        "type": "Collection",
    }
    if not pd.isna(infos["name"]):
        entry["label"] = {"de": [infos["name"]]}
    return entry

# A collection item is either a manifest (one per distinct IIIF URI) or a
# collection (one per parent / grand parent resource).
# The manifest entries are built with a comprehension over (URI, name) pairs
# instead of a row-wise DataFrame.apply: it avoids the easy-to-misread
# `row.name` (index label) vs `row["name"]` (column) distinction, is faster,
# and yields an empty mapping when there are no rows.
resourceURI2collection_item = {
    iiif_uri: {"id": iiif_uri, "type": "Manifest", "label": {"de": [name]}}
    for iiif_uri, name in data.drop_duplicates("iiifURI", keep="first")[
        ["iiifURI", "name"]
    ].itertuples(index=False, name=None)
} | {
    resource_uri: map_collection(infos)
    for resource_uri, infos in resourceURI2infos.items()
}

# Root nodes: grand parent resources that never occur as a parent
root_nodes = (
    data["grandParentResource"][
        ~data["grandParentResource"].isin(data["parentResource"])
    ]
    .unique()
    .tolist()
)

# Leaf nodes: parent resources that never occur as a grand parent
leaf_nodes = (
    data["parentResource"][
        ~data["parentResource"].isin(data["grandParentResource"])
    ]
    .unique()
    .tolist()
)

# Children of a grand parent are its parent resources; children of a leaf node
# are its IIIF URIs. On a key collision the leaf-node entry wins.
resourceURI2children = {
    **data.groupby("grandParentResource")["parentResource"].apply(set).to_dict(),
    **(
        data[data["parentResource"].isin(leaf_nodes)]
        .groupby("parentResource")["iiifURI"]
        .apply(set)
        .to_dict()
    ),
}

Create mapping:

In [26]:
# Properties shared by every generated collection document. Each collection is
# built by merging its own keys ("id", "label", "items", ...) on top of this
# dict, so the keys below come first in the serialized JSON.
base_metadata = {
    # IIIF Presentation API 3 JSON-LD context
    "@context": "http://iiif.io/api/presentation/3/context.json",
    "type": "Collection",
    "rights": "http://creativecommons.org/licenses/by-sa/4.0/",
    "homepage": [
        {
            "id": "https://sapa.swiss",
            "type": "Text",
            "label": {"en": ["SAPA Homepage"]},
            "format": "text/html",
        }
    ],
    # Providing institution, with its name in four languages
    "provider": [
        {
            "id": "https://www.wikidata.org/entity/Q50920401",
            "type": "Agent",
            "label": {
                "en": ["SAPA, Swiss Archive of the Performing Arts"],
                "de": ["Stiftung SAPA, Schweizer Archiv der Darstellenden Künste"],
                "fr": ["Fondation SAPA, Archives suisses des arts de la scène"],
                "it": ["Fondazione SAPA, Archivio svizzero delle arti della scena"],
            },
            "homepage": [
                {
                    "id": "https://sapa.swiss/",
                    "type": "Text",
                    # The label carries a one-paragraph description of the
                    # institution in each language
                    "label": {
                        "en": [
                            "The SAPA Foundation, Swiss Archive of the Performing Arts, collects documents and objects of importance to the history of the performing arts and makes them accessible to a wider audience."
                        ],
                        "de": [
                            "Die Stiftung SAPA, Schweizer Archiv der Darstellenden Künste, sammelt Dokumente und Objekte, die für die Geschichte der Darstellenden Künste bedeutsam sind, und stellt diese einem breiten Publikum zur Verfügung."
                        ],
                        "fr": [
                            "La Fondation SAPA, Archives suisses des arts de la scène, collecte et met à disposition de tous les publics les documents et objets constituant l‘histoire des arts de la scène en Suisse. Sa mission: préserver les traces de ces arts éphémères et complexes pour les transmettre aux générations futures."
                        ],
                        "it": [
                            "SAPA raccoglie e mette a disposizione del pubblico documenti e oggetti di rilevanza storica per le arti sceniche in Svizzera. La Fondazione si pone l’obiettivo di preservare le tracce di queste arti effimere e complesse per tramandarle alle generazioni future."
                        ],
                    },
                    "format": "text/html",
                }
            ],
            "logo": [
                {
                    "id": "https://memobase.ch/sites/default/files/2021-05/sap-logo.jpg",
                    "type": "Image",
                    "format": "image/jpeg",
                    "height": 100,
                    "width": 260,
                }
            ],
        }
    ],
    "viewingDirection": "left-to-right",
}

In [27]:
def resource_uri_to_collection_of_collection(resource_uri):
    """Assemble the full collection document for one resource.

    The document is ``base_metadata`` extended with the resource's id, German
    label and summary, a ``seeAlso`` link back to ``resource_uri`` and the
    resource's children sorted by ``"id"``. ``"label"`` and ``"summary"`` are
    removed when the corresponding value is missing (``pd.isna``).
    """
    infos = resourceURI2infos[resource_uri]
    collection_id = identifier_to_collection_uri(infos["identifier"])
    name = infos["name"]
    note = infos["descriptiveNote"]

    # Sort by id so the item order does not depend on set iteration order
    children = [
        resourceURI2collection_item[child_uri]
        for child_uri in resourceURI2children[resource_uri]
    ]
    children.sort(key=lambda item: item["id"])

    collection = {
        **base_metadata,
        "id": collection_id,
        "label": {"de": [name]},
        "summary": {"de": [note]},
        "seeAlso": [
            {
                "id": resource_uri,
                "type": "Text",
                "label": {"en": ["Record on Swiss performing arts platform"]},
                "format": "text/html",
            }
        ],
        "items": children,
    }

    # Drop the optional properties whose source value is missing
    for key, value in (("label", name), ("summary", note)):
        if pd.isna(value):
            del collection[key]
    return collection

Create meta collection of all collections:

In [28]:
# Top-level collection: references every root node, sorted by id
root_collection = {
    **base_metadata,
    "id": identifier_to_collection_uri("SAPA"),
    "label": {"en": ["Complete collection of SAPA IIIF data"]},
    "summary": {
        "en": [
            "This collection list all sub-collections or manifests contained in SAPA platform"
        ]
    },
    "items": sorted(
        (resourceURI2collection_item[root_uri] for root_uri in root_nodes),
        key=lambda item: item["id"],
    ),
}

Write the collections to local files (only needed for debugging or for uploading elsewhere; comment this cell out to skip it):

In [29]:
# Dump every collection to collections/<identifier>.json and the root
# collection to collections/SAPA.json.
# The JSON is serialized with ensure_ascii=False, so the files are opened
# explicitly as UTF-8: relying on the platform default encoding can raise
# UnicodeEncodeError (or write a different encoding) for the non-ASCII text.
os.makedirs("collections", exist_ok=True)
for resource_uri, infos in tqdm(resourceURI2infos.items(), total=len(resourceURI2infos)):
    with open(
        os.path.join("collections", infos["identifier"] + ".json"),
        "w",
        encoding="utf-8",
    ) as outfile:
        json.dump(
            resource_uri_to_collection_of_collection(resource_uri),
            outfile,
            ensure_ascii=False,
        )

with open(os.path.join("collections", "SAPA.json"), "w", encoding="utf-8") as outfile:
    json.dump(root_collection, outfile, ensure_ascii=False)

  0%|          | 0/760 [00:00<?, ?it/s]

Write collections directly to S3:

In [30]:
# Bucket that receives every collection document. The per-collection upload
# previously referenced an undefined `s3_bucket_name` (NameError on a fresh
# kernel) while the root upload hard-coded this bucket; both now share it.
s3_bucket_name = "performing-arts-iiif-source"

s3_session = boto3.session.Session()

s3_client = s3_session.client(
    service_name="s3",
    aws_access_key_id=S3_CLIENT_ID,
    aws_secret_access_key=S3_CLIENT_SECRET,
    endpoint_url="https://os.zhdk.cloud.switch.ch",
)

# One object per collection: collections/<identifier>.json
for resource_uri, infos in tqdm(
    resourceURI2infos.items(), total=len(resourceURI2infos)
):
    s3_client.put_object(
        # str.encode already returns bytes, no extra bytes() wrapper needed
        Body=json.dumps(
            resource_uri_to_collection_of_collection(resource_uri),
            ensure_ascii=False,
        ).encode("utf-8"),
        Bucket=s3_bucket_name,
        Key="collections/" + infos["identifier"] + ".json",
    )

# Root collection; the trailing semicolon hides the response in the cell output
s3_client.put_object(
    Body=json.dumps(root_collection, ensure_ascii=False).encode("utf-8"),
    Bucket=s3_bucket_name,
    Key="collections/SAPA.json",
);

  0%|          | 0/760 [00:00<?, ?it/s]