In [1]:
import json
import os
import requests
from urllib.parse import quote

import boto3
from ipython_secrets import get_secret
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from SPARQLWrapper import SPARQLWrapper, JSON
from tqdm.auto import tqdm

# Connection settings for the S3 bucket that receives the manifests. Each value
# is looked up by name through ipython_secrets' get_secret, in the order listed.
(
    s3_endpoint,
    s3_bucket_name,
    s3_client_id,
    s3_client_secret,
) = map(
    get_secret,
    (
        "manifest_s3_endpoint",
        "manifest_s3_bucket_name",
        "manifest_s3_client_id",
        "manifest_s3_client_secret",
    ),
)

Get the record metadata from the SAPA platform through its SPARQL endpoint.


In [2]:
# Selects every resource included (through zero or more rico:isOrWasIncludedIn
# steps) in the given parent resource, together with its instantiation,
# identifier value, descriptive note and name. Only instantiations whose carrier
# type is spav:fvtif are kept.
sapa_records_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX spav: <http://vocab.performing-arts.ch/>
PREFIX rico: <https://www.ica.org/standards/RiC/ontology#>
SELECT ?resource ?instance ?identifier_value ?descriptive_note ?name WHERE {
  ?resource (rico:isOrWasIncludedIn*) <http://data.performing-arts.ch/r/9636965d-b8e7-4009-950a-8a1f39f89456>;
    rico:hasInstantiation ?instance;
    rico:hasOrHadIdentifier ?identifier;
    rico:descriptiveNote ?descriptive_note;
    rico:name ?name.
  ?identifier rdf:value ?identifier_value.
  ?instance rico:hasCarrierType spav:fvtif.
}
"""

sparql = SPARQLWrapper("https://www.performing-arts.ch/sparql")
sparql.setReturnFormat(JSON)
sparql.setQuery(sapa_records_query)
results = sparql.query().convert()

# Flatten each result binding to {variable name: value}, keeping only the
# "value" entry of every bound term, then build one DataFrame row per binding.
binding_rows = []
for binding in results["results"]["bindings"]:
    binding_rows.append({variable: term["value"] for variable, term in binding.items()})

metadata = pd.DataFrame(binding_rows)

We try to fetch the image sizes from the IIIF server. The results are cached for the next time this script is run; set `force_compute_images_size` to `True` if you want to force a recompute.
You can run this cell as many times as needed: because the values are cached, running it again only retries the images that are still missing, which may also resolve transient errors.


In [19]:
# Set to True to ignore the on-disk cache and query the IIIF server for every record.
force_compute_images_size = False

# Seconds to wait for the IIIF server before a request is abandoned; without a
# timeout a stalled connection would block the cell forever.
iiif_request_timeout = 30

if not force_compute_images_size and os.path.exists("iiif_image_sizes.csv"):
    # Cast to built-in ints: values read back through pandas are numpy integers,
    # which json.dumps cannot serialize when the manifests are written.
    identifier2image_size = (
        pd.read_csv("iiif_image_sizes.csv", index_col=0)
        .apply(lambda row: (int(row["width"]), int(row["height"])), axis=1)
        .to_dict()
    )
else:
    identifier2image_size = {}

retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

# The context manager releases the pooled connections once the loop is done.
with requests.Session() as s:
    # The instance URLs coming from the SPARQL endpoint can be plain http, so the
    # retry policy must be mounted for both schemes to actually take effect.
    for scheme in ("http://", "https://"):
        s.mount(scheme, HTTPAdapter(max_retries=retries))

    for identifier, iiif_url in tqdm(
        metadata[["identifier_value", "instance"]].values.tolist(), total=len(metadata)
    ):
        # Already known (from the cache or an earlier iteration): nothing to fetch.
        if identifier in identifier2image_size:
            continue
        try:
            res = s.get(iiif_url, timeout=iiif_request_timeout)
            if res.status_code == 404:
                print(f"Error with {identifier}, URL {iiif_url} does not exists")
                continue
            res.raise_for_status()
            res_json = res.json()
            identifier2image_size[identifier] = (
                int(res_json["width"]),
                int(res_json["height"]),
            )
        except Exception as e:
            # Best effort: report the failure and move on, so that one bad record
            # does not stop the whole run. The record simply stays out of the cache.
            print(f"Error with {identifier}, got the following error with {iiif_url}: ", e)

# Persist everything known so far (cached + newly fetched) for the next run.
pd.DataFrame.from_dict(identifier2image_size, orient="index").rename_axis(
    "identifier"
).rename(columns={0: "width", 1: "height"}).to_csv("iiif_image_sizes.csv")

  0%|          | 0/620 [00:00<?, ?it/s]

Error with 1553-FO-742-2, URL http://media.performing-arts.ch/iiif/3/image%2F1553-FO-742-2-DC does not exists
Error with 1553-FO-360-2, URL http://media.performing-arts.ch/iiif/3/image%2F1553-FO-360-2-DC does not exists
Error with 1553-FO-601-1, URL http://media.performing-arts.ch/iiif/3/image%2F1553-FO-601-1-DC does not exists


In [20]:
record_identifiers = metadata["identifier_value"]

# Image sizes whose identifier does not appear in the metadata records.
images_no_record = set(identifier2image_size.keys()).difference(record_identifiers)
if images_no_record:
    print("Some images do not have a record: ", images_no_record)

# Metadata records for which no image size could be obtained.
records_no_image = set(record_identifiers.values).difference(
    identifier2image_size.keys()
)
if records_no_image:
    print("Some records do not have an image: ", records_no_image)

Some records do not have an image:  {'1553-FO-601-1', '1553-FO-742-2', '1553-FO-360-2'}


Remove items that do not have a IIIF value:


In [21]:
# Keep only the records whose image size is known. The explicit .copy() makes the
# filtered frame independent of the original one, so adding a column below does
# not write into a slice (which would raise pandas' SettingWithCopyWarning).
has_image_size = metadata["identifier_value"].isin(identifier2image_size.keys())
metadata = metadata.loc[has_image_size].copy()

# Attach the (width, height) tuple of each record's image.
metadata["width_height"] = metadata["identifier_value"].apply(identifier2image_size.get)

Create IIIF mapping


In [22]:
def identifier_to_base_url(identifier):
    """Return the manifest base URL for a record identifier.

    The identifier is percent-encoded with ``urllib.parse.quote`` (default
    settings, so ``/`` is left as is) and appended to the manifest prefix.
    """
    manifest_root = "https://media.performing-arts.ch/iiif/manifest/"
    return f"{manifest_root}{quote(identifier)}"


# Manifest fields that are identical for every record. metadata_to_manifest
# merges this dict with the per-record fields to build each manifest.
base_metadata = {
    "@context": "http://iiif.io/api/presentation/3/context.json",
    "type": "Manifest",
    # License URI applied to all manifests (CC BY-SA 4.0).
    "rights": "http://creativecommons.org/licenses/by-sa/4.0/",
    "requiredStatement": {
        "label": {"en": ["Copyright"]},
        "value": {"en": ["Fred Erismann"]},
    },
    "homepage": [
        {
            "id": "https://sapa.swiss",
            "type": "Text",
            "label": {"en": ["SAPA Homepage"]},
            "format": "text/html",
        }
    ],
    # The institution providing the images, with multilingual names,
    # its homepage and its logo.
    "provider": [
        {
            "id": "https://www.wikidata.org/wiki/Q50920401",
            "type": "Agent",
            "label": {
                "en": ["SAPA, Swiss Archive of the Performing Arts"],
                "de": ["Stiftung SAPA, Schweizer Archiv der Darstellenden Künste"],
                "fr": ["Fondation SAPA, Archives suisses des arts de la scène"],
                "it": ["Fondazione SAPA, Archivio svizzero delle arti della scena"],
            },
            "homepage": [
                {
                    "id": "https://sapa.swiss/",
                    "type": "Text",
                    # The homepage label carries a one-paragraph description of
                    # the foundation in each language.
                    "label": {
                        "en": [
                            "The SAPA Foundation, Swiss Archive of the Performing Arts, collects documents and objects of importance to the history of the performing arts and makes them accessible to a wider audience."
                        ],
                        "de": [
                            "Die Stiftung SAPA, Schweizer Archiv der Darstellenden Künste, sammelt Dokumente und Objekte, die für die Geschichte der Darstellenden Künste bedeutsam sind, und stellt diese einem breiten Publikum zur Verfügung."
                        ],
                        "fr": [
                            "La Fondation SAPA, Archives suisses des arts de la scène, collecte et met à disposition de tous les publics les documents et objets constituant l‘histoire des arts de la scène en Suisse. Sa mission: préserver les traces de ces arts éphémères et complexes pour les transmettre aux générations futures."
                        ],
                        "it": [
                            "SAPA raccoglie e mette a disposizione del pubblico documenti e oggetti di rilevanza storica per le arti sceniche in Svizzera. La Fondazione si pone l’obiettivo di preservare le tracce di queste arti effimere e complesse per tramandarle alle generazioni future."
                        ],
                    },
                    "format": "text/html",
                }
            ],
            "logo": [
                {
                    "id": "https://memobase.ch/sites/default/files/2021-05/sap-logo.jpg",
                    "type": "Image",
                    "format": "image/jpeg",
                    "height": 100,
                    "width": 260,
                }
            ],
        }
    ],
    "viewingDirection": "left-to-right",
}

In [23]:
def metadata_to_manifest(row):
    """Build the IIIF Presentation 3 manifest for one metadata record.

    Args:
        row: Mapping (e.g. a DataFrame row) providing the keys
            ``identifier_value``, ``width_height`` (a ``(width, height)``
            pair), ``name``, ``resource`` and ``descriptive_note``.

    Returns:
        A new dict made of ``base_metadata`` plus the record-specific fields:
        a single canvas holding one painting annotation for the record's image.
    """
    identifier = row["identifier_value"]
    base_url = identifier_to_base_url(identifier)
    # Force built-in ints: sizes that went through pandas/numpy are numpy
    # integers, which json.dumps refuses to serialize.
    width, height = (int(side) for side in row["width_height"])
    canvas_url = f"{base_url}/p1"
    annotation_page_url = f"{canvas_url}/1"
    iiif_url = f"https://media.performing-arts.ch/iiif/3/image%2F{identifier}-DC"
    return base_metadata | {
        "id": f"{base_url}.json",
        "label": {
            "de": [row["name"]],
        },
        "summary": {"en": [row["name"]]},
        "seeAlso": [
            {
                "id": row["resource"],
                "type": "Text",
                "label": {"en": ["Record on Metaphacts"]},
                "format": "text/html",
            }
        ],
        "metadata": [
            {
                "label": {
                    "en": ["Identifier"],
                    "de": ["Signatur"],
                    "fr": ["Cote"],
                    "it": ["Segnatura"],
                },
                "value": {
                    "en": [identifier],
                    "de": [identifier],
                    "fr": [identifier],
                    "it": [identifier],
                },
            },
            {
                "label": {
                    "en": ["Description"],
                    "de": ["Beschreibung"],
                    "fr": ["Description"],
                    "it": ["Descrizione"],
                },
                "value": {
                    "en": [row["descriptive_note"]],
                    "de": [row["descriptive_note"]],
                    "fr": [row["descriptive_note"]],
                    "it": [row["descriptive_note"]],
                },
            },
            {
                "label": {
                    "en": ["Creator"],
                    "de": ["Urheber"],
                    "fr": ["Auteur"],
                    "it": ["Autore"],
                },
                "value": {
                    "en": ["Fred Erismann"],
                    "de": ["Fred Erismann"],
                    "fr": ["Fred Erismann"],
                    "it": ["Fred Erismann"],
                },
            },
        ],
        "thumbnail": [
            {
                "id": f"{iiif_url}/full/80,/0/default.jpg",
                "type": "Image",
                "format": "image/jpeg",
                # NOTE(review): the thumbnail service declares "level2" while the
                # painting body below declares "level1" for the same image
                # service - confirm which compliance level the server offers.
                "service": [
                    {"id": iiif_url, "type": "ImageService3", "profile": "level2"}
                ],
            }
        ],
        "items": [
            {
                "id": canvas_url,
                "type": "Canvas",
                "height": height,
                "width": width,
                "items": [
                    {
                        "id": annotation_page_url,
                        "type": "AnnotationPage",
                        "items": [
                            {
                                "id": f"{base_url}/annotation/p0001-image",
                                "type": "Annotation",
                                "motivation": "painting",
                                "body": {
                                    "id": f"{iiif_url}/full/max/0/default.jpg",
                                    "type": "Image",
                                    "format": "image/jpeg",
                                    "height": height,
                                    "width": width,
                                    "service": [
                                        {
                                            "id": iiif_url,
                                            "profile": "level1",
                                            "type": "ImageService3",
                                        }
                                    ],
                                },
                                # A painting annotation must target the Canvas
                                # it paints, not the AnnotationPage holding it.
                                "target": canvas_url,
                            }
                        ],
                    }
                ],
            }
        ],
    }

Uncomment to write manifests to files:


In [24]:
# for _, item in tqdm(metadata.iterrows(), total=len(metadata)):
#    with open(
#        os.path.join("manifests", item["identifier_value"] + ".json"), "w"
#    ) as outfile:
#        outfile.write(json.dumps(metadata_to_manifest(item), ensure_ascii=False))

Write manifests directly to s3


In [26]:
s3_session = boto3.session.Session()

# Client for the S3-compatible endpoint configured through the notebook secrets.
s3_client = s3_session.client(
    service_name="s3",
    aws_access_key_id=s3_client_id,
    aws_secret_access_key=s3_client_secret,
    endpoint_url=s3_endpoint,
)

# Upload one manifest per record, stored as manifests/<identifier>.json.
for _, item in tqdm(metadata.iterrows(), total=len(metadata)):
    manifest_json = json.dumps(metadata_to_manifest(item), ensure_ascii=False)
    s3_client.put_object(
        # str.encode already returns bytes; no extra bytes() copy is needed.
        Body=manifest_json.encode("utf-8"),
        Bucket=s3_bucket_name,
        Key="manifests/" + item["identifier_value"] + ".json",
        # Declare the payload as JSON so it is not stored and served with the
        # store's generic default content type.
        ContentType="application/json",
    )

  0%|          | 0/617 [00:00<?, ?it/s]